from datetime import datetime
from functools import partial
import json
import multiprocessing

import bs4
import dateutil.parser
import fake_useragent
import requests
from bs4 import BeautifulSoup

from utils import json_serial

main_link = 'https://author.today/'
book_link = main_link + 'work/'
comment_link = main_link + 'comment/'
book_id = 129108
book_data = {}


# Collapsed reply threads are fetched via AJAX, e.g.:
# https://author.today/comment/loadThread?parentId=12457484&rootId=51272&rootType=Work&lastViewTime=&_=1645277190787
def load_replies_thread_data(session: requests.Session, parent_id: int, root_id: int,
                             root_type: str = 'Work', last_view_time: str = '',
                             time_stamp: int = 0):
    """Fetch the HTML of a collapsed reply thread; return None if it is empty."""
    # `comment_link` already ends with a slash, so no extra '/' before 'loadThread'.
    # The trailing `_` parameter mirrors the cache-busting timestamp seen in the
    # sample URL above; previously `time_stamp` was accepted but never sent.
    thread = session.get(f'{comment_link}loadThread'
                         f'?parentId={parent_id}'
                         f'&rootId={root_id}'
                         f'&rootType={root_type}'
                         f'&lastViewTime={last_view_time}'
                         f'&_={time_stamp}').json()
    if 'data' not in thread or not thread['data'].get('html'):
        return None
    return thread['data']['html']


def get_replies_thread_soup(session: requests.Session, parent_id: int, root_id: int,
                            root_type: str = 'Work', last_view_time: str = '',
                            time_stamp: int = 0):
    """Parse a reply thread into soup; an empty soup means no replies were found."""
    replies_data = load_replies_thread_data(session, parent_id, root_id,
                                            root_type, last_view_time, time_stamp)
    # bs4.element.Tag() cannot be instantiated without a tag name, so an empty
    # BeautifulSoup document stands in as a searchable-but-empty container.
    if not replies_data:
        return BeautifulSoup('', 'lxml')
    return BeautifulSoup(replies_data, 'lxml').find('body')


def parse_comment(session: requests.Session, wrapper: bs4.element.Tag):
    """Recursively parse a single comment wrapper, including its reply subtree."""
    current_comment = {}
    comment = wrapper.find('div', class_='comment')
    comment_id = int(comment.get('data-id'))
    current_comment['id'] = comment_id
    current_comment['thread'] = int(comment.get('data-thread'))
    current_comment['url'] = f'{book_link}{book_id}?c={current_comment["id"]}&th={current_comment["thread"]}'

    c_body = comment.find('div', class_='comment-body')
    current_comment['author'] = c_body.find('span', class_='comment-user-name').text.strip()
    current_comment['date'] = dateutil.parser.parse(c_body.find('time').find('span').get('data-time'))
    comment_text = c_body.find('article').text.strip()
    current_comment['text'] = comment_text if comment_text else 'No text data'

    # Comments without replies may lack the expand toggle entirely, hence the guard.
    toggle = wrapper.find('div', class_='comment-toggle-expand')
    replies_count = int(toggle.find('span', class_='replies-count').text) if toggle else 0
    replies = []
    if replies_count > 0:
        comment_replies = wrapper.find('div', class_='replies')
        first_reply_wrapper = comment_replies.find('div', class_='comment-wrapper')
        # No wrapper inside means the thread is collapsed: fetch it via AJAX.
        if not isinstance(first_reply_wrapper, bs4.element.Tag):
            comment_replies = get_replies_thread_soup(session, comment_id, book_id)
        replies = [
            parse_comment(session, reply)
            for reply in comment_replies.find_all('div', class_='comment-wrapper', recursive=False)
        ]
    # Always present, so every comment in the output has the same shape.
    current_comment['replies'] = replies
    return current_comment


def parse_comments_page(session: requests.Session, comments: bs4.element.Tag):
    """Parse every top-level comment wrapper in one page's comments block."""
    wrappers = comments.find_all('div', class_='comment-wrapper', recursive=False)
    return [parse_comment(session, wrapper) for wrapper in wrappers]


def get_comments_from_page(page: int, session: requests.Session):
    """Download one page of the book and parse its comments block."""
    comments_page = session.get(f'{book_link}{book_id}?page={page}')
    doc = BeautifulSoup(comments_page.text, 'lxml')
    comments = parse_comments_page(session, doc.find('div', class_='comments'))
    print(f'Page processed: {page}')
    return comments
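# `json_serial` is imported from a local `utils` module that is not part of this
# file. A minimal sketch of what such a helper typically looks like (an
# assumption inferred from its use as `default=` in json.dump below, not the
# original implementation): convert datetime objects, which json cannot
# serialize natively, into ISO-8601 strings.
#
#     def json_serial(obj):
#         if isinstance(obj, datetime):
#             return obj.isoformat()
#         raise TypeError(f'Type {type(obj)} is not JSON serializable')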
if __name__ == '__main__':
    start_time = datetime.now()
    print(f'Start parsing at {start_time}')

    # A random user agent lowers the chance of being served a bot-wall page.
    user = fake_useragent.UserAgent().random
    session = requests.Session()
    session.headers.update({'user-agent': user})

    response = session.get(f'{book_link}{book_id}')
    soup = BeautifulSoup(response.text, 'lxml')
    body = soup.find('div', id='pjax-container')

    # Getting book data
    info = body.find('div', class_='panel book-panel').find('div', class_='book-meta-panel')
    book_data['name'] = info.find('h1', class_='book-title').text.strip()
    book_data['author'] = info.find('div', class_='book-authors').text.strip()
    book_data['url'] = f'{book_link}{book_id}'

    # Getting pages count; a book with a single page of comments has no paginator.
    pagination = body.find('div', class_='pagination-container')
    pages_count = int(pagination.find('li', class_='skipToLast').text) if pagination else 1

    # Getting comments: one worker per CPU core. Each worker receives a pickled
    # copy of the session, so cookies set inside workers are not shared back.
    cpu_count = multiprocessing.cpu_count()
    print(f'Count of CPU: {cpu_count}\nCount of pages: {pages_count}')
    with multiprocessing.Pool(cpu_count) as pool:
        comments = [
            comment
            for page_comments in pool.map(partial(get_comments_from_page, session=session),
                                          range(1, pages_count + 1))
            for comment in page_comments
        ]

    # Sorting, newest first
    book_data['comments'] = sorted(comments, key=lambda comment: comment['date'], reverse=True)

    # Save to file; explicit UTF-8 because ensure_ascii=False keeps Cyrillic as-is.
    with open(f"{book_data['name']}.json", 'w', encoding='utf-8') as outfile:
        json.dump(book_data, outfile, ensure_ascii=False, indent=2, default=json_serial)

    end_time = datetime.now()
    print(f'Finish parsing at {end_time}')
    print(f'Time spent: {end_time - start_time}')
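# Shape of the resulting "<book name>.json" file, derived from the fields
# assigned above (values are illustrative placeholders, not captured output;
# the id/thread numbers are taken from the sample loadThread URL above):
#
# {
#   "name": "<book title>",
#   "author": "<author name>",
#   "url": "https://author.today/work/129108",
#   "comments": [
#     {
#       "id": 12457484,
#       "thread": 51272,
#       "url": "https://author.today/work/129108?c=12457484&th=51272",
#       "author": "<commenter>",
#       "date": "<ISO-8601 datetime>",
#       "text": "<comment text>",
#       "replies": [ ...nested comments of the same shape... ]
#     }
#   ]
# }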