repotest/main.py
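"""Scrape the metadata and the full comment tree of a book on author.today
(the book_id set below) and save everything to '<book name>.json'."""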

import json
import multiprocessing
from datetime import datetime
from functools import partial

import bs4
import dateutil.parser
import fake_useragent
import requests
from bs4 import BeautifulSoup
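
# `json_serial` normally comes from a project-local utils module that is not
# shown in this file. A minimal fallback sketch, assuming its only job is to
# make the datetime values below JSON-serializable; the real helper may differ.
try:
    from utils import json_serial
except ImportError:
    def json_serial(obj):
        # json.dump calls `default` for objects it cannot encode natively.
        if isinstance(obj, datetime):
            return obj.isoformat()
        raise TypeError(f'{type(obj)!r} is not JSON serializable')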

# Site endpoints and the target book
main_link = 'https://author.today/'
book_link = main_link + 'work/'
comment_link = main_link + 'comment/'
book_id = 129108
book_data = {}


# Example request:
# https://author.today/comment/loadThread?parentId=12457484&rootId=51272&rootType=Work&lastViewTime=&_=1645277190787
def load_replies_thread_data(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                             last_view_time: str = '', time_stamp: int = 0):
    """Fetch the HTML of an expanded replies thread, or None if there is none."""
    thread = session.get(f'{comment_link}loadThread'
                         f'?parentId={parent_id}'
                         f'&rootId={root_id}'
                         f'&rootType={root_type}'
                         f'&lastViewTime={last_view_time}'
                         f'&_={time_stamp}').json()
    if 'data' not in thread or not thread['data'].get('html'):
        return None
    return thread['data']['html']


def get_replies_thread_soup(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                            last_view_time: str = '', time_stamp: int = 0):
    """Return the replies thread as soup; an empty soup if nothing was loaded."""
    replies_data = load_replies_thread_data(session, parent_id, root_id, root_type, last_view_time, time_stamp)
    if not replies_data:
        # bs4.element.Tag() cannot be constructed without a name, so return an
        # empty soup instead: findAll() on it simply yields no wrappers.
        return BeautifulSoup('', 'lxml')
    return BeautifulSoup(replies_data, 'lxml').find('body')


def parse_comment(session: requests.Session, wrapper: bs4.element.Tag):
    """Recursively parse a single comment wrapper, including its replies."""
    current_comment = {}
    comment = wrapper.find('div', class_='comment')
    comment_id = int(comment.get('data-id'))
    current_comment['id'] = comment_id
    current_comment['thread'] = int(comment.get('data-thread'))
    current_comment['url'] = f'{book_link}{book_id}?c={current_comment["id"]}&th={current_comment["thread"]}'
    c_body = comment.find('div', class_='comment-body')
    current_comment['author'] = c_body.find('span', class_='comment-user-name').text.strip()
    current_comment['date'] = dateutil.parser.parse(c_body.find('time').find('span').get('data-time'))
    comment_text = c_body.find('article').text.strip()
    current_comment['text'] = comment_text if comment_text else 'No text data'
    replies_count = int(wrapper.find('div', class_='comment-toggle-expand').find('span', class_='replies-count').text)
    if replies_count > 0:
        comment_replies = wrapper.find('div', class_='replies')
        first_reply_wrapper = comment_replies.find('div', class_='comment-wrapper')
        # No expanded replies in the page markup: fetch the thread separately.
        if not isinstance(first_reply_wrapper, bs4.element.Tag):
            comment_replies = get_replies_thread_soup(session, comment_id, book_id)
        current_comment['replies'] = [
            parse_comment(session, reply)
            for reply in comment_replies.findAll('div', class_='comment-wrapper', recursive=False)
        ]
    return current_comment


def parse_comments_page(session: requests.Session, comments: bs4.element.Tag):
    """Parse every top-level comment on one comments page."""
    wrappers = comments.findAll('div', class_='comment-wrapper', recursive=False)
    return [parse_comment(session, wrapper) for wrapper in wrappers]


def get_comments_from_page(page: int, session: requests.Session):
    """Download one page of the book and parse its comment section."""
    comments_page = session.get(f'{book_link}{book_id}?page={page}')
    doc = BeautifulSoup(comments_page.text, 'lxml')
    comments = parse_comments_page(session, doc.find('div', class_='comments'))
    print(f'Page processed: {page}')
    return comments


if __name__ == '__main__':
    start_time = datetime.now()
    print(f'Start parsing at {start_time}')
    # A random user agent makes the requests look like an ordinary browser.
    session = requests.Session()
    session.headers.update({'user-agent': fake_useragent.UserAgent().random})
    response = session.get(f'{book_link}{book_id}')
    soup = BeautifulSoup(response.text, 'lxml')
    body = soup.find('div', id='pjax-container')
    # Getting book data
    info = body.find('div', class_='panel book-panel').find('div', class_='book-meta-panel')
    book_data['name'] = info.find('h1', class_='book-title').text.strip()
    book_data['author'] = info.find('div', class_='book-authors').text.strip()
    book_data['url'] = f'{book_link}{book_id}'
    # Getting pages count from the "skip to last" pagination item
    pages_count = int(body.find('div', class_='pagination-container').find('li', class_='skipToLast').text)
    # Getting comments in parallel; requests.Session is picklable, so it can be
    # passed to the worker processes via functools.partial.
    cpu_count = multiprocessing.cpu_count()
    print(f'Count of CPU: {cpu_count}\nCount of pages: {pages_count}')
    with multiprocessing.Pool(cpu_count) as pool:
        comments = [
            comment
            for page_comments in pool.map(partial(get_comments_from_page, session=session), range(1, pages_count + 1))
            for comment in page_comments
        ]
    # Sort newest first
    book_data['comments'] = sorted(comments, key=lambda comment: comment['date'], reverse=True)
    # Save to file (utf-8, since ensure_ascii=False keeps non-ASCII text as-is)
    with open(f"{book_data['name']}.json", 'w', encoding='utf-8') as outfile:
        json.dump(book_data, outfile, ensure_ascii=False, indent=2, default=json_serial)
    end_time = datetime.now()
    print(f'Finish parsing at {end_time}')
    print(f'Time spent: {end_time - start_time}')
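
# For reference, the resulting JSON has roughly this shape (fields as produced
# by parse_comment above; "replies" is present only for comments that have
# replies, and dates are serialized by json_serial):
# {
#   "name": "...", "author": "...", "url": "...",
#   "comments": [
#     {"id": ..., "thread": ..., "url": "...", "author": "...",
#      "date": "...", "text": "...", "replies": [...]}
#   ]
# }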