repotest/main.py
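"""Scrape the metadata and the full comment tree of a book on author.today
(the book_id set below) and save everything to '<book name>.json'."""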

import json
import multiprocessing
from datetime import datetime
from functools import partial

import bs4
import dateutil.parser
import fake_useragent
import requests
from bs4 import BeautifulSoup
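
# `json_serial` normally comes from a project-local utils module that is not
# shown in this file. A minimal fallback sketch, assuming its only job is to
# make the datetime values below JSON-serializable; the real helper may differ.
try:
    from utils import json_serial
except ImportError:
    def json_serial(obj):
        # json.dump calls `default` for objects it cannot encode natively.
        if isinstance(obj, datetime):
            return obj.isoformat()
        raise TypeError(f'{type(obj)!r} is not JSON serializable')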

# Site endpoints and the target book
main_link = 'https://author.today/'
book_link = main_link + 'work/'
comment_link = main_link + 'comment/'
book_id = 129108
book_data = {}


# Example request:
# https://author.today/comment/loadThread?parentId=12457484&rootId=51272&rootType=Work&lastViewTime=&_=1645277190787
def load_replies_thread_data(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                             last_view_time: str = '', time_stamp: int = 0):
    """Fetch the HTML of an expanded replies thread, or None if there is none."""
    thread = session.get(f'{comment_link}loadThread'
                         f'?parentId={parent_id}'
                         f'&rootId={root_id}'
                         f'&rootType={root_type}'
                         f'&lastViewTime={last_view_time}'
                         f'&_={time_stamp}').json()
    if 'data' not in thread or not thread['data'].get('html'):
        return None
    return thread['data']['html']


def get_replies_thread_soup(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                            last_view_time: str = '', time_stamp: int = 0):
    """Return the replies thread as soup; an empty soup if nothing was loaded."""
    replies_data = load_replies_thread_data(session, parent_id, root_id, root_type, last_view_time, time_stamp)
    if not replies_data:
        # bs4.element.Tag() cannot be constructed without a name, so return an
        # empty soup instead: findAll() on it simply yields no wrappers.
        return BeautifulSoup('', 'lxml')
    return BeautifulSoup(replies_data, 'lxml').find('body')


def parse_comment(session: requests.Session, wrapper: bs4.element.Tag):
    """Recursively parse a single comment wrapper, including its replies."""
    current_comment = {}
    comment = wrapper.find('div', class_='comment')
    comment_id = int(comment.get('data-id'))
    current_comment['id'] = comment_id
    current_comment['thread'] = int(comment.get('data-thread'))
    current_comment['url'] = f'{book_link}{book_id}?c={current_comment["id"]}&th={current_comment["thread"]}'
    c_body = comment.find('div', class_='comment-body')
    current_comment['author'] = c_body.find('span', class_='comment-user-name').text.strip()
    current_comment['date'] = dateutil.parser.parse(c_body.find('time').find('span').get('data-time'))
    comment_text = c_body.find('article').text.strip()
    current_comment['text'] = comment_text if comment_text else 'No text data'
    replies_count = int(wrapper.find('div', class_='comment-toggle-expand').find('span', class_='replies-count').text)
    if replies_count > 0:
        comment_replies = wrapper.find('div', class_='replies')
        first_reply_wrapper = comment_replies.find('div', class_='comment-wrapper')
        # No expanded replies in the page markup: fetch the thread separately.
        if not isinstance(first_reply_wrapper, bs4.element.Tag):
            comment_replies = get_replies_thread_soup(session, comment_id, book_id)
        current_comment['replies'] = [
            parse_comment(session, reply)
            for reply in comment_replies.findAll('div', class_='comment-wrapper', recursive=False)
        ]
    return current_comment


def parse_comments_page(session: requests.Session, comments: bs4.element.Tag):
    """Parse every top-level comment on one comments page."""
    wrappers = comments.findAll('div', class_='comment-wrapper', recursive=False)
    return [parse_comment(session, wrapper) for wrapper in wrappers]


def get_comments_from_page(page: int, session: requests.Session):
    """Download one page of the book and parse its comment section."""
    comments_page = session.get(f'{book_link}{book_id}?page={page}')
    doc = BeautifulSoup(comments_page.text, 'lxml')
    comments = parse_comments_page(session, doc.find('div', class_='comments'))
    print(f'Page processed: {page}')
    return comments


if __name__ == '__main__':
    start_time = datetime.now()
    print(f'Start parsing at {start_time}')
    # A random user agent makes the requests look like an ordinary browser.
    session = requests.Session()
    session.headers.update({'user-agent': fake_useragent.UserAgent().random})
    response = session.get(f'{book_link}{book_id}')
    soup = BeautifulSoup(response.text, 'lxml')
    body = soup.find('div', id='pjax-container')
    # Getting book data
    info = body.find('div', class_='panel book-panel').find('div', class_='book-meta-panel')
    book_data['name'] = info.find('h1', class_='book-title').text.strip()
    book_data['author'] = info.find('div', class_='book-authors').text.strip()
    book_data['url'] = f'{book_link}{book_id}'
    # Getting pages count from the "skip to last" pagination item
    pages_count = int(body.find('div', class_='pagination-container').find('li', class_='skipToLast').text)
    # Getting comments in parallel; requests.Session is picklable, so it can be
    # passed to the worker processes via functools.partial.
    cpu_count = multiprocessing.cpu_count()
    print(f'Count of CPU: {cpu_count}\nCount of pages: {pages_count}')
    with multiprocessing.Pool(cpu_count) as pool:
        comments = [
            comment
            for page_comments in pool.map(partial(get_comments_from_page, session=session), range(1, pages_count + 1))
            for comment in page_comments
        ]
    # Sort newest first
    book_data['comments'] = sorted(comments, key=lambda comment: comment['date'], reverse=True)
    # Save to file (utf-8, since ensure_ascii=False keeps non-ASCII text as-is)
    with open(f"{book_data['name']}.json", 'w', encoding='utf-8') as outfile:
        json.dump(book_data, outfile, ensure_ascii=False, indent=2, default=json_serial)
    end_time = datetime.now()
    print(f'Finish parsing at {end_time}')
    print(f'Time spent: {end_time - start_time}')
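
# For reference, the resulting JSON has roughly this shape (fields as produced
# by parse_comment above; "replies" is present only for comments that have
# replies, and dates are serialized by json_serial):
# {
#   "name": "...", "author": "...", "url": "...",
#   "comments": [
#     {"id": ..., "thread": ..., "url": "...", "author": "...",
#      "date": "...", "text": "...", "replies": [...]}
#   ]
# }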