first commit
commit
fff9b62d72
|
@ -0,0 +1,8 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
|
@ -0,0 +1,10 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,6 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (at_parser)" project-jdk-type="Python SDK" />
|
||||
</project>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/at_parser.iml" filepath="$PROJECT_DIR$/.idea/at_parser.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,131 @@
|
|||
from datetime import datetime
|
||||
import json
|
||||
import bs4
|
||||
import dateutil.parser
|
||||
import requests
|
||||
import fake_useragent
|
||||
from bs4 import BeautifulSoup
|
||||
import multiprocessing
|
||||
from functools import partial
|
||||
from utils import json_serial
|
||||
|
||||
# Base URLs of the author.today site; each one ends with a trailing slash.
main_link = 'https://author.today/'
book_link = f'{main_link}work/'
comment_link = f'{main_link}comment/'

# Numeric id of the work whose comments are scraped.
book_id = 129108

# Collected book metadata and comments; populated by the script entry point.
book_data = dict()
|
||||
|
||||
|
||||
# https://author.today/comment/loadThread?parentId=12457484&rootId=51272&rootType=Work&lastViewTime=&_=1645277190787
|
||||
def load_replies_thread_data(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                             last_view_time: str = '', time_stamp: int = 0):
    """Request the HTML of a reply thread from the ``loadThread`` endpoint.

    :param session: session used to perform the GET request.
    :param parent_id: id of the comment whose replies are requested (``parentId``).
    :param root_id: id of the root object the comment belongs to (``rootId``).
    :param root_type: value sent as ``rootType``.
    :param last_view_time: value sent as ``lastViewTime``.
    :param time_stamp: kept for signature compatibility; it is not sent.
    :return: the non-empty ``data.html`` string of the JSON response,
        or ``None`` when it is missing or empty.
    """
    # comment_link already ends with '/', so the endpoint name is appended
    # directly; prefixing it with another '/' produced '.../comment//loadThread'.
    # The query goes through `params` so requests handles the URL encoding.
    thread = session.get(
        f'{comment_link}loadThread',
        params={
            'parentId': parent_id,
            'rootId': root_id,
            'rootType': root_type,
            'lastViewTime': last_view_time,
        },
    ).json()

    # Tolerate a payload without a usable 'data' object instead of raising.
    data = thread.get('data') if isinstance(thread, dict) else None
    if not isinstance(data, dict):
        return None

    return data.get('html') or None
|
||||
|
||||
|
||||
def get_replies_thread_soup(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                            last_view_time: str = '', time_stamp: int = 0):
    """Load a reply thread and return it as a searchable BeautifulSoup element.

    The parameters are forwarded unchanged to ``load_replies_thread_data``.

    :return: the ``<body>`` element of the parsed thread HTML; when the thread
        has no HTML (or no ``<body>`` was produced) an empty soup object, so
        callers can still call ``find``/``find_all`` on the result.
    """
    replies_data = load_replies_thread_data(session, parent_id, root_id, root_type, last_view_time, time_stamp)

    # An empty soup replaces the former `bs4.element.Tag()` fallback: a Tag
    # cannot be constructed without a name, so that call raised instead of
    # yielding an empty container.
    soup = BeautifulSoup(replies_data or '', 'lxml')
    body = soup.find('body')
    return soup if body is None else body
|
||||
|
||||
|
||||
def parse_comment(session: requests.Session, wrapper: bs4.element.Tag):
    """Convert one ``comment-wrapper`` element into a dict, recursing into replies.

    :param session: session used to fetch reply threads that are not inlined.
    :param wrapper: the ``div.comment-wrapper`` element of a single comment.
    :return: dict with the keys ``id``, ``thread``, ``url``, ``author``,
        ``date`` and ``text``; ``replies`` (a list of dicts of the same shape)
        is present only when the comment's replies counter is above zero.
    """
    current_comment = {}

    comment = wrapper.find('div', class_='comment')
    comment_id = int(comment.get('data-id'))
    current_comment['id'] = comment_id
    current_comment['thread'] = int(comment.get('data-thread'))
    current_comment['url'] = f'{book_link}{book_id}?c={current_comment["id"]}&th={current_comment["thread"]}'

    c_body = comment.find('div', class_='comment-body')
    current_comment['author'] = c_body.find('span', class_='comment-user-name').text.strip()
    current_comment['date'] = dateutil.parser.parse(c_body.find('time').find('span').get('data-time'))

    comment_text = c_body.find('article').text.strip()
    current_comment['text'] = comment_text if comment_text else 'No text data'

    replies_count = int(wrapper.find('div', class_='comment-toggle-expand').find('span', class_='replies-count').text)
    if replies_count > 0:
        comment_replies = wrapper.find('div', class_='replies')
        first_reply_wrapper = comment_replies.find('div', class_='comment-wrapper')

        # The counter is positive but no reply markup is inlined: the thread
        # is collapsed, so its HTML has to be requested separately.
        if not isinstance(first_reply_wrapper, bs4.element.Tag):
            comment_replies = get_replies_thread_soup(session, comment_id, book_id)

        # recursive=False keeps this to direct replies; deeper levels are
        # handled by the recursive parse_comment call on each reply.
        current_comment['replies'] = [
            parse_comment(session, reply)
            for reply in comment_replies.find_all('div', class_='comment-wrapper', recursive=False)
        ]

    return current_comment
|
||||
|
||||
|
||||
def parse_comments_page(session: requests.Session, comments: bs4.element.Tag):
    """Parse every top-level comment found directly inside ``comments``.

    :param session: session passed through to ``parse_comment``.
    :param comments: container element whose direct ``div.comment-wrapper``
        children are the comments to parse.
    :return: list with one ``parse_comment`` result per wrapper, in document order.
    """
    parsed = []
    # Only direct children: nested wrappers are replies, which
    # parse_comment walks on its own.
    for top_level_wrapper in comments.findAll('div', class_='comment-wrapper', recursive=False):
        parsed.append(parse_comment(session, top_level_wrapper))
    return parsed
|
||||
|
||||
|
||||
def get_comments_from_page(page: int, session: requests.Session):
    """Download one comments page of the book and parse its comments.

    :param page: page number placed in the ``page`` query parameter.
    :param session: session used for the page request and any reply-thread requests.
    :return: list of comment dicts produced by ``parse_comments_page``.
    """
    page_html = session.get(f'{book_link}{book_id}?page={page}').text
    comments_root = BeautifulSoup(page_html, 'lxml').find('div', class_='comments')
    parsed = parse_comments_page(session, comments_root)

    # Progress output, printed once the page has been parsed.
    print(f'Page processed: {page}')

    return parsed
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: scrape the book's metadata and all of its comment
    # pages, then dump the result to "<book name>.json".
    start_time = datetime.now()
    print(f'Start parsing at {start_time}')

    user = fake_useragent.UserAgent().random
    header = {'user-agent': user}

    session = requests.session()
    # Pass the mapping itself: splatting it as keyword arguments relies on a
    # hyphenated key being accepted as a keyword name.
    session.headers.update(header)

    response = session.get(f'{book_link}{book_id}')
    soup = BeautifulSoup(response.text, 'lxml')
    body = soup.find('div', id='pjax-container')

    # Getting book data
    info = body.find('div', class_='panel book-panel').find('div', class_='book-meta-panel')
    book_data['name'] = info.find('h1', class_='book-title').text.strip()
    book_data['author'] = info.find('div', class_='book-authors').text.strip()
    book_data['url'] = f'{book_link}{book_id}'

    # Getting pages count
    pages_count = int(body.find('div', class_='pagination-container').find('li', class_='skipToLast').text)

    # Getting comments: one task per page, spread over a pool of worker processes
    cpu_count = multiprocessing.cpu_count()
    print(f'Count of CPU: {cpu_count}\nCount of pages: {pages_count}')
    with multiprocessing.Pool(cpu_count) as process:
        pages = process.map(partial(get_comments_from_page, session=session), range(1, pages_count + 1))

    # Flatten the per-page lists into a single list of top-level comments.
    comments = [comment for page_comments in pages for comment in page_comments]

    # Sorting: newest comments first
    book_data['comments'] = sorted(comments, key=lambda comment: comment['date'], reverse=True)

    # Save to file. ensure_ascii=False writes non-ASCII text verbatim, so the
    # file encoding is fixed to UTF-8 rather than left to the platform default.
    with open(f"{book_data['name']}.json", "w", encoding="utf-8") as outfile:
        json.dump(book_data, outfile, ensure_ascii=False, indent=2, default=json_serial)

    end_time = datetime.now()
    print(f'Finish parsing at {end_time}')
    print(f'Time spend: {end_time - start_time}')
|
|
@ -0,0 +1,6 @@
|
|||
# Add python dependencies here.
|
||||
requests
|
||||
bs4
|
||||
lxml
|
||||
fake_useragent
|
||||
python-dateutil
|
|
@ -0,0 +1,127 @@
|
|||
<div class="comment-wrapper expanded-replies">
|
||||
<span class="anchor" id="comment_12636233"></span>
|
||||
|
||||
<div class="comment c-view c-allow-edit " data-id="12636233" data-level="2" data-thread="12629941"
|
||||
data-is-ignored="false" data-is-pinned="false">
|
||||
<div class="comment-avatar">
|
||||
<a href="/u/dima2914">
|
||||
<img class=""
|
||||
src="https://cm.author.today/content/2021/12/12/u/dima2914_637749114653718777.jpg?width=70&height=70&mode=crop"/>
|
||||
</a>
|
||||
</div>
|
||||
<div class="comment-body">
|
||||
<header class="">
|
||||
<a href="/u/dima2914">
|
||||
<span class="comment-user-name">pou29</span>
|
||||
</a>
|
||||
<time>
|
||||
<span class="hint-top" data-format="calendar" data-time="2022-02-15T10:55:31.8030000Z"></span>
|
||||
</time>
|
||||
<div class="comment-tool">
|
||||
<a data-clipboard="?c=12636233&th=12629941" class="support-link hint-top comment-url"
|
||||
data-hint="Ссылка на комментарий #12636233">#</a>
|
||||
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
|
||||
onclick="AppUtils.Comment.scrollToParent('#comment_12634372')">↑</a>
|
||||
</div>
|
||||
</header>
|
||||
<article>
|
||||
<div class="rich-content fr-view"><p>Я очень долго окладывал это произведение!! Ну не шло совсем!!! Еле
|
||||
осилил 10 глав в первом томе! А потом откладывал, откладывал.... какой же я был глупец<img
|
||||
class="emojione" alt="😇"
|
||||
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/> </p></div>
|
||||
</article>
|
||||
|
||||
<footer>
|
||||
<div class="rating-count comment-rating-count hint-bottom-right" data-vote-id=""
|
||||
data-hint="Рейтинг комментария: 👍3 и 👎0">+3
|
||||
</div>
|
||||
|
||||
|
||||
<a class="comment-action action-abuse"
|
||||
onclick="AppComponents.FeedbackForm.show(12636233, 'Comment', '/work/51272?c=12636233&th=12629941')">пожаловаться</a>
|
||||
|
||||
</footer>
|
||||
<div class="reply-placeholder">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
|
||||
<i class="icon-minus-square-o"></i>
|
||||
</div>
|
||||
|
||||
<div class="comment-toggle comment-toggle-expand" data-state="loaded"
|
||||
onclick="AppUtils.Comment.expand(this, 12636233, 51272, 'Work')">
|
||||
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
|
||||
<span class="replies-count">1</span> <i class="icon-comments"></i>
|
||||
</div>
|
||||
|
||||
<div class="replies">
|
||||
|
||||
|
||||
<div class="comment-wrapper ">
|
||||
<span class="anchor" id="comment_12636484"></span>
|
||||
|
||||
<div class="comment c-view c-allow-edit " data-id="12636484" data-level="3" data-thread="12629941"
|
||||
data-is-ignored="false" data-is-pinned="false">
|
||||
<div class="comment-avatar">
|
||||
<a href="/u/id54937541">
|
||||
<img class=""
|
||||
src="https://cm.author.today/content/2019/02/05/u/id54937541_636849751708919048.jpg?width=70&height=70&mode=crop"/>
|
||||
</a>
|
||||
</div>
|
||||
<div class="comment-body">
|
||||
<header class="">
|
||||
<a href="/u/id54937541">
|
||||
<span class="comment-user-name">Михаил Игнатов</span>
|
||||
</a>
|
||||
<span class="label label-primary pull-left">автор</span>
|
||||
<time>
|
||||
<span class="hint-top" data-format="calendar"
|
||||
data-time="2022-02-15T11:16:11.0830000Z"></span>
|
||||
</time>
|
||||
<div class="comment-tool">
|
||||
<a data-clipboard="?c=12636484&th=12629941" class="support-link hint-top comment-url"
|
||||
data-hint="Ссылка на комментарий #12636484">#</a>
|
||||
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
|
||||
onclick="AppUtils.Comment.scrollToParent('#comment_12636233')">↑</a>
|
||||
</div>
|
||||
</header>
|
||||
<article>
|
||||
<div class="rich-content fr-view"><p>Автор крут, я знаю. <img class="emojione" alt="😇"
|
||||
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/>
|
||||
</p></div>
|
||||
</article>
|
||||
|
||||
<footer>
|
||||
<div class="rating-count comment-rating-count hint-bottom-right" data-vote-id=""
|
||||
data-hint="Рейтинг комментария: 👍7 и 👎0">+7
|
||||
</div>
|
||||
|
||||
|
||||
<a class="comment-action action-abuse"
|
||||
onclick="AppComponents.FeedbackForm.show(12636484, 'Comment', '/work/51272?c=12636484&th=12629941')">пожаловаться</a>
|
||||
|
||||
</footer>
|
||||
<div class="reply-placeholder">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
|
||||
<i class="icon-minus-square-o"></i>
|
||||
</div>
|
||||
|
||||
<div class="comment-toggle comment-toggle-expand" data-state="none"
|
||||
onclick="AppUtils.Comment.expand(this, 12636484, 51272, 'Work')">
|
||||
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
|
||||
<span class="replies-count">0</span> <i class="icon-comments"></i>
|
||||
</div>
|
||||
|
||||
<div class="replies">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
|
@ -0,0 +1,163 @@
|
|||
<html>
|
||||
<body>
|
||||
<div class="comment-wrapper">
|
||||
<span class="anchor" id="comment_12721746"></span>
|
||||
<div class="comment c-view c-allow-edit" data-id="12721746" data-is-ignored="false" data-is-pinned="false"
|
||||
data-level="2" data-thread="12629941">
|
||||
<div class="comment-avatar">
|
||||
<a href="/u/ab_random">
|
||||
<img class=""
|
||||
src="https://cm.author.today/content/2020/03/04/u/ab_random_637189212355741343.jpg?width=70&height=70&mode=crop"/>
|
||||
</a>
|
||||
</div>
|
||||
<div class="comment-body">
|
||||
<header class="">
|
||||
<a href="/u/ab_random">
|
||||
<span class="comment-user-name">ab.random</span>
|
||||
</a>
|
||||
<time>
|
||||
<span class="hint-top" data-format="calendar" data-time="2022-02-20T19:50:14.6100000Z"></span>
|
||||
</time>
|
||||
<div class="comment-tool">
|
||||
<a class="support-link hint-top comment-url" data-clipboard="?c=12721746&th=12629941"
|
||||
data-hint="Ссылка на комментарий #12721746">#</a>
|
||||
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
|
||||
onclick="AppUtils.Comment.scrollToParent('#comment_12634372')">↑</a>
|
||||
</div>
|
||||
</header>
|
||||
<article>
|
||||
<div class="rich-content fr-view"><p>Этот коммент не несёт в себе смысловой нагрузки и нужен лишь для
|
||||
эксперимента. Можете не обращать на него внимание. Спасибо.</p></div>
|
||||
</article>
|
||||
<footer>
|
||||
<div class="rating-count comment-rating-count hint-bottom-right"
|
||||
data-hint="Рейтинг комментария: 👍0 и 👎0" data-vote-id="">0
|
||||
</div>
|
||||
<a class="comment-action action-abuse"
|
||||
onclick="AppComponents.FeedbackForm.show(12721746, 'Comment', '/work/51272?c=12721746&th=12629941')">пожаловаться</a>
|
||||
</footer>
|
||||
<div class="reply-placeholder">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
|
||||
<i class="icon-minus-square-o"></i>
|
||||
</div>
|
||||
<div class="comment-toggle comment-toggle-expand" data-state="none"
|
||||
onclick="AppUtils.Comment.expand(this, 12721746, 51272, 'Work')">
|
||||
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
|
||||
<span class="replies-count">0</span> <i class="icon-comments"></i>
|
||||
</div>
|
||||
<div class="replies">
|
||||
</div>
|
||||
</div>
|
||||
<div class="comment-wrapper expanded-replies">
|
||||
<span class="anchor" id="comment_12636233"></span>
|
||||
<div class="comment c-view c-allow-edit" data-id="12636233" data-is-ignored="false" data-is-pinned="false"
|
||||
data-level="2" data-thread="12629941">
|
||||
<div class="comment-avatar">
|
||||
<a href="/u/dima2914">
|
||||
<img class=""
|
||||
src="https://cm.author.today/content/2021/12/12/u/dima2914_637749114653718777.jpg?width=70&height=70&mode=crop"/>
|
||||
</a>
|
||||
</div>
|
||||
<div class="comment-body">
|
||||
<header class="">
|
||||
<a href="/u/dima2914">
|
||||
<span class="comment-user-name">pou29</span>
|
||||
</a>
|
||||
<time>
|
||||
<span class="hint-top" data-format="calendar" data-time="2022-02-15T10:55:31.8030000Z"></span>
|
||||
</time>
|
||||
<div class="comment-tool">
|
||||
<a class="support-link hint-top comment-url" data-clipboard="?c=12636233&th=12629941"
|
||||
data-hint="Ссылка на комментарий #12636233">#</a>
|
||||
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
|
||||
onclick="AppUtils.Comment.scrollToParent('#comment_12634372')">↑</a>
|
||||
</div>
|
||||
</header>
|
||||
<article>
|
||||
<div class="rich-content fr-view"><p>Я очень долго окладывал это произведение!! Ну не шло совсем!!! Еле
|
||||
осилил 10 глав в первом томе! А потом откладывал, откладывал.... какой же я был глупец<img alt="😇"
|
||||
class="emojione"
|
||||
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/>
|
||||
</p></div>
|
||||
</article>
|
||||
<footer>
|
||||
<div class="rating-count comment-rating-count hint-bottom-right"
|
||||
data-hint="Рейтинг комментария: 👍3 и 👎0" data-vote-id="">+3
|
||||
</div>
|
||||
<a class="comment-action action-abuse"
|
||||
onclick="AppComponents.FeedbackForm.show(12636233, 'Comment', '/work/51272?c=12636233&th=12629941')">пожаловаться</a>
|
||||
</footer>
|
||||
<div class="reply-placeholder">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
|
||||
<i class="icon-minus-square-o"></i>
|
||||
</div>
|
||||
<div class="comment-toggle comment-toggle-expand" data-state="loaded"
|
||||
onclick="AppUtils.Comment.expand(this, 12636233, 51272, 'Work')">
|
||||
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
|
||||
<span class="replies-count">1</span> <i class="icon-comments"></i>
|
||||
</div>
|
||||
<div class="replies">
|
||||
<div class="comment-wrapper">
|
||||
<span class="anchor" id="comment_12636484"></span>
|
||||
<div class="comment c-view c-allow-edit" data-id="12636484" data-is-ignored="false" data-is-pinned="false"
|
||||
data-level="3" data-thread="12629941">
|
||||
<div class="comment-avatar">
|
||||
<a href="/u/id54937541">
|
||||
<img class=""
|
||||
src="https://cm.author.today/content/2019/02/05/u/id54937541_636849751708919048.jpg?width=70&height=70&mode=crop"/>
|
||||
</a>
|
||||
</div>
|
||||
<div class="comment-body">
|
||||
<header class="">
|
||||
<a href="/u/id54937541">
|
||||
<span class="comment-user-name">Михаил Игнатов</span>
|
||||
</a>
|
||||
<span class="label label-primary pull-left">автор</span>
|
||||
<time>
|
||||
<span class="hint-top" data-format="calendar"
|
||||
data-time="2022-02-15T11:16:11.0830000Z"></span>
|
||||
</time>
|
||||
<div class="comment-tool">
|
||||
<a class="support-link hint-top comment-url" data-clipboard="?c=12636484&th=12629941"
|
||||
data-hint="Ссылка на комментарий #12636484">#</a>
|
||||
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
|
||||
onclick="AppUtils.Comment.scrollToParent('#comment_12636233')">↑</a>
|
||||
</div>
|
||||
</header>
|
||||
<article>
|
||||
<div class="rich-content fr-view"><p>Автор крут, я знаю. <img alt="😇" class="emojione"
|
||||
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/>
|
||||
</p></div>
|
||||
</article>
|
||||
<footer>
|
||||
<div class="rating-count comment-rating-count hint-bottom-right"
|
||||
data-hint="Рейтинг комментария: 👍7 и 👎0" data-vote-id="">+7
|
||||
</div>
|
||||
<a class="comment-action action-abuse"
|
||||
onclick="AppComponents.FeedbackForm.show(12636484, 'Comment', '/work/51272?c=12636484&th=12629941')">пожаловаться</a>
|
||||
</footer>
|
||||
<div class="reply-placeholder">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
|
||||
<i class="icon-minus-square-o"></i>
|
||||
</div>
|
||||
<div class="comment-toggle comment-toggle-expand" data-state="none"
|
||||
onclick="AppUtils.Comment.expand(this, 12636484, 51272, 'Work')">
|
||||
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
|
||||
<span class="replies-count">0</span> <i class="icon-comments"></i>
|
||||
</div>
|
||||
<div class="replies">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue