first commit

master
alex 2022-04-27 15:02:53 +03:00
commit fff9b62d72
20 changed files with 761002 additions and 0 deletions

8
.idea/.gitignore vendored 100644
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml 100644
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (at_parser)" project-jdk-type="Python SDK" />
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/at_parser.iml" filepath="$PROJECT_DIR$/.idea/at_parser.iml" />
</modules>
</component>
</project>

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

131
main.py 100644
View File

@ -0,0 +1,131 @@
from datetime import datetime
import json
import bs4
import dateutil.parser
import requests
import fake_useragent
from bs4 import BeautifulSoup
import multiprocessing
from functools import partial
from utils import json_serial
# Site root; every other URL used by the scraper is derived from it.
main_link = 'https://author.today/'
# Prefix of a work (book) page; the numeric book id is appended to it.
book_link = f'{main_link}work/'
# Prefix of the comment endpoints.
comment_link = f'{main_link}comment/'
# Id of the work whose comments are collected.
book_id = 129108
# Accumulates the scraped result; populated by the script entry point.
book_data = dict()
# Example request:
# https://author.today/comment/loadThread?parentId=12457484&rootId=51272&rootType=Work&lastViewTime=&_=1645277190787
def load_replies_thread_data(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                             last_view_time: str = '', time_stamp: int = 0):
    """Request the reply thread of a comment and return its HTML fragment.

    :param session: HTTP session used to perform the GET request.
    :param parent_id: id of the comment whose replies are requested.
    :param root_id: id of the root object the comment belongs to.
    :param root_type: kind of the root object, sent as ``rootType``.
    :param last_view_time: value sent as ``lastViewTime``.
    :param time_stamp: accepted for interface compatibility; it is not sent
        with the request.
    :return: the non-empty ``data.html`` string of the JSON response, or
        ``None`` when that field is missing or empty.
    """
    # comment_link already ends with '/'; concatenating '/loadThread' to it
    # verbatim produced '.../comment//loadThread'.
    url = f"{comment_link.rstrip('/')}/loadThread"
    # Let requests build (and escape) the query string.
    params = {
        'parentId': parent_id,
        'rootId': root_id,
        'rootType': root_type,
        'lastViewTime': last_view_time,
    }
    thread = session.get(url, params=params).json()
    data = thread.get('data') if isinstance(thread, dict) else None
    if not isinstance(data, dict):
        # Covers a missing key as well as an explicit JSON null.
        return None
    return data.get('html') or None
def get_replies_thread_soup(session: requests.Session, parent_id: int, root_id: int, root_type: str = 'Work',
                            last_view_time: str = '', time_stamp: int = 0):
    """Load the reply thread of a comment and return it as a parsed tag.

    The arguments are forwarded unchanged to ``load_replies_thread_data``.

    :return: the ``<body>`` element of the parsed reply HTML. When no reply
        HTML was returned (or no ``<body>`` could be found) an empty parsed
        document is returned, so callers can still search it and simply get
        no matches.
    """
    replies_data = load_replies_thread_data(session, parent_id, root_id, root_type, last_view_time, time_stamp)
    # bs4.element.Tag() cannot be instantiated without a tag name (it raises
    # ValueError), so an empty BeautifulSoup document - itself a Tag - is used
    # as the "no replies" placeholder instead.
    soup = BeautifulSoup(replies_data or '', 'lxml')
    body = soup.find('body')
    return soup if body is None else body
def parse_comment(session: requests.Session, wrapper: bs4.element.Tag):
    """Convert one ``comment-wrapper`` element into a dictionary.

    :param session: HTTP session, used to download reply threads that are not
        embedded in ``wrapper``.
    :param wrapper: the ``div.comment-wrapper`` element of a single comment.
    :return: dict with the keys ``id``, ``thread``, ``url``, ``author``,
        ``date`` (parsed from the ``data-time`` attribute) and ``text``; a
        ``replies`` list of dicts of the same shape is added only when the
        reply counter of the comment is greater than zero.
    """
    current_comment = {}
    comment = wrapper.find('div', class_='comment')
    comment_id = int(comment.get('data-id'))
    current_comment['id'] = comment_id
    current_comment['thread'] = int(comment.get('data-thread'))
    current_comment['url'] = f'{book_link}{book_id}?c={current_comment["id"]}&th={current_comment["thread"]}'

    c_body = comment.find('div', class_='comment-body')
    current_comment['author'] = c_body.find('span', class_='comment-user-name').text.strip()
    current_comment['date'] = dateutil.parser.parse(c_body.find('time').find('span').get('data-time'))
    comment_text = c_body.find('article').text.strip()
    current_comment['text'] = comment_text if comment_text else 'No text data'

    replies_count = int(
        wrapper.find('div', class_='comment-toggle-expand').find('span', class_='replies-count').text
    )
    if replies_count > 0:
        comment_replies = wrapper.find('div', class_='replies')
        # The counter says there are replies, but they are not embedded in the
        # page (container missing or without any comment wrapper): fetch the
        # thread explicitly.
        if comment_replies is None or comment_replies.find('div', class_='comment-wrapper') is None:
            comment_replies = get_replies_thread_soup(session, comment_id, book_id)
        # recursive=False: nested replies are handled by the recursive call.
        current_comment['replies'] = [
            parse_comment(session, reply)
            for reply in comment_replies.find_all('div', class_='comment-wrapper', recursive=False)
        ]
    return current_comment
def parse_comments_page(session: requests.Session, comments: bs4.element.Tag):
    """Parse every top-level comment found directly inside ``comments``.

    :param session: HTTP session passed on to ``parse_comment``.
    :param comments: container element holding ``div.comment-wrapper``
        children; ``None`` (e.g. the result of a failed ``find``) is accepted.
    :return: list of comment dicts, empty when ``comments`` is ``None`` or has
        no direct comment wrappers.
    """
    if comments is None:
        # The page had no comments container; nothing to parse.
        return []
    wrappers = comments.find_all('div', class_='comment-wrapper', recursive=False)
    return [parse_comment(session, wrapper) for wrapper in wrappers]
def get_comments_from_page(page: int, session: requests.Session):
    """Download one comments page of the book and return its parsed comments.

    :param page: page number, sent as the ``page`` query parameter.
    :param session: HTTP session used for the request and for nested lookups.
    :return: whatever ``parse_comments_page`` produces for the page's
        ``div.comments`` element.
    """
    page_url = f'{book_link}{book_id}?page={page}'
    html = session.get(page_url).text
    comments_root = BeautifulSoup(html, 'lxml').find('div', class_='comments')
    parsed = parse_comments_page(session, comments_root)
    print(f'Page processed: {page}')
    return parsed
if __name__ == '__main__':
    # Script entry point: read the book header, collect the comments of every
    # page in a process pool, sort them newest-first and dump all to JSON.
    start_time = datetime.now()
    print(f'Start parsing at {start_time}')

    session = requests.session()
    # Pass the mapping itself; unpacking it as keyword arguments relied on a
    # key ('user-agent') that is not a valid identifier.
    session.headers.update({'user-agent': fake_useragent.UserAgent().random})

    response = session.get(f'{book_link}{book_id}')
    soup = BeautifulSoup(response.text, 'lxml')
    body = soup.find('div', id='pjax-container')

    # Getting book data
    info = body.find('div', class_='panel book-panel').find('div', class_='book-meta-panel')
    book_data['name'] = info.find('h1', class_='book-title').text.strip()
    book_data['author'] = info.find('div', class_='book-authors').text.strip()
    book_data['url'] = f'{book_link}{book_id}'

    # Getting pages count
    # NOTE(review): assumes a work whose comments fit on a single page renders
    # no pagination block - confirm against the site.
    pagination = body.find('div', class_='pagination-container')
    last_page = pagination.find('li', class_='skipToLast') if pagination is not None else None
    pages_count = int(last_page.text) if last_page is not None else 1

    # Getting comments
    cpu_count = multiprocessing.cpu_count()
    print(f'Count of CPU: {cpu_count}\nCount of pages: {pages_count}')
    with multiprocessing.Pool(cpu_count) as process:
        pages = process.map(partial(get_comments_from_page, session=session), range(1, pages_count + 1))
    # Flatten the per-page lists into one list of comments.
    comments = [comment for page_comments in pages for comment in page_comments]

    # Sorting
    book_data['comments'] = sorted(comments, key=lambda comment: comment['date'], reverse=True)

    # Save to file
    # Characters that are not allowed in file names on common file systems are
    # replaced, so a title containing e.g. '/' or '?' cannot make the final
    # write fail after all pages were already processed.
    safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in book_data['name'])
    # ensure_ascii=False writes raw non-ASCII text, so the encoding must be
    # explicit instead of depending on the platform default.
    with open(f'{safe_name}.json', 'w', encoding='utf-8') as outfile:
        json.dump(book_data, outfile, ensure_ascii=False, indent=2, default=json_serial)

    end_time = datetime.now()
    print(f'Finish parsing at {end_time}')
    print(f'Time spend: {end_time - start_time}')

6
requirements.txt 100644
View File

@ -0,0 +1,6 @@
# Add python dependencies here.
requests
bs4
lxml
fake_useragent
python-dateutil

127
temp.html 100644
View File

@ -0,0 +1,127 @@
<div class="comment-wrapper expanded-replies">
<span class="anchor" id="comment_12636233"></span>
<div class="comment c-view c-allow-edit " data-id="12636233" data-level="2" data-thread="12629941"
data-is-ignored="false" data-is-pinned="false">
<div class="comment-avatar">
<a href="/u/dima2914">
<img class=""
src="https://cm.author.today/content/2021/12/12/u/dima2914_637749114653718777.jpg?width=70&amp;height=70&amp;mode=crop"/>
</a>
</div>
<div class="comment-body">
<header class="">
<a href="/u/dima2914">
<span class="comment-user-name">pou29</span>
</a>
<time>
<span class="hint-top" data-format="calendar" data-time="2022-02-15T10:55:31.8030000Z"></span>
</time>
<div class="comment-tool">
<a data-clipboard="?c=12636233&amp;th=12629941" class="support-link hint-top comment-url"
data-hint="Ссылка на комментарий #12636233">#</a>
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
onclick="AppUtils.Comment.scrollToParent('#comment_12634372')">↑</a>
</div>
</header>
<article>
<div class="rich-content fr-view"><p>Я очень долго окладывал это произведение!! Ну не шло совсем!!! Еле
осилил 10 глав в первом томе! А потом откладывал, откладывал.... какой же я был глупец<img
class="emojione" alt="😇"
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/>&nbsp;</p></div>
</article>
<footer>
<div class="rating-count comment-rating-count hint-bottom-right" data-vote-id=""
data-hint="Рейтинг комментария: 👍3 и 👎0">+3
</div>
<a class="comment-action action-abuse"
onclick="AppComponents.FeedbackForm.show(12636233, 'Comment', '/work/51272?c=12636233&amp;th=12629941')">пожаловаться</a>
</footer>
<div class="reply-placeholder">
</div>
</div>
</div>
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
<i class="icon-minus-square-o"></i>
</div>
<div class="comment-toggle comment-toggle-expand" data-state="loaded"
onclick="AppUtils.Comment.expand(this, 12636233, 51272, 'Work')">
<i class="icon-plus-square-o"></i>&nbsp;<span class="toggle-text">раскрыть ветвь</span>
&nbsp;<span class="replies-count">1</span> <i class="icon-comments"></i>
</div>
<div class="replies">
<div class="comment-wrapper ">
<span class="anchor" id="comment_12636484"></span>
<div class="comment c-view c-allow-edit " data-id="12636484" data-level="3" data-thread="12629941"
data-is-ignored="false" data-is-pinned="false">
<div class="comment-avatar">
<a href="/u/id54937541">
<img class=""
src="https://cm.author.today/content/2019/02/05/u/id54937541_636849751708919048.jpg?width=70&amp;height=70&amp;mode=crop"/>
</a>
</div>
<div class="comment-body">
<header class="">
<a href="/u/id54937541">
<span class="comment-user-name">Михаил Игнатов</span>
</a>
<span class="label label-primary pull-left">автор</span>
<time>
<span class="hint-top" data-format="calendar"
data-time="2022-02-15T11:16:11.0830000Z"></span>
</time>
<div class="comment-tool">
<a data-clipboard="?c=12636484&amp;th=12629941" class="support-link hint-top comment-url"
data-hint="Ссылка на комментарий #12636484">#</a>
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
onclick="AppUtils.Comment.scrollToParent('#comment_12636233')">↑</a>
</div>
</header>
<article>
<div class="rich-content fr-view"><p>Автор крут, я знаю. <img class="emojione" alt="😇"
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/>&nbsp;
</p></div>
</article>
<footer>
<div class="rating-count comment-rating-count hint-bottom-right" data-vote-id=""
data-hint="Рейтинг комментария: 👍7 и 👎0">+7
</div>
<a class="comment-action action-abuse"
onclick="AppComponents.FeedbackForm.show(12636484, 'Comment', '/work/51272?c=12636484&amp;th=12629941')">пожаловаться</a>
</footer>
<div class="reply-placeholder">
</div>
</div>
</div>
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
<i class="icon-minus-square-o"></i>
</div>
<div class="comment-toggle comment-toggle-expand" data-state="none"
onclick="AppUtils.Comment.expand(this, 12636484, 51272, 'Work')">
<i class="icon-plus-square-o"></i>&nbsp;<span class="toggle-text">раскрыть ветвь</span>
&nbsp;<span class="replies-count">0</span> <i class="icon-comments"></i>
</div>
<div class="replies">
</div>
</div>
</div>
</div>

54
temp.json 100644

File diff suppressed because one or more lines are too long

163
temp2.html 100644
View File

@ -0,0 +1,163 @@
<html>
<body>
<div class="comment-wrapper">
<span class="anchor" id="comment_12721746"></span>
<div class="comment c-view c-allow-edit" data-id="12721746" data-is-ignored="false" data-is-pinned="false"
data-level="2" data-thread="12629941">
<div class="comment-avatar">
<a href="/u/ab_random">
<img class=""
src="https://cm.author.today/content/2020/03/04/u/ab_random_637189212355741343.jpg?width=70&amp;height=70&amp;mode=crop"/>
</a>
</div>
<div class="comment-body">
<header class="">
<a href="/u/ab_random">
<span class="comment-user-name">ab.random</span>
</a>
<time>
<span class="hint-top" data-format="calendar" data-time="2022-02-20T19:50:14.6100000Z"></span>
</time>
<div class="comment-tool">
<a class="support-link hint-top comment-url" data-clipboard="?c=12721746&amp;th=12629941"
data-hint="Ссылка на комментарий #12721746">#</a>
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
onclick="AppUtils.Comment.scrollToParent('#comment_12634372')">↑</a>
</div>
</header>
<article>
<div class="rich-content fr-view"><p>Этот коммент не несёт в себе смысловой нагрузки и нужен лишь для
эксперимента. Можете не обращать на него внимание. Спасибо.</p></div>
</article>
<footer>
<div class="rating-count comment-rating-count hint-bottom-right"
data-hint="Рейтинг комментария: 👍0 и 👎0" data-vote-id="">0
</div>
<a class="comment-action action-abuse"
onclick="AppComponents.FeedbackForm.show(12721746, 'Comment', '/work/51272?c=12721746&amp;th=12629941')">пожаловаться</a>
</footer>
<div class="reply-placeholder">
</div>
</div>
</div>
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
<i class="icon-minus-square-o"></i>
</div>
<div class="comment-toggle comment-toggle-expand" data-state="none"
onclick="AppUtils.Comment.expand(this, 12721746, 51272, 'Work')">
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
 <span class="replies-count">0</span> <i class="icon-comments"></i>
</div>
<div class="replies">
</div>
</div>
<div class="comment-wrapper expanded-replies">
<span class="anchor" id="comment_12636233"></span>
<div class="comment c-view c-allow-edit" data-id="12636233" data-is-ignored="false" data-is-pinned="false"
data-level="2" data-thread="12629941">
<div class="comment-avatar">
<a href="/u/dima2914">
<img class=""
src="https://cm.author.today/content/2021/12/12/u/dima2914_637749114653718777.jpg?width=70&amp;height=70&amp;mode=crop"/>
</a>
</div>
<div class="comment-body">
<header class="">
<a href="/u/dima2914">
<span class="comment-user-name">pou29</span>
</a>
<time>
<span class="hint-top" data-format="calendar" data-time="2022-02-15T10:55:31.8030000Z"></span>
</time>
<div class="comment-tool">
<a class="support-link hint-top comment-url" data-clipboard="?c=12636233&amp;th=12629941"
data-hint="Ссылка на комментарий #12636233">#</a>
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
onclick="AppUtils.Comment.scrollToParent('#comment_12634372')">↑</a>
</div>
</header>
<article>
<div class="rich-content fr-view"><p>Я очень долго окладывал это произведение!! Ну не шло совсем!!! Еле
осилил 10 глав в первом томе! А потом откладывал, откладывал.... какой же я был глупец<img alt="😇"
class="emojione"
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/> 
</p></div>
</article>
<footer>
<div class="rating-count comment-rating-count hint-bottom-right"
data-hint="Рейтинг комментария: 👍3 и 👎0" data-vote-id="">+3
</div>
<a class="comment-action action-abuse"
onclick="AppComponents.FeedbackForm.show(12636233, 'Comment', '/work/51272?c=12636233&amp;th=12629941')">пожаловаться</a>
</footer>
<div class="reply-placeholder">
</div>
</div>
</div>
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
<i class="icon-minus-square-o"></i>
</div>
<div class="comment-toggle comment-toggle-expand" data-state="loaded"
onclick="AppUtils.Comment.expand(this, 12636233, 51272, 'Work')">
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
 <span class="replies-count">1</span> <i class="icon-comments"></i>
</div>
<div class="replies">
<div class="comment-wrapper">
<span class="anchor" id="comment_12636484"></span>
<div class="comment c-view c-allow-edit" data-id="12636484" data-is-ignored="false" data-is-pinned="false"
data-level="3" data-thread="12629941">
<div class="comment-avatar">
<a href="/u/id54937541">
<img class=""
src="https://cm.author.today/content/2019/02/05/u/id54937541_636849751708919048.jpg?width=70&amp;height=70&amp;mode=crop"/>
</a>
</div>
<div class="comment-body">
<header class="">
<a href="/u/id54937541">
<span class="comment-user-name">Михаил Игнатов</span>
</a>
<span class="label label-primary pull-left">автор</span>
<time>
<span class="hint-top" data-format="calendar"
data-time="2022-02-15T11:16:11.0830000Z"></span>
</time>
<div class="comment-tool">
<a class="support-link hint-top comment-url" data-clipboard="?c=12636484&amp;th=12629941"
data-hint="Ссылка на комментарий #12636484">#</a>
<a class="support-link to-parent hint-top" data-hint="Показать родительский комментарий"
onclick="AppUtils.Comment.scrollToParent('#comment_12636233')">↑</a>
</div>
</header>
<article>
<div class="rich-content fr-view"><p>Автор крут, я знаю. <img alt="😇" class="emojione"
src="https://cdn.jsdelivr.net/emojione/assets/3.0/png/64/1f607.png?v=3.0"/> 
</p></div>
</article>
<footer>
<div class="rating-count comment-rating-count hint-bottom-right"
data-hint="Рейтинг комментария: 👍7 и 👎0" data-vote-id="">+7
</div>
<a class="comment-action action-abuse"
onclick="AppComponents.FeedbackForm.show(12636484, 'Comment', '/work/51272?c=12636484&amp;th=12629941')">пожаловаться</a>
</footer>
<div class="reply-placeholder">
</div>
</div>
</div>
<div class="comment-toggle comment-toggle-collapse" onclick="AppUtils.Comment.collapse(this)">
<i class="icon-minus-square-o"></i>
</div>
<div class="comment-toggle comment-toggle-expand" data-state="none"
onclick="AppUtils.Comment.expand(this, 12636484, 51272, 'Work')">
<i class="icon-plus-square-o"></i> <span class="toggle-text">раскрыть ветвь</span>
 <span class="replies-count">0</span> <i class="icon-comments"></i>
</div>
<div class="replies">
</div>
</div>
</div>
</div>
</body>
</html>

8
utils.py 100644
View File

@ -0,0 +1,8 @@
from datetime import date, datetime
def json_serial(obj):
    """Fallback for ``json.dump``'s ``default=`` hook.

    Dates and datetimes are rendered through ``isoformat()``; any other
    object is rejected with ``TypeError``.
    """
    if not isinstance(obj, (datetime, date)):
        raise TypeError("Type %s not serializable" % type(obj))
    return obj.isoformat()