|
|
class Doc:
    """A text document parsed into typed lines and a tree of containers.

    Attributes set by ``__init__``:
        params:    parsing configuration shared with ``Line`` and ``Container``.
        lines:     ``Line`` objects for every non-blank line (minus the title
                   line when it was consumed by ``_get_title``).
        title:     document title.
        container: root ``Container`` built from ``lines``.
        tasks:     task descriptions collected from every container that has
                   a non-empty ``task``.
        fulltext:  the original, unmodified input text.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: "dict | None" = None):
        """Parse ``fulltext`` according to ``params``.

        Args:
            fulltext: raw document text; split on newlines, blank lines dropped.
            title: title to use when the document type does not carry its own.
            params: parsing configuration; ``None`` means an empty configuration.
        """
        # A fresh dict per instance avoids sharing one default object across calls.
        self.params = {} if params is None else params
        stripped = (raw.strip() for raw in fulltext.split("\n"))
        self.lines = [Line(text, self.params) for text in stripped if text]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title, father=self, params=self.params)
        self.tasks = [c.get_task(self.container.one_liner) for c in self.container.containers if c.task]
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` for this document.

        For documents whose ``params['type']`` is ``'input_text'`` the title is
        taken from the first line when that line has type ``'title'`` (and the
        line is removed from the returned list); otherwise a placeholder title
        is used. For every other type the given ``title`` and the current lines
        are returned unchanged.
        """
        lines = self.lines
        # .get() so a configuration without a 'type' is treated as "not input_text".
        if self.params.get('type') == 'input_text':
            if lines and lines[0].type == 'title':
                title = lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines

    def replace_tasks(self, resolutions: "list[str]"):
        """Return ``fulltext`` with task lines replaced by ``resolutions``.

        Task lines are those that, ignoring leading whitespace, start with the
        marker mapped to ``'task'`` in ``params['startswith_']``. The i-th task
        line is replaced by the i-th resolution. When the resolutions run out,
        the remaining task lines are left untouched. When no task marker is
        configured, the text is returned unchanged.

        Args:
            resolutions: replacement texts, in document order.

        Returns:
            The new full text, lines joined with ``"\\n"``.
        """
        starts = self.params.get('startswith_', {})
        reverts = {starts[marker]: marker for marker in starts}
        task_starter = reverts.get('task')
        if task_starter is None:
            return self.fulltext

        # One iterator for the whole pass, so each task consumes the next resolution.
        pending = iter(resolutions)
        new_lines = [
            next(pending, line) if line.lstrip().startswith(task_starter) else line
            for line in self.fulltext.split('\n')
        ]
        return "\n".join(new_lines)
|
|
|
|
|
|
|
|
class InputDoc(Doc):
    """A ``Doc`` of type ``'input_text'`` using prefix markers.

    Markers: ``!!`` title, ``++`` comment, ``??`` task, and one to six ``#``
    followed by a space for heading levels 1 to 6.
    """

    def __init__(self, fulltext='', title=''):
        """Build the input-text configuration and parse ``fulltext`` with it."""
        self.params = {
            'type': 'input_text',
            'startswith_': {
                '!!': 'title', '++': 'comment', '??': 'task',
                # Every heading marker ends with a space so that no marker is a
                # prefix of a deeper one (otherwise '##### x' matches '####').
                '# ': '1', '## ': '2', '### ': '3',
                '#### ': '4', '##### ': '5', '###### ': '6',
            },
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)
|
|
|
|
|
|
|
|
class WikiPage(Doc):
    """A ``Doc`` of type ``'wiki'`` whose headings are wrapped in ``=`` runs.

    Heading level 1 uses two ``=`` on each side, up to level 6 with seven.
    Top-level sections whose title is listed under ``'discarded'`` are
    dropped by the root ``Container``.
    """

    def __init__(self, fulltext='', title=''):
        """Build the wiki configuration and parse ``fulltext`` with it."""
        self.params = {
            'type': 'wiki',
            'startswith_': {
                '== ': '1', '=== ': '2', '==== ': '3',
                '===== ': '4', '====== ': '5', '======= ': '6',
            },
            # Closing markers, positionally paired with the opening markers
            # above; the last entry mirrors the seven-'=' level-6 opener.
            'endswith_': [' ==', ' ===', ' ====', ' =====', ' ======', ' ======='],
            'discarded': ["See also", "Notes", "References", "Sources", "External links", "Bibliography",
                          "Cinematic adaptations", "Further reading", "Maps"],
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the page split into paragraphs by the root container.

        Args:
            chunk: length threshold forwarded to ``Container.get_paragraphs``.
        """
        return self.container.get_paragraphs(chunk)
|
|
|
|
|
|
|
|
class Container:
    """A node of the document tree: a titled section with its own lines and children.

    Attributes:
        normals / comments / tasks: the line objects of each type that belong
            directly to this section (before its first sub-heading).
        normal / comment / task: the texts of those lines, each prefixed by a space
            and concatenated.
        children: direct sub-sections, as ``Container`` objects.
        containers: this container followed by all of its descendants, depth first.
        one_liner: ``title + ' ' + comment``.
        root_text: ``one_liner + ' ' + normal`` (this section without children).
        text: ``root_text`` followed by the text of every child.
        summary: initialised to ``text``.
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        """Build the section from ``lines``.

        Args:
            lines: line objects exposing ``type``, ``text``, ``level`` and
                ``is_structure``; ``None`` means no lines.
            level: depth of this container (0 for the root).
            title: section title.
            father: parent object (a ``Container``, or whatever owns the root).
            params: optional configuration; only the ``'discarded'`` entry is read,
                a collection of child titles to drop.
        """
        lines = [] if lines is None else lines

        self.normals = []
        self.normal = ''
        self.comments = []
        self.comment = ''
        self.tasks = []
        self.task = ''
        self.children = []
        self.level = level
        self.title = title
        self.father = father

        self._expand(lines)

        # NOTE: params is not forwarded to children, so only the direct children
        # of the container that received params are filtered.
        if params and 'discarded' in params:
            discarded = params['discarded']
            self.children = [child for child in self.children if child.title not in discarded]

        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        self.one_liner = self.title + ' ' + self.comment
        self.root_text = self.one_liner + ' ' + self.normal
        self.text = self.root_text
        for child in self.children:
            self.text += ' ' + child.text

        self.summary = self.text

    def _expand(self, lines):
        """Distribute ``lines`` between this container and newly created children.

        Lines before the first structure (heading) line are stored on this
        container by type. The first heading opens a child at ``self.level + 1``
        whatever its own level; subsequent lines are buffered for that child
        until another heading of level ``self.level + 1`` closes it and opens
        the next one.
        """
        child_level = self.level + 1
        new_child = False
        new_child_lines = []
        new_child_title = ''
        for line in lines:
            if not new_child:
                if line.type == 'normal':
                    self.normals.append(line)
                    self.normal += ' ' + line.text
                elif line.type == 'comment':
                    self.comments.append(line)
                    self.comment += ' ' + line.text
                elif line.type == 'task':
                    self.tasks.append(line)
                    self.task += ' ' + line.text
                elif line.is_structure:
                    new_child = True
                    new_child_lines = []
                    new_child_title = line.text
                    # The first heading is adopted as a direct child regardless
                    # of the level it was written with.
                    line.level = child_level
            else:
                if child_level < line.level or not line.is_structure:
                    new_child_lines.append(line)
                elif child_level == line.level:
                    self.children.append(Container(lines=new_child_lines,
                                                   level=child_level,
                                                   title=new_child_title,
                                                   father=self))
                    new_child_lines = []
                    new_child_title = line.text
        if new_child:
            # Flush the child that was still open when the lines ran out.
            self.children.append(Container(lines=new_child_lines,
                                           level=child_level,
                                           title=new_child_title,
                                           father=self))

    def get_task(self, doc_one_liner):
        """Describe this container's task together with its surrounding context.

        Args:
            doc_one_liner: one-line description of the whole document.

        Returns:
            A dict with keys ``description`` (this task text), ``about`` (this
            one-liner), ``doc_description``, ``above`` (the father's one-liner),
            ``before`` and ``after`` (one-liners of the preceding / following
            siblings). For a container whose father has no ``children`` or
            ``one_liner`` (the root), the context entries are empty.
        """
        # The root's father is not a Container, so read its attributes defensively.
        siblings_ = list(getattr(self.father, 'children', None) or [])
        try:
            index = siblings_.index(self)
        except ValueError:
            siblings_before_context, siblings_after_context = [], []
        else:
            siblings_before_context = [sibling.one_liner for sibling in siblings_[:index]]
            siblings_after_context = [sibling.one_liner for sibling in siblings_[index + 1:]]

        task = {'description': self.task,
                'about': self.one_liner,
                'doc_description': doc_one_liner,
                'above': getattr(self.father, 'one_liner', ''),
                'before': siblings_before_context,
                'after': siblings_after_context}
        return task

    def get_paragraphs(self, chunk=500):
        """Split this section into paragraphs.

        A section whose full text is shorter than ``chunk`` characters is one
        paragraph. Otherwise its own ``root_text`` is one paragraph, followed
        by the paragraphs of each child.
        """
        if len(self.text) < chunk:
            paragraphs = [self.text]
        else:
            paragraphs = [self.root_text]
            for child in self.children:
                paragraphs += child.get_paragraphs(chunk)
        return paragraphs
|
|
|
|
|
|
|
|
class Line:
    """A single line of text classified by its leading marker.

    Attributes:
        text: the line content with the marker (and closing marker, if any) removed.
        type: the value mapped to the matching marker in ``params['startswith_']``,
            or ``'normal'`` when no marker matches.
        level: ``int(type)`` when ``type`` is all digits, otherwise ``-1``.
        is_structure: ``True`` when ``level`` is positive (the line is a heading).
    """

    def __init__(self, text, params):
        """Classify ``text`` using the markers in ``params``.

        Args:
            text: the raw line.
            params: configuration with a ``'startswith_'`` mapping of marker to
                type and, optionally, an ``'endswith_'`` sequence of closing
                markers positionally paired with the ``'startswith_'`` keys.
        """
        self.text = text
        self.type, self.text = self._parse_text(params)
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self, params):
        """Return ``(type, text)`` for ``self.text``.

        The longest marker that prefixes the text decides the type, so a marker
        that is itself a prefix of another (``'####'`` vs ``'#####'``) cannot
        shadow the longer one. Only the leading marker is removed; later
        occurrences of the same characters stay in the text.
        """
        startswith_ = params['startswith_']
        endswith_ = params.get('endswith_', ())

        matches = [(position, starter)
                   for position, starter in enumerate(startswith_)
                   if self.text.startswith(starter)]
        if not matches:
            return 'normal', self.text.strip()

        position, starter = max(matches, key=lambda match: len(match[1]))
        # Slice rather than split: split() would also cut at any later
        # occurrence of the marker inside the text.
        body = self.text[len(starter):]

        end = endswith_[position] if position < len(endswith_) else ""
        if end != "":
            # Keep what precedes the first closing marker.
            body = body.partition(end)[0]

        return startswith_[starter], body.strip()
|
|
|