class Doc:
    """A text document parsed into typed lines and a tree of sections.

    Attributes set by ``__init__``:
        params: parsing configuration (see ``Line`` for the keys it reads).
        lines: the parsed non-empty lines, minus the title line if one was
            consumed by ``_get_title``.
        title: the document title.
        container: root ``Container`` holding the section tree.
        tasks: one task description dict (see ``Container.get_task``) for
            every container of the tree that holds task text.
        fulltext: the raw text the document was built from.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: dict = None):
        # A fresh dict per call instead of a shared mutable default.
        self.params = {} if params is None else params
        self.lines = [
            Line(stripped, self.params)
            for stripped in (raw.strip() for raw in fulltext.split("\n"))
            if stripped
        ]
        self.title, self.lines = self._get_title(title)
        self.container = Container(
            lines=self.lines, title=self.title, father=self, params=self.params
        )
        self.tasks = [
            c.get_task(self.container.one_liner)
            for c in self.container.containers
            if c.task
        ]
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` for this document.

        For ``'input_text'`` documents the title is read from the first line
        when that line is of type ``'title'`` (the line is then removed from
        the returned lines); otherwise a placeholder title is used. For any
        other document type the given ``title`` and the lines are returned
        untouched.
        """
        lines = self.lines
        if self.params.get('type') != 'input_text':
            return title, lines
        if lines and lines[0].type == 'title':
            return lines[0].text, lines[1:]
        return 'the title is missing', lines

    def replace_tasks(self, resolutions: list):
        """Return ``fulltext`` with task lines replaced by ``resolutions``.

        Task lines are the lines that, once stripped (as the parser does),
        start with the marker mapped to ``'task'`` in
        ``params['startswith_']``. The n-th task line is replaced by the n-th
        resolution; task lines left over once the resolutions are exhausted
        are kept unchanged. When no task marker is configured the text is
        returned as is.
        """
        starters = self.params.get('startswith_', {})
        marker_by_kind = {kind: marker for marker, kind in starters.items()}
        task_starter = marker_by_kind.get('task')
        if task_starter is None:
            return self.fulltext

        # One iterator for the whole pass so each task consumes its own
        # resolution instead of all of them receiving the first one.
        pending = iter(resolutions)
        new_lines = []
        for line in self.fulltext.split('\n'):
            if line.strip().startswith(task_starter):
                line = next(pending, line)
            new_lines.append(line)
        return "\n".join(new_lines)


class InputDoc(Doc):
    """Document written with the input markup.

    ``!!`` marks the title, ``++`` a comment, ``??`` a task and runs of ``#``
    mark headings of level 1 to 6.
    """

    def __init__(self, fulltext='', title=''):
        self.params = {
            'type': 'input_text',
            'startswith_': {
                '!!': 'title', '++': 'comment', '??': 'task',
                '# ': '1', '## ': '2', '### ': '3',
                '####': '4', '#####': '5', '######': '6',
            },
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)


class WikiPage(Doc):
    """Document written with ``== heading ==`` style section markers."""

    def __init__(self, fulltext='', title=''):
        self.params = {
            'type': 'wiki',
            'startswith_': {
                '== ': '1', '=== ': '2', '==== ': '3',
                '===== ': '4', '====== ': '5', '======= ': '6',
            },
            # Closing markers, read in parallel with 'startswith_' (same index).
            # NOTE(review): the last entry has six '=' while its opening marker
            # has seven; the heading text is still extracted correctly because
            # everything from the first occurrence of the closer is dropped.
            'endswith_': [' ==', ' ===', ' ====', ' =====', ' ======', ' ======'],
            # Titles of top-level sections removed from the tree.
            'discarded': [
                "See also", "Notes", "References", "Sources", "External links",
                "Bibliography", "Cinematic adaptations", "Further reading", "Maps",
            ],
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the page text split into paragraphs (see ``Container``)."""
        return self.container.get_paragraphs(chunk)


class Container:
    """A section of a document: its own lines plus nested child sections.

    Attributes set by ``__init__``:
        normals / comments / tasks: the ``Line`` objects of each type found
            before the first heading of this section.
        normal / comment / task: the texts of those lines, each one prefixed
            with a single space and concatenated.
        children: direct sub-sections, in document order.
        containers: this container followed by all its descendants,
            depth-first.
        one_liner: ``title + ' ' + comment``.
        root_text: ``one_liner + ' ' + normal``.
        text / summary: ``root_text`` followed by every child's ``text``.
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        self.normals = []
        self.normal = ''
        self.comments = []
        self.comment = ''
        self.tasks = []
        self.task = ''
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self._expand([] if lines is None else lines)

        # 'discarded' only filters the direct children of the container that
        # receives params; _expand does not hand params down to sub-sections.
        if params and 'discarded' in params:
            discarded = params['discarded']
            self.children = [c for c in self.children if c.title not in discarded]

        self.containers = [self]
        for child in self.children:
            self.containers += child.containers

        self.one_liner = self.title + ' ' + self.comment
        self.root_text = self.one_liner + ' ' + self.normal
        self.text = self.root_text
        for child in self.children:
            self.text += ' ' + child.text
        self.summary = self.text

    def _expand(self, lines):
        """Distribute ``lines`` between this container and new children.

        Lines seen before the first heading belong to this container. The
        first heading opens a child one level below this container (the
        heading line's ``level`` is overwritten accordingly). Afterwards,
        non-heading lines and deeper headings are collected for the open
        child, and a heading at the child's level closes it and opens the
        next sibling.
        """
        child_level = self.level + 1
        child_open = False
        child_lines = []
        child_title = ''

        for line in lines:
            if not child_open:
                if line.type == 'normal':
                    self.normals.append(line)
                    self.normal += ' ' + line.text
                elif line.type == 'comment':
                    self.comments.append(line)
                    self.comment += ' ' + line.text
                elif line.type == 'task':
                    self.tasks.append(line)
                    self.task += ' ' + line.text
                elif line.is_structure:
                    child_open = True
                    child_lines = []
                    child_title = line.text
                    # Whatever depth the first heading declares, it becomes
                    # the level directly below this container.
                    line.level = child_level
            elif child_level < line.level or not line.is_structure:
                child_lines.append(line)
            elif child_level == line.level:
                self.children.append(
                    Container(lines=child_lines, level=child_level,
                              title=child_title, father=self)
                )
                child_lines = []
                child_title = line.text

        if child_open:
            self.children.append(
                Container(lines=child_lines, level=child_level,
                          title=child_title, father=self)
            )

    def get_task(self, doc_one_liner):
        """Describe this container's task together with its context.

        Args:
            doc_one_liner: one-line description of the whole document.

        Returns:
            A dict with the keys ``description`` (this container's task
            text), ``about`` (its one-liner), ``doc_description``, ``above``
            (the father's one-liner) and ``before`` / ``after`` (one-liners
            of the siblings preceding / following this container).

            When the father exposes no ``children`` or ``one_liner`` (the
            root container, whose father is the document), ``before`` and
            ``after`` are empty and ``above`` is ``''``.
        """
        siblings = getattr(self.father, 'children', None)
        if siblings is None or self not in siblings:
            before, after = [], []
        else:
            position = siblings.index(self)
            before = [sibling.one_liner for sibling in siblings[:position]]
            after = [sibling.one_liner for sibling in siblings[position + 1:]]
        return {
            'description': self.task,
            'about': self.one_liner,
            'doc_description': doc_one_liner,
            'above': getattr(self.father, 'one_liner', ''),
            'before': before,
            'after': after,
        }

    def get_paragraphs(self, chunk=500):
        """Split the section text into paragraphs.

        A container whose full ``text`` is shorter than ``chunk`` characters
        yields that text as a single paragraph. Otherwise it yields its own
        ``root_text`` followed by the paragraphs of each child.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs


class Line:
    """One line of a document, classified by its leading marker.

    Attributes:
        type: the value mapped to the matching marker in
            ``params['startswith_']``, or ``'normal'`` when none matches.
        text: the line without its markers, stripped.
        level: ``int(type)`` when ``type`` is made of digits, else ``-1``.
        is_structure: ``True`` when ``level`` is positive (a heading).
    """

    def __init__(self, text, params):
        self.text = text
        self.type, self.text = self._parse_text(params)
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self, params):
        """Return ``(type, text)`` for the raw line stored in ``self.text``.

        ``params['startswith_']`` maps opening markers to types. When several
        markers match, the longest wins, so ``'#####'`` is not mistaken for
        ``'####'``. ``params['endswith_']``, when present, lists closing
        markers in the same order as the opening ones; the text is cut at the
        first occurrence of the closer.
        """
        starters = params.get('startswith_', {})
        enders = params.get('endswith_', [])

        match = None  # (position in starters, marker) of the best match
        for position, starter in enumerate(starters):
            if not self.text.startswith(starter):
                continue
            if match is None or len(match[1]) < len(starter):
                match = (position, starter)

        if match is None:
            return 'normal', self.text.strip()

        position, starter = match
        # Slice the marker off rather than splitting on it: the marker may
        # legitimately appear again later in the line.
        body = self.text[len(starter):]
        ender = enders[position] if position < len(enders) else ""
        if ender:
            body = body.partition(ender)[0]
        return starters[starter], body.strip()