| import re | |
class Container:
    """Tree node that recursively splits a text into titled sections.

    Building an instance immediately expands ``fulltext`` into this node's own
    ``text`` plus child ``Container`` nodes (see :meth:`expand`) and then
    flattens the subtree into ``docs`` (see :meth:`to_docs`).

    :meth:`expand` reads four class-level attributes that are not defined in
    this part of the file and must be set on ``Container`` before any
    instance is created:

    * ``title_separators`` -- indexed by ``level``; string used to cut the
      heading off the front of ``fulltext``.
    * ``title_headers`` -- indexed by ``level``; regex removed from the
      heading before it is appended to ``title``.
    * ``separators`` -- indexed by ``level``; string used to cut ``fulltext``
      into sub-sections.
    * ``discarded`` -- collection of titles whose content is dropped.
    """

    def __init__(self, title: str = '', fulltext: str = '', level: int = 0):
        """Store the inputs, then build the subtree and its flattened docs.

        Args:
            title: Title inherited from the parent node (may be extended by
                :meth:`expand` when ``level`` is greater than 0).
            fulltext: Raw text covered by this node.
            level: Depth of this node; used as index into the class-level
                separator tables.
        """
        self.title = title
        self.fulltext = fulltext
        self.children = []  # child Container nodes, filled by expand()
        self.text = ''      # text kept on this node itself
        self.level = level
        self.docs = []      # flattened texts of this node and its descendants
        self.expand()
        self.to_docs()

    def expand(self, max_length: int = 700) -> None:
        """Split ``fulltext`` into this node's ``text`` and child nodes.

        Args:
            max_length: Texts no longer than this are kept whole on this node
                instead of being split into children.
        """
        if self.level > 0:
            # NOTE(review): split() has no maxsplit and only pieces [0] and
            # [1] are used, so anything after a second occurrence of the
            # title separator is dropped -- confirm this is intended.
            pieces = self.fulltext.split(Container.title_separators[self.level])
            if len(pieces) > 1:
                heading = re.sub(Container.title_headers[self.level], '', pieces[0])
                self.title += '\n' + heading
                self.fulltext = pieces[1]

        if self.title in Container.discarded:
            self.fulltext = self.text = ''

        if not self.fulltext:
            return

        if len(self.fulltext) <= max_length:
            # Short enough: keep the whole text on this node.
            self.text += '\n' + self.fulltext
            return

        sections = self.fulltext.split(Container.separators[self.level])
        if self.fulltext[0] != '=':
            # Text that does not start with '=' has a leading part that stays
            # on this node, prefixed by the title; the rest become children.
            self.text += self.title + '\n' + sections[0]
            sections = sections[1:]
        self.children = [
            Container(fulltext=section, level=self.level + 1, title=self.title)
            for section in sections
        ]

    def to_docs(self) -> None:
        """Rebuild ``docs`` from this node's text and its children's docs.

        This node's ``text`` is included only when it is longer than 60
        characters.
        """
        self.docs = [self.text] if len(self.text) > 60 else []
        for child in self.children:
            # Each child already ran its own to_docs() in __init__, so its
            # ``docs`` list is complete by the time it is read here.
            self.docs.extend(child.docs)

    def group_docs(self, max_length: int = 700) -> list:
        """Return ``docs`` with consecutive short entries merged.

        A doc is joined to the previous group with a single space when the
        two lengths added together are below ``max_length``.

        Args:
            max_length: Upper bound (exclusive) on the summed length of the
                two parts being merged; the joining space is not counted.

        Returns:
            A new list of strings; ``self.docs`` is left untouched.
        """
        grouped = []
        for doc in self.docs:
            if grouped and len(grouped[-1]) + len(doc) < max_length:
                grouped[-1] = grouped[-1] + ' ' + doc
            else:
                grouped.append(doc)
        return grouped

    def __str__(self) -> str:
        """Return a summary card for this node followed by its children's cards."""
        card = (
            f"... level : {self.level} words :{len(self.text.split(' '))}\n"
            f"... title : {self.title[:100]}\n"
            f"... text : {self.text[:100]}\n"
            f"... fulllength : {len(self.fulltext)}\n"
            f"... length : {len(self.text)}\n\n"
        )
        return card + ''.join(str(child) for child in self.children)

    def get_texts(self) -> list:
        """Return the grouped docs using the default ``max_length``."""
        return self.group_docs()