import re


class Container:
    """Node of a tree that recursively splits a long text into sections.

    Construction immediately expands the node (splitting ``fulltext`` into
    child containers when it is too long) and then collects the resulting
    text chunks into ``docs``.

    The splitting tables ``Container.separators``,
    ``Container.title_separators``, ``Container.title_headers`` and
    ``Container.discarded`` are read as class attributes but are not defined
    in this class body; they must be assigned on the class before any text
    that needs splitting is processed.
    NOTE(review): presumably they are set by the module that uses this
    class -- confirm where they are defined.
    """

    def __init__(self, title: str = '', fulltext: str = '', level: int = 0):
        """Store the raw section and build the subtree and its docs.

        Args:
            title: Title inherited from the parent section.
            fulltext: Raw text of this section, including its sub-sections.
            level: Depth in the tree; used as the index into the class-level
                separator tables.
        """
        self.title = title
        self.fulltext = fulltext
        self.children = []
        self.text = ''
        self.level = level
        self.docs = []
        self.expand()
        self.to_docs()

    def expand(self, max_length=700):
        """Fill ``text`` and ``children`` from ``fulltext``.

        For ``level > 0`` the section header is first split off and appended
        to the inherited title; a section whose title is listed in
        ``Container.discarded`` is emptied. Then, if the remaining text is
        longer than ``max_length``, it is split with this level's separator
        into child containers; otherwise it is kept whole as this node's
        text.

        Args:
            max_length: Length above which ``fulltext`` is split further.
        """
        # NOTE(review): the nesting below was reconstructed from a source
        # whose indentation was lost; confirm it against the original layout.
        if 0 < self.level:
            split_title = self.fulltext.split(
                Container.title_separators[self.level])
            if 1 < len(split_title):
                header = re.sub(
                    Container.title_headers[self.level], '', split_title[0])
                self.title += '\n' + header
                # Only the part right after the header is kept.
                self.fulltext = split_title[1]
            # NOTE(review): this check may originally have been nested in
            # the branch above -- confirm.
            if self.title in Container.discarded:
                self.fulltext = self.text = ''

        if not self.fulltext:
            return

        if max_length < len(self.fulltext):
            split_text = self.fulltext.split(Container.separators[self.level])
            if self.fulltext[0] != '=':
                # Leading text before the first separator belongs to this
                # node itself.
                self.text += self.title + '\n' + split_text[0]
            # The first piece is never turned into a child.
            # NOTE(review): this pop may originally have been inside the
            # branch above -- confirm.
            split_text.pop(0)
            self.children = [
                Container(fulltext=t, level=self.level + 1, title=self.title)
                for t in split_text
            ]
        else:
            self.text += '\n' + self.fulltext

    def to_docs(self):
        """Collect this node's text and all descendants' docs into ``docs``.

        The node's own text is kept only when it is longer than 60
        characters. Children have already built their own ``docs`` in their
        constructors, so those lists are appended in order.
        """
        self.docs = [self.text] if 60 < len(self.text) else []
        for child in self.children:
            # Each child exposes its collected chunks as ``docs``; the
            # previously referenced ``root_text`` attribute is never set.
            self.docs += child.docs

    def group_docs(self, max_length=700):
        """Merge consecutive docs while the merged length stays small.

        A doc is joined (with a single space) to the previous group when the
        sum of both lengths is strictly below ``max_length``.

        Args:
            max_length: Upper bound used for the merge test.

        Returns:
            The list of grouped document strings.
        """
        grouped_docs = []
        for doc in self.docs:
            if grouped_docs and len(grouped_docs[-1]) + len(doc) < max_length:
                doc = grouped_docs.pop() + ' ' + doc
            grouped_docs.append(doc)
        return grouped_docs

    def __str__(self):
        """Return a debug card for this node followed by its children's."""
        word_count = len(self.text.split(' '))
        card = (
            f"... level : {self.level} words :{word_count}\n"
            f"... title : {self.title[:100]}\n"
            f"... text : {self.text[:100]}\n"
            f"... fulllength : {len(self.fulltext)}\n"
            f"... length : {len(self.text)}\n\n"
        )
        return card + ''.join(str(child) for child in self.children)

    def get_texts(self):
        """Return the grouped docs using the default ``group_docs`` limit."""
        return self.group_docs()