GPTdoc / src /model /model.py
YvesP's picture
initial load
65642c3
import re
class Container:
    """Recursive tree node that cuts a long text into titled chunks.

    Constructing a container immediately expands it (splitting ``fulltext``
    into child containers when it is too long) and then gathers the text
    snippets of the whole subtree into ``docs``.

    The class reads four class-level attributes that are not defined in this
    class body and therefore have to be assigned on ``Container`` before the
    code paths using them run:

    * ``separators`` -- indexed by ``level``; string used to cut a text into
      child chunks.
    * ``title_separators`` -- indexed by ``level``; string separating a
      chunk's heading from its body (only read when ``level > 0``).
    * ``title_headers`` -- indexed by ``level``; regex pattern removed from
      the heading before it is appended to the title.
    * ``discarded`` -- collection of titles whose content is dropped.
    """

    def __init__(self, title: str = '', fulltext: str = '', level: int = 0):
        """Build the node, expand it into children and collect its docs.

        Args:
            title: Title inherited from the parent (empty for a root).
            fulltext: Raw text covered by this node, heading included.
            level: Depth in the tree; selects the separators to use.
        """
        self.title = title
        self.fulltext = fulltext
        self.children = []
        self.text = ''
        self.level = level
        self.docs = []
        self.expand()
        self.to_docs()

    def expand(self, max_length: int = 700) -> None:
        """Extract this node's title and text, creating children if needed.

        Args:
            max_length: Texts longer than this many characters are split
                into child containers. Children are created through the
                constructor, which calls ``expand()`` with its default, so
                this value is not forwarded to them.
        """
        if 0 < self.level:
            split_title = self.fulltext.split(Container.title_separators[self.level])
            if 1 < len(split_title):
                heading = re.sub(Container.title_headers[self.level], '', split_title[0])
                self.title += '\n' + heading
                # Only the part between the first and the second title
                # separator is kept as body.
                self.fulltext = split_title[1]
                if self.title in Container.discarded:
                    self.fulltext = self.text = ''

        if not self.fulltext:
            return

        if len(self.fulltext) <= max_length:
            # Short enough: the node is a leaf holding the whole body.
            self.text += '\n' + self.fulltext
            return

        chunks = self.fulltext.split(Container.separators[self.level])
        if self.fulltext[0] != '=':
            # Leading chunk is introductory text owned by this node. When the
            # body opens with '=' instead, the first chunk is itself a
            # section and is left in place to become a child.
            self.text += self.title + '\n' + chunks[0]
            chunks.pop(0)
        self.children = [
            Container(fulltext=chunk, level=self.level + 1, title=self.title)
            for chunk in chunks
        ]

    def to_docs(self) -> None:
        """Fill ``docs`` with this node's text followed by its children's docs.

        The node's own text is only kept when it is longer than 60
        characters.
        """
        self.docs = [self.text] if 60 < len(self.text) else []
        for child in self.children:
            # Every child already ran its own to_docs() while being built,
            # so its ``docs`` list covers its whole subtree.
            self.docs.extend(child.docs)

    def group_docs(self, max_length: int = 700) -> list:
        """Merge consecutive docs while the merged size stays small.

        A doc is joined (with a single space) to the previous group when the
        sum of both lengths is strictly below ``max_length``.

        Returns:
            A new list of grouped strings; ``docs`` itself is not modified.
        """
        grouped_docs = []
        for doc in self.docs:
            if grouped_docs and len(grouped_docs[-1]) + len(doc) < max_length:
                grouped_docs[-1] = grouped_docs[-1] + ' ' + doc
            else:
                grouped_docs.append(doc)
        return grouped_docs

    def __str__(self) -> str:
        """Return a debug card for this node followed by its children's cards."""
        card = (
            f"... level : {self.level} words :{len(self.text.split(' '))}\n"
            f"... title : {self.title[:100]}\n"
            f"... text : {self.text[:100]}\n"
            f"... fulllength : {len(self.fulltext)}\n"
            f"... length : {len(self.text)}\n\n"
        )
        return card + ''.join(str(child) for child in self.children)

    def get_texts(self) -> list:
        """Return the grouped docs using the default ``group_docs`` settings."""
        return self.group_docs()