Spaces:

Hexamind
/

GPTdoc

Build error

App Files Files Community

GPTdoc / src /model /model.py

YvesP

initial load

65642c3 over 2 years ago

raw

history blame contribute delete

2.23 kB

	import re


	class Container:
	    """Node of a tree that recursively cuts a long text into smaller documents.

	    Each node keeps its own ``text`` plus a list of child ``Container`` objects
	    built from the pieces of ``fulltext``; ``docs`` is the flattened list of
	    texts of the node and of all its descendants.

	    The class reads four class-level attributes that are not defined in this
	    class body and must therefore be assigned before any instance is created
	    (presumably by the code that configures the splitter - TODO confirm):

	    * ``Container.title_separators`` - indexed by level; string separating a
	      section heading from its body.
	    * ``Container.title_headers`` - indexed by level; regex pattern removed
	      from the heading before it is appended to the title.
	    * ``Container.separators`` - indexed by level; string used to cut the
	      body into child sections.
	    * ``Container.discarded`` - collection of titles whose content is dropped.
	    """

	    def __init__(self, title: str = '', fulltext: str = '', level: int = 0):
	        """Store the inputs, then immediately build the subtree and its docs.

	        Args:
	            title: title inherited from the parent node (may be empty).
	            fulltext: raw text this node is responsible for.
	            level: depth in the tree; used as index into the class-level
	                separator tables.
	        """
	        self.title = title
	        self.fulltext = fulltext
	        self.children = []
	        self.text = ''
	        self.level = level
	        self.docs = []
	        self.expand()
	        self.to_docs()

	    def expand(self, max_length=700):
	        """Extract the node title, then either keep the text or split it.

	        Args:
	            max_length: texts strictly longer than this are split into
	                children; shorter ones are stored directly in ``self.text``.
	        """
	        if self.level > 0:
	            title_parts = self.fulltext.split(Container.title_separators[self.level])
	            if len(title_parts) > 1:
	                heading = re.sub(Container.title_headers[self.level], '', title_parts[0])
	                self.title += '\n' + heading
	                # NOTE(review): only the segment following the first title
	                # separator is kept; later segments are dropped - confirm
	                # this is intended.
	                self.fulltext = title_parts[1]

	        if self.title in Container.discarded:
	            self.fulltext = self.text = ''

	        if not self.fulltext:
	            return

	        if len(self.fulltext) <= max_length:
	            # Short enough: this node is a leaf carrying the whole text.
	            self.text += '\n' + self.fulltext
	            return

	        pieces = self.fulltext.split(Container.separators[self.level])
	        if not self.fulltext.startswith('='):
	            # The text does not open with a '=' heading marker, so the first
	            # piece is kept as this node's own text instead of becoming a child.
	            self.text += self.title + '\n' + pieces.pop(0)
	        self.children = [
	            Container(fulltext=piece, level=self.level + 1, title=self.title)
	            for piece in pieces
	        ]

	    def to_docs(self):
	        """Rebuild ``self.docs`` from this node's text and its children's docs.

	        The node's own text is kept only when it is longer than 60 characters.
	        """
	        self.docs = [self.text] if len(self.text) > 60 else []
	        for child in self.children:
	            # Every child already populated ``docs`` in its own __init__.
	            # (The previous code read a ``root_text`` attribute that is never
	            # assigned, which raised AttributeError for any node with children.)
	            self.docs.extend(child.docs)

	    def group_docs(self, max_length=700):
	        """Return ``self.docs`` with adjacent entries merged while they fit.

	        A doc is joined (with a single space) to the previous group when the
	        sum of their lengths is strictly below ``max_length``.

	        Args:
	            max_length: upper bound used for the merge test.

	        Returns:
	            A new list of strings; ``self.docs`` is left untouched.
	        """
	        grouped = []
	        for doc in self.docs:
	            if grouped and len(grouped[-1]) + len(doc) < max_length:
	                grouped[-1] = grouped[-1] + ' ' + doc
	            else:
	                grouped.append(doc)
	        return grouped

	    def __str__(self):
	        """Return a debug card for this node followed by those of its children."""
	        word_count = len(self.text.split(' '))
	        card = ''.join([
	            "... level : " + str(self.level) + " words :" + str(word_count) + "\n",
	            "... title : " + self.title[:100] + "\n",
	            "... text : " + self.text[:100] + "\n",
	            "... fulllength : " + str(len(self.fulltext)) + "\n",
	            "... length : " + str(len(self.text)) + "\n\n",
	        ])
	        return card + ''.join(str(child) for child in self.children)

	    def get_texts(self):
	        """Return the grouped documents (``group_docs`` with its default limit)."""
	        return self.group_docs()