File size: 2,226 Bytes
65642c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re


class Container:
    """Recursive splitter that turns a long text into a tree of short chunks.

    Each node holds its own piece of text (``text``), child nodes built from
    the remainder of ``fulltext`` (``children``), and the flattened list of
    chunks from its whole subtree (``docs``).

    The class reads four class-level attributes that are NOT defined in this
    file and are indexed by ``level``; presumably they are assigned by the
    caller before any instance is built (TODO confirm against the caller):

    * ``Container.separators[level]``       -- string used to split a node's
      ``fulltext`` into child sections.
    * ``Container.title_separators[level]`` -- string separating a section's
      title from its body (used for ``level > 0`` only).
    * ``Container.title_headers[level]``    -- regex pattern removed from the
      raw title via ``re.sub``.
    * ``Container.discarded``               -- collection of titles whose
      content must be dropped.
    """

    def __init__(self, title: str = '', fulltext: str = '', level: int = 0):
        """Build the node and, recursively, its whole subtree.

        Args:
            title: Title inherited from the parent node.
            fulltext: Raw text this node is responsible for.
            level: Depth in the tree; 0 is the root.
        """
        self.title = title
        self.fulltext = fulltext
        self.children = []
        self.text = ''
        self.level = level
        self.docs = []
        self.expand()
        self.to_docs()

    def expand(self, max_length=700):
        """Extract this node's title/text and create children when needed.

        Args:
            max_length: ``fulltext`` longer than this many characters is split
                into child containers; shorter text is kept as-is. Children
                are created through the constructor and therefore always use
                the default value.
        """
        if 0 < self.level:
            split_title = self.fulltext.split(Container.title_separators[self.level])
            if 1 < len(split_title):
                raw_title = re.sub(Container.title_headers[self.level], '', split_title[0])
                self.title += '\n' + raw_title
                # NOTE(review): only the piece right after the first title
                # separator is kept; text following a second occurrence of the
                # separator is dropped. Confirm this is intended.
                self.fulltext = split_title[1]
                if self.title in Container.discarded:
                    self.fulltext = self.text = ''

        if not self.fulltext:
            return

        if max_length < len(self.fulltext):
            split_text = self.fulltext.split(Container.separators[self.level])
            # Text that does not start with '=' has a leading part that sits
            # before the first sub-section: it becomes this node's own text.
            if self.fulltext[0] != '=':
                self.text += self.title + '\n' + split_text.pop(0)
            self.children = [
                Container(fulltext=part, level=self.level + 1, title=self.title)
                for part in split_text
            ]
        else:
            self.text += '\n' + self.fulltext

    def to_docs(self):
        """Fill ``self.docs`` with this node's text followed by its subtree's.

        The node's own text is kept only when it is longer than 60 characters.
        """
        self.docs = [self.text] if 60 < len(self.text) else []
        for child in self.children:
            # Children have already built their own ``docs`` in __init__.
            # (The previous ``child.root_text`` does not exist on Container
            # and raised AttributeError as soon as a node had a child.)
            self.docs.extend(child.docs)

    def group_docs(self, max_length=700):
        """Merge consecutive docs while the merged chunk stays short.

        Args:
            max_length: A doc is appended (after a single space) to the
                previous group when the sum of both lengths is strictly below
                this value.

        Returns:
            A new list of grouped strings; ``self.docs`` is left untouched.
        """
        grouped_docs = []
        for doc in self.docs:
            if grouped_docs and len(grouped_docs[-1]) + len(doc) < max_length:
                doc = grouped_docs.pop() + ' ' + doc
            grouped_docs.append(doc)
        return grouped_docs

    def __str__(self):
        """Return a debug card for this node followed by its children's."""
        card = (
            f"... level : {self.level}   words :{len(self.text.split(' '))}\n"
            f"... title :  {self.title[:100]}\n"
            f"... text :  {self.text[:100]}\n"
            f"... fulllength : {len(self.fulltext)}\n"
            f"... length : {len(self.text)}\n\n"
        )
        return card + ''.join(str(child) for child in self.children)

    def get_texts(self):
        """Return the grouped chunks (``group_docs`` with default settings)."""
        return self.group_docs()