File size: 4,753 Bytes
498db6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
The classes below work, but a lot of their code could be reused.
"""


class Doc:
    """A document split into parsed lines and arranged in a tree of containers.

    Attributes:
        params: parsing configuration shared with every ``Line`` and with the
            root ``Container``.
        lines: the non-empty lines of ``fulltext`` as ``Line`` objects, minus
            the title line when one was extracted.
        title: the document title (see ``_get_title``).
        container: root ``Container`` built from ``lines``.
        fulltext: the raw text the document was built from.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: dict = None):
        """Parse ``fulltext`` into lines and build the container tree.

        Args:
            fulltext: raw document text; it is split on newlines and blank
                lines are dropped.
            title: title to use unless one is extracted from the text.
            params: parsing configuration. ``None`` means an empty
                configuration (a fresh dict per instance, never shared).
        """
        # A None sentinel avoids sharing one default dict between instances.
        self.params = {} if params is None else params
        stripped_lines = (raw.strip() for raw in fulltext.split("\n"))
        self.lines = [Line(text, self.params) for text in stripped_lines if text]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title, father=self, params=self.params)
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` for this document.

        For documents whose ``params['type']`` is ``'input_text'`` the title is
        taken from the first line when that line has type ``'title'`` (and the
        line is removed from the returned lines); otherwise a placeholder
        title is returned. For every other type, including a configuration
        with no ``'type'`` key, the given ``title`` and the current lines are
        returned unchanged.
        """
        lines = self.lines
        # .get() so a configuration without a 'type' key does not raise KeyError.
        if self.params.get('type') == 'input_text':
            if lines and lines[0].type == 'title':
                title = lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines


class WikiPage(Doc):
    """A ``Doc`` configured for wiki-style text with ``== Heading ==`` markup."""

    def __init__(self, fulltext='', title=''):
        """Build the wiki parsing configuration and parse ``fulltext``.

        Args:
            fulltext: raw page text, one heading or paragraph per line.
            title: page title.
        """
        self.params = {
            'type': 'wiki',
            # Heading start marker -> heading level (as a string).
            'startswith_':
                {'== ': '1', '=== ': '2', '==== ': '3', '===== ': '4', '====== ': '5', '======= ': '6'},
            # Closing markers, paired by position with the start markers above;
            # each closing marker has as many '=' as its start marker.
            'endswith_':
                [' ==', ' ===', ' ====', ' =====', ' ======', ' ======='],

            # Section titles that are dropped from the page.
            'discarded': ["See also", "Notes", "References", "Sources", "External links", "Bibliography",
                          "Cinematic adaptations", "Further reading", "Maps"]
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the page's paragraphs by delegating to the root container.

        Args:
            chunk: size threshold forwarded to ``Container.get_paragraphs``.
        """
        return self.container.get_paragraphs(chunk)


class Container:
    """A section of a document: its own lines plus nested child sections.

    Attributes:
        children: child ``Container`` objects, one per sub-section.
        level: nesting depth of this container (0 for the root).
        title: heading text of this section.
        father: the object that owns this container (as passed by the caller).
        lines: the lines that belong directly to this section, i.e. those
            that come before its first sub-heading.
        containers: this container followed by all of its descendants,
            depth first.
        root_text: the text of ``lines`` joined with single spaces.
        text: ``root_text`` followed by the ``text`` of every child.
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        """Build the section tree from a flat list of lines.

        Args:
            lines: line objects exposing ``text``, ``level`` and
                ``is_structure``; ``None`` means no lines.
            level: nesting depth of this container.
            title: heading text of this section.
            father: owner of this container.
            params: optional configuration; when it has a ``'discarded'`` key,
                direct children whose title is listed there are dropped.
                Children are built without ``params``, so the filter applies
                only to the container that receives it.
        """
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self.lines = []
        self._expand(lines or [])
        if params and 'discarded' in params:
            discarded = params['discarded']
            self.children = [child for child in self.children if child.title not in discarded]
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        # Own text first, then each child's text; empty parts are skipped so the
        # result has no leading or doubled spaces.
        self.root_text = ' '.join(line.text for line in self.lines)
        parts = [self.root_text] + [child.text for child in self.children]
        self.text = ' '.join(part for part in parts if part)

    def _add_child(self, lines, title):
        """Append a child container one level below this one."""
        self.children.append(Container(lines=lines,
                                       level=self.level + 1,
                                       title=title,
                                       father=self))

    def _expand(self, lines):
        """Distribute ``lines`` between this container and new children.

        Lines before the first heading are kept in ``self.lines``. The first
        heading opens a child and has its level set to ``self.level + 1``;
        every later heading at that same level closes the current child and
        opens the next one. Plain lines and deeper headings are handed to the
        currently open child.
        """
        child_level = self.level + 1
        child_open = False
        child_lines = []
        child_title = ''
        for line in lines:
            if not child_open:
                if line.is_structure:
                    child_open = True
                    child_lines = []
                    child_title = line.text
                    # Whatever its declared level, the first heading found
                    # becomes a direct child of this container.
                    line.level = child_level
                else:
                    self.lines.append(line)
            elif not line.is_structure or child_level < line.level:
                child_lines.append(line)
            elif child_level == line.level:
                self._add_child(child_lines, child_title)
                child_lines = []
                child_title = line.text
        if child_open:
            self._add_child(child_lines, child_title)

    def get_paragraphs(self, chunk=500):
        """Return this section's text as a list of paragraphs.

        If the whole section text is shorter than ``chunk`` characters it is
        returned as a single paragraph. Otherwise the section's own text comes
        first, followed by the paragraphs of each child in order.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs


class Line:
    """One line of a document, classified as a heading or as normal text.

    Attributes:
        text: the line text with any heading markers removed.
        params: the parsing configuration the line was built with.
        type: the value mapped to the matching start marker in
            ``params['startswith_']``, or ``'normal'`` when none matches.
        level: ``int(type)`` when ``type`` is all digits, otherwise -1.
        is_structure: True when ``level`` is positive, i.e. the line is a
            heading.
    """

    def __init__(self, text, params):
        """Classify ``text`` according to ``params``.

        Args:
            text: the raw line.
            params: configuration dict. ``'startswith_'`` maps start markers
                to type strings; the optional ``'endswith_'`` list gives the
                closing marker for each start marker, paired by position.
        """
        self.text = text
        self.params = params
        self.type, self.text = self._parse_text()
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self):
        """Return ``(type, text)`` for the line.

        The first start marker (in the order of ``params['startswith_']``)
        that the line begins with decides the type. That marker is removed
        from the front, and the text is cut at the first occurrence of the
        paired closing marker if one is configured. A line that matches no
        marker, or a configuration without ``'startswith_'``, yields
        ``('normal', stripped text)``.
        """
        starters = self.params.get('startswith_', {})
        enders = self.params.get('endswith_', [])
        for index, (starter, type_) in enumerate(starters.items()):
            if not self.text.startswith(starter):
                continue
            # Remove only the leading marker, so a title that contains the
            # marker again is not truncated.
            text = self.text[len(starter):]
            # A missing closing marker means there is nothing to cut.
            ender = enders[index] if index < len(enders) else ''
            if ender:
                text = text.partition(ender)[0]
            return type_, text.strip()
        return 'normal', self.text.strip()