File size: 1,233 Bytes
1295a89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def split_list(array: list[str], separator: str) -> list[str]:
    """Split every string in *array* after each occurrence of *separator*.

    The separator stays attached to the end of the piece that precedes it,
    so concatenating the returned pieces reproduces the input strings.

    Args:
        array: Strings to split; the list itself is not modified.
        separator: Substring after which a cut is made.

    Returns:
        A flat list of pieces in input order. A string that ends with the
        separator contributes a trailing empty piece.
    """
    pieces: list[str] = []
    for s in array:
        if not separator:
            # str.split rejects an empty separator; keep the historical
            # result of cutting around every character instead of raising.
            pieces.extend(["", *s, ""])
            continue
        # Split directly rather than via an in-band sentinel character, so
        # input that happens to contain that character is left intact.
        *heads, tail = s.split(separator)
        pieces.extend(head + separator for head in heads)
        pieces.append(tail)
    return pieces


def split(text: str) -> list[str]:
    """Split *text* into sentence-like segments.

    Cuts are made after every line break, after ``'. '`` and after ``'? '``.
    Dots that should not end a segment are masked first and turned back
    into ``'.'`` afterwards:

    * dots inside the listed abbreviations (``'Abs.'``, ``'Art.'``, ...);
    * dots where the character two positions earlier is a space, the
      character two positions later is not uppercase, or the preceding
      character is a digit.

    Args:
        text: The text to split.

    Returns:
        The segments in order, each stripped of surrounding whitespace.
        Empty segments are kept.

    Note:
        U+E001 and U+E002 are used as in-band markers; any such characters
        already present in *text* come back as ``'.'``.
    """
    # Remove spaces around line breaks and collapse runs of line breaks.
    for replacement in (' \n', '\n ', '\n\n'):
        while replacement in text:
            text = text.replace(replacement, '\n')

    # The abbreviation marker must differ from the U+E000 sentinel that
    # split_list uses internally; sharing it caused cuts at abbreviations.
    protected_dot = "\uE002"
    placeholder = "\uE001"

    protections = ['d. h.', 'Abs.', 'Art.', 'Bem.', 'Bst.', ' ff.', ' f.', '(ff.', '(f.', 'insbes.', 'S.', 'V.']
    for protection in protections:
        text = text.replace(protection, protection.replace('.', protected_dot))

    # Mask remaining dots that do not look like a sentence end. Edit a
    # character list in place rather than rebuilding the string per hit;
    # the first and last three characters are never examined.
    chars = list(text)
    for i in range(3, len(chars) - 3):
        if chars[i] != '.':
            continue
        if chars[i - 2] == ' ' or not chars[i + 2].isupper() or chars[i - 1].isdigit():
            chars[i] = placeholder
    text = ''.join(chars)

    array = [text]
    for value in ('\n', '. ', '? '):
        array = split_list(array, value)

    # Turn both kinds of masked dot back into real dots before trimming.
    restore = str.maketrans({protected_dot: '.', placeholder: '.'})
    return [s.translate(restore).strip() for s in array]