File size: 4,521 Bytes
bbcd8ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# preprocess.py

import re

# === 1. Oxia → Tonos replacements ===
# Unicode encodes the Greek acute accent twice: as "oxia" in the Greek Extended
# block (U+1Fxx) and as "tonos" in the basic Greek block (U+03xx). Each pair is
# canonically equivalent and renders identically, so the code points are
# spelled out as escapes: a literal oxia key can be silently rewritten to its
# tonos twin by an editor or by NFC normalization, turning the entry into a
# no-op. The table covers every precomposed oxia letter, not only the seven
# lowercase vowels, so capitals and dialytika forms are normalized as well.
OXIA_TO_TONOS = {
    # Lowercase vowels
    "\u1f71": "\u03ac",  # alpha
    "\u1f73": "\u03ad",  # epsilon
    "\u1f75": "\u03ae",  # eta
    "\u1f77": "\u03af",  # iota
    "\u1f7b": "\u03cd",  # upsilon
    "\u1f79": "\u03cc",  # omicron
    "\u1f7d": "\u03ce",  # omega
    # Lowercase vowels with dialytika
    "\u1fd3": "\u0390",  # iota with dialytika
    "\u1fe3": "\u03b0",  # upsilon with dialytika
    # Uppercase vowels
    "\u1fbb": "\u0386",  # Alpha
    "\u1fc9": "\u0388",  # Epsilon
    "\u1fcb": "\u0389",  # Eta
    "\u1fdb": "\u038a",  # Iota
    "\u1feb": "\u038e",  # Upsilon
    "\u1ff9": "\u038c",  # Omicron
    "\u1ffb": "\u038f",  # Omega
}

# === 2. Diphthong component sets ===
# A pair is recognised when the first character is in a "first" set and the
# character right after it is in the matching "second" set.

# Plain vowels that can be followed by an upsilon.
diphth_y = set('αεηο')
# Upsilon variants accepted as the second half of a pair.
upsilon_forms = set(
    'υ'      # bare
    'ύὺῦ'    # accent only
    'ὐὔὒὖ'   # smooth breathing, with or without accent
    'ὑὕὓὗ'   # rough breathing, with or without accent
)

# Plain vowels that can be followed by an iota.
diphth_i = set('αεου')
# Iota variants accepted as the second half of a pair.
iota_forms = set(
    'ι'      # bare
    'ίὶῖ'    # accent only
    'ἰἴἲἶ'   # smooth breathing, with or without accent
    'ἱἵἳἷ'   # rough breathing, with or without accent
)

# Iota subscript/adscript combinations: any alpha/eta/omega form followed by
# a plain iota.
adscr_i_first = set(
    'αάὰᾶἀἄἂἆἁἅἃἇ'   # alpha forms
    'ηήὴῆἠἤἢἦἡἥἣἧ'   # eta forms
    'ωώὼῶὠὤὢὦὡὥὣὧ'   # omega forms
)
adscr_i_second = set('ι')

# === 3. Word processor: expansion and diphthong merging ===

def process_word(word):
    """
    Split a Greek word into tokens: single letters and merged vowel pairs.

    The word is first rewritten character by character: ζ, ξ and ψ are
    split into two letters (δσ, κσ, πσ), final sigma ς becomes σ, and
    rough-breathing ῥ becomes ρ. The resulting letters are then scanned
    left to right, and two adjacent letters are joined into one token when
    they form an upsilon pair, an iota pair, or an alpha/eta/omega form
    followed by plain iota.

    Args:
        word (str): A lowercase Greek word.

    Returns:
        list of str: Tokens in order; each is one character or a merged
            two-character pair.
    """
    # Characters rewritten before pairing; everything else passes through.
    substitutions = {
        'ζ': ('δ', 'σ'),
        'ς': ('σ',),
        'ῥ': ('ρ',),
        'ξ': ('κ', 'σ'),
        'ψ': ('π', 'σ'),
    }
    letters = [piece for ch in word for piece in substitutions.get(ch, (ch,))]

    tokens = []
    total = len(letters)
    pos = 0
    while pos < total:
        first = letters[pos]
        # '' belongs to none of the sets, so the last letter never pairs.
        second = letters[pos + 1] if pos + 1 < total else ''

        is_pair = (
            (first in diphth_y and second in upsilon_forms)
            or (first in diphth_i and second in iota_forms)
            or (first in adscr_i_first and second in adscr_i_second)
        )
        if is_pair:
            tokens.append(first + second)
            pos += 2
        else:
            tokens.append(first)
            pos += 1

    return tokens

# === 4. Accent Normalization ===

def replace_oxia_with_tonos(text):
    """
    Return *text* with every character listed in OXIA_TO_TONOS swapped for
    its mapped replacement; all other characters are copied unchanged.

    Args:
        text (str): Input Greek string.

    Returns:
        str: The string after applying the oxia → tonos table.
    """
    normalized = [
        OXIA_TO_TONOS[ch] if ch in OXIA_TO_TONOS else ch
        for ch in text
    ]
    return ''.join(normalized)

# === 5. Full Preprocessor ===

def preprocess_greek_line(line):
    """
    Normalize, extract, and tokenize a line of Greek text.

    Steps:
    1. Lowercase the line.
    2. Normalize oxia to tonos.
    3. Extract valid Greek words and discard punctuation.
    4. Expand compound characters and merge diphthongs.
    5. Flatten the tokens across all words.

    Args:
        line (str): A full Greek sentence or phrase.

    Returns:
        list of str: A flat list of tokens (letters or diphthongs).
    """
    # Lowercase BEFORE normalizing accents. str.lower() maps a capital vowel
    # with oxia (e.g. U+1FBB) to the lowercase vowel with oxia (U+1F71), so
    # lowercasing afterwards would re-introduce oxia characters that the word
    # pattern below does not contain, splitting the word at that letter.
    line = replace_oxia_with_tonos(line.lower())

    # Keep only runs of Greek letters (ignore punctuation, numbers, etc.).
    words = re.findall(
        r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
        r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
        r"ἐἑἒἓἔἕἘἙἜἝ"
        r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
        r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
        r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
        r"ὐὑὒὓὔὕὖὗὙὛὝ"
        r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
        r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
        r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
        r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
        r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
        line,
    )

    # Tokenize each word and flatten the per-word token lists in order.
    return [token for word in words for token in process_word(word)]