# preprocess.py
import re
# === 1. Oxia → Tonos replacements ===
# Legacy polytonic "oxia" codepoints (Greek Extended, U+1F71..U+1F7D) map to
# the canonical modern "tonos" codepoints (U+03AC..U+03CE). Built from
# explicit codepoints so the two visually identical accents cannot be
# confused or silently normalized by an editor.
OXIA_TO_TONOS = {
    chr(0x1F71): chr(0x03AC),  # alpha with oxia → alpha with tonos
    chr(0x1F73): chr(0x03AD),  # epsilon with oxia → epsilon with tonos
    chr(0x1F75): chr(0x03AE),  # eta with oxia → eta with tonos
    chr(0x1F77): chr(0x03AF),  # iota with oxia → iota with tonos
    chr(0x1F7B): chr(0x03CD),  # upsilon with oxia → upsilon with tonos
    chr(0x1F79): chr(0x03CC),  # omicron with oxia → omicron with tonos
    chr(0x1F7D): chr(0x03CE),  # omega with oxia → omega with tonos
}
# === 2. Diphthong component sets ===
# First-position vowels that form a diphthong with a following upsilon.
diphth_y = {'α', 'ε', 'η', 'ο'}
# All polytonic forms of upsilon accepted in second position.
upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}
# First-position vowels that form a diphthong with a following iota.
diphth_i = {'α', 'ε', 'ο', 'υ'}
# All polytonic forms of iota accepted in second position.
iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}
# Long vowels (with any breathing/accent) that take an adscript iota.
adscr_i_first = {
    'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ',
    'ᾶ','ῆ','ῶ','ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ',
    'ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'
}
adscr_i_second = {'ι'}
# === 3. Word processor: expansion and diphthong merging ===
def process_word(word):
    """
    Expand special Greek letters and merge diphthongs into single tokens.

    Args:
        word (str): A lowercase Greek word.

    Returns:
        list of str: Tokens — single letters or merged two-letter diphthong
        / adscript-iota combinations.
    """
    # Per-character expansion table: double consonants decompose into their
    # component letters; final sigma and rho-with-dasia normalize to the
    # plain letter. Anything else passes through unchanged.
    substitutions = {
        'ζ': ['δ', 'σ'],
        'ξ': ['κ', 'σ'],
        'ψ': ['π', 'σ'],
        'ς': ['σ'],
        'ῥ': ['ρ'],
    }
    letters = []
    for ch in word:
        letters.extend(substitutions.get(ch, [ch]))
    # Pairwise merge pass: a letter joins its successor whenever the pair
    # forms an upsilon-diphthong, an iota-diphthong, or a vowel + adscript
    # iota; merged pairs consume both positions.
    tokens = []
    pos = 0
    total = len(letters)
    while pos < total:
        cur = letters[pos]
        nxt = letters[pos + 1] if pos + 1 < total else ''
        mergeable = (
            (cur in diphth_y and nxt in upsilon_forms)
            or (cur in diphth_i and nxt in iota_forms)
            or (cur in adscr_i_first and nxt in adscr_i_second)
        )
        if mergeable:
            tokens.append(cur + nxt)
            pos += 2
        else:
            tokens.append(cur)
            pos += 1
    return tokens
# === 4. Accent Normalization ===
def replace_oxia_with_tonos(text):
    """
    Return *text* with every legacy oxia accent swapped for its tonos twin.

    Args:
        text (str): Input Greek string.

    Returns:
        str: Normalized string with tonos accents.
    """
    # str.translate applies the whole OXIA_TO_TONOS mapping in one
    # C-level pass; all keys are single characters, so the result is
    # identical to a per-character dict lookup.
    return text.translate(str.maketrans(OXIA_TO_TONOS))
# === 5. Full Preprocessor ===
def preprocess_greek_line(line):
    """
    Normalize, extract, and tokenize a line of Greek text.

    Steps:
        1. Normalize oxia accents to tonos.
        2. Extract runs of Greek letters, discarding punctuation/digits.
        3. Expand compound characters and merge diphthongs (process_word).
        4. Flatten the per-word token lists into one flat list.

    Args:
        line (str): A full Greek sentence or phrase.

    Returns:
        list of str: A flat list of tokens (letters or diphthongs).
    """
    # Step 1: Replace oxia with tonos
    line = replace_oxia_with_tonos(line)
    # Step 2: Extract only Greek characters (ignore punctuation, numbers,
    # etc.). The character class enumerates the monotonic alphabet plus
    # precomposed polytonic (breathing/accent/iota-subscript) forms.
    # NOTE(review): .lower() runs AFTER the oxia→tonos pass, so uppercase
    # oxia forms would lowercase into oxia characters at this point —
    # confirm such input cannot occur, or lowercase before normalizing.
    words = re.findall(
        r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
        r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
        r"ἐἑἒἓἔἕἘἙἜἝ"
        r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
        r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
        r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
        r"ὐὑὒὓὔὕὖὗὙὛὝ"
        r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
        r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
        r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
        r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
        r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
        line.lower()
    )
    # Step 3: Tokenize each word using expansion rules
    token_lists = [process_word(word) for word in words]
    # Step 4: Flatten token lists across all words.
    # NOTE: the comprehension reuses the name `tokens` as its inner loop
    # variable; in Python 3 that variable is comprehension-local, so the
    # shadowing is harmless (though worth renaming for clarity).
    tokens = [token for tokens in token_lists for token in tokens]
    return tokens