# preprocess.py
import re
# === 1. Oxia → Tonos replacements ===
# Map the seven precomposed lowercase oxia code points (Greek Extended block)
# to their visually identical tonos equivalents (Greek and Coptic block)
OXIA_TO_TONOS = {
"ά": "ά", # U+1F71 → U+03AC (alpha)
"έ": "έ", # U+1F73 → U+03AD (epsilon)
"ή": "ή", # U+1F75 → U+03AE (eta)
"ί": "ί", # U+1F77 → U+03AF (iota)
"ύ": "ύ", # U+1F7B → U+03CD (upsilon)
"ό": "ό", # U+1F79 → U+03CC (omicron)
"ώ": "ώ", # U+1F7D → U+03CE (omega)
}
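# Oxia and tonos forms render identically; only the code points differ. A
# quick illustration using the table above:
#
#     >>> "\u1f71" == "\u03ac"               # alpha + oxia vs. alpha + tonos
#     False
#     >>> OXIA_TO_TONOS["\u1f71"] == "\u03ac"
#     True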
# === 2. Diphthong component sets ===
# First elements of the υ-diphthongs (αυ, ευ, ηυ, ου): bare vowels only, since
# in a diphthong any breathing or accent sits on the second element.
diphth_y = {'α', 'ε', 'η', 'ο'}
# Second elements: υ in its various breathing/accent combinations.
upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}
# First elements of the ι-diphthongs (αι, ει, οι, υι), again bare vowels only.
diphth_i = {'α', 'ε', 'ο', 'υ'}
# Second elements: ι in its various breathing/accent combinations.
iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}
# Iota adscript combinations: long vowels (α, η, ω in their various
# breathing/accent combinations) that can take a following adscript ι
adscr_i_first = {
    'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ',
    'ᾶ','ῆ','ῶ','ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ',
    'ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'
}
adscr_i_second = {'ι'}
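# Note: these sets only catch adscript iota written as its own code point
# (two-character sequences such as "ωι" or "ἧι"); precomposed subscript forms
# such as "ῳ" (U+1FF3) are single characters and pass through process_word
# unchanged.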
# === 3. Word processor: expansion and diphthong merging ===
def process_word(word):
"""
Expand special Greek letters and merge diphthongs.
Args:
word (str): A lowercase Greek word.
Returns:
list of str: A list of tokens (letters or diphthongs).
"""
expanded = []
    # Step 1: expand double consonants (ζ → δσ, ξ → κσ, ψ → πσ) and
    # normalize final sigma (ς → σ) and rough-breathing rho (ῥ → ρ)
for char in word:
if char == 'ζ':
expanded.extend(['δ', 'σ'])
elif char == 'ς':
expanded.append('σ')
elif char == 'ῥ':
expanded.append('ρ')
elif char == 'ξ':
expanded.extend(['κ', 'σ'])
elif char == 'ψ':
expanded.extend(['π', 'σ'])
else:
expanded.append(char)
# Step 2: Merge diphthongs and adscript combinations
combined = []
i = 0
while i < len(expanded):
a = expanded[i]
b = expanded[i+1] if i + 1 < len(expanded) else ''
if a in diphth_y and b in upsilon_forms:
combined.append(a + b)
i += 2
elif a in diphth_i and b in iota_forms:
combined.append(a + b)
i += 2
elif a in adscr_i_first and b in adscr_i_second:
combined.append(a + b)
i += 2
else:
combined.append(a)
i += 1
return combined
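# A usage sketch; the expected outputs below were traced by hand from the
# rules above:
#
#     >>> process_word("αὐτός")
#     ['αὐ', 'τ', 'ό', 'σ']        # ς → σ; α + ὐ merged as a diphthong
#     >>> process_word("ψυχή")
#     ['π', 'σ', 'υ', 'χ', 'ή']    # ψ expanded to π + σ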
# === 4. Accent Normalization ===
def replace_oxia_with_tonos(text):
"""
Replace oxia accents in text with tonos equivalents using Unicode mapping.
Args:
text (str): Input Greek string.
Returns:
str: Normalized string with tonos accents.
"""
return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text)
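# An aside: Unicode NFC normalization performs the same folding for these
# characters, since each oxia code point has a singleton canonical
# decomposition to its tonos counterpart. A sketch of the equivalent
# stdlib-only call:
#
#     import unicodedata
#     text = unicodedata.normalize("NFC", text)
#
# A possible reason to keep the explicit table is that it touches exactly
# these seven lowercase forms, whereas NFC recomposes the whole string.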
# === 5. Full Preprocessor ===
def preprocess_greek_line(line):
"""
Normalize, extract, and tokenize a line of Greek text.
Steps:
1. Normalize oxia to tonos.
2. Extract valid Greek words and discard punctuation.
3. Expand compound characters and merge diphthongs.
4. Flatten the tokens across all words.
Args:
line (str): A full Greek sentence or phrase.
Returns:
list of str: A flat list of tokens (letters or diphthongs).
"""
# Step 1: Replace oxia with tonos
line = replace_oxia_with_tonos(line)
    # Step 2: keep only runs of Greek letters; punctuation, digits, Latin
    # text, etc. act as word separators. (The line is lowercased first, so
    # the capital letters in the class below are redundant but harmless.)
words = re.findall(
r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
r"ἐἑἒἓἔἕἘἙἜἝ"
r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
r"ὐὑὒὓὔὕὖὗὙὛὝ"
r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
line.lower()
)
# Step 3: Tokenize each word using expansion rules
token_lists = [process_word(word) for word in words]
    # Step 4: Flatten token lists across all words
    tokens = [token for word_tokens in token_lists for token in word_tokens]
return tokens
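if __name__ == "__main__":
    # Minimal smoke test on an assumed sample line (Odyssey 1.1); it prints
    # the flat token list so the expansion and merging rules can be eyeballed.
    sample = "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον"
    print(preprocess_greek_line(sample))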