Spaces:
Running
Running
| # preprocess.py | |
| import re | |
| # === 1. Oxia → Tonos replacements === | |
| # These replace legacy Greek accents with the modern Unicode tonos versions | |
| OXIA_TO_TONOS = { | |
| "ά": "ά", # U+1F71 → U+03AC (alpha) | |
| "έ": "έ", # U+1F73 → U+03AD (epsilon) | |
| "ή": "ή", # U+1F75 → U+03AE (eta) | |
| "ί": "ί", # U+1F77 → U+03AF (iota) | |
| "ύ": "ύ", # U+1F7B → U+03CD (upsilon) | |
| "ό": "ό", # U+1F79 → U+03CC (omicron) | |
| "ώ": "ώ", # U+1F7D → U+03CE (omega) | |
| } | |
| # === 2. Diphthong component sets === | |
| diphth_y = {'α', 'ε', 'η', 'ο'} | |
| upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'} | |
| diphth_i = {'α', 'ε', 'ο', 'υ'} | |
| iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'} | |
| # Iota subscript/adscript combinations | |
| adscr_i_first = { | |
| 'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ', | |
| 'ᾶ','ῆ','ῶ','ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ', | |
| 'ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ' | |
| } | |
| adscr_i_second = {'ι'} | |
| # === 3. Word processor: expansion and diphthong merging === | |
| def process_word(word): | |
| """ | |
| Expand special Greek letters and merge diphthongs. | |
| Args: | |
| word (str): A lowercase Greek word. | |
| Returns: | |
| list of str: A list of tokens (letters or diphthongs). | |
| """ | |
| expanded = [] | |
| # Step 1: Expand characters like ζ → δσ, ξ → κσ, etc. | |
| for char in word: | |
| if char == 'ζ': | |
| expanded.extend(['δ', 'σ']) | |
| elif char == 'ς': | |
| expanded.append('σ') | |
| elif char == 'ῥ': | |
| expanded.append('ρ') | |
| elif char == 'ξ': | |
| expanded.extend(['κ', 'σ']) | |
| elif char == 'ψ': | |
| expanded.extend(['π', 'σ']) | |
| else: | |
| expanded.append(char) | |
| # Step 2: Merge diphthongs and adscript combinations | |
| combined = [] | |
| i = 0 | |
| while i < len(expanded): | |
| a = expanded[i] | |
| b = expanded[i+1] if i + 1 < len(expanded) else '' | |
| if a in diphth_y and b in upsilon_forms: | |
| combined.append(a + b) | |
| i += 2 | |
| elif a in diphth_i and b in iota_forms: | |
| combined.append(a + b) | |
| i += 2 | |
| elif a in adscr_i_first and b in adscr_i_second: | |
| combined.append(a + b) | |
| i += 2 | |
| else: | |
| combined.append(a) | |
| i += 1 | |
| return combined | |
| # === 4. Accent Normalization === | |
| def replace_oxia_with_tonos(text): | |
| """ | |
| Replace oxia accents in text with tonos equivalents using Unicode mapping. | |
| Args: | |
| text (str): Input Greek string. | |
| Returns: | |
| str: Normalized string with tonos accents. | |
| """ | |
| return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text) | |
| # === 5. Full Preprocessor === | |
| def preprocess_greek_line(line): | |
| """ | |
| Normalize, extract, and tokenize a line of Greek text. | |
| Steps: | |
| 1. Normalize oxia to tonos. | |
| 2. Extract valid Greek words and discard punctuation. | |
| 3. Expand compound characters and merge diphthongs. | |
| 4. Flatten the tokens across all words. | |
| Args: | |
| line (str): A full Greek sentence or phrase. | |
| Returns: | |
| list of str: A flat list of tokens (letters or diphthongs). | |
| """ | |
| # Step 1: Replace oxia with tonos | |
| line = replace_oxia_with_tonos(line) | |
| # Step 2: Extract only Greek characters (ignore punctuation, numbers, etc.) | |
| words = re.findall( | |
| r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ" | |
| r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ" | |
| r"ἐἑἒἓἔἕἘἙἜἝ" | |
| r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ" | |
| r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ" | |
| r"ὀὁὂὃὄὅὈὉὊὋὌὍ" | |
| r"ὐὑὒὓὔὕὖὗὙὛὝ" | |
| r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ" | |
| r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ" | |
| r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ" | |
| r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ" | |
| r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+", | |
| line.lower() | |
| ) | |
| # Step 3: Tokenize each word using expansion rules | |
| token_lists = [process_word(word) for word in words] | |
| # Step 4: Flatten token lists across all words | |
| tokens = [token for tokens in token_lists for token in tokens] | |
| return tokens |