# preprocess.py
import re
# === 1. Oxia → Tonos replacements ===
# Map the seven precomposed lowercase oxia code points (Greek Extended block)
# to their visually identical tonos equivalents (Greek and Coptic block)
OXIA_TO_TONOS = {
"ά": "ά", # U+1F71 → U+03AC (alpha)
"έ": "έ", # U+1F73 → U+03AD (epsilon)
"ή": "ή", # U+1F75 → U+03AE (eta)
"ί": "ί", # U+1F77 → U+03AF (iota)
"ύ": "ύ", # U+1F7B → U+03CD (upsilon)
"ό": "ό", # U+1F79 → U+03CC (omicron)
"ώ": "ώ", # U+1F7D → U+03CE (omega)
}
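# Oxia and tonos forms render identically; only the code points differ. A
# quick illustration using the table above:
#
#     >>> "\u1f71" == "\u03ac"               # alpha + oxia vs. alpha + tonos
#     False
#     >>> OXIA_TO_TONOS["\u1f71"] == "\u03ac"
#     True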
# === 2. Diphthong component sets ===
# First elements of the υ-diphthongs (αυ, ευ, ηυ, ου): bare vowels only, since
# in a diphthong any breathing or accent sits on the second element.
diphth_y = {'α', 'ε', 'η', 'ο'}
# Second elements: υ in its various breathing/accent combinations.
upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}
# First elements of the ι-diphthongs (αι, ει, οι, υι), again bare vowels only.
diphth_i = {'α', 'ε', 'ο', 'υ'}
# Second elements: ι in its various breathing/accent combinations.
iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}
# Iota adscript combinations: long vowels (α, η, ω in their various
# breathing/accent combinations) that can take a following adscript ι
adscr_i_first = {
    'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ',
    'ᾶ','ῆ','ῶ','ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ',
    'ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'
}
adscr_i_second = {'ι'}
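# Note: these sets only catch adscript iota written as its own code point
# (two-character sequences such as "ωι" or "ἧι"); precomposed subscript forms
# such as "ῳ" (U+1FF3) are single characters and pass through process_word
# unchanged.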
# === 3. Word processor: expansion and diphthong merging ===
def process_word(word):
"""
Expand special Greek letters and merge diphthongs.
Args:
word (str): A lowercase Greek word.
Returns:
list of str: A list of tokens (letters or diphthongs).
"""
expanded = []
    # Step 1: expand double consonants (ζ → δσ, ξ → κσ, ψ → πσ) and
    # normalize final sigma (ς → σ) and rough-breathing rho (ῥ → ρ)
for char in word:
if char == 'ζ':
expanded.extend(['δ', 'σ'])
elif char == 'ς':
expanded.append('σ')
elif char == 'ῥ':
expanded.append('ρ')
elif char == 'ξ':
expanded.extend(['κ', 'σ'])
elif char == 'ψ':
expanded.extend(['π', 'σ'])
else:
expanded.append(char)
# Step 2: Merge diphthongs and adscript combinations
combined = []
i = 0
while i < len(expanded):
a = expanded[i]
b = expanded[i+1] if i + 1 < len(expanded) else ''
if a in diphth_y and b in upsilon_forms:
combined.append(a + b)
i += 2
elif a in diphth_i and b in iota_forms:
combined.append(a + b)
i += 2
elif a in adscr_i_first and b in adscr_i_second:
combined.append(a + b)
i += 2
else:
combined.append(a)
i += 1
return combined
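# A usage sketch; the expected outputs below were traced by hand from the
# rules above:
#
#     >>> process_word("αὐτός")
#     ['αὐ', 'τ', 'ό', 'σ']        # ς → σ; α + ὐ merged as a diphthong
#     >>> process_word("ψυχή")
#     ['π', 'σ', 'υ', 'χ', 'ή']    # ψ expanded to π + σ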
# === 4. Accent Normalization ===
def replace_oxia_with_tonos(text):
"""
Replace oxia accents in text with tonos equivalents using Unicode mapping.
Args:
text (str): Input Greek string.
Returns:
str: Normalized string with tonos accents.
"""
return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text)
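# An aside: Unicode NFC normalization performs the same folding for these
# characters, since each oxia code point has a singleton canonical
# decomposition to its tonos counterpart. A sketch of the equivalent
# stdlib-only call:
#
#     import unicodedata
#     text = unicodedata.normalize("NFC", text)
#
# A possible reason to keep the explicit table is that it touches exactly
# these seven lowercase forms, whereas NFC recomposes the whole string.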
# === 5. Full Preprocessor ===
def preprocess_greek_line(line):
"""
Normalize, extract, and tokenize a line of Greek text.
Steps:
1. Normalize oxia to tonos.
2. Extract valid Greek words and discard punctuation.
3. Expand compound characters and merge diphthongs.
4. Flatten the tokens across all words.
Args:
line (str): A full Greek sentence or phrase.
Returns:
list of str: A flat list of tokens (letters or diphthongs).
"""
# Step 1: Replace oxia with tonos
line = replace_oxia_with_tonos(line)
    # Step 2: keep only runs of Greek letters; punctuation, digits, Latin
    # text, etc. act as word separators. (The line is lowercased first, so
    # the capital letters in the class below are redundant but harmless.)
words = re.findall(
r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
r"ἐἑἒἓἔἕἘἙἜἝ"
r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
r"ὐὑὒὓὔὕὖὗὙὛὝ"
r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
line.lower()
)
# Step 3: Tokenize each word using expansion rules
token_lists = [process_word(word) for word in words]
    # Step 4: Flatten token lists across all words
    tokens = [token for word_tokens in token_lists for token in word_tokens]
return tokens
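if __name__ == "__main__":
    # Minimal smoke test on an assumed sample line (Odyssey 1.1); it prints
    # the flat token list so the expansion and merging rules can be eyeballed.
    sample = "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον"
    print(preprocess_greek_line(sample))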