# Provenance (repository upload metadata, preserved as comments):
# uploaded by thomascerniglia — "Upload 8 files" (commit d0326ea, verified)
from __future__ import annotations
import unicodedata
from collections import Counter
from typing import Any, Dict, List, Mapping, Tuple
from .normalization import sigma_normalize, strip_greek_diacritics
# Common Greek connective/discourse particles (diacritic-stripped, sigma-normalized).
PARTICLES: Tuple[str, ...] = ("μεν", "δε", "γαρ", "τε", "δη", "ουν")
# Noun/adjective endings tracked individually; the iota-subscript ending "ᾳ"
# is detected separately on the un-stripped token (see extract_features).
ENDINGS_PLAIN: Tuple[str, ...] = ("οι", "αι", "ηι", "οισι")
# Infinitive endings (high-signal morphology when present).
# These are matched on diacritic-stripped, sigma-normalized tokens.
INFINITIVE_ENDINGS_PLAIN: Tuple[str, ...] = (
    "ειν",  # common Attic/Ionic/Koine infinitive
    "μεναι",  # Aeolic-style infinitive
    "μεν",  # Doric/Aeolic-style infinitive
)
# A few additional, high-signal Homeric / epic-Ionic patterns (MVP).
# Matched on diacritic-stripped tokens.
EPIC_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οιο",  # e.g., Ἠελίοιο
    "φι",  # e.g., -φι instrumental
    "εσσι",  # -εσσι(ν)
    # Epic/Ionic genitive (sigma-normalized): -ηος (e.g., Ἀχιλῆος -> αχιληοσ)
    "ηοσ",
    # Epic patronymic genitive (e.g., Πηληϊάδεω, Ἀτρεΐδεω)
    "αδεω",
    "ιδεω",
)
# Dative plural patterns (useful for Ionic/Epic vs Attic/Koine tendencies).
# Matched on diacritic-stripped tokens.
DATIVE_PLURAL_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οισι",
    "ηισι",
    "αισι",
    "οις",
    "αις",
)
# Epic particles (very small MVP subset; diacritics stripped and sigma-normalized).
EPIC_PARTICLES_PLAIN: Tuple[str, ...] = (
    "κε",
    "κεν",
    # Very common Homeric particle (often written ἄρ/ἄρ᾽)
    "αρ",
    # Homeric/epic pronoun form
    "μιν",
)
# A few very common Homeric-vocabulary tokens (NOT dialect-specific in isolation).
# We only treat these as weak epic-Ionic evidence when multiple hits occur.
EPIC_WORDS_PLAIN: Tuple[str, ...] = (
    "εννεπε",
    "αειδε",
    "μουσα",
    "μηνιν",
    "θεα",
)
# Very small lexicalized Attic-vs-Ionic spelling cues (MVP).
# These are substring-based to catch inflectional variants.
ATTIC_TT_STEMS: Tuple[str, ...] = (
    "θαλαττ",  # θάλαττα
    "γλωττ",  # γλῶττα
    "πραττ",  # πράττω
    "ταττ",  # τάττω
)
IONIC_SS_STEMS: Tuple[str, ...] = (
    "θαλασσ",  # θάλασσα
    "γλωσσ",  # γλῶσσα
    "πρασσ",  # πράσσω
    "τασσ",  # τάσσω
)
# Preposition preference (edition-dependent but often helpful): εἰς vs ἐς.
PREPOSITIONS_PLAIN: Tuple[str, ...] = (
    # NOTE: these are *sigma-normalized* (final ς -> σ)
    "εισ",
    "εσ",
)
# Koine-leaning function words (very small MVP set; genre-sensitive).
# These should be low-weight, positive-only cues.
KOINE_FUNCTION_WORDS_PLAIN: Tuple[str, ...] = (
    "ινα",
    "οτι",
    # NOTE: sigma-normalized
    "καθωσ",
    # NT-style narrative formula is common in Koine
    "εγενετο",
)
# Literary/poetic morphology cues.
# - Doric 1pl active ending often appears as -μες (vs -μεν).
# - Aeolic pronoun forms like ἄμμι/ὔμμι are strong when they occur.
POETIC_MORPH_CUES: Tuple[str, ...] = (
    "verb_1pl_mes",
    "aeolic_ammi",
    "aeolic_ummi",
)
def _ends_with_iota_subscript_cluster(token: str, base_letter: str) -> bool:
"""True if token ends with base_letter + iota-subscript (any accents allowed)."""
if not token:
return False
decomposed = unicodedata.normalize("NFD", token)
i = len(decomposed) - 1
saw_ypogegrammeni = False
while i >= 0 and unicodedata.combining(decomposed[i]):
if decomposed[i] == "\u0345":
saw_ypogegrammeni = True
i -= 1
if i < 0:
return False
base = decomposed[i]
return base == base_letter and saw_ypogegrammeni
def extract_features(tokens: List[str]) -> Dict[str, Any]:
    """Extract interpretable linguistic feature counts from tokens.

    Args:
        tokens: Word tokens, expected to be (mostly) polytonic Greek text.
            Empty tokens are skipped.

    Returns:
        Nested dict of raw integer counts keyed by feature family
        (particles, endings, infinitives, epic cues, orthography, script
        evidence, ...). Every expected key is pre-seeded with 0 so
        downstream consumers never hit a missing key.
    """
    token_count = len(tokens)
    # Pre-seed all counters with 0 so every key is present in the output.
    particles = Counter({p: 0 for p in PARTICLES})
    endings = Counter({e: 0 for e in (*ENDINGS_PLAIN, "ᾳ")})
    infinitives = Counter({e: 0 for e in INFINITIVE_ENDINGS_PLAIN})
    epic_endings = Counter({e: 0 for e in EPIC_ENDINGS_PLAIN})
    dative_plural_endings = Counter({e: 0 for e in DATIVE_PLURAL_ENDINGS_PLAIN})
    epic_particles = Counter({p: 0 for p in EPIC_PARTICLES_PLAIN})
    epic_words = Counter({w: 0 for w in EPIC_WORDS_PLAIN})
    prepositions = Counter({p: 0 for p in PREPOSITIONS_PLAIN})
    koine_words = Counter({w: 0 for w in KOINE_FUNCTION_WORDS_PLAIN})
    lexical_cues = Counter(
        {
            "attic_tt": 0,
            "ionic_ss": 0,
        }
    )
    # Mild Doric cue: initial rough-breathed alpha (e.g., ἁ as article in Doric).
    doric_ha_initial = 0
    poetic_morph = Counter({k: 0 for k in POETIC_MORPH_CUES})
    # Orthographic patterns
    tt_count = 0
    ss_count = 0
    alpha_endings = 0
    eta_endings = 0
    # Script evidence: helps detect non-Greek input or encoding issues.
    greek_alpha_chars = 0
    alpha_chars = 0
    for tok in tokens:
        if not tok:
            continue
        # Character-level script evidence: which alphabetic characters fall in
        # the Greek and Coptic (U+0370-03FF) or Greek Extended (U+1F00-1FFF)
        # Unicode blocks.
        for ch in tok:
            if not ch.isalpha():
                continue
            alpha_chars += 1
            code = ord(ch)
            if (0x0370 <= code <= 0x03FF) or (0x1F00 <= code <= 0x1FFF):
                greek_alpha_chars += 1
        # Canonical matching form: diacritics stripped, final ς normalized to σ.
        plain = sigma_normalize(strip_greek_diacritics(tok))
        # Doric 1pl -μες (sigma-normalized: -μεσ).
        # Guard against counting very short tokens.
        if len(plain) >= 5 and plain.endswith("μεσ"):
            poetic_morph["verb_1pl_mes"] += 1
        # Aeolic pronoun forms (very high signal).
        if plain == "αμμι":
            poetic_morph["aeolic_ammi"] += 1
        if plain == "υμμι":
            poetic_morph["aeolic_ummi"] += 1
        # Doric cue: token begins with alpha + rough breathing.
        # This is intentionally weak; lots of words can have rough breathing.
        nfd = unicodedata.normalize("NFD", tok)
        if nfd:
            base0 = nfd[0]
            # Collect leading combining marks
            j = 1
            has_rough = False
            while j < len(nfd) and unicodedata.combining(nfd[j]):
                # COMBINING REVERSED COMMA ABOVE (rough breathing)
                if nfd[j] == "\u0314":
                    has_rough = True
                j += 1
            if base0 == "α" and has_rough:
                doric_ha_initial += 1
        # Count orthographic patterns (occurrences, not just token presence)
        tt_count += plain.count("ττ")
        ss_count += plain.count("σσ")
        # Whole-token lexical matches (Counter membership == key presence).
        if plain in particles:
            particles[plain] += 1
        if plain in epic_particles:
            epic_particles[plain] += 1
        if plain in epic_words:
            epic_words[plain] += 1
        if plain in prepositions:
            prepositions[plain] += 1
        if plain in koine_words:
            koine_words[plain] += 1
        # Lexicalized Attic/Ionic cues
        if any(stem in plain for stem in ATTIC_TT_STEMS):
            lexical_cues["attic_tt"] += 1
        if any(stem in plain for stem in IONIC_SS_STEMS):
            lexical_cues["ionic_ss"] += 1
        for ending in ENDINGS_PLAIN:
            if plain.endswith(ending):
                endings[ending] += 1
        # Infinitive endings (prefer longer endings first to avoid double-counting)
        # Guard against short function words like the particle "μεν".
        if len(plain) >= 5:
            if plain.endswith("μεναι"):
                infinitives["μεναι"] += 1
            elif plain.endswith("ειν"):
                infinitives["ειν"] += 1
            elif plain.endswith("μεν"):
                infinitives["μεν"] += 1
        for ending in EPIC_ENDINGS_PLAIN:
            if plain.endswith(ending):
                epic_endings[ending] += 1
        for ending in DATIVE_PLURAL_ENDINGS_PLAIN:
            if plain.endswith(ending):
                dative_plural_endings[ending] += 1
        # Iota-subscript dative cue is checked on the *original* token, since
        # diacritic stripping removes the ypogegrammeni.
        if _ends_with_iota_subscript_cluster(tok, "α"):
            endings["ᾳ"] += 1
        # Broad alpha- vs eta-final tendencies.
        # FIX: `plain` is sigma-normalized (final ς -> σ), so the genitive
        # endings must be spelled with σ; the previous final-sigma spellings
        # ("ας", "ης") could never match and silently dropped -ας/-ης forms.
        if plain.endswith(("α", "ασ", "αν")):
            alpha_endings += 1
        if plain.endswith(("η", "ησ", "ην")):
            eta_endings += 1
    return {
        "token_count": token_count,
        "particles": dict(particles),
        "endings": dict(endings),
        "infinitives": dict(infinitives),
        "epic_endings": dict(epic_endings),
        "dative_plural_endings": dict(dative_plural_endings),
        "epic_particles": dict(epic_particles),
        "epic_words": dict(epic_words),
        "prepositions": dict(prepositions),
        "koine_words": dict(koine_words),
        "lexical_cues": dict(lexical_cues),
        "patterns": {
            "tt": tt_count,
            "ss": ss_count,
        },
        "orthography": {
            "alpha_endings": alpha_endings,
            "eta_endings": eta_endings,
        },
        "script": {
            "greek_alpha_chars": greek_alpha_chars,
            "alpha_chars": alpha_chars,
        },
        "doric_cues": {
            "ha_initial": doric_ha_initial,
        },
        "poetic_morph": dict(poetic_morph),
    }
def rate_per_100(count: int, token_count: int) -> float:
    """Scale a raw count to a per-100-token rate; 0.0 when there are no tokens."""
    # Keep the exact original expression shape so float results are identical.
    return 100.0 * (count / token_count) if token_count > 0 else 0.0
def _rate_map(
    counts: Mapping[str, int], keys: Tuple[str, ...], token_count: int
) -> Dict[str, float]:
    """Per-100-token rate for each key in *keys*; missing/None counts become 0."""
    return {k: rate_per_100(int(counts.get(k, 0) or 0), token_count) for k in keys}


def compute_rates(feature_dict: Mapping[str, Any]) -> Dict[str, Any]:
    """Compute per-100-token rates from feature counts.

    Args:
        feature_dict: Output of ``extract_features``; missing or None-valued
            families are tolerated and treated as all-zero.

    Returns:
        Dict mirroring each count family as a ``*_per_100`` rate map, plus
        scalar alpha/eta/marked-ending rates. Also includes
        ``epic_words_per_100`` — ``extract_features`` counts epic vocabulary,
        which previously was never rated here.
    """
    token_count = int(feature_dict.get("token_count", 0) or 0)

    def family(key: str) -> Mapping[str, int]:
        # Tolerate absent or None-valued sub-dicts.
        return feature_dict.get(key, {}) or {}

    particle_rates = _rate_map(family("particles"), PARTICLES, token_count)
    ending_rates = _rate_map(family("endings"), (*ENDINGS_PLAIN, "ᾳ"), token_count)
    infinitive_rates = _rate_map(family("infinitives"), INFINITIVE_ENDINGS_PLAIN, token_count)
    pattern_rates = _rate_map(family("patterns"), ("tt", "ss"), token_count)
    epic_particle_rates = _rate_map(family("epic_particles"), EPIC_PARTICLES_PLAIN, token_count)
    epic_ending_rates = _rate_map(family("epic_endings"), EPIC_ENDINGS_PLAIN, token_count)
    dative_plural_ending_rates = _rate_map(
        family("dative_plural_endings"), DATIVE_PLURAL_ENDINGS_PLAIN, token_count
    )
    preposition_rates = _rate_map(family("prepositions"), PREPOSITIONS_PLAIN, token_count)
    koine_word_rates = _rate_map(family("koine_words"), KOINE_FUNCTION_WORDS_PLAIN, token_count)
    # NEW (backward-compatible): rate the epic-vocabulary counts too.
    epic_word_rates = _rate_map(family("epic_words"), EPIC_WORDS_PLAIN, token_count)
    lexical_cue_rates = _rate_map(family("lexical_cues"), ("attic_tt", "ionic_ss"), token_count)
    doric_cue_rates = _rate_map(family("doric_cues"), ("ha_initial",), token_count)
    poetic_morph_rates = _rate_map(family("poetic_morph"), POETIC_MORPH_CUES, token_count)

    orth = family("orthography")
    alpha_rate = rate_per_100(int(orth.get("alpha_endings", 0) or 0), token_count)
    eta_rate = rate_per_100(int(orth.get("eta_endings", 0) or 0), token_count)
    # "Marked" = the dialectally distinctive dative endings taken together.
    marked_rate = ending_rates.get("οισι", 0.0) + ending_rates.get("ηι", 0.0) + ending_rates.get("ᾳ", 0.0)

    return {
        "particles_per_100": particle_rates,
        "endings_per_100": ending_rates,
        "infinitives_per_100": infinitive_rates,
        "patterns_per_100": pattern_rates,
        "epic_particles_per_100": epic_particle_rates,
        "epic_endings_per_100": epic_ending_rates,
        "dative_plural_endings_per_100": dative_plural_ending_rates,
        "prepositions_per_100": preposition_rates,
        "koine_words_per_100": koine_word_rates,
        "epic_words_per_100": epic_word_rates,
        "lexical_cues_per_100": lexical_cue_rates,
        "doric_cues_per_100": doric_cue_rates,
        "poetic_morph_per_100": poetic_morph_rates,
        "alpha_endings_per_100": alpha_rate,
        "eta_endings_per_100": eta_rate,
        "marked_endings_per_100": marked_rate,
    }