# Packaging-ready functions for grc_core_web_sm
#
# This file is included in the wheel via `spacy package --code`.
# It loads the lookup table from vocab.lookups (embedded in the model)
# instead of from a filesystem path.

from __future__ import annotations

import string
import unicodedata
from typing import Dict

from spacy.language import Language
from spacy.tokens import Doc, Token


# ----- Greek accent normalization (inline, no external dependency) ----- #

# Precomposed polytonic Greek characters carrying a grave (VARIA), each mapped
# to the same character carrying an acute (OXIA) instead.
_GRAVE_TO_ACUTE: Dict[str, str] = {
    # Alpha with breathing
    "\u1F02": "\u1F04",
    "\u1F03": "\u1F05",
    "\u1F0A": "\u1F0C",
    "\u1F0B": "\u1F0D",
    # Epsilon
    "\u1F12": "\u1F14",
    "\u1F13": "\u1F15",
    "\u1F1A": "\u1F1C",
    "\u1F1B": "\u1F1D",
    # Eta
    "\u1F22": "\u1F24",
    "\u1F23": "\u1F25",
    "\u1F2A": "\u1F2C",
    "\u1F2B": "\u1F2D",
    # Iota
    "\u1F32": "\u1F34",
    "\u1F33": "\u1F35",
    "\u1F3A": "\u1F3C",
    "\u1F3B": "\u1F3D",
    # Omicron
    "\u1F42": "\u1F44",
    "\u1F43": "\u1F45",
    "\u1F4A": "\u1F4C",
    "\u1F4B": "\u1F4D",
    # Upsilon
    "\u1F52": "\u1F54",
    "\u1F53": "\u1F55",
    "\u1F5B": "\u1F5D",
    # Omega
    "\u1F62": "\u1F64",
    "\u1F63": "\u1F65",
    "\u1F6A": "\u1F6C",
    "\u1F6B": "\u1F6D",
    # Simple vowel + varia
    "\u1F70": "\u1F71",
    "\u1F72": "\u1F73",
    "\u1F74": "\u1F75",
    "\u1F76": "\u1F77",
    "\u1F78": "\u1F79",
    "\u1F7A": "\u1F7B",
    "\u1F7C": "\u1F7D",
    # With iota subscript
    "\u1F82": "\u1F84",
    "\u1F83": "\u1F85",
    "\u1F8A": "\u1F8C",
    "\u1F8B": "\u1F8D",
    "\u1F92": "\u1F94",
    "\u1F93": "\u1F95",
    "\u1F9A": "\u1F9C",
    "\u1F9B": "\u1F9D",
    "\u1FA2": "\u1FA4",
    "\u1FA3": "\u1FA5",
    "\u1FAA": "\u1FAC",
    "\u1FAB": "\u1FAD",
    "\u1FB2": "\u1FB4",
    "\u1FC2": "\u1FC4",
    "\u1FF2": "\u1FF4",
    # Capitals
    "\u1FBA": "\u1FBB",
    "\u1FC8": "\u1FC9",
    "\u1FCA": "\u1FCB",
    "\u1FDA": "\u1FDB",
    "\u1FEA": "\u1FEB",
    "\u1FF8": "\u1FF9",
    "\u1FFA": "\u1FFB",
    # Dialytika + varia
    "\u1FD2": "\u1FD3",
    "\u1FE2": "\u1FE3",
}

# Translation table form of the mapping above, built once for str.translate.
_GRAVE_TO_ACUTE_TABLE = str.maketrans(_GRAVE_TO_ACUTE)


def _normalize_greek(text: str) -> str:
    """Return *text* in the form used as a key into the lemma table.

    Steps, in order: decompose to NFD and drop combining macron (U+0304)
    and combining breve (U+0306); recompose to NFC; turn precomposed
    grave-accented Greek characters into their acute counterparts; turn any
    remaining combining grave (U+0300) into combining acute (U+0301);
    recompose to NFC once more. Empty input is returned unchanged.
    """
    if not text:
        return text

    # Vowel-length marks only exist as separate code points after NFD.
    quantity_marks = ("\u0304", "\u0306")
    decomposed = unicodedata.normalize("NFD", text)
    without_quantity = "".join(
        ch for ch in decomposed if ch not in quantity_marks
    )
    recomposed = unicodedata.normalize("NFC", without_quantity)

    # Precomposed graves go through the table; a combining grave that NFC
    # could not fold into a precomposed character is swapped directly.
    acute_only = recomposed.translate(_GRAVE_TO_ACUTE_TABLE)
    acute_only = acute_only.replace("\u0300", "\u0301")

    # The swap above may have produced sequences that compose further.
    return unicodedata.normalize("NFC", acute_only)


# ----- lookup_lemmatizer (loads from vocab.lookups) ----- #

# Register the custom attribute once; this module may be imported repeatedly.
if not Token.has_extension("predicted_lemma"):
    Token.set_extension("predicted_lemma", default=None)


@Language.component(name="lookup_lemmatizer")
def lookup_lemmatizer(doc: Doc) -> Doc:
    """Override token lemmas with entries from the embedded lookup table.

    Lookup-based lemmatizer for Ancient Greek. The table is described as a
    1.2M-entry dictionary built from CLTK Morpheus, UD treebanks, and
    Wiktionary. Token text is normalized (grave→acute, macron/breve
    stripped) before the query so running-text forms match citation entries.

    Intended to run after trainable_lemmatizer: the lemma already on each
    token is saved to ``token._.predicted_lemma`` and is replaced only when
    the table has a match, so unseen forms keep the earlier lemma.

    In this packaging-ready version the table named ``lemma_lookup`` is read
    from ``doc.vocab.lookups`` rather than from a filesystem path. If that
    table is absent the doc is returned untouched.
    """
    vocab_lookups = doc.vocab.lookups
    if not vocab_lookups.has_table("lemma_lookup"):
        return doc
    lemma_table = vocab_lookups.get_table("lemma_lookup")

    for token in doc:
        # Keep the lemma assigned by earlier components before any override.
        token._.predicted_lemma = token.lemma_

        surface = token.text
        # Punctuation keeps its existing lemma. The `in` test against
        # string.punctuation is a substring test on that constant.
        if token.pos_ == "PUNCT" or surface in string.punctuation:
            continue

        key = _normalize_greek(surface)
        if key in lemma_table:
            # Exact match on the normalized form.
            token.lemma_ = lemma_table[key]
        elif key and key[0].isupper():
            # Capitalized form: retry with the lowercased key.
            lowered = key.lower()
            if lowered in lemma_table:
                token.lemma_ = lemma_table[lowered]

    return doc