# grc_dep_web_sm / package_functions.py
# Author: diyclassics
# Initial release: grc_dep_web_sm v3.8.0 (commit 425e8a0, verified)
# Packaging-ready functions for grc_dep_web_sm
#
# This file is included in the wheel via `spacy package --code`.
# It loads the lookup table from vocab.lookups (embedded in the model)
# instead of from a filesystem path.
from __future__ import annotations
import string
import unicodedata
from typing import Dict
from spacy.language import Language
from spacy.tokens import Doc, Token
# ----- Greek accent normalization (inline, no external dependency) ----- #
# Grave (VARIA) → Acute (OXIA) mapping for polytonic Greek.
# Keys/values are precomposed NFC codepoints from the Greek Extended block
# (U+1F00–U+1FFF); each grave-accented letter maps to its acute twin so
# running-text forms (grave before another word) match citation forms
# (acute) in the lemma lookup table.
_GRAVE_TO_ACUTE: Dict[str, str] = {
# Alpha with breathing
"\u1F02": "\u1F04", "\u1F03": "\u1F05",
"\u1F0A": "\u1F0C", "\u1F0B": "\u1F0D",
# Epsilon
"\u1F12": "\u1F14", "\u1F13": "\u1F15",
"\u1F1A": "\u1F1C", "\u1F1B": "\u1F1D",
# Eta
"\u1F22": "\u1F24", "\u1F23": "\u1F25",
"\u1F2A": "\u1F2C", "\u1F2B": "\u1F2D",
# Iota
"\u1F32": "\u1F34", "\u1F33": "\u1F35",
"\u1F3A": "\u1F3C", "\u1F3B": "\u1F3D",
# Omicron
"\u1F42": "\u1F44", "\u1F43": "\u1F45",
"\u1F4A": "\u1F4C", "\u1F4B": "\u1F4D",
# Upsilon
"\u1F52": "\u1F54", "\u1F53": "\u1F55", "\u1F5B": "\u1F5D",
# Omega
"\u1F62": "\u1F64", "\u1F63": "\u1F65",
"\u1F6A": "\u1F6C", "\u1F6B": "\u1F6D",
# Simple vowel + varia
"\u1F70": "\u1F71", "\u1F72": "\u1F73", "\u1F74": "\u1F75",
"\u1F76": "\u1F77", "\u1F78": "\u1F79", "\u1F7A": "\u1F7B",
"\u1F7C": "\u1F7D",
# With iota subscript
"\u1F82": "\u1F84", "\u1F83": "\u1F85",
"\u1F8A": "\u1F8C", "\u1F8B": "\u1F8D",
"\u1F92": "\u1F94", "\u1F93": "\u1F95",
"\u1F9A": "\u1F9C", "\u1F9B": "\u1F9D",
"\u1FA2": "\u1FA4", "\u1FA3": "\u1FA5",
"\u1FAA": "\u1FAC", "\u1FAB": "\u1FAD",
"\u1FB2": "\u1FB4", "\u1FC2": "\u1FC4", "\u1FF2": "\u1FF4",
# Capitals
"\u1FBA": "\u1FBB", "\u1FC8": "\u1FC9", "\u1FCA": "\u1FCB",
"\u1FDA": "\u1FDB", "\u1FEA": "\u1FEB", "\u1FF8": "\u1FF9",
"\u1FFA": "\u1FFB",
# Dialytika + varia
"\u1FD2": "\u1FD3", "\u1FE2": "\u1FE3",
}
# Precompiled translation table for str.translate — one C-level pass
# instead of dozens of chained .replace() calls in _normalize_greek.
_GRAVE_TO_ACUTE_TABLE = str.maketrans(_GRAVE_TO_ACUTE)
def _normalize_greek(text: str) -> str:
    """Canonicalize polytonic Greek text for dictionary lookup.

    Converts grave accents (varia) to acute (oxia), removes combining
    macron (U+0304) and breve (U+0306) marks, and returns NFC-composed
    output so running-text forms match citation-form dictionary keys.
    """
    if not text:
        return text
    # Decompose, drop vowel-length marks (macron/breve), recompose.
    without_length_marks = "".join(
        ch
        for ch in unicodedata.normalize("NFD", text)
        if ch not in ("\u0304", "\u0306")
    )
    normalized = unicodedata.normalize("NFC", without_length_marks)
    # Precomposed grave-accented letters -> their acute counterparts.
    normalized = normalized.translate(_GRAVE_TO_ACUTE_TABLE)
    # Any stray combining grave -> combining acute (no-op when absent),
    # then a final recomposition pass.
    return unicodedata.normalize("NFC", normalized.replace("\u0300", "\u0301"))
# ----- lookup_lemmatizer (loads from vocab.lookups) ----- #
# Register the custom attribute that stores the trainable lemmatizer's
# prediction before the lookup override. The has_extension guard makes
# registration idempotent (spaCy raises if an extension is set twice,
# e.g. when this module is imported more than once).
if not Token.has_extension("predicted_lemma"):
    Token.set_extension("predicted_lemma", default=None)
@Language.component(name="lookup_lemmatizer")
def lookup_lemmatizer(doc: Doc) -> Doc:
    """Dictionary-backed lemmatizer component for Ancient Greek.

    Overrides ``token.lemma_`` with an entry from the ``lemma_lookup``
    table (built from CLTK Morpheus, UD treebanks, and Wiktionary) when
    one exists; otherwise the trainable lemmatizer's output is kept.
    Query forms are normalized (grave→acute, macron/breve stripped) so
    running-text spellings match citation entries. The original
    prediction is saved on ``token._.predicted_lemma`` either way.

    This packaging-ready version reads the table from the model's
    ``vocab.lookups`` (embedded in vocab/lookups.bin), not from disk.
    """
    lookups = doc.vocab.lookups
    if not lookups.has_table("lemma_lookup"):
        # Model shipped without the table: leave all lemmas untouched.
        return doc
    lemma_table = lookups.get_table("lemma_lookup")
    for tok in doc:
        # Keep the trainable lemmatizer's output before any override.
        tok._.predicted_lemma = tok.lemma_
        # Punctuation never gets a dictionary lemma.
        if tok.pos_ == "PUNCT" or tok.text in string.punctuation:
            continue
        # Normalize the surface form for lookup.
        query = _normalize_greek(tok.text)
        if query in lemma_table:
            # Exact match wins.
            tok.lemma_ = lemma_table[query]
        elif query and query[0].isupper():
            # Capitalized (e.g. sentence-initial) word: retry lowercased.
            folded = query.lower()
            if folded in lemma_table:
                tok.lemma_ = lemma_table[folded]
    return doc