| | |
| | |
| | |
| | |
| | |
| |
|
| | from __future__ import annotations |
| |
|
| | import string |
| | import unicodedata |
| | from typing import Dict |
| |
|
| | from spacy.language import Language |
| | from spacy.tokens import Doc, Token |
| |
|
| |
|
| | |
| |
|
| | |
| | _GRAVE_TO_ACUTE: Dict[str, str] = { |
| | |
| | "\u1F02": "\u1F04", "\u1F03": "\u1F05", |
| | "\u1F0A": "\u1F0C", "\u1F0B": "\u1F0D", |
| | |
| | "\u1F12": "\u1F14", "\u1F13": "\u1F15", |
| | "\u1F1A": "\u1F1C", "\u1F1B": "\u1F1D", |
| | |
| | "\u1F22": "\u1F24", "\u1F23": "\u1F25", |
| | "\u1F2A": "\u1F2C", "\u1F2B": "\u1F2D", |
| | |
| | "\u1F32": "\u1F34", "\u1F33": "\u1F35", |
| | "\u1F3A": "\u1F3C", "\u1F3B": "\u1F3D", |
| | |
| | "\u1F42": "\u1F44", "\u1F43": "\u1F45", |
| | "\u1F4A": "\u1F4C", "\u1F4B": "\u1F4D", |
| | |
| | "\u1F52": "\u1F54", "\u1F53": "\u1F55", "\u1F5B": "\u1F5D", |
| | |
| | "\u1F62": "\u1F64", "\u1F63": "\u1F65", |
| | "\u1F6A": "\u1F6C", "\u1F6B": "\u1F6D", |
| | |
| | "\u1F70": "\u1F71", "\u1F72": "\u1F73", "\u1F74": "\u1F75", |
| | "\u1F76": "\u1F77", "\u1F78": "\u1F79", "\u1F7A": "\u1F7B", |
| | "\u1F7C": "\u1F7D", |
| | |
| | "\u1F82": "\u1F84", "\u1F83": "\u1F85", |
| | "\u1F8A": "\u1F8C", "\u1F8B": "\u1F8D", |
| | "\u1F92": "\u1F94", "\u1F93": "\u1F95", |
| | "\u1F9A": "\u1F9C", "\u1F9B": "\u1F9D", |
| | "\u1FA2": "\u1FA4", "\u1FA3": "\u1FA5", |
| | "\u1FAA": "\u1FAC", "\u1FAB": "\u1FAD", |
| | "\u1FB2": "\u1FB4", "\u1FC2": "\u1FC4", "\u1FF2": "\u1FF4", |
| | |
| | "\u1FBA": "\u1FBB", "\u1FC8": "\u1FC9", "\u1FCA": "\u1FCB", |
| | "\u1FDA": "\u1FDB", "\u1FEA": "\u1FEB", "\u1FF8": "\u1FF9", |
| | "\u1FFA": "\u1FFB", |
| | |
| | "\u1FD2": "\u1FD3", "\u1FE2": "\u1FE3", |
| | } |
| |
|
| | _GRAVE_TO_ACUTE_TABLE = str.maketrans(_GRAVE_TO_ACUTE) |
| |
|
| |
|
| | def _normalize_greek(text: str) -> str: |
| | """Normalize Greek text for lookup: grave→acute, strip macron/breve, NFC.""" |
| | if not text: |
| | return text |
| | |
| | decomposed = unicodedata.normalize("NFD", text) |
| | stripped = decomposed.replace("\u0304", "").replace("\u0306", "") |
| | result = unicodedata.normalize("NFC", stripped) |
| | |
| | result = result.translate(_GRAVE_TO_ACUTE_TABLE) |
| | |
| | if "\u0300" in result: |
| | result = result.replace("\u0300", "\u0301") |
| | |
| | return unicodedata.normalize("NFC", result) |
| |
|
| |
|
| | |
| |
|
# Register a per-token slot that preserves the trainable lemmatizer's
# output before the lookup component overrides Token.lemma_ (see
# lookup_lemmatizer below).  The has_extension guard makes registration
# idempotent, so re-importing this module does not raise ValueError.
if not Token.has_extension("predicted_lemma"):
    Token.set_extension("predicted_lemma", default=None)
| |
|
| |
|
@Language.component(name="lookup_lemmatizer")
def lookup_lemmatizer(doc: Doc) -> Doc:
    """Lookup-based lemmatizer for Ancient Greek.

    Assigns lemmas using a 1.2M-entry dictionary built from CLTK Morpheus,
    UD treebanks, and Wiktionary. Normalizes grave→acute accents at query
    time so running-text forms match citation entries.

    Runs after trainable_lemmatizer: overrides only when a lookup match
    exists, preserving the trainable model's output for unseen forms.

    In this packaging-ready version, lookups are loaded from the model's
    vocab.lookups (embedded in vocab/lookups.bin) rather than a filesystem path.
    """
    lookups = doc.vocab.lookups
    if not lookups.has_table("lemma_lookup"):
        # No embedded table: leave the trainable lemmas untouched.
        return doc
    table = lookups.get_table("lemma_lookup")

    # Hoisted set for O(1) membership.  BUGFIX: the previous check,
    # `token.text in string.punctuation`, was a *substring* test -- it
    # missed multi-character punctuation tokens such as "!!" or "..."
    # (and only skipped "()" by coincidence of adjacency).  We now skip a
    # token when every character is ASCII punctuation.
    punct_chars = set(string.punctuation)

    for token in doc:
        # Preserve the trainable lemmatizer's prediction for inspection.
        token._.predicted_lemma = token.lemma_

        # Skip punctuation: tagged PUNCT, or composed solely of ASCII
        # punctuation characters.
        if token.pos_ == "PUNCT" or all(ch in punct_chars for ch in token.text):
            continue

        normalized = _normalize_greek(token.text)

        # Exact normalized match wins.
        if normalized in table:
            token.lemma_ = table[normalized]
            continue

        # Fallback for sentence-initial capitalization: retry lowercased.
        if normalized and normalized[0].isupper():
            lowered = normalized.lower()
            if lowered in table:
                token.lemma_ = table[lowered]

    return doc
| |
|