# grc_dep_web_sm / package_functions.py
# Author: diyclassics
# Initial release: grc_dep_web_sm v3.8.0 (commit 425e8a0, verified)
# Packaging-ready functions for grc_dep_web_sm
#
# This file is included in the wheel via `spacy package --code`.
# It loads the lookup table from vocab.lookups (embedded in the model)
# instead of from a filesystem path.
from __future__ import annotations
import string
import unicodedata
from typing import Dict
from spacy.language import Language
from spacy.tokens import Doc, Token
# ----- Greek accent normalization (inline, no external dependency) ----- #
# Grave (VARIA) → Acute (OXIA) mapping for polytonic Greek.
# Keys/values are precomposed NFC codepoints from the Greek Extended block
# (U+1F00–U+1FFF); each grave-accented letter maps to its acute twin so
# running-text forms (grave before another word) match citation forms
# (acute) in the lemma lookup table.
_GRAVE_TO_ACUTE: Dict[str, str] = {
# Alpha with breathing
"\u1F02": "\u1F04", "\u1F03": "\u1F05",
"\u1F0A": "\u1F0C", "\u1F0B": "\u1F0D",
# Epsilon
"\u1F12": "\u1F14", "\u1F13": "\u1F15",
"\u1F1A": "\u1F1C", "\u1F1B": "\u1F1D",
# Eta
"\u1F22": "\u1F24", "\u1F23": "\u1F25",
"\u1F2A": "\u1F2C", "\u1F2B": "\u1F2D",
# Iota
"\u1F32": "\u1F34", "\u1F33": "\u1F35",
"\u1F3A": "\u1F3C", "\u1F3B": "\u1F3D",
# Omicron
"\u1F42": "\u1F44", "\u1F43": "\u1F45",
"\u1F4A": "\u1F4C", "\u1F4B": "\u1F4D",
# Upsilon
"\u1F52": "\u1F54", "\u1F53": "\u1F55", "\u1F5B": "\u1F5D",
# Omega
"\u1F62": "\u1F64", "\u1F63": "\u1F65",
"\u1F6A": "\u1F6C", "\u1F6B": "\u1F6D",
# Simple vowel + varia
"\u1F70": "\u1F71", "\u1F72": "\u1F73", "\u1F74": "\u1F75",
"\u1F76": "\u1F77", "\u1F78": "\u1F79", "\u1F7A": "\u1F7B",
"\u1F7C": "\u1F7D",
# With iota subscript
"\u1F82": "\u1F84", "\u1F83": "\u1F85",
"\u1F8A": "\u1F8C", "\u1F8B": "\u1F8D",
"\u1F92": "\u1F94", "\u1F93": "\u1F95",
"\u1F9A": "\u1F9C", "\u1F9B": "\u1F9D",
"\u1FA2": "\u1FA4", "\u1FA3": "\u1FA5",
"\u1FAA": "\u1FAC", "\u1FAB": "\u1FAD",
"\u1FB2": "\u1FB4", "\u1FC2": "\u1FC4", "\u1FF2": "\u1FF4",
# Capitals
"\u1FBA": "\u1FBB", "\u1FC8": "\u1FC9", "\u1FCA": "\u1FCB",
"\u1FDA": "\u1FDB", "\u1FEA": "\u1FEB", "\u1FF8": "\u1FF9",
"\u1FFA": "\u1FFB",
# Dialytika + varia
"\u1FD2": "\u1FD3", "\u1FE2": "\u1FE3",
}
# Precompiled translation table for str.translate — one C-level pass
# instead of dozens of chained .replace() calls in _normalize_greek.
_GRAVE_TO_ACUTE_TABLE = str.maketrans(_GRAVE_TO_ACUTE)
def _normalize_greek(text: str) -> str:
    """Canonicalize polytonic Greek text for dictionary lookup.

    Converts grave accents (varia) to acute (oxia), removes combining
    macron (U+0304) and breve (U+0306) marks, and returns NFC-composed
    output so running-text forms match citation-form dictionary keys.
    """
    if not text:
        return text
    # Decompose, drop vowel-length marks (macron/breve), recompose.
    without_length_marks = "".join(
        ch
        for ch in unicodedata.normalize("NFD", text)
        if ch not in ("\u0304", "\u0306")
    )
    normalized = unicodedata.normalize("NFC", without_length_marks)
    # Precomposed grave-accented letters -> their acute counterparts.
    normalized = normalized.translate(_GRAVE_TO_ACUTE_TABLE)
    # Any stray combining grave -> combining acute (no-op when absent),
    # then a final recomposition pass.
    return unicodedata.normalize("NFC", normalized.replace("\u0300", "\u0301"))
# ----- lookup_lemmatizer (loads from vocab.lookups) ----- #
# Register the custom attribute that stores the trainable lemmatizer's
# prediction before the lookup override. The has_extension guard makes
# registration idempotent (spaCy raises if an extension is set twice,
# e.g. when this module is imported more than once).
if not Token.has_extension("predicted_lemma"):
    Token.set_extension("predicted_lemma", default=None)
@Language.component(name="lookup_lemmatizer")
def lookup_lemmatizer(doc: Doc) -> Doc:
    """Dictionary-backed lemmatizer component for Ancient Greek.

    Overrides ``token.lemma_`` with an entry from the ``lemma_lookup``
    table (built from CLTK Morpheus, UD treebanks, and Wiktionary) when
    one exists; otherwise the trainable lemmatizer's output is kept.
    Query forms are normalized (grave→acute, macron/breve stripped) so
    running-text spellings match citation entries. The original
    prediction is saved on ``token._.predicted_lemma`` either way.

    This packaging-ready version reads the table from the model's
    ``vocab.lookups`` (embedded in vocab/lookups.bin), not from disk.
    """
    lookups = doc.vocab.lookups
    if not lookups.has_table("lemma_lookup"):
        # Model shipped without the table: leave all lemmas untouched.
        return doc
    lemma_table = lookups.get_table("lemma_lookup")
    for tok in doc:
        # Keep the trainable lemmatizer's output before any override.
        tok._.predicted_lemma = tok.lemma_
        # Punctuation never gets a dictionary lemma.
        if tok.pos_ == "PUNCT" or tok.text in string.punctuation:
            continue
        # Normalize the surface form for lookup.
        query = _normalize_greek(tok.text)
        if query in lemma_table:
            # Exact match wins.
            tok.lemma_ = lemma_table[query]
        elif query and query[0].isupper():
            # Capitalized (e.g. sentence-initial) word: retry lowercased.
            folded = query.lower()
            if folded in lemma_table:
                tok.lemma_ = lemma_table[folded]
    return doc