Spaces:

thomascerniglia
/

DialectAnalysis

Sleeping

App Files Files Community

DialectAnalysis / dialect_analysis /normalization.py

thomascerniglia

Upload 8 files

d0326ea verified about 1 month ago

raw

history blame contribute delete

1.86 kB

	from __future__ import annotations

	import re
	import unicodedata
	from typing import List

	# A small punctuation set that commonly appears in Greek texts.
	_EXTRA_PUNCT = "··;;—–…«»‹›“”‘’" # ano teleia, Greek question mark, dashes, quotes


	def strip_greek_diacritics(text: str) -> str:
	    """Return *text* without diacritics, spelling iota subscript as a full iota.

	    The input is decomposed (NFD) so every mark becomes its own code point:

	    - combining GREEK YPOGEGRAMMENI (U+0345) is replaced by the letter 'ι';
	    - every other combining mark (accents, breathings, etc.) is dropped;
	    - all remaining characters pass through unchanged.

	    The result is recomposed to NFC before it is returned.
	    """

	    kept = (
	        "ι" if code_point == "\u0345" else code_point
	        for code_point in unicodedata.normalize("NFD", text)
	        # U+0345 is itself a combining mark, so it must be let through
	        # explicitly before the generic "drop combining marks" filter.
	        if code_point == "\u0345" or not unicodedata.combining(code_point)
	    )
	    return unicodedata.normalize("NFC", "".join(kept))


	def sigma_normalize(token: str) -> str:
	    """Rewrite every final sigma in *token* as a medial sigma for matching."""

	    final_to_medial = str.maketrans("ς", "σ")
	    return token.translate(final_to_medial)


	def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
	    """Normalize input Greek text.

	    - Lowercase
	    - Compose to NFC, so an accented letter is one code point where possible
	    - Remove punctuation (combining marks stay attached to their letter)
	    - Collapse runs of whitespace to single spaces and trim the ends
	    - Optionally strip diacritics

	    Keep diacritics by default so feature extraction can detect iota-subscript
	    endings like -ᾳ.

	    Args:
	        text: Raw input text, in composed or decomposed Unicode form.
	        strip_diacritics: When true, pass the cleaned text through
	            ``strip_greek_diacritics`` before returning it.

	    Returns:
	        The lowercased, punctuation-free, single-spaced text.
	    """

	    # Compose after lowercasing. Without this, decomposed input (letter +
	    # combining accent) loses its accents to the punctuation filter below,
	    # because ``\w`` does not match combining marks, and every accented word
	    # is split in two.
	    cleaned = unicodedata.normalize("NFC", text.lower())

	    # Replace tabs/newlines with spaces.
	    cleaned = cleaned.translate(str.maketrans({"\n": " ", "\t": " "}))
	    cleaned = cleaned.translate(str.maketrans({ch: " " for ch in _EXTRA_PUNCT}))

	    def _blank_unless_mark(match: "re.Match[str]") -> str:
	        # Some letter+mark combinations have no precomposed form and survive
	        # NFC as separate code points; keep those marks (category M*) instead
	        # of turning them into word-splitting spaces.
	        ch = match.group()
	        return ch if unicodedata.category(ch).startswith("M") else " "

	    # Remove remaining punctuation/symbols while keeping word chars and spaces.
	    cleaned = re.sub(r"[^\w\s]", _blank_unless_mark, cleaned, flags=re.UNICODE)
	    cleaned = re.sub(r"\s+", " ", cleaned).strip()

	    if strip_diacritics:
	        cleaned = strip_greek_diacritics(cleaned)

	    return cleaned