Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
edec8b7
"""Core type definitions for NedoTurkishTokenizer.
Defines the Token dataclass, SegmentationCandidate for the candidate-based
segmentation engine, token type constants, and punctuation character sets.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
# ── Token type constants ──────────────────────────────────────────────────────
ROOT = "ROOT"
SUFFIX = "SUFFIX"
FOREIGN = "FOREIGN"
PUNCT = "PUNCT"
NUM = "NUM"
DATE = "DATE"
UNIT = "UNIT"
URL = "URL"
MENTION = "MENTION"
HASHTAG = "HASHTAG"
EMOJI = "EMOJI"
ACRONYM = "ACRONYM"
# Special token types that represent non-textual entities
SPECIAL_TYPES: frozenset[str] = frozenset(
{NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM}
)
# All recognized token types
ALL_TYPES: frozenset[str] = frozenset(
{ROOT, SUFFIX, FOREIGN, PUNCT, NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM}
)
# ── Punctuation character set ────────────────────────────────────────────────
PUNCT_CHARS: frozenset[str] = frozenset(
"'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~"
"\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
"\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
)
# Digits β€” used alongside PUNCT_CHARS for pure-punctuation detection
_DIGITS: frozenset[str] = frozenset("0123456789")
def is_punct_token(text: str) -> bool:
"""Return True if *text* consists entirely of punctuation / digit characters."""
stripped = text.strip()
if not stripped:
return False
return all(
c in PUNCT_CHARS or c in _DIGITS or (ord(c) > 0x02FF and not c.isalpha())
for c in stripped
)
# ── Token dataclass ──────────────────────────────────────────────────────────
@dataclass
class Token:
"""Internal token representation.
*text* uses the leading-space convention: a space prefix indicates
that this token starts a new word. Suffixes within a word have
no leading space.
The *metadata* dict carries optional annotation fields (all prefixed
with ``_``), for example ``_caps``, ``_foreign``, ``_canonical``.
"""
text: str
token_type: str
morph_pos: int = 0
metadata: dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
"""Convert to the public API dict format."""
result: dict[str, Any] = {
"token": self.text,
"token_type": self.token_type,
"morph_pos": self.morph_pos,
}
result.update(self.metadata)
return result
# ── Segmentation candidate ───────────────────────────────────────────────────
@dataclass
class SegmentationCandidate:
"""One possible way to segment a word into tokens.
The candidate-generation engine produces multiple candidates per word,
then the selection step picks the highest-scoring one.
*source* is a short human-readable tag describing the strategy that
produced this candidate (e.g. ``"tdk_root"``, ``"suffix_chain"``,
``"foreign"``).
"""
tokens: list[Token]
score: float
source: str