Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
edec8b7 | """Core type definitions for NedoTurkishTokenizer. | |
| Defines the Token dataclass, SegmentationCandidate for the candidate-based | |
| segmentation engine, token type constants, and punctuation character sets. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
# ── Token type constants ──────────────────────────────────────────────────────
# Morphological token types.
ROOT = "ROOT"
SUFFIX = "SUFFIX"
FOREIGN = "FOREIGN"
PUNCT = "PUNCT"

# Token types for special, non-textual entities.
NUM = "NUM"
DATE = "DATE"
UNIT = "UNIT"
URL = "URL"
MENTION = "MENTION"
HASHTAG = "HASHTAG"
EMOJI = "EMOJI"
ACRONYM = "ACRONYM"

# Special token types that represent non-textual entities.
SPECIAL_TYPES: frozenset[str] = frozenset(
    (NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM)
)

# Every recognized token type: the four morphological types plus all
# special types.
ALL_TYPES: frozenset[str] = (
    frozenset((ROOT, SUFFIX, FOREIGN, PUNCT)) | SPECIAL_TYPES
)
# ── Punctuation character set ────────────────────────────────────────────────
PUNCT_CHARS: frozenset[str] = frozenset(
    # ASCII quotes, sentence punctuation, brackets, and symbols
    "'\"`?.,;:!-()[]{}/\\|@#$%^&*+=<>~"
    # En dash and em dash
    "\u2013\u2014"
    # Typographic quotes, prime, and guillemets
    "\u2018\u2019\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
    # Bullet, ellipsis, middle dot, degree, plus-minus, multiplication, division
    "\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
)

# Digits — used alongside PUNCT_CHARS for pure-punctuation detection
_DIGITS: frozenset[str] = frozenset("0123456789")
def is_punct_token(text: str) -> bool:
    """Report whether *text* is made up solely of punctuation / digit characters.

    Surrounding whitespace is ignored. An empty or whitespace-only string
    yields ``False``. Otherwise every remaining character must be a member
    of ``PUNCT_CHARS``, a member of ``_DIGITS``, or a non-alphabetic
    character whose code point lies above U+02FF.
    """
    core = text.strip()
    if not core:
        return False

    for ch in core:
        if ch in PUNCT_CHARS or ch in _DIGITS:
            continue
        # Characters beyond U+02FF are accepted as long as they are not letters.
        if ord(ch) > 0x02FF and not ch.isalpha():
            continue
        return False
    return True
# ── Token dataclass ──────────────────────────────────────────────────────────
@dataclass
class Token:
    """Internal token representation.

    *text* uses the leading-space convention: a space prefix indicates
    that this token starts a new word.  Suffixes within a word have
    no leading space.

    The *metadata* dict carries optional annotation fields (all prefixed
    with ``_``), for example ``_caps``, ``_foreign``, ``_canonical``.

    The ``@dataclass`` decorator is required: it generates ``__init__``
    and turns ``field(default_factory=dict)`` into a fresh dict per
    instance.  Without it the class cannot be constructed with arguments
    and ``metadata`` would be a bare ``Field`` object.
    """

    # Surface text, using the leading-space convention described above.
    text: str
    # Token type string (e.g. "ROOT", "SUFFIX").
    token_type: str
    # Defaults to 0 when not supplied.
    morph_pos: int = 0
    # Optional annotation fields; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert to the public API dict format.

        Returns:
            A new dict holding ``"token"``, ``"token_type"`` and
            ``"morph_pos"``, followed by every entry of *metadata*.
            Metadata entries are applied last, so a metadata key equal to
            one of the three base keys overrides it.
        """
        return {
            "token": self.text,
            "token_type": self.token_type,
            "morph_pos": self.morph_pos,
            **self.metadata,
        }
# ── Segmentation candidate ───────────────────────────────────────────────────
@dataclass
class SegmentationCandidate:
    """One possible way to segment a word into tokens.

    The candidate-generation engine produces multiple candidates per word,
    then the selection step picks the highest-scoring one.

    *source* is a short human-readable tag describing the strategy that
    produced this candidate (e.g. ``"tdk_root"``, ``"suffix_chain"``,
    ``"foreign"``).

    The ``@dataclass`` decorator is required so that the annotated fields
    below become constructor parameters; all three are mandatory.
    """

    # Tokens making up this segmentation.
    tokens: list[Token]
    # Score used to compare candidates for the same word.
    score: float
    # Tag naming the strategy that produced this candidate.
    source: str