Ethosoft

Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper

edec8b7 about 1 month ago

3.52 kB

	"""Core type definitions for NedoTurkishTokenizer.

	Defines the Token dataclass, SegmentationCandidate for the candidate-based
	segmentation engine, token type constants, and punctuation character sets.
	"""

	from __future__ import annotations

	from dataclasses import dataclass, field
	from typing import Any


	# ── Token type constants ──────────────────────────────────────────────────────

	# Lexical token types: word pieces, foreign words, and punctuation.
	ROOT = "ROOT"
	SUFFIX = "SUFFIX"
	FOREIGN = "FOREIGN"
	PUNCT = "PUNCT"

	# Special token types: each tag below is a member of SPECIAL_TYPES.
	NUM = "NUM"
	DATE = "DATE"
	UNIT = "UNIT"
	URL = "URL"
	MENTION = "MENTION"
	HASHTAG = "HASHTAG"
	EMOJI = "EMOJI"
	ACRONYM = "ACRONYM"

	# Special token types that represent non-textual entities.
	SPECIAL_TYPES: frozenset[str] = frozenset(
	    (NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM)
	)

	# All recognized token types: the four lexical tags plus every special tag.
	ALL_TYPES: frozenset[str] = SPECIAL_TYPES | frozenset(
	    (ROOT, SUFFIX, FOREIGN, PUNCT)
	)

	# ── Punctuation character set ────────────────────────────────────────────────

	PUNCT_CHARS: frozenset[str] = frozenset(
	    # ASCII punctuation and symbols plus the en dash and em dash.  The
	    # backslash is spelled as the valid escape "\\" and is followed by a
	    # plain "|"; the previous spelling relied on the invalid escape "\|".
	    "'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~"
	    # Typographic single/double quotes, prime, and angle quotation marks.
	    "\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
	    # Bullet, ellipsis, middle dot, degree, plus-minus, multiply, divide.
	    "\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
	)

	# Digits — used alongside PUNCT_CHARS for pure-punctuation detection
	_DIGITS: frozenset[str] = frozenset("0123456789")


	def is_punct_token(text: str) -> bool:
	    """Return True if *text* is made up only of non-word characters.

	    Leading and trailing whitespace is ignored.  Every remaining character
	    must satisfy at least one of:

	    * it is a member of ``PUNCT_CHARS``;
	    * it is an ASCII digit (a member of ``_DIGITS``);
	    * its code point is above U+02FF and ``str.isalpha()`` is false for it.

	    An empty or whitespace-only string yields False.  Interior whitespace
	    also yields False, because a space meets none of the conditions above.
	    """
	    core = text.strip()
	    if not core:
	        return False
	    for ch in core:
	        if ch in PUNCT_CHARS or ch in _DIGITS:
	            continue
	        # Characters up to U+02FF outside the explicit sets, and alphabetic
	        # characters anywhere, disqualify the token.
	        if ord(ch) <= 0x02FF or ch.isalpha():
	            return False
	    return True


	# ── Token dataclass ──────────────────────────────────────────────────────────


	@dataclass
	class Token:
	    """Internal token representation.

	    ``text`` uses the leading-space convention: a space prefix indicates
	    that this token starts a new word. Suffixes within a word have
	    no leading space.

	    The ``metadata`` dict carries optional annotation fields (all prefixed
	    with ``_``), for example ``_caps``, ``_foreign``, ``_canonical``.
	    """

	    # Surface text of the token (see the leading-space convention above).
	    text: str
	    # Token type tag, e.g. one of the module-level type constants.
	    token_type: str
	    # Defaults to 0; presumably the token's position within its word's
	    # morpheme sequence — TODO confirm against the segmentation engine.
	    morph_pos: int = 0
	    # Optional annotations; default_factory gives each instance its own dict.
	    metadata: dict[str, Any] = field(default_factory=dict)

	    def to_dict(self) -> dict[str, Any]:
	        """Convert to the public API dict format.

	        The result is a new dict holding ``"token"``, ``"token_type"`` and
	        ``"morph_pos"`` followed by every metadata entry.  Metadata is merged
	        last, so a metadata key equal to one of the three core keys replaces
	        that value.
	        """
	        payload: dict[str, Any] = {
	            "token": self.text,
	            "token_type": self.token_type,
	            "morph_pos": self.morph_pos,
	        }
	        payload.update(self.metadata)
	        return payload


	# ── Segmentation candidate ───────────────────────────────────────────────────


	@dataclass
	class SegmentationCandidate:
	    """One possible way to segment a word into tokens.

	    The candidate-generation engine produces multiple candidates per word,
	    then the selection step picks the highest-scoring one.

	    ``source`` is a short human-readable tag describing the strategy that
	    produced this candidate (e.g. ``"tdk_root"``, ``"suffix_chain"``,
	    ``"foreign"``).
	    """

	    # Tokens that make up this segmentation of the word.
	    tokens: list[Token]
	    # Score used by the selection step; the highest-scoring candidate wins.
	    score: float
	    # Tag naming the strategy that generated the candidate.
	    source: str