Ethosoft

Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper

edec8b7 about 1 month ago

6.22 kB

	"""Tokenization engine — orchestrates the full pipeline.

	This is the central pipeline that ties together all modules:
	1. Text normalization (Unicode, whitespace)
	2. ALL CAPS detection and lowercasing
	3. Special span extraction (URLs, numbers, dates, acronyms, emojis)
	4. Word-level segmentation with candidate generation/selection
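	5. Post-annotation (number/unit reclassification safety net, allomorph
	   labels, compound info, acronym expansion)
	6. ``morph_pos`` finalization across the full token stream
	7. Removal of internal leading spaces from token text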
	"""

	from __future__ import annotations

	from ._domain_vocab import ALL_DOMAIN_ROOTS
	from .morphology import annotate_acronyms, annotate_canonical, annotate_compounds
	from .normalization import detect_all_caps, normalize_text
	from .resources import load_tdk_words
	from .segmentation import segment_word, split_into_words
	from .special_spans import find_special_spans, make_special_tokens, reclassify_numbers_in_tokens


	class TokenizationEngine:
	    """Pipeline driver that turns raw text into a list of token dicts.

	    The TDK word list and the domain root vocabulary are loaded once in
	    ``__init__``; afterwards ``tokenize`` only reads them, so the engine
	    keeps no per-call state.

	    This is an internal component, not the public API: callers should go
	    through ``NedoTurkishTokenizer``, which delegates to this engine.
	    """

	    def __init__(self) -> None:
	        """Load the vocabularies used by word segmentation."""
	        self._tdk: set[str] = load_tdk_words()
	        self._domain_roots: frozenset[str] = ALL_DOMAIN_ROOTS

	    def tokenize(self, text: str) -> list[dict[str, object]]:
	        """Tokenize ``text`` by running every pipeline stage in order.

	        Empty or whitespace-only input short-circuits to an empty list
	        before any stage runs.

	        Returns:
	            A list of token dicts; each carries at least ``token``,
	            ``token_type`` and ``morph_pos``.
	        """
	        if not (text and text.strip()):
	            return []

	        # 1. Normalize, then 2. detect ALL CAPS words.
	        prepared, caps_set = detect_all_caps(normalize_text(text))

	        stream: list[dict[str, object]] = []
	        cursor = 0  # index in ``prepared`` of the first unconsumed character

	        # 3. Special spans are emitted as-is; 4. the plain text between
	        # them goes through word-level segmentation.
	        for start, end, span_type, original in find_special_spans(prepared):
	            if cursor < start:
	                stream.extend(
	                    self._tokenize_gap(prepared[cursor:start], caps_set)
	                )
	            stream.extend(make_special_tokens(span_type, original))
	            cursor = end

	        # Plain text that follows the last special span.
	        if cursor < len(prepared):
	            stream.extend(self._tokenize_gap(prepared[cursor:], caps_set))

	        # 5. Post-annotation passes, applied in this fixed order.
	        for annotate in (
	            reclassify_numbers_in_tokens,
	            annotate_canonical,
	            annotate_compounds,
	            annotate_acronyms,
	        ):
	            stream = annotate(stream)

	        # 6. Finalize morph_pos while tokens still carry their internal
	        # leading spaces, then 7. strip those spaces for the public result.
	        return _strip_token_text(_compute_morph_pos(stream))

	    def _tokenize_gap(
	        self, segment: str, caps_set: frozenset[str]
	    ) -> list[dict[str, object]]:
	        """Tokenize ``segment`` unless it is blank, in which case return []."""
	        if not segment.strip():
	            return []
	        return self._tokenize_segment(segment, caps_set)

	    def _tokenize_segment(
	        self, segment: str, caps_set: frozenset[str]
	    ) -> list[dict[str, object]]:
	        """Segment each word of a plain-text segment (no special spans).

	        The segment is split with ``split_into_words`` and each word is
	        passed to ``segment_word``; the per-word results are concatenated
	        in word order.
	        """
	        return [
	            piece
	            for word in split_into_words(segment)
	            for piece in segment_word(
	                word, self._tdk, self._domain_roots, caps_set
	            )
	        ]


	# ── Helper: compute morph_pos across the full token stream ───────────────────

	def _compute_morph_pos(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
	"""Recompute ``morph_pos`` consistently across the token stream.

	Rules:
	- Word-initial tokens (leading space, special types, PUNCT) → morph_pos = 0
	- SUFFIX tokens increment the position counter
	- Apostrophe suffixes continue from the previous word
	"""
	result: list[dict[str, object]] = []
	word_pos = 0

	for tok in tokens:
	raw = str(tok["token"])
	token_type = str(tok["token_type"])

	is_word_start = raw.startswith(" ") or raw.strip().startswith("<")

	# Apostrophe suffixes continue the previous word
	if tok.get("_apo_suffix"):
	is_word_start = False

	if is_word_start or token_type in (
	"NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM", "PUNCT"
	):
	word_pos = 0
	morph_pos = 0
	elif token_type == "SUFFIX":
	word_pos += 1
	morph_pos = word_pos
	else:
	# ROOT or FOREIGN within a word (shouldn't normally happen)
	word_pos = 0
	morph_pos = 0

	result.append({**tok, "morph_pos": morph_pos})

	return result


	def _strip_token_text(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
	"""Remove internal leading whitespace from all token text strings.

	During pipeline processing, a leading space in ``token`` signals
	a word-initial token. Once ``morph_pos`` has been computed, this
	space is no longer needed and must be stripped so the public API
	returns clean text.
	"""
	return [{**tok, "token": str(tok["token"]).lstrip()} for tok in tokens]