linguis_trie.py · Remeinium/WWHO at main

WWHO / linguis_trie.py

WWHO

e59ea28 about 2 months ago

11.1 kB

	"""
	==========================================
	Table-Driven DFA Tokenizer
	==========================================
	"""

	from __future__ import annotations

	import json
	import os
	from dataclasses import dataclass, field
	from typing import Optional


	# ---------------------------------------------------------------------------
	# Schema loading and validation
	# ---------------------------------------------------------------------------

	class SchemaError(ValueError):
	    """Signal that a language schema JSON document is malformed or incomplete.

	    Derives from ``ValueError``, so handlers written for ``ValueError`` also
	    catch it.
	    """

	@dataclass
	class LanguageSchema:
	    """In-memory description of one script's character classes and DFA."""

	    # Schema name.
	    language: str
	    # Free-form grammar description carried along with the schema.
	    grammar_notation: str
	    # Inclusive (low, high) codepoint pairs.
	    unicode_blocks: list[tuple[int, int]]
	    # Character-class label -> set of member codepoints.
	    char_classes: dict[str, set[int]]
	    # State name -> {character-class label -> next state name, or None}.
	    transitions: dict[str, dict[str, Optional[str]]]
	    # Name of the state every scan starts from.
	    start_state: str
	    # States in which the text consumed so far is a valid token end.
	    accept_states: set[str]
	    # States that end the token immediately when reached.
	    emit_states: set[str]

	    def get_regex(self) -> str:
	        """Return a regex matching runs of characters known to this schema.

	        The pattern has the form ``[...]+`` where the bracket holds every
	        codepoint found in any character class. Codepoints are de-duplicated
	        and emitted in ascending order, so the same schema always yields the
	        same pattern string regardless of set iteration order.

	        Returns:
	            The pattern string, or ``""`` when the schema defines no
	            codepoints at all.
	        """
	        codepoints: set[int] = set()
	        for members in self.char_classes.values():
	            codepoints.update(members)

	        if not codepoints:
	            return ""

	        # Characters with special meaning inside a bracket expression. "[" is
	        # included because an unescaped "[" inside a set is treated as a
	        # possible nested set by Python's ``re`` module.
	        needs_escape = {"-", "]", "[", "\\", "^"}

	        pieces = []
	        for codepoint in sorted(codepoints):
	            char = chr(codepoint)
	            pieces.append("\\" + char if char in needs_escape else char)

	        char_set = "".join(pieces)
	        return f"[{char_set}]+"


	class SchemaLoader:
	    """Read a language schema JSON file and build a ``LanguageSchema``."""

	    def load(self, path: str) -> LanguageSchema:
	        """Parse the schema file at *path*.

	        Codepoints and range bounds in the file are hexadecimal strings.
	        Character-class labels starting with ``_`` are skipped.

	        Args:
	            path: Location of the UTF-8 encoded JSON schema.

	        Returns:
	            The populated ``LanguageSchema``.

	        Raises:
	            SchemaError: If the ``char_classes`` or ``dfa`` key is absent.
	        """
	        with open(path, "r", encoding="utf-8") as handle:
	            document = json.load(handle)

	        language = document.get("language", "unknown")
	        grammar = document.get("grammar_notation", "")

	        # Both sections are mandatory; report the first one that is missing.
	        if "char_classes" not in document:
	            raise SchemaError(f"[{path}] Missing 'char_classes' key.")
	        if "dfa" not in document:
	            raise SchemaError(f"[{path}] Missing 'dfa' key.")

	        unicode_blocks = [
	            (int(bounds[0], 16), int(bounds[1], 16))
	            for bounds in document.get("unicode_blocks", [])
	        ]

	        char_classes: dict[str, set[int]] = {}
	        for label, spec in document["char_classes"].items():
	            if not label.startswith("_"):
	                members: set[int] = set()
	                # Ranges are inclusive on both ends.
	                for bounds in spec.get("ranges", []):
	                    first, last = int(bounds[0], 16), int(bounds[1], 16)
	                    members.update(range(first, last + 1))
	                for hex_code in spec.get("codepoints", []):
	                    members.add(int(hex_code, 16))
	                char_classes[label] = members

	        dfa_section = document["dfa"]

	        return LanguageSchema(
	            language=language,
	            grammar_notation=grammar,
	            unicode_blocks=unicode_blocks,
	            char_classes=char_classes,
	            transitions=dfa_section.get("transitions", {}),
	            start_state=dfa_section.get("start", "START"),
	            accept_states=set(dfa_section.get("accept_states", [])),
	            emit_states=set(dfa_section.get("emit_states", [])),
	        )


	# ---------------------------------------------------------------------------
	# Codepoint classifier
	# ---------------------------------------------------------------------------

	class CharClassifier:
	    """Look up the character-class label of a single character."""

	    def __init__(self, schema: LanguageSchema):
	        """Build the codepoint -> label table from ``schema.char_classes``.

	        When a codepoint appears in more than one class, the class that is
	        iterated first keeps it.
	        """
	        lookup: dict[int, str] = {}
	        for label, members in schema.char_classes.items():
	            for codepoint in members:
	                # setdefault leaves an earlier assignment untouched.
	                lookup.setdefault(codepoint, label)
	        self._table = lookup

	    def classify(self, ch: str) -> str:
	        """Return the label for *ch*, or ``"O"`` when it is in no class."""
	        codepoint = ord(ch)
	        if codepoint in self._table:
	            return self._table[codepoint]
	        return "O"


	# ---------------------------------------------------------------------------
	# DFA Tokenizer
	# ---------------------------------------------------------------------------

	class LinguisTrie:
	    """Table-driven DFA tokenizer configured by a ``LanguageSchema``."""

	    def __init__(self, schema: LanguageSchema):
	        """Keep the schema and unpack the pieces the scanner needs."""
	        self._schema = schema
	        self._classifier = CharClassifier(schema)
	        self._transitions = schema.transitions
	        self._start = schema.start_state
	        self._accept = schema.accept_states
	        self._emit = schema.emit_states

	    def tokenize(self, text: str, leading_space: bool = False) -> list[str]:
	        """Split *text* into tokens by running the DFA from each position.

	        At every position the scanner follows transitions for as long as one
	        exists and emits the longest prefix that ended in an accept state
	        (or that reached an emit state). Characters with no transition from
	        the start state become single-character tokens. If a scan never
	        reaches an accept or emit state, only its first character is emitted.

	        Args:
	            text: Input string.
	            leading_space: When true, runs of space/tab/newline/carriage
	                return are handled outside the DFA: a run ending in a space
	                donates that space as a prefix of the next token, and every
	                other whitespace character becomes its own token. A text
	                that starts with a non-whitespace character also gets a
	                space prefixed to its first token.

	        Returns:
	            The list of tokens in input order.
	        """
	        whitespace = (" ", "\t", "\n", "\r")
	        out: list[str] = []
	        length = len(text)
	        cursor = 0

	        # Space waiting to be glued onto the front of the next token.
	        prefix = ""
	        if leading_space and text and text[0] not in whitespace:
	            prefix = " "

	        while cursor < length:
	            current = text[cursor]

	            # --- Whitespace handling (leading-space mode) ---
	            if leading_space and current in whitespace:
	                run_start = cursor
	                while cursor < length and text[cursor] in whitespace:
	                    cursor += 1
	                run = text[run_start:cursor]

	                if run.endswith(" "):
	                    # The final space is held back for the next token.
	                    out.extend(run[:-1])
	                    prefix = " "
	                else:
	                    out.extend(run)
	                    prefix = ""
	                continue

	            # --- DFA syllable recognition ---
	            label = self._classifier.classify(current)
	            first_state = self._transitions.get(self._start, {}).get(label)

	            # No transition, or an immediate emit state: single-char token.
	            if first_state is None or first_state in self._emit:
	                out.append(prefix + current)
	                prefix = ""
	                cursor += 1
	                continue

	            origin = cursor
	            state = first_state
	            cursor += 1
	            accepted_end = cursor if state in self._accept else -1

	            while cursor < length:
	                next_label = self._classifier.classify(text[cursor])
	                following = self._transitions.get(state, {}).get(next_label)
	                if following is None:
	                    break

	                state = following
	                cursor += 1

	                in_accept = state in self._accept
	                if in_accept or state in self._emit:
	                    accepted_end = cursor
	                    if not in_accept:
	                        # Emit states close the token right away.
	                        break

	            # Fallback: with no accepted prefix, emit only the first character.
	            stop = accepted_end if accepted_end > origin else origin + 1

	            out.append(prefix + text[origin:stop])
	            prefix = ""
	            cursor = stop

	        # A trailing held-back space becomes a token of its own.
	        if prefix:
	            out.append(prefix)

	        return out

	    # ------------------------------------------------------------------
	    # Helpers
	    # ------------------------------------------------------------------

	    @property
	    def language(self) -> str:
	        """Language name taken from the schema."""
	        return self._schema.language

	    @property
	    def unicode_blocks(self) -> list[tuple[int, int]]:
	        """Unicode block ranges taken from the schema."""
	        return self._schema.unicode_blocks

	    @property
	    def regex(self) -> str:
	        """Character-class regex produced by the schema's ``get_regex()``."""
	        return self._schema.get_regex()

	    @property
	    def grammar(self) -> str:
	        """Grammar notation string taken from the schema."""
	        return self._schema.grammar_notation


	# ---------------------------------------------------------------------------
	# Factory
	# ---------------------------------------------------------------------------

	# Directory searched for schema files: the "schemas" folder next to this module.
	_SCHEMA_DIR = os.path.join(os.path.dirname(__file__), "schemas")

	# Single loader instance shared by every build_linguis_trie() call.
	_schema_loader = SchemaLoader()
	# Tokenizers already built, keyed by the schema path they were loaded from.
	_dfa_cache: dict[str, LinguisTrie] = {}


	def build_linguis_trie(schema_path: str) -> LinguisTrie:
	    """Return the tokenizer for *schema_path*, building it on first request.

	    Results are memoised in the module-level cache keyed by the path string,
	    so repeated calls with the same path return the same object.
	    """
	    try:
	        return _dfa_cache[schema_path]
	    except KeyError:
	        pass

	    # Cache miss: load the schema and remember the tokenizer built from it.
	    trie = LinguisTrie(_schema_loader.load(schema_path))
	    _dfa_cache[schema_path] = trie
	    return trie


	def load_dfa_map(script_mode: str) -> dict[str, LinguisTrie]:
	    """Build tokenizers for the ``*.json`` schemas in the schema directory.

	    Args:
	        script_mode: ``"mixed"`` or ``"all"`` selects every schema; any other
	            value selects only the schema whose language equals it.

	    Returns:
	        A mapping from language name to its tokenizer. Schemas that fail to
	        load are reported with a printed warning and skipped.
	    """
	    import glob

	    dfa_map: dict[str, LinguisTrie] = {}
	    pattern = os.path.join(_SCHEMA_DIR, "*.json")

	    # glob returns matches in arbitrary order; sorting makes the result
	    # reproducible when two files declare the same language (the later
	    # path wins).
	    for schema_file in sorted(glob.glob(pattern)):
	        try:
	            trie = build_linguis_trie(schema_file)
	        except Exception as e:
	            # Best effort: one broken schema must not block the others.
	            print(f"Warning: Failed to load schema {schema_file}: {e}")
	            continue

	        if script_mode in ("mixed", "all") or script_mode == trie.language:
	            dfa_map[trie.language] = trie

	    return dfa_map


	# ---------------------------------------------------------------------------
	# Self-test
	# ---------------------------------------------------------------------------

	if __name__ == "__main__":
	    import sys

	    def _report(dfa, samples):
	        """Tokenize each sample in leading-space mode and print the result."""
	        for sample in samples:
	            toks = dfa.tokenize(sample, leading_space=True)
	            print(f" Input : {sample}")
	            print(f" Syllables: {toks}")
	            print(f" Count : {len(toks)}")
	            print("-" * 65)

	    banner = "=" * 65
	    print(banner)
	    print("DFA Tokenizer — self-test")
	    print(banner)

	    # Load every schema once; each script section below is optional.
	    dfas = load_dfa_map("all")

	    # --- Sinhala ---
	    sinhala_dfa = dfas.get("sinhala")
	    if sinhala_dfa:
	        print(f"\n[Sinhala DFA] grammar: {sinhala_dfa.grammar}\n")
	        _report(
	            sinhala_dfa,
	            [
	                "ශ්‍රී ලංකා ද්වීපයේ ස්වෛරීභාවය සහ ත්‍රිවිධ හමුදාව.",
	                "භාෂාවේ ප්‍රෞඪත්වය විදහාපායි",
	                "ආචාර්යවරයාගේ වෛද්‍ය විද්‍යා පර්යේෂණය සාර්ථකයි.",
	                "චන්ද්‍රයාගේ ආලෝකය පෘථිවියට ක්ෂණිකව ලැබේ.",
	                "මම ක්‍ෂණිකව ගඟට පැන්නා",
	                "සඤ්ඤක ක්ෂමතාවය ක්‍රමය සහ ඥානය",
	                "ද්වී ත්වේ ලං කඃ",
	                "2026 වසරේ AI තාක්ෂණය 60% දියුණුයි!",
	            ],
	        )

	    # --- Devanagari ---
	    deva_dfa = dfas.get("devanagari")
	    if deva_dfa:
	        print(f"\n[Devanagari DFA] grammar: {deva_dfa.grammar}\n")
	        _report(
	            deva_dfa,
	            [
	                "नमस्ते",
	                "भारत",
	                "हिन्दी",
	                "संस्कृत",
	                "क़िला",
	                "ज़िंदगी",
	                "प्रेम",
	                "द्वारा",
	                "श्रीमान्",
	                "हिन्दुस्तान",
	                "नमस्कार दुनिया",
	                "मैं ठीक हूँ",
	                "विद्यालय में पढ़ाई होती है।",
	            ],
	        )

	    print("\nAll self-tests complete.")
	    sys.exit(0)