Spaces:

Kalana001
/

SinCode

Running

SinCode / core /dictionary.py

Refactor to core/ package: softmax MLM normalization, ambiguity handling, context-aware English detection (37/40 = 92.5%)

9906dbd 3 days ago

raw

history blame contribute delete

2.8 kB

	"""
	Dictionary adapter for retrieving Sinhala transliteration candidates.
	"""

	from typing import Dict, List, Set

	from core.constants import MAX_CANDIDATES
	from core.english import ENGLISH_VOCAB
	from core.scorer import CandidateScorer
	from core.transliterate import rule_based_transliterate


	class DictionaryAdapter:
	    """Retrieves transliteration candidates from the Sinhala dictionary.

	    Wraps a mapping from Romanized keys to lists of candidate strings and
	    adds an English-vocabulary check plus a two-part compound-word fallback.
	    """

	    # Number of leading entries taken from each half when combining the two
	    # parts of a compound word in the subword fallback.
	    SUBWORD_TOP_K = 3

	    def __init__(self, dictionary_dict: Dict[str, List[str]]):
	        """Keep a reference to ``dictionary_dict`` (the mapping is not copied)."""
	        self.dictionary = dictionary_dict

	    def get_candidates(self, word: str, rule_output: str = "") -> List[str]:
	        """
	        Return candidate transliterations for a Romanized word.

	        Priority:
	          1. English corpus match  -> keep the original word
	          2. Dictionary lookup     -> exact key, otherwise lowercase key
	          3. Subword decomposition -> only when 1 & 2 yield nothing

	        The result is de-duplicated, preserving first-seen order.

	        When ``rule_output`` is non-empty and more than MAX_CANDIDATES
	        candidates remain, candidates found in ENGLISH_VOCAB are placed
	        first and the rest are stably sorted by Levenshtein distance to
	        ``rule_output``. This ranking is applied to direct matches and to
	        subword-fallback results alike. The list is only reordered here,
	        never truncated.

	        Args:
	            word: Romanized input token.
	            rule_output: Output of the rule engine for ``word``; an empty
	                string disables ranking.

	        Returns:
	            A list of candidate strings, empty when nothing matched.
	        """
	        word_lower = word.lower()
	        direct: List[str] = []

	        # 1. English corpus check: the word itself is a valid candidate.
	        if word_lower in ENGLISH_VOCAB:
	            direct.append(word)

	        # 2. Sinhala dictionary check: exact key wins over the lowercase key.
	        if word in self.dictionary:
	            direct.extend(self.dictionary[word])
	        elif word_lower in self.dictionary:
	            direct.extend(self.dictionary[word_lower])

	        # 3. Compound fallback runs only when steps 1 and 2 found nothing.
	        found = direct if direct else self._subword_candidates(word)

	        # dict.fromkeys drops duplicates while keeping first-seen order.
	        return self._rank(list(dict.fromkeys(found)), rule_output)

	    def _lookup(self, key: str):
	        """Return the entry for ``key``, else for ``key.lower()``; falsy if neither has one."""
	        return self.dictionary.get(key) or self.dictionary.get(key.lower())

	    def _subword_candidates(self, word: str) -> List[str]:
	        """Build candidates by splitting ``word`` in two and joining dictionary hits."""
	        top_k = self.SUBWORD_TOP_K
	        combos: List[str] = []
	        # Split points keep both halves at least two characters long; the
	        # range is empty for words of three characters or fewer.
	        for cut in range(2, len(word) - 1):
	            heads = self._lookup(word[:cut])
	            tails = self._lookup(word[cut:])
	            if heads and tails:
	                combos.extend(
	                    head + tail
	                    for head in heads[:top_k]
	                    for tail in tails[:top_k]
	                )
	        return combos

	    @staticmethod
	    def _rank(cands: List[str], rule_output: str) -> List[str]:
	        """Order ``cands`` by closeness to ``rule_output`` when there are too many."""
	        if not rule_output or len(cands) <= MAX_CANDIDATES:
	            return cands

	        english: List[str] = []
	        sinhala: List[str] = []
	        for cand in cands:
	            if cand.lower() in ENGLISH_VOCAB:
	                english.append(cand)
	            else:
	                sinhala.append(cand)

	        # list.sort is stable, so equally distant candidates keep their order.
	        sinhala.sort(key=lambda c: CandidateScorer.levenshtein(c, rule_output))
	        return english + sinhala

	    @staticmethod
	    def get_rule_output(word: str) -> str:
	        """Generate Sinhala output via the phonetic rule engine.

	        Delegates directly to ``rule_based_transliterate``.
	        """
	        return rule_based_transliterate(word)