"""
Dictionary adapter for retrieving Sinhala transliteration candidates.
"""

from typing import Dict, List, Set

from core.constants import MAX_CANDIDATES
from core.english import ENGLISH_VOCAB
from core.scorer import CandidateScorer
from core.transliterate import rule_based_transliterate


class DictionaryAdapter:
    """Retrieve transliteration candidates from the Sinhala dictionary.

    Wraps a mapping from Romanized keys to lists of candidate strings and
    layers an English-vocabulary check and a two-part subword fallback on
    top of plain dictionary lookup.
    """

    # Maximum number of entries taken from each half when combining subword
    # matches; bounds the cross product to 3 x 3 strings per split point.
    _SUBWORD_FANOUT = 3

    def __init__(self, dictionary_dict: Dict[str, List[str]]):
        """Store ``dictionary_dict``; it is kept by reference, not copied."""
        self.dictionary = dictionary_dict

    def get_candidates(self, word: str, rule_output: str = "") -> List[str]:
        """Return candidate transliterations for a Romanized word.

        Priority:
            1. English corpus match: if ``word.lower()`` is in
               ``ENGLISH_VOCAB`` the original ``word`` (original casing) is
               the first candidate.
            2. Dictionary lookup: exact key, falling back to the lowercase
               key when the exact key is missing or has no entries.
            3. Subword decomposition: only when 1 and 2 yield nothing.

        Candidates from steps 1 and 2 are de-duplicated preserving order.
        When ``rule_output`` is non-empty and there are more than
        ``MAX_CANDIDATES`` of them, candidates whose lowercase form is in
        ``ENGLISH_VOCAB`` stay first (in their existing order) and the
        remainder is ordered ascending by
        ``CandidateScorer.levenshtein(candidate, rule_output)``.

        This method only reorders; it never truncates the list, so any cut
        to ``MAX_CANDIDATES`` has to be applied by the caller.

        Args:
            word: Romanized input token.
            rule_output: Rule-engine output used as the ranking reference;
                an empty string disables ranking.

        Returns:
            A list of unique candidates, empty when nothing matches.
        """
        cands: List[str] = []

        if word.lower() in ENGLISH_VOCAB:
            cands.append(word)
        cands.extend(self._lookup(word))

        if not cands:
            # Nothing matched the whole word: try splitting it in two.
            return self._subword_candidates(word)

        # dict.fromkeys drops duplicates while keeping first-seen order.
        cands = list(dict.fromkeys(cands))

        if rule_output and len(cands) > MAX_CANDIDATES:
            english: List[str] = []
            others: List[str] = []
            for cand in cands:
                bucket = english if cand.lower() in ENGLISH_VOCAB else others
                bucket.append(cand)
            # list.sort is stable, so equally scored entries keep their
            # dictionary order.
            others.sort(
                key=lambda cand: CandidateScorer.levenshtein(cand, rule_output)
            )
            cands = english + others

        return cands

    def _lookup(self, key: str) -> List[str]:
        """Return dictionary entries for ``key``: exact case, then lowercase.

        A missing key, an empty entry list or a ``None`` value all fall
        through to the next option; the result is ``[]`` when nothing
        matches.
        """
        return (
            self.dictionary.get(key)
            or self.dictionary.get(key.lower())
            or []
        )

    def _subword_candidates(self, word: str) -> List[str]:
        """Build candidates by splitting ``word`` into two dictionary words.

        Every split point that leaves at least two characters on each side
        is tried, so words shorter than four characters produce nothing.
        For each split where both halves have entries, the first
        ``_SUBWORD_FANOUT`` entries of each half are concatenated pairwise.
        The result is de-duplicated preserving order.
        """
        fanout = self._SUBWORD_FANOUT
        combos: List[str] = []
        for split in range(2, len(word) - 1):
            heads = self._lookup(word[:split])
            if not heads:
                continue
            tails = self._lookup(word[split:])
            combos.extend(
                head + tail
                for head in heads[:fanout]
                for tail in tails[:fanout]
            )
        return list(dict.fromkeys(combos))

    @staticmethod
    def get_rule_output(word: str) -> str:
        """Return the result of ``rule_based_transliterate(word)``."""
        return rule_based_transliterate(word)