File size: 2,803 Bytes
9906dbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Dictionary adapter for retrieving Sinhala transliteration candidates.
"""

from typing import Dict, List, Set

from core.constants import MAX_CANDIDATES
from core.english import ENGLISH_VOCAB
from core.scorer import CandidateScorer
from core.transliterate import rule_based_transliterate


class DictionaryAdapter:
    """Retrieves transliteration candidates from the Sinhala dictionary.

    Wraps a mapping from Romanized keys to lists of candidate strings and
    layers an English-vocabulary check and a two-part subword fallback on
    top of plain lookups.
    """

    # How many leading entries of each half are combined when composing
    # subword candidates (keeps the cross product at most 3 x 3 per split).
    _SUBWORD_FANOUT = 3

    def __init__(self, dictionary_dict: Dict[str, List[str]]):
        """Store the lookup table; it is referenced, not copied."""
        self.dictionary = dictionary_dict

    def get_candidates(self, word: str, rule_output: str = "") -> List[str]:
        """
        Return candidate transliterations for a Romanized word.

        Priority:
            1. English corpus match  → keep original word
            2. Dictionary lookup     → exact key, else lowercase key
            3. Subword decomposition → only when 1 & 2 yield nothing

        Args:
            word: The Romanized input token.
            rule_output: Optional reference string used only for ordering.
                When it is non-empty and steps 1-2 produce more than
                MAX_CANDIDATES entries, the non-English candidates are
                sorted ascending by
                ``CandidateScorer.levenshtein(candidate, rule_output)``;
                English candidates stay in front in their original order.

        Returns:
            A de-duplicated list in first-seen order (re-ordered as
            described above when ranking applies), or an empty list when
            nothing matches.

        Note:
            This method never truncates the list itself. The ranking is
            presumably meant for a caller that trims to MAX_CANDIDATES —
            confirm against the caller.
        """
        found: List[str] = []

        # 1. English corpus check (case-insensitive); the word is kept as typed.
        if word.lower() in ENGLISH_VOCAB:
            found.append(word)

        # 2. Sinhala dictionary check. An empty exact-case entry no longer
        #    hides a populated lowercase entry.
        found.extend(self._lookup(word))

        if found:
            # dict.fromkeys de-duplicates while preserving first-seen order.
            found = list(dict.fromkeys(found))
            if rule_output and len(found) > MAX_CANDIDATES:
                found = self._rank_by_rule_output(found, rule_output)
            return found

        # 3. Subword fallback (compound words).
        return self._subword_candidates(word)

    def _lookup(self, key: str) -> List[str]:
        """Return entries for *key*, trying it as given and then lowercased."""
        return self.dictionary.get(key) or self.dictionary.get(key.lower()) or []

    @staticmethod
    def _rank_by_rule_output(cands: List[str], rule_output: str) -> List[str]:
        """Keep English entries first; sort the rest by distance to *rule_output*."""
        english: List[str] = []
        sinhala: List[str] = []
        for cand in cands:
            bucket = english if cand.lower() in ENGLISH_VOCAB else sinhala
            bucket.append(cand)
        # list.sort is stable, so equally distant candidates keep their order.
        sinhala.sort(key=lambda c: CandidateScorer.levenshtein(c, rule_output))
        return english + sinhala

    def _subword_candidates(self, word: str) -> List[str]:
        """Compose candidates from every two-part split found in the dictionary."""
        fanout = self._SUBWORD_FANOUT
        combined: List[str] = []
        # Split points keep both halves at least two characters long; the
        # range is empty for words of three characters or fewer.
        for cut in range(2, len(word) - 1):
            heads = self._lookup(word[:cut])
            tails = self._lookup(word[cut:])
            # Either side being empty contributes nothing.
            combined.extend(
                head + tail for head in heads[:fanout] for tail in tails[:fanout]
            )
        return list(dict.fromkeys(combined))

    @staticmethod
    def get_rule_output(word: str) -> str:
        """Generate Sinhala output via the phonetic rule engine."""
        return rule_based_transliterate(word)