File size: 7,508 Bytes
9906dbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
Static mapping tables for the SinCode engine.

Includes common-word overrides, context-dependent overrides,
and phonetic mapping tables (consonants, vowels, modifiers).
"""

from typing import Dict, List

# ─── Common Word Overrides ──────────────────────────────────────────────────
# High-frequency Singlish words whose romanisation is ambiguous (long vs.
# short vowel, retroflex vs. dental, etc.).  When a word appears here the
# decoder uses the override directly, bypassing MLM/fidelity scoring.
# Only add words that are *unambiguous* — i.e. one dominant Sinhala form
# in colloquial written chat.  Context-dependent words (e.g. "eka") should
# NOT be listed so that MLM can resolve them.

COMMON_WORDS: Dict[str, str] = {
    # --- Pronouns and particles ---
    "oya": "ඔයා",  # you
    "oyaa": "ඔයා",  # you (long-vowel spelling)
    "eya": "ඒයා",  # he / she
    "eyaa": "ඒයා",  # he / she (long-vowel spelling)
    "api": "අපි",  # we
    "mama": "මම",  # I
    "mage": "මගේ",  # my
    "oyage": "ඔයාගේ",  # your
    # --- Frequent verbs, past tense ---
    "awa": "ආවා",  # came
    "aawa": "ආවා",  # came (long-vowel spelling)
    "giya": "ගියා",  # went
    "kala": "කළා",  # did
    "kiwa": "කිව්වා",  # said
    "kiwwa": "කිව්වා",  # said (doubled-w spelling)
    "yewwa": "යැව්වා",  # sent
    "gawa": "ගැව්වා",  # hit
    "katha": "කතා",  # talked / story
    # --- Time words ---
    "heta": "හෙට",  # tomorrow
    "ada": "අද",  # today
    "iye": "ඊයේ",  # yesterday
    # --- Frequent adverbs and particles ---
    "one": "ඕනෙ",  # need / want
    "oney": "ඕනේ",  # need / want (spelled with trailing y)
    "naa": "නෑ",  # no (long form)
    "na": "නෑ",  # no
    "hari": "හරි",  # ok / right
    "wage": "වගේ",  # like
    "nisa": "නිසා",  # because
    "inne": "ඉන්නෙ",  # being / staying (colloquial)
    "inna": "ඉන්න",  # stay (imperative)
    "kalin": "කලින්",  # before / earlier
    "madi": "මදි",  # insufficient / not enough
    # --- Frequent verb endings ---
    "giye": "ගියේ",  # went (emphatic)
    "una": "උනා",  # became / happened
    "wuna": "උනා",  # became (alternative spelling)
    # --- Locations and miscellaneous ---
    "gedaradi": "ගෙදරදී",  # at home
    "gedara": "ගෙදර",  # home
    # --- Adjectives and other frequent words ---
    "honda": "හොඳ",  # good
    "ape": "අපේ",  # our
    "me": "මේ",  # this
    "passe": "පස්සෙ",  # after / later
    "ba": "බෑ",  # can't
    "bari": "බැරි",  # impossible
    "bri": "බැරි",  # can't (abbreviated)
    "danne": "දන්නෙ",  # know
    "wada": "වැඩ",  # work (noun)
    "epa": "එපා",  # don't
    # --- Ad-hoc chat abbreviations ---
    "mn": "මං",  # "man": informal first-person "I"
    "mta": "මට",  # short for "mata"
    "oyta": "ඔයාට",  # short for "oyata"
    "oyata": "ඔයාට",  # to you
    "krnna": "කරන්න",  # short for "karanna"
    "blnna": "බලන්න",  # short for "balanna"
    "on": "ඕනෙ",  # short for "one"
    # --- Frequent -nawa verbs and "let's ..." forms ---
    "thiyanawa": "තියෙනවා",  # is / has
    "wenawa": "වෙනවා",  # becomes
    "enawa": "එනවා",  # comes
    "yanawa": "යනවා",  # goes
    "hithenawa": "හිතෙනවා",  # thinks / feels
    "penenawa": "පේනවා",  # appears / visible
    "karamu": "කරමු",  # let's do
    "balamu": "බලමු",  # let's see
    "damu": "දාමු",  # let's put
    "yamu": "යමු",  # let's go
    # --- Short English abbreviations (keys are lowercase for lookup) ---
    "pr": "PR",
    "dm": "DM",
    "ai": "AI",
    "it": "IT",
    "qa": "QA",
    "ui": "UI",
    "ok": "OK",
    # --- Ad-hoc chat abbreviations, continued ---
    "ek": "එක",  # short for "eka"
    "ekta": "එකට",  # "ekata": to that one
    "ekat": "ඒකට",  # that-thing + to (standalone form)
    "eke": "එකේ",  # of that one
    "hta": "හෙට",  # short for "heta"
    "damma": "දැම්මා",  # put / posted
    "gannako": "ගන්නකෝ",  # take (imperative, long ō)
    # --- Extra entries added for accuracy ---
    "gena": "ගැන",  # about
    "mata": "මට",  # to me
    "laga": "ළඟ",  # near
    "poth": "පොත",  # book
    "iwara": "ඉවර",  # finished
    "karanna": "කරන්න",  # to do
    "hadamu": "හදමු",  # let's make
    "kiyawala": "කියවලා",  # having read
    "baya": "බය",  # fear / scared
}

# Words whose Sinhala form depends on context.  The forms listed here apply
# ONLY when the preceding word is NOT English.  After an English noun (e.g.
# "assignment eka") the scorer resolves "eka" to එක on its own, whereas a
# standalone "eka" maps to ඒක.
CONTEXT_WORDS_STANDALONE: Dict[str, str] = {
    "eka": "ඒක",  # "that thing" when standing alone
    "ekak": "එකක්",  # "one of" (quantifier); same form either way
}


# ─── Phonetic Mapping Tables ────────────────────────────────────────────────
# Singlish Romanized → Sinhala Unicode
# Tables are ordered longest-pattern-first so greedy replacement works.

# Romanised consonant patterns.  These tables are intended for in-order
# greedy replacement, so a pattern must never be preceded by one of its own
# proper prefixes: "nndh" has to come before "nnd" (just as "th" comes before
# "t"), otherwise the shorter pattern is consumed first and the longer one
# can never match as a whole.
# NOTE(review): "th" appears twice (second row and fourth row) with the same
# output; the later entry looks redundant — confirm before removing it.
CONSONANTS: List[str] = [
    "nndh", "nnd", "nng",
    "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
    "GN", "KN", "Lu", "kh", "Th", "Dh",
    "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
    "\\y",
    "Y", "y", "j", "l", "v", "w", "s", "h",
    "N", "L", "K", "G", "P", "B", "f", "g", "r",
]

# Sinhala output for each pattern above: CONSONANTS_UNI[i] is the replacement
# for CONSONANTS[i], so both lists must be reordered together.
# The entries for "\\y" and "Y" start with a zero-width joiner, written as an
# explicit \u200D escape so the invisible character cannot be lost in editing.
CONSONANTS_UNI: List[str] = [
    "ඳ", "ඬ", "ඟ",
    "ත", "ධ", "ඝ", "ච", "ඵ", "භ", "ඣ", "ෂ",
    "ඥ", "ඤ", "ළු", "ඛ", "ඨ", "ඪ",
    "ශ", "ද", "ච", "ත", "ට", "ක", "ඩ", "න", "ප", "බ", "ම",
    "\u200Dය",
    "\u200Dය", "ය", "ජ", "ල", "ව", "ව", "ස", "හ",
    "ණ", "ළ", "ඛ", "ඝ", "ඵ", "ඹ", "ෆ", "ග", "ර",
]

# Romanised vowel patterns.  Raw strings keep the literal backslash visible:
# r"o\)" is the three characters o, backslash, ")".
VOWELS: List[str] = [
    # Multi-character patterns.
    "oo", r"o\)", "oe",
    "aa", r"a\)",
    "Aa", r"A\)", "ae",
    "ii", r"i\)", "ie", "ee",
    "ea", r"e\)", "ei",
    "uu", r"u\)",
    "au",
    # Backslash-prefixed "a", listed ahead of the plain single letters.
    r"\a",
    # Single-letter patterns.
    "a", "A", "i", "e", "u", "o", "I",
]

# Independent Sinhala vowel letters, one per vowel pattern: the list has the
# same 26 positions as VOWELS and is laid out in the same order.
VOWELS_UNI: List[str] = [
    # Outputs for the multi-character patterns.
    "ඌ", "ඕ", "ඕ",
    "ආ", "ආ",
    "ඈ", "ඈ", "ඈ",
    "ඊ", "ඊ", "ඊ", "ඊ",
    "ඒ", "ඒ", "ඒ",
    "ඌ", "ඌ",
    "ඖ",
    # Output for the backslash-prefixed "a".
    "ඇ",
    # Outputs for the single-letter patterns.
    "අ", "ඇ", "ඉ", "එ", "උ", "ඔ", "ඓ",
]

# Dependent vowel signs, one per vowel pattern: the list has the same 26
# positions as VOWELS and VOWELS_UNI.  The slot for plain "a" is the empty
# string, i.e. no sign is emitted for it.
VOWEL_MODIFIERS_UNI: List[str] = [
    # Signs for the multi-character patterns.
    "ූ", "ෝ", "ෝ",
    "ා", "ා",
    "ෑ", "ෑ", "ෑ",
    "ී", "ී", "ී", "ී",
    "ේ", "ේ", "ේ",
    "ූ", "ූ",
    "ෞ",
    # Sign for the backslash-prefixed "a".
    "ැ",
    # Signs for the single-letter patterns (plain "a" has none).
    "", "ැ", "ි", "ෙ", "ු", "ො", "ෛ",
]

# Special consonant patterns.  Most are backslash-prefixed; the escaped
# r"\R" is listed before the bare "R" that it contains.
SPECIAL_CONSONANTS: List[str] = [
    r"\n",
    r"\h",
    r"\N",
    r"\R",
    "R",
    r"\r",
]
# Sinhala output for each special pattern, in the same order.  The last two
# entries are identical and end with an explicit zero-width joiner (U+200D).
SPECIAL_CONSONANTS_UNI: List[str] = [
    "ං",
    "ඃ",
    "ඞ",
    "ඍ",
    "ර්\u200D",
    "ර්\u200D",
]

# Special "ru" patterns; the longer "ruu" is listed before its prefix "ru".
SPECIAL_CHARS: List[str] = [
    "ruu",
    "ru",
]
# Sinhala signs for the patterns above, in the same order.
SPECIAL_CHARS_UNI: List[str] = [
    "ෲ",
    "ෘ",
]

# Number of vowel patterns.  Derived from VOWELS instead of being hard-coded
# (it was the literal 26) so the count cannot drift out of sync when the
# vowel tables are edited.
N_VOWELS: int = len(VOWELS)