# SinCode / core/mappings.py
# Kalana's picture
# Refactor to core/ package: softmax MLM normalization, ambiguity handling, context-aware English detection (37/40 = 92.5%)
# 9906dbd
"""
Static mapping tables for the SinCode engine.
Includes common-word overrides, context-dependent overrides,
and phonetic mapping tables (consonants, vowels, modifiers).
"""
from typing import Dict, List
# ─── Common Word Overrides ──────────────────────────────────────────────────
# Direct Singlish → Sinhala overrides for high-frequency chat words whose
# romanisation is ambiguous (long vs. short vowel, retroflex vs. dental, ...).
# A word found here is emitted as-is by the decoder, bypassing MLM/fidelity
# scoring.
#
# Rules for adding entries:
#   * Only list words with one dominant Sinhala form in colloquial written
#     chat, i.e. words that are *unambiguous*.
#   * Do NOT list context-dependent words (e.g. "eka"); those are left for
#     the MLM to resolve.
#   * Keys are lowercase.
COMMON_WORDS: Dict[str, str] = {
    # -- Pronouns & particles ------------------------------------------------
    "oya": "ඔයා",  # you
    "oyaa": "ඔයා",
    "eya": "ඒයා",  # he / she
    "eyaa": "ඒයා",
    "api": "අපි",  # we
    "mama": "මම",  # I
    "mage": "මගේ",  # my
    "oyage": "ඔයාගේ",  # your

    # -- Common verbs (past tense) -------------------------------------------
    "awa": "ආවා",  # came
    "aawa": "ආවා",
    "giya": "ගියා",  # went
    "kala": "කළා",  # did
    "kiwa": "කිව්වා",  # said
    "kiwwa": "කිව්වා",
    "yewwa": "යැව්වා",  # sent
    "gawa": "ගැව්වා",  # hit
    "katha": "කතා",  # talked / story

    # -- Time ----------------------------------------------------------------
    "heta": "හෙට",  # tomorrow
    "ada": "අද",  # today
    "iye": "ඊයේ",  # yesterday

    # -- Common adverbs / particles ------------------------------------------
    "one": "ඕනෙ",  # need / want
    "oney": "ඕනේ",
    "naa": "නෑ",  # no (long form)
    "na": "නෑ",  # no
    "hari": "හරි",  # ok / right
    "wage": "වගේ",  # like
    "nisa": "නිසා",  # because
    "inne": "ඉන්නෙ",  # being / staying (colloquial)
    "inna": "ඉන්න",  # stay (imperative)
    "kalin": "කලින්",  # before / earlier
    "madi": "මදි",  # insufficient / not enough

    # -- Common verb endings -------------------------------------------------
    "giye": "ගියේ",  # went (emphatic)
    "una": "උනා",  # became / happened
    "wuna": "උනා",  # became (alternative spelling)

    # -- Locations / misc ----------------------------------------------------
    "gedaradi": "ගෙදරදී",  # at home
    "gedara": "ගෙදර",  # home

    # -- Common adjectives / other -------------------------------------------
    "honda": "හොඳ",  # good
    "ape": "අපේ",  # our
    "me": "මේ",  # this
    "passe": "පස්සෙ",  # after / later
    "ba": "බෑ",  # can't
    "bari": "බැරි",  # impossible
    "bri": "බැරි",  # can't (abbreviated)
    "danne": "දන්නෙ",  # know
    "wada": "වැඩ",  # work (noun)
    "epa": "එපා",  # don't

    # -- Ad-hoc abbreviations ------------------------------------------------
    "mn": "මං",  # man (I, informal first person)
    "mta": "මට",  # short for "mata"
    "oyta": "ඔයාට",  # short for "oyata"
    "oyata": "ඔයාට",  # to you
    "krnna": "කරන්න",  # short for "karanna"
    "blnna": "බලන්න",  # short for "balanna"
    "on": "ඕනෙ",  # short for "one"

    # -- Common -nawa verb endings -------------------------------------------
    "thiyanawa": "තියෙනවා",  # is / has
    "wenawa": "වෙනවා",  # becomes
    "enawa": "එනවා",  # comes
    "yanawa": "යනවා",  # goes
    "hithenawa": "හිතෙනවා",  # thinks / feels
    "penenawa": "පේනවා",  # appears / visible
    "karamu": "කරමු",  # let's do
    "balamu": "බලමු",  # let's see
    "damu": "දාමු",  # let's put
    "yamu": "යමු",  # let's go

    # -- Short English abbreviations (lowercase key → uppercase output) ------
    "pr": "PR",
    "dm": "DM",
    "ai": "AI",
    "it": "IT",
    "qa": "QA",
    "ui": "UI",
    "ok": "OK",

    # -- Ad-hoc abbreviations (continued) ------------------------------------
    "ek": "එක",  # short form of "eka"
    "ekta": "එකට",  # "ekata" = to that one
    "ekat": "ඒකට",  # that-thing + to (standalone form)
    "eke": "එකේ",  # of that one
    "hta": "හෙට",  # short for "heta"
    "damma": "දැම්මා",  # put / posted
    "gannako": "ගන්නකෝ",  # take (imperative, long ō)

    # -- Additional words for accuracy ---------------------------------------
    "gena": "ගැන",  # about
    "mata": "මට",  # to me
    "laga": "ළඟ",  # near
    "poth": "පොත",  # book
    "iwara": "ඉවර",  # finished
    "karanna": "කරන්න",  # to do
    "hadamu": "හදමු",  # let's make
    "kiyawala": "කියවලා",  # having read
    "baya": "බය",  # fear / scared
}
# Context-dependent overrides.
# The forms below apply ONLY when the preceding word is NOT English.
# Example: after an English noun ("assignment eka") the scorer resolves
# "eka" to එක on its own, whereas a standalone "eka" maps to ඒක.
CONTEXT_WORDS_STANDALONE: Dict[str, str] = {
    # "that thing" when used standalone
    "eka": "ඒක",
    # "one of" (quantifier); the same form is used either way
    "ekak": "එකක්",
}
# ─── Phonetic Mapping Tables ────────────────────────────────────────────────
# Singlish Romanized → Sinhala Unicode
#
# CONSONANTS and CONSONANTS_UNI are parallel tables: the pattern at index i
# is written as the glyph at index i, so the two lists must always be edited
# together and keep the same length.
#
# Patterns are ordered longest-first so greedy replacement works.  A pattern
# must never be listed after one of its own proper prefixes, otherwise the
# shorter pattern is consumed first and the longer one can no longer match.
CONSONANTS: List[str] = [
    # "nndh" is listed before its prefix "nnd" so it stays reachable.
    "nndh", "nnd", "nng",
    "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
    "GN", "KN", "Lu", "kh", "Th", "Dh",
    # NOTE: the "th" in this row repeats the earlier "th" entry (same
    # target glyph); it is kept so the table length is unchanged.
    "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
    "\\y",
    "Y", "y", "j", "l", "v", "w", "s", "h",
    "N", "L", "K", "G", "P", "B", "f", "g", "r",
]
CONSONANTS_UNI: List[str] = [
    # Same order as the first row of CONSONANTS: nndh, nnd, nng.
    "ඳ", "ඬ", "ඟ",
    "ත", "ධ", "ඝ", "ච", "ඵ", "භ", "ඣ", "ෂ",
    "ඥ", "ඤ", "ළු", "ඛ", "ඨ", "ඪ",
    "ශ", "ද", "ච", "ත", "ට", "ක", "ඩ", "න", "ප", "බ", "ම",
    # "\\y" and "Y" map to ZERO WIDTH JOINER + ය; the joiner is written as
    # an escape so the otherwise invisible character is visible in source.
    "\u200Dය",
    "\u200Dය", "ය", "ජ", "ල", "ව", "ව", "ස", "හ",
    "ණ", "ළ", "ඛ", "ඝ", "ඵ", "ඹ", "ෆ", "ග", "ර",
]
# Vowel tables.  VOWELS, VOWELS_UNI and VOWEL_MODIFIERS_UNI are parallel:
# index i of each list describes the same romanised vowel pattern.
#   VOWELS_UNI[i]          – the standalone vowel letter
#   VOWEL_MODIFIERS_UNI[i] – the vowel sign; it is the empty string for
#                            plain "a"
# Two-character (long / diphthong) patterns come first so that they are
# matched before the single-character patterns they contain.
VOWELS: List[str] = [
    # o-long, a-long, ae-long
    "oo", "o\\)", "oe", "aa", "a\\)", "Aa", "A\\)", "ae",
    # i-long, e-long
    "ii", "i\\)", "ie", "ee", "ea", "e\\)", "ei",
    # u-long, au
    "uu", "u\\)", "au",
    # short vowels
    "\\a", "a", "A", "i", "e", "u", "o", "I",
]
VOWELS_UNI: List[str] = [
    "ඌ", "ඕ", "ඕ", "ආ", "ආ", "ඈ", "ඈ", "ඈ",
    "ඊ", "ඊ", "ඊ", "ඊ", "ඒ", "ඒ", "ඒ",
    "ඌ", "ඌ", "ඖ",
    "ඇ", "අ", "ඇ", "ඉ", "එ", "උ", "ඔ", "ඓ",
]
VOWEL_MODIFIERS_UNI: List[str] = [
    "ූ", "ෝ", "ෝ", "ා", "ා", "ෑ", "ෑ", "ෑ",
    "ී", "ී", "ී", "ී", "ේ", "ේ", "ේ",
    "ූ", "ූ", "ෞ",
    "ැ", "", "ැ", "ි", "ෙ", "ු", "ො", "ෛ",
]

# Special consonant patterns and their targets (parallel lists).
# Both "R" and "\\r" map to ර + ් followed by an explicit ZERO WIDTH JOINER.
SPECIAL_CONSONANTS: List[str] = ["\\n", "\\h", "\\N", "\\R", "R", "\\r"]
SPECIAL_CONSONANTS_UNI: List[str] = ["ං", "ඃ", "ඞ", "ඍ", "ර්\u200D", "ර්\u200D"]

# Special "ru" patterns (parallel lists); the longer "ruu" is listed first.
SPECIAL_CHARS: List[str] = ["ruu", "ru"]
SPECIAL_CHARS_UNI: List[str] = ["ෲ", "ෘ"]

# Number of vowel patterns.  Derived from the table rather than hard-coded so
# it cannot drift out of sync when VOWELS is edited (currently 26).
N_VOWELS: int = len(VOWELS)