Spaces:

Kalana001
/

SinCode

Running

SinCode / core /mappings.py

Refactor to core/ package: softmax MLM normalization, ambiguity handling, context-aware English detection (37/40 = 92.5%)

9906dbd 3 days ago

raw

history blame contribute delete

7.51 kB

	"""
	Static mapping tables for the SinCode engine.

	Includes common-word overrides, context-dependent overrides,
	and phonetic mapping tables (consonants, vowels, modifiers).
	"""

	from typing import Dict, List

	# ─── Common Word Overrides ──────────────────────────────────────────────────
	# High-frequency Singlish words whose romanisation is phonetically ambiguous
	# (long vs. short vowel, retroflex vs. dental, etc.). When a word appears
	# here the decoder uses the override directly, bypassing MLM/fidelity
	# scoring. Only add words whose intended Sinhala form is unambiguous, i.e.
	# there is one dominant form in colloquial written chat. Context-dependent
	# words (e.g. "eka") should NOT be listed, so that the MLM can resolve them.

	COMMON_WORDS: Dict[str, str] = {
	    # Romanised (lower-case) chat spelling -> fixed output string.
	    # Keys are grouped by theme; insertion order is kept as-is.

	    # -- Pronouns and possessives ------------------------------------------
	    "oya": "ඔයා",  # you
	    "oyaa": "ඔයා",  # you (long vowel typed out)
	    # NOTE(review): the usual spelling of this pronoun is එයා (short e);
	    # confirm that ඒයා is the intended output for "eya"/"eyaa".
	    "eya": "ඒයා",  # he / she
	    "eyaa": "ඒයා",  # he / she (long vowel typed out)
	    "api": "අපි",  # we
	    "mama": "මම",  # I
	    "mage": "මගේ",  # my
	    "oyage": "ඔයාගේ",  # your

	    # -- Past-tense verbs --------------------------------------------------
	    "awa": "ආවා",  # came
	    "aawa": "ආවා",  # came (long vowel typed out)
	    "giya": "ගියා",  # went
	    "kala": "කළා",  # did
	    "kiwa": "කිව්වා",  # said
	    "kiwwa": "කිව්වා",  # said (doubled w)
	    "yewwa": "යැව්වා",  # sent
	    "gawa": "ගැව්වා",  # hit
	    "katha": "කතා",  # talked / story

	    # -- Time words --------------------------------------------------------
	    "heta": "හෙට",  # tomorrow
	    "ada": "අද",  # today
	    "iye": "ඊයේ",  # yesterday

	    # -- Adverbs and particles ---------------------------------------------
	    "one": "ඕනෙ",  # need / want
	    "oney": "ඕනේ",  # need / want (long final vowel)
	    "naa": "නෑ",  # no (long form)
	    "na": "නෑ",  # no
	    "hari": "හරි",  # ok / right
	    "wage": "වගේ",  # like
	    "nisa": "නිසා",  # because
	    "inne": "ඉන්නෙ",  # being / staying (colloquial)
	    "inna": "ඉන්න",  # stay (imperative)
	    "kalin": "කලින්",  # before / earlier
	    "madi": "මදි",  # insufficient / not enough

	    # -- Verb endings ------------------------------------------------------
	    "giye": "ගියේ",  # went (emphatic)
	    "una": "උනා",  # became / happened
	    "wuna": "උනා",  # became (alternative spelling)

	    # -- Places ------------------------------------------------------------
	    "gedaradi": "ගෙදරදී",  # at home
	    "gedara": "ගෙදර",  # home

	    # -- Adjectives and miscellaneous --------------------------------------
	    "honda": "හොඳ",  # good
	    "ape": "අපේ",  # our
	    "me": "මේ",  # this
	    "passe": "පස්සෙ",  # after / later
	    "ba": "බෑ",  # can't
	    "bari": "බැරි",  # impossible
	    "bri": "බැරි",  # can't (abbreviated)
	    "danne": "දන්නෙ",  # know
	    "wada": "වැඩ",  # work (noun)
	    "epa": "එපා",  # don't

	    # -- Vowel-dropping chat abbreviations ---------------------------------
	    "mn": "මං",  # "man": informal first person
	    "mta": "මට",  # short for "mata"
	    "oyta": "ඔයාට",  # short for "oyata"
	    "oyata": "ඔයාට",  # to you
	    "krnna": "කරන්න",  # short for "karanna"
	    "blnna": "බලන්න",  # short for "balanna"
	    "on": "ඕනෙ",  # short for "one"

	    # -- "-nawa" / "-mu" verb forms ----------------------------------------
	    "thiyanawa": "තියෙනවා",  # is / has
	    "wenawa": "වෙනවා",  # becomes
	    "enawa": "එනවා",  # comes
	    "yanawa": "යනවා",  # goes
	    "hithenawa": "හිතෙනවා",  # thinks / feels
	    "penenawa": "පේනවා",  # appears / visible
	    "karamu": "කරමු",  # let's do
	    "balamu": "බලමු",  # let's see
	    "damu": "දාමු",  # let's put
	    "yamu": "යමු",  # let's go

	    # -- Short English abbreviations (lower-case key, upper-case output) ----
	    # NOTE(review): "it" and "ok" are also ordinary English words; confirm
	    # that forcing them to upper case is intended for every occurrence.
	    "pr": "PR",
	    "dm": "DM",
	    "ai": "AI",
	    "it": "IT",
	    "qa": "QA",
	    "ui": "UI",
	    "ok": "OK",

	    # -- More chat abbreviations -------------------------------------------
	    "ek": "එක",  # short form of "eka"
	    "ekta": "එකට",  # "ekata": to that one
	    "ekat": "ඒකට",  # that-thing + to (standalone form)
	    "eke": "එකේ",  # of that one
	    "hta": "හෙට",  # short for "heta"
	    "damma": "දැම්මා",  # put / posted
	    "gannako": "ගන්නකෝ",  # take (imperative, long final o)

	    # -- Extra entries added for accuracy ----------------------------------
	    "gena": "ගැන",  # about
	    "mata": "මට",  # to me
	    "laga": "ළඟ",  # near
	    # NOTE(review): "poth" usually romanises the plural පොත්; confirm that
	    # the singular පොත is the intended output.
	    "poth": "පොත",  # book
	    "iwara": "ඉවර",  # finished
	    "karanna": "කරන්න",  # to do
	    "hadamu": "හදමු",  # let's make
	    "kiyawala": "කියවලා",  # having read
	    "baya": "බය",  # fear / scared
	}

	# Context-dependent words: use this form ONLY when the previous word is
	# NOT English. When "eka" follows an English noun (e.g., "assignment eka")
	# the scorer resolves it to එක naturally; standalone "eka" maps to ඒක.
	CONTEXT_WORDS_STANDALONE: Dict[str, str] = {
	    # "that thing" when the word stands on its own
	    "eka": "ඒක",
	    # "one of" (quantifier); same form in either context
	    "ekak": "එකක්",
	}


	# ─── Phonetic Mapping Tables ────────────────────────────────────────────────
	# Singlish Romanized → Sinhala Unicode
	# Tables are ordered longest-pattern-first so greedy replacement works.

	# Romanised consonant patterns. CONSONANTS and CONSONANTS_UNI are parallel:
	# the pattern at index i is rendered with the Sinhala string at index i, so
	# any reordering must be applied to both lists together.
	#
	# A pattern that is a prefix of a longer pattern must come AFTER the longer
	# one (e.g. "nndh" before "nnd", "th" before "t"); otherwise the shorter
	# pattern shadows the longer one under in-order greedy replacement.
	CONSONANTS: List[str] = [
	    "nndh", "nnd", "nng",
	    "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
	    "GN", "KN", "Lu", "kh", "Th", "Dh",
	    # The second "th" below repeats the earlier entry (same ත output); it is
	    # kept so that the table length and indices stay unchanged.
	    "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
	    "\\y",
	    "Y", "y", "j", "l", "v", "w", "s", "h",
	    "N", "L", "K", "G", "P", "B", "f", "g", "r",
	]

	# Sinhala output for each entry of CONSONANTS (same index).
	CONSONANTS_UNI: List[str] = [
	    "ඳ", "ඬ", "ඟ",
	    "ත", "ධ", "ඝ", "ච", "ඵ", "භ", "ඣ", "ෂ",
	    "ඥ", "ඤ", "ළු", "ඛ", "ඨ", "ඪ",
	    "ශ", "ද", "ච", "ත", "ට", "ක", "ඩ", "න", "ප", "බ", "ම",
	    # "\y" and "Y" emit ZERO WIDTH JOINER + ය; the joiner is written as an
	    # explicit escape because it is invisible in most editors.
	    "\u200Dය",
	    "\u200Dය", "ය", "ජ", "ල", "ව", "ව", "ස", "හ",
	    "ණ", "ළ", "ඛ", "ඝ", "ඵ", "ඹ", "ෆ", "ග", "ර",
	]

	VOWELS: List[str] = [
	    # Multi-character patterns first, one spelling family per row.
	    "oo", "o\\)", "oe",
	    "aa", "a\\)",
	    "Aa", "A\\)", "ae",
	    "ii", "i\\)", "ie", "ee",
	    "ea", "e\\)", "ei",
	    "uu", "u\\)",
	    "au",
	    # Escaped "\a", then the single-letter vowels.
	    "\\a",
	    "a", "A", "i", "e", "u", "o", "I",
	]

	VOWELS_UNI: List[str] = [
	    # Independent vowel letters; 26 entries, listed in the same order as
	    # the 26 romanised patterns in VOWELS.
	    "ඌ", "ඕ", "ඕ",
	    "ආ", "ආ",
	    "ඈ", "ඈ", "ඈ",
	    "ඊ", "ඊ", "ඊ", "ඊ",
	    "ඒ", "ඒ", "ඒ",
	    "ඌ", "ඌ",
	    "ඖ",
	    "ඇ",
	    "අ", "ඇ", "ඉ", "එ", "උ", "ඔ", "ඓ",
	]

	VOWEL_MODIFIERS_UNI: List[str] = [
	    # Dependent vowel signs; 26 entries, listed in the same order as the
	    # 26 romanised patterns in VOWELS.
	    "ූ", "ෝ", "ෝ",
	    "ා", "ා",
	    "ෑ", "ෑ", "ෑ",
	    "ී", "ී", "ී", "ී",
	    "ේ", "ේ", "ේ",
	    "ූ", "ූ",
	    "ෞ",
	    "ැ",
	    # The slot for plain "a" is the empty string: no sign is emitted.
	    "",
	    "ැ", "ි", "ෙ", "ු", "ො", "ෛ",
	]

	# Escaped and special consonant patterns, paired by index with
	# SPECIAL_CONSONANTS_UNI.
	SPECIAL_CONSONANTS: List[str] = [
	    "\\n",
	    "\\h",
	    "\\N",
	    "\\R",
	    "R",
	    "\\r",
	]
	SPECIAL_CONSONANTS_UNI: List[str] = [
	    "ං",
	    "ඃ",
	    "ඞ",
	    "ඍ",
	    "ර්\u200D",  # for "R": ර් followed by ZERO WIDTH JOINER
	    "ර්\u200D",  # for "\r": same output as "R"
	]

	# "ru"-type patterns, longer spelling first, paired by index with
	# SPECIAL_CHARS_UNI.
	SPECIAL_CHARS: List[str] = [
	    "ruu",
	    "ru",
	]
	SPECIAL_CHARS_UNI: List[str] = [
	    "ෲ",
	    "ෘ",
	]

	# Number of vowel patterns: VOWELS, VOWELS_UNI and VOWEL_MODIFIERS_UNI each
	# list 26 entries. NOTE(review): the value is hard-coded, so it must be
	# updated by hand whenever the vowel tables change; consider deriving it
	# from len(VOWELS) instead.
	N_VOWELS: int = 26