Kalana committed on
Commit
237f296
·
1 Parent(s): dbf34e8

Switch to fine-tuned model (Kalana001/xlm-roberta-sinhala-sincode)

Browse files

- Model uploaded to HF Hub for clean deployment
- Expanded COMMON_WORDS overrides (+25 entries)
- Patched dictionary.pkl (correct forms, removed wrong candidates)
- Added tech English words to CORE_ENGLISH_WORDS
- Evaluation: 101/110 (91.8%) exact match, CER=0.007

Files changed (4) hide show
  1. core/constants.py +2 -1
  2. core/english.py +1 -0
  3. core/mappings.py +36 -1
  4. dictionary.pkl +2 -2
core/constants.py CHANGED
@@ -6,7 +6,8 @@ import re
6
 
7
  # ─── Model & Data Paths ─────────────────────────────────────────────────────
8
 
9
- DEFAULT_MODEL_NAME = "FacebookAI/xlm-roberta-base"
 
10
  DEFAULT_DICTIONARY_PATH = "dictionary.pkl"
11
 
12
  ENGLISH_CORPUS_URL = (
 
6
 
7
  # ─── Model & Data Paths ─────────────────────────────────────────────────────
8
 
9
+ # DEFAULT_MODEL_NAME = "FacebookAI/xlm-roberta-base"
10
+ DEFAULT_MODEL_NAME = "Kalana001/xlm-roberta-sinhala-sincode"
11
  DEFAULT_DICTIONARY_PATH = "dictionary.pkl"
12
 
13
  ENGLISH_CORPUS_URL = (
core/english.py CHANGED
@@ -24,6 +24,7 @@ CORE_ENGLISH_WORDS: Set[str] = {
24
  "hall", "exam", "PR", "DM", "page", "app", "bug", "fix",
25
  "log", "push", "pull", "branch", "build", "run", "save",
26
  "link", "edit", "file", "open", "close", "live", "view",
 
27
  }
28
 
29
 
 
24
  "hall", "exam", "PR", "DM", "page", "app", "bug", "fix",
25
  "log", "push", "pull", "branch", "build", "run", "save",
26
  "link", "edit", "file", "open", "close", "live", "view",
27
+ "deployments", "leaderboard", "instagram", "github", "standup",
28
  }
29
 
30
 
core/mappings.py CHANGED
@@ -95,7 +95,7 @@ COMMON_WORDS: Dict[str, str] = {
95
  "it": "IT",
96
  "qa": "QA",
97
  "ui": "UI",
98
- "ok": "OK",
99
  # Common ad-hoc abbreviations (contd.)
100
  "ek": "එක", # eka (short form)
101
  "ekta": "එකට", # ekata = to that one
@@ -114,6 +114,41 @@ COMMON_WORDS: Dict[str, str] = {
114
  "hadamu": "හදමු", # let's make
115
  "kiyawala": "කියවලා", # having read
116
  "baya": "බය", # fear/scared
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  }
118
 
119
  # Context-dependent words: use this form ONLY when the previous word is
 
95
  "it": "IT",
96
  "qa": "QA",
97
  "ui": "UI",
98
+ "ok": "ok",
99
  # Common ad-hoc abbreviations (contd.)
100
  "ek": "එක", # eka (short form)
101
  "ekta": "එකට", # ekata = to that one
 
114
  "hadamu": "හදමු", # let's make
115
  "kiyawala": "කියවලා", # having read
116
  "baya": "බය", # fear/scared
117
+ # Ad-hoc and alternative spellings (accuracy fixes)
118
+ "kema": "කෑම", # food (colloquial spelling)
119
+ "kama": "කෑම", # food (alt spelling)
120
+ "hodai": "හොඳයි", # good! (no-n spelling)
121
+ "oyge": "ඔයාගෙ", # your (shortened form)
122
+ "iwra": "ඉවර", # finished (vowel-stripped)
123
+ "krd": "කරාද", # did? (extreme abbreviation)
124
+ "handawata": "හැන්දෑවට", # in the evening
125
+ "wenwa": "වෙනවා", # becomes/happens
126
+ "ep": "එපා", # epa (single-syllable abbrev)
127
+ "prashnya": "ප්\u200dරශ්\u200dනය", # question (without final vowel)
128
+ # ── Verb forms / participles (no English conflict) ────────────────────
129
+ "penawa": "පේනවා", # appears/visible (alt spelling of penenawa)
130
+ "thiyana": "තියෙන", # that which is/exists (relative participle)
131
+ "enakota": "එනකොට", # when (you/they) come
132
+ "hadanna": "හදන්න", # to make/build (imperative)
133
+ "yawwa": "යැව්වා", # sent (alt spelling of yewwa)
134
+ "gihilla": "ගිහිල්ලා", # having gone
135
+ "kewata": "කෑවට", # having eaten / for the eating
136
+ "kiyla": "කියලා", # having said (ad-hoc spelling)
137
+ "krganna": "කරගන්න", # to do-and-get (ad-hoc abbreviation)
138
+ # ── Adjectives (no English conflict) ────────────────────────────────────
139
+ "amarui": "අමාරුයි", # difficult / hard
140
+ "hodama": "හොඳම", # best (superlative of honda)
141
+ # ── Particles / negation (no English conflict) ───────────────────────────
142
+ "nathi": "නැති", # without / lacking (negation)
143
+ "nati": "නැති", # without (alt spelling)
144
+ "naththe": "නැත්තෙ", # negative participle (not ...ing)
145
+ "dan": "දැන්", # now
146
+ "oni": "ඕනි", # need/want (alt spelling of one)
147
+ # ── Time ────────────────────────────────────────────────────────────────
148
+ "udee": "උදේ", # morning
149
+ # ── Ad-hoc abbreviations (no English conflict) ───────────────────────────
150
+ "hri": "හරි", # ok/right (shortened hari)
151
+ "mge": "මගේ", # my (shortened mage)
152
  }
153
 
154
  # Context-dependent words: use this form ONLY when the previous word is
dictionary.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7444b74f2fcf8f208e47f087ee778f11086eab74f54e4f3e07fb6cc06c88ea8
3
- size 326599345
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95bee1990d76dfe091be460c9ec06798fb4589a707c9823fb7324f3b47c00744
3
+ size 326599128