Kalana001 committed on
Commit
3849559
·
verified ·
1 Parent(s): e131ad8

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +82 -0
  2. dictionary.pkl +3 -0
  3. sincode_model.py +268 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import time
from sincode_model import BeamSearchDecoder
from PIL import Image
import base64

# Page chrome must be configured before any other Streamlit call on the page.
st.set_page_config(page_title="සිංCode Prototype", page_icon="🇱🇰", layout="centered")
8
def add_bg_from_local(image_file):
    """Set a darkened, fixed, full-page background from a local image file.

    The image is inlined as a base64 data URI inside a <style> block so no
    static file serving is needed. A missing file is cosmetic only, so it
    is silently ignored and the app keeps rendering.
    """
    try:
        with open(image_file, "rb") as img:
            b64_data = base64.b64encode(img.read()).decode()
    except FileNotFoundError:
        # Background is optional — skip styling when the asset is absent.
        return

    st.markdown(
        f"""
        <style>
        .stApp {{
            background-image: linear-gradient(rgba(0,0,0,0.7), rgba(0,0,0,0.7)), url(data:image/png;base64,{b64_data});
            background-size: cover;
            background-position: center;
            background-attachment: fixed;
        }}
        </style>
        """,
        unsafe_allow_html=True
    )
29
+
30
@st.cache_resource
def load_system():
    """Build the transliteration decoder once; Streamlit reuses it across reruns."""
    return BeamSearchDecoder()
34
+
35
# Static assets live on the author's Google Drive (Colab-style deployment)
# — TODO confirm these paths when deploying outside Colab.
background_path = "/content/drive/MyDrive/FYP/background.png"
add_bg_from_local(background_path)

# --- Sidebar: branding and a short architecture summary ---
with st.sidebar:
    logo = Image.open("/content/drive/MyDrive/FYP/SinCodeLogo.jpg")
    st.image(logo, width=200)
    st.title("සිංCode Project")
    st.info("Prototype")
    st.markdown("### 🏗 Architecture")
    st.success("""
**Hybrid Neuro-Symbolic Engine**
Combines rule-based speed with Deep Learning (XLM-R) context awareness.

**Adaptive Code-Switching**
Intelligently detects and preserves English contexts.

**Contextual Disambiguation**
Resolves Singlish ambiguity using sentence-level probability.
""")

    st.markdown("---")
    st.write("© 2026 Kalana Chandrasekara")

# --- Main panel: input box, trigger button, results and debug trace ---
st.title("සිංCode: Context-Aware Transliteration")
st.markdown("Type Singlish sentences below. The system handles **code-mixing**, **ambiguity**, and **punctuation**.")

input_text = st.text_area("Input Text", height=100, placeholder="e.g., Singlish sentences type krnna")

if st.button("Transliterate", type="primary", use_container_width=True) and input_text:
    try:
        with st.spinner("Processing..."):
            # Model + dictionary loading is cached, so only the first click is slow.
            decoder = load_system()
            start_time = time.time()
            result, trace_logs = decoder.decode(input_text)
            end_time = time.time()

        st.success("Transliteration Complete")
        st.markdown(f"### {result}")
        st.caption(f"Time: {round(end_time - start_time, 2)}s")

        # Per-word candidate scores emitted by the beam search, for transparency.
        with st.expander("See How It Works (Debug Info)", expanded=True):
            st.write("Below shows the candidate scoring for each word step:")
            for log in trace_logs:
                st.markdown(log)
                st.divider()

    except Exception as e:
        # Surface any decoding failure in the UI instead of crashing the app.
        st.error(f"Error: {e}")
dictionary.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7444b74f2fcf8f208e47f087ee778f11086eab74f54e4f3e07fb6cc06c88ea8
3
+ size 326599345
sincode_model.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import math
3
+ import re
4
+ import os
5
+ import requests
6
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
7
+
8
# --- 0. SETUP ROBUST ENGLISH VOCAB ---
def load_english_corpus():
    """Return a set of English words used to detect code-switched tokens.

    Starts from a small hand-picked "safety" list that must always be
    present, then tries to extend it with the google-10000-english 20k
    word list (downloaded once, cached to ``english_20k.txt``). Every
    failure mode (offline, HTTP error, unreadable cache) degrades
    gracefully back to the core list.
    """
    # 1. Core "safety" words: domain terms the generic corpus may lack.
    core_english = {
        "transliteration", "sincode", "prototype", "assignment", "singlish",
        "rest", "complete", "tutorial", "small", "mistakes", "game", "play",
        "type", "test", "online", "code", "mixing", "project", "demo", "today",
        "tomorrow", "presentation", "slide"
    }

    url = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
    file_path = "english_20k.txt"

    download_success = False

    # 2. Try to load/download the 20k corpus (cached on disk after first run).
    if not os.path.exists(file_path):
        try:
            print("🌐 Downloading English Corpus...")
            r = requests.get(url, timeout=5)
            # FIX: check the HTTP status before caching — previously a 404/error
            # page would have been written to disk and treated as the corpus.
            r.raise_for_status()
            with open(file_path, "wb") as f:
                f.write(r.content)
            download_success = True
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Exception is the narrowest safe choice here
        # because any failure (network, DNS, disk) should fall back.
        except Exception:
            print("Internet Warning: Could not download English corpus. Using fallback list.")
    else:
        download_success = True

    # 3. Combine lists.
    full_vocab = core_english.copy()

    if download_success and os.path.exists(file_path):
        try:
            # FIX: explicit encoding — the word list is plain ASCII/UTF-8 text
            # and must not depend on the platform's locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                full_vocab.update(f.read().splitlines())
        # FIX: narrowed from a bare `except:` — only I/O errors are expected.
        except OSError:
            pass

    print(f"English Vocab Loaded: {len(full_vocab)} words")
    return full_vocab

# Module-level vocabulary, shared by DictionaryAdapter and BeamSearchDecoder.
ENGLISH_VOCAB = load_english_corpus()
+
52
# --- 1. RULE BASED ENGINE ---
# Parallel substitution tables mapping Singlish (Latin) sequences to Sinhala
# Unicode: entry i of each Latin table corresponds to entry i of its *Uni
# partner. Multi-letter sequences come first so longest matches win.
# NOTE(review): several entries (e.g. "o\\)", "\\a", "\\u005C" + "y") contain
# literal backslashes left over from a JavaScript/regex port and can only
# match if the user literally types a backslash — kept as-is to preserve
# existing behavior; confirm intent before changing.
nVowels = 26
consonants = ["nnd", "nndh", "nng", "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh", "GN", "KN", "Lu", "kh", "Th", "Dh", "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m", "\\u005C" + "y", "Y", "y", "j", "l", "v", "w", "s", "h", "N", "L", "K", "G", "P", "B", "f", "g", "r"]
consonantsUni = ["ඬ", "ඳ", "ඟ", "ත", "ධ", "ඝ", "ච", "ඵ", "භ", "ඣ", "ෂ", "ඥ", "ඤ", "ළු", "ඛ", "ඨ", "ඪ", "ශ", "ද", "ච", "ත", "ට", "ක", "ඩ", "න", "ප", "බ", "ම", "‍ය", "‍ය", "ය", "ජ", "ල", "ව", "ව", "ස", "හ", "ණ", "ළ", "ඛ", "ඝ", "ඵ", "ඹ", "ෆ", "ග", "ර"]
vowels = ["oo", "o\\)", "oe", "aa", "a\\)", "Aa", "A\\)", "ae", "ii", "i\\)", "ie", "ee", "ea", "e\\)", "ei", "uu", "u\\)", "au", "\\a", "a", "A", "i", "e", "u", "o", "I"]
vowelsUni = ["ඌ", "ඕ", "ඕ", "ආ", "ආ", "ඈ", "ඈ", "ඈ", "ඊ", "ඊ", "ඊ", "ඊ", "ඒ", "ඒ", "ඒ", "ඌ", "ඌ", "ඖ", "ඇ", "අ", "ඇ", "ඉ", "එ", "උ", "ඔ", "ඓ"]
vowelModifiersUni = ["ූ", "ෝ", "ෝ", "ා", "ා", "ෑ", "ෑ", "ෑ", "ී", "ී", "ී", "ී", "ේ", "ේ", "ේ", "ූ", "ූ", "ෞ", "ැ", "", "ැ", "ි", "ෙ", "ු", "ො", "ෛ"]
specialConsonants = ["\\n", "\\h", "\\N", "\\R", "R", "\\r"]
specialConsonantsUni = ["ං", "ඃ", "ඞ", "ඍ", "ර්"+"\u200D", "ර්"+"\u200D"]
specialChar = ["ruu", "ru"]
specialCharUni = ["ෲ", "ෘ"]

def rule_based_transliterate(text):
    """Transliterate Singlish ``text`` to Sinhala by ordered table substitution.

    Passes run from most-specific to least-specific pattern so longer
    Latin sequences are consumed before their prefixes:
      1. special consonants, 2. consonant + "ruu"/"ru",
      3. consonant + "r" (+ vowel) clusters, 4. consonant + vowel,
      5. bare consonants (with hal kirima), 6. bare vowels.

    FIX: removed the dead ``r = s.replace(s + "/G", "")`` lines that
    preceded every substitution — ``s + "/G"`` is strictly longer than
    ``s`` and can never occur inside it, so ``r`` was always just ``s``
    (a leftover from a JavaScript ``/g`` regex-flag port). Behavior is
    unchanged.
    """
    # 1. Special consonants (anusvara, visarga, etc.).
    for i in range(len(specialConsonants)):
        text = text.replace(specialConsonants[i], specialConsonantsUni[i])
    # 2. Consonant + "ruu"/"ru" combinations.
    for i in range(len(specialCharUni)):
        for j in range(len(consonants)):
            text = text.replace(consonants[j] + specialChar[i],
                                consonantsUni[j] + specialCharUni[i])
    # 3. Consonant + "r" (+ optional vowel) clusters.
    for j in range(len(consonants)):
        for i in range(len(vowels)):
            text = text.replace(consonants[j] + "r" + vowels[i],
                                consonantsUni[j] + "්‍ර" + vowelModifiersUni[i])
        text = text.replace(consonants[j] + "r", consonantsUni[j] + "්‍ර")
    # 4. Consonant + vowel -> consonant with vowel modifier.
    for i in range(len(consonants)):
        for j in range(nVowels):
            text = text.replace(consonants[i] + vowels[j],
                                consonantsUni[i] + vowelModifiersUni[j])
    # 5. Remaining bare consonants take the vowel-killer (hal kirima).
    for i in range(len(consonants)):
        text = text.replace(consonants[i], consonantsUni[i] + "්")
    # 6. Remaining bare vowels map to independent vowel letters.
    for i in range(len(vowels)):
        text = text.replace(vowels[i], vowelsUni[i])
    return text
97
+
98
# --- 2. DICTIONARY ADAPTER ---
class DictionaryAdapter:
    """Candidate lookup over the pickled Singlish→Sinhala dictionary.

    Also consults the module-level ENGLISH_VOCAB to let known English
    words pass through untransliterated (code-switching support).
    """

    def __init__(self, dictionary_dict):
        # Mapping: Singlish word -> list of Sinhala candidate spellings.
        self.dictionary = dictionary_dict

    def get_candidates(self, word):
        """Return ordered candidate outputs for ``word`` (possibly empty).

        Priority: (1) the word itself if it is known English, (2) direct
        dictionary hits (exact casing before lower-cased), (3) a two-way
        subword-split fallback for longer unknown words.
        """
        cands = []
        word_lower = word.lower()

        # 1. English corpus check: known English words pass through unchanged.
        if word_lower in ENGLISH_VOCAB:
            cands.append(word)

        # 2. Sinhala dictionary check (exact casing wins over lower-cased).
        if word in self.dictionary:
            cands.extend(self.dictionary[word])
        elif word_lower in self.dictionary:
            cands.extend(self.dictionary[word_lower])

        # 3. De-duplicate while preserving priority order.
        if cands:
            return list(dict.fromkeys(cands))

        # 4. Fallback: split into two dictionary words (only if nothing matched).
        length = len(word)
        if length > 3:
            # Splits keep at least 2 chars on each side.
            for i in range(2, length - 1):
                part1 = word[:i]
                part2 = word[i:]
                p1_cands = self.dictionary.get(part1) or self.dictionary.get(part1.lower())
                p2_cands = self.dictionary.get(part2) or self.dictionary.get(part2.lower())

                if p1_cands and p2_cands:
                    # FIX: dropped the unused `enumerate` ranks — only the
                    # cross-product of the top-3 candidates per half is needed.
                    for w1 in p1_cands[:3]:
                        for w2 in p2_cands[:3]:
                            cands.append(w1 + w2)

        if cands:
            # FIX: dict.fromkeys instead of set() — de-duplication is now
            # order-preserving and deterministic across runs, consistent
            # with the early-return branch above.
            return list(dict.fromkeys(cands))
        return []

    def get_rule_output(self, word):
        """Deterministic rule-based transliteration fallback for ``word``."""
        return rule_based_transliterate(word)
143
+
144
# --- 3. BEAM SEARCH DECODER (With Enhanced Trace) ---
class BeamSearchDecoder:
    """Sentence-level Singlish→Sinhala decoder.

    Combines dictionary/rule candidates (DictionaryAdapter) with an
    XLM-RoBERTa masked-LM that scores each candidate in sentence
    context; the best sequence is found with a small beam search.
    ``decode`` also returns human-readable trace logs for the UI.
    """

    def __init__(self, model_name="FacebookAI/xlm-roberta-base", dictionary_path="dictionary.pkl", device=None):
        # Prefer GPU when available unless the caller pins a device.
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()  # inference only — disables dropout etc.

        # NOTE(review): pickle.load on an external file is unsafe for
        # untrusted input; acceptable here only because the dictionary
        # ships with the app.
        import pickle
        with open(dictionary_path, "rb") as f:
            d_data = pickle.load(f)
        self.adapter = DictionaryAdapter(d_data)

    def batch_score(self, contexts, candidates):
        """Score each (masked context, candidate) pair with the masked LM.

        contexts: list of strings, each assumed to contain a single mask
            token (built by ``decode``) — TODO confirm if reused elsewhere.
        candidates: parallel list of target words for the masked slot.

        Returns one score per pair: the candidate's token log-probabilities
        at the mask position, averaged over its subword tokens to avoid
        penalizing longer words. -100.0 is a sentinel for "no mask found"
        (e.g. mask truncated away) or "candidate encodes to no tokens".
        """
        inputs = self.tokenizer(contexts, return_tensors="pt", padding=True, truncation=True).to(self.device)
        mask_token_id = self.tokenizer.mask_token_id
        scores = []
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits

        for i, target in enumerate(candidates):
            token_ids = inputs.input_ids[i]
            mask_indices = (token_ids == mask_token_id).nonzero(as_tuple=True)
            if len(mask_indices[0]) == 0:
                # Mask may be lost to truncation on very long contexts.
                scores.append(-100.0); continue

            # Position of the (single) mask token in this row.
            mask_pos = mask_indices[0].item()
            probs = torch.softmax(logits[i, mask_pos, :], dim=0)
            target_ids = self.tokenizer.encode(target, add_special_tokens=False)

            if not target_ids:
                scores.append(-100.0); continue

            # Sum of log-probs (1e-9 guards log(0)), normalized by length.
            word_score = sum([math.log(probs[tid].item() + 1e-9) for tid in target_ids])
            scores.append(word_score / len(target_ids))
        return scores

    def decode(self, sentence, beam_width=3):
        """Transliterate ``sentence`` word by word with beam search.

        Returns (final_output, trace_logs) where trace_logs is a list of
        markdown strings, one per word step, showing candidate scores.
        """
        words = sentence.split()
        candidate_sets, penalties, future_context = [], [], []
        # Splits each token into (leading punctuation, core, trailing punctuation)
        # so punctuation is preserved around the transliterated core.
        punct_pattern = re.compile(r"^(\W*)(.*?)(\W*)$")
        trace_logs = []

        # --- Pass 1: per-word candidates, heuristic penalties, and a greedy
        # "future context" (best candidate per word) used to fill in the
        # right-hand side of masked contexts during the beam search.
        for raw in words:
            match = punct_pattern.match(raw)
            prefix, core, suffix = match.groups() if match else ("", raw, "")

            if not core:
                # Pure punctuation token: keep as-is with zero penalty.
                candidate_sets.append([raw]); penalties.append([0.0]); future_context.append(raw)
                continue

            # 1. Get Candidates
            cands = self.adapter.get_candidates(core)
            rule_cand = self.adapter.get_rule_output(core)

            if not cands:
                # No dictionary hit: fall back to the rule engine output.
                cands = [rule_cand]
                curr_penalties = [0.0]
            else:
                curr_penalties = []
                has_english = any(c.lower() in ENGLISH_VOCAB for c in cands)

                # Penalty scheme: English candidates are free; when an English
                # reading exists, non-English alternatives pay 5.0; otherwise
                # the rule-engine match is free and other dictionary entries
                # pay a mild 2.0.
                for c in cands:
                    is_eng = c.lower() in ENGLISH_VOCAB
                    is_rule_match = (c == rule_cand)

                    if is_eng:
                        curr_penalties.append(0.0)
                    elif has_english:
                        curr_penalties.append(5.0)
                    elif is_rule_match:
                        curr_penalties.append(0.0)
                    else:
                        curr_penalties.append(2.0)

            # Re-attach surrounding punctuation; cap at 6 candidates per word.
            final_cands = [prefix + c + suffix for c in cands]
            candidate_sets.append(final_cands[:6])
            penalties.append(curr_penalties[:6])
            # Greedy best (lowest penalty) candidate seeds the future context.
            best_idx = curr_penalties.index(min(curr_penalties))
            future_context.append(final_cands[best_idx])

        # --- Pass 2: beam search. Each beam entry is (path, cumulative score).
        beam = [([], 0.0)]
        for t in range(len(words)):
            candidates = candidate_sets[t]
            curr_penalties = penalties[t]
            next_beam = []

            # Build one masked context per (beam path, candidate) pair so all
            # pairs for this step are scored in a single model batch.
            batch_ctx, batch_tgt, batch_meta = [], [], []

            for p_idx, (p_path, p_score) in enumerate(beam):
                for c_idx, cand in enumerate(candidates):
                    future = future_context[t+1:] if t+1 < len(words) else []
                    # Context = decoded prefix + <mask> + greedy future words.
                    ctx = " ".join(p_path + [self.tokenizer.mask_token] + future)
                    batch_ctx.append(ctx)
                    batch_tgt.append(cand)
                    batch_meta.append((p_idx, c_idx))

            if batch_ctx:
                scores = self.batch_score(batch_ctx, batch_tgt)
                # --- TRACE LOGGING ---
                step_log = f"**Step {t+1}: {words[t]}**\n"
                for i, score in enumerate(scores):
                    p_idx, c_idx = batch_meta[i]
                    orig_path, orig_score = beam[p_idx]
                    # LM score minus the heuristic penalty for this candidate.
                    final_score = score - curr_penalties[c_idx]
                    next_beam.append((orig_path + [batch_tgt[i]], orig_score + final_score))

                    # Add to log if score is reasonable (reduce noise)
                    if score > -25.0:
                        word = batch_tgt[i]
                        penalty = curr_penalties[c_idx]
                        step_log += f"- `{word}` (Pen: {penalty}) -> **{final_score:.2f}**\n"
                trace_logs.append(step_log)

            if not next_beam: continue
            # Keep only the top `beam_width` scoring paths for the next step.
            beam = sorted(next_beam, key=lambda x: x[1], reverse=True)[:beam_width]

        final_output = " ".join(beam[0][0]) if beam else ""
        return final_output, trace_logs