Spaces:

ravish5
/

ShabdaAI

Running

App Files Files Community

ravish5 committed on Sep 30, 2025

Commit

4082cf2

verified ·

1 Parent(s): f604ef6

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -74

app.py CHANGED Viewed

@@ -1,34 +1,55 @@
-import os, re, pathlib
 import numpy as np
 import pandas as pd
 import torch
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 from sentence_transformers import SentenceTransformer
 import gradio as gr
-# --- Setup paths ---
 PROJECT_DIR = pathlib.Path(__file__).parent.resolve()
 DATA_DIR = PROJECT_DIR / "data"
 DATA_DIR.mkdir(parents=True, exist_ok=True)
 CSV_PATH = DATA_DIR / "sample_indic.csv"
-# --- Load dataset ---
-df = pd.read_csv(CSV_PATH, encoding="utf-8")
 _ZW = r"\u200b\u200c\u200d\ufeff"
 ZW_RE = re.compile(f"[{_ZW}]")
 def normalize_text(s: str) -> str:
     if not isinstance(s, str):
         return ""
-    s = s.replace("\u0964", "।")  # danda fix
     s = ZW_RE.sub("", s)
     s = re.sub(r"\s+", " ", s).strip()
     return s
 df["context_norm"] = df["context"].apply(normalize_text)
-# --- Embedding model ---
 EMB_MODEL_NAME = "intfloat/multilingual-e5-base"
 emb_model = SentenceTransformer(EMB_MODEL_NAME)
 emb_model.eval()
@@ -37,127 +58,199 @@ def encode_queries(texts):
     texts = [normalize_text(t) for t in texts]
     prefixed = [f"query: {t}" for t in texts]
     with torch.inference_mode():
-        return emb_model.encode(prefixed, normalize_embeddings=True)
 def encode_passages(texts):
     texts = [normalize_text(t) for t in texts]
     prefixed = [f"passage: {t}" for t in texts]
     with torch.inference_mode():
-        return emb_model.encode(prefixed, normalize_embeddings=True)
-# --- Build embeddings for whole dataset ---
-PASSAGE_EMBS = encode_passages(df["context_norm"].tolist())
-# --- Retriever ---
-def retrieve_top_k(query: str, lang_code: str, k: int = 3):
-    if not query.strip():
         return []
     qv = encode_queries([query])[0]
     sims = np.dot(PASSAGE_EMBS, qv)
-    mask = (df["language"] == lang_code).to_numpy()
-    sims = np.where(mask, sims, -1e9)
     idxs = np.argsort(-sims)[:k]
     results = []
     for rank, i in enumerate(idxs):
-        if sims[i] < -1e8:
-            continue
-        results.append({"rank": int(rank+1), "similarity": float(sims[i]), "context": df.iloc[i]["context_norm"]})
     return results
-# --- QA reader ---
 READER_MODEL = "deepset/xlm-roberta-large-squad2"
 device = 0 if torch.cuda.is_available() else -1
-qa = pipeline("question-answering", model=READER_MODEL, tokenizer=AutoTokenizer.from_pretrained(READER_MODEL), device=device)
-def answer_with_context(question, context):
-    out = qa(question=normalize_text(question), context=normalize_text(context))
-    return {"answer": out.get("answer","").strip(), "score": float(out.get("score",0.0))}
-# --- Translators (NLLB-200) ---
 NLLB_ID = "facebook/nllb-200-distilled-600M"
-nllb_tok = AutoTokenizer.from_pretrained(NLLB_ID)
 nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_ID)
-def build_translator(src, tgt):
-    return pipeline("translation", model=nllb_model, tokenizer=nllb_tok, src_lang=src, tgt_lang=tgt, device=device)
-trans_te_en = build_translator("tel_Telu", "eng_Latn")
-trans_kn_en = build_translator("kan_Knda", "eng_Latn")
-def te_to_en(text): return trans_te_en(text, max_length=256)[0]["translation_text"].strip() if text else ""
-def kn_to_en(text): return trans_kn_en(text, max_length=256)[0]["translation_text"].strip() if text else ""
-# --- Gradio App ---
 INTRO_MD = """
-### ShabdaAI (Telugu + Kannada ↔ English)
-- **Mode 1:** Answer using provided context passage
-- **Mode 2:** If no passage, retrieve from small Telugu+Kannada corpus
-> Retrieval: **intfloat/multilingual-e5-base**
-> Reader: **deepset/xlm-roberta-large-squad2**
-> Translation: **NLLB-200**
 """
-def ui_answer(mode, lang_choice, translate_outputs_en, translate_inputs_en, question, user_context, top_k):
-    if not question:
-        return "", "", "0.000", "", "—", "—", "—"
-    # Pick language + translator
     if lang_choice == "Telugu":
-        to_en = te_to_en; lang_code = "te"
     else:
-        to_en = kn_to_en; lang_code = "kn"
-    # Input translations
-    q_en = to_en(question) if translate_inputs_en else "—"
-    ctx_en = to_en(user_context) if translate_inputs_en and user_context else "—"
     if mode == "With my context":
         res = answer_with_context(question, user_context)
-        ans = res["answer"]; score = res["score"]
         ans_en = to_en(ans) if translate_outputs_en and ans else ""
-        return ans, ans_en, f"{score:.3f}", user_context, ctx_en, q_en, "—"
     else:
-        cands = retrieve_top_k(question, lang_code, k=top_k)
-        best = {"answer":"", "score":-1.0, "context":""}
-        for c in cands:
-            out = answer_with_context(question, c["context"])
-            if out["score"] > best["score"]:
-                best = {"answer":out["answer"], "score":out["score"], "context":c["context"]}
-        ans = best["answer"]; ans_en = to_en(ans) if translate_outputs_en and ans else ""
-        tbl = "\n".join([f"{r['rank']}. (sim={r['similarity']:.3f}) {r['context']}" for r in cands]) or "—"
-        return ans, ans_en, f"{best['score']:.3f}", best["context"], ctx_en, q_en, tbl
 with gr.Blocks() as demo:
     gr.Markdown(INTRO_MD)
     with gr.Row():
-        mode = gr.Radio(choices=["With my context","No context (search sample data)"], value="With my context", label="Mode")
-        lang_choice = gr.Dropdown(choices=["Telugu","Kannada"], value="Telugu", label="Language")
-        top_k = gr.Slider(1,5,value=3,step=1,label="Top-K passages (for No-context mode)")
     with gr.Row():
-        translate_outputs_en = gr.Checkbox(value=True, label="Translate Answer → English")
-        translate_inputs_en  = gr.Checkbox(value=True, label="Translate Inputs → English")
-    question = gr.Textbox(label="Question", placeholder="e.g. హైదరాబాద్ ఎక్కడ ఉంది? / ಬೆಂಗಳೂರು ಯಾವ ರಾಜ್ಯದ ರಾಜಧಾನಿ?")
-    user_context = gr.Textbox(label="Passage / Context (optional)", lines=4)
     btn = gr.Button("Answer")
-    # Outputs
-    answer_lang = gr.Textbox(label="Answer (Original Language)")
     answer_en = gr.Textbox(label="Answer (English)")
     score = gr.Textbox(label="Confidence score")
-    used_ctx = gr.Textbox(label="Used context (Original)")
     ctx_en_box = gr.Textbox(label="Used context (English)")
     q_en_box = gr.Textbox(label="Question (English)")
-    retrieved = gr.Textbox(label="Top-K retrieved passages", lines=4)
     btn.click(
         fn=ui_answer,
-        inputs=[mode, lang_choice, translate_outputs_en, translate_inputs_en, question, user_context, top_k],
-        outputs=[answer_lang, answer_en, score, used_ctx, ctx_en_box, q_en_box, retrieved]
     )
 if __name__ == "__main__":

+import os, re, pathlib, json
 import numpy as np
 import pandas as pd
 import torch
+from transformers import pipeline, AutoTokenizer
 from sentence_transformers import SentenceTransformer
+from transformers import AutoModelForSeq2SeqLM
 import gradio as gr
 PROJECT_DIR = pathlib.Path(__file__).parent.resolve()
 DATA_DIR = PROJECT_DIR / "data"
 DATA_DIR.mkdir(parents=True, exist_ok=True)
 CSV_PATH = DATA_DIR / "sample_indic.csv"
+SAMPLE_ROWS = [
+    {"id":"kn1","language":"kn","context":"ಬೆಂಗಳೂರು ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ.","question":"ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ ಯಾವುದು?","answer_text":"ಬೆಂಗಳೂರು"},
+    {"id":"kn2","language":"kn","context":"ಕನ್ನಡ ಒಂದು ದ್ರಾವಿಡ ಭಾಷೆ.","question":"ಕನ್ನಡ ಯಾವ ಭಾಷಾ ಕುಟುಂಬಕ್ಕೆ ಸೇರಿದೆ?","answer_text":"ದ್ರಾವಿಡ"},
+    {"id":"kn3","language":"kn","context":"ಮೈಸೂರು ಅರಮನೆ ಕರ್ನಾಟಕದ ಪ್ರಸಿದ್ಧ ತಾಣ.","question":"ಮೈಸೂರು ಅರಮನೆ ಎಲ್ಲಿದೆ?","answer_text":"ಕರ್ನಾಟಕ"},
+    {"id":"kn4","language":"kn","context":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್ ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜನಾಗಿದ್ದನು.","question":"ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜ ಯಾರು?","answer_text":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್"},
+    {"id":"kn5","language":"kn","context":"ಹಂಪಿ ಯುನೆಸ್ಕೋ ವಿಶ್ವ ಪರಂಪರೆ ತಾಣವಾಗಿದೆ.","question":"ಹಂಪಿ ಯಾವ ರೀತಿಯ ತಾಣ?","answer_text":"ವಿಶ್ವ ಪರಂಪರೆ ತಾಣ"},
+    {"id":"te1","language":"te","context":"తెలంగాణ రాష్ట్ర రాజధాని హైదరాబాదు. ఈ నగరం ఐటి పరిశ్రమకు ప్రసిద్ధి.","question":"తెలంగాణ రాష్ట్ర రాజధాని ఏది?","answer_text":"హైదరాబాదు"},
+    {"id":"te2","language":"te","context":"తెలుగు భాష ద్రావిడ భాషా కుటుంబానికి చెందినది. దాని లిపి తెలుగు లిపి.","question":"తెలుగు భాష ఏ లిపిని ఉపయోగిస్తుంది?","answer_text":"తెలుగు లిపి"},
+    {"id":"te3","language":"te","context":"సీతాకోక చిలుకలకు రెండు రెక్కలు ఉంటాయి. ఇవి పూల మకరందం తాగుతాయి.","question":"సీతాకోక చిలుకకు ఎన్ని రెక్కలు ఉన్నాయి?","answer_text":"రెండు"},
+    {"id":"te4","language":"te","context":"విశాఖపట్నం ఒక తీర నగరం. ఇది ఆంధ్రప్రదేశ్‌లోని ప్రముఖ నౌకాశ్రయం.","question":"విశాఖపట్నం ఏ రకమైన నగరం?","answer_text":"తీర నగరం"},
+    {"id":"te5","language":"te","context":"చార్మినార్ హైదరాబాద్ లో ఉంది. ఇది చారిత్రక స్మారక చిహ్నం.","question":"చార్మినార్ ఎక్కడ ఉంది?","answer_text":"హైదరాబాద్"},
+]
def ensure_sample_csv(path: pathlib.Path, rows=None):
    """Create the sample corpus CSV at ``path`` if it does not exist yet.

    An existing file is left untouched, so a user-supplied corpus is never
    overwritten.

    Args:
        path: Destination CSV file.
        rows: Optional list of row dicts to write. Defaults to the
            module-level ``SAMPLE_ROWS`` (Kannada and Telugu examples).
    """
    if path.exists():
        return
    if rows is None:
        rows = SAMPLE_ROWS
    # Do not rely on the caller having created the directory beforehand.
    path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(rows).to_csv(path, index=False, encoding="utf-8")
    print(f"[init] Wrote sample data ({len(rows)} rows) to {path}")
+ensure_sample_csv(CSV_PATH)
 _ZW = r"\u200b\u200c\u200d\ufeff"
 ZW_RE = re.compile(f"[{_ZW}]")
 def normalize_text(s: str) -> str:
     if not isinstance(s, str):
         return ""
+    s = s.replace("\u0964", "।")
     s = ZW_RE.sub("", s)
     s = re.sub(r"\s+", " ", s).strip()
     return s
+df = pd.read_csv(CSV_PATH, encoding="utf-8")
 df["context_norm"] = df["context"].apply(normalize_text)
+CORPUS = df["context_norm"].tolist()
 EMB_MODEL_NAME = "intfloat/multilingual-e5-base"
 emb_model = SentenceTransformer(EMB_MODEL_NAME)
 emb_model.eval()
     texts = [normalize_text(t) for t in texts]
     prefixed = [f"query: {t}" for t in texts]
     with torch.inference_mode():
+        vecs = emb_model.encode(prefixed, normalize_embeddings=True)
+    return vecs
 def encode_passages(texts):
     texts = [normalize_text(t) for t in texts]
     prefixed = [f"passage: {t}" for t in texts]
     with torch.inference_mode():
+        vecs = emb_model.encode(prefixed, normalize_embeddings=True)
+    return vecs
+PASSAGE_EMBS = encode_passages(CORPUS)
def retrieve_top_k(query: str, k: int = 3):
    """Return the ``k`` corpus passages most similar to ``query``.

    Args:
        query: Question text; ``None`` or blank yields an empty list.
        k: Maximum number of passages. Coerced to ``int`` because UI sliders
            may deliver a float; values <= 0 yield an empty list.

    Returns:
        A list of dicts, best match first, with keys ``rank`` (1-based),
        ``similarity`` (float) and ``context`` (the normalised passage).
    """
    k = int(k)
    if k <= 0 or not query or not query.strip():
        return []
    qv = encode_queries([query])[0]
    # Both encoders request normalised embeddings, so this dot product is
    # the cosine similarity between the query and every passage.
    sims = np.dot(PASSAGE_EMBS, qv)
    top = np.argsort(-sims)[:k]
    return [
        {"rank": rank, "similarity": float(sims[i]), "context": CORPUS[i]}
        for rank, i in enumerate(top, start=1)
    ]
 READER_MODEL = "deepset/xlm-roberta-large-squad2"
 device = 0 if torch.cuda.is_available() else -1
+tokenizer = AutoTokenizer.from_pretrained(READER_MODEL, use_fast=True)
+qa = pipeline("question-answering", model=READER_MODEL, tokenizer=tokenizer, device=device)
+# --- Telugu / Kannada -> English translators (offline, NLLB-200) ---
+# Model: facebook/nllb-200-distilled-600M
+# Telugu = 'tel_Telu', Kannada = 'kan_Knda', English = 'eng_Latn'
 NLLB_ID = "facebook/nllb-200-distilled-600M"
+nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_ID)
 nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_ID)
+# Telugu -> English
+trans_te_en = pipeline(
+    "translation",
+    model=nllb_model,
+    tokenizer=nllb_tokenizer,
+    src_lang="tel_Telu",
+    tgt_lang="eng_Latn",
+    device=device
+)
def te_to_en(text: str, *, max_length: int = 256) -> str:
    """Translate Telugu ``text`` to English via the ``trans_te_en`` pipeline.

    Args:
        text: Telugu input. ``None``, non-string values and blank strings
            return ``""`` without invoking the model.
        max_length: Generation length cap forwarded to the pipeline
            (defaults to the previously hard-coded 256).

    Returns:
        The stripped English translation, or ``""`` for empty input.
    """
    if not isinstance(text, str):
        # Guard against None / NaN / numbers instead of failing on .strip().
        return ""
    text = text.strip()
    if not text:
        return ""
    return trans_te_en(text, max_length=max_length)[0]["translation_text"].strip()
+# Kannada -> English
+trans_kn_en = pipeline(
+    "translation",
+    model=nllb_model,
+    tokenizer=nllb_tokenizer,
+    src_lang="kan_Knda",
+    tgt_lang="eng_Latn",
+    device=device
+)
def kn_to_en(text: str, *, max_length: int = 256) -> str:
    """Translate Kannada ``text`` to English via the ``trans_kn_en`` pipeline.

    Args:
        text: Kannada input. ``None``, non-string values and blank strings
            return ``""`` without invoking the model.
        max_length: Generation length cap forwarded to the pipeline
            (defaults to the previously hard-coded 256).

    Returns:
        The stripped English translation, or ``""`` for empty input.
    """
    if not isinstance(text, str):
        # Guard against None / NaN / numbers instead of failing on .strip().
        return ""
    text = text.strip()
    if not text:
        return ""
    return trans_kn_en(text, max_length=max_length)[0]["translation_text"].strip()
def answer_with_context(question: str, context: str):
    """Run the extractive QA pipeline on a question/passage pair.

    Both inputs are normalised with ``normalize_text`` first. If either is
    empty afterwards, the reader is not called and an empty answer with
    score ``0.0`` is returned.

    Returns:
        ``{"answer": str, "score": float}``.
    """
    q = normalize_text(question)
    ctx = normalize_text(context)
    if not (q and ctx):
        return {"answer": "", "score": 0.0}
    prediction = qa(question=q, context=ctx)
    return {
        "answer": prediction.get("answer", "").strip(),
        "score": float(prediction.get("score", 0.0)),
    }
def no_context_flow(question: str, top_k: int = 3):
    """Answer ``question`` by retrieving passages and reading each of them.

    Retrieves up to ``top_k`` passages with ``retrieve_top_k``, runs
    ``answer_with_context`` on every one, and keeps the answer with the
    strictly highest reader score (the earliest passage wins ties).

    Returns:
        A dict with ``answer``, ``score``, ``used_context`` and
        ``retrieved`` (the raw retrieval results). When nothing is
        retrieved, the answer and context are empty and the score is ``0.0``.
    """
    retrieved = retrieve_top_k(question, k=top_k)
    if not retrieved:
        return {"answer": "", "score": 0.0, "used_context": "", "retrieved": []}

    best_answer, best_score, best_context = "", -1.0, ""
    for passage in (hit["context"] for hit in retrieved):
        reading = answer_with_context(question, passage)
        if reading["score"] > best_score:
            best_answer = reading["answer"]
            best_score = reading["score"]
            best_context = passage

    return {
        "answer": best_answer,
        "score": best_score,
        "used_context": best_context,
        "retrieved": retrieved,
    }
 INTRO_MD = """
+### ShabdaAI (Kannada ↔ English)
+- **ಮೋಡ್ 1:** ನಾನು ನೀಡುವ ಪ್ಯಾಸೇಜ್ (context) ಆಧರಿಸಿ ಉತ್ತರಿಸು
+- **ಮೋಡ್ 2:** ಪ್ಯಾಸೇಜ್ ಇಲ್ಲದಿದ್ದರೆ — ಸಣ್ಣ ಕನ್ನಡ ಕಾರ್ಪಸ್‌ನಿಂದ *ಹುಡುಕು → ಓದು* ಮಾಡಿ ಉತ್ತರಿಸು
+- **మోడ్ 1:** నేను ఇచ్చే ప్యాసేజ్ (context) పై సమాధానం ఇవ్వు
+- **మోడ్ 2:** ప్యాసేజ్ ఇవ్వకపోతే — చిన్న తెలుగు కార్పస్‌లో *సెర్చ్ → రీడ్* చేసి సమాధానం ఇవ్వు
+> Models: **intfloat/multilingual-e5-base** (retrieval) + **deepset/xlm-roberta-large-squad2** (extractive QA)
 """
def ui_answer(mode, translate_outputs_en, translate_inputs_en, question, user_context, top_k, lang_choice):
    """Gradio callback: answer a question and optionally translate to English.

    Args:
        mode: ``"With my context"`` reads ``user_context``; any other value
            retrieves passages from the sample corpus instead.
        translate_outputs_en: Translate the answer to English when truthy.
        translate_inputs_en: Translate the question and the used context to
            English when truthy.
        question: Question text (``None`` is treated as empty).
        user_context: User-supplied passage (``None`` is treated as empty).
        top_k: Number of passages to retrieve in no-context mode.
        lang_choice: ``"Telugu"`` selects the Telugu translator; anything
            else selects the Kannada one.

    Returns:
        A 7-tuple: answer, answer in English, score formatted to three
        decimals, used context, used context in English (or "—"),
        question in English (or "—"), and the retrieved-passage listing
        (or "—").
    """
    question = question or ""
    user_context = user_context or ""
    to_en = te_to_en if lang_choice == "Telugu" else kn_to_en

    if mode == "With my context":
        res = answer_with_context(question, user_context)
        used_context = user_context
        retrieved_tbl = "—"
    else:
        res = no_context_flow(question, top_k=int(top_k))
        used_context = res["used_context"]
        retrieved_tbl = "\n".join(
            f"{r['rank']}. (sim={r['similarity']:.3f}) {r['context']}"
            for r in res.get("retrieved", [])
        ) or "—"

    ans = res["answer"]
    ans_en = to_en(ans) if translate_outputs_en and ans else ""
    q_en = to_en(question) if translate_inputs_en and question else ""
    # Translate the context that was actually used. In no-context mode that is
    # the retrieved passage, not whatever happens to sit in the passage box.
    ctx_en = to_en(used_context) if translate_inputs_en and used_context else ""
    return ans, ans_en, f"{res['score']:.3f}", used_context, ctx_en or "—", q_en or "—", retrieved_tbl
 with gr.Blocks() as demo:
     gr.Markdown(INTRO_MD)
     with gr.Row():
+        mode = gr.Radio(
+            choices=["With my context", "No context (search sample data)"],
+            value="With my context",
+            label="Mode"
+        )
+        top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K passages (for No-context mode)")
     with gr.Row():
+        translate_outputs_en = gr.Checkbox(value=True, label="Translate ANSWER (Kannada → English)")
+        translate_inputs_en  = gr.Checkbox(value=True, label="Translate INPUTS (Question/Context → English)")
+    question = gr.Textbox(label="ಪ್ರಶ್ನೆ (Question)", placeholder="ಉದಾ: ಬೆಂಗಳೂರು ಯಾವ ರಾಜ್ಯದ ರಾಜಧಾನಿ?")
+    user_context = gr.Textbox(label="ಪ್ಯಾಸೇಜ್ / ಸಂದರ್ಭ (optional)", lines=4)
+    lang_choice = gr.Dropdown(
+    choices=["Telugu", "Kannada"],
+    value="Kannada",
+    label="Language"
+    )
     btn = gr.Button("Answer")
+        # Answers
+    answer_local = gr.Textbox(label="Answer (Telugu/Kannada)")
     answer_en = gr.Textbox(label="Answer (English)")
+    # Confidence + contexts
     score = gr.Textbox(label="Confidence score")
+    used_ctx = gr.Textbox(label="Used context (Telugu/Kannada)")
     ctx_en_box = gr.Textbox(label="Used context (English)")
     q_en_box = gr.Textbox(label="Question (English)")
+    retrieved = gr.Textbox(label="Top-K retrieved passages (Telugu/Kannada)", lines=4)
     btn.click(
         fn=ui_answer,
+        inputs=[mode, translate_outputs_en, translate_inputs_en, question, user_context, top_k, lang_choice],
+        outputs=[answer_local, answer_en, score, used_ctx, ctx_en_box, q_en_box, retrieved]
     )
 if __name__ == "__main__":