Spaces:

ravish5
/

ShabdaAI

Running

App Files Files Community

ravish5 committed on Mar 11

Commit

4f99eb7

verified ·

1 Parent(s): 484a475

Update app.py

Browse files

Files changed (1) hide show

app.py +220 -153

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
-import os, re, pathlib, json
 import numpy as np
 import pandas as pd
 import torch
-from transformers import pipeline, AutoTokenizer
 from sentence_transformers import SentenceTransformer
-from transformers import AutoModelForSeq2SeqLM
 import gradio as gr
 PROJECT_DIR = pathlib.Path(__file__).parent.resolve()
@@ -14,39 +14,44 @@ DATA_DIR = PROJECT_DIR / "data"
 DATA_DIR.mkdir(parents=True, exist_ok=True)
 CSV_PATH = DATA_DIR / "sample_indic.csv"
 SAMPLE_ROWS = [
     {"id":"kn1","language":"kn","context":"ಬೆಂಗಳೂರು ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ.","question":"ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ ಯಾವುದು?","answer_text":"ಬೆಂಗಳೂರು"},
     {"id":"kn2","language":"kn","context":"ಕನ್ನಡ ಒಂದು ದ್ರಾವಿಡ ಭಾಷೆ.","question":"ಕನ್ನಡ ಯಾವ ಭಾಷಾ ಕುಟುಂಬಕ್ಕೆ ಸೇರಿದೆ?","answer_text":"ದ್ರಾವಿಡ"},
     {"id":"kn3","language":"kn","context":"ಮೈಸೂರು ಅರಮನೆ ಕರ್ನಾಟಕದ ಪ್ರಸಿದ್ಧ ತಾಣ.","question":"ಮೈಸೂರು ಅರಮನೆ ಎಲ್ಲಿದೆ?","answer_text":"ಕರ್ನಾಟಕ"},
     {"id":"kn4","language":"kn","context":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್ ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜನಾಗಿದ್ದನು.","question":"ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜ ಯಾರು?","answer_text":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್"},
     {"id":"kn5","language":"kn","context":"ಹಂಪಿ ಯುನೆಸ್ಕೋ ವಿಶ್ವ ಪರಂಪರೆ ತಾಣವಾಗಿದೆ.","question":"ಹಂಪಿ ಯಾವ ರೀತಿಯ ತಾಣ?","answer_text":"ವಿಶ್ವ ಪರಂಪರೆ ತಾಣ"},
-    {"id":"te1","language":"te","context":"తెలంగాణ రాష్ట్ర రాజధాని హైదరాబాదు. ఈ నగరం ఐటి పరిశ్రమకు ప్రసిద్ధి.","question":"తెలంగాణ రాష్ట్ర రాజధాని ఏది?","answer_text":"హైదరాబాదు"},
-    {"id":"te2","language":"te","context":"తెలుగు భాష ద్రావిడ భాషా కుటుంబానికి చెందినది. దాని లిపి తెలుగు లిపి.","question":"తెలుగు భాష ఏ లిపిని ఉపయోగిస్తుంది?","answer_text":"తెలుగు లిపి"},
-    {"id":"te3","language":"te","context":"సీతాకోక చిలుకలకు రెండు రెక్కలు ఉంటాయి. ఇవి పూల మకరందం తాగుతాయి.","question":"సీతాకోక చిలుకకు ఎన్ని రెక్కలు ఉన్నాయి?","answer_text":"రెండు"},
-    {"id":"te4","language":"te","context":"విశాఖపట్నం ఒక తీర నగరం. ఇది ఆంధ్రప్రదేశ్‌లోని ప్రముఖ నౌకాశ్రయం.","question":"విశాఖపట్నం ఏ రకమైన నగరం?","answer_text":"తీర నగరం"},
-    {"id":"te5","language":"te","context":"చార్మినార్ హైదరాబాద్ లో ఉంది. ఇది చారిత్రక స్మారక చిహ్నం.","question":"చార్మినార్ ఎక్కడ ఉంది?","answer_text":"హైదరాబాద్"},
 ]
 def ensure_sample_csv(path: pathlib.Path):
     if not path.exists():
         df = pd.DataFrame(SAMPLE_ROWS)
         df.to_csv(path, index=False, encoding="utf-8")
-        print(f"[init] Wrote sample Kannada data to {path}")
 ensure_sample_csv(CSV_PATH)
 _ZW = r"\u200b\u200c\u200d\ufeff"
 ZW_RE = re.compile(f"[{_ZW}]")
-def normalize_text(s: str) -> str:
-    if not isinstance(s, str):
         return ""
-    s = s.replace("\u0964", "।")
-    s = ZW_RE.sub("", s)
-    s = re.sub(r"\s+", " ", s).strip()
     return s
-df = pd.read_csv(CSV_PATH, encoding="utf-8")
 df["context_norm"] = df["context"].apply(normalize_text)
 CORPUS = df["context_norm"].tolist()
@@ -54,205 +59,267 @@ EMB_MODEL_NAME = "intfloat/multilingual-e5-base"
 emb_model = SentenceTransformer(EMB_MODEL_NAME)
 emb_model.eval()
 def encode_queries(texts):
-    texts = [normalize_text(t) for t in texts]
-    prefixed = [f"query: {t}" for t in texts]
-    with torch.inference_mode():
-        vecs = emb_model.encode(prefixed, normalize_embeddings=True)
-    return vecs
 def encode_passages(texts):
-    texts = [normalize_text(t) for t in texts]
-    prefixed = [f"passage: {t}" for t in texts]
-    with torch.inference_mode():
-        vecs = emb_model.encode(prefixed, normalize_embeddings=True)
-    return vecs
-PASSAGE_EMBS = encode_passages(CORPUS)
-def retrieve_top_k(query: str, k: int = 3):
-    if not query or not query.strip():
-        return []
-    qv = encode_queries([query])[0]
-    sims = np.dot(PASSAGE_EMBS, qv)
-    idxs = np.argsort(-sims)[:k]
-    results = []
-    for rank, i in enumerate(idxs):
-        results.append({"rank": int(rank+1), "similarity": float(sims[i]), "context": CORPUS[i]})
     return results
-READER_MODEL = "deepset/xlm-roberta-large-squad2"
-device = 0 if torch.cuda.is_available() else -1
-tokenizer = AutoTokenizer.from_pretrained(READER_MODEL, use_fast=True)
-qa = pipeline("question-answering", model=READER_MODEL, tokenizer=tokenizer, device=device)
-# --- Kannada -> English translator (offline, NLLB-200) ---
-# Model: facebook/nllb-200-distilled-600M
-# Kannada = 'kan_Knda', English = 'eng_Latn'
-NLLB_ID = "facebook/nllb-200-distilled-600M"
-nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_ID)
-nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_ID)
-# Telugu -> English
-trans_te_en = pipeline(
-    "translation",
-    model=nllb_model,
-    tokenizer=nllb_tokenizer,
-    src_lang="tel_Telu",
-    tgt_lang="eng_Latn",
-    device=device
 )
-def te_to_en(text: str) -> str:
-    text = (text or "").strip()
-    if not text: return ""
-    return trans_te_en(text, max_length=256)[0]["translation_text"].strip()
-# Kannada -> English
-trans_kn_en = pipeline(
-    "translation",
-    model=nllb_model,
-    tokenizer=nllb_tokenizer,
-    src_lang="kan_Knda",
-    tgt_lang="eng_Latn",
-    device=device
 )
-def kn_to_en(text: str) -> str:
-    text = (text or "").strip()
-    if not text: return ""
-    return trans_kn_en(text, max_length=256)[0]["translation_text"].strip()
-def answer_with_context(question: str, context: str):
-    question = normalize_text(question)
-    context = normalize_text(context)
-    if not question or not context:
-        return {"answer": "", "score": 0.0}
-    out = qa(question=question, context=context)
-    ans = out.get("answer", "").strip()
-    score = float(out.get("score", 0.0))
-    return {"answer": ans, "score": score}
-def no_context_flow(question: str, top_k: int = 3):
-    cands = retrieve_top_k(question, k=top_k)
-    if not cands:
-        return {"answer": "", "score": 0.0, "used_context": "", "retrieved": []}
-    best = {"answer": "", "score": -1.0, "used_context": ""}
-    for c in cands:
-        out = answer_with_context(question, c["context"])
-        if out["score"] > best["score"]:
-            best = {"answer": out["answer"], "score": out["score"], "used_context": c["context"]}
-    return {"answer": best["answer"], "score": best["score"], "used_context": best["used_context"], "retrieved": cands}
-INTRO_MD = """
-### ShabdaAI (Kannada, Telugu ↔ English)
-- **ಮೋಡ್ 1:** ನಾನು ನೀಡುವ ಪ್ಯಾಸೇಜ್ (context) ಆಧರಿಸಿ ಉತ್ತರಿಸು
-- **ಮೋಡ್ 2:** ಪ್ಯಾಸೇಜ್ ಇಲ್ಲದಿದ್ದರೆ — ಸಣ್ಣ ಕನ್ನಡ ಕಾರ್ಪಸ್‌ನಿಂದ *ಹುಡುಕು → ಓದು* ಮಾಡಿ ಉತ್ತರಿಸು
-- **మోడ్ 1:** నేను ఇచ్చే ప్యాసేజ్ (context) పై సమాధానం ఇవ్వు
-- **మోడ్ 2:** ప్యాసేజ్ ఇవ్వకపోతే — చిన్న తెలుగు కార్పస్‌లో *సెర్చ్ → రీడ్* చేసి సమాధానం ఇవ్వు
-> Models: **intfloat/multilingual-e5-base** (retrieval) + **deepset/xlm-roberta-large-squad2** (extractive QA)
 """
-def ui_answer(mode, translate_outputs_en, translate_inputs_en, question, user_context, top_k, lang_choice):
-    question = question or ""
-    user_context = user_context or ""
-    # Choose translator
-    if lang_choice == "Telugu":
-        to_en = te_to_en
     else:
-        to_en = kn_to_en
-    # Optional translations
-    q_en = to_en(question) if translate_inputs_en and question else ""
-    ctx_en = to_en(user_context) if translate_inputs_en and user_context else ""
-    if mode == "With my context":
-        res = answer_with_context(question, user_context)
-        ans = res["answer"]
-        ans_en = to_en(ans) if translate_outputs_en and ans else ""
-        return ans, ans_en, f"{res['score']:.3f}", user_context, ctx_en or "—", q_en or "—", "—"
     else:
-        res = no_context_flow(question, top_k=int(top_k))
-        ans = res["answer"]
-        ans_en = to_en(ans) if translate_outputs_en and ans else ""
-        retrieved_tbl = "\n".join(
-            [f"{r['rank']}. (sim={r['similarity']:.3f}) {r['context']}" for r in res.get("retrieved", [])]
-        ) or "—"
-        return ans, ans_en, f"{res['score']:.3f}", res["used_context"], ctx_en or "—", q_en or "—", retrieved_tbl
 with gr.Blocks() as demo:
     gr.Markdown(INTRO_MD)
-    with gr.Row():
-        mode = gr.Radio(
-            choices=["With my context", "No context (search sample data)"],
-            value="With my context",
-            label="Mode"
-        )
-        top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K passages (for No-context mode)")
-    with gr.Row():
-        translate_outputs_en = gr.Checkbox(value=True, label="Translate ANSWER (Kannada, Telugu → English)")
-        translate_inputs_en  = gr.Checkbox(value=True, label="Translate INPUTS (Question/Context → English)")
-    question = gr.Textbox(label="ಪ್ರಶ್ನೆ/ప్రశ్న (Question)", placeholder="ಉದಾ: ಬೆಂಗಳೂರು ಯಾವ ರಾಜ್ಯದ ರಾಜಧಾನಿ?")
-    user_context = gr.Textbox(label="ಪ್ಯಾಸೇಜ್ / ಸಂದರ್ಭ/ప్యాసేజ్ / కాంటెక్స్ట్ (optional)", lines=4)
-    lang_choice = gr.Dropdown(
-    choices=["Telugu", "Kannada"],
-    value="Kannada",
-    label="Language"
-    )
-    btn = gr.Button("Answer")
-        # Answers
-    answer_local = gr.Textbox(label="Answer (Telugu/Kannada)")
-    answer_en = gr.Textbox(label="Answer (English)")
-    # Confidence + contexts
-    score = gr.Textbox(label="Confidence score")
-    used_ctx = gr.Textbox(label="Used context (Telugu/Kannada)")
-    ctx_en_box = gr.Textbox(label="Used context (English)")
-    q_en_box = gr.Textbox(label="Question (English)")
-    retrieved = gr.Textbox(label="Top-K retrieved passages (Telugu/Kannada)", lines=4)
     btn.click(
-        fn=ui_answer,
-        inputs=[mode, translate_outputs_en, translate_inputs_en, question, user_context, top_k, lang_choice],
-        outputs=[answer_local, answer_en, score, used_ctx, ctx_en_box, q_en_box, retrieved]
     )
-if __name__ == "__main__":
-    os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
-    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)

+import os, re, pathlib
 import numpy as np
 import pandas as pd
 import torch
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 from sentence_transformers import SentenceTransformer
 import gradio as gr
+from sklearn.metrics.pairwise import cosine_similarity
+# Paths are anchored at the directory that contains this script.
 PROJECT_DIR = pathlib.Path(__file__).parent.resolve()
+# NOTE(review): DATA_DIR is assigned on a line this diff view elides; the
+# earlier hunk header shows `DATA_DIR = PROJECT_DIR / "data"`.
 DATA_DIR.mkdir(parents=True, exist_ok=True)
 CSV_PATH = DATA_DIR / "sample_indic.csv"
+# Seed QA rows (five tagged "kn", five tagged "hi"). Each row carries an id,
+# a language code, a context passage, a question and the gold answer_text.
 SAMPLE_ROWS = [
     {"id":"kn1","language":"kn","context":"ಬೆಂಗಳೂರು ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ.","question":"ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ ಯಾವುದು?","answer_text":"ಬೆಂಗಳೂರು"},
     {"id":"kn2","language":"kn","context":"ಕನ್ನಡ ಒಂದು ದ್ರಾವಿಡ ಭಾಷೆ.","question":"ಕನ್ನಡ ಯಾವ ಭಾಷಾ ಕುಟುಂಬಕ್ಕೆ ಸೇರಿದೆ?","answer_text":"ದ್ರಾವಿಡ"},
     {"id":"kn3","language":"kn","context":"ಮೈಸೂರು ಅರಮನೆ ಕರ್ನಾಟಕದ ಪ್ರಸಿದ್ಧ ತಾಣ.","question":"ಮೈಸೂರು ಅರಮನೆ ಎಲ್ಲಿದೆ?","answer_text":"ಕರ್ನಾಟಕ"},
     {"id":"kn4","language":"kn","context":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್ ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜನಾಗಿದ್ದನು.","question":"ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜ ಯಾರು?","answer_text":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್"},
     {"id":"kn5","language":"kn","context":"ಹಂಪಿ ಯುನೆಸ್ಕೋ ವಿಶ್ವ ಪರಂಪರೆ ತಾಣವಾಗಿದೆ.","question":"ಹಂಪಿ ಯಾವ ರೀತಿಯ ತಾಣ?","answer_text":"ವಿಶ್ವ ಪರಂಪರೆ ತಾಣ"},
+    {"id":"hi1","language":"hi","context":"दिल्ली भारत की राजधानी है।","question":"भारत की राजधानी क्या है?","answer_text":"दिल्ली"},
+    {"id":"hi2","language":"hi","context":"हिंदी एक इंडो-आर्यन भाषा है।","question":"हिंदी किस भाषा परिवार से संबंधित है?","answer_text":"इंडो-आर्यन"},
+    {"id":"hi3","language":"hi","context":"ताजमहल आगरा में स्थित है।","question":"ताजमहल कहाँ स्थित है?","answer_text":"आगरा"},
+    {"id":"hi4","language":"hi","context":"गंगा भारत की एक प्रमुख नदी है।","question":"गंगा क्या है?","answer_text":"नदी"},
+    {"id":"hi5","language":"hi","context":"मुंबई भारत का एक प्रमुख शहर है।","question":"मुंबई किस देश में है?","answer_text":"भारत"},
 ]
 def ensure_sample_csv(path: pathlib.Path):
+    """Write SAMPLE_ROWS to *path* as UTF-8 CSV unless the file already exists."""
+    if path.exists():
+        # An existing CSV is never overwritten.
+        return
+    sample_frame = pd.DataFrame(SAMPLE_ROWS)
+    sample_frame.to_csv(path, index=False, encoding="utf-8")
 ensure_sample_csv(CSV_PATH)
 _ZW = r"\u200b\u200c\u200d\ufeff"
 ZW_RE = re.compile(f"[{_ZW}]")
+def normalize_text(s: str):
+    """Return *s* without zero-width characters and with whitespace collapsed.
+
+    Runs of whitespace become a single space and the ends are trimmed.
+    Non-string input yields an empty string.
+    """
+    if isinstance(s, str):
+        without_zw = ZW_RE.sub("", s)
+        collapsed = re.sub(r"\s+", " ", without_zw)
+        return collapsed.strip()
+    return ""
+# Load the sample table and build the retrieval corpus from the normalized
+# context column.
+df = pd.read_csv(CSV_PATH)
 df["context_norm"] = df["context"].apply(normalize_text)
 CORPUS = df["context_norm"].tolist()
+# NOTE(review): EMB_MODEL_NAME is assigned on a line this diff view elides;
+# the earlier hunk header shows "intfloat/multilingual-e5-base".
 emb_model = SentenceTransformer(EMB_MODEL_NAME)
 emb_model.eval()
 def encode_queries(texts):
+    """Embed *texts* as queries.
+
+    Each text is normalized and prefixed with "query: " before encoding;
+    the encoder is asked for normalized embeddings.
+    """
+    prefixed = []
+    for raw in texts:
+        prefixed.append(f"query: {normalize_text(raw)}")
+    return emb_model.encode(prefixed, normalize_embeddings=True)
 def encode_passages(texts):
+    """Embed *texts* as passages.
+
+    Each text is normalized and prefixed with "passage: " before encoding;
+    the encoder is asked for normalized embeddings.
+    """
+    prefixed = []
+    for raw in texts:
+        prefixed.append(f"passage: {normalize_text(raw)}")
+    return emb_model.encode(prefixed, normalize_embeddings=True)
+# Corpus embeddings are computed once at import time and reused per query.
+PASSAGE_EMBS = encode_passages(CORPUS)
+def retrieve_top_k(query,k=3):
+    """Return the *k* corpus passages most similar to *query*.
+
+    Args:
+        query: Question text. Blank or non-string input returns [] without
+            calling the encoder.
+        k: Number of passages to return. Coerced to int because UI sliders
+            may deliver floats, which cannot be used as slice bounds.
+
+    Returns:
+        A list of dicts with "rank" (1-based), "similarity" and "context",
+        ordered from most to least similar.
+    """
+    if not isinstance(query, str) or not query.strip():
+        return []
+    k = int(k)
+    qv = encode_queries([query])[0]
+    # Both encoders request normalized embeddings, so the dot product
+    # ranks passages by cosine similarity.
+    sims = np.dot(PASSAGE_EMBS, qv)
+    idxs = np.argsort(-sims)[:k]
+    return [
+        {"rank": rank, "similarity": float(sims[i]), "context": CORPUS[i]}
+        for rank, i in enumerate(idxs, start=1)
+    ]
+# Extractive question-answering reader. The device index is 0 when CUDA is
+# available and -1 otherwise (-1 selects CPU in transformers pipelines).
+READER_MODEL="deepset/xlm-roberta-large-squad2"
+device=0 if torch.cuda.is_available() else -1
+tokenizer=AutoTokenizer.from_pretrained(READER_MODEL)
+qa=pipeline("question-answering",model=READER_MODEL,tokenizer=tokenizer,device=device)
+def answer_with_context(question,context):
+    """Extract an answer to *question* from *context* with the QA reader.
+
+    Args:
+        question: Question text.
+        context: Passage to read the answer from.
+
+    Returns:
+        {"answer": str, "score": float}. A blank or missing question or
+        context yields an empty answer with score 0.0, and the reader is
+        not invoked.
+    """
+    question = (question or "").strip()
+    context = (context or "").strip()
+    if not question or not context:
+        return {"answer":"","score":0.0}
+    out = qa(question=question, context=context)
+    return {"answer": out["answer"].strip(), "score": float(out["score"])}
+def no_context_flow(question,top_k=3):
+    """Answer *question* by retrieving passages and reading each one.
+
+    Args:
+        question: Question text.
+        top_k: Number of passages to retrieve; coerced to int.
+
+    Returns:
+        A dict with "answer", "score", "used_context" (the passage that
+        produced the best-scoring answer) and "retrieved" (all candidates).
+        When nothing is retrieved the answer is empty and the score is 0.0.
+    """
+    cands = retrieve_top_k(question, k=int(top_k))
+    if not cands:
+        # Report zero confidence rather than leaking a -1 sentinel.
+        return {"answer":"","score":0.0,"used_context":"","retrieved":[]}
+    scored = [(answer_with_context(question, c["context"]), c["context"]) for c in cands]
+    # max() keeps the first of equal scores, i.e. the higher-ranked passage.
+    best_out, best_ctx = max(scored, key=lambda pair: pair[0]["score"])
+    return {"answer":best_out["answer"],"score":best_out["score"],"used_context":best_ctx,"retrieved":cands}
+# One NLLB checkpoint (tokenizer + model) is loaded once and shared by both
+# translation pipelines below.
+NLLB_ID="facebook/nllb-200-distilled-600M"
+nllb_tokenizer=AutoTokenizer.from_pretrained(NLLB_ID)
+nllb_model=AutoModelForSeq2SeqLM.from_pretrained(NLLB_ID)
+# Source "hin_Deva" -> target "eng_Latn"; used by hi_to_en.
+trans_hi_en=pipeline(
+"translation",
+model=nllb_model,
+tokenizer=nllb_tokenizer,
+src_lang="hin_Deva",
+tgt_lang="eng_Latn",
+device=device
 )
+# Source "kan_Knda" -> target "eng_Latn"; used by kn_to_en.
+trans_kn_en=pipeline(
+"translation",
+model=nllb_model,
+tokenizer=nllb_tokenizer,
+src_lang="kan_Knda",
+tgt_lang="eng_Latn",
+device=device
 )
+def hi_to_en(text):
+    """Translate *text* to English with the hin_Deva -> eng_Latn pipeline.
+
+    Blank or missing input returns "" without calling the translator.
+    """
+    text = (text or "").strip()
+    if not text:
+        return ""
+    return trans_hi_en(text)[0]["translation_text"].strip()
+def kn_to_en(text):
+    """Translate *text* to English with the kan_Knda -> eng_Latn pipeline.
+
+    Blank or missing input returns "" without calling the translator.
+    """
+    text = (text or "").strip()
+    if not text:
+        return ""
+    return trans_kn_en(text)[0]["translation_text"].strip()
+def exact_match(pred,gold):
+    """Return 1 if *pred* and *gold* are equal after normalize_text, else 0."""
+    same = normalize_text(pred) == normalize_text(gold)
+    return 1 if same else 0
+def token_f1(pred,gold):
+    """Return the token-level F1 between *pred* and *gold*.
+
+    Tokens are whitespace-separated. Overlap is counted as a multiset, so a
+    token repeated in the prediction is only credited as often as it occurs
+    in the gold answer.
+
+    Returns:
+        A float in [0.0, 1.0]; 0.0 when either side has no tokens or the
+        two share none.
+    """
+    # Imported locally so the file's import block stays untouched.
+    from collections import Counter
+    pred_tokens = (pred or "").split()
+    gold_tokens = (gold or "").split()
+    if not pred_tokens or not gold_tokens:
+        return 0.0
+    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
+    if overlap == 0:
+        return 0.0
+    precision = overlap/len(pred_tokens)
+    recall = overlap/len(gold_tokens)
+    return 2*precision*recall/(precision+recall)
+def semantic_similarity(pred,gold):
+    """Return the cosine similarity between the query embeddings of *pred* and *gold*."""
+    pred_vec, gold_vec = encode_queries([pred, gold])
+    sim_matrix = cosine_similarity([pred_vec], [gold_vec])
+    return float(sim_matrix[0][0])
+def evaluate_answer(question,prediction=None):
+    """Score an answer to *question* against the gold answer in the sample table.
+
+    Args:
+        question: Question text; it must equal a value in df["question"].
+        prediction: Answer to score. When None (the default, which keeps the
+            original one-argument call working) the answer is produced by
+            no_context_flow(question).
+
+    Returns:
+        {} when the question is not in the table; otherwise a dict with
+        "prediction", "gold", "em", "f1" and "sim".
+    """
+    row=df[df["question"]==question]
+    if row.empty:
+        return {}
+    gold=row.iloc[0]["answer_text"]
+    if prediction is None:
+        prediction=no_context_flow(question)["answer"]
+    return {
+        "prediction":prediction,
+        "gold":gold,
+        "em":exact_match(prediction,gold),
+        "f1":token_f1(prediction,gold),
+        "sim":semantic_similarity(prediction,gold)
+    }
+INTRO_MD="""
+### ShabdaAI Multilingual QA
+Supports
+Kannada
+Hindi
+Models
+multilingual-e5-base (retrieval)
+xlm-roberta-large-squad2 (QA)
+nllb-200 (translation)
 """
+def ui_answer(mode,question,user_context,top_k,lang_choice):
+    """Gradio callback: answer the question, translate the answer, score it.
+
+    Args:
+        mode: "With context" reads the answer from *user_context*; any other
+            value retrieves passages from the sample corpus.
+        question: Question text.
+        user_context: Passage supplied by the user (used in "With context").
+        top_k: Number of passages to retrieve; coerced to int.
+        lang_choice: "Hindi" selects hi_to_en; anything else selects kn_to_en.
+
+    Returns:
+        An 8-tuple: answer, English answer, score, used context, retrieved
+        passages (one per line), exact match, F1, semantic similarity. The
+        three metrics are None when the question is not in the sample table.
+    """
+    question=(question or "").strip()
+    user_context=user_context or ""
+    with_context=mode=="With context"
+    # Nothing to answer: return blanks instead of calling the models.
+    if not question or (with_context and not user_context.strip()):
+        return "","",0.0,user_context,"",None,None,None
+    if with_context:
+        res=answer_with_context(question,user_context)
+        used=user_context
     else:
+        res=no_context_flow(question,int(top_k))
+        used=res["used_context"]
+    ans=res["answer"]
+    to_en=hi_to_en if lang_choice=="Hindi" else kn_to_en
+    # An empty answer has nothing to translate.
+    ans_en=to_en(ans) if ans else ""
+    # Score the answer actually shown, not a second default-top_k retrieval run.
+    ev=evaluate_answer(question,prediction=ans)
+    retrieved="\n".join(
+        [f"{r['rank']}. {r['context']} ({r['similarity']:.3f})" for r in res.get("retrieved",[])]
+    )
+    return ans,ans_en,res["score"],used,retrieved,ev.get("em"),ev.get("f1"),ev.get("sim")
 with gr.Blocks() as demo:
+    # Layout: inputs, the Answer button, then eight output boxes in the
+    # order ui_answer returns its values.
     gr.Markdown(INTRO_MD)
+    mode=gr.Radio(["With context","No context"],value="With context",label="Mode")
+    question=gr.Textbox(label="Question")
+    user_context=gr.Textbox(label="Context")
+    # step=1 keeps top_k integral; it is used as a slice bound downstream.
+    top_k=gr.Slider(1,5,value=3,step=1,label="Top-K passages (No context mode)")
+    lang_choice=gr.Dropdown(["Hindi","Kannada"],value="Kannada",label="Language")
+    btn=gr.Button("Answer")
+    ans_local=gr.Textbox(label="Answer")
+    ans_en=gr.Textbox(label="Answer English")
+    score=gr.Textbox(label="Confidence")
+    used=gr.Textbox(label="Used Context")
+    retrieved=gr.Textbox(label="Retrieved Contexts")
+    em=gr.Textbox(label="Exact Match")
+    f1=gr.Textbox(label="F1 Score")
+    sim=gr.Textbox(label="Semantic Similarity")
     btn.click(
+        ui_answer,
+        inputs=[mode,question,user_context,top_k,lang_choice],
+        outputs=[ans_local,ans_en,score,used,retrieved,em,f1,sim]
     )
+if __name__=="__main__":
+    # launch() takes the port as server_port (as the pre-commit version of
+    # this file did); "port" is not an accepted keyword.
+    demo.launch(server_name="0.0.0.0",server_port=7860)