ravish5 committed on
Commit
4f99eb7
·
verified ·
1 Parent(s): 484a475

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -153
app.py CHANGED
@@ -1,12 +1,12 @@
1
- import os, re, pathlib, json
2
  import numpy as np
3
  import pandas as pd
4
 
5
  import torch
6
- from transformers import pipeline, AutoTokenizer
7
  from sentence_transformers import SentenceTransformer
8
- from transformers import AutoModelForSeq2SeqLM
9
  import gradio as gr
 
10
 
11
 
12
  PROJECT_DIR = pathlib.Path(__file__).parent.resolve()
@@ -14,39 +14,44 @@ DATA_DIR = PROJECT_DIR / "data"
14
  DATA_DIR.mkdir(parents=True, exist_ok=True)
15
  CSV_PATH = DATA_DIR / "sample_indic.csv"
16
 
 
17
  SAMPLE_ROWS = [
18
  {"id":"kn1","language":"kn","context":"ಬೆಂಗಳೂರು ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ.","question":"ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ ಯಾವುದು?","answer_text":"ಬೆಂಗಳೂರು"},
19
  {"id":"kn2","language":"kn","context":"ಕನ್ನಡ ಒಂದು ದ್ರಾವಿಡ ಭಾಷೆ.","question":"ಕನ್ನಡ ಯಾವ ಭಾಷಾ ಕುಟುಂಬಕ್ಕೆ ಸೇರಿದೆ?","answer_text":"ದ್ರಾವಿಡ"},
20
  {"id":"kn3","language":"kn","context":"ಮೈಸೂರು ಅರಮನೆ ಕರ್ನಾಟಕದ ಪ್ರಸಿದ್ಧ ತಾಣ.","question":"ಮೈಸೂರು ಅರಮನೆ ಎಲ್ಲಿದೆ?","answer_text":"ಕರ್ನಾಟಕ"},
21
  {"id":"kn4","language":"kn","context":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್ ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜನಾಗಿದ್ದನು.","question":"ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜ ಯಾರು?","answer_text":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್"},
22
  {"id":"kn5","language":"kn","context":"ಹಂಪಿ ಯುನೆಸ್ಕೋ ವಿಶ್ವ ಪರಂಪರೆ ತಾಣವಾಗಿದೆ.","question":"ಹಂಪಿ ಯಾವ ರೀತಿಯ ತಾಣ?","answer_text":"ವಿಶ್ವ ಪರಂಪರೆ ತಾಣ"},
23
- {"id":"te1","language":"te","context":"తెలంగాణ రాష్ట్ర రాజధాని హైదరాబాదు. ఈ నగరం ఐటి పరిశ్రమకు ప్రసిద్ధి.","question":"తెలంగాణ రాష్ట్ర రాజధాని ఏది?","answer_text":"హైదరాబాదు"},
24
- {"id":"te2","language":"te","context":"తెలుగు భాష ద్రావిడ భాషా కుటుంబానికి చెందినది. దాని లిపి తెలుగు లిపి.","question":"తెలుగు భాష లిపిని ఉపయోగిస్తుంది?","answer_text":"తెలుగు లిపి"},
25
- {"id":"te3","language":"te","context":"సీతాకోక చిలుకలకు రెండు రెక్కలు ఉంటాయి. ఇవి పూల మకరందం తాగుతాయి.","question":"సీతాకోక చిలుకకు ఎన్ని రెక్కలు ఉన్నాయి?","answer_text":"రెండు"},
26
- {"id":"te4","language":"te","context":"విశాఖపట్నం ఒక తీర నగరం. ఇది ఆంధ్రప్రదేశ్‌లోని ప్రముఖ నౌకాశ్రయం.","question":"విశాఖపట్నం రకమైన నగరం?","answer_text":"తీర నగరం"},
27
- {"id":"te5","language":"te","context":"చార్మినార్ హైదరాబాద్ లో ఉంది. ఇది చారిత్రక స్మారక చిహ్నం.","question":"చార్మినార్ ఎక్కడ ఉంది?","answer_text":"హైదరాబాద్"},
 
28
  ]
29
 
 
30
  def ensure_sample_csv(path: pathlib.Path):
31
  if not path.exists():
32
  df = pd.DataFrame(SAMPLE_ROWS)
33
  df.to_csv(path, index=False, encoding="utf-8")
34
- print(f"[init] Wrote sample Kannada data to {path}")
35
 
36
  ensure_sample_csv(CSV_PATH)
37
 
 
38
  _ZW = r"\u200b\u200c\u200d\ufeff"
39
  ZW_RE = re.compile(f"[{_ZW}]")
40
- def normalize_text(s: str) -> str:
41
- if not isinstance(s, str):
 
42
  return ""
43
- s = s.replace("\u0964", "।")
44
- s = ZW_RE.sub("", s)
45
- s = re.sub(r"\s+", " ", s).strip()
46
  return s
47
 
48
- df = pd.read_csv(CSV_PATH, encoding="utf-8")
 
49
  df["context_norm"] = df["context"].apply(normalize_text)
 
50
  CORPUS = df["context_norm"].tolist()
51
 
52
 
@@ -54,205 +59,267 @@ EMB_MODEL_NAME = "intfloat/multilingual-e5-base"
54
  emb_model = SentenceTransformer(EMB_MODEL_NAME)
55
  emb_model.eval()
56
 
 
57
  def encode_queries(texts):
58
- texts = [normalize_text(t) for t in texts]
59
- prefixed = [f"query: {t}" for t in texts]
60
- with torch.inference_mode():
61
- vecs = emb_model.encode(prefixed, normalize_embeddings=True)
62
- return vecs
63
 
64
  def encode_passages(texts):
65
- texts = [normalize_text(t) for t in texts]
66
- prefixed = [f"passage: {t}" for t in texts]
67
- with torch.inference_mode():
68
- vecs = emb_model.encode(prefixed, normalize_embeddings=True)
69
- return vecs
70
 
71
- PASSAGE_EMBS = encode_passages(CORPUS)
72
 
 
73
 
74
 
75
- def retrieve_top_k(query: str, k: int = 3):
76
- if not query or not query.strip():
77
 
78
- return []
79
- qv = encode_queries([query])[0]
80
- sims = np.dot(PASSAGE_EMBS, qv)
81
 
 
82
 
83
- idxs = np.argsort(-sims)[:k]
84
- results = []
85
- for rank, i in enumerate(idxs):
86
- results.append({"rank": int(rank+1), "similarity": float(sims[i]), "context": CORPUS[i]})
87
 
 
 
 
88
 
89
  return results
90
 
91
 
92
- READER_MODEL = "deepset/xlm-roberta-large-squad2"
93
- device = 0 if torch.cuda.is_available() else -1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
 
95
 
96
- tokenizer = AutoTokenizer.from_pretrained(READER_MODEL, use_fast=True)
97
- qa = pipeline("question-answering", model=READER_MODEL, tokenizer=tokenizer, device=device)
98
 
99
 
100
- # --- Kannada -> English translator (offline, NLLB-200) ---
101
- # Model: facebook/nllb-200-distilled-600M
102
- # Kannada = 'kan_Knda', English = 'eng_Latn'
103
- NLLB_ID = "facebook/nllb-200-distilled-600M"
104
- nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_ID)
105
- nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_ID)
106
- # Telugu -> English
107
- trans_te_en = pipeline(
108
- "translation",
109
- model=nllb_model,
110
- tokenizer=nllb_tokenizer,
111
- src_lang="tel_Telu",
112
- tgt_lang="eng_Latn",
113
- device=device
114
  )
115
 
116
- def te_to_en(text: str) -> str:
117
- text = (text or "").strip()
118
- if not text: return ""
119
- return trans_te_en(text, max_length=256)[0]["translation_text"].strip()
120
-
121
- # Kannada -> English
122
- trans_kn_en = pipeline(
123
- "translation",
124
- model=nllb_model,
125
- tokenizer=nllb_tokenizer,
126
- src_lang="kan_Knda",
127
- tgt_lang="eng_Latn",
128
- device=device
129
  )
130
 
131
- def kn_to_en(text: str) -> str:
132
- text = (text or "").strip()
133
- if not text: return ""
134
- return trans_kn_en(text, max_length=256)[0]["translation_text"].strip()
135
 
 
136
 
 
137
 
138
- def answer_with_context(question: str, context: str):
139
- question = normalize_text(question)
140
- context = normalize_text(context)
141
- if not question or not context:
142
- return {"answer": "", "score": 0.0}
143
- out = qa(question=question, context=context)
144
- ans = out.get("answer", "").strip()
145
- score = float(out.get("score", 0.0))
146
- return {"answer": ans, "score": score}
147
 
148
- def no_context_flow(question: str, top_k: int = 3):
149
- cands = retrieve_top_k(question, k=top_k)
150
- if not cands:
151
- return {"answer": "", "score": 0.0, "used_context": "", "retrieved": []}
152
- best = {"answer": "", "score": -1.0, "used_context": ""}
153
- for c in cands:
154
- out = answer_with_context(question, c["context"])
155
- if out["score"] > best["score"]:
156
- best = {"answer": out["answer"], "score": out["score"], "used_context": c["context"]}
157
- return {"answer": best["answer"], "score": best["score"], "used_context": best["used_context"], "retrieved": cands}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
 
 
160
 
 
161
 
 
162
 
 
163
 
 
164
 
 
165
 
 
166
 
 
167
 
 
 
 
 
 
168
 
169
- INTRO_MD = """
170
- ### ShabdaAI (Kannada, Telugu ↔ English)
171
- - **ಮೋಡ್ 1:** ನಾನು ನೀಡುವ ಪ್ಯಾಸೇಜ್ (context) ಆಧರಿಸಿ ಉತ್ತರಿಸು
172
- - **ಮೋಡ್ 2:** ಪ್ಯಾಸೇಜ್ ಇಲ್ಲದಿದ್ದರೆ — ಸಣ್ಣ ಕನ್ನಡ ಕಾರ್ಪಸ್‌ನಿಂದ *ಹುಡುಕು → ಓದು* ಮಾಡಿ ಉತ್ತರಿಸು
173
- - **మోడ్ 1:** నేను ఇచ్చే ప్యాసేజ్ (context) పై సమాధానం ఇవ్వు
174
- - **మోడ్ 2:** ప్యాసేజ్ ఇవ్వకపోతే — చిన్న తెలుగు కార్పస్‌లో *సెర్చ్ → రీడ్* చేసి సమాధానం ఇవ్వు
175
 
176
- > Models: **intfloat/multilingual-e5-base** (retrieval) + **deepset/xlm-roberta-large-squad2** (extractive QA)
177
 
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  """
180
 
181
- def ui_answer(mode, translate_outputs_en, translate_inputs_en, question, user_context, top_k, lang_choice):
182
- question = question or ""
183
- user_context = user_context or ""
184
 
185
- # Choose translator
186
- if lang_choice == "Telugu":
187
- to_en = te_to_en
 
 
 
 
 
 
 
 
188
  else:
189
- to_en = kn_to_en
190
 
191
- # Optional translations
192
- q_en = to_en(question) if translate_inputs_en and question else ""
193
- ctx_en = to_en(user_context) if translate_inputs_en and user_context else ""
194
 
195
- if mode == "With my context":
196
- res = answer_with_context(question, user_context)
197
- ans = res["answer"]
198
- ans_en = to_en(ans) if translate_outputs_en and ans else ""
199
- return ans, ans_en, f"{res['score']:.3f}", user_context, ctx_en or "—", q_en or "—", "—"
 
 
 
200
 
201
  else:
202
- res = no_context_flow(question, top_k=int(top_k))
203
- ans = res["answer"]
204
- ans_en = to_en(ans) if translate_outputs_en and ans else ""
205
- retrieved_tbl = "\n".join(
206
- [f"{r['rank']}. (sim={r['similarity']:.3f}) {r['context']}" for r in res.get("retrieved", [])]
207
- ) or "—"
208
- return ans, ans_en, f"{res['score']:.3f}", res["used_context"], ctx_en or "—", q_en or "—", retrieved_tbl
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
 
212
 
213
  with gr.Blocks() as demo:
 
214
  gr.Markdown(INTRO_MD)
215
 
216
- with gr.Row():
217
- mode = gr.Radio(
218
- choices=["With my context", "No context (search sample data)"],
219
- value="With my context",
220
- label="Mode"
221
- )
222
- top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K passages (for No-context mode)")
223
- with gr.Row():
224
- translate_outputs_en = gr.Checkbox(value=True, label="Translate ANSWER (Kannada, Telugu → English)")
225
- translate_inputs_en = gr.Checkbox(value=True, label="Translate INPUTS (Question/Context → English)")
226
-
227
- question = gr.Textbox(label="ಪ್ರಶ್ನೆ/ప్రశ్న (Question)", placeholder="ಉದಾ: ಬೆಂಗಳೂರು ಯಾವ ರಾಜ್ಯದ ರಾಜಧಾನಿ?")
228
- user_context = gr.Textbox(label="ಪ್ಯಾಸೇಜ್ / ಸಂದರ್ಭ/ప్యాసేజ్ / కాంటెక్స్ట్ (optional)", lines=4)
229
-
230
- lang_choice = gr.Dropdown(
231
- choices=["Telugu", "Kannada"],
232
- value="Kannada",
233
- label="Language"
234
- )
235
 
236
- btn = gr.Button("Answer")
237
 
238
- # Answers
239
- answer_local = gr.Textbox(label="Answer (Telugu/Kannada)")
240
- answer_en = gr.Textbox(label="Answer (English)")
241
 
242
- # Confidence + contexts
243
- score = gr.Textbox(label="Confidence score")
244
- used_ctx = gr.Textbox(label="Used context (Telugu/Kannada)")
245
- ctx_en_box = gr.Textbox(label="Used context (English)")
246
- q_en_box = gr.Textbox(label="Question (English)")
 
 
 
 
 
 
247
 
248
- retrieved = gr.Textbox(label="Top-K retrieved passages (Telugu/Kannada)", lines=4)
249
 
250
  btn.click(
251
- fn=ui_answer,
252
- inputs=[mode, translate_outputs_en, translate_inputs_en, question, user_context, top_k, lang_choice],
253
- outputs=[answer_local, answer_en, score, used_ctx, ctx_en_box, q_en_box, retrieved]
 
 
 
 
254
  )
255
 
256
- if __name__ == "__main__":
257
- os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
258
- demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
 
1
+ import os, re, pathlib
2
  import numpy as np
3
  import pandas as pd
4
 
5
  import torch
6
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
7
  from sentence_transformers import SentenceTransformer
 
8
  import gradio as gr
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
 
11
 
12
  PROJECT_DIR = pathlib.Path(__file__).parent.resolve()
 
14
  DATA_DIR.mkdir(parents=True, exist_ok=True)
15
  CSV_PATH = DATA_DIR / "sample_indic.csv"
16
 
17
+
18
  SAMPLE_ROWS = [
19
  {"id":"kn1","language":"kn","context":"ಬೆಂಗಳೂರು ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ.","question":"ಕರ್ನಾಟಕದ ರಾಜಧಾನಿ ಯಾವುದು?","answer_text":"ಬೆಂಗಳೂರು"},
20
  {"id":"kn2","language":"kn","context":"ಕನ್ನಡ ಒಂದು ದ್ರಾವಿಡ ಭಾಷೆ.","question":"ಕನ್ನಡ ಯಾವ ಭಾಷಾ ಕುಟುಂಬಕ್ಕೆ ಸೇರಿದೆ?","answer_text":"ದ್ರಾವಿಡ"},
21
  {"id":"kn3","language":"kn","context":"ಮೈಸೂರು ಅರಮನೆ ಕರ್ನಾಟಕದ ಪ್ರಸಿದ್ಧ ತಾಣ.","question":"ಮೈಸೂರು ಅರಮನೆ ಎಲ್ಲಿದೆ?","answer_text":"ಕರ್ನಾಟಕ"},
22
  {"id":"kn4","language":"kn","context":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್ ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜನಾಗಿದ್ದನು.","question":"ಮೈಸೂರು ಸಾಮ್ರಾಜ್ಯದ ರಾಜ ಯಾರು?","answer_text":"ಟಿಪ್ಪು ಸುಲ್ತಾನ್"},
23
  {"id":"kn5","language":"kn","context":"ಹಂಪಿ ಯುನೆಸ್ಕೋ ವಿಶ್ವ ಪರಂಪರೆ ತಾಣವಾಗಿದೆ.","question":"ಹಂಪಿ ಯಾವ ರೀತಿಯ ತಾಣ?","answer_text":"ವಿಶ್ವ ಪರಂಪರೆ ತಾಣ"},
24
+
25
+ {"id":"hi1","language":"hi","context":"दिल्ली भारत की राजधानी है।","question":"भारत की राजधानी क्या है?","answer_text":"दिल्ली"},
26
+ {"id":"hi2","language":"hi","context":"हिंदी एक इंडो-आर्यन भाषा है।","question":"हिंदी किस भाषा परिवार से संबंधित है?","answer_text":"इंडो-आर्यन"},
27
+ {"id":"hi3","language":"hi","context":"ताजमहल आगरा में स्थित है।","question":"ताजमहल कहाँ स्थित है?","answer_text":"आगरा"},
28
+ {"id":"hi4","language":"hi","context":"गंगा भारत की एक प्रमुख नदी है।","question":"गंगा क्या है?","answer_text":"नदी"},
29
+ {"id":"hi5","language":"hi","context":"मुंबई भारत का एक प्रमुख शहर है।","question":"मुंबई किस देश में है?","answer_text":"भारत"},
30
  ]
31
 
32
+
def ensure_sample_csv(path: pathlib.Path):
    """Create the sample dataset CSV at *path* if no file exists there yet.

    The rows come from the module-level ``SAMPLE_ROWS`` list and are written
    as UTF-8 without the DataFrame index. An existing file is left untouched.
    """
    if path.exists():
        return
    pd.DataFrame(SAMPLE_ROWS).to_csv(path, index=False, encoding="utf-8")


ensure_sample_csv(CSV_PATH)
39
 
40
+
_ZW = r"\u200b\u200c\u200d\ufeff"
ZW_RE = re.compile(f"[{_ZW}]")


def normalize_text(s: str):
    """Return *s* with zero-width characters removed and whitespace collapsed.

    Characters matched by ``ZW_RE`` are deleted, every run of whitespace
    becomes a single space, and the result is stripped at both ends.
    Anything that is not a ``str`` yields the empty string.
    """
    if isinstance(s, str):
        without_zero_width = ZW_RE.sub("", s)
        collapsed = re.sub(r"\s+", " ", without_zero_width)
        return collapsed.strip()
    return ""
50
 
51
+
# Load the sample dataset and attach a normalized copy of each context passage.
df = pd.read_csv(CSV_PATH)
df["context_norm"] = df["context"].map(normalize_text)

# Retrieval corpus: the normalized passages, in the same order as the CSV rows.
CORPUS = list(df["context_norm"])
56
 
57
 
 
59
  emb_model = SentenceTransformer(EMB_MODEL_NAME)
60
  emb_model.eval()
61
 
62
+
def encode_queries(texts):
    """Embed *texts* as search queries with the shared embedding model.

    Each text is passed through ``normalize_text`` and given the ``"query: "``
    prefix before encoding. Embeddings are requested with
    ``normalize_embeddings=True``.
    """
    prefixed = []
    for raw in texts:
        prefixed.append("query: " + normalize_text(raw))
    return emb_model.encode(prefixed, normalize_embeddings=True)
 
 
 
66
 
def encode_passages(texts):
    """Embed *texts* as retrievable passages with the shared embedding model.

    Each text is passed through ``normalize_text`` and given the
    ``"passage: "`` prefix before encoding. Embeddings are requested with
    ``normalize_embeddings=True``.
    """
    prefixed = []
    for raw in texts:
        prefixed.append("passage: " + normalize_text(raw))
    return emb_model.encode(prefixed, normalize_embeddings=True)


# Passage embeddings are computed once at import time and reused for every query.
PASSAGE_EMBS = encode_passages(CORPUS)
73
 
74
 
def retrieve_top_k(query, k=3):
    """Return the *k* corpus passages most similar to *query*.

    Args:
        query: Question text. A blank or non-string query yields ``[]``.
        k: Number of passages to return. It is coerced with ``int()`` because
            UI sliders may deliver floats, and a float cannot be used as a
            slice bound. Values below 1 yield ``[]``.

    Returns:
        A list of dicts with keys ``rank`` (1-based), ``similarity`` (float)
        and ``context`` (the normalized passage), best match first.
    """
    if not isinstance(query, str) or not query.strip():
        return []
    k = int(k)
    if k <= 0:
        return []

    qv = encode_queries([query])[0]
    # Both sides are encoded with normalize_embeddings=True, so the dot
    # product is the cosine similarity.
    sims = np.dot(PASSAGE_EMBS, qv)
    idxs = np.argsort(-sims)[:k]

    return [
        {"rank": rank, "similarity": float(sims[i]), "context": CORPUS[i]}
        for rank, i in enumerate(idxs, start=1)
    ]
90
 
91
 
# Extractive question-answering reader.
READER_MODEL = "deepset/xlm-roberta-large-squad2"

# Device index handed to the transformers pipelines: first GPU if CUDA is
# available, otherwise -1.
device = -1
if torch.cuda.is_available():
    device = 0

tokenizer = AutoTokenizer.from_pretrained(READER_MODEL)
qa = pipeline(
    "question-answering",
    model=READER_MODEL,
    tokenizer=tokenizer,
    device=device,
)
98
+
99
+
def answer_with_context(question, context):
    """Extract an answer to *question* from *context* with the QA reader.

    Args:
        question: Question text.
        context: Passage to search for the answer.

    Returns:
        ``{"answer": str, "score": float}``. When either input is missing or
        blank the reader is not called and ``{"answer": "", "score": 0.0}``
        is returned, because the question-answering pipeline rejects empty
        inputs with an exception.
    """
    question = question.strip() if isinstance(question, str) else ""
    context = context.strip() if isinstance(context, str) else ""
    if not question or not context:
        return {"answer": "", "score": 0.0}

    out = qa(question=question, context=context)
    return {"answer": out["answer"], "score": float(out["score"])}
105
+
106
+
def no_context_flow(question, top_k=3):
    """Answer *question* by retrieving passages and reading each of them.

    The top *top_k* passages are fetched with ``retrieve_top_k``; the reader
    is run on every one and the highest-scoring answer wins (the first
    candidate wins ties).

    Returns:
        A dict with ``answer``, ``score``, ``used_context`` (the passage the
        winning answer came from) and ``retrieved`` (all candidates). When
        nothing is retrieved the answer is empty and the score is ``0.0``,
        so the internal ``-1`` sentinel is never reported as a confidence.
    """
    cands = retrieve_top_k(question, k=top_k)
    if not cands:
        return {"answer": "", "score": 0.0, "used_context": "", "retrieved": []}

    # -1 is below any score compared here, so the first candidate always
    # replaces it.
    best = {"answer": "", "score": -1, "used_context": ""}
    for c in cands:
        out = answer_with_context(question, c["context"])
        if out["score"] > best["score"]:
            best = {
                "answer": out["answer"],
                "score": out["score"],
                "used_context": c["context"],
            }

    return {
        "answer": best["answer"],
        "score": best["score"],
        "used_context": best["used_context"],
        "retrieved": cands,
    }
 
122
 
123
 
124
+
# Translation model shared by both source languages.
NLLB_ID = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_ID)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_ID)

# Settings common to both pipelines; only the source language differs.
_TO_ENGLISH_KWARGS = {
    "model": nllb_model,
    "tokenizer": nllb_tokenizer,
    "tgt_lang": "eng_Latn",
    "device": device,
}

# Hindi -> English
trans_hi_en = pipeline("translation", src_lang="hin_Deva", **_TO_ENGLISH_KWARGS)

# Kannada -> English
trans_kn_en = pipeline("translation", src_lang="kan_Knda", **_TO_ENGLISH_KWARGS)
148
 
 
 
 
 
149
 
def hi_to_en(text):
    """Translate Hindi *text* to English with the ``trans_hi_en`` pipeline.

    Blank or non-string input returns ``""`` without calling the model, so
    an empty answer is never sent to the translator.
    """
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return ""
    return trans_hi_en(text)[0]["translation_text"]
153
 
 
 
 
 
 
 
 
 
 
154
 
def kn_to_en(text):
    """Translate Kannada *text* to English with the ``trans_kn_en`` pipeline.

    Blank or non-string input returns ``""`` without calling the model, so
    an empty answer is never sent to the translator.
    """
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return ""
    return trans_kn_en(text)[0]["translation_text"]
158
+
159
+
160
+
def exact_match(pred, gold):
    """Return 1 if *pred* equals *gold* after ``normalize_text``, otherwise 0."""
    normalized_pred = normalize_text(pred)
    normalized_gold = normalize_text(gold)
    return 1 if normalized_pred == normalized_gold else 0
164
+
165
+
def token_f1(pred, gold):
    """Return the token-level F1 between *pred* and *gold*.

    Tokens are whitespace-separated. Overlap is counted on multisets, so a
    token repeated in the prediction only matches as many times as it occurs
    in the gold answer (the SQuAD-style definition). Non-string inputs are
    treated as having no tokens.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when there is no overlap.
    """
    from collections import Counter

    pred_tokens = pred.split() if isinstance(pred, str) else []
    gold_tokens = gold.split() if isinstance(gold, str) else []

    # Counter intersection keeps the minimum count of each shared token.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
182
+
183
+
def semantic_similarity(pred, gold):
    """Return the cosine similarity between the embeddings of *pred* and *gold*.

    Both strings are embedded together through ``encode_queries``.
    """
    pred_vec, gold_vec = encode_queries([pred, gold])
    similarity = cosine_similarity([pred_vec], [gold_vec])
    return float(similarity[0][0])
189
 
190
 
def evaluate_answer(question):
    """Score the no-context pipeline's answer against the dataset's gold answer.

    *question* is looked up by exact equality in ``df["question"]``. If it is
    not present an empty dict is returned. Otherwise the answer produced by
    ``no_context_flow(question)`` is compared with the first matching row's
    ``answer_text``.

    Returns:
        ``{}`` or a dict with ``prediction``, ``gold``, ``em``, ``f1``, ``sim``.
    """
    matches = df[df["question"] == question]
    if matches.empty:
        return {}

    gold = matches.iloc[0]["answer_text"]
    pred = no_context_flow(question)["answer"]

    metrics = {"prediction": pred, "gold": gold}
    metrics["em"] = exact_match(pred, gold)
    metrics["f1"] = token_f1(pred, gold)
    metrics["sim"] = semantic_similarity(pred, gold)
    return metrics
 
 
 
 
 
214
 
 
215
 
216
 
# Markdown shown at the top of the UI. List items carry "- " markers so each
# language/model renders on its own line instead of collapsing into one paragraph.
INTRO_MD = """
### ShabdaAI Multilingual QA

Supports

- Kannada
- Hindi

Models

- multilingual-e5-base (retrieval)
- xlm-roberta-large-squad2 (QA)
- nllb-200 (translation)
"""
233
 
 
 
 
234
 
def _metrics_for(question, pred):
    """Score *pred* against the dataset's gold answer for *question*.

    Returns an empty dict when *question* is not one of the dataset questions.
    """
    matches = df[df["question"] == question]
    if matches.empty:
        return {}
    gold = matches.iloc[0]["answer_text"]
    return {
        "em": exact_match(pred, gold),
        "f1": token_f1(pred, gold),
        "sim": semantic_similarity(pred, gold),
    }


def ui_answer(mode, question, user_context, top_k, lang_choice):
    """Gradio callback: answer *question* and assemble all output fields.

    Args:
        mode: ``"With context"`` reads *user_context*; any other value
            retrieves passages from the sample corpus.
        question: Question text from the UI (may be empty or ``None``).
        user_context: Passage supplied by the user (may be empty or ``None``).
        top_k: Number of passages to retrieve; coerced to ``int`` because
            the slider may deliver a float.
        lang_choice: ``"Hindi"`` selects the Hindi translator; anything else
            uses the Kannada translator.

    Returns:
        An 8-tuple: answer, English answer, confidence formatted to three
        decimals, context used, retrieved-passage listing, and the
        exact-match / F1 / semantic-similarity metrics for the displayed
        answer (``None`` when the question is not in the dataset).
    """
    question = question or ""
    user_context = user_context or ""
    with_context = mode == "With context"

    # Nothing to read: skip the models entirely rather than let the reader
    # fail on an empty question or context.
    if not question.strip() or (with_context and not user_context.strip()):
        return "", "", "0.000", user_context, "", None, None, None

    if with_context:
        res = answer_with_context(question, user_context)
        used = user_context
    else:
        res = no_context_flow(question, int(top_k))
        used = res["used_context"]
    ans = res["answer"]

    # Only translate when there is an answer to translate.
    ans_en = ""
    if ans:
        to_en = hi_to_en if lang_choice == "Hindi" else kn_to_en
        ans_en = to_en(ans)

    # Metrics describe the answer shown to the user, without a second
    # retrieval + reading pass.
    ev = _metrics_for(question, ans)

    retrieved = "\n".join(
        f"{r['rank']}. {r['context']} ({r['similarity']:.3f})"
        for r in res.get("retrieved", [])
    )

    return (
        ans,
        ans_en,
        f"{res['score']:.3f}",
        used,
        retrieved,
        ev.get("em"),
        ev.get("f1"),
        ev.get("sim"),
    )
275
 
276
 
277
 
# UI layout: inputs, the Answer button, then one output box per value
# returned by ui_answer (in the same order as its return tuple).
with gr.Blocks() as demo:
    gr.Markdown(INTRO_MD)

    mode = gr.Radio(["With context", "No context"], value="With context", label="Mode")
    question = gr.Textbox(label="Question")
    user_context = gr.Textbox(label="Context")
    # step=1 keeps top-k on whole numbers; it is used as a slice bound downstream.
    top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K passages (No context mode)")
    lang_choice = gr.Dropdown(["Hindi", "Kannada"], value="Kannada", label="Language")

    btn = gr.Button("Answer")

    ans_local = gr.Textbox(label="Answer")
    ans_en = gr.Textbox(label="Answer English")
    score = gr.Textbox(label="Confidence")
    used = gr.Textbox(label="Used Context")
    retrieved = gr.Textbox(label="Retrieved Contexts")
    em = gr.Textbox(label="Exact Match")
    f1 = gr.Textbox(label="F1 Score")
    sim = gr.Textbox(label="Semantic Similarity")

    btn.click(
        ui_answer,
        inputs=[mode, question, user_context, top_k, lang_choice],
        outputs=[ans_local, ans_en, score, used, retrieved, em, f1, sim],
    )
321
 
322
+
if __name__ == "__main__":
    # Gradio's launch() takes the port as `server_port`; `port` is not an
    # accepted keyword and makes the call fail before the server starts.
    demo.launch(server_name="0.0.0.0", server_port=7860)