"""Gradio demo for Sindhi named-entity recognition.

Loads the token-classification model and SentencePiece tokenizer published
under ``MODEL_ID``, tags a whitespace-split sentence, and renders the result
as HTML fragments plus a downloadable CSV.

NOTE(review): the HTML fragments in this copy of the file contain text only,
while ``ENTITY_CONFIG`` carries colours that nothing renders -- presumably
styling markup was lost somewhere upstream. Confirm against the deployed app
before relying on the exact output strings.
"""

import csv
import html
import io
import os
import tempfile
from collections import Counter

import gradio as gr
import sentencepiece as spm
import torch
from huggingface_hub import hf_hub_download
from transformers import RobertaForTokenClassification

MODEL_ID = "hellosindh/sindhi-bert-ner"

print("Loading model...", flush=True)
model = RobertaForTokenClassification.from_pretrained(MODEL_ID)
model.eval()

print("Loading tokenizer...", flush=True)
sp_path = hf_hub_download(repo_id=MODEL_ID, filename="sindhi_bpe_32k.model")
sp = spm.SentencePieceProcessor()
sp.Load(sp_path)
print("✅ Ready!", flush=True)

ID2TAG = model.config.id2label
BOS_ID = 2
EOS_ID = 3

# Longest sequence (BOS and EOS included) that can be fed to the model.
# Per the transformers RoBERTa implementation, position ids start at
# pad_token_id + 1, so the position table can index at most
# max_position_embeddings - pad_token_id - 1 tokens.
_PAD_IDX = model.config.pad_token_id if model.config.pad_token_id is not None else 1
MAX_TOKENS = model.config.max_position_embeddings - _PAD_IDX - 1

# ── Entity config — Sindhi labels ─────────────────
ENTITY_CONFIG = {
    "PERSON": {"color": "#c084fc", "bg": "rgba(192,132,252,0.15)", "sindhi": "ماڻهو"},
    "LOCATION": {"color": "#818cf8", "bg": "rgba(129,140,248,0.15)", "sindhi": "جڳهه"},
    "ORGANIZATION": {"color": "#38bdf8", "bg": "rgba(56,189,248,0.15)", "sindhi": "ادارو"},
    "DATE_TIME": {"color": "#34d399", "bg": "rgba(52,211,153,0.15)", "sindhi": "تاريخ"},
    "EVENT": {"color": "#fbbf24", "bg": "rgba(251,191,36,0.15)", "sindhi": "واقعو"},
    "TITLE": {"color": "#fb923c", "bg": "rgba(251,146,60,0.15)", "sindhi": "لقب"},
}

FALLBACK_CFG = {
    "color": "#6b7280",
    "bg": "rgba(107,114,128,0.15)",
    "sindhi": "ادارو",  # unknown entities shown as ادارو
}


def _tag_words(words):
    """Run the model over ``words`` and label each word.

    A word takes the prediction made for its first sub-word piece.

    Args:
        words: list of whitespace-separated tokens from the input sentence.

    Returns:
        ``(tags, scores)``: two dicts keyed by word index, holding the
        predicted tag string and its softmax probability. Words that
        produce no pieces, or that fall beyond the ``MAX_TOKENS`` budget,
        are absent from both dicts.
    """
    input_ids = [BOS_ID]
    word_map = [-1]  # word index for first pieces, -1 for everything else
    budget = MAX_TOKENS - 1  # keep one slot free for EOS

    for idx, word in enumerate(words):
        pieces = sp.EncodeAsIds(word)
        if not pieces:
            continue
        if len(input_ids) + len(pieces) > budget:
            # Feeding more tokens would overrun the position-embedding
            # table; the remaining words are simply left untagged.
            break
        input_ids.extend(pieces)
        word_map.append(idx)
        word_map.extend([-1] * (len(pieces) - 1))

    input_ids.append(EOS_ID)
    word_map.append(-1)

    with torch.no_grad():
        logits = model(torch.tensor([input_ids])).logits[0]
        probs = torch.softmax(logits, dim=-1)

    preds = torch.argmax(logits, dim=-1).tolist()
    conf = probs.max(dim=-1).values.tolist()

    tags = {}
    scores = {}
    for wid, pred, score in zip(word_map, preds, conf):
        if wid >= 0:
            tags[wid] = ID2TAG[pred]
            scores[wid] = score
    return tags, scores


def predict_ner(sentence):
    """Tag ``sentence`` and build every UI output for one request.

    Args:
        sentence: raw text from the input box.

    Returns:
        A 5-tuple matching the wired outputs: highlighted HTML, summary
        HTML, confidence HTML, CSV file path (or ``None``), and a
        ``gr.update`` carrying the legend HTML and its visibility.
    """
    if not sentence or not sentence.strip():
        return _empty_html(), _empty_summary(), "", None, gr.update(visible=False)

    words = sentence.split()
    word_tags, word_conf = _tag_words(words)

    entities = []
    html_words = []
    i = 0
    while i < len(words):
        tag = word_tags.get(i, "O")
        if not tag.startswith("B-"):
            # Plain word (including a stray I- tag with no opening B- tag).
            # Escaped because it is user input going into gr.HTML.
            html_words.append(html.escape(words[i]))
            i += 1
            continue

        etype = tag[2:]
        # Extend the span over the following I-<etype> words.
        j = i + 1
        while j < len(words) and word_tags.get(j, "O") == f"I-{etype}":
            j += 1

        entity_text = " ".join(words[i:j])
        scores = [word_conf.get(k, 0) for k in range(i, j)]
        avg_score = sum(scores) / len(scores)
        cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG)

        html_words.append(f'{cfg["sindhi"]}{html.escape(entity_text)}')
        entities.append({
            "text": entity_text,  # raw text; escaped wherever it is rendered
            "type": etype,
            "sindhi": cfg["sindhi"],
            "score": avg_score,
            "color": cfg["color"],
        })
        i = j

    highlighted = "\n" + " ".join(html_words) + "\n"
    summary = _build_summary(entities)
    conf_html = _build_confidence(entities)
    csv_file = _build_csv(entities)
    legend = _build_legend(entities) if entities else ""

    return (
        highlighted,
        summary,
        conf_html,
        csv_file,
        gr.update(value=legend, visible=bool(entities)),
    )


def _empty_html():
    """Return the placeholder shown in the highlighted-output area."""
    return "\nڪو بہ سنڌي جملو لکو\n"


def _empty_summary():
    """Return the placeholder shown when there are no entities to summarise."""
    return "\nاعتماد جوڳا نتيجا نہ مليا\n"


def _build_summary(entities):
    """Return one card per entity type with its count, most frequent first.

    Falls back to ``_empty_summary()`` when ``entities`` is empty.
    """
    if not entities:
        return _empty_summary()

    counts = Counter(e["type"] for e in entities)
    cards = "".join(
        f"\n{ENTITY_CONFIG.get(etype, FALLBACK_CFG)['sindhi']} {cnt}\n"
        for etype, cnt in counts.most_common()
    )
    # No heading above the cards — just the cards themselves.
    return f"\n{cards}\n"


def _build_confidence(entities):
    """Return the confidence section: entity text, Sindhi label and percent.

    Returns an empty string when ``entities`` is empty. The percentage is
    the entity score truncated to a whole number.
    """
    if not entities:
        return ""

    bars = "".join(
        f"\n{html.escape(ent['text'])}\n{ent['sindhi']} {int(ent['score'] * 100)}%\n"
        for ent in entities
    )
    return f"\nاعتماد\n{bars}\n"


def _build_legend(entities):
    """Show only entity types found in this result."""
    found_types = dict.fromkeys(e["type"] for e in entities)  # preserve order
    items = "".join(
        ENTITY_CONFIG.get(etype, FALLBACK_CFG)["sindhi"] for etype in found_types
    )
    return f"\n{items}\n"


def _build_csv(entities):
    """Write ``entities`` to a CSV file and return its path.

    Returns ``None`` when ``entities`` is empty. Each call writes into its
    own temporary directory, so simultaneous requests cannot overwrite one
    another's download; the file name itself stays ``sindhi_ner.csv``.
    """
    if not entities:
        return None

    path = os.path.join(tempfile.mkdtemp(prefix="sindhi_ner_"), "sindhi_ner.csv")
    # utf-8-sig writes a BOM at the start of the file.
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Entity", "Type", "Sindhi Type", "Confidence"])
        writer.writerows(
            [ent["text"], ent["type"], ent["sindhi"], f"{ent['score'] * 100:.1f}%"]
            for ent in entities
        )
    return path


# Stylesheet passed to gr.Blocks. The adjacent literals below (one per rule or
# CSS comment) concatenate to the same single string the file had before.
CSS = (
    " @import url('https://fonts.googleapis.com/css2?family=Lateef:wght@400;700&family=Scheherazade+New:wght@400;700&family=Outfit:wght@400;600;700;800&display=swap'); "
    "/* Base — Outfit for UI chrome only */ "
    "*, body, .gradio-container { font-family: 'Outfit', sans-serif !important; } "
    "body, .gradio-container { background: #08081a !important; } "
    ".gradio-container { max-width: 980px !important; margin: 0 auto !important; padding: 16px !important; } "
    "/* Labels */ "
    "label > span { color: #9333ea !important; font-size: 0.82em !important; font-weight: 700 !important; letter-spacing: 0.8px !important; text-transform: uppercase !important; font-family: 'Outfit', sans-serif !important; } "
    "/* Textarea — Lateef font, large size */ "
    "textarea { background: #130825 !important; border: 1px solid #6d28d960 !important; border-radius: 14px !important; color: #e2e8f0 !important; font-size: 1.4em !important; direction: rtl !important; font-family: 'Lateef', 'Scheherazade New', serif !important; caret-color: #c084fc !important; line-height: 2.2em !important; padding: 14px 16px !important; } "
    "textarea:focus { border-color: #c084fc !important; box-shadow: 0 0 0 3px #7c3aed15 !important; outline: none !important; } "
    "textarea::placeholder { color: #4c1d95 !important; font-family: 'Lateef', 'Scheherazade New', serif !important; font-size: 1em !important; } "
    "/* Search button */ "
    "button.primary { background: linear-gradient(135deg, #6d28d9, #9333ea, #c084fc) !important; border: none !important; border-radius: 12px !important; color: #fff !important; font-weight: 800 !important; font-size: 1em \n!important; font-family: 'Lateef', 'Scheherazade New', serif !important; letter-spacing: 0.5px !important; transition: all 0.3s ease !important; padding: 14px !important; width: 100% !important; margin-top: 8px !important; } "
    "button.primary:hover { box-shadow: 0 6px 24px #7c3aed50 !important; transform: translateY(-1px) !important; } "
    "/* Examples — below button, clean look */ "
    ".examples-holder { background: transparent !important; border: none !important; padding: 0 !important; margin-top: 10px !important; } "
    ".examples-holder > .label-wrap { display: none !important; } "
    ".examples table { background: #130825 !important; border: 1px solid #6d28d930 !important; border-radius: 10px !important; width: 100% !important; } "
    ".examples table thead { display: none !important; } "
    ".examples table td { color: #94a3b8 !important; font-family: 'Lateef', 'Scheherazade New', serif !important; font-size: 1.15em !important; direction: rtl !important; text-align: right !important; padding: 8px 14px !important; border-bottom: 1px solid #1e1040 !important; } "
    ".examples table tr:last-child td { border-bottom: none !important; } "
    ".examples table tr:hover td { color: #c084fc !important; background: #1a0533 !important; cursor: pointer !important; } "
    "/* File download */ "
    ".file-preview { background: #130825 !important; border: 1px solid #6d28d940 !important; border-radius: 10px !important; } "
    "/* Scrollbar */ "
    "::-webkit-scrollbar { width: 5px; } "
    "::-webkit-scrollbar-track { background: #08081a; } "
    "::-webkit-scrollbar-thumb { background: #6d28d9; border-radius: 3px; } "
)

HEADER = """

سنڌي اسمن جي سڃاڻپ

SINDHI NAMED ENTITY RECOGNITION

"""

EXAMPLES = [
    ["شيخ اياز شڪارپور ۾ پيدا ٿيو"],
    ["يونيورسٽي آف سنڌ، حيدرآباد ۾ آھي"],
    ["سيد مراد علي شاھ سنڌ جو وڏو وزير آھي، سندس تعلق پاڪستان پيپلز پارٽي سان آھي"],
    ["پاڪستان ۽ ڀارت جي ويڙھ 2025ع ۾ لڳي"],
    ["ڊاڪٽر نبي بخش بلوچ 16 ڊسمبر 1917ع تي سنجھوري ۾ پيدا ٿيو"],
    ["بينظير ڀٽو پاڪستان جي پھرين عورت وزيراعظم هئي"],
]

# NOTE(review): the nesting of the layout below was reconstructed from the
# section comments and call order — confirm it matches the intended layout.
with gr.Blocks(css=CSS, title="سنڌي NER") as demo:
    gr.HTML(HEADER)

    with gr.Row():
        # ── Left column: input → button → examples ──
        with gr.Column(scale=3):
            inp = gr.Textbox(
                label="سنڌي جملو لکو",
                placeholder="شيخ اياز شڪارپور ۾ پيدا ٿيو...",
                lines=4,
                rtl=True,
            )
            btn = gr.Button("🔍 ڳوليو", variant="primary")

            # Examples BELOW button
            gr.Examples(
                examples=EXAMPLES,
                inputs=inp,
                label=None,
            )

        # ── Right column: summary ────────────────────
        with gr.Column(scale=2):
            summary_out = gr.HTML(value=_empty_summary())

    gr.HTML("\n")

    # Highlighted output
    highlighted_out = gr.HTML(value=_empty_html())

    # Confidence bars
    conf_out = gr.HTML()

    # Legend — hidden until search, no header text
    legend_out = gr.HTML(visible=False)

    # CSV download
    csv_out = gr.File(
        label="📥 ڊائونلوڊ ڪريو (CSV)",
        file_types=[".csv"],
        interactive=False,
    )

    gr.HTML("\nhellosindh · sindhi-bert-ner · MIT License\n")

    # Wire up both click and enter
    btn.click(
        fn=predict_ner,
        inputs=inp,
        outputs=[highlighted_out, summary_out, conf_out, csv_out, legend_out],
    )
    inp.submit(
        fn=predict_ner,
        inputs=inp,
        outputs=[highlighted_out, summary_out, conf_out, csv_out, legend_out],
    )

demo.launch()