import gradio as gr
import torch
import sentencepiece as spm
from transformers import RobertaForTokenClassification
from huggingface_hub import hf_hub_download
import csv
import io
from collections import Counter
# Hugging Face Hub repository that provides both the model weights and the
# SentencePiece tokenizer file used below.
MODEL_ID = "hellosindh/sindhi-bert-ner"
# Load the token-classification model once at import time and switch it to
# evaluation mode.
print("Loading model...", flush=True)
model = RobertaForTokenClassification.from_pretrained(MODEL_ID)
model.eval()
# Fetch the SentencePiece model file from the same repository and load it
# into a processor that predict_ner uses to encode each word.
print("Loading tokenizer...", flush=True)
sp_path = hf_hub_download(repo_id=MODEL_ID, filename="sindhi_bpe_32k.model")
sp = spm.SentencePieceProcessor()
sp.Load(sp_path)
print("✅ Ready!", flush=True)
# Mapping from predicted label id to tag name, taken from the model config.
ID2TAG = model.config.id2label
# Ids placed at the start and end of every input sequence in predict_ner.
# NOTE(review): presumably the tokenizer's BOS/EOS ids — confirm against the
# SentencePiece model.
BOS_ID = 2
EOS_ID = 3
# ── Entity config — Sindhi labels ─────────────────
# Display settings per entity type: a foreground colour, a translucent
# background colour, and the Sindhi label shown to the user.
ENTITY_CONFIG = {
    "PERSON": dict(
        color="#c084fc", bg="rgba(192,132,252,0.15)", sindhi="ماڻهو"
    ),
    "LOCATION": dict(
        color="#818cf8", bg="rgba(129,140,248,0.15)", sindhi="جڳهه"
    ),
    "ORGANIZATION": dict(
        color="#38bdf8", bg="rgba(56,189,248,0.15)", sindhi="ادارو"
    ),
    "DATE_TIME": dict(
        color="#34d399", bg="rgba(52,211,153,0.15)", sindhi="تاريخ"
    ),
    "EVENT": dict(
        color="#fbbf24", bg="rgba(251,191,36,0.15)", sindhi="واقعو"
    ),
    "TITLE": dict(
        color="#fb923c", bg="rgba(251,146,60,0.15)", sindhi="لقب"
    ),
}

# Settings used for any entity type that has no entry in ENTITY_CONFIG.
FALLBACK_CFG = dict(
    color="#6b7280",
    bg="rgba(107,114,128,0.15)",
    sindhi="ادارو",  # unknown entities shown as ادارو
)
def predict_ner(sentence):
    """Tag a sentence and build every output shown in the UI.

    The sentence is split on whitespace and each word is encoded to subword
    ids with the SentencePiece processor. The model is run once on the
    ``BOS + subwords + EOS`` sequence. Each word takes the tag and the
    confidence predicted at its first subword. A word tagged ``B-X`` opens an
    entity, and the directly following ``I-X`` words are merged into it; an
    ``I-X`` word with no preceding ``B-X`` is rendered as plain text.

    All user-supplied text is HTML-escaped before it is placed in the
    highlighted markup, because that markup is rendered by an HTML component.

    Args:
        sentence: Raw text from the input box. ``None``, empty and
            whitespace-only values yield the empty-state outputs.

    Returns:
        A 5-tuple ``(highlighted_html, summary_html, confidence_html,
        csv_path_or_None, legend_update)``.
    """
    # Local import keeps this block self-contained.
    from html import escape

    if not sentence or not sentence.strip():
        return _empty_html(), _empty_summary(), "", None, gr.update(visible=False)

    words = sentence.split()

    # Build the id sequence. word_map holds, per position, the index of the
    # word that starts there, or -1 for BOS/EOS and continuation subwords.
    # NOTE(review): no length cap is applied before calling the model —
    # confirm that very long inputs stay within the model's position limit.
    input_ids = [BOS_ID]
    word_map = [-1]
    for word_idx, word in enumerate(words):
        # A word that encodes to no subwords adds nothing and later falls
        # back to the "O" tag.
        for sub_idx, sub_id in enumerate(sp.EncodeAsIds(word)):
            input_ids.append(sub_id)
            word_map.append(word_idx if sub_idx == 0 else -1)
    input_ids.append(EOS_ID)
    word_map.append(-1)

    with torch.no_grad():
        logits = model(torch.tensor([input_ids])).logits[0]
    probs = torch.softmax(logits, dim=-1)
    preds = torch.argmax(logits, dim=-1).tolist()
    conf = probs.max(dim=-1).values.tolist()

    # Keep only the prediction made at each word's first subword.
    word_tags = {}
    word_conf = {}
    for pos, (pred, wid) in enumerate(zip(preds, word_map)):
        if wid >= 0:
            word_tags[wid] = ID2TAG[pred]
            word_conf[wid] = conf[pos]

    entities = []
    html_words = []
    i = 0
    while i < len(words):
        tag = word_tags.get(i, "O")
        if not tag.startswith("B-"):
            html_words.append(escape(words[i]))
            i += 1
            continue

        etype = tag[2:]
        entity_words = [words[i]]
        scores = [word_conf.get(i, 0)]
        j = i + 1
        # Absorb the directly following I-<same type> words.
        while j < len(words) and word_tags.get(j, "O") == f"I-{etype}":
            entity_words.append(words[j])
            scores.append(word_conf.get(j, 0))
            j += 1

        entity_text = " ".join(entity_words)
        avg_score = sum(scores) / len(scores)
        cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG)
        # NOTE(review): the label and the entity text are emitted back to
        # back with no markup or separator, and the configured colours are
        # not applied here — confirm whether markup is missing.
        html_words.append(f'{cfg["sindhi"]}{escape(entity_text)}')
        entities.append({
            "text": entity_text,  # raw text; consumers escape it as needed
            "type": etype,
            "sindhi": cfg["sindhi"],
            "score": avg_score,
            "color": cfg["color"],
        })
        i = j

    highlighted = "\n" + " ".join(html_words) + "\n"
    summary = _build_summary(entities)
    conf_html = _build_confidence(entities)
    csv_file = _build_csv(entities)
    legend = _build_legend(entities) if entities else ""
    return (
        highlighted,
        summary,
        conf_html,
        csv_file,
        gr.update(value=legend, visible=bool(entities)),
    )
def _empty_html():
return """
ڪو بہ سنڌي جملو لکو
"""
def _empty_summary():
return """
اعتماد جوڳا نتيجا نہ مليا
"""
def _build_summary(entities):
    """Build the per-type count cards for the summary panel.

    Args:
        entities: Entity dicts; only the ``"type"`` key is read.

    Returns:
        The empty-state summary when ``entities`` is empty. Otherwise one
        card (Sindhi label, then count) per entity type, ordered by
        descending count, with ties kept in first-seen order.
    """
    if entities:
        tally = Counter(ent["type"] for ent in entities)
        # most_common() sorts by descending count and keeps first-seen order
        # among equal counts.
        cards = "".join(
            f"\n{ENTITY_CONFIG.get(kind, FALLBACK_CFG)['sindhi']}\n{total}\n"
            for kind, total in tally.most_common()
        )
        # No مجموعي header — just cards directly
        return f"\n{cards}\n"
    return _empty_summary()
def _build_confidence(entities):
if not entities:
return ""
bars = ""
for ent in entities:
cfg = ENTITY_CONFIG.get(ent["type"], FALLBACK_CFG)
pct = int(ent["score"] * 100)
bars += f"""
{ent['text']}
{ent['sindhi']}
{pct}%
"""
return f"""
"""
def _build_legend(entities):
    """Build a legend that lists only the entity types found in this result.

    Previously the legend items were assembled and then discarded, so the
    function always returned a blank string. The labels are now returned,
    separated by spaces and wrapped in newlines.

    Args:
        entities: Entity dicts; only the ``"type"`` key is read.

    Returns:
        The Sindhi labels of the entity types present, in first-seen order.
        A label shared by several types is listed once.
    """
    found_types = dict.fromkeys(ent["type"] for ent in entities)  # preserve order
    labels = dict.fromkeys(
        ENTITY_CONFIG.get(etype, FALLBACK_CFG)["sindhi"] for etype in found_types
    )
    items = " ".join(labels)
    return f"\n{items}\n"
def _build_csv(entities):
if not entities:
return None
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(["Entity", "Type", "Sindhi Type", "Confidence"])
for ent in entities:
writer.writerow([
ent["text"], ent["type"],
ent["sindhi"], f"{ent['score']*100:.1f}%"
])
path = "/tmp/sindhi_ner.csv"
with open(path, "w", encoding="utf-8-sig", newline="") as f:
f.write(output.getvalue())
return path
# Stylesheet passed to gr.Blocks(css=...). It imports the Lateef,
# Scheherazade New and Outfit web fonts, sets a dark background, and styles
# the labels, the right-to-left textarea, the primary button, the examples
# table, the file preview and the scrollbar.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Lateef:wght@400;700&family=Scheherazade+New:wght@400;700&family=Outfit:wght@400;600;700;800&display=swap');
/* Base — Outfit for UI chrome only */
*, body, .gradio-container {
font-family: 'Outfit', sans-serif !important;
}
body, .gradio-container {
background: #08081a !important;
}
.gradio-container {
max-width: 980px !important;
margin: 0 auto !important;
padding: 16px !important;
}
/* Labels */
label > span {
color: #9333ea !important;
font-size: 0.82em !important;
font-weight: 700 !important;
letter-spacing: 0.8px !important;
text-transform: uppercase !important;
font-family: 'Outfit', sans-serif !important;
}
/* Textarea — Lateef font, large size */
textarea {
background: #130825 !important;
border: 1px solid #6d28d960 !important;
border-radius: 14px !important;
color: #e2e8f0 !important;
font-size: 1.4em !important;
direction: rtl !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
caret-color: #c084fc !important;
line-height: 2.2em !important;
padding: 14px 16px !important;
}
textarea:focus {
border-color: #c084fc !important;
box-shadow: 0 0 0 3px #7c3aed15 !important;
outline: none !important;
}
textarea::placeholder {
color: #4c1d95 !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
font-size: 1em !important;
}
/* Search button */
button.primary {
background: linear-gradient(135deg, #6d28d9, #9333ea, #c084fc) !important;
border: none !important;
border-radius: 12px !important;
color: #fff !important;
font-weight: 800 !important;
font-size: 1em !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
letter-spacing: 0.5px !important;
transition: all 0.3s ease !important;
padding: 14px !important;
width: 100% !important;
margin-top: 8px !important;
}
button.primary:hover {
box-shadow: 0 6px 24px #7c3aed50 !important;
transform: translateY(-1px) !important;
}
/* Examples — below button, clean look */
.examples-holder {
background: transparent !important;
border: none !important;
padding: 0 !important;
margin-top: 10px !important;
}
.examples-holder > .label-wrap {
display: none !important;
}
.examples table {
background: #130825 !important;
border: 1px solid #6d28d930 !important;
border-radius: 10px !important;
width: 100% !important;
}
.examples table thead {
display: none !important;
}
.examples table td {
color: #94a3b8 !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
font-size: 1.15em !important;
direction: rtl !important;
text-align: right !important;
padding: 8px 14px !important;
border-bottom: 1px solid #1e1040 !important;
}
.examples table tr:last-child td {
border-bottom: none !important;
}
.examples table tr:hover td {
color: #c084fc !important;
background: #1a0533 !important;
cursor: pointer !important;
}
/* File download */
.file-preview {
background: #130825 !important;
border: 1px solid #6d28d940 !important;
border-radius: 10px !important;
}
/* Scrollbar */
::-webkit-scrollbar { width: 5px; }
::-webkit-scrollbar-track { background: #08081a; }
::-webkit-scrollbar-thumb { background: #6d28d9; border-radius: 3px; }
"""
# Page header passed to gr.HTML: the Sindhi title followed by the English one.
HEADER = (
    "\n"
    "سنڌي اسمن جي سڃاڻپ\n"
    "SINDHI NAMED ENTITY RECOGNITION\n"
)
# Sample sentences offered below the input box. Each row is a one-element
# list because the examples feed a single input component.
EXAMPLES = [
    [sentence]
    for sentence in (
        "شيخ اياز شڪارپور ۾ پيدا ٿيو",
        "يونيورسٽي آف سنڌ، حيدرآباد ۾ آھي",
        "سيد مراد علي شاھ سنڌ جو وڏو وزير آھي، سندس تعلق پاڪستان پيپلز پارٽي سان آھي",
        "پاڪستان ۽ ڀارت جي ويڙھ 2025ع ۾ لڳي",
        "ڊاڪٽر نبي بخش بلوچ 16 ڊسمبر 1917ع تي سنجھوري ۾ پيدا ٿيو",
        "بينظير ڀٽو پاڪستان جي پھرين عورت وزيراعظم هئي",
    )
]
# Page layout: header, a two-column row (input side and summary side), then
# the result panels stacked underneath.
with gr.Blocks(css=CSS, title="سنڌي NER") as demo:
    gr.HTML(HEADER)

    with gr.Row():
        # Input side: textbox, then the search button, then the examples.
        with gr.Column(scale=3):
            inp = gr.Textbox(
                label="سنڌي جملو لکو",
                placeholder="شيخ اياز شڪارپور ۾ پيدا ٿيو...",
                lines=4,
                rtl=True,
            )
            btn = gr.Button("🔍 ڳوليو", variant="primary")
            gr.Examples(examples=EXAMPLES, inputs=inp, label=None)

        # Summary side: per-type counts.
        with gr.Column(scale=2):
            summary_out = gr.HTML(value=_empty_summary())

    gr.HTML("")

    # Result panels: highlighted sentence, confidence listing, legend
    # (hidden until a search returns entities) and the CSV download.
    highlighted_out = gr.HTML(value=_empty_html())
    conf_out = gr.HTML()
    legend_out = gr.HTML(visible=False)
    csv_out = gr.File(
        label="📥 ڊائونلوڊ ڪريو (CSV)",
        file_types=[".csv"],
        interactive=False,
    )

    gr.HTML("\nhellosindh · sindhi-bert-ner · MIT License\n")

    # The button click and pressing Enter in the textbox run the same handler
    # with the same inputs and outputs.
    for trigger in (btn.click, inp.submit):
        trigger(
            fn=predict_ner,
            inputs=inp,
            outputs=[highlighted_out, summary_out, conf_out, csv_out, legend_out],
        )

demo.launch()