sindhi-ner / app.py
hellosindh's picture
Update app.py
7524a95 verified
import csv
import html
import io
import os
import tempfile
from collections import Counter

import gradio as gr
import sentencepiece as spm
import torch
from huggingface_hub import hf_hub_download
from transformers import RobertaForTokenClassification
# Hugging Face Hub repository that provides both the model weights and the
# SentencePiece tokenizer file used below.
MODEL_ID = "hellosindh/sindhi-bert-ner"
# The model and tokenizer are loaded once, at import time, and then shared by
# every call to predict_ner.
print("Loading model...", flush=True)
model = RobertaForTokenClassification.from_pretrained(MODEL_ID)
# Switch the model to evaluation mode before serving predictions.
model.eval()
print("Loading tokenizer...", flush=True)
# The tokenizer model file is downloaded from the same Hub repository.
sp_path = hf_hub_download(repo_id=MODEL_ID, filename="sindhi_bpe_32k.model")
sp = spm.SentencePieceProcessor()
sp.Load(sp_path)
print("✅ Ready!", flush=True)
# Maps a predicted class index to its tag string, as stored in the model config.
ID2TAG = model.config.id2label
# Token ids placed at the start and end of every input sequence.
# NOTE(review): presumably these match the special-token ids the model was
# trained with — confirm against the tokenizer/model config.
BOS_ID = 2
EOS_ID = 3
# Display settings per entity type: accent colour, translucent background,
# and the Sindhi label shown to the user. Rows are
# (entity type, colour, background, Sindhi label).
ENTITY_CONFIG = {
    etype: {"color": color, "bg": bg, "sindhi": sindhi}
    for etype, color, bg, sindhi in (
        ("PERSON", "#c084fc", "rgba(192,132,252,0.15)", "ماڻهو"),
        ("LOCATION", "#818cf8", "rgba(129,140,248,0.15)", "جڳهه"),
        ("ORGANIZATION", "#38bdf8", "rgba(56,189,248,0.15)", "ادارو"),
        ("DATE_TIME", "#34d399", "rgba(52,211,153,0.15)", "تاريخ"),
        ("EVENT", "#fbbf24", "rgba(251,191,36,0.15)", "واقعو"),
        ("TITLE", "#fb923c", "rgba(251,146,60,0.15)", "لقب"),
    )
}

# Settings used for any entity type that is missing from ENTITY_CONFIG;
# such unknown entities are displayed with the label ادارو.
FALLBACK_CFG = dict(
    color="#6b7280",
    bg="rgba(107,114,128,0.15)",
    sindhi="ادارو",
)
def _tag_words(words):
    """Predict one tag and one confidence value per word.

    Every word is encoded on its own with the SentencePiece model, and only
    the first sub-word piece of a word carries that word's prediction. A word
    that encodes to no pieces gets no entry in the returned dicts.

    Args:
        words: list of whitespace-separated tokens from the input sentence.

    Returns:
        A ``(tags, confidences)`` pair of dicts keyed by word index. Tags are
        the strings found in ``ID2TAG``; confidences are the highest softmax
        probability at the word's first piece.
    """
    input_ids = [BOS_ID]
    owners = [-1]  # word index for a word's first piece, -1 for all others
    for word_idx, word in enumerate(words):
        for piece_idx, piece in enumerate(sp.EncodeAsIds(word)):
            input_ids.append(piece)
            owners.append(word_idx if piece_idx == 0 else -1)
    input_ids.append(EOS_ID)
    owners.append(-1)

    with torch.no_grad():
        logits = model(torch.tensor([input_ids])).logits[0]
        probs = torch.softmax(logits, dim=-1)
    preds = torch.argmax(logits, dim=-1).tolist()
    conf = probs.max(dim=-1).values.tolist()

    tags = {}
    confidences = {}
    for pos, (pred, owner) in enumerate(zip(preds, owners)):
        if owner >= 0:
            tags[owner] = ID2TAG[pred]
            confidences[owner] = conf[pos]
    return tags, confidences


def _entity_span(entity_text, cfg):
    """Return the highlighted ``<span>`` for one entity; the text is HTML-escaped."""
    return (
        f'<span style="'
        f'background:{cfg["bg"]};'
        f'border:1px solid {cfg["color"]}50;'
        f'color:#f1f5f9;'
        f'padding:4px 12px 4px 8px;'
        f'border-radius:8px;margin:3px;'
        f'display:inline-block;font-weight:500;">'
        f'<span style="'
        f'background:{cfg["color"]};color:#0a0a1a;'
        f'font-size:0.62em;font-weight:800;'
        f'padding:2px 7px;border-radius:4px;'
        f'margin-left:7px;vertical-align:middle;">'
        f'{cfg["sindhi"]}</span>'
        f'{html.escape(entity_text)}'
        f'</span>'
    )


def predict_ner(sentence):
    """Run named-entity recognition on a sentence and build all UI outputs.

    The sentence is split on whitespace, each word is tagged, and a ``B-X``
    tag followed by any number of ``I-X`` tags is merged into one entity.
    Words that do not start an entity are rendered as plain text.

    User-supplied words are HTML-escaped before being placed in the markup so
    that typed text such as ``<script>`` is displayed literally instead of
    being interpreted by the browser. The entity records passed on to the
    summary, confidence and CSV builders keep the raw, unescaped text.

    Args:
        sentence: the text typed by the user; ``None`` or blank input yields
            the empty placeholders.

    Returns:
        A 5-tuple ``(highlighted_html, summary_html, confidence_html,
        csv_path_or_None, legend_update)`` in the order the UI wires its
        output components.
    """
    if not sentence or not sentence.strip():
        return _empty_html(), _empty_summary(), "", None, gr.update(visible=False)

    words = sentence.split()
    word_tags, word_conf = _tag_words(words)

    entities = []
    html_words = []
    i = 0
    while i < len(words):
        tag = word_tags.get(i, "O")
        if not tag.startswith("B-"):
            # Not the start of an entity (this includes a stray "I-" tag).
            html_words.append(
                f'<span style="color:#cbd5e1;padding:2px 4px;">'
                f'{html.escape(words[i])}</span>'
            )
            i += 1
            continue

        etype = tag[2:]
        # Extend the entity over the directly following "I-<etype>" words.
        end = i + 1
        while end < len(words) and word_tags.get(end, "O") == f"I-{etype}":
            end += 1

        entity_text = " ".join(words[i:end])
        scores = [word_conf.get(k, 0) for k in range(i, end)]
        cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG)
        html_words.append(_entity_span(entity_text, cfg))
        entities.append({
            "text": entity_text,
            "type": etype,
            "sindhi": cfg["sindhi"],
            "score": sum(scores) / len(scores),
            "color": cfg["color"],
        })
        i = end

    highlighted = "\n".join((
        "",
        '<div style="',
        "background:linear-gradient(135deg,#1a0533 0%,#0f0f2e 100%);",
        "border:1px solid #7c3aed30;border-radius:16px;",
        "padding:24px 28px;",
        "font-size:1.3em;",
        "line-height:3.2em;",
        "direction:rtl;text-align:right;",
        "font-family:'Lateef','Scheherazade New',serif;",
        'min-height:90px;">',
        " ".join(html_words),
        "</div>",
        "",
    ))

    summary = _build_summary(entities)
    conf_html = _build_confidence(entities)
    csv_file = _build_csv(entities)
    legend = _build_legend(entities) if entities else ""
    return (
        highlighted,
        summary,
        conf_html,
        csv_file,
        gr.update(value=legend, visible=bool(entities)),
    )
def _empty_html():
    """Return the placeholder markup for the highlighted-text panel.

    It is used as the panel's initial value and whenever the input is blank.
    """
    markup_lines = (
        "",
        '<div style="',
        "background:linear-gradient(135deg,#1a0533,#0f0f2e);",
        "border:1px solid #7c3aed20;border-radius:16px;",
        "padding:40px;text-align:center;min-height:90px;",
        'display:flex;align-items:center;justify-content:center;">',
        '<span style="color:#4c1d95;font-size:1.2em;',
        "font-family:'Lateef','Scheherazade New',serif;\">",
        "ڪو بہ سنڌي جملو لکو",
        "</span>",
        "</div>",
        "",
    )
    return "\n".join(markup_lines)
def _empty_summary():
    """Return the placeholder markup for the summary panel.

    It is used as the panel's initial value and whenever there are no
    entities to summarise.
    """
    markup_lines = (
        "",
        '<div style="',
        "background:#1a0533;border:1px solid #7c3aed20;",
        "border-radius:16px;padding:24px;",
        "text-align:center;color:#4c1d95;",
        "font-size:1.1em;",
        "font-family:'Lateef','Scheherazade New',serif;\">",
        "اعتماد جوڳا نتيجا نہ مليا",
        "</div>",
        "",
    )
    return "\n".join(markup_lines)
def _build_summary(entities):
    """Return the summary panel: one card per entity type with its count.

    Cards are ordered by descending count; types with equal counts keep the
    order in which they first appear in *entities*. With no entities the
    empty-summary placeholder is returned instead.
    """
    if not entities:
        return _empty_summary()

    def render_card(etype, cnt):
        cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG)
        color = cfg["color"]
        return "\n".join((
            "",
            '<div style="',
            f"background:{cfg['bg']};border:1px solid {color}40;",
            "border-radius:10px;padding:10px 14px;",
            "display:flex;justify-content:space-between;",
            'align-items:center;margin-bottom:8px;direction:rtl;">',
            f'<span style="color:{color};font-weight:600;font-size:1em;',
            "font-family:'Lateef','Scheherazade New',serif;\">",
            f"{cfg['sindhi']}",
            "</span>",
            '<span style="',
            f"background:{color};color:#0a0a1a;",
            "font-weight:800;border-radius:20px;",
            "padding:1px 10px;font-size:0.85em;",
            'min-width:24px;text-align:center;">',
            f"{cnt}",
            "</span>",
            "</div>",
            "",
        ))

    per_type = Counter(ent["type"] for ent in entities)
    cards = "".join(render_card(etype, cnt) for etype, cnt in per_type.most_common())

    # There is no "total" header; the cards are emitted directly.
    return "\n".join((
        "",
        '<div style="',
        "background:linear-gradient(135deg,#1a0533,#0f0f2e);",
        "border:1px solid #7c3aed30;border-radius:16px;",
        'padding:16px 14px;">',
        cards,
        "</div>",
        "",
    ))
def _build_confidence(entities):
    """Return the confidence panel: one labelled progress bar per entity.

    The entity text originates from user input, so it is HTML-escaped before
    being placed in the markup; otherwise typed markup would be interpreted
    by the browser.

    Args:
        entities: list of dicts with at least the keys ``"text"``,
            ``"type"``, ``"sindhi"`` and ``"score"`` (a fraction; it is
            multiplied by 100 and truncated to a whole percentage).

    Returns:
        The panel markup, or ``""`` when *entities* is empty.
    """
    if not entities:
        return ""

    def render_bar(ent):
        cfg = ENTITY_CONFIG.get(ent["type"], FALLBACK_CFG)
        color = cfg["color"]
        pct = int(ent["score"] * 100)
        return "\n".join((
            "",
            '<div style="margin-bottom:16px;direction:rtl;">',
            '<div style="display:flex;justify-content:space-between;',
            'align-items:center;margin-bottom:6px;">',
            '<span style="color:#e2e8f0;font-size:1.1em;font-weight:500;',
            "font-family:'Lateef','Scheherazade New',serif;\">",
            html.escape(ent["text"]),
            "</span>",
            '<div style="display:flex;gap:8px;align-items:center;">',
            '<span style="',
            f"background:{color}18;",
            f"border:1px solid {color}40;",
            f"color:{color};",
            "font-size:0.85em;padding:2px 8px;",
            "border-radius:4px;font-weight:700;",
            "font-family:'Lateef','Scheherazade New',serif;\">",
            f"{ent['sindhi']}",
            "</span>",
            f'<span style="color:{color};',
            "font-weight:800;font-size:0.88em;",
            'font-family:monospace;">',
            f"{pct}%",
            "</span>",
            "</div>",
            "</div>",
            '<div style="background:#1e1040;border-radius:999px;',
            'height:5px;overflow:hidden;">',
            f'<div style="width:{pct}%;height:100%;',
            f"background:linear-gradient(90deg,{color}60,{color});",
            'border-radius:999px;">',
            "</div>",
            "</div>",
            "</div>",
            "",
        ))

    bars = "".join(render_bar(ent) for ent in entities)
    return "\n".join((
        "",
        '<div style="',
        "background:linear-gradient(135deg,#1a0533,#0f0f2e);",
        "border:1px solid #7c3aed30;border-radius:16px;",
        'padding:20px 18px;margin-top:4px;">',
        '<div style="color:#c084fc;font-weight:700;font-size:0.9em;',
        "margin-bottom:16px;padding-bottom:10px;",
        "border-bottom:1px solid #7c3aed25;",
        "direction:rtl;text-align:right;",
        "font-family:'Lateef','Scheherazade New',serif;\">",
        "اعتماد",
        "</div>",
        bars,
        "</div>",
        "",
    ))
def _build_legend(entities):
    """Return the legend panel listing only the entity types in *entities*.

    Each type appears once, in order of first occurrence.
    """
    chips = []
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for etype in dict.fromkeys(ent["type"] for ent in entities):
        cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG)
        color = cfg["color"]
        chips.append(
            f'<span style="background:{cfg["bg"]};'
            f'border:1px solid {color}40;'
            f'color:{color};padding:5px 14px;'
            'border-radius:6px;font-size:1em;font-weight:600;'
            "font-family:'Lateef','Scheherazade New',serif;\">"
            f'{cfg["sindhi"]}</span>'
        )
    items = "".join(chips)
    return "\n".join((
        "",
        '<div style="',
        "background:linear-gradient(135deg,#1a0533,#0f0f2e);",
        "border:1px solid #7c3aed20;border-radius:14px;",
        'padding:14px 18px;margin-top:4px;">',
        '<div style="display:flex;flex-wrap:wrap;gap:8px;direction:rtl;">',
        items,
        "</div>",
        "</div>",
        "",
    ))
def _build_csv(entities):
    """Write the entities to a CSV file and return its path.

    Each call writes into a freshly created temporary directory, so
    simultaneous requests never overwrite one another's export, and the file
    itself keeps the fixed name ``sindhi_ner.csv``. The file is encoded as
    UTF-8 with a BOM (``utf-8-sig``).

    Args:
        entities: list of dicts with the keys ``"text"``, ``"type"``,
            ``"sindhi"`` and ``"score"`` (a fraction, written as a
            percentage with one decimal place).

    Returns:
        The path of the written file as a string, or ``None`` when
        *entities* is empty.
    """
    if not entities:
        return None

    # The temporary directory is not removed here; the caller still needs the
    # file after this function returns.
    export_dir = tempfile.mkdtemp(prefix="sindhi_ner_")
    path = os.path.join(export_dir, "sindhi_ner.csv")
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Entity", "Type", "Sindhi Type", "Confidence"])
        writer.writerows(
            [ent["text"], ent["type"], ent["sindhi"], f"{ent['score'] * 100:.1f}%"]
            for ent in entities
        )
    return path
# Stylesheet passed to gr.Blocks(css=...). It imports the Lateef,
# Scheherazade New and Outfit web fonts, applies a dark purple theme, and
# renders the textarea and example rows right-to-left.
# NOTE(review): the first rule sets font-family on `*` with !important; this
# may override the inline Lateef font-family declarations in the HTML emitted
# by the builder functions — confirm the rendered fonts in a browser.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Lateef:wght@400;700&family=Scheherazade+New:wght@400;700&family=Outfit:wght@400;600;700;800&display=swap');
/* Base — Outfit for UI chrome only */
*, body, .gradio-container {
font-family: 'Outfit', sans-serif !important;
}
body, .gradio-container {
background: #08081a !important;
}
.gradio-container {
max-width: 980px !important;
margin: 0 auto !important;
padding: 16px !important;
}
/* Labels */
label > span {
color: #9333ea !important;
font-size: 0.82em !important;
font-weight: 700 !important;
letter-spacing: 0.8px !important;
text-transform: uppercase !important;
font-family: 'Outfit', sans-serif !important;
}
/* Textarea — Lateef font, large size */
textarea {
background: #130825 !important;
border: 1px solid #6d28d960 !important;
border-radius: 14px !important;
color: #e2e8f0 !important;
font-size: 1.4em !important;
direction: rtl !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
caret-color: #c084fc !important;
line-height: 2.2em !important;
padding: 14px 16px !important;
}
textarea:focus {
border-color: #c084fc !important;
box-shadow: 0 0 0 3px #7c3aed15 !important;
outline: none !important;
}
textarea::placeholder {
color: #4c1d95 !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
font-size: 1em !important;
}
/* Search button */
button.primary {
background: linear-gradient(135deg, #6d28d9, #9333ea, #c084fc) !important;
border: none !important;
border-radius: 12px !important;
color: #fff !important;
font-weight: 800 !important;
font-size: 1em !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
letter-spacing: 0.5px !important;
transition: all 0.3s ease !important;
padding: 14px !important;
width: 100% !important;
margin-top: 8px !important;
}
button.primary:hover {
box-shadow: 0 6px 24px #7c3aed50 !important;
transform: translateY(-1px) !important;
}
/* Examples — below button, clean look */
.examples-holder {
background: transparent !important;
border: none !important;
padding: 0 !important;
margin-top: 10px !important;
}
.examples-holder > .label-wrap {
display: none !important;
}
.examples table {
background: #130825 !important;
border: 1px solid #6d28d930 !important;
border-radius: 10px !important;
width: 100% !important;
}
.examples table thead {
display: none !important;
}
.examples table td {
color: #94a3b8 !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
font-size: 1.15em !important;
direction: rtl !important;
text-align: right !important;
padding: 8px 14px !important;
border-bottom: 1px solid #1e1040 !important;
}
.examples table tr:last-child td {
border-bottom: none !important;
}
.examples table tr:hover td {
color: #c084fc !important;
background: #1a0533 !important;
cursor: pointer !important;
}
/* File download */
.file-preview {
background: #130825 !important;
border: 1px solid #6d28d940 !important;
border-radius: 10px !important;
}
/* Scrollbar */
::-webkit-scrollbar { width: 5px; }
::-webkit-scrollbar-track { background: #08081a; }
::-webkit-scrollbar-thumb { background: #6d28d9; border-radius: 3px; }
"""
# Static banner rendered at the top of the page: the Sindhi title in an <h1>
# with an English subtitle beneath it, over a gradient card with a
# non-interactive radial-glow overlay.
HEADER = """
<div style="
background:linear-gradient(135deg,#1a0533 0%,#0f0f2e 60%,#160a2e 100%);
border:1px solid #7c3aed25;border-radius:20px;
padding:28px 28px 22px;margin-bottom:20px;
text-align:center;position:relative;overflow:hidden;">
<div style="
position:absolute;top:0;left:0;right:0;bottom:0;
background:radial-gradient(ellipse at 50% 0%,#7c3aed12 0%,transparent 65%);
pointer-events:none;"></div>
<div style="position:relative;">
<h1 style="
color:#f1f5f9;font-size:2em;font-weight:800;
margin:0 0 4px;letter-spacing:-1px;
text-shadow:0 0 40px #7c3aed50;
font-family:'Lateef','Scheherazade New',serif;">
سنڌي اسمن جي سڃاڻپ
</h1>
<p style="
font-family:'Outfit',sans-serif;
color:#6d28d9;font-size:0.72em;
letter-spacing:3px;margin:0;">
SINDHI NAMED ENTITY RECOGNITION
</p>
</div>
</div>
"""
# Sample sentences offered under the input box. gr.Examples receives one row
# per example, and each row holds a single value for the one input component.
EXAMPLES = [
    [sentence]
    for sentence in (
        "شيخ اياز شڪارپور ۾ پيدا ٿيو",
        "يونيورسٽي آف سنڌ، حيدرآباد ۾ آھي",
        "سيد مراد علي شاھ سنڌ جو وڏو وزير آھي، سندس تعلق پاڪستان پيپلز پارٽي سان آھي",
        "پاڪستان ۽ ڀارت جي ويڙھ 2025ع ۾ لڳي",
        "ڊاڪٽر نبي بخش بلوچ 16 ڊسمبر 1917ع تي سنجھوري ۾ پيدا ٿيو",
        "بينظير ڀٽو پاڪستان جي پھرين عورت وزيراعظم هئي",
    )
]
# Assemble the Gradio page and start the app.
# NOTE(review): the nesting below was reconstructed from the section comments
# because the original indentation is not visible; in particular, confirm
# whether the 6px spacer belongs after the row or inside the right column.
FOOTER = (
    "\n"
    '<div style="text-align:center;padding:16px 0 4px;\n'
    "color:#3b0764;font-size:0.72em;letter-spacing:1.5px;\n"
    "font-family:'Outfit',sans-serif;\">\n"
    "hellosindh · sindhi-bert-ner · MIT License\n"
    "</div>\n"
)

with gr.Blocks(css=CSS, title="سنڌي NER") as demo:
    gr.HTML(HEADER)

    with gr.Row():
        # Left column: input box, then the search button, then the examples.
        with gr.Column(scale=3):
            inp = gr.Textbox(
                label="سنڌي جملو لکو",
                placeholder="شيخ اياز شڪارپور ۾ پيدا ٿيو...",
                lines=4,
                rtl=True,
            )
            btn = gr.Button("🔍 ڳوليو", variant="primary")
            # Examples sit below the button.
            gr.Examples(examples=EXAMPLES, inputs=inp, label=None)

        # Right column: per-type summary cards.
        with gr.Column(scale=2):
            summary_out = gr.HTML(value=_empty_summary())

    gr.HTML("<div style='height:6px'></div>")

    # Sentence with entities highlighted.
    highlighted_out = gr.HTML(value=_empty_html())
    # Confidence bars.
    conf_out = gr.HTML()
    # Legend: hidden until a search returns entities; has no header text.
    legend_out = gr.HTML(visible=False)
    # CSV download.
    csv_out = gr.File(
        label="📥 ڊائونلوڊ ڪريو (CSV)",
        file_types=[".csv"],
        interactive=False,
    )

    gr.HTML(FOOTER)

    # Clicking the button and pressing Enter in the textbox both run the same
    # prediction; the order here matches predict_ner's return tuple.
    result_components = [highlighted_out, summary_out, conf_out, csv_out, legend_out]
    for register in (btn.click, inp.submit):
        register(fn=predict_ner, inputs=inp, outputs=result_components)

demo.launch()