Spaces:

hellosindh
/

sindhi-ner

Running

App Files Files Community

hellosindh committed 10 days ago

Commit

e3be0eb

verified ·

1 Parent(s): b27b19b

Create app.py

Browse files

Files changed (1) hide show

app.py +207 -0

app.py ADDED Viewed

	@@ -0,0 +1,207 @@

+import gradio as gr
+import torch
+import sentencepiece as spm
+import os
+from transformers import RobertaForTokenClassification
+# ─── Load model & tokenizer ───────────────────────
+# Everything in this section runs once at import time; predict_ner() below
+# reuses the resulting `model` and `sp` objects on every request.
+# Identifier handed to from_pretrained() for the token-classification weights.
+MODEL_PATH = "hellosindh/sindhi-bert-ner"
+# SentencePiece model file handed to sp.Load(); a relative path, so it is
+# presumably expected next to this script / in the working directory.
+SP_MODEL   = "sindhi_bpe_32k.model"
+print("Loading model...", flush=True)
+model = RobertaForTokenClassification.from_pretrained(
+    MODEL_PATH
+)
+# Inference only: switch the module to evaluation mode.
+model.eval()
+print("Loading tokenizer...", flush=True)
+sp = spm.SentencePieceProcessor()
+sp.Load(SP_MODEL)
+# ─── Tag config ───────────────────────────────────
+# Label id → tag string, taken from the loaded model's config.
+ID2TAG = model.config.id2label
+# Ids placed at the start and end of every input sequence in predict_ner().
+# NOTE(review): presumably the special-token ids used at training time — confirm.
+BOS_ID = 2
+EOS_ID = 3
+# Entity colors for highlighting, keyed by entity type (the tag with its
+# "B-" prefix removed). predict_ner() falls back to "#BDC3C7" for any type
+# that is missing here.
+COLORS = {
+    "PERSON":       "#FF6B6B",
+    "LOCATION":     "#4ECDC4",
+    "ORGANIZATION": "#45B7D1",
+    "DATE_TIME":    "#96CEB4",
+    "EVENT":        "#FFEAA7",
+    "LITERARY_WORK":"#DDA0DD",
+    "PROFESSION":   "#98D8C8",
+    "TITLE":        "#F7DC6F",
+    "LANGUAGE":     "#BB8FCE",
+    "FIELD":        "#85C1E9",
+    "LAW":          "#F0B27A",
+    "GROUP":        "#82E0AA",
+    "MISC":         "#BDC3C7",
+}
+# ─── Prediction function ──────────────────────────
+def predict_ner(sentence):
+    """Tag named entities in a whitespace-separated sentence.
+
+    Each word is encoded into SentencePiece ids and the sequence is wrapped
+    in BOS_ID / EOS_ID. The label predicted at a word's first sub-token is
+    used as that word's tag. A ``B-X`` word followed by any number of
+    ``I-X`` words is merged into one entity of type ``X``.
+
+    Args:
+        sentence: Raw text from the UI textbox (may be empty or None).
+
+    Returns:
+        A ``(html, table)`` pair. ``html`` is a right-to-left ``<p>`` snippet
+        in which every entity is wrapped in a coloured ``<mark>``; all text
+        that originates from the user is HTML-escaped. ``table`` is a list
+        of ``[entity_text, entity_type]`` rows holding the unescaped text.
+        Blank input yields ``("", [])``.
+    """
+    # Imported locally so the module-level namespace is left untouched.
+    from html import escape
+
+    if not sentence or not sentence.strip():
+        return "", []
+    words = sentence.split()
+
+    # Upper bound on the sequence length so position lookups stay inside the
+    # embedding table. Two slots are held back for RoBERTa's padding offset.
+    # NOTE(review): assumes the usual padding index of 1 — confirm for this
+    # checkpoint.
+    max_len = getattr(model.config, "max_position_embeddings", 514) - 2
+
+    # Tokenize
+    input_ids = [BOS_ID]
+    word_map = [-1]  # word index at each word's first sub-token, else -1
+    for i, word in enumerate(words):
+        subwords = sp.EncodeAsIds(word)
+        if not subwords:
+            continue
+        # Stop on a word boundary once the next word (plus the closing EOS)
+        # would no longer fit; the remaining words are shown untagged.
+        if len(input_ids) + len(subwords) + 1 > max_len:
+            break
+        input_ids.extend(subwords)
+        word_map.append(i)
+        word_map.extend([-1] * (len(subwords) - 1))
+    input_ids.append(EOS_ID)
+    word_map.append(-1)
+
+    # Run model
+    tensor = torch.tensor([input_ids])
+    with torch.no_grad():
+        logits = model(tensor).logits[0]
+    preds = torch.argmax(logits, dim=-1).tolist()
+
+    # Collect word-level predictions (first sub-token of each word only).
+    word_tags = {
+        wid: ID2TAG[pred]
+        for pred, wid in zip(preds, word_map)
+        if wid >= 0
+    }
+
+    # ─── Build highlighted HTML ───────────────────
+    html_parts = []
+    entities = []
+    i = 0
+    while i < len(words):
+        tag = word_tags.get(i, "O")
+        if not tag.startswith("B-"):
+            # "O" tags, and I- tags with no opening B-, stay plain text.
+            html_parts.append(escape(words[i]))
+            i += 1
+            continue
+
+        entity_type = tag[2:]
+        # Collect the I- continuation words of the same entity type.
+        j = i + 1
+        while j < len(words) and word_tags.get(j, "O") == f"I-{entity_type}":
+            j += 1
+        entity_text = " ".join(words[i:j])
+
+        color = COLORS.get(entity_type, "#BDC3C7")
+        # Escape everything interpolated into markup: the words come straight
+        # from the textbox and the result is rendered as raw HTML.
+        safe_text = escape(entity_text)
+        safe_type = escape(entity_type)
+        html_parts.append(
+            f'<mark style="background:{color}; '
+            f'padding:2px 6px; border-radius:4px; '
+            f'margin:2px; font-weight:bold;" '
+            f'title="{safe_type}">'
+            f'{safe_text} '
+            f'<span style="font-size:0.75em; '
+            f'opacity:0.8;">[{safe_type}]</span>'
+            f'</mark>'
+        )
+        entities.append((entity_text, entity_type))
+        i = j
+
+    markup = (
+        '<p dir="rtl" style="font-size:1.2em; '
+        'line-height:2.5em; text-align:right;">'
+        + " ".join(html_parts)
+        + "</p>"
+    )
+
+    # Build entity table (raw text; the table component is not raw HTML).
+    table = [[text, etype] for text, etype in entities]
+    return markup, table
+# ─── Example sentences ────────────────────────────
+# One single-element row per sample sentence.
+examples = [
+    [text]
+    for text in (
+        "شيخ اياز شڪارپور ۾ پيدا ٿيو",
+        "سنڌ يونيورسٽي حيدرآباد ۾ آھي",
+        "پاڪستان ڏکڻ ايشيا ۾ آھي",
+        "ڊاڪٽر محمد علي 1990ع ۾ سنڌ آيو",
+    )
+]
+# ─── Gradio Interface ─────────────────────────────
+# Layout, top to bottom: intro text, input row, highlighted output,
+# entity table, colour legend, clickable examples.
+with gr.Blocks(theme=gr.themes.Soft(), title="Sindhi NER") as demo:
+    gr.Markdown("""
+    # 🏷️ Sindhi Named Entity Recognizer
+    ### سنڌي نالن جي سڃاڻپ جو اوزار
+    First Sindhi NER model — trained on 22,777 annotated sentences!
+    **Recognizes:** Person · Location · Organization ·
+    Date/Time · Event · Literary Work · and 15 more types
+    """)
+    # Input area: textbox with the submit button directly beneath it.
+    with gr.Row(), gr.Column():
+        text_input = gr.Textbox(
+            label="سنڌي جملو لکو (Enter Sindhi text)",
+            placeholder="شيخ اياز شڪارپور ۾ پيدا ٿيو",
+            lines=3,
+            rtl=True,
+        )
+        submit_btn = gr.Button("🔍 Entities ڳوليو", variant="primary")
+    # Output area: highlighted sentence, then the extracted entities.
+    with gr.Row():
+        highlighted = gr.HTML(label="Highlighted Entities")
+    with gr.Row():
+        entity_table = gr.Dataframe(
+            headers=["Entity", "Type"],
+            label="Entities Found",
+            wrap=True,
+        )
+    # Color legend
+    gr.Markdown("""
+    ### Legend
+    🔴 Person &nbsp; 🟦 Location &nbsp;
+    🔵 Organization &nbsp; 🟢 Date/Time &nbsp;
+    🟡 Event &nbsp; 🟣 Literary Work
+    """)
+    gr.Examples(examples=examples, inputs=text_input)
+    # The button click and pressing Enter in the textbox run the same
+    # prediction with the same inputs and outputs.
+    ner_binding = dict(
+        fn=predict_ner,
+        inputs=text_input,
+        outputs=[highlighted, entity_table],
+    )
+    submit_btn.click(**ner_binding)
+    text_input.submit(**ner_binding)
+demo.launch()