Spaces:

CIAZIZ
/

AER-bert

Sleeping

File size: 3,168 Bytes

import gradio as gr
from transformers import pipeline, AutoConfig

MODEL_ID = "CIAZIZ/arabic-ner-camelbert-wikiann"  # ← your model repo
ner = pipeline("token-classification", model=MODEL_ID, aggregation_strategy="simple")

# --- make sure labels are correct (no LABEL_0) ---
cfg = ner.model.config
WIKIANN_LABELS = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
if not getattr(cfg, "id2label", None) or any(str(v).startswith("LABEL_") for v in cfg.id2label.values()):
    cfg.id2label = {i: lab for i, lab in enumerate(WIKIANN_LABELS)}
    cfg.label2id = {lab: i for i, lab in enumerate(WIKIANN_LABELS)}

AR_MAP    = {"PER": "شخص", "ORG": "منظمة", "LOC": "مكان", "MISC": "كيان عام"}
COLOR_MAP = {"PER": "#2f80ed", "ORG": "#9b51e0", "LOC": "#27ae60", "MISC": "#f2994a"}

def merge_touching_spans(ents):
    """Merge adjacent spans with the SAME entity group (fixes أر + ام + كو → أرامكو)."""
    if not ents:
        return ents
    ents = sorted(ents, key=lambda x: x["start"])
    merged = [dict(ents[0])]
    for e in ents[1:]:
        last = merged[-1]
        g_last = last.get("entity_group") or last.get("entity")
        g_cur  = e.get("entity_group")  or e.get("entity")
        if g_last == g_cur and e["start"] == last["end"]:
            # extend the previous span
            last["end"] = e["end"]
            # average confidence (simple and robust)
            last["score"] = (float(last.get("score", 0)) + float(e.get("score", 0))) / 2.0
        else:
            merged.append(dict(e))
    return merged

def to_segments(text, ents):
    """Return [(span_text, label_or_None), ...] for gr.HighlightedText."""
    ents = sorted(ents, key=lambda x: x["start"])
    segs, i = [], 0
    for e in ents:
        if e["start"] > i:
            segs.append((text[i:e["start"]], None))
        group = e.get("entity_group", e.get("entity"))
        segs.append((text[e["start"]:e["end"]], group))
        i = e["end"]
    if i < len(text):
        segs.append((text[i:], None))
    return segs

def run(text: str):
    if not text.strip():
        return [], []
    out = ner(text)
    out = merge_touching_spans(out)            # ← merge subword pieces
    segs = to_segments(text, out)

    # Build rows for the table (no 'position' column)
    rows = []
    for e in out:
        group = e.get("entity_group", e.get("entity", ""))
        rows.append([
            text[e["start"]:e["end"]],
            AR_MAP.get(group, group),
            round(float(e["score"]), 4),
        ])
    return segs, rows

with gr.Blocks(title="Arabic NER — CAMeLBERT") as demo:
    txt = gr.Textbox(label="اكتب النص بالعربي", lines=3,
                     value="زار محمد بن سلمان مدينة نيوم والتقى بوفد من شركة أرامكو.")
    out_ht = gr.HighlightedText(label="", color_map=COLOR_MAP)  # ← empty label
    out_tbl = gr.Dataframe(headers=["الكيان","النوع","الثقة"], interactive=False)  # ← no 'الموضع'
    gr.Button("تحليل النص").click(run, inputs=txt, outputs=[out_ht, out_tbl])

if __name__ == "__main__":
    demo.launch()