"""Gradio demo: Arabic named-entity recognition with a CAMeLBERT model.

The script loads a Hugging Face token-classification pipeline, repairs the
model's label names if they are generic placeholders, and serves a small UI
that highlights the detected entities and lists them in a table.
"""

import gradio as gr
from transformers import AutoConfig, pipeline

MODEL_ID = "CIAZIZ/arabic-ner-camelbert-wikiann"  # model repo on the Hub

ner = pipeline(
    "token-classification",
    model=MODEL_ID,
    aggregation_strategy="simple",
)

# --- make sure labels are correct (no LABEL_0) ---
# If the checkpoint has no label names, or only generic "LABEL_<n>" ones,
# install the label set below so the output uses PER/ORG/LOC names.
cfg = ner.model.config
WIKIANN_LABELS = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
if not getattr(cfg, "id2label", None) or any(
    str(v).startswith("LABEL_") for v in cfg.id2label.values()
):
    cfg.id2label = {i: lab for i, lab in enumerate(WIKIANN_LABELS)}
    cfg.label2id = {lab: i for i, lab in enumerate(WIKIANN_LABELS)}

# Entity group -> Arabic display name shown in the results table.
AR_MAP = {"PER": "شخص", "ORG": "منظمة", "LOC": "مكان", "MISC": "كيان عام"}
# Entity group -> highlight colour used by gr.HighlightedText.
COLOR_MAP = {"PER": "#2f80ed", "ORG": "#9b51e0", "LOC": "#27ae60", "MISC": "#f2994a"}


def _entity_label(ent):
    """Return the entity's label: ``entity_group`` if set, else ``entity``."""
    return ent.get("entity_group") or ent.get("entity")


def merge_touching_spans(ents):
    """Merge adjacent spans that carry the same entity label.

    Two spans are merged when they have the same label and the second one
    starts exactly where the first one ends (e.g. the word pieces
    أر + ام + كو become the single span أرامكو).

    Args:
        ents: Entity dicts with at least ``start`` and ``end`` offsets, and
            optionally ``score`` and ``entity_group``/``entity``.

    Returns:
        A new list of entity dicts sorted by ``start``. A merged span keeps
        the keys of its first piece, with ``end`` extended to the last piece
        and ``score`` set to the mean score of all merged pieces. Other keys
        (such as a ``word`` field) are NOT recomputed; slice the source text
        with ``start``/``end`` instead. An empty/falsy ``ents`` is returned
        unchanged.
    """
    if not ents:
        return ents

    merged = []
    # Parallel to ``merged``: the scores of every piece folded into each span.
    # Keeping them all lets us take a true mean; a pairwise running average
    # would weight later pieces more heavily once three or more are merged.
    piece_scores = []

    for ent in sorted(ents, key=lambda x: x["start"]):
        if merged:
            last = merged[-1]
            same_label = _entity_label(last) == _entity_label(ent)
            if same_label and ent["start"] == last["end"]:
                last["end"] = ent["end"]  # extend the previous span
                piece_scores[-1].append(float(ent.get("score", 0)))
                continue
        merged.append(dict(ent))  # copy so the caller's dicts are untouched
        piece_scores.append([float(ent.get("score", 0))])

    for span, scores in zip(merged, piece_scores):
        if len(scores) > 1:  # leave unmerged spans exactly as they came in
            span["score"] = sum(scores) / len(scores)
    return merged


def to_segments(text, ents):
    """Split ``text`` into ``(span_text, label_or_None)`` pairs.

    The result is the list-of-tuples form passed to ``gr.HighlightedText``:
    labelled tuples for entity spans and ``None``-labelled tuples for the
    text in between. Concatenating all ``span_text`` values yields ``text``
    when the offsets lie within it.

    Args:
        text: The original input string.
        ents: Entity dicts with ``start``/``end`` offsets into ``text``.

    Returns:
        A list of ``(str, label | None)`` tuples in text order.
    """
    segs = []
    cursor = 0  # index of the first character not yet emitted
    for ent in sorted(ents, key=lambda x: x["start"]):
        # Clip to the cursor so overlapping spans never emit text twice.
        start = max(ent["start"], cursor)
        end = ent["end"]
        if end <= start:
            continue  # empty, or fully covered by an earlier span
        if start > cursor:
            segs.append((text[cursor:start], None))
        segs.append((text[start:end], _entity_label(ent)))
        cursor = end
    if cursor < len(text):
        segs.append((text[cursor:], None))
    return segs


def run(text: str):
    """Run NER on ``text`` and build the outputs for the UI.

    Args:
        text: The user's input; ``None``, empty, or whitespace-only input
            produces empty outputs without calling the model.

    Returns:
        A ``(segments, rows)`` tuple: ``segments`` is the output of
        ``to_segments`` and ``rows`` is a list of
        ``[entity_text, arabic_type, score]`` rows, with the score rounded
        to 4 decimals.
    """
    if not text or not text.strip():
        return [], []

    ents = merge_touching_spans(ner(text))  # merge subword pieces
    segs = to_segments(text, ents)

    # Build rows for the table (no 'position' column).
    rows = []
    for ent in ents:
        label = _entity_label(ent) or ""
        rows.append([
            text[ent["start"]:ent["end"]],
            AR_MAP.get(label, label),  # fall back to the raw label
            round(float(ent["score"]), 4),
        ])
    return segs, rows


with gr.Blocks(title="Arabic NER — CAMeLBERT") as demo:
    txt = gr.Textbox(
        label="اكتب النص بالعربي",
        lines=3,
        value="زار محمد بن سلمان مدينة نيوم والتقى بوفد من شركة أرامكو.",
    )
    out_ht = gr.HighlightedText(label="", color_map=COLOR_MAP)  # empty label
    out_tbl = gr.Dataframe(
        headers=["الكيان", "النوع", "الثقة"],  # no 'position' column
        interactive=False,
    )
    gr.Button("تحليل النص").click(run, inputs=txt, outputs=[out_ht, out_tbl])

if __name__ == "__main__":
    demo.launch()