Spaces:

optimopium
/

NER-Persian-LLM-Based

Running

App Files Files Community

optimopium committed on Nov 4

Commit

a39d4c2

verified ·

1 Parent(s): 67ad485

Update app.py

Browse files

Files changed (1) hide show

app.py +165 -107

app.py CHANGED Viewed

@@ -1,119 +1,177 @@
-# app.py — Persian Zero-Shot NER (CPU) with few-shot prompting + beams
-import re, json, gradio as gr
-from typing import Dict, Any, List
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-MODEL_ID = "google/mt5-small"   # try "google/mt5-base" on CPU if still empty (slower, better)
-ALLOWED_LABELS: List[str] = ["PERSON","ORG","LOC","GPE","DATE","TIME","PRODUCT","EVENT"]
-DEFAULT_EXAMPLE = "من دیروز با علی در تهران در دفتر دیجی‌کالا جلسه داشتم."
-# --- Few-shot examples (in Persian) to nudge the model ---
-FEW_SHOT = """
-نمونه ۱:
-متن: من با علی در تهران در شرکت دیجی‌کالا جلسه داشتم.
-خروجی:
-{"entities":[
-  {"text":"علی","label":"PERSON","start":7,"end":10},
-  {"text":"تهران","label":"LOC","start":14,"end":19},
-  {"text":"دیجی‌کالا","label":"ORG","start":29,"end":37}
-]}
-نمونه ۲:
-متن: سارا فردا ساعت ۱۰ در دانشگاه تهران سخنرانی دارد.
-خروجی:
-{"entities":[
-  {"text":"سارا","label":"PERSON","start":0,"end":4},
-  {"text":"فردا","label":"DATE","start":5,"end":9},
-  {"text":"۱۰","label":"TIME","start":15,"end":17},
-  {"text":"دانشگاه تهران","label":"ORG","start":21,"end":34}
-]}
-"""
-def build_prompt(text: str, labels: List[str]) -> str:
-    return (
-        "متن زیر را برای شناسایی موجودیت‌های نامدار (NER) تحلیل کن.\n"
-        f"لیبل‌های مجاز: {', '.join(labels)}.\n"
-        "فقط JSON معتبر با اسکیمای زیر را برگردان:\n"
-        '{"entities":[{"text":"...", "label":"ORG|PERSON|...", "start":0, "end":0}]}\n'
-        "هیچ متن دیگری ننویس؛ فقط JSON.\n"
-        + FEW_SHOT +
-        "\nاکنون متن زیر را پردازش کن و فقط JSON بده:\n"
-        f"متن: {text}\n"
-        "خروجی:\n"
-    )
-def extract_first_json(s: str) -> Dict[str, Any]:
-    m = re.search(r"\{[\s\S]*\}", s)
-    if not m:
-        return {"entities": []}
-    raw = m.group(0)
-    try:
-        return json.loads(raw)
-    except Exception:
-        raw = re.sub(r",\s*}", "}", raw)
-        raw = re.sub(r",\s*]", "]", raw)
-        try:
-            return json.loads(raw)
-        except Exception:
-            return {"entities": []}
-def normalize_entities(data: Dict[str, Any], text: str, labels: List[str]) -> Dict[str, Any]:
-    text_norm = text or ""
-    out = []
-    for e in data.get("entities", []):
-        try:
-            t = str(e.get("text","")).strip()
-            lab = str(e.get("label","")).strip().upper()
-            if not t or lab not in labels:
-                continue
-            st, en = e.get("start"), e.get("end")
-            if not isinstance(st, int) or not isinstance(en, int) or st < 0 or en < 0:
-                idx = text_norm.find(t)
-                st, en = (idx, idx+len(t)) if idx >= 0 else (0, 0)
-            out.append({"text": t, "label": lab, "start": int(st), "end": int(en)})
-        except Exception:
-            pass
-    return {"entities": out}
-# lazy CPU load
-_tokenizer = None
-_model = None
-def load_model():
-    global _tokenizer, _model
-    if _tokenizer is None or _model is None:
-        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
-        _model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
-    return _tokenizer, _model
-def ner_infer(text: str, max_new_tokens: int = 256, num_beams: int = 4) -> Dict[str, Any]:
-    text = (text or "").strip()
-    if not text:
-        return {"entities": []}
-    tok, model = load_model()
-    prompt = build_prompt(text, ALLOWED_LABELS)
-    inputs = tok(prompt, return_tensors="pt")   # CPU
-    gen_ids = model.generate(
-        **inputs,
-        max_new_tokens=int(max_new_tokens),
-        do_sample=False,          # deterministic
-        num_beams=int(num_beams), # stronger decoding than greedy on CPU
-        length_penalty=1.05,
-        pad_token_id=tok.pad_token_id,
-        eos_token_id=tok.eos_token_id,
-    )
-    out_text = tok.decode(gen_ids[0], skip_special_tokens=True)
-    raw = extract_first_json(out_text)
-    return normalize_entities(raw, text, ALLOWED_LABELS)
-with gr.Blocks(title="Persian Zero-Shot NER (CPU)") as demo:
-    gr.Markdown("## Persian Zero-Shot NER — CPU (mT5) + Few-Shot Prompting")
-    inp = gr.Textbox(label="متن فارسی", lines=4, value=DEFAULT_EXAMPLE)
     with gr.Row():
-        max_tok = gr.Slider(96, 512, value=256, step=16, label="حداکثر توکن خروجی")
-        beams = gr.Slider(1, 8, value=4, step=1, label="Beam size")
-    btn = gr.Button("استخراج موجودیت‌ها")
-    out = gr.JSON(label="خروجی JSON")
-    btn.click(fn=ner_infer, inputs=[inp, max_tok, beams], outputs=out)
 if __name__ == "__main__":
-    demo.launch()

+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import torch
+# Inference target; the string form is used for model.to() below.
+device = "cpu"
+# Load the model and tokenizer.
+# These are top-level statements, so loading happens once when the module is
+# executed rather than on every request.
+model_name = "HooshvareLab/bert-base-parsbert-ner-uncased"
+print("Loading model and tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForTokenClassification.from_pretrained(model_name)
+model.to(device)
+# Create the token-classification pipeline shared by every request.
+ner_pipeline = pipeline(
+    "ner",
+    model=model,
+    tokenizer=tokenizer,
+    device=-1,  # -1 means CPU
+    aggregation_strategy="simple"  # Groups entities together
+)
+# Background colour per BIO tag, used when highlighting entities.
+# highlight_entities() looks colours up as f"B-{entity_group}", so only the
+# "B-" entries are consulted there; unknown groups fall back to grey.
+# NOTE(review): the PER/ORG/LOC/DAT/TIM/MON/PCT tag set is assumed to match
+# the entity groups the loaded model emits - TODO confirm against the
+# model's id2label mapping.
+label_colors = {
+    "B-PER": "#FF6B6B",  # Person - Red
+    "I-PER": "#FFB3B3",  # Person continuation - Light Red
+    "B-ORG": "#4ECDC4",  # Organization - Teal
+    "I-ORG": "#A7E9E4",  # Organization continuation - Light Teal
+    "B-LOC": "#95E1D3",  # Location - Green
+    "I-LOC": "#C7F0E8",  # Location continuation - Light Green
+    "B-DAT": "#FFA07A",  # Date - Orange
+    "I-DAT": "#FFDAB9",  # Date continuation - Light Orange
+    "B-TIM": "#DDA0DD",  # Time - Purple
+    "I-TIM": "#E6D0E6",  # Time continuation - Light Purple
+    "B-MON": "#FFD700",  # Money - Gold
+    "I-MON": "#FFEB99",  # Money continuation - Light Gold
+    "B-PCT": "#87CEEB",  # Percent - Sky Blue
+    "I-PCT": "#B3DFEF",  # Percent continuation - Light Sky Blue
+}
+# Bilingual (Persian / English) display name per entity group, used in the
+# entity table; groups missing here are shown under their raw name.
+label_names = {
+    "PER": "شخص (Person)",
+    "ORG": "سازمان (Organization)",
+    "LOC": "مکان (Location)",
+    "DAT": "تاریخ (Date)",
+    "TIM": "زمان (Time)",
+    "MON": "پول (Money)",
+    "PCT": "درصد (Percent)",
+}
+def highlight_entities(text, entities):
+    """Render ``text`` as HTML with every detected entity wrapped in a coloured span.
+
+    Args:
+        text: The original input string that the entity offsets index into.
+        entities: Iterable of dicts carrying ``start``, ``end``,
+            ``entity_group`` and ``score`` keys.
+
+    Returns:
+        An HTML fragment. Everything copied from ``text`` and every entity
+        label is HTML-escaped, so user input cannot inject markup. With no
+        entities the escaped text is returned unchanged otherwise.
+    """
+    import html  # local import keeps this block independent of the file-level imports
+
+    if not entities:
+        return html.escape(text, quote=False)
+    # Spans without usable offsets cannot be located in ``text``; drop them.
+    spans = [
+        e for e in entities
+        if e.get('start') is not None and e.get('end') is not None
+    ]
+    pieces = []
+    cursor = 0
+    # Build the output left to right from escaped slices instead of splicing
+    # into one string, so offsets always refer to the untouched input.
+    for entity in sorted(spans, key=lambda x: x['start']):
+        start = entity['start']
+        end = entity['end']
+        # Skip empty spans and spans overlapping one already emitted; nesting
+        # them would produce broken markup.
+        if start < cursor or end <= start:
+            continue
+        group = str(entity['entity_group'])
+        label = html.escape(group)
+        word = html.escape(text[start:end], quote=False)
+        score = entity['score']
+        # Get color for this label (grey when the group is unknown)
+        color = label_colors.get(f"B-{group}", "#CCCCCC")
+        pieces.append(html.escape(text[cursor:start], quote=False))
+        pieces.append(f'<span style="background-color: {color}; padding: 2px 6px; border-radius: 3px; margin: 0 2px; display: inline-block;" title="{label} (confidence: {score:.2f})">{word} <sup style="font-size: 0.7em; font-weight: bold;">[{label}]</sup></span>')
+        cursor = end
+    pieces.append(html.escape(text[cursor:], quote=False))
+    return "".join(pieces)
+def perform_ner(text):
+    """Run the NER pipeline on ``text`` and build the two UI outputs.
+
+    Args:
+        text: Raw user input; ``None`` and whitespace-only values are
+            treated as empty input.
+
+    Returns:
+        A ``(html, markdown)`` tuple: the highlighted text wrapped in a
+        right-to-left ``<div>``, and a Markdown table of the detected
+        entities (or a "no entities" message). On empty input or on any
+        failure the first element is a red error paragraph and the second
+        is ``""``.
+    """
+    import html  # local import keeps this block independent of the file-level imports
+
+    # Guard against None as well as blank input so .strip() cannot raise.
+    if not text or not text.strip():
+        return "<p style='color: red;'>لطفا متن فارسی وارد کنید (Please enter Persian text)</p>", ""
+    try:
+        # Perform NER
+        entities = ner_pipeline(text)
+        # Create highlighted version
+        highlighted_html = f"<div style='direction: rtl; text-align: right; font-size: 18px; line-height: 2; padding: 20px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9;'>{highlight_entities(text, entities)}</div>"
+        # Create entities table
+        if entities:
+            rows = [
+                "### موجودیت‌های شناسایی شده (Detected Entities):\n\n",
+                "| کلمه (Word) | نوع (Type) | اطمینان (Confidence) |\n",
+                "|------------|-----------|---------------------|\n",
+            ]
+            for ent in entities:
+                label_fa = label_names.get(ent['entity_group'], ent['entity_group'])
+                # A literal pipe would terminate the table cell early.
+                word = str(ent['word']).replace("|", "\\|")
+                rows.append(f"| {word} | {label_fa} | {ent['score']:.2%} |\n")
+            entity_info = "".join(rows)
+        else:
+            entity_info = "هیچ موجودیتی شناسایی نشد (No entities detected)"
+        return highlighted_html, entity_info
+    except Exception as e:
+        # UI boundary: report the failure instead of crashing the handler.
+        # The message is rendered as HTML, so escape it first.
+        return f"<p style='color: red;'>خطا (Error): {html.escape(str(e))}</p>", ""
+# Example texts offered in the UI. Each entry is a one-element list because
+# gr.Examples below is wired to a single input component (input_text).
+examples = [
+    ["باراک اوباما در هاوایی متولد شد و در شیکاگو زندگی می‌کرد."],
+    ["شرکت گوگل در کالیفرنیا واقع شده است."],
+    ["رضا در تهران در تاریخ ۱۵ خرداد ۱۳۸۰ متولد شد."],
+    ["دانشگاه تهران یکی از قدیمی‌ترین دانشگاه‌های ایران است."],
+    ["علی و حسین به همراه مریم به مشهد سفر کردند."],
+]
+# Build the Gradio UI: intro text, an input column and an output column,
+# clickable examples, a colour legend, and the event wiring.
+with gr.Blocks(title="Persian NER - شناسایی موجودیت‌های نامدار فارسی", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🇮🇷 Persian Named Entity Recognition
+    # شناسایی موجودیت‌های نامدار فارسی
+    این سیستم موجودیت‌های نامدار مانند اسامی اشخاص، سازمان‌ها، مکان‌ها، تاریخ‌ها و ... را در متن فارسی شناسایی می‌کند.
+    This system identifies named entities such as person names, organizations, locations, dates, etc. in Persian text.
+    **Model:** ParsBERT-NER (HooshvareLab)
+    **Running on:** CPU (may be slow for long texts)
+    """)
     with gr.Row():
+        # Left column: text input and the submit button.
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="متن فارسی خود را وارد کنید (Enter Persian Text)",
+                placeholder="مثال: رضا در تهران زندگی می‌کند...",
+                lines=5,
+                rtl=True
+            )
+            submit_btn = gr.Button("🔍 تحلیل متن (Analyze Text)", variant="primary")
+        # Right column: the two values returned by perform_ner.
+        with gr.Column():
+            output_html = gr.HTML(label="متن با موجودیت‌های برجسته (Text with Highlighted Entities)")
+            output_entities = gr.Markdown(label="لیست موجودیت‌ها (Entity List)")
+    gr.Examples(
+        examples=examples,
+        inputs=input_text,
+        label="مثال‌ها (Examples)"
+    )
+    # Legend (static text; it is not generated from label_colors)
+    gr.Markdown("""
+    ### راهنمای رنگ‌ها (Color Guide):
+    - 🔴 **PER (شخص)**: اسامی اشخاص / Person names
+    - 🔵 **ORG (سازمان)**: نام سازمان‌ها / Organizations
+    - 🟢 **LOC (مکان)**: نام مکان‌ها / Locations
+    - 🟠 **DAT (تاریخ)**: تاریخ‌ها / Dates
+    - 🟣 **TIM (زمان)**: زمان‌ها / Times
+    - 🟡 **MON (پول)**: مقادیر پولی / Money
+    - 🔷 **PCT (درصد)**: درصدها / Percentages
+    """)
+    # Event handlers: the button click and the textbox submit event both run
+    # perform_ner with the same input and outputs.
+    submit_btn.click(
+        fn=perform_ner,
+        inputs=input_text,
+        outputs=[output_html, output_entities]
+    )
+    input_text.submit(
+        fn=perform_ner,
+        inputs=input_text,
+        outputs=[output_html, output_entities]
+    )
+# Launch the app only when this file is run as a script.
 if __name__ == "__main__":
+    demo.launch()