# Source: Hugging Face Space "optimopium/NER-Persian-LLM-Based"
# Space status when captured: Sleeping
# File size: 7,043 bytes

import html

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Set device to CPU explicitly
device = "cpu"

# Load the model and tokenizer
# The same checkpoint name is passed to both the tokenizer and the model loader.
model_name = "HooshvareLab/bert-base-parsbert-ner-uncased"

# NOTE: everything below runs at import time, so importing this module
# loads the model before any UI code executes.
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model.to(device)

# Create NER pipeline
# The entity dicts it returns are read elsewhere in this file through the
# keys 'start', 'end', 'entity_group', 'score' and 'word'.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # -1 means CPU
    aggregation_strategy="simple"  # Groups entities together
)

# Highlight colours keyed by BIO tag ("B-<type>" / "I-<type>").
# Each palette row is (entity type, colour for the B- tag, colour for the
# I- continuation tag); the comprehension expands a row into both keys,
# B- first, keeping the entity types in the order listed.
label_colors = {
    f"{prefix}-{entity_type}": shade
    for entity_type, begin_shade, inside_shade in (
        ("PER", "#FF6B6B", "#FFB3B3"),  # Person - red / light red
        ("ORG", "#4ECDC4", "#A7E9E4"),  # Organization - teal / light teal
        ("LOC", "#95E1D3", "#C7F0E8"),  # Location - green / light green
        ("DAT", "#FFA07A", "#FFDAB9"),  # Date - orange / light orange
        ("TIM", "#DDA0DD", "#E6D0E6"),  # Time - purple / light purple
        ("MON", "#FFD700", "#FFEB99"),  # Money - gold / light gold
        ("PCT", "#87CEEB", "#B3DFEF"),  # Percent - sky blue / light sky blue
    )
    for prefix, shade in (("B", begin_shade), ("I", inside_shade))
}

# Bilingual (Persian + English) display name for each entity type code,
# used when rendering the detected-entities table.
label_names = dict(
    PER="شخص (Person)",
    ORG="سازمان (Organization)",
    LOC="مکان (Location)",
    DAT="تاریخ (Date)",
    TIM="زمان (Time)",
    MON="پول (Money)",
    PCT="درصد (Percent)",
)

def highlight_entities(text, entities):
    """Return *text* as HTML with each detected entity wrapped in a coloured span.

    All text taken from the input (plain segments, entity words) and the
    entity label are HTML-escaped, so user input containing ``<``, ``>``,
    ``&`` or quotes is displayed literally instead of being interpreted as
    markup.

    Args:
        text: The original input string the entity offsets refer to.
        entities: Iterable of entity dicts with the keys ``start``, ``end``,
            ``entity_group`` and ``score``. May be empty or ``None``.

    Returns:
        An HTML fragment (``str``). When there are no entities this is just
        the escaped text.
    """
    if not entities:
        return html.escape(text)

    # Offsets can be missing (None) when the tokenizer does not provide
    # them; such entities cannot be positioned in the text, so skip them.
    placeable = [
        ent for ent in entities
        if ent.get('start') is not None and ent.get('end') is not None
    ]
    placeable.sort(key=lambda ent: ent['start'])

    pieces = []
    cursor = 0  # index in `text` up to which output has been emitted
    for entity in placeable:
        start = entity['start']
        end = entity['end']
        if start < cursor:
            # Overlaps a span that was already emitted; nesting spans would
            # duplicate text, so keep only the earlier entity.
            continue

        label = entity['entity_group']
        score = entity['score']

        # Only the "B-" shade is used: aggregated groups carry the bare
        # entity type without a BIO prefix.
        color = label_colors.get(f"B-{label}", "#CCCCCC")

        safe_label = html.escape(str(label))
        safe_word = html.escape(text[start:end])

        # Plain text between the previous entity and this one.
        pieces.append(html.escape(text[cursor:start]))
        pieces.append(
            f'<span style="background-color: {color}; padding: 2px 6px; border-radius: 3px; margin: 0 2px; display: inline-block;" title="{safe_label} (confidence: {score:.2f})">{safe_word} <sup style="font-size: 0.7em; font-weight: bold;">[{safe_label}]</sup></span>'
        )
        cursor = end

    # Trailing text after the last entity.
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)

def perform_ner(text):
    """Run NER on *text* and format the result for the two output widgets.

    Args:
        text: Raw text from the input box. ``None``, empty and
            whitespace-only values are all treated as missing input.

    Returns:
        A ``(html, markdown)`` tuple. ``html`` is the input text with
        entities highlighted (or a red error/prompt paragraph); ``markdown``
        is a table of detected entities, a "no entities" message, or ``""``
        when there is nothing to show.
    """
    # Check for None before calling .strip(); this guard sits outside the
    # try block, so an AttributeError here would not be caught below.
    if not text or not text.strip():
        return "<p style='color: red;'>لطفا متن فارسی وارد کنید (Please enter Persian text)</p>", ""

    try:
        # Perform NER
        entities = ner_pipeline(text)

        # Create highlighted version
        highlighted_html = f"<div style='direction: rtl; text-align: right; font-size: 18px; line-height: 2; padding: 20px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9;'>{highlight_entities(text, entities)}</div>"

        # Create entities table
        if entities:
            table_lines = [
                "### موجودیت‌های شناسایی شده (Detected Entities):\n\n",
                "| کلمه (Word) | نوع (Type) | اطمینان (Confidence) |\n",
                "|------------|-----------|---------------------|\n",
            ]
            # Fall back to the raw type code when it has no display name.
            table_lines.extend(
                f"| {ent['word']} | {label_names.get(ent['entity_group'], ent['entity_group'])} | {ent['score']:.2%} |\n"
                for ent in entities
            )
            entity_info = "".join(table_lines)
        else:
            entity_info = "هیچ موجودیتی شناسایی نشد (No entities detected)"

        return highlighted_html, entity_info

    except Exception as e:
        # UI boundary: report the failure in the page instead of raising.
        # The message is escaped because it is interpolated into HTML and
        # may contain markup characters.
        return f"<p style='color: red;'>خطا (Error): {html.escape(str(e))}</p>", ""

# Sample Persian sentences offered as examples for the input box.
# Each sentence is wrapped in its own one-element list, giving one row
# per example with a single value in it.
examples = [
    [sentence]
    for sentence in (
        "باراک اوباما در هاوایی متولد شد و در شیکاگو زندگی می‌کرد.",
        "شرکت گوگل در کالیفرنیا واقع شده است.",
        "رضا در تهران در تاریخ ۱۵ خرداد ۱۳۸۰ متولد شد.",
        "دانشگاه تهران یکی از قدیمی‌ترین دانشگاه‌های ایران است.",
        "علی و حسین به همراه مریم به مشهد سفر کردند.",
    )
]

# Create Gradio interface
# Components are created inside nested context managers; the Row holds two
# Columns (input controls first, result widgets second).
with gr.Blocks(title="Persian NER - شناسایی موجودیت‌های نامدار فارسی", theme=gr.themes.Soft()) as demo:
    # Page header: bilingual title, short description, model and runtime note.
    gr.Markdown("""
    # 🇮🇷 Persian Named Entity Recognition
    # شناسایی موجودیت‌های نامدار فارسی
    
    این سیستم موجودیت‌های نامدار مانند اسامی اشخاص، سازمان‌ها، مکان‌ها، تاریخ‌ها و ... را در متن فارسی شناسایی می‌کند.
    
    This system identifies named entities such as person names, organizations, locations, dates, etc. in Persian text.
    
    **Model:** ParsBERT-NER (HooshvareLab)  
    **Running on:** CPU (may be slow for long texts)
    """)
    
    with gr.Row():
        # First column: right-to-left text input and the submit button.
        with gr.Column():
            input_text = gr.Textbox(
                label="متن فارسی خود را وارد کنید (Enter Persian Text)",
                placeholder="مثال: رضا در تهران زندگی می‌کند...",
                lines=5,
                rtl=True
            )
            submit_btn = gr.Button("🔍 تحلیل متن (Analyze Text)", variant="primary")
            
        # Second column: the two outputs that perform_ner fills
        # (highlighted HTML, then the Markdown entity list).
        with gr.Column():
            output_html = gr.HTML(label="متن با موجودیت‌های برجسته (Text with Highlighted Entities)")
            output_entities = gr.Markdown(label="لیست موجودیت‌ها (Entity List)")
    
    # Sample sentences wired to the input textbox.
    gr.Examples(
        examples=examples,
        inputs=input_text,
        label="مثال‌ها (Examples)"
    )
    
    # Legend
    # Static text only; it is not generated from label_colors.
    gr.Markdown("""
    ### راهنمای رنگ‌ها (Color Guide):
    - 🔴 **PER (شخص)**: اسامی اشخاص / Person names
    - 🔵 **ORG (سازمان)**: نام سازمان‌ها / Organizations
    - 🟢 **LOC (مکان)**: نام مکان‌ها / Locations
    - 🟠 **DAT (تاریخ)**: تاریخ‌ها / Dates
    - 🟣 **TIM (زمان)**: زمان‌ها / Times
    - 🟡 **MON (پول)**: مقادیر پولی / Money
    - 🔷 **PCT (درصد)**: درصدها / Percentages
    """)
    
    # Event handler
    # Clicking the button runs perform_ner on the textbox content.
    submit_btn.click(
        fn=perform_ner,
        inputs=input_text,
        outputs=[output_html, output_entities]
    )
    
    # The textbox's own submit event is bound to the same handler and outputs.
    input_text.submit(
        fn=perform_ner,
        inputs=input_text,
        outputs=[output_html, output_entities]
    )

# Launch the app
# Guarded so the server starts only when this file is run as a script;
# importing the module builds `demo` but does not launch it.
if __name__ == "__main__":
    demo.launch()