# NOTE: The text that originally preceded this code was Hugging Face Spaces
# file-viewer residue, not part of the program: the Space status ("Sleeping"),
# "File size: 7,043 Bytes", the per-line commit hashes (a39d4c2, ee8ab7e,
# 67ad485, eb83f82) and the 1-177 line-number gutter.
import html

import gradio as gr
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
# Inference is pinned to the CPU.
device = "cpu"

# Checkpoint used for Persian NER; tokenizer and weights come from the same name.
model_name = "HooshvareLab/bert-base-parsbert-ner-uncased"
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model.to(device)

# Shared NER pipeline used by every request.
ner_pipeline = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=model,
    aggregation_strategy="simple",  # groups entities together
    device=-1,  # -1 means CPU
)
# Highlight palette, one row per entity type:
# (type, colour for the "B-" tag, lighter colour for the "I-" continuation tag).
_TAG_PALETTE = (
    ("PER", "#FF6B6B", "#FFB3B3"),  # Person - red
    ("ORG", "#4ECDC4", "#A7E9E4"),  # Organization - teal
    ("LOC", "#95E1D3", "#C7F0E8"),  # Location - green
    ("DAT", "#FFA07A", "#FFDAB9"),  # Date - orange
    ("TIM", "#DDA0DD", "#E6D0E6"),  # Time - purple
    ("MON", "#FFD700", "#FFEB99"),  # Money - gold
    ("PCT", "#87CEEB", "#B3DFEF"),  # Percent - sky blue
)

# Maps each BIO tag ("B-PER", "I-PER", ...) to its background colour.
label_colors = {
    f"{prefix}-{tag}": shade
    for tag, begin_shade, inside_shade in _TAG_PALETTE
    for prefix, shade in (("B", begin_shade), ("I", inside_shade))
}

# Bilingual (Persian + English) display name for each entity type.
label_names = dict(
    PER="شخص (Person)",
    ORG="سازمان (Organization)",
    LOC="مکان (Location)",
    DAT="تاریخ (Date)",
    TIM="زمان (Time)",
    MON="پول (Money)",
    PCT="درصد (Percent)",
)
def highlight_entities(text, entities):
    """Return ``text`` as HTML with every detected entity wrapped in a coloured span.

    All text taken from ``text`` (both the entity words and the stretches
    between them) is HTML-escaped, so markup typed by the user is shown
    literally instead of being interpreted by the browser.

    Args:
        text: The raw input string the entities were extracted from.
        entities: Iterable of dicts with the keys ``'start'`` and ``'end'``
            (character offsets into ``text``), ``'entity_group'`` (label
            without the B-/I- prefix) and ``'score'`` (a number formatted
            with two decimals). May be empty or ``None``.

    Returns:
        An HTML fragment. With no entities this is just the escaped text.
    """
    if not entities:
        return html.escape(text)

    pieces = []
    cursor = 0  # index in ``text`` up to which output has been produced
    for entity in sorted(entities, key=lambda item: item['start']):
        start = entity['start']
        end = entity['end']
        if start < cursor:
            # Span overlaps one already rendered; nesting spans would corrupt
            # the markup, so keep the earlier one and skip this one.
            continue

        raw_label = entity['entity_group']
        score = entity['score']
        # Colours are keyed by the "B-" form of the tag; unknown labels get grey.
        color = label_colors.get(f"B-{raw_label}", "#CCCCCC")
        label = html.escape(str(raw_label))
        word = html.escape(text[start:end])

        pieces.append(html.escape(text[cursor:start]))
        pieces.append(
            f'<span style="background-color: {color}; padding: 2px 6px; border-radius: 3px; margin: 0 2px; display: inline-block;" title="{label} (confidence: {score:.2f})">{word} <sup style="font-size: 0.7em; font-weight: bold;">[{label}]</sup></span>'
        )
        cursor = end

    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)
def perform_ner(text):
    """Run NER on ``text`` and build the two outputs shown in the UI.

    Args:
        text: The string typed by the user. ``None``, an empty string or a
            whitespace-only string is treated as missing input.

    Returns:
        A ``(html, markdown)`` tuple. ``html`` is the input text with the
        entities highlighted, or a red message for missing input or an
        error. ``markdown`` is a table of the detected entities, a "no
        entities" notice, or ``""`` when ``html`` carries a message.
    """
    if text is None or not text.strip():
        return "<p style='color: red;'>لطفا متن فارسی وارد کنید (Please enter Persian text)</p>", ""

    try:
        entities = ner_pipeline(text)

        highlighted_html = f"<div style='direction: rtl; text-align: right; font-size: 18px; line-height: 2; padding: 20px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9;'>{highlight_entities(text, entities)}</div>"

        if not entities:
            return highlighted_html, "هیچ موجودیتی شناسایی نشد (No entities detected)"

        table_lines = [
            "### موجودیتهای شناسایی شده (Detected Entities):\n\n",
            "| کلمه (Word) | نوع (Type) | اطمینان (Confidence) |\n",
            "|------------|-----------|---------------------|\n",
        ]
        for ent in entities:
            label_fa = label_names.get(ent['entity_group'], ent['entity_group'])
            # A literal "|" inside the word would be read as a column separator.
            word = str(ent['word']).replace("|", "\\|")
            table_lines.append(f"| {word} | {label_fa} | {ent['score']:.2%} |\n")
        return highlighted_html, "".join(table_lines)
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing the
        # request. The message is escaped because it is rendered as HTML.
        return f"<p style='color: red;'>خطا (Error): {html.escape(str(e))}</p>", ""
# Sample Persian sentences offered as ready-made inputs in the UI.
_example_sentences = (
    "باراک اوباما در هاوایی متولد شد و در شیکاگو زندگی میکرد.",
    "شرکت گوگل در کالیفرنیا واقع شده است.",
    "رضا در تهران در تاریخ ۱۵ خرداد ۱۳۸۰ متولد شد.",
    "دانشگاه تهران یکی از قدیمیترین دانشگاههای ایران است.",
    "علی و حسین به همراه مریم به مشهد سفر کردند.",
)
# Each example is a one-item row, matching the single input component.
examples = [[sentence] for sentence in _example_sentences]
# Build the page: intro text, input/output columns, examples, colour legend,
# and the event wiring that connects the inputs to perform_ner.
with gr.Blocks(theme=gr.themes.Soft(), title="Persian NER - شناسایی موجودیتهای نامدار فارسی") as demo:
    # Bilingual introduction shown at the top of the page.
    gr.Markdown("""
# 🇮🇷 Persian Named Entity Recognition
# شناسایی موجودیتهای نامدار فارسی
این سیستم موجودیتهای نامدار مانند اسامی اشخاص، سازمانها، مکانها، تاریخها و ... را در متن فارسی شناسایی میکند.
This system identifies named entities such as person names, organizations, locations, dates, etc. in Persian text.
**Model:** ParsBERT-NER (HooshvareLab)
**Running on:** CPU (may be slow for long texts)
""")

    with gr.Row():
        # First column: the text to analyse and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(
                lines=5,
                rtl=True,
                label="متن فارسی خود را وارد کنید (Enter Persian Text)",
                placeholder="مثال: رضا در تهران زندگی میکند...",
            )
            submit_btn = gr.Button("🔍 تحلیل متن (Analyze Text)", variant="primary")

        # Second column: highlighted text followed by the entity table.
        with gr.Column():
            output_html = gr.HTML(label="متن با موجودیتهای برجسته (Text with Highlighted Entities)")
            output_entities = gr.Markdown(label="لیست موجودیتها (Entity List)")

    # Clickable sample sentences that fill the input box.
    gr.Examples(
        label="مثالها (Examples)",
        inputs=input_text,
        examples=examples,
    )

    # Colour legend for the entity types.
    gr.Markdown("""
### راهنمای رنگها (Color Guide):
- 🔴 **PER (شخص)**: اسامی اشخاص / Person names
- 🔵 **ORG (سازمان)**: نام سازمانها / Organizations
- 🟢 **LOC (مکان)**: نام مکانها / Locations
- 🟠 **DAT (تاریخ)**: تاریخها / Dates
- 🟣 **TIM (زمان)**: زمانها / Times
- 🟡 **MON (پول)**: مقادیر پولی / Money
- 🔷 **PCT (درصد)**: درصدها / Percentages
""")

    # The button's click event and the textbox's submit event run the same
    # handler with the same input and outputs.
    for register_listener in (submit_btn.click, input_text.submit):
        register_listener(
            fn=perform_ner,
            inputs=input_text,
            outputs=[output_html, output_entities],
        )
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()