Spaces:

optimopium
/

NER-Persian-LLM-Based

Sleeping

App Files Files Community

NER-Persian-LLM-Based / app.py

optimopium

Update app.py

a39d4c2 verified about 1 month ago

raw

history blame

7.04 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
	import torch

	# Inference target: everything below is placed on the CPU.
	device = "cpu"

	# Checkpoint name handed to both ``from_pretrained`` calls.
	model_name = "HooshvareLab/bert-base-parsbert-ner-uncased"

	print("Loading model and tokenizer...")
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

	# Token-classification pipeline built from the objects loaded above.
	ner_pipeline = pipeline(
	    task="ner",
	    tokenizer=tokenizer,
	    model=model,
	    aggregation_strategy="simple",  # groups word pieces into whole entities
	    device=-1,  # -1 selects the CPU
	)

	# Highlight colours per entity tag. Each row is
	# (tag, colour of the "B-" label, lighter colour of the "I-" continuation label).
	_TAG_COLOR_ROWS = (
	    ("PER", "#FF6B6B", "#FFB3B3"),  # Person - red
	    ("ORG", "#4ECDC4", "#A7E9E4"),  # Organization - teal
	    ("LOC", "#95E1D3", "#C7F0E8"),  # Location - green
	    ("DAT", "#FFA07A", "#FFDAB9"),  # Date - orange
	    ("TIM", "#DDA0DD", "#E6D0E6"),  # Time - purple
	    ("MON", "#FFD700", "#FFEB99"),  # Money - gold
	    ("PCT", "#87CEEB", "#B3DFEF"),  # Percent - sky blue
	)

	# Expand the table into the flat {"B-XXX": colour, "I-XXX": colour} mapping.
	label_colors = {}
	for _tag, _begin_color, _inside_color in _TAG_COLOR_ROWS:
	    label_colors[f"B-{_tag}"] = _begin_color
	    label_colors[f"I-{_tag}"] = _inside_color

	# Bilingual (Persian + English) display name for each entity tag.
	label_names = dict(
	    PER="شخص (Person)",
	    ORG="سازمان (Organization)",
	    LOC="مکان (Location)",
	    DAT="تاریخ (Date)",
	    TIM="زمان (Time)",
	    MON="پول (Money)",
	    PCT="درصد (Percent)",
	)

	def highlight_entities(text, entities, colors=None):
	    """Return ``text`` as HTML with each entity wrapped in a coloured span.

	    All text taken from ``text`` (both the entity words and the plain text
	    between them) is HTML-escaped, so characters such as ``<`` or ``&`` in
	    the user's input are displayed literally instead of being interpreted
	    as markup.

	    Args:
	        text: The string the entities were extracted from.
	        entities: Sequence of dicts providing ``start`` and ``end``
	            (character offsets into ``text``), ``entity_group`` and
	            ``score``.
	        colors: Optional mapping from ``"B-<entity_group>"`` to a CSS
	            colour. When omitted, the module-level ``label_colors`` is
	            used. Groups missing from the mapping get ``#CCCCCC``.

	    Returns:
	        The escaped text with one ``<span>`` per entity. When ``entities``
	        is empty, just the escaped text.
	    """
	    import html  # function-scope import keeps this block self-contained

	    if not entities:
	        return html.escape(text)

	    palette = label_colors if colors is None else colors

	    pieces = []
	    cursor = 0  # index in ``text`` up to which output has been emitted
	    for entity in sorted(entities, key=lambda item: item["start"]):
	        start, end = entity["start"], entity["end"]
	        if start < cursor:
	            # Overlaps a span that was already emitted; nesting the two
	            # would corrupt the markup, so the later one is skipped.
	            continue

	        group = entity["entity_group"]
	        color = palette.get(f"B-{group}", "#CCCCCC")
	        label = html.escape(str(group))
	        word = html.escape(text[start:end])
	        score = entity["score"]

	        span = (
	            f'<span style="background-color: {color}; padding: 2px 6px; '
	            f'border-radius: 3px; margin: 0 2px; display: inline-block;" '
	            f'title="{label} (confidence: {score:.2f})">{word} '
	            f'<sup style="font-size: 0.7em; font-weight: bold;">[{label}]</sup></span>'
	        )

	        pieces.append(html.escape(text[cursor:start]))
	        pieces.append(span)
	        cursor = end

	    # Plain text after the last entity.
	    pieces.append(html.escape(text[cursor:]))
	    return "".join(pieces)

	def perform_ner(text):
	    """Run NER on ``text`` and build the two values shown in the UI.

	    Args:
	        text: The user's input. ``None``, empty and whitespace-only values
	            are rejected with a prompt message.

	    Returns:
	        A ``(html, markdown)`` tuple. ``html`` is the highlighted text
	        wrapped in a right-to-left ``<div>`` (or a red message on empty
	        input or error). ``markdown`` is a table of the detected entities,
	        a "no entities" notice, or ``""`` on empty input or error.
	    """
	    import html  # function-scope import keeps this block self-contained

	    if not text or not text.strip():
	        return "<p style='color: red;'>لطفا متن فارسی وارد کنید (Please enter Persian text)</p>", ""

	    try:
	        entities = ner_pipeline(text)

	        highlighted_html = f"<div style='direction: rtl; text-align: right; font-size: 18px; line-height: 2; padding: 20px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9;'>{highlight_entities(text, entities)}</div>"

	        if entities:
	            # Plain "|" delimiters: a backslash-escaped pipe is treated by
	            # Markdown as cell content, which keeps the table from rendering.
	            lines = [
	                "### موجودیت‌های شناسایی شده (Detected Entities):",
	                "",
	                "| کلمه (Word) | نوع (Type) | اطمینان (Confidence) |",
	                "|------------|-----------|---------------------|",
	            ]
	            for ent in entities:
	                label_fa = label_names.get(ent['entity_group'], ent['entity_group'])
	                # A literal pipe inside the word would split the cell.
	                word = str(ent['word']).replace("|", "\\|")
	                lines.append(f"| {word} | {label_fa} | {ent['score']:.2%} |")
	            entity_info = "\n".join(lines) + "\n"
	        else:
	            entity_info = "هیچ موجودیتی شناسایی نشد (No entities detected)"

	        return highlighted_html, entity_info

	    except Exception as e:
	        # UI boundary: show the failure to the user instead of raising.
	        # The message is escaped because it is rendered as HTML.
	        return f"<p style='color: red;'>خطا (Error): {html.escape(str(e))}</p>", ""

	# Sample Persian sentences; each one is wrapped in its own single-item list.
	_example_sentences = (
	    "باراک اوباما در هاوایی متولد شد و در شیکاگو زندگی می‌کرد.",
	    "شرکت گوگل در کالیفرنیا واقع شده است.",
	    "رضا در تهران در تاریخ ۱۵ خرداد ۱۳۸۰ متولد شد.",
	    "دانشگاه تهران یکی از قدیمی‌ترین دانشگاه‌های ایران است.",
	    "علی و حسین به همراه مریم به مشهد سفر کردند.",
	)
	examples = [[sentence] for sentence in _example_sentences]

	# Static Markdown for the page header (title plus a short description).
	_HEADER_MD = """
	# 🇮🇷 Persian Named Entity Recognition
	# شناسایی موجودیت‌های نامدار فارسی

	این سیستم موجودیت‌های نامدار مانند اسامی اشخاص، سازمان‌ها، مکان‌ها، تاریخ‌ها و ... را در متن فارسی شناسایی می‌کند.

	This system identifies named entities such as person names, organizations, locations, dates, etc. in Persian text.

	Model: ParsBERT-NER (HooshvareLab)
	Running on: CPU (may be slow for long texts)
	"""

	# Static Markdown for the colour legend below the examples.
	_LEGEND_MD = """
	### راهنمای رنگ‌ها (Color Guide):
	- 🔴 PER (شخص): اسامی اشخاص / Person names
	- 🔵 ORG (سازمان): نام سازمان‌ها / Organizations
	- 🟢 LOC (مکان): نام مکان‌ها / Locations
	- 🟠 DAT (تاریخ): تاریخ‌ها / Dates
	- 🟣 TIM (زمان): زمان‌ها / Times
	- 🟡 MON (پول): مقادیر پولی / Money
	- 🔷 PCT (درصد): درصدها / Percentages
	"""

	# Assemble the Gradio interface: one column for input, one for results.
	with gr.Blocks(title="Persian NER - شناسایی موجودیت‌های نامدار فارسی", theme=gr.themes.Soft()) as demo:
	    gr.Markdown(_HEADER_MD)

	    with gr.Row():
	        with gr.Column():
	            input_text = gr.Textbox(
	                lines=5,
	                rtl=True,
	                label="متن فارسی خود را وارد کنید (Enter Persian Text)",
	                placeholder="مثال: رضا در تهران زندگی می‌کند...",
	            )
	            submit_btn = gr.Button("🔍 تحلیل متن (Analyze Text)", variant="primary")

	        with gr.Column():
	            output_html = gr.HTML(label="متن با موجودیت‌های برجسته (Text with Highlighted Entities)")
	            output_entities = gr.Markdown(label="لیست موجودیت‌ها (Entity List)")

	    gr.Examples(examples=examples, inputs=input_text, label="مثال‌ها (Examples)")

	    gr.Markdown(_LEGEND_MD)

	    # The button's click event and the textbox's submit event share one handler.
	    for _register in (submit_btn.click, input_text.submit):
	        _register(
	            fn=perform_ner,
	            inputs=input_text,
	            outputs=[output_html, output_entities],
	        )

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()