Spaces:

optimopium
/

NER-Persian-LLM-Based

Running

App Files Files Community

optimopium committed on Nov 5

Commit

c4885a5

verified ·

1 Parent(s): 3c57d2d

Update app.py

Browse files

Files changed (1) hide show

app.py +357 -251

app.py CHANGED Viewed

@@ -1,83 +1,225 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import torch
 import re
-import csv
-import os
-# Set device to CPU explicitly
-device = "cpu"
-# Load the model and tokenizer
-model_name = "HooshvareLab/bert-base-parsbert-ner-uncased"
-print("Loading model and tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForTokenClassification.from_pretrained(model_name)
-model.to(device)
 # Create NER pipeline
 ner_pipeline = pipeline(
     "ner",
-    model=model,
-    tokenizer=tokenizer,
-    device=-1,  # -1 means CPU
-    aggregation_strategy="simple"  # Groups entities together
 )
-# Load stock symbols from CSV file
-def load_stock_symbols_from_csv(csv_path='symbols.csv'):
-    """Load stock symbols from CSV file"""
-    stock_symbols = {}
     try:
-        with open(csv_path, 'r', encoding='utf-8') as f:
-            reader = csv.DictReader(f)
-            for row in reader:
-                symbol = row['symbol']
-                company_name = row['company_name']
-                stock_symbols[symbol] = company_name
-        print(f"Loaded {len(stock_symbols)} stock symbols from CSV")
-    except FileNotFoundError:
-        print(f"Warning: {csv_path} not found. Using default symbols.")
-    return stock_symbols
-# Load stock symbols
-STOCK_SYMBOLS = load_stock_symbols_from_csv()
-# Hypernym patterns (generic terms that can be made more specific)
-HYPERNYM_PATTERNS = {
-    "شرکت": "ORG",
-    "سازمان": "ORG",
-    "موسسه": "ORG",
-    "بانک": "ORG",
-    "دانشگاه": "ORG",
-    "شهر": "LOC",
-    "کشور": "LOC",
-    "استان": "LOC",
-    "آقای": "PER",
-    "خانم": "PER",
-    "دکتر": "PER",
-    "مهندس": "PER",
 }
-# Label mapping for better readability
 label_colors = {
-    "B-PER": "#FF6B6B",
     "I-PER": "#FFB3B3",
-    "B-ORG": "#4ECDC4",
     "I-ORG": "#A7E9E4",
-    "B-LOC": "#95E1D3",
     "I-LOC": "#C7F0E8",
-    "B-DAT": "#FFA07A",
     "I-DAT": "#FFDAB9",
-    "B-TIM": "#DDA0DD",
     "I-TIM": "#E6D0E6",
-    "B-MON": "#FFD700",
     "I-MON": "#FFEB99",
-    "B-PCT": "#87CEEB",
     "I-PCT": "#B3DFEF",
-    "STK": "#FF1493",  # Stock symbol - Deep Pink
-    "HYP": "#A9A9A9",  # Hypernym - Dark Gray
 }
 label_names = {
@@ -88,98 +230,36 @@ label_names = {
     "TIM": "زمان (Time)",
     "MON": "پول (Money)",
     "PCT": "درصد (Percent)",
-    "STK": "نماد بورس (Stock Symbol)",
-    "HYP": "واژه عمومی (Hypernym)",
 }
-def detect_stock_symbols(text):
-    """Detect Persian stock market symbols in text"""
-    stock_entities = []
-    # Split text into words
-    words = re.findall(r'[\u0600-\u06FF]+', text)
-    for word in words:
-        if word in STOCK_SYMBOLS:
-            # Find all occurrences of this symbol in the text
-            for match in re.finditer(re.escape(word), text):
-                stock_entities.append({
-                    'entity_group': 'STK',
-                    'word': word,
-                    'start': match.start(),
-                    'end': match.end(),
-                    'score': 0.99,  # High confidence for dictionary match
-                    'full_name': STOCK_SYMBOLS[word]
-                })
-    return stock_entities
-def detect_hypernyms(text, entities):
-    """Detect hypernyms (general terms) in text and classify them"""
-    hypernym_entities = []
-    for hypernym, entity_type in HYPERNYM_PATTERNS.items():
-        for match in re.finditer(re.escape(hypernym), text):
-            start, end = match.start(), match.end()
-            # Check if this position already has a specific entity
-            is_covered = False
-            for ent in entities:
-                if start >= ent['start'] and end <= ent['end']:
-                    is_covered = True
-                    break
-            if not is_covered:
-                hypernym_entities.append({
-                    'entity_group': 'HYP',
-                    'word': hypernym,
-                    'start': start,
-                    'end': end,
-                    'score': 0.95,
-                    'base_type': entity_type,
-                    'is_hypernym': True
-                })
-    return hypernym_entities
-def merge_entities(entities, stock_entities, hypernym_entities):
-    """Merge all entity types and remove overlaps, prioritizing specific entities"""
-    all_entities = entities + stock_entities + hypernym_entities
-    # Sort by start position
-    all_entities.sort(key=lambda x: x['start'])
-    # Remove overlapping entities (keep higher priority)
-    # Priority: STK > specific entities > HYP
-    filtered_entities = []
-    for entity in all_entities:
-        overlaps = False
-        for existing in filtered_entities:
             # Check for overlap
-            if not (entity['end'] <= existing['start'] or entity['start'] >= existing['end']):
-                overlaps = True
-                # If new entity is stock symbol, replace existing
-                if entity['entity_group'] == 'STK' and existing['entity_group'] != 'STK':
-                    filtered_entities.remove(existing)
-                    overlaps = False
-                # If existing is hypernym and new is specific, replace
-                elif existing['entity_group'] == 'HYP' and entity['entity_group'] != 'HYP':
-                    filtered_entities.remove(existing)
-                    overlaps = False
                 break
-        if not overlaps:
-            filtered_entities.append(entity)
-    return sorted(filtered_entities, key=lambda x: x['start'])
-def highlight_entities(text, entities):
     """Create HTML with highlighted entities"""
-    if not entities:
         return text
-    # Sort entities by start position (reverse order to replace from end to start)
-    entities_sorted = sorted(entities, key=lambda x: x['start'], reverse=True)
     result = text
     for entity in entities_sorted:
@@ -190,177 +270,197 @@ def highlight_entities(text, entities):
         score = entity['score']
         # Get color for this label
-        if label == 'STK':
-            color = label_colors.get('STK')
-            extra_info = f" - {entity.get('full_name', '')}" if 'full_name' in entity else ""
-            title_text = f"Stock Symbol{extra_info} (confidence: {score:.2f})"
-        elif label == 'HYP':
-            color = label_colors.get('HYP')
-            base_type = entity.get('base_type', '')
-            title_text = f"Hypernym (general term for {base_type})"
-        else:
-            color = label_colors.get(f"B-{label}", "#CCCCCC")
-            title_text = f"{label} (confidence: {score:.2f})"
         # Create highlighted span
-        highlighted = f'<span style="background-color: {color}; padding: 2px 6px; border-radius: 3px; margin: 0 2px; display: inline-block;" title="{title_text}">{word} <sup style="font-size: 0.7em; font-weight: bold;">[{label}]</sup></span>'
         result = result[:start] + highlighted + result[end:]
     return result
 def perform_ner(text):
-    """Perform NER on input text"""
     if not text.strip():
         return "<p style='color: red;'>لطفا متن فارسی وارد کنید (Please enter Persian text)</p>", ""
     try:
-        # Perform base NER
         entities = ner_pipeline(text)
-        # Detect stock symbols
-        stock_entities = detect_stock_symbols(text)
-        # Detect hypernyms
-        hypernym_entities = detect_hypernyms(text, entities)
-        # Merge all entities
-        all_entities = merge_entities(entities, stock_entities, hypernym_entities)
         # Create highlighted version
-        highlighted_html = f"<div style='direction: rtl; text-align: right; font-size: 18px; line-height: 2; padding: 20px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9;'>{highlight_entities(text, all_entities)}</div>"
         # Create entities table
         if all_entities:
             entity_info = "### موجودیت‌های شناسایی شده (Detected Entities):\n\n"
-            entity_info += "| کلمه (Word) | نوع (Type) | اطمینان (Confidence) | اطلاعات اضافی (Additional Info) |\n"
-            entity_info += "|------------|-----------|---------------------|----------------------------------|\n"
             for ent in all_entities:
                 label_fa = label_names.get(ent['entity_group'], ent['entity_group'])
-                extra_info = ""
-                if ent['entity_group'] == 'STK' and 'full_name' in ent:
-                    extra_info = ent['full_name']
-                elif ent['entity_group'] == 'HYP':
-                    extra_info = f"Hypernym of {ent.get('base_type', '')}"
-                entity_info += f"| {ent['word']} | {label_fa} | {ent['score']:.2%} | {extra_info} |\n"
         else:
             entity_info = "هیچ موجودیتی شناسایی نشد (No entities detected)"
-        return highlighted_html, entity_info
     except Exception as e:
-        return f"<p style='color: red;'>خطا (Error): {str(e)}</p>", ""
-# Save stock symbols to CSV function
-def save_symbols_to_csv(output_path='symbols.csv'):
-    """Save current stock symbols to CSV file"""
-    with open(output_path, 'w', encoding='utf-8', newline='') as f:
-        writer = csv.writer(f)
-        writer.writerow(['symbol', 'company_name'])
-        for symbol, name in STOCK_SYMBOLS.items():
-            writer.writerow([symbol, name])
-    print(f"Saved {len(STOCK_SYMBOLS)} symbols to {output_path}")
-# Example texts
 examples = [
-    ["باراک اوباما در هاوایی متولد شد و در شیکاگو زندگی می‌کرد."],
-    ["شرکت گوگل در کالیفرنیا واقع شده است."],
-    ["رضا در تهران در تاریخ ۱۵ خرداد ۱۳۸۰ متولد شد."],
-    ["دانشگاه تهران یکی از قدیمی‌ترین دانشگاه‌های ایران است."],
-    ["علی و حسین به همراه مریم به مشهد سفر کردند."],
-    ["سهام فولاد و خودرو امروز رشد خوبی داشتند و شپنا هم صعودی بود."],
-    ["بانک ملت و وتجارت در بازار بورس فعال هستند."],
-    ["آقای احمدی مدیرعامل شرکت پتروشیمی است."],
-    ["وبملت و فملی امروز در صف خرید قرار گرفتند."],
 ]
 # Create Gradio interface
-with gr.Blocks(title="Persian NER - شناسایی موجودیت‌های نامدار فارسی", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(f"""
-    # 🇮🇷 Persian Named Entity Recognition + Stock Symbols
-    # شناسایی موجودیت‌های نامدار فارسی + نمادهای بورس
-    این سیستم موجودیت‌های نامدار مانند اسامی اشخاص، سازمان‌ها، مکان‌ها، تاریخ‌ها، **نمادهای بورس** و **واژه‌های عمومی (Hypernyms)** را در متن فارسی شناسایی می‌کند.
-    This system identifies named entities including person names, organizations, locations, dates, **stock symbols**, and **hypernyms** in Persian text.
-    **Model:** ParsBERT-NER (HooshvareLab) + Custom Stock Symbol Detection
-    **Stock Symbols Loaded:** {len(STOCK_SYMBOLS)} symbols from Tehran Stock Exchange (TSE)
-    **Running on:** CPU (may be slow for long texts)
     ---
-    ### 📊 APIs for Updating Stock Symbols:
-    **Recommended Python Libraries:**
-    1. **tsetmc-api** - `pip install tsetmc-api` - Direct access to TSETMC data
-    2. **tehran-stocks** - `pip install tehran-stocks` - Full stock price history with ORM
-    3. **tse-dataloader** - Data extraction from Tehran Stock Exchange
-    **Example Usage:**
-```python
-    # Using tsetmc-api
-    from tsetmc_api import market_watch
-    stocks = market_watch.get_market_watch()
-    # Using tehran-stocks
-    from tehran_stocks import Stocks
-    all_stocks = Stocks.query.all()
-```
-    **Official TSE Website:** https://tse.ir
-    **TSETMC Data Portal:** http://www.tsetmc.com
     """)
     with gr.Row():
-        with gr.Column():
             input_text = gr.Textbox(
-                label="متن فارسی خود را وارد کنید (Enter Persian Text)",
-                placeholder="مثال: سهام فولاد و خودرو امروز رشد کردند...",
-                lines=5,
-                rtl=True
             )
-            submit_btn = gr.Button("🔍 تحلیل متن (Analyze Text)", variant="primary")
-        with gr.Column():
-            output_html = gr.HTML(label="متن با موجودیت‌های برجسته (Text with Highlighted Entities)")
-            output_entities = gr.Markdown(label="لیست موجودیت‌ها (Entity List)")
     gr.Examples(
         examples=examples,
         inputs=input_text,
-        label="مثال‌ها (Examples)"
     )
-    # Legend
-    gr.Markdown("""
-    ### راهنمای رنگ‌ها (Color Guide):
-    - 🔴 **PER (شخص)**: اسامی اشخاص / Person names
-    - 🔵 **ORG (سازمان)**: نام سازمان‌ها / Organizations
-    - 🟢 **LOC (مکان)**: نام مکان‌ها / Locations
-    - 🟠 **DAT (تاریخ)**: تاریخ‌ها / Dates
-    - 🟣 **TIM (زمان)**: زمان‌ها / Times
-    - 🟡 **MON (پول)**: مقادیر پولی / Money
-    - 🔷 **PCT (درصد)**: درصدها / Percentages
-    - 💗 **STK (نماد بورس)**: نمادهای بورس تهران / Tehran Stock Exchange symbols
-    - ⚫ **HYP (واژه عمومی)**: واژه‌های عمومی / Hypernyms (general terms)
-    ---
-    ### 📝 تعداد نمادهای بورس: {len(STOCK_SYMBOLS)} نماد
-    *برای به‌روزرسانی نمادها، فایل CSV را جایگزین کنید یا از API استفاده کنید.*
-    """)
-    # Event handler
     submit_btn.click(
         fn=perform_ner,
         inputs=input_text,
         outputs=[output_html, output_entities]
     )
     input_text.submit(
         fn=perform_ner,
         inputs=input_text,
@@ -369,4 +469,10 @@ with gr.Blocks(title="Persian NER - شناسایی موجودیت‌های نا
 # Launch the app
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from transformers import (
+    AutoTokenizer,
+    AutoModelForTokenClassification,
+    AutoModelForSequenceClassification,
+    AutoModelForQuestionAnswering,
+    pipeline
+)
 import torch
+import pandas as pd
 import re
+from typing import List, Dict, Tuple
+import numpy as np
+# Pick the compute device once: CUDA when torch reports it is available, CPU otherwise.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the NER model and tokenizer (ParsBERT NER checkpoint) and move the model to `device`.
+print("Loading NER model...")
+ner_model_name = "HooshvareLab/bert-base-parsbert-ner-uncased"
+ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
+ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
+ner_model.to(device)
 # Create NER pipeline
 ner_pipeline = pipeline(
     "ner",
+    model=ner_model,
+    tokenizer=ner_tokenizer,
+    # Pipeline device index: 0 when running on CUDA, -1 for CPU.
+    device=0 if device == "cuda" else -1,
+    # "simple" aggregation groups token-level predictions into whole entities.
+    aggregation_strategy="simple"
+)
+# Load the Persian model used for context understanding (stock-symbol disambiguation).
+print("Loading Persian context model...")
+# NOTE(review): `context_model_name` is assigned here but not referenced anywhere
+# else in the visible code; its value names an mT5 multiple-choice checkpoint.
+# Presumably left over as an alternative to the QA model below -- confirm or remove.
+context_model_name = "persiannlp/mt5-small-parsinlu-multiple-choice"  # Alternative model
+# You can also try: "HooshvareLab/bert-fa-base-uncased" with custom classification head
+# For stock symbol disambiguation, we'll use a QA approach
+qa_model_name = "mohammadehab/persian-qa-bert"
+qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
+qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
+qa_model.to(device)
+# Question-answering pipeline; called by use_llm_for_disambiguation with
+# `question=` and `context=` keyword arguments.
+qa_pipeline = pipeline(
+    "question-answering",
+    model=qa_model,
+    tokenizer=qa_tokenizer,
+    device=0 if device == "cuda" else -1
 )
+# Load stock symbols from CSV
+def load_stock_symbols(csv_path="symbols.csv"):
+    """Read the stock-symbol table at ``csv_path`` into a dict keyed by symbol.
+
+    Every value is a dict with the row's ``company_name`` (stored under
+    ``'company'``), ``bazaar`` and ``bazaar_group``.  If reading or converting
+    the CSV raises, the error is printed and a small built-in table of demo
+    symbols is returned instead.
+    """
     try:
+        table = pd.read_csv(csv_path, encoding='utf-8')
+        # One entry per CSV row; a repeated symbol keeps the values of its last row.
+        return {
+            record['symbol']: {
+                'company': record['company_name'],
+                'bazaar': record['bazaar'],
+                'bazaar_group': record['bazaar_group']
+            }
+            for _, record in table.iterrows()
+        }
+    except Exception as err:
+        print(f"Error loading symbols CSV: {err}")
+        # Fall back to a handful of well-known symbols so the demo still works.
+        demo_symbols = {
+            'وبصادر': {'company': 'بانک صادرات ایران', 'bazaar': 'بورس - بازار دوم', 'bazaar_group': 'بانکها و موسسات اعتباری'},
+            'فولاد': {'company': 'فولاد مبارکه اصفهان', 'bazaar': 'بورس - بازار اول', 'bazaar_group': 'فلزات اساسی'},
+            'فارس': {'company': 'پتروشیمی فارس', 'bazaar': 'بورس - بازار اول', 'bazaar_group': 'محصولات شیمیایی'},
+            'شپنا': {'company': 'پالایش نفت اصفهان', 'bazaar': 'بورس - بازار اول', 'bazaar_group': 'فرآورده‌های نفتی'},
+            'خودرو': {'company': 'ایران خودرو', 'bazaar': 'بورس - بازار اول', 'bazaar_group': 'خودرو'},
+            'وبملت': {'company': 'بانک ملت', 'bazaar': 'بورس - بازار اول', 'bazaar_group': 'بانکها'},
+        }
+        return demo_symbols
+# Symbol table loaded once at import time: symbol -> {'company', 'bazaar', 'bazaar_group'}.
+STOCK_SYMBOLS = load_stock_symbols()
+# Set of the symbol strings only, used for membership tests on candidate words.
+SYMBOL_NAMES = set(STOCK_SYMBOLS.keys())
+# Persian market-related keywords.  check_stock_symbol_context counts how many
+# whitespace-separated words around a candidate symbol appear in this set.
+MARKET_KEYWORDS = {
+    'سهام', 'سهم', 'بورس', 'فرابورس', 'معامله', 'معاملات', 'خرید', 'فروش',
+    'قیمت', 'ارزش', 'بازار', 'سرمایه', 'سرمایه‌گذاری', 'پرتفوی', 'نماد',
+    'شاخص', 'حجم', 'عرضه', 'تقاضا', 'صف', 'نوسان', 'بازدهی', 'سود',
+    'زیان', 'ریال', 'تومان', 'میلیارد', 'میلیون', 'درصد', 'رشد', 'افت',
+    'کندل', 'نمودار', 'تحلیل', 'بنیادی', 'تکنیکال', 'حمایت', 'مقاومت'
 }
+def use_llm_for_disambiguation(text: str, potential_symbol: str, symbol_info: Dict) -> float:
+    """
+    Use the Persian QA model to estimate whether a word is used as a stock symbol.
+
+    Three Persian yes/no-style questions about ``potential_symbol`` are put to
+    the module-level ``qa_pipeline`` with ``text`` as the context.  Each answer
+    is turned into a score: ``1 - score`` when the answer contains a negative
+    marker, ``score`` when it contains a positive marker, and 0.5 otherwise.
+    A question whose pipeline call fails also contributes 0.5.
+
+    Args:
+        text: Full input text, passed to the QA model as context.
+        potential_symbol: Candidate word found in ``text``.
+        symbol_info: Symbol record; only its ``'company'`` entry is read.
+
+    Returns:
+        Mean of the per-question scores as a plain ``float``; 0.5 (neutral)
+        if anything outside the per-question calls raises.
+    """
+    # Negative markers must be tested first: 'می‌باشد' is a substring of
+    # 'نمی‌باشد', and 'درست' appears in negated phrases such as 'درست نیست',
+    # so testing the positive markers first would score a denial as a "yes".
+    negative_markers = ('خیر', 'نیست', 'نمی‌باشد')
+    positive_markers = ('بله', 'است', 'می‌باشد', 'درست')
+    try:
+        # Ask several differently-worded questions and average the outcome.
+        questions = [
+            f"آیا {potential_symbol} در این متن نماد بورسی است؟",
+            f"آیا منظور از {potential_symbol} سهام شرکت {symbol_info['company']} است؟",
+            f"آیا {potential_symbol} در این متن به معاملات بورس اشاره دارد؟"
+        ]
+        scores = []
+        for question in questions:
+            try:
+                result = qa_pipeline(question=question, context=text)
+                answer = result['answer'].lower()
+                confidence = result['score']
+            except Exception as e:
+                # Best effort: a failed question counts as "uncertain", but
+                # the failure is reported instead of being swallowed silently.
+                print(f"LLM disambiguation error: {e}")
+                scores.append(0.5)
+                continue
+            if any(marker in answer for marker in negative_markers):
+                scores.append(1 - confidence)
+            elif any(marker in answer for marker in positive_markers):
+                scores.append(confidence)
+            else:
+                scores.append(0.5)  # Uncertain
+        # float() so callers get a builtin float rather than a numpy scalar.
+        return float(np.mean(scores)) if scores else 0.5
+    except Exception as e:
+        print(f"LLM disambiguation error: {e}")
+        return 0.5  # Return neutral confidence on error
+def check_stock_symbol_context(text: str, potential_symbol: str, symbol_info: Dict,
+                               symbol_pos=None) -> Tuple[bool, float]:
+    """
+    Check if a potential symbol is actually used as a stock symbol in context.
+
+    A keyword heuristic is applied to the 50 characters on either side of the
+    occurrence; only when that is inconclusive is the QA model consulted via
+    ``use_llm_for_disambiguation`` and blended in (40% heuristic, 60% model).
+
+    Args:
+        text: Full input text.
+        potential_symbol: Candidate word.
+        symbol_info: Symbol record, forwarded to the QA-based check.
+        symbol_pos: Character offset of the occurrence to judge.  When
+            omitted, the first occurrence found by ``str.find`` is used
+            (the original behaviour).
+
+    Returns:
+        ``(is_stock_symbol, confidence_score)``; ``(False, 0.0)`` when no
+        position is given and the word does not occur in ``text``.
+    """
+    if symbol_pos is None:
+        symbol_pos = text.find(potential_symbol)
+        if symbol_pos == -1:
+            return False, 0.0
+    # Surrounding context: 50 characters before and after the occurrence.
+    start_context = max(0, symbol_pos - 50)
+    end_context = min(len(text), symbol_pos + len(potential_symbol) + 50)
+    context_window = text[start_context:end_context]
+    # Count market keywords in context.  Punctuation glued to a word
+    # (e.g. a trailing Persian comma) is stripped so the keyword still matches.
+    edge_punctuation = '.,:;!?()[]{}"\'«»،؛؟'
+    words_in_context = context_window.split()
+    market_keyword_count = sum(
+        1 for word in words_in_context
+        if word.strip(edge_punctuation) in MARKET_KEYWORDS
+    )
+    # Calculate heuristic score
+    heuristic_score = min(market_keyword_count * 0.3, 1.0)
+    # Strong / absent signals are decided on the integer count: 3 * 0.3 is
+    # 0.8999999999999999 in floating point, so a ">= 0.9" test on the score
+    # would silently require four keywords instead of three.
+    if market_keyword_count >= 3:
+        return True, heuristic_score
+    if market_keyword_count == 0 and len(words_in_context) > 5:
+        return False, 0.1
+    # For ambiguous cases, use LLM
+    llm_score = use_llm_for_disambiguation(text, potential_symbol, symbol_info)
+    # Combine heuristic and LLM scores
+    final_score = (heuristic_score * 0.4 + llm_score * 0.6)
+    # Decision threshold
+    is_stock = final_score > 0.5
+    return is_stock, final_score
+def find_stock_symbols_in_text(text: str) -> List[Dict]:
+    """Find and validate stock symbols in text using context analysis.
+
+    Every Persian-script word of ``text`` that is a key of ``STOCK_SYMBOLS``
+    is checked with ``check_stock_symbol_context`` at its own position, so
+    repeated occurrences are each judged on their own surroundings.
+
+    Returns:
+        One dict per accepted occurrence with ``word``, ``start``, ``end``,
+        ``entity_group`` (always ``'STOCK'``), ``score`` and the symbol's
+        ``company``, ``bazaar`` and ``bazaar_group``.
+    """
+    found_symbols = []
+    # Runs of characters from the Arabic Unicode block, delimited by word boundaries.
+    pattern = r'\b[\u0600-\u06FF]+\b'
+    for match in re.finditer(pattern, text):
+        word = match.group()
+        if word not in SYMBOL_NAMES:
+            continue
+        symbol_info = STOCK_SYMBOLS[word]
+        # Judge this occurrence (not merely the first one in the text).
+        is_stock, confidence = check_stock_symbol_context(
+            text, word, symbol_info, symbol_pos=match.start()
+        )
+        if is_stock:
+            found_symbols.append({
+                'word': word,
+                'start': match.start(),
+                'end': match.end(),
+                'entity_group': 'STOCK',
+                'score': confidence,
+                'company': symbol_info['company'],
+                'bazaar': symbol_info['bazaar'],
+                'bazaar_group': symbol_info['bazaar_group']
+            })
+    return found_symbols
+# Background colour (hex) per label.  Keys are B-/I- tags for each NER type
+# plus the custom "STOCK" label.
+# NOTE(review): the visible part of highlight_entities looks up only "STOCK" or
+# f"B-{label}"; the "I-*" entries do not appear to be read there -- confirm.
 label_colors = {
+    "B-PER": "#FF6B6B",  # Person - Red
     "I-PER": "#FFB3B3",
+    "B-ORG": "#4ECDC4",  # Organization - Teal
     "I-ORG": "#A7E9E4",
+    "B-LOC": "#95E1D3",  # Location - Green
     "I-LOC": "#C7F0E8",
+    "B-DAT": "#FFA07A",  # Date - Orange
     "I-DAT": "#FFDAB9",
+    "B-TIM": "#DDA0DD",  # Time - Purple
     "I-TIM": "#E6D0E6",
+    "B-MON": "#FFD700",  # Money - Gold
     "I-MON": "#FFEB99",
+    "B-PCT": "#87CEEB",  # Percent - Sky Blue
     "I-PCT": "#B3DFEF",
+    "STOCK": "#00FA9A",  # Stock Symbol - Medium Spring Green
 }
 label_names = {
     "TIM": "زمان (Time)",
     "MON": "پول (Money)",
     "PCT": "درصد (Percent)",
+    "STOCK": "نماد بورسی (Stock Symbol)",
 }
+def merge_overlapping_entities(entities: List[Dict], stock_entities: List[Dict]) -> List[Dict]:
+    """Merge entities, removing overlaps (stock symbols take precedence)
+
+    Args:
+        entities: NER entities; each must have ``'start'`` and ``'end'``.
+        stock_entities: Stock-symbol entities with the same two keys.
+
+    Returns:
+        A new list holding every stock entity first, followed by the NER
+        entities whose ``[start, end)`` span overlaps no stock entity.  The
+        result is not sorted by position and the inputs are not modified.
+    """
+    all_entities = []
+    # Add stock entities first (they have priority)
+    all_entities.extend(stock_entities)
+    # Add NER entities that don't overlap with stock entities
+    for ner_ent in entities:
+        overlap = False
+        for stock_ent in stock_entities:
             # Check for overlap
+            # Two half-open spans are disjoint when one ends at or before the
+            # other starts; anything else counts as an overlap.
+            if not (ner_ent['end'] <= stock_ent['start'] or ner_ent['start'] >= stock_ent['end']):
+                overlap = True
                 break
+        if not overlap:
+            all_entities.append(ner_ent)
+    return all_entities
+def highlight_entities(text, all_entities):
     """Create HTML with highlighted entities"""
+    if not all_entities:
         return text
+    # Sort entities by start position (reverse order)
+    entities_sorted = sorted(all_entities, key=lambda x: x['start'], reverse=True)
     result = text
     for entity in entities_sorted:
         score = entity['score']
         # Get color for this label
+        color = label_colors.get(label if label == 'STOCK' else f"B-{label}", "#CCCCCC")
+        # Add extra info for stock symbols
+        tooltip_info = f"{label} (confidence: {score:.2f})"
+        if label == 'STOCK':
+            company = entity.get('company', '')
+            bazaar = entity.get('bazaar', '')
+            if company:
+                tooltip_info = f"{company} - {bazaar} (confidence: {score:.2f})"
         # Create highlighted span
+        highlighted = f'<span style="background-color: {color}; padding: 2px 6px; border-radius: 3px; margin: 0 2px; display: inline-block;" title="{tooltip_info}">{word} <sup style="font-size: 0.7em; font-weight: bold;">[{label}]</sup></span>'
         result = result[:start] + highlighted + result[end:]
     return result
 def perform_ner(text):
+    """Perform integrated NER and stock symbol detection.
+
+    Runs the NER pipeline and the stock-symbol finder on ``text``, merges the
+    two entity lists (stock symbols win on overlap) and renders the result.
+
+    Returns:
+        A ``(html, markdown)`` pair: the highlighted text wrapped in an RTL
+        ``<div>``, and a Markdown entity table followed by per-type counts.
+        Blank input yields a red prompt and an empty string; any exception
+        yields a red error paragraph (HTML-escaped) and the raw message.
+    """
+    # Local imports keep this block self-contained.
+    import html
+    from collections import Counter
     if not text.strip():
         return "<p style='color: red;'>لطفا متن فارسی وارد کنید (Please enter Persian text)</p>", ""
     try:
+        # Perform standard NER
         entities = ner_pipeline(text)
+        # Find stock symbols using Persian LLM
+        stock_entities = find_stock_symbols_in_text(text)
+        # Merge entities (remove overlaps)
+        all_entities = merge_overlapping_entities(entities, stock_entities)
         # Create highlighted version
+        highlighted_html = f"""
+        <div style='direction: rtl; text-align: right; font-size: 18px; line-height: 2.5;
+                    padding: 20px; border: 1px solid #ddd; border-radius: 5px;
+                    background-color: #f9f9f9; font-family: Tahoma, Arial;'>
+            {highlight_entities(text, all_entities)}
+        </div>
+        """
         # Create entities table
         if all_entities:
             entity_info = "### موجودیت‌های شناسایی شده (Detected Entities):\n\n"
+            entity_info += "| کلمه (Word) | نوع (Type) | جزئیات (Details) | اطمینان (Confidence) |\n"
+            entity_info += "|:------------|:-----------|:------------------|:---------------------|\n"
+            # Sort by position in text
+            all_entities.sort(key=lambda x: x['start'])
             for ent in all_entities:
                 label_fa = label_names.get(ent['entity_group'], ent['entity_group'])
+                details = ""
+                if ent['entity_group'] == 'STOCK':
+                    company = ent.get('company', '')
+                    bazaar = ent.get('bazaar', '')
+                    group = ent.get('bazaar_group', '')
+                    details = f"{company}<br>{bazaar}<br>{group}"
+                entity_info += f"| **{ent['word']}** | {label_fa} | {details} | {ent['score']:.2%} |\n"
         else:
             entity_info = "هیچ موجودیتی شناسایی نشد (No entities detected)"
+        # Add statistics: tally entity types in one pass (missing types count as 0).
+        type_counts = Counter(e['entity_group'] for e in all_entities)
+        stats = f"\n\n### آمار (Statistics):\n"
+        stats += f"- تعداد کل موجودیت‌ها: {len(all_entities)}\n"
+        stats += f"- نمادهای بورسی: {type_counts['STOCK']}\n"
+        stats += f"- اشخاص: {type_counts['PER']}\n"
+        stats += f"- سازمان‌ها: {type_counts['ORG']}\n"
+        stats += f"- مکان‌ها: {type_counts['LOC']}\n"
+        return highlighted_html, entity_info + stats
     except Exception as e:
+        # The message is rendered as HTML, so escape it: exception text can
+        # contain '<', '>' or '&' (e.g. a repr such as "<class ...>").
+        return f"<p style='color: red;'>خطا (Error): {html.escape(str(e))}</p>", str(e)
+# Example inputs offered in the UI; each inner list holds the single text value
+# for the one input component.  They mix plain NER sentences with sentences
+# that mention stock symbols, and include one where 'فولاد' is the ordinary
+# word "steel" (the steel industry in Isfahan) rather than a ticker.
 examples = [
+    ["علی احمدی دیروز در تهران با مدیر شرکت ملی نفت ایران دیدار کرد."],
+    ["سهام وبصادر و فولاد در بورس امروز با افزایش قیمت مواجه شدند."],
+    ["صنعت فولاد در اصفهان یکی از مهمترین صنایع کشور است."],
+    ["قیمت سهام شپنا در معاملات امروز ۵ درصد رشد داشت و به ۱۲۰۰۰ ریال رسید."],
+    ["بانک ملت اعلام کرد که سود سهام وبملت را در تاریخ ۱۵ خرداد ۱۴۰۳ پرداخت خواهد کرد."],
+    ["شرکت فولاد مبارکه با نماد فولاد در بورس تهران فعال است و محصولات فولادی تولید می‌کند."],
+    ["من دیروز ۱۰۰۰ سهم از وتوسکا خریدم و امیدوارم تا پایان هفته ۲۰ درصد سود کنم."],
 ]
 # Create Gradio interface
+with gr.Blocks(
+    title="Persian NER + Stock Symbols | شناسایی موجودیت‌ها و نمادهای بورسی",
+    theme=gr.themes.Soft(),
+    css="""
+    .rtl-text { direction: rtl; text-align: right; font-family: 'B Nazanin', Tahoma, Arial; }
+    """
+) as demo:
+    gr.Markdown("""
+    # 🏦 Persian NER with Intelligent Stock Symbol Detection
+    # شناسایی هوشمند موجودیت‌های نامدار و نمادهای بورس ایران
+    <div class="rtl-text">
+    این سیستم با استفاده از دو مدل هوش مصنوعی:
+    1. **ParsBERT-NER**: برای شناسایی موجودیت‌های نامدار (اشخاص، سازمان‌ها، مکان‌ها، تاریخ‌ها)
+    2. **Persian BERT QA**: برای تشخیص هوشمند نمادهای بورسی با درک متن
+    ویژگی خاص: تشخیص هوشمند کلماتی مثل «فولاد» که می‌تواند نماد بورسی یا کلمه عادی باشد
+    </div>
     ---
     """)
     with gr.Row():
+        with gr.Column(scale=6):
             input_text = gr.Textbox(
+                label="متن فارسی (Persian Text)",
+                placeholder="مثال: سهام فولاد در بورس تهران معامله می‌شود...",
+                lines=6,
+                rtl=True,
+                elem_classes=["rtl-text"]
             )
+            with gr.Row():
+                submit_btn = gr.Button("🔍 تحلیل متن", variant="primary", scale=2)
+                clear_btn = gr.Button("🗑️ پاک کردن", scale=1)
+        with gr.Column(scale=6):
+            output_html = gr.HTML(
+                label="نتیجه تحلیل (Analysis Result)",
+                elem_classes=["rtl-text"]
+            )
+    with gr.Row():
+        output_entities = gr.Markdown(
+            label="جدول موجودیت‌ها (Entity Table)",
+            elem_classes=["rtl-text"]
+        )
     gr.Examples(
         examples=examples,
         inputs=input_text,
+        label="نمونه‌های آماده (Ready Examples)",
+        examples_per_page=4
     )
+    # Color guide
+    with gr.Accordion("📖 راهنمای رنگ‌ها (Color Guide)", open=False):
+        gr.Markdown("""
+        <div class="rtl-text">
+        | رنگ | نوع موجودیت | توضیحات |
+        |:---:|:------------|:--------|
+        | 🔴 | **PER** | اشخاص و نام‌های افراد |
+        | 🔵 | **ORG** | سازمان‌ها و شرکت‌ها |
+        | 🟢 | **LOC** | مکان‌ها و نام‌های جغرافیایی |
+        | 🟠 | **DAT** | تاریخ‌ها |
+        | 🟣 | **TIM** | زمان‌ها |
+        | 🟡 | **MON** | مقادیر پولی |
+        | 🔷 | **PCT** | درصدها |
+        | 💚 | **STOCK** | نمادهای بورسی (با تحلیل هوشمند متن) |
+        </div>
+        """)
+    # Info section
+    with gr.Accordion("ℹ️ درباره سیستم (About)", open=False):
+        gr.Markdown("""
+        <div class="rtl-text">
+        ### قابلیت‌های کلیدی:
+        - **تشخیص هوشمند نمادهای بورسی**: با استفاده از مدل زبانی فارسی، سیستم تشخیص می‌دهد که آیا کلمات مشابه نمادها (مثل فولاد، فارس) در متن به عنوان نماد بورسی استفاده شده‌اند یا معنای عادی دارند
+        - **ترکیب دو مدل**: استفاده همزمان از ParsBERT-NER برای NER کلاسیک و Persian BERT برای درک متن
+        - **اطلاعات کامل نمادها**: نمایش نام شرکت، بازار، و گروه صنعت برای هر نماد شناسایی شده
+        - **دقت بالا**: با ترکیب تحلیل‌های مبتنی بر قواعد و مدل زبانی
+        </div>
+        """)
+    # Event handlers
     submit_btn.click(
         fn=perform_ner,
         inputs=input_text,
         outputs=[output_html, output_entities]
     )
+    clear_btn.click(
+        lambda: ("", "", ""),
+        outputs=[input_text, output_html, output_entities]
+    )
     input_text.submit(
         fn=perform_ner,
         inputs=input_text,
 # Launch the app
 if __name__ == "__main__":
+    # Startup banner: report the selected device and how many symbols were loaded.
+    print("Starting Persian NER + Stock Symbol Detection System...")
+    print(f"Using device: {device}")
+    print(f"Loaded {len(STOCK_SYMBOLS)} stock symbols")
+    # NOTE(review): debug=True is passed to launch(); confirm this is intended
+    # for the deployed Space and not a leftover from local development.
+    demo.launch(
+        share=False,
+        debug=True
+    )