Spaces:

optimopium
/

NER-Persian-LLM-Based

Running

App Files Files Community

optimopium committed on Nov 5

Commit

2b32561

verified ·

1 Parent(s): 4865162

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -95

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ import re
 from typing import List, Dict, Tuple
 import numpy as np
-# Set device and dtype for optimization
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@@ -31,33 +31,41 @@ ner_pipeline = pipeline(
     aggregation_strategy="simple"
 )
-# Load Gemma model for stock symbol detection
-print("Loading Gemma-2-9b-it model for context understanding...")
-gemma_model_name = "google/gemma-2-9b-it"
-# Load with optimization settings for better performance
-gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_name)
-gemma_model = AutoModelForCausalLM.from_pretrained(
-    gemma_model_name,
-    torch_dtype=dtype,
-    device_map="auto" if torch.cuda.is_available() else None,
-    low_cpu_mem_usage=True
-)
-if device == "cpu":
-    gemma_model = gemma_model.to(device)
-# Create text generation pipeline
-gemma_pipeline = pipeline(
-    "text-generation",
-    model=gemma_model,
-    tokenizer=gemma_tokenizer,
-    device=0 if device == "cuda" else -1,
-    max_new_tokens=50,
-    temperature=0.1,  # Low temperature for consistent outputs
-    do_sample=False,  # Deterministic outputs
-    pad_token_id=gemma_tokenizer.eos_token_id
-)
 # Load stock symbols from CSV
 def load_stock_symbols(csv_path="symbols.csv"):
@@ -99,65 +107,101 @@ MARKET_KEYWORDS = {
     'زیان', 'ریال', 'تومان', 'میلیارد', 'میلیون', 'درصد', 'رشد', 'افت'
 }
-def use_gemma_for_disambiguation(text: str, potential_symbol: str, symbol_info: Dict) -> float:
     """
-    Use Gemma-2-9b-it to determine if a word is used as a stock symbol
     Returns confidence score (0-1)
     """
     try:
-        # Create a focused prompt for Gemma
-        prompt = f"""<bos><start_of_turn>user
-You are a Persian financial text analyzer. Determine if the word "{potential_symbol}" in the following Persian text is used as a stock market symbol or as a regular word.
-Context information:
-- The word "{potential_symbol}" could be a stock symbol for "{symbol_info['company']}" (industry: {symbol_info['bazaar_group']})
-- Stock symbols usually appear with financial terms like: سهام، بورس، معامله، قیمت، خرید، فروش
-Text to analyze:
-"{text}"
-Answer ONLY with one of these:
-1. "STOCK" if it's used as a stock market symbol
-2. "WORD" if it's used as a regular word
-Reasoning: Consider the surrounding context. If the text discusses trading, prices, or stock market activities, it's likely a stock symbol. If it discusses the general meaning (like فولاد meaning steel in manufacturing context), it's a regular word.
-Answer:<end_of_turn>
-<start_of_turn>model
-"""
-        # Generate response
-        response = gemma_pipeline(
-            prompt,
-            max_new_tokens=20,
-            temperature=0.1,
-            do_sample=False,
-            return_full_text=False
         )
-        # Extract the answer
-        answer = response[0]['generated_text'].strip().upper()
-        # Determine confidence based on response
-        if "STOCK" in answer:
-            return 0.9  # High confidence it's a stock symbol
-        elif "WORD" in answer:
-            return 0.1  # Low confidence it's a stock symbol
         else:
-            # If unclear, analyze the response for clues
-            if any(keyword in answer.lower() for keyword in ['نماد', 'سهام', 'بورس']):
-                return 0.7
-            else:
-                return 0.3
     except Exception as e:
-        print(f"Gemma inference error: {e}")
-        return 0.5  # Neutral confidence on error
 def check_stock_symbol_context(text: str, potential_symbol: str, symbol_info: Dict) -> Tuple[bool, float]:
     """
     Check if a potential symbol is actually used as a stock symbol in context
-    Using both heuristics and Gemma model
     """
     # Get surrounding context
     symbol_pos = text.find(potential_symbol)
@@ -181,11 +225,14 @@ def check_stock_symbol_context(text: str, potential_symbol: str, symbol_info: Di
     elif market_keyword_count == 0 and len(words_in_context) > 10:
         return False, 0.05
-    # Use Gemma for disambiguation
-    gemma_score = use_gemma_for_disambiguation(context_window, potential_symbol, symbol_info)
-    # Combine scores (give more weight to Gemma as it understands context better)
-    final_score = (heuristic_score * 0.2 + gemma_score * 0.8)
     # Decision threshold
     is_stock = final_score > 0.5
@@ -195,7 +242,7 @@ def check_stock_symbol_context(text: str, potential_symbol: str, symbol_info: Di
 def find_stock_symbols_in_text(text: str) -> List[Dict]:
     """Find and validate stock symbols in text"""
     found_symbols = []
-    processed_positions = set()  # To avoid duplicate processing
     # Pattern to match Persian/Arabic words
     pattern = r'\b[\u0600-\u06FF]+\b'
@@ -206,7 +253,7 @@ def find_stock_symbols_in_text(text: str) -> List[Dict]:
         if word in SYMBOL_NAMES and match.start() not in processed_positions:
             symbol_info = STOCK_SYMBOLS[word]
-            # Check context using Gemma
             is_stock, confidence = check_stock_symbol_context(text, word, symbol_info)
             if is_stock:
@@ -255,13 +302,10 @@ label_names = {
 }
 def merge_overlapping_entities(entities: List[Dict], stock_entities: List[Dict]) -> List[Dict]:
-    """Merge entities, removing overlaps (stock symbols take precedence)"""
     all_entities = []
-    # Add stock entities first
     all_entities.extend(stock_entities)
-    # Add NER entities that don't overlap
     for ner_ent in entities:
         overlap = False
         for stock_ent in stock_entities:
@@ -312,7 +356,7 @@ def perform_ner(text):
         # Perform standard NER
         entities = ner_pipeline(text)
-        # Find stock symbols using Gemma
         stock_entities = find_stock_symbols_in_text(text)
         # Merge entities
@@ -379,13 +423,13 @@ with gr.Blocks(
     .rtl-text { direction: rtl; text-align: right; font-family: 'B Nazanin', Tahoma, Arial; }
     """
 ) as demo:
-    gr.Markdown("""
     # 🏦 شناسایی هوشمند موجودیت‌ها و نمادهای بورس ایران
     ## Persian Named Entity Recognition with Stock Symbol Detection
-    ### Powered by Google Gemma-2-9B-IT
     <div class="rtl-text">
-    این برنامه با استفاده از مدل قدرتمند Gemma-2-9B، متن‌های فارسی را تحلیل کرده و موجودیت‌های مختلف را شناسایی می‌کند.
     </div>
     """)
@@ -447,12 +491,9 @@ with gr.Blocks(
         | 🔷 آبی آسمانی | **درصدها** | ۲۰ درصد، ۵٪ |
         | 💚 سبز روشن | **نمادهای بورسی** | فولاد، وبملت، شپنا |
-        ## ویژگی خاص: تشخیص هوشمند با Gemma-2-9B
-        این برنامه از **مدل Gemma-2-9B** گوگل استفاده می‌کند که:
-        - درک عمیق از زبان فارسی دارد
-        - متن را به صورت کامل تحلیل می‌کند
-        - بین نماد بورسی و کلمه عادی تمایز قائل می‌شود
         **مثال:**
         - «سهام **فولاد** در بورس معامله شد» ← فولاد = نماد بورسی ✅
@@ -468,9 +509,7 @@ with gr.Blocks(
         ## مدل‌های استفاده شده:
         - **ParsBERT NER**: شناسایی موجودیت‌های عمومی
-        - **Google Gemma-2-9B-IT**: تحلیل هوشمند متن و تشخیص نمادهای بورسی
-        ⚠️ **توجه**: مدل Gemma به دلیل حجم بالا (9 میلیارد پارامتر) ممکن است کمی کندتر باشد
         </div>
         """)
@@ -498,11 +537,9 @@ if __name__ == "__main__":
     print("Starting Persian NER + Stock Symbol Detection System...")
     print(f"Using device: {device}")
     print(f"Loaded {len(STOCK_SYMBOLS)} stock symbols")
-    print("Models:")
     print("  - NER: HooshvareLab/bert-base-parsbert-ner-uncased")
-    print("  - Context Understanding: Google Gemma-2-9B-IT")
-    print("\nNote: Gemma-2-9B is a large model. First run may take time to download.")
-    print("For better performance, consider using GPU if available.")
     demo.launch(
         share=False,
         debug=True

 from typing import List, Dict, Tuple
 import numpy as np
+# Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
     aggregation_strategy="simple"
 )
# Load the model used to decide whether a word is a stock symbol in context.
print("Loading context understanding model...")
# Primary choice is a small causal LM; swap the name for a larger instruct
# model (e.g. Mistral-7B-Instruct) if more resources are available.
context_model_name = "microsoft/phi-2"  # 2.7B parameters

# Pre-declare every handle so each name exists no matter which branch below
# succeeds; callers pick the backend by checking `use_llm_model`.
context_tokenizer = None
context_model = None
classifier = None
use_llm_model = False

try:
    context_tokenizer = AutoTokenizer.from_pretrained(context_model_name, trust_remote_code=True)
    context_model = AutoModelForCausalLM.from_pretrained(
        context_model_name,
        torch_dtype=dtype,
        trust_remote_code=True,
        device_map="auto" if torch.cuda.is_available() else None
    )
    if device == "cpu":
        context_model = context_model.to(device)
    # generate() needs a pad token; reuse EOS when the tokenizer has none.
    if context_tokenizer.pad_token is None:
        context_tokenizer.pad_token = context_tokenizer.eos_token
    use_llm_model = True
    print(f"Successfully loaded {context_model_name}")
except Exception as e:
    # Deliberate best-effort: any load failure switches to the lighter
    # zero-shot classifier instead of aborting start-up.
    # Report the model that was actually attempted, not a hard-coded name.
    print(f"Could not load {context_model_name}: {e}")
    print("Falling back to zero-shot classification model...")
    # Drop anything that was only partially loaded.
    context_tokenizer = None
    context_model = None
    # `context_model_name` is reassigned on purpose: later code reads it to
    # report which context model is in use.
    context_model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
    classifier = pipeline(
        "zero-shot-classification",
        model=context_model_name,
        device=0 if device == "cuda" else -1
    )
    use_llm_model = False
 # Load stock symbols from CSV
 def load_stock_symbols(csv_path="symbols.csv"):
     'زیان', 'ریال', 'تومان', 'میلیارد', 'میلیون', 'درصد', 'رشد', 'افت'
 }
def use_phi_for_disambiguation(text: str, potential_symbol: str, symbol_info: Dict) -> float:
    """
    Ask the loaded causal LM whether a word is used as a stock symbol.

    Args:
        text: Persian text (typically a context window) containing the word.
        potential_symbol: The candidate word to classify.
        symbol_info: Symbol metadata; only the 'company' key is read here.

    Returns:
        Confidence score (0-1): 0.9 when the model answers "STOCK", 0.1 when
        it answers "WORD", and 0.5 when the answer contains neither or when
        inference raises. If no LLM is loaded (`use_llm_model` is false) the
        result of `use_zero_shot_classification` is returned instead.
    """
    if not use_llm_model:
        # No causal LM available: delegate to the zero-shot classifier.
        return use_zero_shot_classification(text, potential_symbol, symbol_info)

    try:
        # Keep the prompt short; the expected answer is a single keyword.
        prompt = f"""Analyze this Persian text and determine if "{potential_symbol}" is used as a stock market symbol.
Context: "{potential_symbol}" could be a stock symbol for {symbol_info['company']} company.
Text: {text}
Answer with only "STOCK" if it's a stock symbol, or "WORD" if it's a regular word:
Answer: """

        inputs = context_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        # Send inputs to wherever the model actually lives. With
        # device_map="auto" that placement is decided by the loader, so the
        # global `device` string is not a reliable target.
        target_device = context_model.device
        inputs = {k: v.to(target_device) for k, v in inputs.items()}

        with torch.no_grad():
            # Greedy decoding. `temperature` is intentionally not passed: it
            # only applies when sampling and would just trigger a warning.
            outputs = context_model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=context_tokenizer.eos_token_id
            )

        # generate() returns prompt + continuation; decode the new tokens only.
        prompt_length = inputs['input_ids'].shape[1]
        response = context_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        response = response.strip().upper()

        if "STOCK" in response:
            return 0.9
        if "WORD" in response:
            return 0.1
        # Neither keyword present: stay neutral so the heuristics decide.
        return 0.5
    except Exception as e:
        # Best-effort scoring: an inference failure must not break NER.
        print(f"Phi-2 inference error: {e}")
        return 0.5
def use_zero_shot_classification(text: str, potential_symbol: str, symbol_info: Dict) -> float:
    """
    Fallback: score stock-symbol usage with the zero-shot classifier.

    Args:
        text: Persian text containing the candidate word.
        potential_symbol: The candidate word to classify.
        symbol_info: Symbol metadata; only the 'company' key is read here.

    Returns:
        Confidence score (0-1) that the word is used as a stock symbol,
        computed as the summed score of the stock-symbol labels. Returns 0.5
        when the word is not found in `text` or when classification raises.
    """
    try:
        # Classify a window of up to 100 characters either side of the word.
        symbol_pos = text.find(potential_symbol)
        if symbol_pos == -1:
            return 0.5
        start = max(0, symbol_pos - 100)
        end = min(len(text), symbol_pos + len(potential_symbol) + 100)
        context_text = text[start:end]

        # Two phrasings (Persian and English) for each of the two classes.
        stock_labels = [
            f"نماد بورسی {symbol_info['company']}",
            "stock market symbol",
        ]
        word_labels = [
            f"کلمه عادی {potential_symbol}",
            "regular word",
        ]
        candidate_labels = [
            stock_labels[0],
            word_labels[0],
            stock_labels[1],
            word_labels[1],
        ]

        result = classifier(
            context_text,
            candidate_labels=candidate_labels,
            multi_label=False
        )

        # With multi_label=False the scores are normalised across all four
        # labels, so the stock probability is the sum over both stock labels.
        # Using `1 - top_score` would credit the stock class with the other
        # non-stock label's mass whenever a non-stock label ranks first.
        stock_label_set = set(stock_labels)
        stock_probability = sum(
            score
            for label, score in zip(result['labels'], result['scores'])
            if label in stock_label_set
        )
        # Guard against floating-point drift just outside [0, 1].
        return min(1.0, max(0.0, float(stock_probability)))
    except Exception as e:
        # Best-effort scoring: fall back to a neutral score on any failure.
        print(f"Classification error: {e}")
        return 0.5
 def check_stock_symbol_context(text: str, potential_symbol: str, symbol_info: Dict) -> Tuple[bool, float]:
     """
     Check if a potential symbol is actually used as a stock symbol in context
     """
     # Get surrounding context
     symbol_pos = text.find(potential_symbol)
     elif market_keyword_count == 0 and len(words_in_context) > 10:
         return False, 0.05
+    # Use AI model for disambiguation
+    if use_llm_model:
+        ai_score = use_phi_for_disambiguation(context_window, potential_symbol, symbol_info)
+    else:
+        ai_score = use_zero_shot_classification(context_window, potential_symbol, symbol_info)
+    # Combine scores
+    final_score = (heuristic_score * 0.3 + ai_score * 0.7)
     # Decision threshold
     is_stock = final_score > 0.5
 def find_stock_symbols_in_text(text: str) -> List[Dict]:
     """Find and validate stock symbols in text"""
     found_symbols = []
+    processed_positions = set()
     # Pattern to match Persian/Arabic words
     pattern = r'\b[\u0600-\u06FF]+\b'
         if word in SYMBOL_NAMES and match.start() not in processed_positions:
             symbol_info = STOCK_SYMBOLS[word]
+            # Check context
             is_stock, confidence = check_stock_symbol_context(text, word, symbol_info)
             if is_stock:
 }
 def merge_overlapping_entities(entities: List[Dict], stock_entities: List[Dict]) -> List[Dict]:
+    """Merge entities, removing overlaps"""
     all_entities = []
     all_entities.extend(stock_entities)
     for ner_ent in entities:
         overlap = False
         for stock_ent in stock_entities:
         # Perform standard NER
         entities = ner_pipeline(text)
+        # Find stock symbols
         stock_entities = find_stock_symbols_in_text(text)
         # Merge entities
     .rtl-text { direction: rtl; text-align: right; font-family: 'B Nazanin', Tahoma, Arial; }
     """
 ) as demo:
+    gr.Markdown(f"""
     # 🏦 شناسایی هوشمند موجودیت‌ها و نمادهای بورس ایران
     ## Persian Named Entity Recognition with Stock Symbol Detection
+    ### Using {context_model_name.split('/')[-1]} for Context Understanding
     <div class="rtl-text">
+    این برنامه متن‌های فارسی را تحلیل کرده و موجودیت‌های مختلف را شناسایی می‌کند.
     </div>
     """)
         | 🔷 آبی آسمانی | **درصدها** | ۲۰ درصد، ۵٪ |
         | 💚 سبز روشن | **نمادهای بورسی** | فولاد، وبملت، شپنا |
+        ## ویژگی خاص: تشخیص هوشمند نمادهای بورسی
+        برنامه با استفاده از **هوش مصنوعی** تشخیص می‌دهد که آیا یک کلمه نماد بورسی است یا خیر.
         **مثال:**
         - «سهام **فولاد** در بورس معامله شد» ← فولاد = نماد بورسی ✅
         ## مدل‌های استفاده شده:
         - **ParsBERT NER**: شناسایی موجودیت‌های عمومی
+        - **Microsoft Phi-2 / mDeBERTa**: تحلیل هوشمند متن برای تشخیص نمادهای بورسی
         </div>
         """)
     print("Starting Persian NER + Stock Symbol Detection System...")
     print(f"Using device: {device}")
     print(f"Loaded {len(STOCK_SYMBOLS)} stock symbols")
+    print("Models loaded:")
     print("  - NER: HooshvareLab/bert-base-parsbert-ner-uncased")
+    print(f"  - Context: {context_model_name}")
     demo.launch(
         share=False,
         debug=True