Spaces:

leilaghomashchi
/

Data-anonymization

Sleeping

App Files Files Community

leilaghomashchi committed on Sep 9

Commit

993f2ac

verified ·

1 Parent(s): 2a0ebc7

Upload fixed_anonymizer (1).py

Browse files

Files changed (1) hide show

fixed_anonymizer (1).py +1107 -0

fixed_anonymizer (1).py ADDED Viewed

	@@ -0,0 +1,1107 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Enhanced Multi-Modal Data Anonymization System - Fixed Version
+=============================================================
+Fixed NER model loading + Optimized for Persian & English Support
+"""
+import gradio as gr
+import re
+import os
+import requests
+import time
+import logging
+from typing import List, Dict, Tuple, Optional, Set
+import warnings
+import subprocess
+import sys
+def install_requirements():
+    """نصب اجباری وابستگی‌ها"""
+    try:
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers>=4.30.0"])
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "torch"])
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "tokenizers>=0.13.0"])
+        print("✅ Dependencies installed successfully")
+    except Exception as e:
+        print(f"❌ Failed to install dependencies: {e}")
+# Install the dependencies if they are missing
+try:
+    import transformers
+    print("✅ Transformers already available")
+except ImportError:
+    # First import failed: attempt a pip install before the real import below.
+    print("📦 Installing transformers...")
+    install_requirements()
+# Enhanced dependencies with better error handling
+# TRANSFORMERS_AVAILABLE is read by the anonymizer to choose between NER and regex-only mode.
+TRANSFORMERS_AVAILABLE = False
+try:
+    print("🔄 Attempting to import transformers...")
+    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+    TRANSFORMERS_AVAILABLE = True
+    print("✅ Transformers library loaded successfully")
+except ImportError as e:
+    print(f"⚠️ Transformers import failed: {e}")
+    print("🔍 Falling back to regex-only mode")
+    TRANSFORMERS_AVAILABLE = False
+except Exception as e:
+    # Any non-ImportError failure while importing also disables NER.
+    print(f"❌ Unexpected error loading transformers: {e}")
+    TRANSFORMERS_AVAILABLE = False
+# Suppresses every warning category for the whole process.
+warnings.filterwarnings('ignore')
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class EnhancedDataAnonymizer:
+    def __init__(self):
+        self.mapping_table = {}
+        self.counters = {}
+        self.api_key = os.getenv("OPENAI_API_KEY", "")
+        # Processing modes
+        self.processing_modes = {
+            'regex_only': 'Pure Regex (Fast & Compatible)',
+            'hybrid': 'Regex + NER (Recommended)',
+            'ner_priority': 'NER Priority + Regex Backup (Highest Accuracy)'
+        }
+        # Model components
+        self.ner_pipeline = None
+        self.model_status = "Initializing..."
+        self.model_ready = False
+        # Initialize model with improved error handling
+        self.initialize_ner_model_safe()
+        # Pattern categories (همان کد قبلی)
+        self.pattern_categories = {
+            'personal_identity': {
+                'name_fa': 'اطلاعات شخصی و هویتی',
+                'name_en': 'Personal & Identity Information',
+                'patterns': ['PERSON', 'MIXED_NAMES', 'ID_NUMBER', 'ENGLISH_TITLES'],
+                'icon': '👤'
+            },
+            'financial': {
+                'name_fa': 'اطلاعات مالی',
+                'name_en': 'Financial Information',
+                'patterns': ['AMOUNT', 'INTERNATIONAL_CURRENCIES', 'ACCOUNT', 'FINANCIAL_TERMS', 'STOCK_SYMBOL'],
+                'icon': '💰'
+            },
+            'temporal': {
+                'name_fa': 'اطلاعات زمانی',
+                'name_en': 'Temporal Information',
+                'patterns': ['DATE', 'ADVANCED_DATE_FORMATS', 'TIME_RANGES'],
+                'icon': '📅'
+            },
+            'location': {
+                'name_fa': 'اطلاعات مکانی',
+                'name_en': 'Location Information',
+                'patterns': ['LOCATION', 'COMPLEX_ADDRESSES'],
+                'icon': '📍'
+            },
+            'technical': {
+                'name_fa': 'اطلاعات فنی و تکنولوژیکی',
+                'name_en': 'Technical & Technological',
+                'patterns': ['TECHNICAL_CODES', 'NETWORK_ADDRESSES', 'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS'],
+                'icon': '⚙️'
+            },
+            'business': {
+                'name_fa': 'اطلاعات کسب‌وکار',
+                'name_en': 'Business Information',
+                'patterns': ['COMPANY', 'BUSINESS_TERMS', 'PRODUCT', 'PETROCHEMICAL'],
+                'icon': '🏢'
+            },
+            'quantity': {
+                'name_fa': 'اطلاعات کمیت و واحد',
+                'name_en': 'Quantity & Unit Information',
+                'patterns': ['PERCENTAGE', 'VOLUME', 'RATIOS'],
+                'icon': '📊'
+            },
+            'communication': {
+                'name_fa': 'اطلاعات ارتباطی',
+                'name_en': 'Communication Information',
+                'patterns': ['PHONE', 'EMAIL'],
+                'icon': '📞'
+            }
+        }
+        # Initialize counters
+        self.reset_counters()
+    def initialize_ner_model_safe(self):
+        """بارگذاری ایمن مدل NER با پشتیبانی فارسی و انگلیسی"""
+        print("🔄 Starting multilingual NER model initialization...")
+        if not TRANSFORMERS_AVAILABLE:
+            self.model_status = "⚠️ Transformers library not available - Using Regex only"
+            self.model_ready = False
+            print("🔍 Transformers not available, continuing with regex patterns only")
+            return
+        try:
+            print("🤖 Attempting to load multilingual NER models...")
+            # مدل‌های چندزبانه با پشتیبانی فارسی و انگلیسی
+            model_configs = [
+                {
+                    'name': 'xlm-roberta-base',
+                    'task': 'ner',
+                    'languages': 'Multilingual (FA+EN+98 others)',
+                    'priority': 1
+                },
+                {
+                    'name': 'microsoft/mdeberta-v3-base',
+                    'task': 'ner',
+                    'languages': 'Multilingual (FA+EN)',
+                    'priority': 2
+                },
+                {
+                    'name': 'distilbert-base-multilingual-cased',
+                    'task': 'ner',
+                    'languages': 'Multilingual',
+                    'priority': 3
+                }
+            ]
+            for config in model_configs:
+                try:
+                    model_name = config['name']
+                    print(f"🔄 Trying {model_name} ({config['languages']})...")
+                    # تنظیم ساده pipeline بدون tokenizer_kwargs
+                    self.ner_pipeline = pipeline(
+                        "ner",
+                        model=model_name,
+                        aggregation_strategy="simple",
+                        device=-1  # Force CPU usage
+                    )
+                    # تست مدل با متن فارسی و انگلیسی
+                    test_texts = [
+                        "Hello John Smith from New York.",
+                        "سلام آقای احمد رضایی از تهران."
+                    ]
+                    test_passed = True
+                    for test_text in test_texts:
+                        try:
+                            test_result = self.ner_pipeline(test_text)
+                            print(f"✅ Test passed for: {test_text[:20]}...")
+                        except Exception as test_error:
+                            print(f"❌ Test failed for {test_text[:20]}: {test_error}")
+                            test_passed = False
+                            break
+                    if test_passed:
+                        self.model_status = f"✅ {model_name} loaded successfully ({config['languages']})"
+                        self.model_ready = True
+                        print(f"🎉 Successfully loaded multilingual model: {model_name}")
+                        return
+                    else:
+                        print(f"❌ Model {model_name} failed language tests")
+                        continue
+                except Exception as model_error:
+                    print(f"❌ Failed to load {model_name}: {str(model_error)[:100]}")
+                    continue
+            # اگر همه مدل‌ها ناکام بودند
+            raise Exception("All multilingual NER model loading attempts failed")
+        except Exception as e:
+            error_msg = str(e)[:100]
+            print(f"❌ Multilingual NER model loading completely failed: {error_msg}")
+            self.model_status = "❌ NER Model loading failed - Using advanced Regex only"
+            self.model_ready = False
+            self.ner_pipeline = None
+    def reset_counters(self):
+        """ریست کانترها"""
+        pattern_types = []
+        for category in self.pattern_categories.values():
+            pattern_types.extend(category['patterns'])
+        self.counters = {pattern: 0 for pattern in pattern_types}
+    def detect_language(self, text):
+        """تشخیص زبان متن"""
+        if not text:
+            return 'fa'
+        persian_chars = len(re.findall(r'[\u0600-\u06FF]', text))
+        english_chars = len(re.findall(r'[a-zA-Z]', text))
+        total = persian_chars + english_chars
+        if total == 0:
+            return 'fa'
+        if persian_chars / total > 0.6:
+            return 'fa'
+        elif english_chars / total > 0.6:
+            return 'en'
+        else:
+            return 'mixed'
+    def get_comprehensive_patterns(self):
+        """Return the regex table used for anonymization.
+        Returns a dict mapping a pattern-type name (e.g. 'PERSON', 'EMAIL') to a
+        list of regex source strings. Patterns with a capture group capture the
+        entity itself; patterns without one match the entity as a whole. Many
+        digit classes accept both Persian (۰-۹) and ASCII (0-9) digits.
+        """
+        return {
+            # Names after an honorific (Persian or English) or before a job title.
+            'PERSON': [
+                r'آقای\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
+                r'خانم\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
+                r'مهندس\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
+                r'دکتر\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
+                r'استاد\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
+                r'Mr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
+                r'Ms\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
+                r'Dr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
+                r'([آ-یa-zA-Z]+\s+[آ-یa-zA-Z]+)(?:، مدیرعامل|\s+مدیرعامل|\s+رئیس)',
+            ],
+            # Very broad: the first pattern matches any two adjacent words of 2+ letters.
+            'MIXED_NAMES': [
+                r'([آ-یa-zA-Z]{2,}\s+[آ-یa-zA-Z]{2,})',
+                r'([A-Z][a-z]+-[A-Z][a-z]+)',
+                r"([A-Z]'[A-Z][a-z]+)",
+            ],
+            # "IR" + 24 digits, 10-digit numbers with optional keywords,
+            # letter + 8 digits, and SSN-style 3-2-4 digit groups.
+            'ID_NUMBER': [
+                r'IR[۰-۹0-9]{24}',
+                r'شبا[\s:]*IR[۰-۹0-9]{24}',
+                r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
+                r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
+                r'National[\s]*(?:ID[\s:]*)?[0-9]{10}',
+                r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}',
+                r'SSN[\s:]*[0-9]{3}-[0-9]{2}-[0-9]{4}',
+            ],
+            # Monetary amounts in toman, rial, dollars and euros.
+            'AMOUNT': [
+                r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
+                r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
+                r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
+                r'€\d+(?:,\d{3})*(?:\.\d+)?',
+                r'\d+(?:,\d{3})*\s*ریال',
+            ],
+            # Phone numbers, with optional Persian keyword prefixes.
+            'PHONE': [
+                r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
+                r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
+                r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
+                r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}',
+                r'[۰-۹0-9]{11}(?!\d)',
+                r'(?:\+98|0098)?[۰-۹0-9]{10}',
+                r'\+[0-9]{1,3}-[0-9]{3}-[0-9]{3}-[0-9]{4}(?:\s+ext\.\s+[0-9]{3,4})?',
+                r'\([0-9]{3}\)\s+[0-9]{3}-[0-9]{4}'
+            ],
+            # E-mail addresses, bare or preceded by a Persian/English "email" keyword.
+            'EMAIL': [
+                r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
+                r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
+                r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
+            ],
+            # Numeric dates (year-first or year-last) and dates with a Persian or English month name.
+            'DATE': [
+                r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
+                r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
+                r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
+                r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}',
+            ],
+            # Fixed lists of place names, plus text following the Persian words "استان" / "شهر".
+            'LOCATION': [
+                r'(تهران|اصفهان|ماهشهر|عسلویه|بندرعباس|اهواز|شیراز|مشهد|تبریز|کرج|قم|رشت|کرمان|یزد|زاهدان|بوشهر|خرمشهر|آبادان|اراک|قزوین)',
+                r'استان\s+([آ-ی\s]+)',
+                r'شهر\s+([آ-ی\s]+)',
+                r'(ایران|عراق|کویت|عربستان|امارات|قطر|عمان|بحرین|ترکیه|پاکستان|افغانستان)',
+                r'(London|Paris|Tokyo|New\s+York|Dubai|Singapore|Hong\s+Kong|Shanghai|Mumbai|Frankfurt|Amsterdam)'
+            ],
+            # The Persian word "شرکت" in several contexts, "بانک ..." names,
+            # and English names ending in a corporate suffix.
+            'COMPANY': [
+                r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به)',
+                r'([آ-یa-zA-Z\s]+)\s+شرکت',
+                r'این\s+شرکت(?=\s|$|،|\.)',
+                r'(بانک\s+[آ-یa-zA-Z\s]+)',
+                r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
+            ],
+            # Percentages written with "درصد" or "%".
+            'PERCENTAGE': [
+                r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایین‌تر)?',
+                r'\d+(?:\.\d+)?\s*%',
+                r'معادل\s+\d+(?:\.\d+)?\s*درصد',
+                r'حدود\s+\d+(?:\.\d+)?\s*درصد',
+                r'\d+(?:\.\d+)?\%\s*(?:increase|decrease|growth|improvement)',
+            ],
+            # Account numbers: 8-20 digits, optionally preceded by short digit groups and keywords.
+            'ACCOUNT': [
+                r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
+                r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
+                r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}',
+            ]
+        }
+    def extract_entities_with_ner(self, text: str, confidence_threshold: float = 0.75) -> List[Dict]:
+        """استخراج موجودیت‌ها با مدل NER چندزبانه"""
+        if not self.model_ready or not self.ner_pipeline:
+            return []
+        try:
+            # تقسیم متن برای مدیریت بهتر
+            max_length = 400
+            if len(text) > max_length:
+                chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+            else:
+                chunks = [text]
+            all_entities = []
+            char_offset = 0
+            for chunk in chunks:
+                try:
+                    # Process chunk with NER model
+                    ner_results = self.ner_pipeline(chunk)
+                    for entity in ner_results:
+                        if entity['score'] >= confidence_threshold:
+                            # Clean entity text
+                            entity_text = entity['word'].replace('##', '').strip()
+                            if len(entity_text) >= 2:  # Minimum length filter
+                                all_entities.append({
+                                    'text': entity_text,
+                                    'label': entity['entity_group'],
+                                    'confidence': entity['score'],
+                                    'start': entity['start'] + char_offset,
+                                    'end': entity['end'] + char_offset,
+                                    'source': 'ner'
+                                })
+                except Exception as chunk_error:
+                    logger.error(f"Error processing chunk: {chunk_error}")
+                    continue
+                char_offset += len(chunk)
+            return all_entities
+        except Exception as e:
+            logger.error(f"Error in multilingual NER extraction: {e}")
+            return []
+    def extract_entities_with_regex(self, text: str, selected_categories: List[str] = None) -> List[Dict]:
+        """استخراج موجودیت‌ها با Regex - همان کد قبلی"""
+        entities = []
+        all_patterns = self.get_comprehensive_patterns()
+        # Filter patterns based on selected categories
+        if selected_categories:
+            selected_pattern_types = self.get_selected_patterns(selected_categories, 'fa')
+            patterns = {k: v for k, v in all_patterns.items() if k in selected_pattern_types}
+        else:
+            patterns = all_patterns
+        processed_positions = set()
+        # Process patterns with priority
+        priority_order = [
+            'ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT',
+            'AMOUNT', 'DATE', 'LOCATION', 'COMPANY', 'PERSON'
+        ]
+        for category in priority_order:
+            if category in patterns:
+                pattern_list = patterns[category]
+                for pattern in pattern_list:
+                    try:
+                        matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
+                        for match in matches:
+                            if match.groups():
+                                entity_text = match.group(1).strip()
+                            else:
+                                entity_text = match.group(0).strip()
+                            # Check for overlaps
+                            match_start, match_end = match.span()
+                            overlaps = any(
+                                not (match_end <= pos_start or match_start >= pos_end)
+                                for pos_start, pos_end in processed_positions
+                            )
+                            if (not overlaps and len(entity_text) >= 2):
+                                entities.append({
+                                    'text': entity_text,
+                                    'category': category,
+                                    'start': match_start,
+                                    'end': match_end,
+                                    'confidence': 0.9,
+                                    'source': 'regex'
+                                })
+                                processed_positions.add((match_start, match_end))
+                    except re.error as e:
+                        logger.error(f"Regex error in pattern {pattern}: {e}")
+                        continue
+        return entities
+    def map_ner_to_categories(self, ner_label: str) -> str:
+        """نگاشت برچسب‌های NER به دسته‌های سیستم"""
+        mapping = {
+            'PER': 'PERSON',
+            'PERSON': 'PERSON',
+            'ORG': 'COMPANY',
+            'ORGANIZATION': 'COMPANY',
+            'LOC': 'LOCATION',
+            'LOCATION': 'LOCATION',
+            'MISC': 'MIXED_NAMES',
+            'GPE': 'LOCATION',
+            'MONEY': 'AMOUNT',
+            'DATE': 'DATE',
+            'TIME': 'DATE'
+        }
+        return mapping.get(ner_label.upper(), 'MIXED_NAMES')
+    def fuse_entities(self, regex_entities: List[Dict], ner_entities: List[Dict],
+                     processing_mode: str) -> List[Dict]:
+        """ترکیب هوشمندانه نتایج Regex و NER"""
+        if processing_mode == 'regex_only' or not self.model_ready:
+            return regex_entities
+        final_entities = []
+        processed_positions = set()
+        if processing_mode == 'hybrid':
+            # Regex priority for specific patterns
+            priority_categories = ['PHONE', 'EMAIL', 'ID_NUMBER', 'ACCOUNT', 'AMOUNT']
+            # Add high-priority regex entities first
+            for entity in regex_entities:
+                if entity['category'] in priority_categories:
+                    final_entities.append(entity)
+                    processed_positions.add((entity['start'], entity['end']))
+            # Add NER entities for names and organizations
+            for entity in ner_entities:
+                if not self.has_overlap(entity, processed_positions):
+                    category = self.map_ner_to_categories(entity['label'])
+                    entity_copy = entity.copy()
+                    entity_copy['category'] = category
+                    final_entities.append(entity_copy)
+                    processed_positions.add((entity['start'], entity['end']))
+            # Add remaining regex entities
+            for entity in regex_entities:
+                if (entity['category'] not in priority_categories and
+                    not self.has_overlap(entity, processed_positions)):
+                    final_entities.append(entity)
+                    processed_positions.add((entity['start'], entity['end']))
+        elif processing_mode == 'ner_priority':
+            # NER takes priority, regex as backup
+            for entity in ner_entities:
+                category = self.map_ner_to_categories(entity['label'])
+                entity_copy = entity.copy()
+                entity_copy['category'] = category
+                final_entities.append(entity_copy)
+                processed_positions.add((entity['start'], entity['end']))
+            # Add non-overlapping regex entities
+            for entity in regex_entities:
+                if not self.has_overlap(entity, processed_positions):
+                    final_entities.append(entity)
+                    processed_positions.add((entity['start'], entity['end']))
+        return final_entities
+    def has_overlap(self, entity: Dict, processed_positions: Set[Tuple[int, int]]) -> bool:
+        """بررسی تداخل موقعیت entities"""
+        entity_start, entity_end = entity['start'], entity['end']
+        for start, end in processed_positions:
+            if not (entity_end <= start or entity_start >= end):
+                return True
+        return False
+    def get_selected_patterns(self, selected_categories: List[str], language: str = 'fa') -> List[str]:
+        """تبدیل دسته‌بندی‌های انتخاب شده به لیست الگوها"""
+        selected_patterns = []
+        for cat_key, cat_info in self.pattern_categories.items():
+            name = cat_info['name_fa'] if language == 'fa' else cat_info['name_en']
+            icon = cat_info['icon']
+            category_display = f"{icon} {name}"
+            if category_display in selected_categories:
+                selected_patterns.extend(cat_info['patterns'])
+        return selected_patterns
+    def get_category_choices(self, language='fa'):
+        """دریافت لیست دسته‌بندی‌ها برای چک‌باکس"""
+        choices = []
+        for cat_key, cat_info in self.pattern_categories.items():
+            name = cat_info['name_fa'] if language == 'fa'else cat_info['name_en']
+            icon = cat_info['icon']
+            choices.append(f"{icon} {name}")
+        return choices
+    def anonymize_text_enhanced(self, original_text: str, lang: str = 'fa',
+                               selected_categories: List[str] = None,
+                               processing_mode: str = 'hybrid') -> str:
+        """ناشناس‌سازی پیشرفته با ترکیب Regex + NER"""
+        try:
+            if not original_text or not original_text.strip():
+                return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
+            # Force regex_only if model not ready
+            if not self.model_ready and processing_mode != 'regex_only':
+                processing_mode = 'regex_only'
+                print(f"🔄 Forced to regex_only mode because model not ready")
+            # Reset
+            self.mapping_table = {}
+            self.reset_counters()
+            # Extract entities with regex
+            regex_entities = self.extract_entities_with_regex(original_text, selected_categories)
+            # Extract entities with NER (if available)
+            ner_entities = []
+            if processing_mode != 'regex_only' and self.model_ready:
+                ner_raw = self.extract_entities_with_ner(original_text)
+                # Convert to standard format
+                for entity in ner_raw:
+                    ner_entities.append({
+                        'text': entity['text'],
+                        'category': self.map_ner_to_categories(entity['label']),
+                        'start': entity['start'],
+                        'end': entity['end'],
+                        'confidence': entity['confidence'],
+                        'source': 'ner'
+                    })
+            # Fuse entities
+            final_entities = self.fuse_entities(regex_entities, ner_entities, processing_mode)
+            # Create anonymization mapping
+            anonymized = original_text
+            found_entities = set()
+            # Sort by length (longer first to avoid partial replacements)
+            final_entities.sort(key=lambda x: len(x['text']), reverse=True)
+            for entity in final_entities:
+                entity_text = entity['text'].strip()
+                category = entity['category']
+                if (entity_text not in found_entities and
+                    entity_text not in self.mapping_table and
+                    len(entity_text) >= 2):
+                    # Generate unique code
+                    if category not in self.counters:
+                        self.counters[category] = 0
+                    self.counters[category] += 1
+                    # Add source indicator
+                    if processing_mode == 'regex_only':
+                        source_suffix = "REG"
+                    elif processing_mode == 'hybrid':
+                        source_suffix = "HYB" if self.model_ready else "REG"
+                    else:
+                        source_suffix = "ENH" if self.model_ready else "REG"
+                    code = f"{category}_{self.counters[category]:03d}_{source_suffix}"
+                    self.mapping_table[entity_text] = code
+                    found_entities.add(entity_text)
+            # Apply anonymization
+            sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
+            for original_item, code in sorted_items:
+                anonymized = anonymized.replace(original_item, code)
+            # Statistics
+            regex_count = len(regex_entities)
+            ner_count = len(ner_entities)
+            final_count = len(final_entities)
+            logger.info(f"✅ Enhanced multilingual anonymization completed. Mode: {processing_mode}")
+            logger.info(f"📊 Regex: {regex_count}, NER: {ner_count}, Final: {final_count}")
+            return anonymized
+        except Exception as e:
+            logger.error(f"Enhanced anonymization error: {e}")
+            return f"❌ Error in enhanced anonymization: {str(e)}"
+    def send_to_chatgpt(self, anonymized_text, lang='fa'):
+        """گام 2: ارسال به ChatGPT"""
+        try:
+            if not anonymized_text or not anonymized_text.strip():
+                return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناس‌شده خالی است!"
+            if not self.api_key:
+                return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable." if lang == 'en' else "❌ کلید API تنظیم نشده است!"
+            system_msg = "You are a professional analyst. Answer questions accurately." if lang == 'en' else "شما یک تحلیلگر حرفه‌ای هستید. به سوالات با دقت پاسخ دهید."
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json"
+            }
+            data = {
+                "model": "gpt-4o-mini",
+                "messages": [
+                    {"role": "system", "content": system_msg},
+                    {"role": "user", "content": anonymized_text}
+                ],
+                "max_tokens": 2000,
+                "temperature": 0.7
+            }
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=data,
+                timeout=15
+            )
+            if response.status_code == 200:
+                result = response.json()
+                return result['choices'][0]['message']['content']
+            else:
+                error_data = response.json() if response.content else {}
+                error_message = error_data.get('error', {}).get('message', response.text)
+                return f"❌ API Error: {error_message}"
+        except Exception as e:
+            return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"
+    def deanonymize_response(self, gpt_response, lang='fa'):
+        """گام 3: بازگردانی"""
+        try:
+            if not gpt_response or not gpt_response.strip():
+                return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!"
+            if not self.mapping_table:
+                return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"
+            final_result = gpt_response
+            reverse_mapping = {code: original for original, code in self.mapping_table.items()}
+            sorted_codes = sorted(reverse_mapping.items(), key=lambda x: len(x[0]), reverse=True)
+            for code, original in sorted_codes:
+                final_result = final_result.replace(code, original)
+            return final_result
+        except Exception as e:
+            return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}"
+    def get_model_status(self):
+        """وضعیت سیستم"""
+        status = "🚀 **Enhanced Multilingual Anonymization System Status:**\n\n"
+        status += f"🤖 **NER Model Status**: {self.model_status}\n"
+        status += f"🔍 **Regex Patterns**: ✅ 50+ comprehensive patterns loaded\n"
+        status += f"🌐 **Language Support**: 🇮🇷 Persian + 🇺🇸 English + Mixed\n"
+        status += f"🐍 **Python Version**: {sys.version.split()[0]}\n"
+        status += f"📦 **Transformers Available**: {'✅ Yes' if TRANSFORMERS_AVAILABLE else '❌ No'}\n\n"
+        if self.model_ready:
+            status += "🎯 **Available Processing Modes:**\n"
+            status += "   • 🔥 Hybrid (Recommended): Regex priority + NER enhancement\n"
+            status += "   • 🎯 NER Priority: Multilingual NER + Regex backup\n"
+            status += "   • ⚡ Regex Only: High-speed pattern matching\n\n"
+            status += "📈 **Expected Accuracy:**\n"
+            status += "   • Regex Only: 70-75%\n"
+            status += "   • Hybrid Mode (FA+EN): 85-92%\n"
+            status += "   • NER Priority (FA+EN): 88-95%\n\n"
+        else:
+            status += "⚠️ **Current Mode: Advanced Regex Only**\n"
+            status += "   • Enhanced Regex processing (70-75% accuracy)\n"
+            if not TRANSFORMERS_AVAILABLE:
+                status += "   • Install transformers for multilingual NER support\n"
+                status += "   • pip install transformers torch\n"
+            status += "\n"
+        status += f"🎯 **Pattern Categories**: {len(self.pattern_categories)} categories available\n"
+        status += f"🔧 **Configuration**: User-controlled category selection\n"
+        status += f"🛡️ **Privacy**: Local processing with optional ChatGPT integration\n"
+        if TRANSFORMERS_AVAILABLE and self.model_ready:
+            status += f"✅ **Multilingual NER**: Ready for Persian + English processing\n"
+        else:
+            status += f"❌ **Multilingual NER**: Not available - Using advanced Regex patterns\n"
+        return status
+# Initialize the enhanced anonymizer
+# Module-level instance used by the Gradio callbacks below; constructing it
+# runs initialize_ner_model_safe(), so model loading happens at import time.
+print("🔄 Initializing Enhanced Multilingual Data Anonymizer...")
+anonymizer = EnhancedDataAnonymizer()
+print(f"✅ Anonymizer initialized with status: {anonymizer.model_status}")
+# The remaining Gradio functions are the same as in the previous code...
+def process_all_steps_enhanced(input_text, language, selected_categories, processing_mode):
+    """پردازش خودکار تمام مراحل - نسخه پیشرفته"""
+    lang = 'en' if language == 'English' else 'fa'
+    if not input_text.strip():
+        error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
+        return error_msg, "", "", ""
+    try:
+        start_time = time.time()
+        # Enhanced anonymization
+        anonymized_text = anonymizer.anonymize_text_enhanced(
+            input_text, lang, selected_categories, processing_mode
+        )
+        if anonymized_text.startswith("❌"):
+            return anonymized_text, "", "", ""
+        # ChatGPT processing
+        gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
+        if gpt_response.startswith("❌"):
+            entities_found = len(anonymizer.mapping_table)
+            success_msg = (f"✅ Enhanced multilingual anonymization completed successfully!\n"
+                          f"🎯 Processing mode: {processing_mode}\n"
+                          f"📊 Protected entities: {entities_found}")
+            return success_msg, anonymized_text, gpt_response, ""
+        # Deanonymization
+        final_result = anonymizer.deanonymize_response(gpt_response, lang)
+        total_time = time.time() - start_time
+        entities_found = len(anonymizer.mapping_table)
+        model_indicator = 'Multilingual NER + Regex' if anonymizer.model_ready else 'Advanced Regex Only'
+        success_msg = (f"🎉 Complete multilingual anonymization & restoration successful!\n"
+                      f"🎯 Mode: {processing_mode} | 📊 Entities: {entities_found}\n"
+                      f"⏱️ Time: {total_time:.2f}s | 🤖 Engine: {model_indicator}")
+        return success_msg, anonymized_text, gpt_response, final_result
+    except Exception as e:
+        error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
+        return error_msg, "", "", ""
+def get_mapping_table_enhanced(language):
+    """نمایش جدول نگاشت پیشرفته"""
+    lang = 'en' if language == 'English' else 'fa'
+    if not anonymizer.mapping_table:
+        return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"
+    result = "📋 **Enhanced Multilingual Mapping Table:**\n\n"
+    result += f"📊 **Statistics**: {len(anonymizer.mapping_table)} total entities\n"
+    result += f"🎯 **Method**: {'Multilingual NER + Regex' if anonymizer.model_ready else 'Advanced Regex Only'}\n"
+    result += f"🤖 **Model Status**: {anonymizer.model_status}\n\n"
+    # Group by category
+    category_stats = {}
+    for original, code in anonymizer.mapping_table.items():
+        category = code.split('_')[0]
+        if category not in category_stats:
+            category_stats[category] = []
+        category_stats[category].append((original, code))
+    # Display results by category
+    for category, items in category_stats.items():
+        if len(items) > 0:
+            result += f"🔍 **{category}** ({len(items)} items):\n"
+            for original, code in items[:3]:
+                source_indicator = "🧠" if any(x in code for x in ["HYB", "ENH"]) else "🔤"
+                result += f"   {source_indicator} `{original}` → `{code}`\n"
+            if len(items) > 3:
+                result += f"   ... و {len(items) - 3} مورد دیگر\n"
+            result += "\n"
+    result += f"🔥 **Enhanced Multilingual System**: Advanced Persian + English NER + Regex patterns!"
+    return result
+def clear_all_enhanced():
+    """پاک کردن همه - نسخه پیشرفته"""
+    anonymizer.mapping_table = {}
+    anonymizer.reset_counters()
+    return "", "", "", "", ""
+# Enhanced CSS (same as the previous version); passed to gr.Blocks(css=...) below
+enhanced_css = """
+body, .gradio-container {
+    font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    min-height: 100vh !important;
+    padding: 20px !important;
+}
+.enhanced-header {
+    background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
+    border-radius: 20px !important;
+    padding: 20px !important;
+    margin-bottom: 20px !important;
+    text-align: center !important;
+    box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
+}
+.mode-selector {
+    background: linear-gradient(135deg, #74b9ff, #0984e3) !important;
+    border-radius: 15px !important;
+    padding: 20px !important;
+    margin: 15px 0 !important;
+    box-shadow: 0 8px 25px rgba(116, 185, 255, 0.3) !important;
+}
+.model-status {
+    background: linear-gradient(135deg, #00b894, #00a085) !important;
+    border-radius: 15px !important;
+    padding: 15px !important;
+    margin: 15px 0 !important;
+    color: white !important;
+    font-weight: bold !important;
+    text-align: center !important;
+    box-shadow: 0 6px 20px rgba(0, 184, 148, 0.4) !important;
+}
+.rtl {
+    direction: rtl !important;
+    text-align: right !important;
+}
+.ltr {
+    direction: ltr !important;
+    text-align: left !important;
+}
+.workflow {
+    display: grid !important;
+    grid-template-columns: 1fr 1fr 1fr 1fr !important;
+    gap: 25px !important;
+    padding: 30px !important;
+    align-items: start !important;
+    background: rgba(255, 255, 255, 0.1) !important;
+    border-radius: 20px !important;
+    backdrop-filter: blur(10px) !important;
+}
+.gradio-textbox {
+    border-radius: 10px !important;
+    box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
+    min-height: 380px !important;
+    max-height: 380px !important;
+    height: 380px !important;
+}
+.gradio-button {
+    border-radius: 25px !important;
+    font-weight: bold !important;
+    transition: all 0.3s ease !important;
+    margin: 5px 0 !important;
+    min-height: 50px !important;
+    background: linear-gradient(45deg, #667eea, #764ba2) !important;
+    border: none !important;
+    color: white !important;
+}
+.gradio-button:hover {
+    transform: translateY(-2px) !important;
+    box-shadow: 0 8px 25px rgba(0,0,0,0.3) !important;
+    background: linear-gradient(45deg, #764ba2, #667eea) !important;
+}
+@media (max-width: 1200px) {
+    .workflow {
+        grid-template-columns: 1fr 1fr !important;
+    }
+}
+@media (max-width: 768px) {
+    .workflow {
+        grid-template-columns: 1fr !important;
+    }
+}
+"""
+# Main Gradio Interface - کد ادامه دارد...
+with gr.Blocks(title="🚀 Enhanced Multilingual Anonymization", theme=gr.themes.Soft(), css=enhanced_css) as app:
+    # Header
+    with gr.Row():
+        gr.HTML("""
+        <div class="enhanced-header">
+            <h1 style='color: white; font-size: 3em; margin: 0; text-shadow: 2px 2px 4px rgba(0,0,0,0.5);'>
+                🚀 Enhanced Multilingual Anonymization System
+            </h1>
+            <p style='color: white; font-size: 1.2em; margin: 10px 0 0 0; text-shadow: 1px 1px 2px rgba(0,0,0,0.5);'>
+                🇮🇷 Persian + 🇺🇸 English + 🤖 Advanced NER + Regex = Maximum Accuracy
+            </p>
+        </div>
+        """)
+    # Language and Mode Selection
+    with gr.Row():
+        with gr.Column(scale=1):
+            language_selector = gr.Radio(
+                choices=["فارسی", "English"],
+                value="فارسی",
+                label="Language / زبان",
+                interactive=True
+            )
+        with gr.Column(scale=2, elem_classes="mode-selector"):
+            processing_mode = gr.Radio(
+                choices=[
+                    ("⚡ Regex Only (Fast & Compatible)", "regex_only"),
+                    ("🎯 Hybrid Mode (Recommended)", "hybrid"),
+                    ("🔬 NER Priority (Highest Accuracy)", "ner_priority")
+                ],
+                value="regex_only" if not anonymizer.model_ready else "hybrid",
+                label="🎚️ Processing Mode",
+                info="Choose processing complexity vs accuracy trade-off"
+            )
+    # Model Status Display
+    with gr.Row():
+        model_status_display = gr.HTML(
+            f'<div class="model-status">🤖 Model Status: {anonymizer.model_status}</div>'
+        )
+    # Category Selection
+    with gr.Row():
+        with gr.Column():
+            pattern_categories = gr.CheckboxGroup(
+                choices=anonymizer.get_category_choices('fa'),
+                value=anonymizer.get_category_choices('fa'),
+                label="🎯 انتخاب دسته‌بندی‌های الگوی ناشناس‌سازی:",
+                interactive=True
+            )
+    # Main Workflow
+    with gr.Row(elem_classes="workflow rtl") as workflow_row:
+        with gr.Column():
+            step1_title = gr.HTML('<h2 style="direction: rtl;">🔍 متن ورودی</h2>')
+            input_text = gr.Textbox(
+                lines=15,
+                placeholder="متن اصلی خود را اینجا وارد کنید...\n\n🚀 سیستم پیشرفته چندزبانه\n✅ پشتیبانی کامل فارسی و انگلیسی\n🧠 تشخیص هوشمند نام اشخاص، شرکت‌ها، مکان‌ها\n📱 شناسایی دقیق تلفن، ایمیل، حساب بانکی\n💰 تشخیص مبالغ مالی و درصدها\n🗓️ استخراج تاریخ‌ها و زمان‌ها",
+                label="",
+                rtl=True
+            )
+            process_btn = gr.Button("🚀 پردازش چندزبانه پیشرفته", variant="primary")
+            clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")
+            status = gr.Textbox(
+                label="وضعیت پردازش",
+                lines=4,
+                interactive=False,
+                rtl=True
+            )
+        with gr.Column():
+            step2_title = gr.HTML('<h2 style="direction: rtl;">🎭 متن ناشناس‌شده</h2>')
+            anonymized_output = gr.Textbox(
+                lines=15,
+                placeholder="متن ناشناس‌شده با کدهای محافظتی...",
+                label="",
+                interactive=False,
+                rtl=True
+            )
+        with gr.Column():
+            step3_title = gr.HTML('<h2 style="direction: rtl;">🤖 پاسخ ChatGPT</h2>')
+            gpt_output = gr.Textbox(
+                lines=15,
+                placeholder="پاسخ ChatGPT به متن ناشناس‌شده...",
+                label="",
+                interactive=False,
+                rtl=True
+            )
+        with gr.Column():
+            step4_title = gr.HTML('<h2 style="direction: rtl;">✅ پاسخ نهایی</h2>')
+            final_output = gr.Textbox(
+                lines=15,
+                placeholder="پاسخ نهایی با بازگردانی اطلاعات اصلی...",
+                label="",
+                interactive=False,
+                rtl=True
+            )
+    # Additional Tools
+    with gr.Row():
+        with gr.Column():
+            mapping_btn = gr.Button("📋 نمایش جدول نگاشت چندزبانه")
+            mapping_output = gr.Textbox(
+                lines=15,
+                label="جدول نگاشت اطلاعات",
+                interactive=False,
+                visible=False,
+                rtl=True
+            )
+        with gr.Column():
+            system_status_btn = gr.Button("📊 نمایش وضعیت سیستم چندزبانه")
+            system_status_output = gr.Textbox(
+                lines=20,
+                label="وضعیت سیستم",
+                interactive=False,
+                visible=False,
+                rtl=True
+            )
+    # Event Handlers
+    process_btn.click(
+        fn=process_all_steps_enhanced,
+        inputs=[input_text, language_selector, pattern_categories, processing_mode],
+        outputs=[status, anonymized_output, gpt_output, final_output]
+    )
+    clear_btn.click(
+        fn=clear_all_enhanced,
+        outputs=[input_text, anonymized_output, gpt_output, final_output, status]
+    )
+    mapping_btn.click(
+        fn=get_mapping_table_enhanced,
+        inputs=[language_selector],
+        outputs=[mapping_output]
+    )
+    mapping_btn.click(
+        fn=lambda: gr.update(visible=True),
+        outputs=[mapping_output]
+    )
+    system_status_btn.click(
+        fn=lambda: anonymizer.get_model_status(),
+        outputs=[system_status_output]
+    )
+    system_status_btn.click(
+        fn=lambda: gr.update(visible=True),
+        outputs=[system_status_output]
+    )
+if __name__ == "__main__":
+    logger.info("🚀 Starting Enhanced Multilingual Anonymization System...")
+    logger.info(f"🤖 NER Model Status: {anonymizer.model_status}")
+    logger.info("✅ Ready for high-accuracy Persian + English processing!")
+    app.launch(
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )