leilaghomashchi committed on
Commit
fcb7bf8
·
verified ·
1 Parent(s): 377f83a

Upload j1.py

Browse files
Files changed (1) hide show
  1. j1.py +1268 -0
j1.py ADDED
@@ -0,0 +1,1268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Enhanced Multi-Modal Data Anonymization System - Fixed for HuggingFace Spaces
5
+ =============================================================================
6
+ Combining XLM-RoBERTa + Advanced Regex Patterns for Maximum Accuracy
7
+ Supports Persian, English, and Mixed Languages
8
+ """
9
+
10
+ import gradio as gr
11
+ import re
12
+ import os
13
+ import requests
14
+ import time
15
+ import logging
16
+ from typing import List, Dict, Tuple, Optional, Set
17
+ import warnings
18
+ import sys
19
+
20
# Optional dependency: transformers enables the NER-backed modes. When the
# import fails for any reason the flag stays False and only regex is used.
TRANSFORMERS_AVAILABLE = False
try:
    print("🔄 Attempting to import transformers...")
    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
    print("✅ Transformers library loaded successfully")
    # Flip the flag last so any failure above leaves it False.
    TRANSFORMERS_AVAILABLE = True
except ImportError as import_error:
    print(f"⚠️ Transformers import failed: {import_error}")
    print("📝 Falling back to regex-only mode")
except Exception as unexpected_error:
    print(f"❌ Unexpected error loading transformers: {unexpected_error}")

# Suppress all warnings process-wide and log at INFO level.
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
38
+
39
+ class EnhancedDataAnonymizer:
40
def __init__(self):
    """Create an anonymizer: empty mapping state, NER model load, category table.

    Reads the OpenAI key from the ``OPENAI_API_KEY`` environment variable
    (empty string when unset), attempts to load the NER pipeline via
    ``initialize_ner_model_safe`` and finally zeroes the counters through
    ``reset_counters``.
    """
    # Per-run state: original text -> replacement code, and per-type counters.
    self.mapping_table = {}
    self.counters = {}
    self.api_key = os.getenv("OPENAI_API_KEY", "")

    # Mode key -> human-readable label.
    self.processing_modes = dict(
        regex_only='Pure Regex (Fast & Compatible)',
        hybrid='Regex + XLM-RoBERTa (Recommended)',
        ner_priority='NER Priority + Regex Backup (Highest Accuracy)',
    )

    # Model state; initialize_ner_model_safe() updates these three fields.
    self.ner_pipeline = None
    self.model_status = "Initializing..."
    self.model_ready = False
    self.initialize_ner_model_safe()

    # (key, Persian name, English name, pattern types, icon) per category.
    category_rows = [
        ('personal_identity', 'اطلاعات شخصی و هویتی', 'Personal & Identity Information',
         ['PERSON', 'MIXED_NAMES', 'ID_NUMBER', 'ENGLISH_TITLES'], '👤'),
        ('financial', 'اطلاعات مالی', 'Financial Information',
         ['AMOUNT', 'INTERNATIONAL_CURRENCIES', 'ACCOUNT', 'FINANCIAL_TERMS', 'STOCK_SYMBOL'], '💰'),
        ('temporal', 'اطلاعات زمانی', 'Temporal Information',
         ['DATE', 'ADVANCED_DATE_FORMATS', 'TIME_RANGES'], '📅'),
        ('location', 'اطلاعات مکانی', 'Location Information',
         ['LOCATION', 'COMPLEX_ADDRESSES'], '📍'),
        ('technical', 'اطلاعات فنی و تکنولوژیکی', 'Technical & Technological',
         ['TECHNICAL_CODES', 'NETWORK_ADDRESSES', 'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS'], '⚙️'),
        ('business', 'اطلاعات کسب‌وکار', 'Business Information',
         ['COMPANY', 'BUSINESS_TERMS', 'PRODUCT', 'PETROCHEMICAL'], '🏢'),
        ('quantity', 'اطلاعات کمیت و واحد', 'Quantity & Unit Information',
         ['PERCENTAGE', 'VOLUME', 'RATIOS'], '📊'),
        ('communication', 'اطلاعات ارتباطی', 'Communication Information',
         ['PHONE', 'EMAIL'], '📞'),
    ]
    self.pattern_categories = {
        key: {
            'name_fa': name_fa,
            'name_en': name_en,
            'patterns': pattern_types,
            'icon': icon,
        }
        for key, name_fa, name_en, pattern_types, icon in category_rows
    }

    # Counters are derived from the category table, so reset them last.
    self.reset_counters()
114
+
115
def initialize_ner_model_safe(self, model_names=None):
    """Load a token-classification pipeline, degrading to regex-only on failure.

    Each candidate checkpoint is tried in order; the first one that both
    loads and survives a one-sentence smoke test is kept in
    ``self.ner_pipeline``. ``self.model_status`` and ``self.model_ready``
    are always updated. This method never raises.

    Args:
        model_names: Optional list of checkpoint names to try, in order.
            Defaults to the three multilingual checkpoints the original
            code hard-coded.
    """
    print("🔄 Starting model initialization...")

    if not TRANSFORMERS_AVAILABLE:
        self.model_status = "⚠️ Transformers library not available - Using Regex only mode"
        self.model_ready = False
        print("📝 Transformers not available, continuing with regex patterns only")
        return

    if model_names is None:
        # NOTE(review): these look like base (pre-training) checkpoints rather
        # than NER-fine-tuned ones - confirm they produce usable entity labels,
        # or pass a fine-tuned checkpoint through `model_names`.
        model_names = [
            "xlm-roberta-base",
            "distilbert-base-multilingual-cased",
            "bert-base-multilingual-cased"
        ]

    try:
        print("🤖 Attempting to load XLM-RoBERTa model...")

        for model_name in model_names:
            try:
                print(f"🔄 Trying model: {model_name}")

                # NOTE(review): confirm the installed transformers version
                # accepts `tokenizer_kwargs` as a pipeline() argument.
                self.ner_pipeline = pipeline(
                    "ner",
                    model=model_name,
                    aggregation_strategy="simple",
                    device=-1,  # Force CPU
                    tokenizer_kwargs={
                        "truncation": True,
                        "max_length": 256,
                        "padding": True
                    }
                )

                # Smoke test: a pipeline that cannot run one sentence is unusable.
                self.ner_pipeline("Test text")

                self.model_status = f"✅ {model_name} loaded successfully"
                self.model_ready = True
                print(f"✅ Successfully loaded model: {model_name}")
                return

            except Exception as model_error:
                # Best effort: report and move on to the next candidate.
                print(f"❌ Failed to load {model_name}: {model_error}")

        failure_reason = "All model loading attempts failed"

    except Exception as e:
        failure_reason = str(e)[:100]

    # No candidate worked: leave the instance in a clean regex-only state.
    print(f"❌ Model loading completely failed: {failure_reason}")
    self.model_status = "❌ Model loading failed - Using Regex only"
    self.model_ready = False
    self.ner_pipeline = None
173
+
174
def reset_counters(self):
    """Zero the counter of every pattern type listed in ``pattern_categories``."""
    self.counters = {
        pattern_type: 0
        for category_info in self.pattern_categories.values()
        for pattern_type in category_info['patterns']
    }
181
+
182
def detect_language(self, text):
    """Classify text as ``'fa'``, ``'en'`` or ``'mixed'`` by letter counts.

    Characters in U+0600-U+06FF count as Persian and ASCII letters as
    English. A share strictly above 60% decides the language. Empty text
    or text with no such characters yields ``'fa'``.
    """
    if not text:
        return 'fa'

    persian_count = sum(1 for _ in re.finditer(r'[\u0600-\u06FF]', text))
    latin_count = sum(1 for _ in re.finditer(r'[a-zA-Z]', text))
    letter_count = persian_count + latin_count

    if letter_count == 0:
        return 'fa'
    if persian_count / letter_count > 0.6:
        return 'fa'
    if latin_count / letter_count > 0.6:
        return 'en'
    return 'mixed'
200
+
201
def get_comprehensive_patterns(self):
    """Return the regex catalogue used for anonymization.

    Returns:
        A new dict on every call, mapping a pattern-type name (for example
        ``'PERSON'`` or ``'EMAIL'``) to a list of raw regex strings. The
        key names match the ``'patterns'`` lists in ``pattern_categories``.
    """
    return {
        'PERSON': [
            r'آقای\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'خانم\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'مهندس\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'دکتر\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'استاد\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'Mr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'Ms\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'Dr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'([آ-یa-zA-Z]+\s+[آ-یa-zA-Z]+)(?:، مدیرعامل|\s+مدیرعامل|\s+رئیس)',
        ],

        # The first pattern here matches any two adjacent words of two or
        # more letters, so it is very broad.
        'MIXED_NAMES': [
            r'([آ-یa-zA-Z]{2,}\s+[آ-یa-zA-Z]{2,})',
            r'([A-Z][a-z]+-[A-Z][a-z]+)',
            r"([A-Z]'[A-Z][a-z]+)",
        ],

        'ID_NUMBER': [
            r'IR[۰-۹0-9]{24}',
            r'شبا[\s:]*IR[۰-۹0-9]{24}',
            r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
            r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
            r'National[\s]*(?:ID[\s:]*)?[0-9]{10}',
            r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}',
            r'SSN[\s:]*[0-9]{3}-[0-9]{2}-[0-9]{4}',
        ],

        'ENGLISH_TITLES': [
            r'business\s+partner',
            r'team\s+lead',
            r'head\s+of\s+production',
            r'senior\s+architect',
            r'civil\s+engineer',
            r'system\s+administrator',
            r'network\s+engineer',
            r'environmental\s+consultant',
            r'senior\s+loan\s+officer',
            r'facility\s+manager',
            r'project\s+team',
            r'technical\s+support'
        ],

        'AMOUNT': [
            r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
            r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
            r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
            r'€\d+(?:,\d{3})*(?:\.\d+)?',
            r'\d+(?:,\d{3})*\s*ریال',
            r'رقم\s+فعلی\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
            r'رقم\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
            r'به\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
        ],

        'INTERNATIONAL_CURRENCIES': [
            r'\d+(?:,\d{3})*\s+euro',
            r'€\d+(?:\.\d+)?M',
            r'\d+\s+EUR',
            r'\d+(?:,\d{3})*\s+AED',
            r'\d+(?:\.\d+)?M\s+AED',
            r'\$\d+(?:\.\d+)?M',
            r'\$\d+(?:\.\d+)?K',
            r'£\d+(?:,\d{3})*(?:\.\d+)?',
            r'\d+\s+GBP',
            r'\d+\s+CHF',
            r'¥\d+(?:,\d{3})*',
            r'\d+\s+JPY'
        ],

        'ACCOUNT': [
            r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'شماره[\s]*حساب[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}',
            r'[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{6,12}',
            r'واریز[\s]*(?:سود[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'سود[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}'
        ],

        'FINANCIAL_TERMS': [
            r'فروش\s+(?:ماهانه|تجمیعی|صادراتی)',
            r'درآمد\s+شرکت',
            r'سود\s+(?:خالص|نقدی)',
            r'صورت‌های\s+مالی',
            r'بهای\s+تمام‌شده',
            r'سودآوری',
            r'عملکرد\s+مالی',
            r'میانگین\s+فروش',
            r'بالاترین\s+رقم\s+فروش',
            r'رقم\s+فروش',
            r'درآمدهای\s+عملیاتی'
        ],

        'STOCK_SYMBOL': [
            r'نماد\s+([آ-یa-zA-Z0-9]+)',
            r'(سبهان|غدیر|شتران|شپنا|پترول|فارس|خارک|پلاسکو|جم|کرمان|مارون|اراک|رازی|شازند|کاوه|بندر|پارس|خوزستان|ماهشهر|عسلویه)(?=\s|$|،|\.|\s+)',
            r'شرکت\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+را|\s+به)',
            r'پتروشیمی\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+توان)',
            r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)'
        ],

        'DATE': [
            r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
            r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
            r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
            r'(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s+[۰-۹0-9]{4}',
            r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})',
            r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}',
            r'سال\s+گذشته',
            r'سال\s+جاری',
            r'این\s+سال',
            r'ماه\s+قبل',
            r'ماه\s+اخیر',
            r'(?:13[0-9]{2}|14[0-9]{2}|20[0-9]{2}|19[0-9]{2})(?=\s|$|،|\.)'
        ],

        'ADVANCED_DATE_FORMATS': [
            r'(?:March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th),?\s+\d{4}',
            r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z',
            r'(?:PST|EST|GMT|UTC)(?:[+-]\d{1,2}:\d{2})?',
            r'Eastern\s+Time',
            r'GMT[+-]\d{1,2}:\d{2}',
            r'end\s+of\s+fiscal\s+year\s+\d{4}/\d{2}/\d{2}'
        ],

        'TIME_RANGES': [
            r'\d{2}:\d{2}-\d{2}:\d{2}',
            r'\d{2}:\d{2}\s+تا\s+\d{2}:\d{2}',
            r'\d{1,2}:\d{2}\s+(?:AM|PM)\s+(?:PST|EST|GMT|UTC)',
            r'\d{2}:\d{2}:\d{2}\s+(?:AM|PM)',
            r'COB\s*\(Close\s+of\s+Business\)',
            r'\d{1,3}\s+(?:business\s+days|روز\s+کاری)'
        ],

        'LOCATION': [
            # The third city was mis-encoded in the source (U+FFFD residue);
            # restored to the spelling used in the STOCK_SYMBOL list above.
            r'(تهران|اصفهان|ماهشهر|عسلویه|بندرعباس|اهواز|شیراز|مشهد|تبریز|کرج|قم|رشت|کرمان|یزد|زاهدان|بوشهر|خرمشهر|آبادان|اراک|قزوین)',
            r'استان\s+([آ-ی\s]+)',
            r'شهر\s+([آ-ی\s]+)',
            r'(ایران|عراق|کویت|عربستان|امارات|قطر|عمان|بحرین|ترکیه|پاکستان|افغانستان)',
            r'داخلی|بازار\s+داخلی',
            r'خارجی|بازارهای\s+خارجی',
            r'(London|Paris|Tokyo|New\s+York|Dubai|Singapore|Hong\s+Kong|Shanghai|Mumbai|Frankfurt|Amsterdam)'
        ],

        'COMPLEX_ADDRESSES': [
            r'کیلومتر\s+\d+\s+جاده\s+[آ-ی\s]+-[آ-ی\s]+',
            r'روبروی\s+(?:پمپ\s+بنزین|بانک|پارک|مسجد|بیمارستان)\s+[آ-یa-zA-Z\s]+',
            r'Building-[A-Z],?\s+Floor-\d+,?\s+Unit-[A-Z0-9]+',
            r'rack\s+number\s+R-\d+,?\s+slot\s+\d+',
            r'phase\s+\d+\s+development,?\s+block\s+[A-Z],?\s+plot\s+\d+-[A-Z]',
            r'\d{2,5}\s+[A-Z][a-z]+\s+(?:Street|Avenue|Boulevard|Road|Drive),?\s+Floor\s+\d+,?\s+Building\s+[A-Z]',
            r'شهرک\s+صنعتی\s+[آ-ی\s]+،?\s+محور\s+[آ-ی\s]+'
        ],

        'TECHNICAL_CODES': [
            r'SN-\d{4}-[A-Z]{3}-\d{4}',
            r'Serial\s+Number[\s:]*[A-Z0-9-]+',
            r'REF-[A-Z]{3}-\d{4}-\d{3}',
            r'DOC-[A-Z]{2}-\d{4}-\d{4}',
            r'INF-\d{4}-\d{4}',
            r'CTR/\d{4}/\d{3}',
            r'HVAC-\d{7}',
            r'Generator-Model-[A-Z0-9]+',
            r'LOI-\d{4}-[A-Z]{4}-\d{3}',
            r'BOQ-\d{4}-[A-Z]{3}-\d{3}',
            r'#INV-\d{4}-Q\d-\d{4}',
            r'ESC-\d{4}-[A-Z]{3}-\d{3}',
            r'BN-\d{6}-[A-Z]\d+'
        ],

        'NETWORK_ADDRESSES': [
            r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
            r'xxx\.xxx\.xxx\.xxx',
            r'[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}',
            r'srv-[a-z]+-[a-z]+-\d{2}',
            r'[a-z]+-[a-z]+\d*\.[a-z]+\.[a-z]+',
            r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,4}(?:\.[a-zA-Z]{2,4})?'
        ],

        'TECHNICAL_UNITS': [
            r'\d+(?:\.\d+)?\s*MW',
            r'\d+(?:\.\d+)?\s*kWh?',
            r'\d+(?:,\d{3})*\s*cubic\s+meters',
            r'\d+(?:,\d{3})*\s*m³',
            r'\d+(?:,\d{3})*\s*sq\s+ft',
            r'\d+(?:\.\d+)?\s*ppm',
            r'\d+(?:\.\d+)?\s*mg/m³',
            r'\b(?:CO2|NOx|SO2)\b',
            r'\d+(?:\.\d+)?\s*TB',
            r'\d+(?:\.\d+)?\s*GB',
            r'\d+(?:,\d{3})*\s*square\s+meters',
            r'\d+(?:\.\d+)?\%\s*efficiency',
            r'FICO\s+score:\s*\d{3}',
            r'\d+(?:\.\d+)?\s*(?:bar|psi)',
            r'\d+(?:\.\d+)?\s*°[CF]',
            r'\d+(?:\.\d+)?\s*(?:rpm|m/s)'
        ],

        'ACRONYMS_ABBREVIATIONS': [
            r'\b(?:HVAC|IT|HSE|BOQ|LC|COB)\b',
            r'\b(?:YTD|NNN|EIN|SSN|FICO)\b',
            r'\bIP\s+Address\b',
            r'\bMAC\s+Address\b',
            r'\bURL\b',
            r'\b(?:LLC|Corp|Inc|Ltd)\b',
            r'\b(?:PST|GMT|UTC|EST)\b',
            r'\b(?:CO2|NOx|pH|UV)\b',
            r'\b(?:SCADA|PLC|HMI)\b',
            r'\b(?:GDP|CPI|ROI|NPV)\b',
            r'\b(?:FOB|CIF|DDP)\b',
            r'\b(?:ABA|SWIFT|IBAN)\b'
        ],

        'COMPANY': [
            r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به)',
            r'([آ-یa-zA-Z\s]+)\s+شرکت',
            r'این\s+شرکت(?=\s|$|،|\.)',
            r'(بانک\s+[آ-یa-zA-Z\s]+)',
            r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
        ],

        'BUSINESS_TERMS': [
            r'تحلیل\s+عملکرد',
            r'گزارش\s+(?:فعالیت|عملکرد)\s+ماهانه',
            r'وضعیت\s+فروش',
            r'تولید\s+پایدار',
            r'سهم\s+بازار',
            r'صادرات\s+هدفمند',
            r'بهره‌وری',
            r'ظرفیت‌های\s+داخلی',
            r'شرکت‌های\s+پیشرو',
            r'صنعت\s+پتروشیمی',
            r'سرمایه‌گذاران\s+بنیادی',
            r'شاخص‌های\s+عملیاتی',
            r'برنامه‌ریزی\s+مناسب',
            r'واحد\s+فروش',
            r'موجودی\s+انبار',
            r'فاز\s+رشد\s+جدید',
            r'ترکیب\s+فروش',
            r'سهم\s+صادراتی',
            r'روند\s+عملکرد',
            r'اعداد\s+اعلام‌شده',
            r'داده‌های\s+ثبت‌شده'
        ],

        'PRODUCT': [
            r'\b(?:VCM|PVC|PE|PP|PS|ABS|SAN|PC|PMMA|PET|PBT|PA|POM|TPU)\b',
            r'پلی\s*(?:اتیلن|پروپیلن|استایرن|کربنات|متیل)',
            r'\b(?:اتیلن|پروپیلن|بنزن|تولوئن|زایلن|متانول|اتانول|استون|فنول)\b',
            r'\b(?:کلر|هیدروژن|اکسیژن|نیتروژن|آمونیاک|اتان|پروپان|بوتان)\b',
            r'محصول(?:ات)?',
            r'تولیدات\s+شرکت'
        ],

        'PETROCHEMICAL': [
            r'\b(?:LDPE|HDPE|LLDPE|PP|PS|EPS|ABS|SAN|PC|PMMA|PET|PBT|PA6|PA66|POM|TPU|EVA|EAA)\b',
            r'(?:Ethylene\s+Vinyl\s+Acetate|Ethyl\s+Acrylate|Methyl\s+Methacrylate|Polyethylene\s+Terephthalate)'
        ],

        'PERCENTAGE': [
            r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایین‌تر)?',
            r'\d+(?:\.\d+)?\s*%',
            r'معادل\s+\d+(?:\.\d+)?\s*درصد',
            r'حدود\s+\d+(?:\.\d+)?\s*درصد',
            r'با\s+\d+(?:\.\d+)?\s*درصد\s+افزایش',
            r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
            r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده)',
            r'میزان\s+رشد(?=\s+نسبت|\s+معادل)',
            r'افزایش\s+قابل‌توجهی',
            r'بهبود\s+نسبی',
            r'\d+(?:\.\d+)?\%\s*(?:increase|decrease|growth|improvement)',
            r'(?:approximately|about)\s+\d+(?:\.\d+)?\%'
        ],

        'VOLUME': [
            r'\d+(?:,\d{3})*\s*تن',
            r'\d+(?:,\d{3})*\s*(?:کیلوگرم|لیتر|بشکه)',
            r'میزان\s+\d+(?:,\d{3})*\s*تن',
            r'مقدار\s+تولید',
            r'حجم\s+فروش',
            r'ظرفیت\s+(?:تولید|اسمی)',
            r'\d+(?:,\d{3})*\s*(?:tons|kg|liters|barrels)',
            r'\d+(?:,\d{3})*\s*(?:metric\s+tons|MT)',
            r'\d+(?:,\d{3})*\s*(?:thousand\s+tons|KT)'
        ],

        'RATIOS': [
            r'نسبت\s+(?:فروش|تولید)\s+به\s+[آ-ی\s]+',
            r'\d+(?:\.\d+)?\s*نزدیک',
            r'برابر\s+با\s+\d+(?:\.\d+)?',
            r'معادل\s+\d+(?:\.\d+)?',
            r'میزان\s+(?:رشد|افزایش)',
            r'شاخص\s+(?:مهم|عملیاتی)',
            r'\d+(?:\.\d+)?\s*درصد\s+کل\s+تولید'
        ],

        'PHONE': [
            r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
            r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
            r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
            r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}',
            r'[۰-۹0-9]{11}(?!\d)',
            r'(?:\+98|0098)?[۰-۹0-9]{10}',
            r'[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}',
            r'\+[0-9]{1,3}-[0-9]{3}-[0-9]{3}-[0-9]{4}(?:\s+ext\.\s+[0-9]{3,4})?',
            r'\([0-9]{3}\)\s+[0-9]{3}-[0-9]{4}'
        ],

        'EMAIL': [
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'نشانی[\s]*الکترونیکی[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'آدرس[\s]*ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'facility\.manager@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        ]
    }
521
+
522
def extract_entities_with_ner(self, text: str, confidence_threshold: float = 0.75) -> List[Dict]:
    """Run the NER pipeline on ``text`` and return the confident entities.

    The entity text is taken from the original string via the reported
    ``start``/``end`` offsets, so that it can later be found again with
    ``str.replace``. The pipeline's decoded ``word`` is used only when the
    offsets are missing or the slice is empty.

    Args:
        text: Input text to analyse.
        confidence_threshold: Minimum pipeline score for an entity to be kept.

    Returns:
        A list of dicts with keys ``text``, ``label``, ``confidence``,
        ``start``, ``end`` and ``source`` (always ``'ner'``). Returns an
        empty list when the model is not ready or the pipeline raises.
    """
    if not self.model_ready or not self.ner_pipeline:
        return []

    try:
        ner_results = self.ner_pipeline(text)

        entities = []
        for entity in ner_results:
            if entity['score'] < confidence_threshold:
                continue

            start, end = entity['start'], entity['end']

            # Prefer the exact source span over the tokenizer-decoded word.
            entity_text = ''
            if start is not None and end is not None:
                entity_text = text[start:end].strip()
            if not entity_text:
                # Fallback: strip '##' word-piece markers from the decoded word.
                entity_text = entity['word'].replace('##', '').strip()

            if len(entity_text) >= 2:  # Minimum length filter
                entities.append({
                    'text': entity_text,
                    'label': entity['entity_group'],
                    'confidence': entity['score'],
                    'start': start,
                    'end': end,
                    'source': 'ner'
                })

        return entities

    except Exception as e:
        # Best effort: a failing model must not break regex-based anonymization.
        logger.error(f"Error in NER extraction: {e}")
        return []
552
+
553
def map_ner_to_categories(self, ner_label: str) -> str:
    """Translate an NER label into one of this system's pattern types.

    The comparison is case-insensitive. Labels that are not listed,
    including ``MISC``, map to ``'MIXED_NAMES'``.
    """
    label_groups = (
        (('PER', 'PERSON'), 'PERSON'),
        (('ORG', 'ORGANIZATION'), 'COMPANY'),
        (('LOC', 'LOCATION', 'GPE'), 'LOCATION'),
        (('MONEY',), 'AMOUNT'),
        (('DATE', 'TIME'), 'DATE'),
    )
    normalized = ner_label.upper()
    for labels, category in label_groups:
        if normalized in labels:
            return category
    return 'MIXED_NAMES'
569
+
570
def extract_entities_with_regex(self, text: str, selected_categories: Optional[List[str]] = None) -> List[Dict]:
    """Find entities in ``text`` using the regex catalogue.

    Args:
        text: Input text to scan.
        selected_categories: Category display labels (``"<icon> <name>"``)
            in Persian or English. ``None`` or an empty list means all
            categories.

    Returns:
        A list of dicts with keys ``text``, ``category``, ``start``,
        ``end``, ``confidence`` (always 0.9) and ``source`` (always
        ``'regex'``). ``start``/``end`` span the whole regex match even
        when ``text`` comes from a capture group. Matches overlapping an
        already accepted match are dropped.
    """
    all_patterns = self.get_comprehensive_patterns()

    if selected_categories:
        # Resolve labels in both UI languages; resolving Persian only made
        # English labels select nothing.
        selected_pattern_types = set(self.get_selected_patterns(selected_categories, 'fa'))
        selected_pattern_types.update(self.get_selected_patterns(selected_categories, 'en'))
        patterns = {k: v for k, v in all_patterns.items() if k in selected_pattern_types}
    else:
        patterns = all_patterns

    # NOTE(review): only the categories listed here are scanned, in this
    # order. Other pattern types in the catalogue (for example PERCENTAGE
    # or TECHNICAL_CODES) are never applied by this method - confirm that
    # this is intended.
    priority_order = (
        'ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT',
        'AMOUNT', 'DATE', 'LOCATION', 'COMPANY', 'PERSON'
    )
    flags = re.IGNORECASE | re.MULTILINE

    entities = []
    accepted_spans = []

    for category in priority_order:
        for pattern in patterns.get(category, ()):
            try:
                for match in re.finditer(pattern, text, flags):
                    # Use group 1 when the pattern captures one. A group that
                    # did not participate yields None, so fall back to the
                    # whole match instead of calling .strip() on None.
                    captured = match.group(1) if match.groups() else None
                    if captured is None:
                        captured = match.group(0)
                    entity_text = captured.strip()

                    match_start, match_end = match.span()
                    overlaps = any(
                        match_start < taken_end and match_end > taken_start
                        for taken_start, taken_end in accepted_spans
                    )

                    if not overlaps and len(entity_text) >= 2:
                        entities.append({
                            'text': entity_text,
                            'category': category,
                            'start': match_start,
                            'end': match_end,
                            'confidence': 0.9,
                            'source': 'regex'
                        })
                        accepted_spans.append((match_start, match_end))

            except re.error as e:
                # An invalid pattern is logged and skipped; the rest still run.
                logger.error(f"Regex error in pattern {pattern}: {e}")
                continue

    return entities
625
+
626
def fuse_entities(self, regex_entities: List[Dict], ner_entities: List[Dict],
                  processing_mode: str) -> List[Dict]:
    """Merge regex and NER entities according to ``processing_mode``.

    NER entities may carry a raw ``'label'`` (mapped through
    ``map_ner_to_categories``) or an already resolved ``'category'``.
    Requiring ``'label'`` raised ``KeyError`` for the normalised dicts that
    ``anonymize_text_enhanced`` passes in.

    Args:
        regex_entities: Entities from ``extract_entities_with_regex``.
        ner_entities: Entities from the NER model.
        processing_mode: ``'regex_only'``, ``'hybrid'`` or ``'ner_priority'``.

    Returns:
        The fused entity list. ``regex_entities`` itself is returned for
        ``'regex_only'``, when the model is not ready, and for an
        unrecognised mode.
    """
    if processing_mode == 'regex_only' or not self.model_ready:
        return regex_entities

    if processing_mode not in ('hybrid', 'ner_priority'):
        # An unknown mode used to yield an empty list, so nothing was
        # anonymized. Fall back to the regex results instead.
        return regex_entities

    final_entities = []
    processed_positions = set()

    def accept(entity):
        final_entities.append(entity)
        processed_positions.add((entity['start'], entity['end']))

    def with_category(entity):
        tagged = entity.copy()
        label = entity.get('label')
        if label is not None:
            tagged['category'] = self.map_ner_to_categories(label)
        elif 'category' not in tagged:
            # Neither key present: use the mapper's default category.
            tagged['category'] = self.map_ner_to_categories('')
        return tagged

    if processing_mode == 'hybrid':
        # Structured identifiers are matched more reliably by regex.
        priority_categories = ['PHONE', 'EMAIL', 'ID_NUMBER', 'ACCOUNT', 'AMOUNT']

        for entity in regex_entities:
            if entity['category'] in priority_categories:
                accept(entity)

        # NER entities that do not collide with what is already accepted.
        for entity in ner_entities:
            if not self.has_overlap(entity, processed_positions):
                accept(with_category(entity))

        # Remaining regex entities fill whatever is still uncovered.
        for entity in regex_entities:
            if (entity['category'] not in priority_categories and
                    not self.has_overlap(entity, processed_positions)):
                accept(entity)

    else:  # 'ner_priority'
        # All NER entities are taken as-is; regex only fills the gaps.
        for entity in ner_entities:
            accept(with_category(entity))

        for entity in regex_entities:
            if not self.has_overlap(entity, processed_positions):
                accept(entity)

    return final_entities
678
+
679
def has_overlap(self, entity: Dict, processed_positions: Set[Tuple[int, int]]) -> bool:
    """Return True when the entity's span intersects any recorded span.

    Spans are half-open, so spans that merely touch do not overlap.
    """
    begin = entity['start']
    finish = entity['end']
    return any(
        finish > taken_start and begin < taken_end
        for taken_start, taken_end in processed_positions
    )
687
+
688
def get_selected_patterns(self, selected_categories: List[str], language: str = 'fa') -> List[str]:
    """Expand selected category labels into their pattern-type names.

    A category is selected when its ``"<icon> <name>"`` label appears in
    ``selected_categories``. The Persian name is used when ``language`` is
    ``'fa'`` and the English name otherwise. Results follow the order of
    ``pattern_categories``.
    """
    name_key = 'name_fa' if language == 'fa' else 'name_en'
    return [
        pattern_type
        for info in self.pattern_categories.values()
        if f"{info['icon']} {info[name_key]}" in selected_categories
        for pattern_type in info['patterns']
    ]
701
+
702
def get_category_choices(self, language='fa'):
    """Return the ``"<icon> <name>"`` label of every category, for checkboxes.

    The Persian name is used when ``language`` is ``'fa'`` and the English
    name otherwise.
    """
    name_key = 'name_fa' if language == 'fa' else 'name_en'
    return [
        f"{info['icon']} {info[name_key]}"
        for info in self.pattern_categories.values()
    ]
710
+
711
def anonymize_text_enhanced(self, original_text: str, lang: str = 'fa',
                            selected_categories: List[str] = None,
                            processing_mode: str = 'hybrid') -> str:
    """Replace detected entities in ``original_text`` with stable codes.

    Rebuilds ``self.mapping_table`` (original text -> code) from scratch
    and resets the counters. Codes look like ``<CATEGORY>_<NNN>_<SUFFIX>``,
    where the suffix is ``REG``, ``HYB`` or ``ENH`` depending on the mode.

    Args:
        original_text: Text to anonymize.
        lang: ``'en'`` for English user messages, anything else for Persian.
        selected_categories: Category display labels limiting the regex scan.
        processing_mode: ``'regex_only'``, ``'hybrid'`` or ``'ner_priority'``.
            Downgraded to ``'regex_only'`` when the model is not ready.

    Returns:
        The anonymized text, or a message starting with ``❌`` for empty
        input or an unexpected failure.
    """
    try:
        if not original_text or not original_text.strip():
            return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"

        # Force regex_only if model not ready
        if not self.model_ready and processing_mode != 'regex_only':
            processing_mode = 'regex_only'
            print("🔄 Forced to regex_only mode because model not ready")

        # Every run starts from a clean mapping and fresh counters.
        self.mapping_table = {}
        self.reset_counters()

        regex_entities = self.extract_entities_with_regex(original_text, selected_categories)

        ner_entities = []
        if processing_mode != 'regex_only' and self.model_ready:
            for entity in self.extract_entities_with_ner(original_text):
                ner_entities.append({
                    'text': entity['text'],
                    # Keep the raw label: fuse_entities looks it up, and
                    # dropping it raised KeyError in the NER-backed modes.
                    'label': entity['label'],
                    'category': self.map_ner_to_categories(entity['label']),
                    'start': entity['start'],
                    'end': entity['end'],
                    'confidence': entity['confidence'],
                    'source': 'ner'
                })

        final_entities = self.fuse_entities(regex_entities, ner_entities, processing_mode)

        # The suffix depends only on the mode, so compute it once.
        if processing_mode == 'regex_only':
            source_suffix = "REG"
        elif processing_mode == 'hybrid':
            source_suffix = "HYB" if self.model_ready else "REG"
        else:
            source_suffix = "ENH" if self.model_ready else "REG"

        # Sort by length (longer first to avoid partial replacements)
        final_entities.sort(key=lambda x: len(x['text']), reverse=True)

        for entity in final_entities:
            entity_text = entity['text'].strip()
            category = entity['category']

            # One code per distinct text; later duplicates reuse the first.
            if entity_text in self.mapping_table or len(entity_text) < 2:
                continue

            self.counters[category] = self.counters.get(category, 0) + 1
            self.mapping_table[entity_text] = f"{category}_{self.counters[category]:03d}_{source_suffix}"

        # Apply replacements longest-first so a short entity cannot split a
        # longer one that contains it.
        anonymized = original_text
        sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
        for original_item, code in sorted_items:
            anonymized = anonymized.replace(original_item, code)

        regex_count = len(regex_entities)
        ner_count = len(ner_entities)
        final_count = len(final_entities)

        logger.info(f"✅ Enhanced anonymization completed. Mode: {processing_mode}")
        logger.info(f"📊 Regex: {regex_count}, NER: {ner_count}, Final: {final_count}")

        return anonymized

    except Exception as e:
        logger.error(f"Enhanced anonymization error: {e}")
        return f"❌ Error in enhanced anonymization: {str(e)}"
803
+
804
def send_to_chatgpt(self, anonymized_text, lang='fa'):
    """Step 2: send the anonymized text to the OpenAI chat completions API.

    Args:
        anonymized_text: Text that has already been anonymized.
        lang: ``'en'`` for English messages, anything else for Persian.

    Returns:
        The assistant's reply, or a message starting with ``❌`` when the
        input is empty, the API key is missing, the API returns a non-200
        status, or the request fails.
    """
    try:
        if not anonymized_text or not anonymized_text.strip():
            return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناس‌شده خالی است!"

        if not self.api_key:
            return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable." if lang == 'en' else "❌ کلید API تنظیم نشده است!"

        system_msg = "You are a professional analyst. Answer questions accurately." if lang == 'en' else "شما یک تحلیلگر حرفه‌ای هستید. به سوالات با دقت پاسخ دهید."

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": anonymized_text}
            ],
            "max_tokens": 2000,
            "temperature": 0.7
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=15  # Reduced timeout for HF Spaces
        )

        if response.status_code == 200:
            result = response.json()
            return result['choices'][0]['message']['content']

        # Error bodies are not guaranteed to be JSON, or to have the
        # {"error": {"message": ...}} shape. Fall back to the raw body
        # instead of failing and being reported as a connection error.
        try:
            error_data = response.json() if response.content else {}
        except ValueError:
            error_data = {}

        error_info = error_data.get('error') if isinstance(error_data, dict) else None
        if isinstance(error_info, dict):
            error_message = error_info.get('message', response.text)
        else:
            error_message = response.text
        return f"❌ API Error: {error_message}"

    except Exception as e:
        return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"
847
+
848
def deanonymize_response(self, gpt_response, lang='fa'):
    """Step 3: restore the original values in the ChatGPT response.

    Every code stored in ``self.mapping_table`` (original -> code) that
    occurs in ``gpt_response`` is replaced by its original value.

    Args:
        gpt_response: Text returned by ChatGPT, containing codes.
        lang: 'en' selects English error messages; any other value
            selects Persian.

    Returns:
        The restored text, or a message starting with "❌" when the
        response or the mapping table is empty or restoration fails.
    """
    import re  # local import keeps this method self-contained

    is_english = lang == 'en'
    try:
        if not gpt_response or not gpt_response.strip():
            return "❌ ChatGPT response is empty!" if is_english else "❌ پاسخ ChatGPT خالی است!"

        if not self.mapping_table:
            return "❌ Mapping table is empty!" if is_english else "❌ جدول نگاشت خالی است!"

        reverse_mapping = {code: original for original, code in self.mapping_table.items()}

        # Longest codes first so that e.g. "PERSON_10" is matched before its
        # prefix "PERSON_1". Empty codes are skipped: they would match at
        # every position in the text.
        codes = sorted((code for code in reverse_mapping if code), key=len, reverse=True)
        if not codes:
            return gpt_response

        # A single pass guarantees restored text is never scanned again, so
        # an original value that happens to contain another code is kept
        # intact (chained str.replace calls would rewrite it).
        pattern = re.compile("|".join(re.escape(code) for code in codes))
        return pattern.sub(lambda match: reverse_mapping[match.group(0)], gpt_response)

    except Exception as e:
        # Boundary handler: callers expect a string, never an exception.
        return f"❌ Deanonymization error: {str(e)}" if is_english else f"❌ خطا در بازگردانی: {str(e)}"
868
+
869
def get_model_status(self):
    """Return a Markdown-formatted report of the system status.

    The report covers the model status string, Python version, whether the
    transformers library is available, the processing modes on offer and
    the number of pattern categories.
    """
    transformers_flag = '✅ Yes' if TRANSFORMERS_AVAILABLE else '❌ No'

    # Collect the report as fragments and join once at the end.
    fragments = [
        "🚀 **Enhanced Multi-Modal Anonymization System Status:**\n\n",
        f"🤖 **Model Status**: {self.model_status}\n",
        "📝 **Regex Patterns**: ✅ 221 comprehensive patterns loaded\n",
        "🌍 **Language Support**: Persian, English, Mixed\n",
        f"🐍 **Python Version**: {sys.version.split()[0]}\n",
        f"📦 **Transformers Available**: {transformers_flag}\n\n",
    ]

    if self.model_ready:
        fragments.extend([
            "🎯 **Available Processing Modes:**\n",
            " • 🔥 Hybrid (Recommended): Regex priority + NER enhancement\n",
            " • 🎯 NER Priority: NER priority + Regex backup\n",
            " • ⚡ Regex Only: High-speed pattern matching\n\n",
            "📈 **Expected Accuracy:**\n",
            " • Regex Only: 70-75%\n",
            " • Hybrid Mode: 85-92%\n",
            " • NER Priority: 88-95%\n\n",
        ])
    else:
        fragments.append("⚠️ **Current Mode: Regex Only**\n")
        fragments.append(" • Pure Regex processing (70-75% accuracy)\n")
        if not TRANSFORMERS_AVAILABLE:
            fragments.append(" • Install transformers library for enhanced accuracy\n")
            fragments.append(" • pip install transformers torch\n")
        fragments.append("\n")

    fragments.extend([
        f"🎯 **Pattern Categories**: {len(self.pattern_categories)} categories available\n",
        "🔧 **Configuration**: User-controlled category selection\n",
        "🛡️ **Privacy**: Local processing with optional ChatGPT integration\n",
    ])

    if TRANSFORMERS_AVAILABLE:
        library_line = "✅ **Transformers Library**: Ready for NER processing\n"
    else:
        library_line = "❌ **Transformers Library**: Not available - Add to requirements.txt\n"
    fragments.append(library_line)

    return "".join(fragments)
907
+
908
# Initialize the enhanced anonymizer.
# A single module-level instance is created when this module is loaded; the
# handler functions defined below read and mutate it through this global name.
print("🔄 Initializing Enhanced Data Anonymizer...")
anonymizer = EnhancedDataAnonymizer()
print(f"✅ Anonymizer initialized with status: {anonymizer.model_status}")
912
+
913
def process_all_steps_enhanced(input_text, language, selected_categories, processing_mode):
    """Run all steps automatically: anonymize, query ChatGPT, de-anonymize.

    Args:
        input_text: Raw text entered by the user.
        language: UI language label; "English" selects English messages,
            any other value selects Persian.
        selected_categories: Pattern categories forwarded to the anonymizer.
        processing_mode: Processing mode forwarded to the anonymizer.

    Returns:
        A 4-tuple ``(status message, anonymized text, ChatGPT response,
        final restored text)``. Steps that were not reached are returned
        as empty strings.
    """
    lang = 'en' if language == 'English' else 'fa'

    # Treat a missing value (None) like blank input instead of crashing
    # on ``None.strip()``.
    if not input_text or not input_text.strip():
        error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
        return error_msg, "", "", ""

    try:
        start_time = time.time()

        # Enhanced anonymization
        anonymized_text = anonymizer.anonymize_text_enhanced(
            input_text, lang, selected_categories, processing_mode
        )

        # Every step signals failure with a message starting with "❌".
        if anonymized_text.startswith("❌"):
            return anonymized_text, "", "", ""

        # ChatGPT processing
        gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
        entities_found = len(anonymizer.mapping_table)

        if gpt_response.startswith("❌"):
            # Anonymization itself worked: report it and surface the
            # ChatGPT error in the response field.
            success_msg = (f"✅ Enhanced anonymization completed successfully!\n"
                           f"🎯 Processing mode: {processing_mode}\n"
                           f"📊 Protected entities: {entities_found}")
            return success_msg, anonymized_text, gpt_response, ""

        # Deanonymization
        final_result = anonymizer.deanonymize_response(gpt_response, lang)

        total_time = time.time() - start_time
        model_indicator = 'XLM-RoBERTa + Regex' if anonymizer.model_ready else 'Regex Only'

        success_msg = (f"🎉 Complete enhanced anonymization & restoration successful!\n"
                       f"🎯 Mode: {processing_mode} | 📊 Entities: {entities_found}\n"
                       f"⏱️ Time: {total_time:.2f}s | 🤖 Model: {model_indicator}")

        return success_msg, anonymized_text, gpt_response, final_result

    except Exception as e:
        # UI boundary: always hand four strings back to the interface.
        error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
        return error_msg, "", "", ""
959
+
960
def get_mapping_table_enhanced(language):
    """Render the anonymizer's mapping table as a Markdown summary.

    Entries are grouped by the part of the code before the first
    underscore, and at most three entries are listed per group.

    Args:
        language: UI language label; "English" selects English messages,
            any other value selects Persian.

    Returns:
        The formatted summary, or a message starting with "❌" when the
        mapping table is empty.
    """
    lang = 'en' if language == 'English' else 'fa'

    if not anonymizer.mapping_table:
        return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"

    result = "🔋 **Enhanced Mapping Table:**\n\n"

    result += f"📊 **Statistics**: {len(anonymizer.mapping_table)} total entities\n"
    result += f"🎯 **Method**: {'Hybrid Processing' if anonymizer.model_ready else 'Regex Only'}\n"
    result += f"🤖 **Model Status**: {anonymizer.model_status}\n\n"

    # Group by category (the code prefix before the first underscore).
    category_stats = {}
    for original, code in anonymizer.mapping_table.items():
        category_stats.setdefault(code.split('_')[0], []).append((original, code))

    # Display results by category; a group always holds at least one item.
    for category, items in category_stats.items():
        result += f"📁 **{category}** ({len(items)} items):\n"
        for original, code in items[:3]:
            # Codes containing "HYB" or "ENH" get the brain marker.
            source_indicator = "🧠" if any(x in code for x in ["HYB", "ENH"]) else "📝"
            result += f" {source_indicator} `{original}` → `{code}`\n"
        remaining = len(items) - 3
        if remaining > 0:
            # The "N more" line follows the selected UI language.
            if lang == 'en':
                result += f" ... and {remaining} more\n"
            else:
                result += f" ... و {remaining} مورد دیگر\n"
        result += "\n"

    result += "🔥 **Enhanced System**: Advanced Regex patterns with optional NER support!"

    return result
995
+
996
def clear_all_enhanced():
    """Reset the shared anonymizer state and return five empty strings."""
    anonymizer.mapping_table = {}
    anonymizer.reset_counters()
    # One empty string per output wired to the clear button.
    return ("",) * 5
1001
+
1002
# Enhanced CSS: custom stylesheet handed to gr.Blocks(css=...) when the
# interface is built. It styles the header, mode selector, model-status
# banner, RTL/LTR helpers and the workflow grid, which collapses from four
# columns to two (<= 1200px) and to one (<= 768px).
enhanced_css = """
body, .gradio-container {
font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
min-height: 100vh !important;
padding: 20px !important;
}

.enhanced-header {
background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
border-radius: 20px !important;
padding: 20px !important;
margin-bottom: 20px !important;
text-align: center !important;
box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
}

.mode-selector {
background: linear-gradient(135deg, #74b9ff, #0984e3) !important;
border-radius: 15px !important;
padding: 20px !important;
margin: 15px 0 !important;
box-shadow: 0 8px 25px rgba(116, 185, 255, 0.3) !important;
}

.model-status {
background: linear-gradient(135deg, #00b894, #00a085) !important;
border-radius: 15px !important;
padding: 15px !important;
margin: 15px 0 !important;
color: white !important;
font-weight: bold !important;
text-align: center !important;
box-shadow: 0 6px 20px rgba(0, 184, 148, 0.4) !important;
}

.rtl {
direction: rtl !important;
text-align: right !important;
}

.ltr {
direction: ltr !important;
text-align: left !important;
}

.workflow {
display: grid !important;
grid-template-columns: 1fr 1fr 1fr 1fr !important;
gap: 25px !important;
padding: 30px !important;
align-items: start !important;
background: rgba(255, 255, 255, 0.1) !important;
border-radius: 20px !important;
backdrop-filter: blur(10px) !important;
}

.gradio-textbox {
border-radius: 10px !important;
box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
min-height: 380px !important;
max-height: 380px !important;
height: 380px !important;
}

.gradio-button {
border-radius: 25px !important;
font-weight: bold !important;
transition: all 0.3s ease !important;
margin: 5px 0 !important;
min-height: 50px !important;
background: linear-gradient(45deg, #667eea, #764ba2) !important;
border: none !important;
color: white !important;
}

.gradio-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(0,0,0,0.3) !important;
background: linear-gradient(45deg, #764ba2, #667eea) !important;
}

@media (max-width: 1200px) {
.workflow {
grid-template-columns: 1fr 1fr !important;
}
}

@media (max-width: 768px) {
.workflow {
grid-template-columns: 1fr !important;
}
}
"""
1097
+
1098
# Main Gradio Interface
# Builds the UI: header, language/mode selectors, model status banner,
# category checkboxes, the four-column workflow (input, anonymized text,
# ChatGPT reply, restored reply) and two auxiliary tool panels, then wires
# the buttons to the handler functions defined above.
with gr.Blocks(title="🚀 Enhanced Multi-Modal Anonymization", theme=gr.themes.Soft(), css=enhanced_css) as app:

    # Header
    # NOTE(review): the first characters of the tagline below are
    # mis-encoded (U+FFFD replacement characters); restore the intended
    # symbol once it is known.
    with gr.Row():
        gr.HTML("""
<div class="enhanced-header">
<h1 style='color: white; font-size: 3em; margin: 0; text-shadow: 2px 2px 4px rgba(0,0,0,0.5);'>
🚀 Enhanced Multi-Modal Anonymization System
</h1>
<p style='color: white; font-size: 1.2em; margin: 10px 0 0 0; text-shadow: 1px 1px 2px rgba(0,0,0,0.5);'>
���� Advanced Regex + Optional NER = Maximum Accuracy
</p>
</div>
""")

    # Language and Mode Selection
    with gr.Row():
        with gr.Column(scale=1):
            language_selector = gr.Radio(
                choices=["فارسی", "English"],
                value="فارسی",
                label="Language / زبان",
                interactive=True
            )

        with gr.Column(scale=2, elem_classes="mode-selector"):
            processing_mode = gr.Radio(
                choices=[
                    ("⚡ Regex Only (Fast & Compatible)", "regex_only"),
                    ("🎯 Hybrid Mode (Recommended)", "hybrid"),
                    ("🔬 NER Priority (Highest Accuracy)", "ner_priority")
                ],
                # Default to the hybrid mode only when the model is ready.
                value="regex_only" if not anonymizer.model_ready else "hybrid",
                label="🎚️ Processing Mode",
                info="Choose processing complexity vs accuracy trade-off"
            )

    # Model Status Display
    with gr.Row():
        model_status_display = gr.HTML(
            f'<div class="model-status">🤖 Model Status: {anonymizer.model_status}</div>'
        )

    # Category Selection (every category is selected by default)
    with gr.Row():
        with gr.Column():
            pattern_categories = gr.CheckboxGroup(
                choices=anonymizer.get_category_choices('fa'),
                value=anonymizer.get_category_choices('fa'),
                label="🎯 انتخاب دسته‌بندی‌های الگوی ناشناس‌سازی:",
                interactive=True
            )

    # Main Workflow
    with gr.Row(elem_classes="workflow rtl") as workflow_row:
        with gr.Column():
            step1_title = gr.HTML('<h2 style="direction: rtl;">📝 متن ورودی</h2>')
            input_text = gr.Textbox(
                lines=15,
                placeholder="متن اصلی خود را اینجا وارد کنید...\n\n🚀 سیستم پیشرفته با الگوهای regex جامع\n✅ دقت بالا برای نام اشخاص، شرکت‌ها، مکان‌ها\n📱 شناسایی دقیق تلفن، ایمیل، حساب بانکی\n💰 تشخیص مبالغ مالی و درصدها\n🗓️ استخراج تاریخ‌ها و زمان‌ها",
                label="",
                rtl=True
            )

            process_btn = gr.Button("🚀 پردازش پیشرفته", variant="primary")
            clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")

            status = gr.Textbox(
                label="وضعیت پردازش",
                lines=4,
                interactive=False,
                rtl=True
            )

        with gr.Column():
            step2_title = gr.HTML('<h2 style="direction: rtl;">🎭 متن ناشناس‌شده</h2>')
            anonymized_output = gr.Textbox(
                lines=15,
                placeholder="متن ناشناس‌شده با کدهای محافظتی...",
                label="",
                interactive=False,
                rtl=True
            )

        with gr.Column():
            step3_title = gr.HTML('<h2 style="direction: rtl;">🤖 پاسخ ChatGPT</h2>')
            gpt_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ ChatGPT به متن ناشناس‌شده...",
                label="",
                interactive=False,
                rtl=True
            )

        with gr.Column():
            step4_title = gr.HTML('<h2 style="direction: rtl;">✅ پاسخ نهایی</h2>')
            final_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ نهایی با بازگردانی اطلاعات اصلی...",
                label="",
                interactive=False,
                rtl=True
            )

    # Additional Tools (both panels stay hidden until their button is used)
    with gr.Row():
        with gr.Column():
            mapping_btn = gr.Button("📋 نمایش جدول نگاشت پیشرفته")
            mapping_output = gr.Textbox(
                lines=15,
                label="جدول نگاشت اطلاعات",
                interactive=False,
                visible=False,
                rtl=True
            )

        with gr.Column():
            system_status_btn = gr.Button("📊 نمایش وضعیت سیستم پیشرفته")
            system_status_output = gr.Textbox(
                lines=20,
                label="وضعیت سیستم",
                interactive=False,
                visible=False,
                rtl=True
            )

    # Event Handlers
    process_btn.click(
        fn=process_all_steps_enhanced,
        inputs=[input_text, language_selector, pattern_categories, processing_mode],
        outputs=[status, anonymized_output, gpt_output, final_output]
    )

    clear_btn.click(
        fn=clear_all_enhanced,
        outputs=[input_text, anonymized_output, gpt_output, final_output, status]
    )

    # One listener per tool button: the panel is filled and revealed in a
    # single update instead of two independent listeners writing to the
    # same component.
    mapping_btn.click(
        fn=lambda language: gr.update(value=get_mapping_table_enhanced(language), visible=True),
        inputs=[language_selector],
        outputs=[mapping_output]
    )

    system_status_btn.click(
        fn=lambda: gr.update(value=anonymizer.get_model_status(), visible=True),
        outputs=[system_status_output]
    )
1257
+
1258
if __name__ == "__main__":
    # Log the startup banner, then serve the app.
    startup_messages = (
        "🚀 Starting Enhanced Multi-Modal Anonymization System...",
        f"🤖 XLM-RoBERTa Status: {anonymizer.model_status}",
        "✅ Ready for high-accuracy bilingual processing!",
    )
    for message in startup_messages:
        logger.info(message)

    # Listen on all interfaces, port 7860, without a public share link.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    app.launch(**launch_options)