leilaghomashchi committed on
Commit
6862f9e
·
verified ·
1 Parent(s): 6c28dca
Files changed (1) hide show
  1. app.py +1136 -0
app.py ADDED
@@ -0,0 +1,1136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ import os
4
+ import requests
5
+ import time
6
+ import logging
7
+ from packaging import version
8
+
9
+ # تنظیم logging
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+ def auto_setup_models():
14
+ """راه‌اندازی خودکار مدل‌ها در صورت عدم وجود"""
15
+ models_dir = "./models"
16
+ required_models = {
17
+ 'bert-fa-ner': 'HooshvareLab/bert-fa-zwnj-base-ner',
18
+ 'bert-base-NER': 'dslim/bert-base-NER',
19
+ }
20
+
21
+ missing_models = []
22
+ for model_name in required_models.keys():
23
+ model_path = os.path.join(models_dir, model_name)
24
+ if not os.path.exists(model_path) or not os.listdir(model_path):
25
+ missing_models.append(model_name)
26
+
27
+ if not missing_models:
28
+ logger.info("✅ All models are already available")
29
+ return True
30
+
31
+ logger.info(f"📥 Auto-downloading missing models: {missing_models}")
32
+
33
+ try:
34
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
35
+ os.makedirs(models_dir, exist_ok=True)
36
+
37
+ for model_name in missing_models:
38
+ hf_repo = required_models[model_name]
39
+ model_path = os.path.join(models_dir, model_name)
40
+ logger.info(f"📥 Downloading {model_name} from {hf_repo}...")
41
+ try:
42
+ tokenizer = AutoTokenizer.from_pretrained(hf_repo)
43
+ model = AutoModelForTokenClassification.from_pretrained(hf_repo)
44
+ tokenizer.save_pretrained(model_path)
45
+ model.save_pretrained(model_path)
46
+ logger.info(f"✅ {model_name} downloaded successfully")
47
+ del tokenizer, model
48
+ except Exception as e:
49
+ logger.error(f"❌ Failed to download {model_name}: {e}")
50
+ if os.path.exists(model_path):
51
+ import shutil
52
+ shutil.rmtree(model_path)
53
+
54
+ logger.info("🎉 Auto-setup completed!")
55
+ return True
56
+
57
+ except ImportError:
58
+ logger.error("❌ transformers library not available for auto-download")
59
+ return False
60
+ except Exception as e:
61
+ logger.error(f"❌ Auto-setup failed: {e}")
62
+ return False
63
+
64
+ # اجرای auto-setup در startup
65
+ try:
66
+ auto_setup_models()
67
+ except Exception as e:
68
+ logger.warning(f"⚠️ Auto-setup encountered an issue: {e}")
69
+ logger.info("ℹ️ Continuing with manual setup...")
70
+
71
+ class BilingualDataAnonymizer:
72
+ def __init__(self):
73
+ self.mapping_table = {}
74
+ # counters به‌روزرسانی شده با دسته‌های جدید
75
+ self.counters = {
76
+ 'COMPANY': 0, 'PERSON': 0, 'AMOUNT': 0, 'ACCOUNT': 0,
77
+ 'DATE': 0, 'STOCK_SYMBOL': 0, 'PETROCHEMICAL': 0,
78
+ 'PRODUCT': 0, 'PERCENTAGE': 0, 'LOCATION': 0,
79
+ 'VOLUME': 0, 'PHONE': 0, 'EMAIL': 0, 'ID_NUMBER': 0,
80
+ 'FINANCIAL_TERMS': 0, 'BUSINESS_TERMS': 0, 'RATIOS': 0
81
+ }
82
+
83
+ self.api_key = os.getenv("OPENAI_API_KEY", "")
84
+ self.models_base_path = "./models"
85
+ self.models_loaded = False
86
+ self.model_status = {}
87
+ self.load_local_ner_models()
88
+
89
+ def ensure_models_directory(self):
90
+ if not os.path.exists(self.models_base_path):
91
+ try:
92
+ os.makedirs(self.models_base_path, exist_ok=True)
93
+ logger.info(f"📁 Created models directory: {self.models_base_path}")
94
+ except Exception as e:
95
+ logger.error(f"❌ Failed to create models directory: {e}")
96
+ return False
97
+ return True
98
+
99
+ def download_model_if_missing(self, local_name, hf_repo):
100
+ model_path = os.path.join(self.models_base_path, local_name)
101
+ if os.path.exists(model_path) and os.listdir(model_path):
102
+ return True, f"Model {local_name} already exists"
103
+ try:
104
+ logger.info(f"📥 Auto-downloading {local_name} from {hf_repo}...")
105
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
106
+ tokenizer = AutoTokenizer.from_pretrained(hf_repo)
107
+ model = AutoModelForTokenClassification.from_pretrained(hf_repo)
108
+ tokenizer.save_pretrained(model_path)
109
+ model.save_pretrained(model_path)
110
+ logger.info(f"✅ {local_name} auto-downloaded successfully")
111
+ return True, f"Downloaded {local_name}"
112
+ except Exception as e:
113
+ logger.error(f"❌ Auto-download failed for {local_name}: {e}")
114
+ return False, str(e)
115
+
116
+ def _load_pipeline(self, task, model_path, tokenizer_path=None):
117
+ """لود مدل با مدیریت صحیح پارامترهای ورژن مختلف transformers"""
118
+ try:
119
+ from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, __version__ as tr_version
120
+
121
+ # بررسی پشتیبانی از aggregation_strategy
122
+ supports_agg = version.parse(tr_version) >= version.parse("4.11.0")
123
+
124
+ # لود توکنایزر و مدل به صورت جداگانه
125
+ if tokenizer_path:
126
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
127
+ else:
128
+ tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
129
+
130
+ model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)
131
+
132
+ # ایجاد pipeline با پارامترهای مناسب
133
+ pipeline_kwargs = {
134
+ "model": model,
135
+ "tokenizer": tokenizer,
136
+ "device": -1 # استفاده از CPU
137
+ }
138
+
139
+ # اضافه کردن aggregation_strategy اگر پشتیبانی می‌شود
140
+ if supports_agg:
141
+ pipeline_kwargs["aggregation_strategy"] = "simple"
142
+
143
+ return pipeline(task, **pipeline_kwargs)
144
+
145
+ except Exception as e:
146
+ logger.error(f"❌ Failed to load pipeline for {model_path}: {e}")
147
+ return None
148
+
149
+ def load_local_ner_models(self):
150
+ logger.info("🔄 Loading local NER models with auto-download...")
151
+ if not self.ensure_models_directory():
152
+ self.models_loaded = False
153
+ self.model_status['directory'] = "❌ Cannot create models directory"
154
+ return
155
+
156
+ try:
157
+ try:
158
+ import torch
159
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
160
+ transformers_available = True
161
+ logger.info("✅ Transformers library available")
162
+ except ImportError as e:
163
+ transformers_available = False
164
+ self.model_status['transformers'] = f"❌ Transformers library not installed: {str(e)}"
165
+ self.models_loaded = False
166
+ return
167
+
168
+ # Persian model
169
+ persian_model_path = os.path.join(self.models_base_path, "bert-fa-ner")
170
+ self.download_model_if_missing("bert-fa-ner", "HooshvareLab/bert-fa-zwnj-base-ner")
171
+ if os.path.exists(persian_model_path) and os.listdir(persian_model_path):
172
+ try:
173
+ self.persian_ner = self._load_pipeline("ner", persian_model_path)
174
+ if self.persian_ner:
175
+ self.model_status['persian'] = f"✅ Local Persian NER: {persian_model_path}"
176
+ else:
177
+ self.model_status['persian'] = f"❌ Failed to load Persian model: {persian_model_path}"
178
+ except Exception as e:
179
+ self.persian_ner = None
180
+ self.model_status['persian'] = f"❌ Persian model loading error: {str(e)[:100]}"
181
+ else:
182
+ self.persian_ner = None
183
+ self.model_status['persian'] = f"❌ Persian model not found: {persian_model_path}"
184
+
185
+ # English model
186
+ english_model_path = os.path.join(self.models_base_path, "bert-base-NER")
187
+ self.download_model_if_missing("bert-base-NER", "dslim/bert-base-NER")
188
+ if os.path.exists(english_model_path) and os.listdir(english_model_path):
189
+ try:
190
+ self.english_ner = self._load_pipeline("ner", english_model_path)
191
+ if self.english_ner:
192
+ self.model_status['english'] = f"✅ Local English NER: {english_model_path}"
193
+ else:
194
+ self.model_status['english'] = f"❌ Failed to load English model: {english_model_path}"
195
+ except Exception as e:
196
+ self.english_ner = None
197
+ self.model_status['english'] = f"❌ English model loading error: {str(e)[:100]}"
198
+ else:
199
+ self.english_ner = None
200
+ self.model_status['english'] = f"❌ English model not found: {english_model_path}"
201
+
202
+ loaded_models = sum(1 for status in self.model_status.values() if status.startswith("✅"))
203
+ self.models_loaded = loaded_models > 0
204
+ if loaded_models == 0:
205
+ self.model_status['fallback'] = "⚠️ Using regex-only mode (no local models found)"
206
+
207
+ except Exception as e:
208
+ self.models_loaded = False
209
+ self.model_status['critical'] = f"❌ Critical error: {str(e)[:100]}..."
210
+
211
+ def detect_language(self, text):
212
+ """تشخیص زبان متن"""
213
+ if not text:
214
+ return 'fa'
215
+
216
+ persian_chars = len(re.findall(r'[\u0600-\u06FF]', text))
217
+ english_chars = len(re.findall(r'[a-zA-Z]', text))
218
+ total = persian_chars + english_chars
219
+
220
+ if total == 0:
221
+ return 'fa'
222
+
223
+ if persian_chars / total > 0.6:
224
+ return 'fa'
225
+ elif english_chars / total > 0.6:
226
+ return 'en'
227
+ else:
228
+ return 'mixed'
229
+
230
+ def extract_entities_with_ner(self, text, lang='fa'):
231
+ """استخراج entities با مدل‌های NER محلی"""
232
+ entities = []
233
+
234
+ if not self.models_loaded:
235
+ logger.info("ℹ️ Local NER models not available - using regex only")
236
+ return entities
237
+
238
+ try:
239
+ # مدل فارسی محلی
240
+ if lang in ['fa', 'mixed'] and hasattr(self, 'persian_ner') and self.persian_ner:
241
+ try:
242
+ persian_results = self.persian_ner(text)
243
+ for entity in persian_results:
244
+ # بررسی فرمت خروجی بر اساس ورژن transformers
245
+ if isinstance(entity, dict):
246
+ if 'entity_group' in entity:
247
+ # ورژن جدید با aggregation_strategy
248
+ entities.append({
249
+ 'text': entity['word'].strip(),
250
+ 'label': entity['entity_group'],
251
+ 'start': entity['start'],
252
+ 'end': entity['end'],
253
+ 'confidence': entity['score'],
254
+ 'source': 'local_persian_ner'
255
+ })
256
+ else:
257
+ # ورژن قدیمی
258
+ entities.append({
259
+ 'text': entity['word'].strip(),
260
+ 'label': entity['entity'],
261
+ 'start': entity['start'],
262
+ 'end': entity['end'],
263
+ 'confidence': entity['score'],
264
+ 'source': 'local_persian_ner'
265
+ })
266
+ logger.info(f"Local Persian NER found {len(persian_results)} entities")
267
+ except Exception as e:
268
+ logger.error(f"Local Persian NER extraction error: {e}")
269
+
270
+ # مدل انگلیسی محلی
271
+ if lang in ['en', 'mixed'] and hasattr(self, 'english_ner') and self.english_ner:
272
+ try:
273
+ english_results = self.english_ner(text)
274
+ for entity in english_results:
275
+ # بررسی فرمت خروجی بر اساس ورژن transformers
276
+ if isinstance(entity, dict):
277
+ if 'entity_group' in entity:
278
+ # ورژن جدید با aggregation_strategy
279
+ entities.append({
280
+ 'text': entity['word'].strip(),
281
+ 'label': entity['entity_group'],
282
+ 'start': entity['start'],
283
+ 'end': entity['end'],
284
+ 'confidence': entity['score'],
285
+ 'source': 'local_english_ner'
286
+ })
287
+ else:
288
+ # ورژن قدیمی
289
+ entities.append({
290
+ 'text': entity['word'].strip(),
291
+ 'label': entity['entity'],
292
+ 'start': entity['start'],
293
+ 'end': entity['end'],
294
+ 'confidence': entity['score'],
295
+ 'source': 'local_english_ner'
296
+ })
297
+ logger.info(f"Local English NER found {len(english_results)} entities")
298
+ except Exception as e:
299
+ logger.error(f"Local English NER extraction error: {e}")
300
+
301
+ except Exception as e:
302
+ logger.error(f"Local NER extraction general error: {e}")
303
+
304
+ # حذف تکراری‌ها
305
+ unique_entities = []
306
+ seen = set()
307
+ for entity in entities:
308
+ key = (entity['text'].lower(), entity['start'], entity['end'])
309
+ if key not in seen:
310
+ seen.add(key)
311
+ unique_entities.append(entity)
312
+
313
+ logger.info(f"Total unique entities found by local models: {len(unique_entities)}")
314
+ return unique_entities
315
+
316
+ def map_ner_to_categories(self, ner_label, source=''):
317
+ """نگاشت برچسب‌های NER به دسته‌های سیستم"""
318
+ mapping = {
319
+ 'PER': 'PERSON', 'PERSON': 'PERSON',
320
+ 'ORG': 'COMPANY', 'ORGANIZATION': 'COMPANY',
321
+ 'LOC': 'LOCATION', 'LOCATION': 'LOCATION',
322
+ 'MISC': 'BUSINESS_TERMS', 'MISCELLANEOUS': 'BUSINESS_TERMS',
323
+ 'B-PER': 'PERSON', 'I-PER': 'PERSON',
324
+ 'B-ORG': 'COMPANY', 'I-ORG': 'COMPANY',
325
+ 'B-LOC': 'LOCATION', 'I-LOC': 'LOCATION',
326
+ 'B-MISC': 'BUSINESS_TERMS', 'I-MISC': 'BUSINESS_TERMS',
327
+ 'MONEY': 'AMOUNT', 'PERCENT': 'PERCENTAGE',
328
+ 'DATE': 'DATE', 'TIME': 'DATE'
329
+ }
330
+ return mapping.get(ner_label.upper(), 'BUSINESS_TERMS')
331
+
332
+ def anonymize_text(self, original_text, lang='fa'):
333
+ """گام 1: ناشناس‌سازی متن"""
334
+ try:
335
+ if not original_text or not original_text.strip():
336
+ return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
337
+
338
+ # ریست متغیرها
339
+ self.mapping_table = {}
340
+ self.counters = {key: 0 for key in self.counters.keys()}
341
+
342
+ anonymized = original_text
343
+ found_entities = set()
344
+
345
+ # تشخیص زبان
346
+ detected_lang = self.detect_language(original_text)
347
+ logger.info(f"Detected language: {detected_lang}")
348
+
349
+ # مرحله 1: استخراج با Local NER
350
+ if self.models_loaded:
351
+ logger.info("🤖 Running local NER extraction...")
352
+ ner_entities = self.extract_entities_with_ner(original_text, detected_lang)
353
+
354
+ for entity in ner_entities:
355
+ if (entity['text'] not in found_entities and
356
+ len(entity['text'].strip()) > 1 and
357
+ entity['confidence'] > 0.5):
358
+
359
+ category = self.map_ner_to_categories(entity['label'], entity['source'])
360
+
361
+ if entity['text'] not in self.mapping_table:
362
+ self.counters[category] += 1
363
+ code = f"{category}_{self.counters[category]:03d}_LOCAL_NER"
364
+ self.mapping_table[entity['text']] = code
365
+ found_entities.add(entity['text'])
366
+ logger.info(f"Local NER: {entity['text']} -> {code}")
367
+ else:
368
+ logger.info("ℹ️ Using regex-only mode")
369
+
370
+ # مرحله 2: الگوهای Regex - الگوهای جدید اضافه شده
371
+ patterns = {
372
+ 'STOCK_SYMBOL': [
373
+ r'نماد\s+([آ-ی‌a-zA-Z0-9]+)',
374
+ r'(سبهان|غدیر|شتران|شپنا|پترول|فارس|خارک|پلاسکو|جم|کرمان|مارون|اراک|رازی|شازند|کاوه|بندر|پارس|خوزستان|ماهشهر|عسلویه)(?=\s|$|،|\.|\s+—)',
375
+ r'شرکت\s+([آ-ی‌a-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+را|\s+به)',
376
+ r'پتروشیمی\s+([آ-ی‌a-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+توان)',
377
+ r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)'
378
+ ],
379
+ 'COMPANY': [
380
+ r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به|\s+طی)',
381
+ r'([آ-ی‌a-zA-Z\s]+)\s+شرکت',
382
+ r'این\s+شرکت(?=\s|$|،|\.)',
383
+ r'(بانک\s+[آ-ی‌a-zA-Z\s]+)',
384
+ r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
385
+ ],
386
+ 'PERSON': [
387
+ r'آقای\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
388
+ r'خانم\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
389
+ r'مهندس\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
390
+ r'دکتر\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
391
+ r'([آ-ی‌a-zA-Z]+\s+[آ-ی‌a-zA-Z]+)(?=،\s+مدیرعامل|\s+مدیرعامل|\s+رئیس)',
392
+ r'مدیرعامل(?=\s|$|،|\.)',
393
+ r'سرپرست(?=\s+و|\s|$|،|\.)',
394
+ r'رئیس\s+هیأت‌مدیره',
395
+ r'وی(?=\s+ادامه|\s+اظهار|\s+گفت|\s+اعلام|\s+همچنین)'
396
+ ],
397
+ 'AMOUNT': [
398
+ r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
399
+ r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
400
+ r'\d+\s*تومان(?=\s+به\s+ازای|\s+فروش|\s+،)',
401
+ r'رقم\s+فعلی\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
402
+ r'رقم\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
403
+ r'به\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
404
+ r'از\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
405
+ r'برابر\s+با\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
406
+ r'\d+(?:میلیارد|میلیون)\s*تومان(?=\s+رسیده|\s+ثبت|\s+بوده|\s+،)',
407
+ r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
408
+ r'\d+(?:,\d{3})*\s*ریال',
409
+ r'€\d+(?:,\d{3})*(?:\.\d+)?'
410
+ ],
411
+ 'PERCENTAGE': [
412
+ r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایین‌تر)?',
413
+ r'\d+(?:\.\d+)?\s*%',
414
+ r'معادل\s+\d+(?:\.\d+)?\s*درصد',
415
+ r'حدود\s+\d+(?:\.\d+)?\s*درصد',
416
+ r'با\s+\d+(?:\.\d+)?\s*درصد\s+افزایش',
417
+ r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
418
+ r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده)',
419
+ r'میزان\s+رشد(?=\s+نسبت|\s+معادل)',
420
+ r'افزایش\s+قابل‌توجهی',
421
+ r'بهبود\s+نسبی'
422
+ ],
423
+ 'PHONE': [
424
+ r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
425
+ r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
426
+ r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
427
+ r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}',
428
+ r'[۰-۹0-9]{11}(?!\d)',
429
+ r'(?:\+98|0098)?[۰-۹0-9]{10}',
430
+ r'[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}'
431
+ ],
432
+ 'EMAIL': [
433
+ r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
434
+ r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
435
+ r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
436
+ r'نشانی[\s]*الکترونیک[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
437
+ r'آدرس[\s]*ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
438
+ ],
439
+ 'ACCOUNT': [
440
+ r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
441
+ r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
442
+ r'شماره[\s]*حساب[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
443
+ r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}',
444
+ r'[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{6,12}',
445
+ r'[۰-۹0-9]{2,4}[-\s]?[۰-۹0-9]{6,12}[-\s]?[۰-۹0-9]{2,4}',
446
+ r'واریز[\s]*(?:سود[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
447
+ r'سود[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}'
448
+ ],
449
+ 'ID_NUMBER': [
450
+ r'IR[۰-۹0-9]{24}',
451
+ r'شبا[\s:]*IR[۰-۹0-9]{24}',
452
+ r'IBAN[\s:]*IR[۰-۹0-9]{24}',
453
+ r'شماره[\s]*شبا[\s:]*IR[۰-۹0-9]{24}',
454
+ r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
455
+ r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
456
+ r'National[\s]*(?:ID[\s:]*)?[0-9]{10}',
457
+ r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}',
458
+ r'(?:Passport[\s:]*)?[A-Z][0-9]{8}',
459
+ r'(?:کارت[\s:]*)?(?:[۰-۹0-9]{4}[-\s]?){3}[۰-۹0-9]{4}',
460
+ r'(?:Card[\s:]*)?(?:[0-9]{4}[-\s]?){3}[0-9]{4}'
461
+ ],
462
+ 'DATE': [
463
+ r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
464
+ r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
465
+ r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
466
+ r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})',
467
+ r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}'
468
+ ]
469
+ }
470
+
471
+ # پردازش patterns با اولویت‌بندی - از خاص به عام
472
+ logger.info("🔍 Running prioritized regex extraction...")
473
+
474
+ # پردازش به ترتیب اولویت برای جلوگیری از تداخل
475
+ processed_entities = set() # برای جلوگیری از تکرار
476
+
477
+ for category, pattern_list in patterns.items():
478
+ for pattern in pattern_list:
479
+ matches = re.finditer(pattern, original_text, re.IGNORECASE | re.MULTILINE)
480
+ for match in matches:
481
+ if match.groups():
482
+ item = match.group(1).strip()
483
+ full_match = match.group(0).strip()
484
+ else:
485
+ item = match.group(0).strip()
486
+ full_match = item
487
+
488
+ # بررسی تداخل با entities قبلی
489
+ overlaps = False
490
+ match_start, match_end = match.span()
491
+
492
+ for proc_start, proc_end in processed_entities:
493
+ # بررسی تداخل موقعیت
494
+ if not (match_end <= proc_start or match_start >= proc_end):
495
+ overlaps = True
496
+ break
497
+
498
+ if (not overlaps and
499
+ full_match not in found_entities and
500
+ full_match not in self.mapping_table and
501
+ len(full_match) >= 2):
502
+
503
+ self.counters[category] += 1
504
+ code = f"{category}_{self.counters[category]:03d}_REGEX"
505
+ self.mapping_table[full_match] = code
506
+ found_entities.add(full_match)
507
+ processed_entities.add((match_start, match_end))
508
+ logger.info(f"Regex ({category}): {full_match} -> {code}")
509
+
510
+ # جایگزینی در متن با ترتیب طولانی‌ترین اول
511
+ sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
512
+ for original_item, code in sorted_items:
513
+ anonymized = anonymized.replace(original_item, code)
514
+
515
+ logger.info(f"✅ Anonymization completed. Found {len(self.mapping_table)} entities.")
516
+ return anonymized
517
+
518
+ except Exception as e:
519
+ return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در ناشناس‌سازی: {str(e)}"
520
+
521
+ def send_to_chatgpt(self, anonymized_text, lang='fa'):
522
+ """گام 2: ارسال به ChatGPT"""
523
+ try:
524
+ if not anonymized_text or not anonymized_text.strip():
525
+ return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناس‌شده خالی است!"
526
+
527
+ if not self.api_key:
528
+ return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable." if lang == 'en' else "❌ کلید API تنظیم نشده است! لطفاً OPENAI_API_KEY را در متغیرهای محیطی تنظیم کنید."
529
+
530
+ system_msg = "You are a professional financial analyst. The text contains anonymous codes. Answer questions accurately." if lang == 'en' else "شما یک تحلیلگر مالی حرفه‌ای هستید. متن حاوی کدهای ناشناس است. به سوالات با دقت پاسخ دهید."
531
+
532
+ headers = {
533
+ "Authorization": f"Bearer {self.api_key}",
534
+ "Content-Type": "application/json"
535
+ }
536
+
537
+ data = {
538
+ "model": "gpt-4o-mini",
539
+ "messages": [
540
+ {"role": "system", "content": system_msg},
541
+ {"role": "user", "content": anonymized_text}
542
+ ],
543
+ "max_tokens": 2000,
544
+ "temperature": 0.7
545
+ }
546
+
547
+ response = requests.post(
548
+ "https://api.openai.com/v1/chat/completions",
549
+ headers=headers,
550
+ json=data,
551
+ timeout=30
552
+ )
553
+
554
+ if response.status_code == 200:
555
+ result = response.json()
556
+ return result['choices'][0]['message']['content']
557
+ else:
558
+ error_data = response.json() if response.content else {}
559
+ error_message = error_data.get('error', {}).get('message', response.text)
560
+
561
+ if 'Incorrect API key' in error_message:
562
+ return "❌ Invalid API key." if lang == 'en' else "❌ کلید API نامعتبر است."
563
+ elif 'quota' in error_message:
564
+ return "❌ API quota exceeded." if lang == 'en' else "❌ سهمیه API تمام شده است."
565
+ else:
566
+ return f"❌ API Error: {error_message}"
567
+
568
+ except Exception as e:
569
+ return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"
570
+
571
+ def deanonymize_response(self, gpt_response, lang='fa'):
572
+ """گام 3: بازگردانی"""
573
+ try:
574
+ if not gpt_response or not gpt_response.strip():
575
+ return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!"
576
+
577
+ if not self.mapping_table:
578
+ return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"
579
+
580
+ final_result = gpt_response
581
+ reverse_mapping = {code: original for original, code in self.mapping_table.items()}
582
+
583
+ sorted_codes = sorted(reverse_mapping.items(), key=lambda x: len(x[0]), reverse=True)
584
+ for code, original in sorted_codes:
585
+ final_result = final_result.replace(code, original)
586
+ escaped_code = code.replace('_', '\\_')
587
+ final_result = final_result.replace(escaped_code, original)
588
+
589
+ return final_result
590
+
591
+ except Exception as e:
592
+ return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}"
593
+
594
+ def get_model_status(self):
595
+ """وضعیت مدل‌های محلی"""
596
+ status = "🤖 **Local Model Status (Enhanced with Priority-Based Sensitive Data Detection):**\n\n"
597
+
598
+ if hasattr(self, 'model_status') and self.model_status:
599
+ for model_type, model_status in self.model_status.items():
600
+ if model_type == 'persian':
601
+ status += f"• **Persian NER**: {model_status}\n"
602
+ elif model_type == 'english':
603
+ status += f"• **English NER**: {model_status}\n"
604
+ elif model_type == 'financial':
605
+ status += f"• **Financial NER**: {model_status}\n"
606
+ elif model_type == 'transformers':
607
+ status += f"• **Transformers**: {model_status}\n"
608
+ elif model_type == 'fallback':
609
+ status += f"• **Fallback Mode**: {model_status}\n"
610
+ elif model_type == 'critical':
611
+ status += f"• **Critical**: {model_status}\n"
612
+ elif model_type == 'directory':
613
+ status += f"• **Directory**: {model_status}\n"
614
+
615
+ loaded_count = sum(1 for status in getattr(self, 'model_status', {}).values()
616
+ if status.startswith("✅"))
617
+ status += f"\n📊 **Summary**: {loaded_count}/2 local models loaded"
618
+
619
+ status += f"\n📁 **Models Path**: {self.models_base_path}"
620
+ status += f"\n🔧 **Latest Features**: Priority-based detection with overlap prevention"
621
+
622
+ status += f"\n\n🔍 **Enhanced Sensitive Data Detection (Priority Order):**"
623
+ status += f"\n 1️⃣ **ID Numbers**: IBAN/SHEBA codes, National IDs, Passport numbers"
624
+ status += f"\n 2️⃣ **Contact Info**: Email addresses with context keywords"
625
+ status += f"\n 3️⃣ **Phone Numbers**: Mobile & landline with country codes"
626
+ status += f"\n 4️⃣ **Bank Accounts**: Account numbers with Persian keywords"
627
+ status += f"\n 5️⃣ **Financial Data**: Amounts, percentages, stock symbols"
628
+ status += f"\n 6️⃣ **Corporate Data**: Company names, person names, dates"
629
+
630
+ status += f"\n\n✨ **Key Improvements:**"
631
+ status += f"\n 🎯 Overlap detection prevents double-matching"
632
+ status += f"\n 🇮🇷 Full Persian digit support (۰-۹)"
633
+ status += f"\n 🔄 Context-aware pattern matching"
634
+ status += f"\n 📏 Length-based replacement order"
635
+
636
+ return status
637
+
638
+ def process_all_steps(input_text, language):
639
+ """پردازش خودکار تمام مراحل"""
640
+ lang = 'en' if language == 'English' else 'fa'
641
+
642
+ if not input_text.strip():
643
+ error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
644
+ return error_msg, "", "", ""
645
+
646
+ try:
647
+ start_time = time.time()
648
+
649
+ anonymized_text = anonymizer.anonymize_text(input_text, lang)
650
+ if anonymized_text.startswith("❌"):
651
+ return anonymized_text, "", "", ""
652
+
653
+ gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
654
+ if gpt_response.startswith("❌"):
655
+ entities_found = len(anonymizer.mapping_table)
656
+ local_ner_count = sum(1 for code in anonymizer.mapping_table.values() if '_LOCAL_NER' in code)
657
+ regex_count = sum(1 for code in anonymizer.mapping_table.values() if '_REGEX' in code)
658
+
659
+ # آمار اطلاعات حساس
660
+ sensitive_categories = ['ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT']
661
+ sensitive_count = sum(1 for code in anonymizer.mapping_table.values()
662
+ if any(cat in code for cat in sensitive_categories))
663
+
664
+ method = "Priority-Based Local NER + Regex" if anonymizer.models_loaded else "Priority-Based Regex Only"
665
+ success_msg = (f"✅ Anonymization completed with {method}!\n"
666
+ f"🔍 Sensitive data: {sensitive_count} | 🤖 NER: {local_ner_count} | 🔎 Regex: {regex_count}\n"
667
+ f"📊 Total: {entities_found} entities protected")
668
+ return success_msg, anonymized_text, gpt_response, ""
669
+
670
+ final_result = anonymizer.deanonymize_response(gpt_response, lang)
671
+
672
+ total_time = time.time() - start_time
673
+ entities_found = len(anonymizer.mapping_table)
674
+ local_ner_count = sum(1 for code in anonymizer.mapping_table.values() if '_LOCAL_NER' in code)
675
+ regex_count = sum(1 for code in anonymizer.mapping_table.values() if '_REGEX' in code)
676
+
677
+ # آمار تفصیلی اطلاعات حساس
678
+ id_count = sum(1 for code in anonymizer.mapping_table.values() if 'ID_NUMBER' in code)
679
+ email_count = sum(1 for code in anonymizer.mapping_table.values() if 'EMAIL' in code)
680
+ phone_count = sum(1 for code in anonymizer.mapping_table.values() if 'PHONE' in code)
681
+ account_count = sum(1 for code in anonymizer.mapping_table.values() if 'ACCOUNT' in code)
682
+
683
+ sensitive_details = []
684
+ if id_count > 0: sensitive_details.append(f"🆔 IDs: {id_count}")
685
+ if email_count > 0: sensitive_details.append(f"📧 Emails: {email_count}")
686
+ if phone_count > 0: sensitive_details.append(f"📞 Phones: {phone_count}")
687
+ if account_count > 0: sensitive_details.append(f"🏦 Accounts: {account_count}")
688
+
689
+ method = "Priority-Based Local NER + Regex" if anonymizer.models_loaded else "Priority-Based Regex Only"
690
+ success_msg = (f"🎉 Complete anonymization & restoration successful!\n"
691
+ f"🔧 Method: {method}\n"
692
+ f"🔍 Sensitive data: {' | '.join(sensitive_details) if sensitive_details else '0'}\n"
693
+ f"📊 Total: {entities_found} entities | ⏱️ Time: {total_time:.2f}s")
694
+
695
+ return success_msg, anonymized_text, gpt_response, final_result
696
+
697
+ except Exception as e:
698
+ error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
699
+ return error_msg, "", "", ""
700
+
701
def _render_mapping_section(title, items, more_tmpl, limit=8, trailing_newline=True):
    """Render one mapping-table group: a header, up to `limit` rows, and a
    localized '... N more' line when the group is truncated."""
    lines = [f"{title}:\n"]
    for original, code in list(items.items())[:limit]:
        lines.append(f"  • `{original}` → `{code}`\n")
    if len(items) > limit:
        lines.append(more_tmpl.format(len(items) - limit))
    if trailing_newline:
        lines.append("\n")
    return "".join(lines)

def get_mapping_table(language):
    """Build a markdown report of the anonymizer's current mapping table.

    Entries are grouped by high-priority sensitive categories (IDs, emails,
    phones, bank accounts), then local-NER detections, then the remaining
    regex-detected business data, followed by overall statistics.

    Args:
        language: 'English' for English output; anything else yields Persian.

    Returns:
        Markdown-formatted report string, or a localized error message when
        the mapping table is empty.
    """
    lang = 'en' if language == 'English' else 'fa'

    if not anonymizer.mapping_table:
        return ("❌ Mapping table is empty! Please process some text first."
                if lang == 'en'
                else "❌ جدول نگاشت خالی است! ابتدا متنی را پردازش کنید.")

    # Fix: the original always emitted the Persian "... و N مورد دیگر"
    # continuation line, even in English mode. Localize it instead.
    more_tmpl = ("  ... and {} more items\n" if lang == 'en'
                 else "  ... و {} مورد دیگر\n")

    result = ("📋 **Priority-Based Sensitive Data Mapping Table:**\n\n"
              if lang == 'en'
              else "📋 **جدول نگاشت اطلاعات حساس با اولویت‌بندی:**\n\n")

    mapping = anonymizer.mapping_table
    local_ner_items = {k: v for k, v in mapping.items() if '_LOCAL_NER' in v}
    regex_items = {k: v for k, v in mapping.items() if '_REGEX' in v}

    # High-priority sensitive-data categories, rendered first.
    priority_categories = {
        'ID_NUMBER': '🆔 **Identity & Financial Codes**',
        'EMAIL': '📧 **Email Addresses**',
        'PHONE': '📞 **Phone Numbers**',
        'ACCOUNT': '🏦 **Bank Account Numbers**'
    }

    for category, title in priority_categories.items():
        category_items = {k: v for k, v in mapping.items() if category in v}
        if category_items:
            result += _render_mapping_section(title, category_items, more_tmpl)

    if local_ner_items:
        result += _render_mapping_section('🤖 **Local NER Detected**',
                                          local_ner_items, more_tmpl)

    # Remaining regex hits (financial / corporate data). The last section has
    # no trailing blank line, matching the original layout.
    other_categories = ['AMOUNT', 'PERCENTAGE', 'COMPANY', 'PERSON', 'STOCK_SYMBOL', 'DATE']
    other_items = {k: v for k, v in regex_items.items()
                   if any(cat in v for cat in other_categories)}

    if other_items:
        result += _render_mapping_section('💼 **Business & Financial Data**',
                                          other_items, more_tmpl,
                                          trailing_newline=False)

    # Overall statistics. NOTE(review): a code matching more than one priority
    # category would be counted once per match; assumed mutually exclusive.
    sensitive_count = sum(
        sum(1 for v in mapping.values() if cat in v)
        for cat in priority_categories
    )

    result += "\n📊 **Statistics**:\n"
    result += f"🔍 **Sensitive Data**: {sensitive_count} items\n"
    result += f"🤖 **NER Detected**: {len(local_ner_items)} items\n"
    result += f"💼 **Business Data**: {len(other_items)} items\n"
    result += f"📋 **Total**: {len(mapping)} entities\n"

    result += "\n✨ **Enhancement Applied**: Priority-based detection with overlap prevention\n"
    result += "🎯 **Success**: All major sensitive data types detected and anonymized!"

    return result
767
+
768
def clear_all():
    """Reset the anonymizer's shared state and blank out every UI field."""
    # Drop all stored original→code mappings and zero each entity counter.
    anonymizer.mapping_table = {}
    anonymizer.counters = dict.fromkeys(anonymizer.counters, 0)
    # One empty string per bound output widget:
    # input, anonymized, gpt, final, status.
    return ("", "", "", "", "")
773
+
774
def update_ui_text(language):
    """Return the localized bundle of UI strings for the selected language.

    Args:
        language: 'English' selects the English bundle; any other value
            (the UI passes 'فارسی') selects the Persian bundle.

    Returns:
        Dict with title, step headings, placeholder, button labels and the
        text direction ('ltr' or 'rtl').
    """
    english_texts = {
        'title': 'Priority-Based Bilingual Data Anonymization System',
        'step1': 'Input Text & Settings',
        'step2': 'Anonymized Text',
        'step3': 'Raw ChatGPT Response',
        'step4': 'Final Restored Response',
        'input_placeholder': 'Enter your original text here...\nExample: Company reports, person names, financial amounts, phone numbers, emails, IBAN codes, bank accounts, etc.',
        'process_btn': 'Process with Smart Priority Detection',
        'clear_btn': 'Clear All',
        'mapping_btn': 'Show Priority-Based Mapping Table',
        'copy_btn': 'Copy',
        'direction': 'ltr'
    }
    persian_texts = {
        'title': 'سیستم ناشناس‌سازی هوشمند با اولویت‌بندی',
        'step1': 'متن ورودی و تنظیمات',
        'step2': 'متن ناشناس‌شده',
        'step3': 'پاسخ خام ChatGPT',
        'step4': 'پاسخ نهایی بازگردانده شده',
        'input_placeholder': 'متن اصلی خود را اینجا وارد کنید...\nمثال: گزارش‌های شرکت، نام اشخاص، مبالغ مالی، شماره تلفن، ایمیل، شماره شبا، حساب بانکی و غیره',
        'process_btn': 'پردازش با تشخیص هوشمند اولویت‌دار',
        'clear_btn': 'پاک کردن همه',
        'mapping_btn': 'نمایش جدول نگاشت اولویت‌دار',
        'copy_btn': 'کپی',
        'direction': 'rtl'
    }
    return english_texts if language == 'English' else persian_texts
804
+
805
def update_interface(language):
    """Re-localize every widget after the language selector changes.

    Returns a list of 15 gr.update objects in the exact order expected by
    the `language_selector.change` outputs binding: title, step1 heading,
    input box, process button, clear button, status box, step2 heading,
    anonymized box, step3 heading, gpt box, step4 heading, final box,
    mapping button, mapping box, workflow row.
    """
    ui_text = update_ui_text(language)
    is_english = (language == 'English')
    rtl = not is_english
    direction = ui_text['direction']

    # Flip the grid's CSS direction class along with the text direction.
    workflow_css = "workflow ltr" if is_english else "workflow rtl"

    def heading(icon, text):
        # All four step headings share the same inline-styled <h2> shell.
        return gr.update(value=f"<h2 style='direction: {direction};'>{icon} {text}</h2>")

    title_update = gr.update(
        value=f"<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 {ui_text['title']}</h1>"
    )

    return [
        title_update,
        heading('📝', ui_text['step1']),
        gr.update(placeholder=ui_text['input_placeholder'], rtl=rtl),
        gr.update(value=f"🚀 {ui_text['process_btn']}"),
        gr.update(value=f"🗑️ {ui_text['clear_btn']}"),
        gr.update(rtl=rtl),
        heading('🎭', ui_text['step2']),
        gr.update(rtl=rtl),
        heading('🤖', ui_text['step3']),
        gr.update(rtl=rtl),
        heading('✅', ui_text['step4']),
        gr.update(rtl=rtl),
        gr.update(value=f"📋 {ui_text['mapping_btn']}"),
        gr.update(rtl=rtl),
        gr.update(elem_classes=workflow_css),
    ]
830
+
831
# Single shared anonymizer instance; every Gradio callback below reads and
# mutates its mapping_table / counters (module-level mutable state).
anonymizer = BilingualDataAnonymizer()
833
+
834
# Custom CSS: fixes vertical alignment of the four-column workflow grid,
# pins textbox/button/heading heights so the columns line up, styles the
# status box, and adds RTL/LTR direction classes plus responsive breakpoints.
custom_css = """
body, .gradio-container {
    font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    min-height: 100vh !important;
    padding: 20px !important;
}

.rtl {
    direction: rtl !important;
    text-align: right !important;
}

.ltr {
    direction: ltr !important;
    text-align: left !important;
}

.workflow {
    display: grid !important;
    grid-template-columns: 1fr 1fr 1fr 1fr !important;
    gap: 25px !important;
    padding: 30px !important;
    align-items: start !important;
    align-content: start !important;
    grid-auto-rows: auto !important;
}

.workflow > * {
    align-self: start !important;
    vertical-align: top !important;
    margin-top: 0 !important;
}

.workflow .gradio-column,
.workflow-column {
    display: flex !important;
    flex-direction: column !important;
    align-items: stretch !important;
    justify-content: flex-start !important;
    height: auto !important;
    min-height: 0 !important;
    margin-top: 0 !important;
    padding-top: 0 !important;
}

.gradio-textbox {
    border-radius: 10px !important;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
    flex-grow: 1 !important;
    min-height: 380px !important;
    max-height: 380px !important;
    height: 380px !important;
}

.gradio-textbox textarea {
    min-height: 350px !important;
    max-height: 350px !important;
    height: 350px !important;
    resize: vertical !important;
}

.workflow.rtl {
    direction: rtl !important;
}

.workflow.ltr {
    direction: ltr !important;
}

h1, h2, h3 {
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
    margin-top: 0 !important;
    margin-bottom: 10px !important;
    padding-top: 0 !important;
    line-height: 1.2 !important;
}

h2 {
    min-height: 40px !important;
    max-height: 40px !important;
    display: flex !important;
    align-items: center !important;
    margin-bottom: 15px !important;
}

.status-box {
    background: linear-gradient(135deg, #4CAF50, #45a049) !important;
    border: 3px solid #2E7D32 !important;
    border-radius: 15px !important;
    padding: 15px !important;
    margin: 10px 0 !important;
    box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3) !important;
    animation: pulse 2s infinite !important;
    min-height: 120px !important;
    max-height: 120px !important;
}

.status-box textarea {
    background: rgba(255, 255, 255, 0.95) !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
    color: #1B5E20 !important;
    text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.8) !important;
    min-height: 80px !important;
    max-height: 80px !important;
}

@keyframes pulse {
    0% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
    50% { box-shadow: 0 8px 40px rgba(76, 175, 80, 0.6); }
    100% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
}

.gradio-button {
    border-radius: 25px !important;
    font-weight: bold !important;
    transition: all 0.3s ease !important;
    margin: 5px 0 !important;
    min-height: 50px !important;
    max-height: 50px !important;
}

.gradio-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
}

h1 {
    background: linear-gradient(45deg, #FFD700, #FFA500) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    background-clip: text !important;
    min-height: 80px !important;
}

@media (max-width: 1200px) {
    .workflow {
        grid-template-columns: 1fr 1fr !important;
        gap: 20px !important;
    }
}

@media (max-width: 768px) {
    .workflow {
        grid-template-columns: 1fr !important;
        gap: 15px !important;
    }

    .gradio-textbox {
        min-height: 300px !important;
        max-height: 300px !important;
        height: 300px !important;
    }
}

[data-testid="textbox"]:dir(rtl) {
    text-align: right !important;
    direction: rtl !important;
}

[data-testid="textbox"]:dir(ltr) {
    text-align: left !important;
    direction: ltr !important;
}

.gradio-container .gradio-column {
    align-self: start !important;
    vertical-align: top !important;
}

.gradio-container .gradio-row {
    align-items: flex-start !important;
}

* {
    box-sizing: border-box !important;
}

.gradio-container {
    align-items: start !important;
    justify-content: start !important;
}
"""
1021
+
1022
# Gradio UI: four-column workflow (input → anonymized → GPT response →
# restored response), defaulting to Persian/RTL. `update_interface` flips
# every widget when the language selector changes.
with gr.Blocks(title="📊 Priority-Based Anonymization System", theme=gr.themes.Soft(), css=custom_css) as app:

    with gr.Row():
        language_selector = gr.Radio(
            choices=["فارسی", "English"],
            value="فارسی",
            label="Language / زبان",
            interactive=True
        )

    with gr.Column():
        title = gr.HTML("<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 سیستم ناشناس‌سازی هوشمند با اولویت‌بندی</h1>")

    # Column 1: input text, action buttons and the processing-status box.
    with gr.Row(elem_classes="workflow rtl") as workflow_row:
        with gr.Column(elem_classes="workflow-column"):
            step1_title = gr.HTML('<h2 style="direction: rtl;">📝 متن ورودی و تنظیمات</h2>')

            # Fix: the original placeholder contained a mojibake sequence
            # ("حسا�� بانکی"); restored to "حساب بانکی" (bank account), the
            # same phrase used in update_ui_text's Persian placeholder.
            input_text = gr.Textbox(
                lines=15,
                placeholder="متن اصلی خود را اینجا وارد کنید...\n✨ سیستم هوشمند اطلاعات حساس مثل شماره تلفن، ایمیل، شماره شبا، حساب بانکی را به ترتیب اولویت تشخیص می‌دهد",
                label="",
                rtl=True
            )

            process_btn = gr.Button("🚀 پردازش با تشخیص هوشمند اولویت‌دار", variant="primary")
            clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")

            status = gr.Textbox(
                label="وضعیت",
                lines=4,
                interactive=False,
                rtl=True,
                elem_classes=["status-box"]
            )

        # Column 2: anonymized text (read-only).
        with gr.Column(elem_classes="workflow-column"):
            step2_title = gr.HTML('<h2 style="direction: rtl;">🎭 متن ناشناس‌شده</h2>')

            anonymized_output = gr.Textbox(
                lines=15,
                placeholder="متن ناشناس‌شده اینجا نمایش داده می‌شود...",
                label="",
                interactive=False,
                rtl=True
            )

        # Column 3: raw model response before restoration (read-only).
        with gr.Column(elem_classes="workflow-column"):
            step3_title = gr.HTML('<h2 style="direction: rtl;">🤖 پاسخ خام ChatGPT</h2>')

            gpt_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ خام ChatGPT اینجا نمایش داده می‌شود...",
                label="",
                interactive=False,
                rtl=True
            )

        # Column 4: response with original sensitive data restored (read-only).
        with gr.Column(elem_classes="workflow-column"):
            step4_title = gr.HTML('<h2 style="direction: rtl;">✅ پاسخ نهایی بازگردانده شده</h2>')

            final_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ نهایی اینجا نمایش داده می‌شود...",
                label="",
                interactive=False,
                rtl=True
            )

    # Mapping-table viewer; hidden until the button is clicked.
    with gr.Row():
        with gr.Column():
            mapping_title = gr.HTML('<h2>🗂️ جدول نگاشت اولویت‌دار</h2>')
            mapping_btn = gr.Button("📋 نمایش جدول نگاشت اولویت‌دار")

            mapping_output = gr.Textbox(
                lines=10,
                label="جدول نگاشت اطلاعات",
                interactive=False,
                visible=False,
                rtl=True
            )

    # Event handlers.
    # Output order must match the 15-element list returned by update_interface.
    language_selector.change(
        fn=update_interface,
        inputs=[language_selector],
        outputs=[title, step1_title, input_text, process_btn, clear_btn,
                 status, step2_title, anonymized_output, step3_title, gpt_output,
                 step4_title, final_output, mapping_btn, mapping_output, workflow_row]
    )

    process_btn.click(
        fn=process_all_steps,
        inputs=[input_text, language_selector],
        outputs=[status, anonymized_output, gpt_output, final_output]
    )

    clear_btn.click(
        fn=clear_all,
        outputs=[input_text, anonymized_output, gpt_output, final_output, status]
    )

    # Two listeners on the same button: the first fills the mapping textbox,
    # the second reveals it.
    mapping_btn.click(
        fn=get_mapping_table,
        inputs=[language_selector],
        outputs=[mapping_output]
    )

    mapping_btn.click(
        fn=lambda: gr.update(visible=True),
        outputs=[mapping_output]
    )

if __name__ == "__main__":
    app.launch(share=True)