leilaghomashchi commited on
Commit
666b5ff
·
verified ·
1 Parent(s): b3aa17e

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -1136
app.py DELETED
@@ -1,1136 +0,0 @@
1
- import gradio as gr
2
- import re
3
- import os
4
- import requests
5
- import time
6
- import logging
7
- from packaging import version
8
-
9
# Configure module-wide logging at INFO level
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
12
-
13
def auto_setup_models():
    """Download any required NER models that are missing locally.

    Checks ``./models`` for each required model directory and, when one is
    absent or empty, downloads it from the Hugging Face Hub via
    ``transformers`` and saves it under that directory.

    Returns:
        bool: True when every required model is present (already on disk or
        downloaded successfully), False when transformers is unavailable or
        any download failed.
    """
    models_dir = "./models"
    required_models = {
        'bert-fa-ner': 'HooshvareLab/bert-fa-zwnj-base-ner',
        'bert-base-NER': 'dslim/bert-base-NER',
    }

    # A model counts as missing when its directory is absent or empty.
    missing_models = [
        name for name in required_models
        if not os.path.exists(os.path.join(models_dir, name))
        or not os.listdir(os.path.join(models_dir, name))
    ]

    if not missing_models:
        logger.info("✅ All models are already available")
        return True

    logger.info(f"📥 Auto-downloading missing models: {missing_models}")

    try:
        from transformers import AutoTokenizer, AutoModelForTokenClassification
        import shutil  # hoisted out of the loop; used to clean up partial downloads

        os.makedirs(models_dir, exist_ok=True)

        failures = 0
        for model_name in missing_models:
            hf_repo = required_models[model_name]
            model_path = os.path.join(models_dir, model_name)
            logger.info(f"📥 Downloading {model_name} from {hf_repo}...")
            try:
                tokenizer = AutoTokenizer.from_pretrained(hf_repo)
                model = AutoModelForTokenClassification.from_pretrained(hf_repo)
                tokenizer.save_pretrained(model_path)
                model.save_pretrained(model_path)
                logger.info(f"✅ {model_name} downloaded successfully")
                # Free the in-memory copies immediately; only the on-disk
                # snapshot is needed by the loaders below.
                del tokenizer, model
            except Exception as e:
                failures += 1
                logger.error(f"❌ Failed to download {model_name}: {e}")
                # Remove a partially written model directory so a later run
                # does not mistake it for a complete model.
                if os.path.exists(model_path):
                    shutil.rmtree(model_path)

        # Fix: the original reported success unconditionally even when every
        # download above had failed.
        if failures:
            logger.error(f"❌ Auto-setup finished with {failures} failed download(s)")
            return False

        logger.info("🎉 Auto-setup completed!")
        return True

    except ImportError:
        logger.error("❌ transformers library not available for auto-download")
        return False
    except Exception as e:
        logger.error(f"❌ Auto-setup failed: {e}")
        return False
63
-
64
# Run auto-setup at startup so the NER models are on disk before the
# anonymizer below tries to load them. Failures are non-fatal: the app
# continues and falls back to regex-only detection.
try:
    auto_setup_models()
except Exception as e:
    logger.warning(f"⚠️ Auto-setup encountered an issue: {e}")
    logger.info("ℹ️ Continuing with manual setup...")
70
-
71
class BilingualDataAnonymizer:
    """Anonymize sensitive data in Persian/English text before sending it to
    ChatGPT, then restore the original values in the response.

    Detection combines local Hugging Face NER pipelines (when available)
    with prioritized regex patterns; every detected span is replaced by a
    category-numbered placeholder code recorded in ``mapping_table``.
    """

    def __init__(self):
        # original text span -> placeholder code, built during anonymization
        self.mapping_table = {}
        # per-category counters used to number the placeholder codes
        # (updated set including the newer categories)
        self.counters = {
            'COMPANY': 0, 'PERSON': 0, 'AMOUNT': 0, 'ACCOUNT': 0,
            'DATE': 0, 'STOCK_SYMBOL': 0, 'PETROCHEMICAL': 0,
            'PRODUCT': 0, 'PERCENTAGE': 0, 'LOCATION': 0,
            'VOLUME': 0, 'PHONE': 0, 'EMAIL': 0, 'ID_NUMBER': 0,
            'FINANCIAL_TERMS': 0, 'BUSINESS_TERMS': 0, 'RATIOS': 0
        }

        self.api_key = os.getenv("OPENAI_API_KEY", "")
        self.models_base_path = "./models"
        self.models_loaded = False
        self.model_status = {}
        self.load_local_ner_models()

    def ensure_models_directory(self):
        """Create the local models directory if needed; return False on failure."""
        if not os.path.exists(self.models_base_path):
            try:
                os.makedirs(self.models_base_path, exist_ok=True)
                logger.info(f"📁 Created models directory: {self.models_base_path}")
            except Exception as e:
                logger.error(f"❌ Failed to create models directory: {e}")
                return False
        return True

    def download_model_if_missing(self, local_name, hf_repo):
        """Download *hf_repo* into the models directory unless already present.

        Returns a ``(success, message)`` tuple; never raises.
        """
        model_path = os.path.join(self.models_base_path, local_name)
        if os.path.exists(model_path) and os.listdir(model_path):
            return True, f"Model {local_name} already exists"
        try:
            logger.info(f"📥 Auto-downloading {local_name} from {hf_repo}...")
            from transformers import AutoTokenizer, AutoModelForTokenClassification
            tokenizer = AutoTokenizer.from_pretrained(hf_repo)
            model = AutoModelForTokenClassification.from_pretrained(hf_repo)
            tokenizer.save_pretrained(model_path)
            model.save_pretrained(model_path)
            logger.info(f"✅ {local_name} auto-downloaded successfully")
            return True, f"Downloaded {local_name}"
        except Exception as e:
            logger.error(f"❌ Auto-download failed for {local_name}: {e}")
            return False, str(e)

    def _load_pipeline(self, task, model_path, tokenizer_path=None):
        """Build a transformers pipeline, handling parameter differences
        across transformers versions; returns None on any failure."""
        try:
            from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, __version__ as tr_version

            # aggregation_strategy only exists from transformers 4.11.0 on
            supports_agg = version.parse(tr_version) >= version.parse("4.11.0")

            # Load tokenizer and model separately, offline-only
            if tokenizer_path:
                tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

            model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)

            # Assemble pipeline arguments
            pipeline_kwargs = {
                "model": model,
                "tokenizer": tokenizer,
                "device": -1  # force CPU
            }

            # Add aggregation_strategy only when this version supports it
            if supports_agg:
                pipeline_kwargs["aggregation_strategy"] = "simple"

            return pipeline(task, **pipeline_kwargs)

        except Exception as e:
            logger.error(f"❌ Failed to load pipeline for {model_path}: {e}")
            return None

    def load_local_ner_models(self):
        """Load the Persian and English NER pipelines from local disk,
        auto-downloading them first when missing; records per-model status
        strings in ``self.model_status``."""
        logger.info("🔄 Loading local NER models with auto-download...")
        if not self.ensure_models_directory():
            self.models_loaded = False
            self.model_status['directory'] = "❌ Cannot create models directory"
            return

        try:
            try:
                import torch
                from transformers import AutoTokenizer, AutoModelForTokenClassification
                transformers_available = True
                logger.info("✅ Transformers library available")
            except ImportError as e:
                transformers_available = False
                self.model_status['transformers'] = f"❌ Transformers library not installed: {str(e)}"
                self.models_loaded = False
                return

            # Persian model
            persian_model_path = os.path.join(self.models_base_path, "bert-fa-ner")
            self.download_model_if_missing("bert-fa-ner", "HooshvareLab/bert-fa-zwnj-base-ner")
            if os.path.exists(persian_model_path) and os.listdir(persian_model_path):
                try:
                    self.persian_ner = self._load_pipeline("ner", persian_model_path)
                    if self.persian_ner:
                        self.model_status['persian'] = f"✅ Local Persian NER: {persian_model_path}"
                    else:
                        self.model_status['persian'] = f"❌ Failed to load Persian model: {persian_model_path}"
                except Exception as e:
                    self.persian_ner = None
                    self.model_status['persian'] = f"❌ Persian model loading error: {str(e)[:100]}"
            else:
                self.persian_ner = None
                self.model_status['persian'] = f"❌ Persian model not found: {persian_model_path}"

            # English model
            english_model_path = os.path.join(self.models_base_path, "bert-base-NER")
            self.download_model_if_missing("bert-base-NER", "dslim/bert-base-NER")
            if os.path.exists(english_model_path) and os.listdir(english_model_path):
                try:
                    self.english_ner = self._load_pipeline("ner", english_model_path)
                    if self.english_ner:
                        self.model_status['english'] = f"✅ Local English NER: {english_model_path}"
                    else:
                        self.model_status['english'] = f"❌ Failed to load English model: {english_model_path}"
                except Exception as e:
                    self.english_ner = None
                    self.model_status['english'] = f"❌ English model loading error: {str(e)[:100]}"
            else:
                self.english_ner = None
                self.model_status['english'] = f"❌ English model not found: {english_model_path}"

            # models_loaded is True when at least one pipeline came up
            loaded_models = sum(1 for status in self.model_status.values() if status.startswith("✅"))
            self.models_loaded = loaded_models > 0
            if loaded_models == 0:
                self.model_status['fallback'] = "⚠️ Using regex-only mode (no local models found)"

        except Exception as e:
            self.models_loaded = False
            self.model_status['critical'] = f"❌ Critical error: {str(e)[:100]}..."

    def detect_language(self, text):
        """Detect the dominant language of *text*.

        Returns 'fa', 'en', or 'mixed' based on the ratio of Arabic-script
        to Latin letters; defaults to 'fa' for empty/letterless input.
        """
        if not text:
            return 'fa'

        persian_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        total = persian_chars + english_chars

        if total == 0:
            return 'fa'

        if persian_chars / total > 0.6:
            return 'fa'
        elif english_chars / total > 0.6:
            return 'en'
        else:
            return 'mixed'

    def extract_entities_with_ner(self, text, lang='fa'):
        """Extract entities with the local NER models.

        Returns a list of dicts with keys text/label/start/end/confidence/
        source; empty when no local models are loaded.
        """
        entities = []

        if not self.models_loaded:
            logger.info("ℹ️ Local NER models not available - using regex only")
            return entities

        try:
            # Local Persian model
            if lang in ['fa', 'mixed'] and hasattr(self, 'persian_ner') and self.persian_ner:
                try:
                    persian_results = self.persian_ner(text)
                    for entity in persian_results:
                        # Output format differs across transformers versions
                        if isinstance(entity, dict):
                            if 'entity_group' in entity:
                                # newer versions with aggregation_strategy
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity_group'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_persian_ner'
                                })
                            else:
                                # older versions: per-token 'entity' key
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_persian_ner'
                                })
                    logger.info(f"Local Persian NER found {len(persian_results)} entities")
                except Exception as e:
                    logger.error(f"Local Persian NER extraction error: {e}")

            # Local English model
            if lang in ['en', 'mixed'] and hasattr(self, 'english_ner') and self.english_ner:
                try:
                    english_results = self.english_ner(text)
                    for entity in english_results:
                        # Output format differs across transformers versions
                        if isinstance(entity, dict):
                            if 'entity_group' in entity:
                                # newer versions with aggregation_strategy
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity_group'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_english_ner'
                                })
                            else:
                                # older versions: per-token 'entity' key
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_english_ner'
                                })
                    logger.info(f"Local English NER found {len(english_results)} entities")
                except Exception as e:
                    logger.error(f"Local English NER extraction error: {e}")

        except Exception as e:
            logger.error(f"Local NER extraction general error: {e}")

        # De-duplicate by (lowercased text, span)
        unique_entities = []
        seen = set()
        for entity in entities:
            key = (entity['text'].lower(), entity['start'], entity['end'])
            if key not in seen:
                seen.add(key)
                unique_entities.append(entity)

        logger.info(f"Total unique entities found by local models: {len(unique_entities)}")
        return unique_entities

    def map_ner_to_categories(self, ner_label, source=''):
        """Map raw NER labels (with or without B-/I- prefixes) onto the
        placeholder categories used by this system; unknown labels fall
        back to BUSINESS_TERMS."""
        mapping = {
            'PER': 'PERSON', 'PERSON': 'PERSON',
            'ORG': 'COMPANY', 'ORGANIZATION': 'COMPANY',
            'LOC': 'LOCATION', 'LOCATION': 'LOCATION',
            'MISC': 'BUSINESS_TERMS', 'MISCELLANEOUS': 'BUSINESS_TERMS',
            'B-PER': 'PERSON', 'I-PER': 'PERSON',
            'B-ORG': 'COMPANY', 'I-ORG': 'COMPANY',
            'B-LOC': 'LOCATION', 'I-LOC': 'LOCATION',
            'B-MISC': 'BUSINESS_TERMS', 'I-MISC': 'BUSINESS_TERMS',
            'MONEY': 'AMOUNT', 'PERCENT': 'PERCENTAGE',
            'DATE': 'DATE', 'TIME': 'DATE'
        }
        return mapping.get(ner_label.upper(), 'BUSINESS_TERMS')

    def anonymize_text(self, original_text, lang='fa'):
        """Step 1: anonymize the input text.

        Resets the mapping table, detects entities via local NER plus a
        prioritized set of regex patterns, and replaces each detected span
        with a numbered placeholder code. Returns the anonymized text, or
        an error message string starting with '❌'.
        """
        try:
            if not original_text or not original_text.strip():
                return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"

            # Reset state: each call builds a fresh mapping table
            self.mapping_table = {}
            self.counters = {key: 0 for key in self.counters.keys()}

            anonymized = original_text
            found_entities = set()

            # Detect language to choose which NER models to run
            detected_lang = self.detect_language(original_text)
            logger.info(f"Detected language: {detected_lang}")

            # Stage 1: extraction with local NER (confidence > 0.5 only)
            if self.models_loaded:
                logger.info("🤖 Running local NER extraction...")
                ner_entities = self.extract_entities_with_ner(original_text, detected_lang)

                for entity in ner_entities:
                    if (entity['text'] not in found_entities and
                        len(entity['text'].strip()) > 1 and
                        entity['confidence'] > 0.5):

                        category = self.map_ner_to_categories(entity['label'], entity['source'])

                        if entity['text'] not in self.mapping_table:
                            self.counters[category] += 1
                            code = f"{category}_{self.counters[category]:03d}_LOCAL_NER"
                            self.mapping_table[entity['text']] = code
                            found_entities.add(entity['text'])
                            logger.info(f"Local NER: {entity['text']} -> {code}")
            else:
                logger.info("ℹ️ Using regex-only mode")

            # Stage 2: regex patterns (includes the newer pattern set).
            # Persian character classes below intentionally include the
            # zero-width non-joiner; patterns must stay byte-exact.
            patterns = {
                'STOCK_SYMBOL': [
                    r'نماد\s+([آ-ی‌a-zA-Z0-9]+)',
                    r'(سبهان|غدیر|شتران|شپنا|پترول|فارس|خارک|پلاسکو|جم|کرمان|مارون|اراک|رازی|شازند|کاوه|بندر|پارس|خوزستان|ماهشهر|عسلویه)(?=\s|$|،|\.|\s+—)',
                    r'شرکت\s+([آ-ی‌a-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+را|\s+به)',
                    r'پتروشیمی\s+([آ-ی‌a-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+توان)',
                    r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)'
                ],
                'COMPANY': [
                    r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به|\s+طی)',
                    r'([آ-ی‌a-zA-Z\s]+)\s+شرکت',
                    r'این\s+شرکت(?=\s|$|،|\.)',
                    r'(بانک\s+[آ-ی‌a-zA-Z\s]+)',
                    r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
                ],
                'PERSON': [
                    r'آقای\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
                    r'خانم\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
                    r'مهندس\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
                    r'دکتر\s+([آ-ی‌a-zA-Z]+(?:\s+[آ-ی‌a-zA-Z]+)*)',
                    r'([آ-ی‌a-zA-Z]+\s+[آ-ی‌a-zA-Z]+)(?=،\s+مدیرعامل|\s+مدیرعامل|\s+رئیس)',
                    r'مدیرعامل(?=\s|$|،|\.)',
                    r'سرپرست(?=\s+و|\s|$|،|\.)',
                    r'رئیس\s+هیأت‌مدیره',
                    r'وی(?=\s+ادامه|\s+اظهار|\s+گفت|\s+اعلام|\s+همچنین)'
                ],
                'AMOUNT': [
                    r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
                    r'\d+\s*تومان(?=\s+به\s+ازای|\s+فروش|\s+،)',
                    r'رقم\s+فعلی\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
                    r'رقم\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
                    r'به\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'از\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'برابر\s+با\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'\d+(?:میلیارد|میلیون)\s*تومان(?=\s+رسیده|\s+ثبت|\s+بوده|\s+،)',
                    r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
                    r'\d+(?:,\d{3})*\s*ریال',
                    r'€\d+(?:,\d{3})*(?:\.\d+)?'
                ],
                'PERCENTAGE': [
                    r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایین‌تر)?',
                    r'\d+(?:\.\d+)?\s*%',
                    r'معادل\s+\d+(?:\.\d+)?\s*درصد',
                    r'حدود\s+\d+(?:\.\d+)?\s*درصد',
                    r'با\s+\d+(?:\.\d+)?\s*درصد\s+افزایش',
                    r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
                    r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده)',
                    r'میزان\s+رشد(?=\s+نسبت|\s+معادل)',
                    r'افزایش\s+قابل‌توجهی',
                    r'بهبود\s+نسبی'
                ],
                'PHONE': [
                    r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
                    r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
                    r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
                    r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}',
                    r'[۰-۹0-9]{11}(?!\d)',
                    r'(?:\+98|0098)?[۰-۹0-9]{10}',
                    r'[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}'
                ],
                'EMAIL': [
                    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'نشانی[\s]*الکترونیک[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'آدرس[\s]*ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
                ],
                'ACCOUNT': [
                    r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'شماره[\s]*حساب[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}',
                    r'[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{6,12}',
                    r'[۰-۹0-9]{2,4}[-\s]?[۰-۹0-9]{6,12}[-\s]?[۰-۹0-9]{2,4}',
                    r'واریز[\s]*(?:سود[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'سود[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}'
                ],
                'ID_NUMBER': [
                    r'IR[۰-۹0-9]{24}',
                    r'شبا[\s:]*IR[۰-۹0-9]{24}',
                    r'IBAN[\s:]*IR[۰-۹0-9]{24}',
                    r'شماره[\s]*شبا[\s:]*IR[۰-۹0-9]{24}',
                    r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
                    r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
                    r'National[\s]*(?:ID[\s:]*)?[0-9]{10}',
                    r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}',
                    r'(?:Passport[\s:]*)?[A-Z][0-9]{8}',
                    r'(?:کارت[\s:]*)?(?:[۰-۹0-9]{4}[-\s]?){3}[۰-۹0-9]{4}',
                    r'(?:Card[\s:]*)?(?:[0-9]{4}[-\s]?){3}[0-9]{4}'
                ],
                'DATE': [
                    r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
                    r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
                    r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
                    r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})',
                    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}'
                ]
            }

            # Process patterns in priority order — specific before general
            logger.info("🔍 Running prioritized regex extraction...")

            # Spans already consumed, to prevent overlapping double-matches
            processed_entities = set()

            for category, pattern_list in patterns.items():
                for pattern in pattern_list:
                    matches = re.finditer(pattern, original_text, re.IGNORECASE | re.MULTILINE)
                    for match in matches:
                        if match.groups():
                            item = match.group(1).strip()
                            full_match = match.group(0).strip()
                        else:
                            item = match.group(0).strip()
                            full_match = item

                        # Reject matches overlapping a previously accepted span
                        overlaps = False
                        match_start, match_end = match.span()

                        for proc_start, proc_end in processed_entities:
                            # positional overlap check
                            if not (match_end <= proc_start or match_start >= proc_end):
                                overlaps = True
                                break

                        if (not overlaps and
                            full_match not in found_entities and
                            full_match not in self.mapping_table and
                            len(full_match) >= 2):

                            self.counters[category] += 1
                            code = f"{category}_{self.counters[category]:03d}_REGEX"
                            self.mapping_table[full_match] = code
                            found_entities.add(full_match)
                            processed_entities.add((match_start, match_end))
                            logger.info(f"Regex ({category}): {full_match} -> {code}")

            # Replace in the text, longest strings first, so shorter entities
            # cannot corrupt already-substituted longer ones
            sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
            for original_item, code in sorted_items:
                anonymized = anonymized.replace(original_item, code)

            logger.info(f"✅ Anonymization completed. Found {len(self.mapping_table)} entities.")
            return anonymized

        except Exception as e:
            return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در ناشناس‌سازی: {str(e)}"

    def send_to_chatgpt(self, anonymized_text, lang='fa'):
        """Step 2: send the anonymized text to the OpenAI chat completions
        API (gpt-4o-mini) and return the assistant's reply, or an error
        message string starting with '❌'."""
        try:
            if not anonymized_text or not anonymized_text.strip():
                return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناس‌شده خالی است!"

            if not self.api_key:
                return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable." if lang == 'en' else "❌ کلید API تنظیم نشده است! لطفاً OPENAI_API_KEY را در متغیرهای محیطی تنظیم کنید."

            system_msg = "You are a professional financial analyst. The text contains anonymous codes. Answer questions accurately." if lang == 'en' else "شما یک تحلیلگر مالی حرفه‌ای هستید. متن حاوی کدهای ناشناس است. به سوالات با دقت پاسخ دهید."

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            data = {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": anonymized_text}
                ],
                "max_tokens": 2000,
                "temperature": 0.7
            }

            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=data,
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                return result['choices'][0]['message']['content']
            else:
                error_data = response.json() if response.content else {}
                error_message = error_data.get('error', {}).get('message', response.text)

                if 'Incorrect API key' in error_message:
                    return "❌ Invalid API key." if lang == 'en' else "❌ کلید API نامعتبر است."
                elif 'quota' in error_message:
                    return "❌ API quota exceeded." if lang == 'en' else "❌ سهمیه API تمام شده است."
                else:
                    return f"❌ API Error: {error_message}"

        except Exception as e:
            return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"

    def deanonymize_response(self, gpt_response, lang='fa'):
        """Step 3: restore the original values in the ChatGPT response by
        reversing the mapping table (longest codes first). Also handles
        codes whose underscores were markdown-escaped by the model."""
        try:
            if not gpt_response or not gpt_response.strip():
                return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!"

            if not self.mapping_table:
                return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"

            final_result = gpt_response
            reverse_mapping = {code: original for original, code in self.mapping_table.items()}

            sorted_codes = sorted(reverse_mapping.items(), key=lambda x: len(x[0]), reverse=True)
            for code, original in sorted_codes:
                final_result = final_result.replace(code, original)
                # The model may escape underscores as '\_' in markdown output
                escaped_code = code.replace('_', '\\_')
                final_result = final_result.replace(escaped_code, original)

            return final_result

        except Exception as e:
            return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}"

    def get_model_status(self):
        """Return a markdown report describing local-model availability and
        the detection features of this build."""
        status = "🤖 **Local Model Status (Enhanced with Priority-Based Sensitive Data Detection):**\n\n"

        if hasattr(self, 'model_status') and self.model_status:
            for model_type, model_status in self.model_status.items():
                if model_type == 'persian':
                    status += f"• **Persian NER**: {model_status}\n"
                elif model_type == 'english':
                    status += f"• **English NER**: {model_status}\n"
                elif model_type == 'financial':
                    status += f"• **Financial NER**: {model_status}\n"
                elif model_type == 'transformers':
                    status += f"• **Transformers**: {model_status}\n"
                elif model_type == 'fallback':
                    status += f"• **Fallback Mode**: {model_status}\n"
                elif model_type == 'critical':
                    status += f"• **Critical**: {model_status}\n"
                elif model_type == 'directory':
                    status += f"• **Directory**: {model_status}\n"

        loaded_count = sum(1 for status in getattr(self, 'model_status', {}).values()
                           if status.startswith("✅"))
        status += f"\n📊 **Summary**: {loaded_count}/2 local models loaded"

        status += f"\n📁 **Models Path**: {self.models_base_path}"
        status += f"\n🔧 **Latest Features**: Priority-based detection with overlap prevention"

        status += f"\n\n🔍 **Enhanced Sensitive Data Detection (Priority Order):**"
        status += f"\n 1️⃣ **ID Numbers**: IBAN/SHEBA codes, National IDs, Passport numbers"
        status += f"\n 2️⃣ **Contact Info**: Email addresses with context keywords"
        status += f"\n 3️⃣ **Phone Numbers**: Mobile & landline with country codes"
        status += f"\n 4️⃣ **Bank Accounts**: Account numbers with Persian keywords"
        status += f"\n 5️⃣ **Financial Data**: Amounts, percentages, stock symbols"
        status += f"\n 6️⃣ **Corporate Data**: Company names, person names, dates"

        status += f"\n\n✨ **Key Improvements:**"
        status += f"\n 🎯 Overlap detection prevents double-matching"
        status += f"\n 🇮🇷 Full Persian digit support (۰-۹)"
        status += f"\n 🔄 Context-aware pattern matching"
        status += f"\n 📏 Length-based replacement order"

        return status
637
-
638
def process_all_steps(input_text, language):
    """Run every pipeline step automatically: anonymize, query ChatGPT,
    then restore the original values.

    Returns a 4-tuple of strings for the UI: (status message, anonymized
    text, raw ChatGPT response, restored final response). On an error,
    later tuple slots are empty strings. Uses the module-level
    ``anonymizer`` instance.
    """
    lang = 'en' if language == 'English' else 'fa'

    if not input_text.strip():
        error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
        return error_msg, "", "", ""

    try:
        start_time = time.time()

        anonymized_text = anonymizer.anonymize_text(input_text, lang)
        if anonymized_text.startswith("❌"):
            return anonymized_text, "", "", ""

        gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
        if gpt_response.startswith("❌"):
            # ChatGPT failed but anonymization succeeded: report the
            # anonymization statistics and surface the API error.
            entities_found = len(anonymizer.mapping_table)
            local_ner_count = sum(1 for code in anonymizer.mapping_table.values() if '_LOCAL_NER' in code)
            regex_count = sum(1 for code in anonymizer.mapping_table.values() if '_REGEX' in code)

            # Sensitive-data statistics
            sensitive_categories = ['ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT']
            sensitive_count = sum(1 for code in anonymizer.mapping_table.values()
                                  if any(cat in code for cat in sensitive_categories))

            method = "Priority-Based Local NER + Regex" if anonymizer.models_loaded else "Priority-Based Regex Only"
            success_msg = (f"✅ Anonymization completed with {method}!\n"
                           f"🔍 Sensitive data: {sensitive_count} | 🤖 NER: {local_ner_count} | 🔎 Regex: {regex_count}\n"
                           f"📊 Total: {entities_found} entities protected")
            return success_msg, anonymized_text, gpt_response, ""

        final_result = anonymizer.deanonymize_response(gpt_response, lang)

        total_time = time.time() - start_time
        entities_found = len(anonymizer.mapping_table)
        local_ner_count = sum(1 for code in anonymizer.mapping_table.values() if '_LOCAL_NER' in code)
        regex_count = sum(1 for code in anonymizer.mapping_table.values() if '_REGEX' in code)

        # Detailed sensitive-data statistics
        id_count = sum(1 for code in anonymizer.mapping_table.values() if 'ID_NUMBER' in code)
        email_count = sum(1 for code in anonymizer.mapping_table.values() if 'EMAIL' in code)
        phone_count = sum(1 for code in anonymizer.mapping_table.values() if 'PHONE' in code)
        account_count = sum(1 for code in anonymizer.mapping_table.values() if 'ACCOUNT' in code)

        sensitive_details = []
        if id_count > 0: sensitive_details.append(f"🆔 IDs: {id_count}")
        if email_count > 0: sensitive_details.append(f"📧 Emails: {email_count}")
        if phone_count > 0: sensitive_details.append(f"📞 Phones: {phone_count}")
        if account_count > 0: sensitive_details.append(f"🏦 Accounts: {account_count}")

        method = "Priority-Based Local NER + Regex" if anonymizer.models_loaded else "Priority-Based Regex Only"
        success_msg = (f"🎉 Complete anonymization & restoration successful!\n"
                       f"🔧 Method: {method}\n"
                       f"🔍 Sensitive data: {' | '.join(sensitive_details) if sensitive_details else '0'}\n"
                       f"📊 Total: {entities_found} entities | ⏱️ Time: {total_time:.2f}s")

        return success_msg, anonymized_text, gpt_response, final_result

    except Exception as e:
        error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
        return error_msg, "", "", ""
700
-
701
def get_mapping_table(language):
    """Render the current mapping table as a markdown report, grouped by
    sensitive-data category, NER-detected items, and business/financial
    items, followed by summary statistics. Reads the module-level
    ``anonymizer`` instance."""
    lang = 'en' if language == 'English' else 'fa'

    if not anonymizer.mapping_table:
        return "❌ Mapping table is empty! Please process some text first." if lang == 'en' else "❌ جدول نگاشت خالی است! ابتدا متنی را پردازش کنید."

    result = "📋 **Priority-Based Sensitive Data Mapping Table:**\n\n" if lang == 'en' else "📋 **جدول نگاشت اطلاعات حساس با اولویت‌بندی:**\n\n"

    local_ner_items = {k: v for k, v in anonymizer.mapping_table.items() if '_LOCAL_NER' in v}
    regex_items = {k: v for k, v in anonymizer.mapping_table.items() if '_REGEX' in v}

    # Group by sensitive-data category (highest priority first)
    priority_categories = {
        'ID_NUMBER': '🆔 **Identity & Financial Codes**',
        'EMAIL': '📧 **Email Addresses**',
        'PHONE': '📞 **Phone Numbers**',
        'ACCOUNT': '🏦 **Bank Account Numbers**'
    }

    sensitive_found = False
    for category, title in priority_categories.items():
        category_items = {k: v for k, v in anonymizer.mapping_table.items() if category in v}
        if category_items:
            sensitive_found = True
            result += f"{title}:\n"
            # Show at most 8 entries per group to keep the report short
            for original, code in list(category_items.items())[:8]:
                result += f" • `{original}` → `{code}`\n"
            if len(category_items) > 8:
                result += f" ... و {len(category_items) - 8} مورد دیگر\n"
            result += "\n"

    if local_ner_items:
        result += "🤖 **Local NER Detected**:\n"
        for original, code in list(local_ner_items.items())[:8]:
            result += f" • `{original}` → `{code}`\n"
        if len(local_ner_items) > 8:
            result += f" ... و {len(local_ner_items) - 8} مورد دیگر\n"
        result += "\n"

    # Remaining items (financial, corporate, etc.)
    other_categories = ['AMOUNT', 'PERCENTAGE', 'COMPANY', 'PERSON', 'STOCK_SYMBOL', 'DATE']
    other_items = {k: v for k, v in regex_items.items()
                   if any(cat in v for cat in other_categories)}

    if other_items:
        result += "💼 **Business & Financial Data**:\n"
        for original, code in list(other_items.items())[:8]:
            result += f" • `{original}` → `{code}`\n"
        if len(other_items) > 8:
            result += f" ... و {len(other_items) - 8} مورد دیگر\n"

    # Overall statistics
    sensitive_count = sum(len({k: v for k, v in anonymizer.mapping_table.items() if cat in v})
                          for cat in priority_categories.keys())

    result += f"\n📊 **Statistics**:\n"
    result += f"🔍 **Sensitive Data**: {sensitive_count} items\n"
    result += f"🤖 **NER Detected**: {len(local_ner_items)} items\n"
    result += f"💼 **Business Data**: {len(other_items)} items\n"
    result += f"📋 **Total**: {len(anonymizer.mapping_table)} entities\n"

    result += f"\n✨ **Enhancement Applied**: Priority-based detection with overlap prevention\n"
    result += f"🎯 **Success**: All major sensitive data types detected and anonymized!"

    return result
767
-
768
def clear_all():
    """Reset the shared anonymizer state and blank out all UI fields.

    Empties the global anonymizer's mapping table, zeroes every entity
    counter, and returns five empty strings — one for each output widget
    (input text, anonymized text, raw GPT response, final response, status).
    """
    anonymizer.mapping_table = {}
    # Fresh dict with the same counter keys, all reset to zero.
    anonymizer.counters = dict.fromkeys(anonymizer.counters, 0)
    return "", "", "", "", ""
773
-
774
def update_ui_text(language):
    """Return the localized UI strings for the requested language.

    Args:
        language: 'English' selects the English string set; any other
            value (e.g. 'فارسی') selects the Persian set.

    Returns:
        dict: keys 'title', 'step1'..'step4', 'input_placeholder',
        'process_btn', 'clear_btn', 'mapping_btn', 'copy_btn', and
        'direction' ('ltr' for English, 'rtl' for Persian).
    """
    english_strings = {
        'title': 'Priority-Based Bilingual Data Anonymization System',
        'step1': 'Input Text & Settings',
        'step2': 'Anonymized Text',
        'step3': 'Raw ChatGPT Response',
        'step4': 'Final Restored Response',
        'input_placeholder': 'Enter your original text here...\nExample: Company reports, person names, financial amounts, phone numbers, emails, IBAN codes, bank accounts, etc.',
        'process_btn': 'Process with Smart Priority Detection',
        'clear_btn': 'Clear All',
        'mapping_btn': 'Show Priority-Based Mapping Table',
        'copy_btn': 'Copy',
        'direction': 'ltr',
    }
    persian_strings = {
        'title': 'سیستم ناشناس‌سازی هوشمند با اولویت‌بندی',
        'step1': 'متن ورودی و تنظیمات',
        'step2': 'متن ناشناس‌شده',
        'step3': 'پاسخ خام ChatGPT',
        'step4': 'پاسخ نهایی بازگردانده شده',
        'input_placeholder': 'متن اصلی خود را اینجا وارد کنید...\nمثال: گزارش‌های شرکت، نام اشخاص، مبالغ مالی، شماره تلفن، ایمیل، شماره شبا، حساب بانکی و غیره',
        'process_btn': 'پردازش با تشخیص هوشمند اولویت‌دار',
        'clear_btn': 'پاک کردن همه',
        'mapping_btn': 'نمایش جدول نگاشت اولویت‌دار',
        'copy_btn': 'کپی',
        'direction': 'rtl',
    }
    return english_strings if language == 'English' else persian_strings
804
-
805
def update_interface(language):
    """Re-render every localizable widget for the selected language.

    Produces the 15 `gr.update(...)` objects consumed by the
    `language_selector.change` handler, in the exact order of its
    `outputs` list: title, step-1 header, input box, process button,
    clear button, status box, step-2 header, anonymized box, step-3
    header, GPT box, step-4 header, final box, mapping button, mapping
    box, and the workflow row's CSS classes.
    """
    texts = update_ui_text(language)
    is_english = (language == 'English')
    rtl = not is_english
    direction = texts['direction']
    # Flip the grid's reading direction to match the language.
    workflow_css = f"workflow {'ltr' if is_english else 'rtl'}"

    def heading(icon, key):
        # Localized <h2> header with the proper text direction.
        return gr.update(value=f"<h2 style='direction: {direction};'>{icon} {texts[key]}</h2>")

    title_html = f"<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 {texts['title']}</h1>"

    return [
        gr.update(value=title_html),
        heading('📝', 'step1'),
        gr.update(placeholder=texts['input_placeholder'], rtl=rtl),
        gr.update(value=f"🚀 {texts['process_btn']}"),
        gr.update(value=f"🗑️ {texts['clear_btn']}"),
        gr.update(rtl=rtl),
        heading('🎭', 'step2'),
        gr.update(rtl=rtl),
        heading('🤖', 'step3'),
        gr.update(rtl=rtl),
        heading('✅', 'step4'),
        gr.update(rtl=rtl),
        gr.update(value=f"📋 {texts['mapping_btn']}"),
        gr.update(rtl=rtl),
        gr.update(elem_classes=workflow_css),
    ]
830
-
831
# Module-level anonymizer instance shared by all UI callbacks
anonymizer = BilingualDataAnonymizer()
833
-
834
# CSS tuned for proper vertical alignment of the four-column workflow grid
# (RTL/LTR helpers, fixed textbox heights, status-box pulse animation,
# and responsive breakpoints at 1200px / 768px).
custom_css = """
body, .gradio-container {
    font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    min-height: 100vh !important;
    padding: 20px !important;
}

.rtl {
    direction: rtl !important;
    text-align: right !important;
}

.ltr {
    direction: ltr !important;
    text-align: left !important;
}

.workflow {
    display: grid !important;
    grid-template-columns: 1fr 1fr 1fr 1fr !important;
    gap: 25px !important;
    padding: 30px !important;
    align-items: start !important;
    align-content: start !important;
    grid-auto-rows: auto !important;
}

.workflow > * {
    align-self: start !important;
    vertical-align: top !important;
    margin-top: 0 !important;
}

.workflow .gradio-column,
.workflow-column {
    display: flex !important;
    flex-direction: column !important;
    align-items: stretch !important;
    justify-content: flex-start !important;
    height: auto !important;
    min-height: 0 !important;
    margin-top: 0 !important;
    padding-top: 0 !important;
}

.gradio-textbox {
    border-radius: 10px !important;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
    flex-grow: 1 !important;
    min-height: 380px !important;
    max-height: 380px !important;
    height: 380px !important;
}

.gradio-textbox textarea {
    min-height: 350px !important;
    max-height: 350px !important;
    height: 350px !important;
    resize: vertical !important;
}

.workflow.rtl {
    direction: rtl !important;
}

.workflow.ltr {
    direction: ltr !important;
}

h1, h2, h3 {
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
    margin-top: 0 !important;
    margin-bottom: 10px !important;
    padding-top: 0 !important;
    line-height: 1.2 !important;
}

h2 {
    min-height: 40px !important;
    max-height: 40px !important;
    display: flex !important;
    align-items: center !important;
    margin-bottom: 15px !important;
}

.status-box {
    background: linear-gradient(135deg, #4CAF50, #45a049) !important;
    border: 3px solid #2E7D32 !important;
    border-radius: 15px !important;
    padding: 15px !important;
    margin: 10px 0 !important;
    box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3) !important;
    animation: pulse 2s infinite !important;
    min-height: 120px !important;
    max-height: 120px !important;
}

.status-box textarea {
    background: rgba(255, 255, 255, 0.95) !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
    color: #1B5E20 !important;
    text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.8) !important;
    min-height: 80px !important;
    max-height: 80px !important;
}

@keyframes pulse {
    0% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
    50% { box-shadow: 0 8px 40px rgba(76, 175, 80, 0.6); }
    100% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
}

.gradio-button {
    border-radius: 25px !important;
    font-weight: bold !important;
    transition: all 0.3s ease !important;
    margin: 5px 0 !important;
    min-height: 50px !important;
    max-height: 50px !important;
}

.gradio-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
}

h1 {
    background: linear-gradient(45deg, #FFD700, #FFA500) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    background-clip: text !important;
    min-height: 80px !important;
}

@media (max-width: 1200px) {
    .workflow {
        grid-template-columns: 1fr 1fr !important;
        gap: 20px !important;
    }
}

@media (max-width: 768px) {
    .workflow {
        grid-template-columns: 1fr !important;
        gap: 15px !important;
    }

    .gradio-textbox {
        min-height: 300px !important;
        max-height: 300px !important;
        height: 300px !important;
    }
}

[data-testid="textbox"]:dir(rtl) {
    text-align: right !important;
    direction: rtl !important;
}

[data-testid="textbox"]:dir(ltr) {
    text-align: left !important;
    direction: ltr !important;
}

.gradio-container .gradio-column {
    align-self: start !important;
    vertical-align: top !important;
}

.gradio-container .gradio-row {
    align-items: flex-start !important;
}

* {
    box-sizing: border-box !important;
}

.gradio-container {
    align-items: start !important;
    justify-content: start !important;
}
"""
1021
-
1022
# Gradio UI: a four-column workflow (input -> anonymized -> GPT response -> restored),
# Persian/RTL by default; the language selector re-renders all labels via update_interface.
with gr.Blocks(title="📊 Priority-Based Anonymization System", theme=gr.themes.Soft(), css=custom_css) as app:

    with gr.Row():
        language_selector = gr.Radio(
            choices=["فارسی", "English"],
            value="فارسی",
            label="Language / زبان",
            interactive=True
        )

    with gr.Column():
        # Default (Persian) page title; replaced on language change.
        title = gr.HTML("<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 سیستم ناشناس‌سازی هوشمند با اولویت‌بندی</h1>")

    with gr.Row(elem_classes="workflow rtl") as workflow_row:
        # Column 1: raw input, action buttons, and status display.
        with gr.Column(elem_classes="workflow-column"):
            step1_title = gr.HTML('<h2 style="direction: rtl;">📝 متن ورودی و تنظیمات</h2>')

            input_text = gr.Textbox(
                lines=15,
                # FIX: repaired mis-encoded characters in the original placeholder
                # ("حسا\uFFFD\uFFFD بانکی" -> "حساب بانکی").
                placeholder="متن اصلی خود را اینجا وارد کنید...\n✨ سیستم هوشمند اطلاعات حساس مثل شماره تلفن، ایمیل، شماره شبا، حساب بانکی را به ترتیب اولویت تشخیص می‌دهد",
                label="",
                rtl=True
            )

            process_btn = gr.Button("🚀 پردازش با تشخیص هوشمند اولویت‌دار", variant="primary")
            clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")

            status = gr.Textbox(
                label="وضعیت",
                lines=4,
                interactive=False,
                rtl=True,
                elem_classes=["status-box"]
            )

        # Column 2: anonymized text (read-only).
        with gr.Column(elem_classes="workflow-column"):
            step2_title = gr.HTML('<h2 style="direction: rtl;">🎭 متن ناشناس‌شده</h2>')

            anonymized_output = gr.Textbox(
                lines=15,
                placeholder="متن ناشناس‌شده اینجا نمایش داده می‌شود...",
                label="",
                interactive=False,
                rtl=True
            )

        # Column 3: raw ChatGPT response (read-only).
        with gr.Column(elem_classes="workflow-column"):
            step3_title = gr.HTML('<h2 style="direction: rtl;">🤖 پاسخ خام ChatGPT</h2>')

            gpt_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ خام ChatGPT اینجا نمایش داده می‌شود...",
                label="",
                interactive=False,
                rtl=True
            )

        # Column 4: final response with original entities restored (read-only).
        with gr.Column(elem_classes="workflow-column"):
            step4_title = gr.HTML('<h2 style="direction: rtl;">✅ پاسخ نهایی بازگردانده شده</h2>')

            final_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ نهایی اینجا نمایش داده می‌شود...",
                label="",
                interactive=False,
                rtl=True
            )

    with gr.Row():
        with gr.Column():
            mapping_title = gr.HTML('<h2>🗂️ جدول نگاشت اولویت‌دار</h2>')
            mapping_btn = gr.Button("📋 نمایش جدول نگاشت اولویت‌دار")

            # Hidden until the mapping button is clicked (see second handler below).
            mapping_output = gr.Textbox(
                lines=10,
                label="جدول نگاشت اطلاعات",
                interactive=False,
                visible=False,
                rtl=True
            )

    # Event handlers -----------------------------------------------------
    # Output order must match the 15-element list returned by update_interface.
    language_selector.change(
        fn=update_interface,
        inputs=[language_selector],
        outputs=[title, step1_title, input_text, process_btn, clear_btn,
                 status, step2_title, anonymized_output, step3_title, gpt_output,
                 step4_title, final_output, mapping_btn, mapping_output, workflow_row]
    )

    process_btn.click(
        fn=process_all_steps,
        inputs=[input_text, language_selector],
        outputs=[status, anonymized_output, gpt_output, final_output]
    )

    clear_btn.click(
        fn=clear_all,
        outputs=[input_text, anonymized_output, gpt_output, final_output, status]
    )

    # Two handlers on the same click: the first fills the mapping table,
    # the second reveals the (initially hidden) textbox.
    mapping_btn.click(
        fn=get_mapping_table,
        inputs=[language_selector],
        outputs=[mapping_output]
    )

    mapping_btn.click(
        fn=lambda: gr.update(visible=True),
        outputs=[mapping_output]
    )
1134
-
1135
if __name__ == "__main__":
    # share=True additionally requests a public Gradio tunnel link
    app.launch(share=True)