leilaghomashchi committed on
Commit
26c1d2c
·
verified ·
1 Parent(s): af05cc3

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -972
app.py DELETED
@@ -1,972 +0,0 @@
1
- import gradio as gr
2
- import re
3
- import os
4
- import requests
5
- import time
6
- import logging
7
- from pathlib import Path
8
-
9
- # تنظیم logging
10
- logging.basicConfig(level=logging.INFO)
11
- logger = logging.getLogger(__name__)
12
-
13
def auto_setup_models_for_hf():
    """Automatically download the required NER models (no user input needed).

    Intended for HuggingFace Spaces startup: for each required model, skip it
    when a local copy already exists under ``./models/<name>``, otherwise
    download the tokenizer + model from the Hub and save them there.

    Returns:
        bool: True when the setup loop completed (individual downloads may
        still have failed and been cleaned up); False when the transformers
        library is unavailable or an unexpected error occurred.
    """
    # Hoisted out of the error path so cleanup never depends on a late import.
    import shutil

    models_dir = Path("./models")
    # parents=True makes this robust if run from a fresh working directory.
    models_dir.mkdir(parents=True, exist_ok=True)

    required_models = {
        'bert-fa-ner': 'HooshvareLab/bert-fa-zwnj-base-ner',
        'bert-base-NER': 'dslim/bert-base-NER',
    }

    try:
        # Verify transformers is installed before attempting any download.
        from transformers import AutoTokenizer, AutoModelForTokenClassification
        logger.info("✅ Transformers library available")

        for model_name, hf_repo in required_models.items():
            model_path = models_dir / model_name

            # Skip models that are already present on disk (config files exist).
            if model_path.exists() and list(model_path.glob("*.json")):
                logger.info(f"✅ {model_name} already exists")
                continue

            try:
                logger.info(f"📥 Auto-downloading {model_name} from {hf_repo}...")
                model_path.mkdir(exist_ok=True)

                tokenizer = AutoTokenizer.from_pretrained(hf_repo)
                model = AutoModelForTokenClassification.from_pretrained(hf_repo)

                tokenizer.save_pretrained(model_path)
                model.save_pretrained(model_path)

                logger.info(f"✅ {model_name} downloaded successfully")
                del tokenizer, model  # free memory between downloads

            except Exception as e:
                logger.error(f"❌ Failed to download {model_name}: {e}")
                # Remove any partially written files so the next run retries cleanly.
                if model_path.exists():
                    shutil.rmtree(model_path)

        return True

    except ImportError:
        logger.error("❌ transformers library not available")
        return False
    except Exception as e:
        logger.error(f"❌ Auto-setup failed: {e}")
        return False
-
65
class ComprehensiveBilingualDataAnonymizer:
    """Bilingual (Persian/English) data anonymizer.

    Combines optional local HuggingFace NER models with a large set of
    category-grouped regex patterns. Detected entities are replaced by
    placeholder codes recorded in ``mapping_table`` so a ChatGPT response
    can later be de-anonymized back to the original values.
    """

    def __init__(self):
        # Maps original entity text -> anonymized placeholder code.
        self.mapping_table = {}
        # Pattern categories exposed to the UI (checkbox choices).
        self.pattern_categories = {
            'personal_identity': {
                'name_fa': 'اطلاعات شخصی و هویتی',
                'name_en': 'Personal & Identity Information',
                'patterns': ['PERSON', 'MIXED_NAMES', 'ID_NUMBER', 'ENGLISH_TITLES'],
                'icon': '👤'
            },
            'financial': {
                'name_fa': 'اطلاعات مالی',
                'name_en': 'Financial Information',
                'patterns': ['AMOUNT', 'INTERNATIONAL_CURRENCIES', 'ACCOUNT', 'FINANCIAL_TERMS', 'STOCK_SYMBOL'],
                'icon': '💰'
            },
            'temporal': {
                'name_fa': 'اطلاعات زمانی',
                'name_en': 'Temporal Information',
                'patterns': ['DATE', 'ADVANCED_DATE_FORMATS', 'TIME_RANGES'],
                'icon': '📅'
            },
            'location': {
                'name_fa': 'اطلاعات مکانی',
                'name_en': 'Location Information',
                'patterns': ['LOCATION', 'COMPLEX_ADDRESSES'],
                'icon': '📍'
            },
            'technical': {
                'name_fa': 'اطلاعات فنی و تکنولوژیکی',
                'name_en': 'Technical & Technological',
                'patterns': ['TECHNICAL_CODES', 'NETWORK_ADDRESSES', 'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS'],
                'icon': '⚙️'
            },
            'business': {
                'name_fa': 'اطلاعات کسب‌وکار',
                'name_en': 'Business Information',
                'patterns': ['COMPANY', 'BUSINESS_TERMS', 'PRODUCT', 'PETROCHEMICAL'],
                'icon': '🏢'
            },
            'quantity': {
                'name_fa': 'اطلاعات کمیت و واحد',
                'name_en': 'Quantity & Unit Information',
                'patterns': ['PERCENTAGE', 'VOLUME', 'RATIOS'],
                'icon': '📊'
            },
            'communication': {
                'name_fa': 'اطلاعات ارتباطی',
                'name_en': 'Communication Information',
                'patterns': ['PHONE', 'EMAIL'],
                'icon': '📞'
            }
        }

        # Per-category counters used to number the placeholder codes.
        self.counters = {
            'PERSON': 0, 'MIXED_NAMES': 0, 'ID_NUMBER': 0, 'ENGLISH_TITLES': 0,
            'AMOUNT': 0, 'INTERNATIONAL_CURRENCIES': 0, 'ACCOUNT': 0,
            'FINANCIAL_TERMS': 0, 'STOCK_SYMBOL': 0,
            'DATE': 0, 'ADVANCED_DATE_FORMATS': 0, 'TIME_RANGES': 0,
            'LOCATION': 0, 'COMPLEX_ADDRESSES': 0,
            'TECHNICAL_CODES': 0, 'NETWORK_ADDRESSES': 0, 'TECHNICAL_UNITS': 0,
            'ACRONYMS_ABBREVIATIONS': 0,
            'COMPANY': 0, 'BUSINESS_TERMS': 0, 'PRODUCT': 0, 'PETROCHEMICAL': 0,
            'PERCENTAGE': 0, 'VOLUME': 0, 'RATIOS': 0,
            'PHONE': 0, 'EMAIL': 0
        }

        self.api_key = os.getenv("OPENAI_API_KEY", "")
        self.models_base_path = "./models"
        self.models_loaded = False
        self.model_status = {}
        self.load_local_ner_models()

    def get_category_choices(self, language='fa'):
        """Return the display labels ("<icon> <name>") for the UI checkbox group."""
        choices = []
        for cat_key, cat_info in self.pattern_categories.items():
            name = cat_info['name_fa'] if language == 'fa' else cat_info['name_en']
            icon = cat_info['icon']
            choices.append(f"{icon} {name}")
        return choices

    def get_selected_patterns(self, selected_categories, language='fa'):
        """Translate selected display labels back into a flat list of pattern keys."""
        selected_patterns = []

        for cat_key, cat_info in self.pattern_categories.items():
            name = cat_info['name_fa'] if language == 'fa' else cat_info['name_en']
            icon = cat_info['icon']
            # Must match the label format produced by get_category_choices().
            category_display = f"{icon} {name}"

            if category_display in selected_categories:
                selected_patterns.extend(cat_info['patterns'])

        return selected_patterns

    def load_local_ner_models(self):
        """Load the local NER models with error handling suited to HF Spaces.

        Sets ``self.persian_ner`` / ``self.english_ner`` (pipeline or None),
        records per-model status strings in ``self.model_status``, and sets
        ``self.models_loaded`` when at least one model loaded.
        """
        logger.info("📄 Loading local NER models (HuggingFace compatible)...")

        if not Path(self.models_base_path).exists():
            try:
                Path(self.models_base_path).mkdir(exist_ok=True)
                logger.info(f"📁 Created models directory: {self.models_base_path}")
            except Exception as e:
                logger.error(f"❌ Failed to create models directory: {e}")
                self.model_status['directory'] = f"❌ Cannot create models directory: {e}"
                self.models_loaded = False
                return

        try:
            # Verify transformers/torch are installed before touching the models.
            try:
                import torch
                from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
                self.model_status['transformers'] = "✅ Transformers library available"
                logger.info("✅ Transformers library available")
            except ImportError as e:
                self.model_status['transformers'] = f"❌ Transformers not installed: {str(e)}"
                self.models_loaded = False
                logger.error(f"❌ Transformers not available: {e}")
                return

            # Attempt to load the Persian model.
            persian_model_path = Path(self.models_base_path) / "bert-fa-ner"
            if persian_model_path.exists() and list(persian_model_path.glob("*.json")):
                try:
                    self.persian_ner = pipeline("ner",
                                                model=str(persian_model_path),
                                                tokenizer=str(persian_model_path),
                                                device=-1)  # CPU only
                    self.model_status['persian'] = f"✅ Persian NER loaded: {persian_model_path}"
                    logger.info("✅ Persian NER model loaded successfully")
                except Exception as e:
                    self.persian_ner = None
                    self.model_status['persian'] = f"❌ Persian loading error: {str(e)[:100]}"
                    logger.error(f"❌ Persian model loading error: {e}")
            else:
                self.persian_ner = None
                self.model_status['persian'] = f"❌ Persian model not found: {persian_model_path}"
                logger.warning(f"Persian model not found at {persian_model_path}")

            # Attempt to load the English model.
            english_model_path = Path(self.models_base_path) / "bert-base-NER"
            if english_model_path.exists() and list(english_model_path.glob("*.json")):
                try:
                    self.english_ner = pipeline("ner",
                                                model=str(english_model_path),
                                                tokenizer=str(english_model_path),
                                                device=-1)  # CPU only
                    self.model_status['english'] = f"✅ English NER loaded: {english_model_path}"
                    logger.info("✅ English NER model loaded successfully")
                except Exception as e:
                    self.english_ner = None
                    self.model_status['english'] = f"❌ English loading error: {str(e)[:100]}"
                    logger.error(f"❌ English model loading error: {e}")
            else:
                self.english_ner = None
                self.model_status['english'] = f"❌ English model not found: {english_model_path}"
                logger.warning(f"English model not found at {english_model_path}")

            # Final status: loaded when at least one status line starts with "✅".
            loaded_models = sum(1 for status in self.model_status.values() if status.startswith("✅"))
            self.models_loaded = loaded_models > 0

            if loaded_models == 0:
                self.model_status['fallback'] = "⚠️ Using regex-only mode (no local models found)"
                logger.info("⚠️ No NER models loaded - using regex-only mode")
            else:
                logger.info(f"✅ Loaded {loaded_models} NER models successfully")

        except Exception as e:
            self.models_loaded = False
            self.model_status['critical'] = f"❌ Critical error: {str(e)[:100]}..."
            logger.error(f"❌ Critical error in NER loading: {e}")

    def detect_language(self, text):
        """Classify *text* as 'fa', 'en', or 'mixed' by character-class counts.

        Counts Arabic-block vs ASCII-letter characters; >60% of either wins,
        otherwise 'mixed'. Empty / non-alphabetic text defaults to 'fa'.
        """
        if not text:
            return 'fa'

        persian_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        total = persian_chars + english_chars

        if total == 0:
            return 'fa'

        if persian_chars / total > 0.6:
            return 'fa'
        elif english_chars / total > 0.6:
            return 'en'
        else:
            return 'mixed'

    def extract_entities_with_ner(self, text, lang='fa'):
        """Extract entities using the local NER pipelines.

        Runs the Persian pipeline for 'fa'/'mixed' and the English pipeline
        for 'en'/'mixed'. Keeps entities longer than 1 char with score > 0.5,
        then de-duplicates on (lowercased text, start, end).

        Returns a list of dicts: text, label, start, end, confidence, source.
        """
        entities = []

        if not self.models_loaded:
            logger.info("ℹ️ Local NER models not available - using regex only")
            return entities

        try:
            # Persian model.
            if lang in ['fa', 'mixed'] and hasattr(self, 'persian_ner') and self.persian_ner:
                try:
                    persian_results = self.persian_ner(text)
                    for entity in persian_results:
                        if isinstance(entity, dict):
                            # Pipeline output shape differs with aggregation:
                            # prefer 'entity_group', fall back to 'entity'.
                            entity_text = entity.get('word', '').strip()
                            entity_label = entity.get('entity_group', entity.get('entity', ''))
                            entity_score = entity.get('score', 0)

                            if entity_text and len(entity_text) > 1 and entity_score > 0.5:
                                entities.append({
                                    'text': entity_text,
                                    'label': entity_label,
                                    'start': entity.get('start', 0),
                                    'end': entity.get('end', 0),
                                    'confidence': entity_score,
                                    'source': 'local_persian_ner'
                                })
                    logger.info(f"Persian NER found {len(persian_results)} entities")
                except Exception as e:
                    logger.error(f"Persian NER extraction error: {e}")

            # English model.
            if lang in ['en', 'mixed'] and hasattr(self, 'english_ner') and self.english_ner:
                try:
                    english_results = self.english_ner(text)
                    for entity in english_results:
                        if isinstance(entity, dict):
                            entity_text = entity.get('word', '').strip()
                            entity_label = entity.get('entity_group', entity.get('entity', ''))
                            entity_score = entity.get('score', 0)

                            if entity_text and len(entity_text) > 1 and entity_score > 0.5:
                                entities.append({
                                    'text': entity_text,
                                    'label': entity_label,
                                    'start': entity.get('start', 0),
                                    'end': entity.get('end', 0),
                                    'confidence': entity_score,
                                    'source': 'local_english_ner'
                                })
                    logger.info(f"English NER found {len(english_results)} entities")
                except Exception as e:
                    logger.error(f"English NER extraction error: {e}")

        except Exception as e:
            logger.error(f"NER extraction general error: {e}")

        # De-duplicate (same text span may be reported by both models).
        unique_entities = []
        seen = set()
        for entity in entities:
            key = (entity['text'].lower(), entity['start'], entity['end'])
            if key not in seen:
                seen.add(key)
                unique_entities.append(entity)

        logger.info(f"Total unique entities found by local models: {len(unique_entities)}")
        return unique_entities

    def map_ner_to_categories(self, ner_label, source=''):
        """Map a raw NER label (incl. BIO-prefixed forms) to a system category.

        Unknown labels fall back to 'BUSINESS_TERMS'.
        """
        mapping = {
            'PER': 'PERSON', 'PERSON': 'PERSON',
            'ORG': 'COMPANY', 'ORGANIZATION': 'COMPANY',
            'LOC': 'LOCATION', 'LOCATION': 'LOCATION',
            'MISC': 'BUSINESS_TERMS', 'MISCELLANEOUS': 'BUSINESS_TERMS',
            'B-PER': 'PERSON', 'I-PER': 'PERSON',
            'B-ORG': 'COMPANY', 'I-ORG': 'COMPANY',
            'B-LOC': 'LOCATION', 'I-LOC': 'LOCATION',
            'B-MISC': 'BUSINESS_TERMS', 'I-MISC': 'BUSINESS_TERMS',
            'MONEY': 'AMOUNT', 'PERCENT': 'PERCENTAGE',
            'DATE': 'DATE', 'TIME': 'DATE'
        }
        return mapping.get(ner_label.upper(), 'BUSINESS_TERMS')

    def get_comprehensive_patterns(self):
        """Return the full regex pattern catalogue, keyed by category name.

        Compact edition for HuggingFace Spaces. Patterns mix Persian and
        English forms; ``[۰-۹0-9]`` classes accept both Persian and ASCII digits.
        """
        return {
            'PERSON': [
                r'آقای\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
                r'خانم\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
                r'مهندس\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
                r'دکتر\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
                r'Mr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
                r'Ms\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
                r'Dr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)'
            ],

            'MIXED_NAMES': [
                r'([آ-یa-zA-Z]{2,}\s+[آ-یa-zA-Z]{2,})',
                r'([A-Z][a-z]+-[A-Z][a-z]+)'
            ],

            'ID_NUMBER': [
                r'IR[۰-۹0-9]{24}',
                r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
                r'(?:کارت[\s:]*)?(?:[۰-۹0-9]{4}[-\s]?){3}[۰-۹0-9]{4}',
                r'SSN[\s:]*[0-9]{3}-[0-9]{2}-[0-9]{4}'
            ],

            'ENGLISH_TITLES': [
                r'business\s+partner',
                r'team\s+lead',
                r'senior\s+architect',
                r'facility\s+manager'
            ],

            'AMOUNT': [
                r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
                r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
                r'€\d+(?:,\d{3})*(?:\.\d+)?'
            ],

            'INTERNATIONAL_CURRENCIES': [
                r'\d+(?:,\d{3})*\s+euro',
                r'\d+(?:,\d{3})*\s+AED',
                r'£\d+(?:,\d{3})*(?:\.\d+)?'
            ],

            'ACCOUNT': [
                r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}'
            ],

            'FINANCIAL_TERMS': [
                r'فروش\s+(?:ماهانه|تجمیعی|صادراتی)',
                r'درآمد\s+شرکت',
                r'سود\s+(?:خالص|نقدی)',
                r'صورت‌های\s+مالی'
            ],

            'STOCK_SYMBOL': [
                r'نماد\s+([آ-یa-zA-Z0-9]+)',
                r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)'
            ],

            'DATE': [
                r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
                r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
                r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})'
            ],

            'ADVANCED_DATE_FORMATS': [
                r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z',
                r'(?:PST|EST|GMT|UTC)(?:[+-]\d{1,2}:\d{2})?'
            ],

            'TIME_RANGES': [
                r'\d{2}:\d{2}-\d{2}:\d{2}',
                r'\d{1,2}:\d{2}\s+(?:AM|PM)\s+(?:PST|EST|GMT|UTC)'
            ],

            'LOCATION': [
                r'(تهران|اصفهان|ماهشهر|عسلویه|بندرعباس|اهواز|شیراز|مشهد|تبریز|کرج)',
                r'(London|Paris|Tokyo|New\s+York|Dubai|Singapore)'
            ],

            'COMPLEX_ADDRESSES': [
                r'کیلومتر\s+\d+\s+جاده\s+[آ-ی\s]+-[آ-ی\s]+',
                r'Building-[A-Z],?\s+Floor-\d+,?\s+Unit-[A-Z0-9]+'
            ],

            'TECHNICAL_CODES': [
                r'SN-\d{4}-[A-Z]{3}-\d{4}',
                r'REF-[A-Z]{3}-\d{4}-\d{3}',
                r'HVAC-\d{7}'
            ],

            'NETWORK_ADDRESSES': [
                r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
                r'[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}'
            ],

            'TECHNICAL_UNITS': [
                r'\d+(?:\.\d+)?\s*MW',
                r'\d+(?:\.\d+)?\s*kWh?',
                r'\d+(?:,\d{3})*\s*cubic\s+meters'
            ],

            'ACRONYMS_ABBREVIATIONS': [
                r'\b(?:HVAC|IT|HSE|BOQ|LC|COB)\b',
                r'\b(?:LLC|Corp|Inc|Ltd)\b'
            ],

            'COMPANY': [
                r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به|\s+طی)',
                r'(بانک\s+[آ-یa-zA-Z\s]+)',
                r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
            ],

            'BUSINESS_TERMS': [
                r'تحلیل\s+عملکرد',
                r'گزارش\s+(?:فعالیت|عملکرد)\s+ماهانه',
                r'تولید\s+پایدار'
            ],

            'PRODUCT': [
                r'\b(?:VCM|PVC|PE|PP|PS|ABS|SAN|PC|PMMA|PET|PBT|PA|POM|TPU)\b',
                r'پلی\s*(?:اتیلن|پروپیلن|استایرن)'
            ],

            'PETROCHEMICAL': [
                r'\b(?:LDPE|HDPE|LLDPE|PP|PS|EPS|ABS)\b'
            ],

            'PERCENTAGE': [
                r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش)?',
                r'\d+(?:\.\d+)?\s*%',
                r'\d+(?:\.\d+)?\%\s*(?:increase|decrease|growth)'
            ],

            'VOLUME': [
                r'\d+(?:,\d{3})*\s*تن',
                r'\d+(?:,\d{3})*\s*(?:tons|kg|liters|barrels)'
            ],

            'RATIOS': [
                r'نسبت\s+(?:فروش|تولید)\s+به\s+[آ-ی\s]+',
                r'برابر\s+با\s+\d+(?:\.\d+)?'
            ],

            'PHONE': [
                r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
                r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
                r'\+[0-9]{1,3}-[0-9]{3}-[0-9]{3}-[0-9]{4}'
            ],

            'EMAIL': [
                r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
            ]
        }

    def anonymize_text(self, original_text, lang='fa', selected_categories=None):
        """Step 1: anonymize *original_text* using the selected categories.

        Resets ``mapping_table``/``counters``, runs local NER (when loaded),
        then the selected regex categories in priority order with span-overlap
        suppression, and finally replaces every found entity (longest first)
        with its placeholder code.

        Returns the anonymized text, or a "❌ ..." error string on failure.
        """
        try:
            if not original_text or not original_text.strip():
                return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"

            # Reset per-run state.
            self.mapping_table = {}
            self.counters = {key: 0 for key in self.counters.keys()}

            anonymized = original_text
            found_entities = set()

            # Language detection drives which NER models run.
            detected_lang = self.detect_language(original_text)
            logger.info(f"Detected language: {detected_lang}")

            # Phase 1: local NER extraction.
            if self.models_loaded:
                logger.info("🤖 Running local NER extraction...")
                ner_entities = self.extract_entities_with_ner(original_text, detected_lang)

                for entity in ner_entities:
                    if (entity['text'] not in found_entities and
                            len(entity['text'].strip()) > 1 and
                            entity['confidence'] > 0.5):

                        category = self.map_ner_to_categories(entity['label'], entity['source'])

                        if entity['text'] not in self.mapping_table:
                            self.counters[category] += 1
                            code = f"{category}_{self.counters[category]:03d}_LOCAL_NER"
                            self.mapping_table[entity['text']] = code
                            found_entities.add(entity['text'])
                            logger.info(f"Local NER: {entity['text']} -> {code}")
            else:
                logger.info("ℹ️ Using regex-only mode")

            # Phase 2: the selected regex categories.
            all_patterns = self.get_comprehensive_patterns()

            # Filter patterns down to the user's selection (None = all).
            if selected_categories:
                selected_pattern_types = self.get_selected_patterns(selected_categories, lang)
                patterns = {k: v for k, v in all_patterns.items() if k in selected_pattern_types}
                logger.info(f"📋 Using selected pattern categories: {len(patterns)} types")
            else:
                patterns = all_patterns
                logger.info("📋 Using all available pattern categories")

            logger.info("🔍 Running selective regex extraction...")

            # Spans already claimed by an earlier (higher-priority) match.
            processed_entities = set()

            # Specific/structured categories run first so broad patterns
            # (e.g. MIXED_NAMES) cannot swallow their matches.
            priority_order = [
                'ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT', 'TECHNICAL_CODES',
                'NETWORK_ADDRESSES', 'INTERNATIONAL_CURRENCIES', 'AMOUNT',
                'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS', 'ADVANCED_DATE_FORMATS',
                'TIME_RANGES', 'COMPLEX_ADDRESSES', 'MIXED_NAMES', 'ENGLISH_TITLES',
                'STOCK_SYMBOL', 'COMPANY', 'PERSON', 'PERCENTAGE', 'VOLUME',
                'RATIOS', 'LOCATION', 'DATE', 'FINANCIAL_TERMS', 'BUSINESS_TERMS',
                'PRODUCT', 'PETROCHEMICAL'
            ]

            for category in priority_order:
                if category in patterns:
                    pattern_list = patterns[category]
                    for pattern in pattern_list:
                        try:
                            matches = re.finditer(pattern, original_text, re.IGNORECASE | re.MULTILINE)
                            for match in matches:
                                if match.groups():
                                    item = match.group(1).strip()
                                    full_match = match.group(0).strip()
                                else:
                                    item = match.group(0).strip()
                                    full_match = item

                                # Reject matches overlapping an already-claimed span.
                                overlaps = False
                                match_start, match_end = match.span()

                                for proc_start, proc_end in processed_entities:
                                    if not (match_end <= proc_start or match_start >= proc_end):
                                        overlaps = True
                                        break

                                if (not overlaps and
                                        full_match not in found_entities and
                                        full_match not in self.mapping_table and
                                        len(full_match) >= 2):

                                    self.counters[category] += 1
                                    code = f"{category}_{self.counters[category]:03d}_REGEX"
                                    self.mapping_table[full_match] = code
                                    found_entities.add(full_match)
                                    processed_entities.add((match_start, match_end))
                                    logger.info(f"Regex ({category}): {full_match} -> {code}")
                        except re.error as e:
                            logger.error(f"Regex error in pattern {pattern}: {e}")
                            continue

            # Replace longest entities first so shorter substrings cannot
            # corrupt a longer match that contains them.
            sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
            for original_item, code in sorted_items:
                anonymized = anonymized.replace(original_item, code)

            logger.info(f"✅ Selective anonymization completed. Found {len(self.mapping_table)} entities.")
            return anonymized

        except Exception as e:
            logger.error(f"Anonymization error: {e}")
            return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در ناشناس‌سازی: {str(e)}"

    def send_to_chatgpt(self, anonymized_text, lang='fa'):
        """Step 2: send the anonymized text to the OpenAI chat API.

        Returns the assistant's reply, or a "❌ ..." error string when the
        input/API key is missing or the request fails.
        """
        try:
            if not anonymized_text or not anonymized_text.strip():
                return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناس‌شده خالی است!"

            if not self.api_key:
                return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable." if lang == 'en' else "❌ کلید API تنظیم نشده است!"

            system_msg = "You are a professional analyst. Answer questions accurately." if lang == 'en' else "شما یک تحلیلگر حرفه‌ای هستید. به سوالات با دقت پاسخ دهید."

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            data = {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": anonymized_text}
                ],
                "max_tokens": 2000,
                "temperature": 0.7
            }

            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=data,
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                return result['choices'][0]['message']['content']
            else:
                # Surface the API's own error message when it returned a body.
                error_data = response.json() if response.content else {}
                error_message = error_data.get('error', {}).get('message', response.text)
                return f"❌ API Error: {error_message}"

        except Exception as e:
            return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"

    def deanonymize_response(self, gpt_response, lang='fa'):
        """Step 3: substitute placeholder codes back with the original values."""
        try:
            if not gpt_response or not gpt_response.strip():
                return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!"

            if not self.mapping_table:
                return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"

            final_result = gpt_response
            reverse_mapping = {code: original for original, code in self.mapping_table.items()}

            # Longest codes first, mirroring the forward replacement order.
            sorted_codes = sorted(reverse_mapping.items(), key=lambda x: len(x[0]), reverse=True)
            for code, original in sorted_codes:
                final_result = final_result.replace(code, original)

            return final_result

        except Exception as e:
            return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}"

    def get_model_status(self):
        """Build a markdown status report of local models and pattern categories."""
        status = "🤖 **HuggingFace Compatible Anonymization System Status:**\n\n"

        if hasattr(self, 'model_status') and self.model_status:
            for model_type, model_status in self.model_status.items():
                status += f"• **{model_type.title()}**: {model_status}\n"

        loaded_count = sum(1 for status in getattr(self, 'model_status', {}).values()
                           if status.startswith("✅"))
        status += f"\n📊 **Summary**: {loaded_count}/2 local models loaded"
        status += f"\n🔍 **Models Path**: {self.models_base_path}"
        status += f"\n🔧 **Environment**: HuggingFace Spaces Compatible"

        status += f"\n\n🎯 **Available Pattern Categories:**"
        for cat_key, cat_info in self.pattern_categories.items():
            icon = cat_info['icon']
            name_fa = cat_info['name_fa']
            pattern_count = len(cat_info['patterns'])
            status += f"\n {icon} {name_fa}: {pattern_count} patterns"

        status += f"\n\n✨ **System Features:**"
        status += f"\n 🎯 User-controlled category selection"
        status += f"\n 🛡️ Flexible sensitive data protection"
        status += f"\n 📊 Efficient targeted processing"
        status += f"\n ⚡ HuggingFace Spaces optimized"

        return status
717
-
718
# Module-level singleton used by every Gradio callback below.
anonymizer = ComprehensiveBilingualDataAnonymizer()
720
-
721
def process_all_steps(input_text, language, selected_categories):
    """Run the full pipeline: anonymize -> ChatGPT -> de-anonymize.

    Args:
        input_text: Raw user text from the textbox.
        language: UI language selector value ("English" or "فارسی").
        selected_categories: Category display strings chosen in the checkbox group.

    Returns:
        Tuple of (status message, anonymized text, ChatGPT response, restored
        text). Later elements are empty strings when an earlier step failed.
    """
    lang = 'en' if language == 'English' else 'fa'

    if not input_text.strip():
        error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
        return error_msg, "", "", ""

    try:
        start_time = time.time()

        anonymized_text = anonymizer.anonymize_text(input_text, lang, selected_categories)
        if anonymized_text.startswith("❌"):
            return anonymized_text, "", "", ""

        # Statistics are computed once here; the mapping table does not change
        # after anonymization, so both message branches can share them.
        entities_found = len(anonymizer.mapping_table)
        ner_count = sum(1 for code in anonymizer.mapping_table.values() if '_NER' in code)
        regex_count = sum(1 for code in anonymizer.mapping_table.values() if '_REGEX' in code)
        # A falsy selection means "all categories" — report all 8 consistently
        # (one branch previously reported 0, the other 8).
        selected_count = len(selected_categories) if selected_categories else 8
        method = "Selective Local NER + Regex" if anonymizer.models_loaded else "Selective Regex Only"

        gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
        if gpt_response.startswith("❌"):
            # ChatGPT failed: still report the successful anonymization and
            # pass the API error through in the response column.
            # (Fixed mojibake "���" in this message — was a corrupted emoji.)
            success_msg = (f"✅ Selective anonymization completed with {method}!\n"
                           f"📋 Selected categories: {selected_count} | 🤖 NER: {ner_count} | 🔍 Regex: {regex_count}\n"
                           f"📊 Total protected entities: {entities_found}")
            return success_msg, anonymized_text, gpt_response, ""

        final_result = anonymizer.deanonymize_response(gpt_response, lang)

        total_time = time.time() - start_time
        success_msg = (f"🎉 Complete selective anonymization & restoration successful!\n"
                       f"🔧 Method: {method} | 📋 Categories: {selected_count}/8\n"
                       f"📊 Total: {entities_found} entities | 🤖 NER: {ner_count} | 🔍 Regex: {regex_count}\n"
                       f"⏱️ Time: {total_time:.2f}s | 🎯 HuggingFace optimized")

        return success_msg, anonymized_text, gpt_response, final_result

    except Exception as e:
        error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
        return error_msg, "", "", ""
770
-
771
def get_mapping_table(language):
    """Render the current anonymization mapping table as a markdown report."""
    lang = 'en' if language == 'English' else 'fa'

    mapping = anonymizer.mapping_table
    if not mapping:
        return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"

    # Split entries by the detection method encoded in the placeholder code.
    ner_items = [(original, code) for original, code in mapping.items() if '_NER' in code]
    regex_items = [(original, code) for original, code in mapping.items() if '_REGEX' in code]

    parts = ["📋 **Selective Mapping Table:**\n\n"]
    parts.append(f"📊 **Statistics**: {len(mapping)} total entities\n")
    parts.append(f"🤖 **NER Detected**: {len(ner_items)} entities\n")
    parts.append(f"🔍 **Regex Detected**: {len(regex_items)} entities\n\n")

    if ner_items:
        parts.append("🤖 **NER Results (Sample)**:\n")
        for original, code in ner_items[:3]:
            parts.append(f" • `{original}` → `{code}`\n")
        parts.append("\n")

    if regex_items:
        parts.append("🔍 **Regex Results (Sample)**:\n")
        for original, code in regex_items[:5]:
            parts.append(f" • `{original}` → `{code}`\n")
        parts.append("\n")

    parts.append("✨ **System**: HuggingFace Spaces compatible with selective processing!")
    return "".join(parts)
806
-
807
def clear_all():
    """Reset the anonymizer's state and blank out every UI field."""
    anonymizer.mapping_table = {}
    # Zero every per-category counter without touching the key set.
    anonymizer.counters = dict.fromkeys(anonymizer.counters, 0)
    # input, anonymized, gpt response, final result, status — all cleared.
    return "", "", "", "", ""
812
-
813
# Minimal CSS for the HuggingFace Spaces deployment: centered layout,
# RTL helper class for Persian text, and rounded inputs/buttons.
custom_css = """
.gradio-container {
    font-family: 'Segoe UI', Arial, sans-serif !important;
    max-width: 1200px !important;
    margin: 0 auto !important;
}

.rtl {
    direction: rtl !important;
    text-align: right !important;
}

.gradio-textbox {
    border-radius: 8px !important;
    min-height: 150px !important;
}

.gradio-button {
    border-radius: 8px !important;
    font-weight: bold !important;
}
"""
836
-
837
# Simple Gradio UI for HuggingFace Spaces.
with gr.Blocks(title="Selective Anonymization System", theme=gr.themes.Soft(), css=custom_css) as app:

    gr.HTML("<h1 style='text-align: center; color: #2563eb;'>📊 سیستم ناشناس‌سازی انتخابی</h1>")

    with gr.Row():
        # UI language toggle; its value is passed to every callback.
        language_selector = gr.Radio(
            choices=["فارسی", "English"],
            value="فارسی",
            label="Language / زبان"
        )

    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML("<h3>🎯 انتخاب دسته‌بندی‌های مورد نظر:</h3>")
            # All categories pre-selected by default.
            pattern_categories = gr.CheckboxGroup(
                choices=anonymizer.get_category_choices('fa'),
                value=anonymizer.get_category_choices('fa'),
                label="دسته‌بندی‌های الگو"
            )

        with gr.Column(scale=3):
            input_text = gr.Textbox(
                lines=8,
                placeholder="متن خود را اینجا وارد کنید...",
                label="متن ورودی",
                rtl=True
            )

    with gr.Row():
        process_btn = gr.Button("🚀 پردازش با دسته‌بندی‌های انتخاب شده", variant="primary", size="lg")
        clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")

    status = gr.Textbox(
        label="وضعیت",
        lines=3,
        interactive=False,
        rtl=True
    )

    with gr.Row():
        with gr.Column():
            gr.HTML("<h3>🎭 متن ناشناس‌شده</h3>")
            anonymized_output = gr.Textbox(
                lines=6,
                interactive=False,
                rtl=True
            )

        with gr.Column():
            gr.HTML("<h3>🤖 پاسخ ChatGPT</h3>")
            gpt_output = gr.Textbox(
                lines=6,
                interactive=False,
                rtl=True
            )

    gr.HTML("<h3>✅ پاسخ نهایی بازگردانده شده</h3>")
    final_output = gr.Textbox(
        lines=6,
        interactive=False,
        rtl=True
    )

    with gr.Row():
        mapping_btn = gr.Button("📋 نمایش جدول نگاشت")
        status_btn = gr.Button("📊 وضعیت سیستم")

    with gr.Row():
        # Both detail panels start hidden; their buttons reveal them.
        mapping_output = gr.Textbox(
            lines=10,
            label="جدول نگاشت",
            interactive=False,
            visible=False,
            rtl=True
        )

        system_status_output = gr.Textbox(
            lines=15,
            label="وضعیت سیستم",
            interactive=False,
            visible=False,
            rtl=True
        )

    # Event handlers.
    process_btn.click(
        fn=process_all_steps,
        inputs=[input_text, language_selector, pattern_categories],
        outputs=[status, anonymized_output, gpt_output, final_output]
    )

    clear_btn.click(
        fn=clear_all,
        outputs=[input_text, anonymized_output, gpt_output, final_output, status]
    )

    # Two handlers per button: one fills the panel, one makes it visible.
    mapping_btn.click(
        fn=get_mapping_table,
        inputs=[language_selector],
        outputs=[mapping_output]
    )

    mapping_btn.click(
        fn=lambda: gr.update(visible=True),
        outputs=[mapping_output]
    )

    status_btn.click(
        fn=lambda: anonymizer.get_model_status(),
        outputs=[system_status_output]
    )

    status_btn.click(
        fn=lambda: gr.update(visible=True),
        outputs=[system_status_output]
    )
954
-
955
# Attempt to auto-download the NER models at startup, then launch the app.
if __name__ == "__main__":
    logger.info("🚀 Starting HuggingFace compatible anonymization system...")

    # Best-effort model download; the app still works in regex-only mode.
    try:
        auto_setup_models_for_hf()
    except Exception as e:
        logger.warning(f"⚠️ Auto-setup issue: {e}")

    logger.info("✅ System ready for HuggingFace Spaces!")

    app.launch(
        share=False,  # share=False is the right setting on HuggingFace Spaces
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )