leilaghomashchi committed on
Commit
0fecdc5
·
verified ·
1 Parent(s): 993f2ac

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -1289
app.py DELETED
@@ -1,1289 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Enhanced Multi-Modal Data Anonymization System - Fixed for HuggingFace Spaces
5
- =============================================================================
6
- Combining XLM-RoBERTa + Advanced Regex Patterns for Maximum Accuracy
7
- Supports Persian, English, and Mixed Languages
8
- """
9
-
10
- import gradio as gr
11
- import re
12
- import os
13
- import requests
14
- import time
15
- import logging
16
- from typing import List, Dict, Tuple, Optional, Set
17
- import warnings
18
- import subprocess
19
- import sys
20
- import os
21
-
22
def install_requirements():
    """Force-install the runtime dependencies with pip.

    Invokes ``pip install`` through the running interpreter: first upgrades
    pip itself, then installs transformers, torch and tokenizers one at a
    time. A failure of any step stops the sequence; it is reported on stdout
    and swallowed, so this function never raises and returns ``None``.
    """
    pip_argument_sets = (
        ["--upgrade", "pip"],
        ["transformers>=4.30.0"],
        ["torch"],
        ["tokenizers>=0.13.0"],
    )
    try:
        for extra_args in pip_argument_sets:
            subprocess.check_call([sys.executable, "-m", "pip", "install", *extra_args])
        print("✅ Dependencies installed successfully")
    except Exception as e:
        print(f"❌ Failed to install dependencies: {e}")
32
-
33
# Install the dependencies when transformers cannot be imported yet.
# This runs at import time, so a missing package triggers pip before the
# rest of the module is evaluated.
try:
    import transformers
    print("✅ Transformers already available")
except ImportError:
    print("📦 Installing transformers...")
    install_requirements()

# Enhanced dependencies with better error handling.
# TRANSFORMERS_AVAILABLE records whether the names imported below exist; the
# anonymizer checks it before attempting to build an NER pipeline.
TRANSFORMERS_AVAILABLE = False
try:
    print("🔄 Attempting to import transformers...")
    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
    TRANSFORMERS_AVAILABLE = True
    print("✅ Transformers library loaded successfully")
except ImportError as e:
    # Package still missing (e.g. the install attempt above failed).
    print(f"⚠️ Transformers import failed: {e}")
    print("📝 Falling back to regex-only mode")
    TRANSFORMERS_AVAILABLE = False
except Exception as e:
    # Any other failure while importing is treated the same way: regex only.
    print(f"❌ Unexpected error loading transformers: {e}")
    TRANSFORMERS_AVAILABLE = False

# Suppress all warnings process-wide and configure INFO-level logging.
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
59
-
60
- class EnhancedDataAnonymizer:
61
def __init__(self):
    """Create the anonymizer state, try to load the NER model, define categories.

    Reads the OpenAI key from the ``OPENAI_API_KEY`` environment variable
    (empty string when unset), attempts to load an NER pipeline via
    ``initialize_ner_model_safe`` and zeroes the per-pattern counters.
    """
    # Original text -> replacement code; rebuilt on every anonymization run.
    self.mapping_table = {}
    self.counters = {}
    self.api_key = os.getenv("OPENAI_API_KEY", "")

    # Processing modes: internal key -> human-readable label.
    self.processing_modes = {
        'regex_only': 'Pure Regex (Fast & Compatible)',
        'hybrid': 'Regex + XLM-RoBERTa (Recommended)',
        'ner_priority': 'NER Priority + Regex Backup (Highest Accuracy)'
    }

    # Model components; filled in by initialize_ner_model_safe().
    self.ner_pipeline = None
    self.model_status = "Initializing..."
    self.model_ready = False

    # Initialize model with improved error handling
    self.initialize_ner_model_safe()

    # Pattern categories as (key, Persian name, English name, pattern types, icon).
    category_rows = (
        ('personal_identity', 'اطلاعات شخصی و هویتی', 'Personal & Identity Information',
         ['PERSON', 'MIXED_NAMES', 'ID_NUMBER', 'ENGLISH_TITLES'], '👤'),
        ('financial', 'اطلاعات مالی', 'Financial Information',
         ['AMOUNT', 'INTERNATIONAL_CURRENCIES', 'ACCOUNT', 'FINANCIAL_TERMS', 'STOCK_SYMBOL'], '💰'),
        ('temporal', 'اطلاعات زمانی', 'Temporal Information',
         ['DATE', 'ADVANCED_DATE_FORMATS', 'TIME_RANGES'], '📅'),
        ('location', 'اطلاعات مکانی', 'Location Information',
         ['LOCATION', 'COMPLEX_ADDRESSES'], '📍'),
        ('technical', 'اطلاعات فنی و تکنولوژیکی', 'Technical & Technological',
         ['TECHNICAL_CODES', 'NETWORK_ADDRESSES', 'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS'], '⚙️'),
        ('business', 'اطلاعات کسب‌وکار', 'Business Information',
         ['COMPANY', 'BUSINESS_TERMS', 'PRODUCT', 'PETROCHEMICAL'], '🏢'),
        ('quantity', 'اطلاعات کمیت و واحد', 'Quantity & Unit Information',
         ['PERCENTAGE', 'VOLUME', 'RATIOS'], '📊'),
        ('communication', 'اطلاعات ارتباطی', 'Communication Information',
         ['PHONE', 'EMAIL'], '📞'),
    )
    self.pattern_categories = {
        key: {
            'name_fa': name_fa,
            'name_en': name_en,
            'patterns': pattern_types,
            'icon': icon,
        }
        for key, name_fa, name_en, pattern_types, icon in category_rows
    }

    # Initialize counters
    self.reset_counters()
135
-
136
def initialize_ner_model_safe(self):
    """Try to build an NER pipeline, falling back to regex-only mode.

    When the transformers library is unavailable the method records that in
    ``model_status`` and returns. Otherwise each candidate model is tried in
    order on CPU; the first one that loads and survives a smoke-test call is
    kept in ``ner_pipeline`` and ``model_ready`` is set. If every candidate
    fails, ``ner_pipeline`` is reset to ``None`` and ``model_ready`` stays
    ``False``. Never raises for ordinary exceptions.
    """
    print("🔄 Starting model initialization...")

    if not TRANSFORMERS_AVAILABLE:
        self.model_status = "⚠️ Transformers library not available - Using Regex only mode"
        self.model_ready = False
        print("📝 Transformers not available, continuing with regex patterns only")
        return

    print("🤖 Attempting to load XLM-RoBERTa model...")

    # Candidates are tried in this order until one works.
    # NOTE(review): these look like base checkpoints rather than
    # NER-fine-tuned ones — confirm they yield meaningful entity labels.
    candidate_models = (
        "xlm-roberta-base",
        "distilbert-base-multilingual-cased",
        "bert-base-multilingual-cased",
    )

    for model_name in candidate_models:
        try:
            print(f"🔄 Trying model: {model_name}")

            self.ner_pipeline = pipeline(
                "ner",
                model=model_name,
                aggregation_strategy="simple",
                device=-1,  # Force CPU
                tokenizer_kwargs={
                    "truncation": True,
                    "max_length": 256,
                    "padding": True
                }
            )

            # Smoke test: a pipeline that cannot process input is rejected.
            self.ner_pipeline("Test text")
        except Exception as model_error:
            print(f"❌ Failed to load {model_name}: {model_error}")
        else:
            self.model_status = f"✅ {model_name} loaded successfully"
            self.model_ready = True
            print(f"✅ Successfully loaded model: {model_name}")
            return

    # Every candidate failed: report it and fall back to regex only.
    error_msg = "All model loading attempts failed"[:100]
    print(f"❌ Model loading completely failed: {error_msg}")
    self.model_status = "❌ Model loading failed - Using Regex only"
    self.model_ready = False
    self.ner_pipeline = None
194
-
195
def reset_counters(self):
    """Reset the per-pattern-type counters to zero.

    Replaces ``self.counters`` with a fresh dict holding one zero entry for
    every pattern type listed in ``self.pattern_categories``, in category
    order. A type listed in several categories gets a single entry.
    """
    self.counters = {
        pattern_type: 0
        for category_info in self.pattern_categories.values()
        for pattern_type in category_info['patterns']
    }
202
-
203
def detect_language(self, text):
    """Classify *text* as Persian, English or mixed.

    Counts characters in the U+0600–U+06FF block against ASCII letters.
    Returns ``'fa'`` when more than 60% of those letters are in the Arabic
    block, ``'en'`` when more than 60% are ASCII letters, and ``'mixed'``
    otherwise. Empty input, or input containing neither kind of letter,
    yields ``'fa'``.
    """
    if not text:
        return 'fa'

    persian_count = len(re.findall(r'[\u0600-\u06FF]', text))
    english_count = len(re.findall(r'[a-zA-Z]', text))
    letter_total = persian_count + english_count

    if letter_total == 0:
        return 'fa'
    if persian_count / letter_total > 0.6:
        return 'fa'
    if english_count / letter_total > 0.6:
        return 'en'
    return 'mixed'
221
-
222
def get_comprehensive_patterns(self):
    """Return the comprehensive anonymization pattern table.

    Returns:
        A dict mapping each pattern-type name (the same names listed under
        ``'patterns'`` in ``self.pattern_categories``) to a list of regex
        source strings. The dict is rebuilt on every call.

    NOTE(review): ``extract_entities_with_regex`` only iterates the types
    named in its ``priority_order`` list, so the other types defined here do
    not appear to be scanned — confirm whether that is intended.
    """
    return {
        # Names introduced by a Persian or English honorific; group 1 is the
        # name. The last pattern captures two words followed by a job title.
        'PERSON': [
            r'آقای\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'خانم\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'مهندس\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'دکتر\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'استاد\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'Mr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'Ms\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'Dr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'([آ-یa-zA-Z]+\s+[آ-یa-zA-Z]+)(?:، مدیرعامل|\s+مدیرعامل|\s+رئیس)',
        ],

        # Any two adjacent words of 2+ letters, hyphenated capitalised pairs,
        # and apostrophe names. The first pattern is very broad.
        'MIXED_NAMES': [
            r'([آ-یa-zA-Z]{2,}\s+[آ-یa-zA-Z]{2,})',
            r'([A-Z][a-z]+-[A-Z][a-z]+)',
            r"([A-Z]'[A-Z][a-z]+)",
        ],

        # "IR" + 24 digits, bare 10-digit numbers with optional Persian
        # labels, letter + 8 digits, and SSN-formatted numbers.
        'ID_NUMBER': [
            r'IR[۰-۹0-9]{24}',
            r'شبا[\s:]*IR[۰-۹0-9]{24}',
            r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
            r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
            r'National[\s]*(?:ID[\s:]*)?[0-9]{10}',
            r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}',
            r'SSN[\s:]*[0-9]{3}-[0-9]{2}-[0-9]{4}',
        ],

        # Fixed English job-title / role phrases.
        'ENGLISH_TITLES': [
            r'business\s+partner',
            r'team\s+lead',
            r'head\s+of\s+production',
            r'senior\s+architect',
            r'civil\s+engineer',
            r'system\s+administrator',
            r'network\s+engineer',
            r'environmental\s+consultant',
            r'senior\s+loan\s+officer',
            r'facility\s+manager',
            r'project\s+team',
            r'technical\s+support'
        ],

        # Monetary amounts in toman / rial, plus $ and € prefixed figures.
        'AMOUNT': [
            r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
            r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
            r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
            r'€\d+(?:,\d{3})*(?:\.\d+)?',
            r'\d+(?:,\d{3})*\s*ریال',
            r'رقم\s+فعلی\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
            r'رقم\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
            r'به\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
        ],

        # Amounts written with foreign currency names, codes or symbols.
        'INTERNATIONAL_CURRENCIES': [
            r'\d+(?:,\d{3})*\s+euro',
            r'€\d+(?:\.\d+)?M',
            r'\d+\s+EUR',
            r'\d+(?:,\d{3})*\s+AED',
            r'\d+(?:\.\d+)?M\s+AED',
            r'\$\d+(?:\.\d+)?M',
            r'\$\d+(?:\.\d+)?K',
            r'£\d+(?:,\d{3})*(?:\.\d+)?',
            r'\d+\s+GBP',
            r'\d+\s+CHF',
            r'¥\d+(?:,\d{3})*',
            r'\d+\s+JPY'
        ],

        # Long digit runs (8-20 digits, optionally grouped), with optional
        # Persian or English "account" labels in front.
        'ACCOUNT': [
            r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'شماره[\s]*حساب[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}',
            r'[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{6,12}',
            r'واریز[\s]*(?:سود[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'سود[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}'
        ],

        # Fixed Persian financial phrases.
        'FINANCIAL_TERMS': [
            r'فروش\s+(?:ماهانه|تجمیعی|صادراتی)',
            r'درآمد\s+شرکت',
            r'سود\s+(?:خالص|نقدی)',
            r'صورت‌های\s+مالی',
            r'بهای\s+تمام‌شده',
            r'سودآوری',
            r'عملکرد\s+مالی',
            r'میانگین\s+فروش',
            r'بالاترین\s+رقم\s+فروش',
            r'رقم\s+فروش',
            r'درآمدهای\s+عملیاتی'
        ],

        # A word after "نماد", fixed lists of Persian and Latin symbols, and
        # names following "شرکت" / "پتروشیمی" up to a lookahead delimiter.
        'STOCK_SYMBOL': [
            r'نماد\s+([آ-یa-zA-Z0-9]+)',
            r'(سبهان|غدیر|شتران|شپنا|پترول|فارس|خارک|پلاسکو|جم|کرمان|مارون|اراک|رازی|شازند|کاوه|بندر|پارس|خوزستان|ماهشهر|عسلویه)(?=\s|$|،|\.|\s+)',
            r'شرکت\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+را|\s+به)',
            r'پتروشیمی\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+توان)',
            r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)'
        ],

        # Numeric dates, dates with Persian or English month names, relative
        # Persian phrases, and bare 4-digit years starting 13/14/19/20.
        'DATE': [
            r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
            r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
            r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
            r'(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s+[۰-۹0-9]{4}',
            r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})',
            r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}',
            r'سال\s+گذشته',
            r'سال\s+جاری',
            r'این\s+سال',
            r'ماه\s+قبل',
            r'ماه\s+اخیر',
            r'(?:13[0-9]{2}|14[0-9]{2}|20[0-9]{2}|19[0-9]{2})(?=\s|$|،|\.)'
        ],

        # Ordinal English dates, ISO-8601 "Z" timestamps, time-zone names.
        # NOTE(review): the month list in the first pattern starts at March
        # (no January/February) — confirm whether that is deliberate.
        'ADVANCED_DATE_FORMATS': [
            r'(?:March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th),?\s+\d{4}',
            r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z',
            r'(?:PST|EST|GMT|UTC)(?:[+-]\d{1,2}:\d{2})?',
            r'Eastern\s+Time',
            r'GMT[+-]\d{1,2}:\d{2}',
            r'end\s+of\s+fiscal\s+year\s+\d{4}/\d{2}/\d{2}'
        ],

        # Clock ranges, clock times with AM/PM, and business-day counts.
        'TIME_RANGES': [
            r'\d{2}:\d{2}-\d{2}:\d{2}',
            r'\d{2}:\d{2}\s+تا\s+\d{2}:\d{2}',
            r'\d{1,2}:\d{2}\s+(?:AM|PM)\s+(?:PST|EST|GMT|UTC)',
            r'\d{2}:\d{2}:\d{2}\s+(?:AM|PM)',
            r'COB\s*\(Close\s+of\s+Business\)',
            r'\d{1,3}\s+(?:business\s+days|روز\s+کاری)'
        ],

        # Fixed lists of city and country names, plus words following
        # "استان" / "شهر" and the words for domestic / foreign (markets).
        'LOCATION': [
            r'(تهران|اصفهان|ماهشهر|عسلویه|بندرعباس|اهواز|شیراز|مشهد|تبریز|کرج|قم|رشت|کرمان|یزد|زاهدان|بوشهر|خرمشهر|آبادان|اراک|قزوین)',
            r'استان\s+([آ-ی\s]+)',
            r'شهر\s+([آ-ی\s]+)',
            r'(ایران|عراق|کویت|عربستان|امارات|قطر|عمان|بحرین|ترکیه|پاکستان|افغانستان)',
            r'داخلی|بازار\s+داخلی',
            r'خارجی|بازارهای\s+خارجی',
            r'(London|Paris|Tokyo|New\s+York|Dubai|Singapore|Hong\s+Kong|Shanghai|Mumbai|Frankfurt|Amsterdam)'
        ],

        # Multi-part street / site address phrasings in Persian and English.
        'COMPLEX_ADDRESSES': [
            r'کیلومتر\s+\d+\s+جاده\s+[آ-ی\s]+-[آ-ی\s]+',
            r'روبروی\s+(?:پمپ\s+بنزین|بانک|پارک|مسجد|بیمارستان)\s+[آ-یa-zA-Z\s]+',
            r'Building-[A-Z],?\s+Floor-\d+,?\s+Unit-[A-Z0-9]+',
            r'rack\s+number\s+R-\d+,?\s+slot\s+\d+',
            r'phase\s+\d+\s+development,?\s+block\s+[A-Z],?\s+plot\s+\d+-[A-Z]',
            r'\d{2,5}\s+[A-Z][a-z]+\s+(?:Street|Avenue|Boulevard|Road|Drive),?\s+Floor\s+\d+,?\s+Building\s+[A-Z]',
            r'شهرک\s+صنعتی\s+[آ-ی\s]+،?\s+محور\s+[آ-ی\s]+'
        ],

        # Prefixed reference / serial / document codes with fixed layouts.
        'TECHNICAL_CODES': [
            r'SN-\d{4}-[A-Z]{3}-\d{4}',
            r'Serial\s+Number[\s:]*[A-Z0-9-]+',
            r'REF-[A-Z]{3}-\d{4}-\d{3}',
            r'DOC-[A-Z]{2}-\d{4}-\d{4}',
            r'INF-\d{4}-\d{4}',
            r'CTR/\d{4}/\d{3}',
            r'HVAC-\d{7}',
            r'Generator-Model-[A-Z0-9]+',
            r'LOI-\d{4}-[A-Z]{4}-\d{3}',
            r'BOQ-\d{4}-[A-Z]{3}-\d{3}',
            r'#INV-\d{4}-Q\d-\d{4}',
            r'ESC-\d{4}-[A-Z]{3}-\d{3}',
            r'BN-\d{6}-[A-Z]\d+'
        ],

        # Dotted-quad addresses, colon-separated hex sextets, host names.
        'NETWORK_ADDRESSES': [
            r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
            r'xxx\.xxx\.xxx\.xxx',
            r'[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}',
            r'srv-[a-z]+-[a-z]+-\d{2}',
            r'[a-z]+-[a-z]+\d*\.[a-z]+\.[a-z]+',
            r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,4}(?:\.[a-zA-Z]{2,4})?'
        ],

        # Numbers followed by an engineering or storage unit.
        'TECHNICAL_UNITS': [
            r'\d+(?:\.\d+)?\s*MW',
            r'\d+(?:\.\d+)?\s*kWh?',
            r'\d+(?:,\d{3})*\s*cubic\s+meters',
            r'\d+(?:,\d{3})*\s*m³',
            r'\d+(?:,\d{3})*\s*sq\s+ft',
            r'\d+(?:\.\d+)?\s*ppm',
            r'\d+(?:\.\d+)?\s*mg/m³',
            r'\b(?:CO2|NOx|SO2)\b',
            r'\d+(?:\.\d+)?\s*TB',
            r'\d+(?:\.\d+)?\s*GB',
            r'\d+(?:,\d{3})*\s*square\s+meters',
            r'\d+(?:\.\d+)?\%\s*efficiency',
            r'FICO\s+score:\s*\d{3}',
            r'\d+(?:\.\d+)?\s*(?:bar|psi)',
            r'\d+(?:\.\d+)?\s*°[CF]',
            r'\d+(?:\.\d+)?\s*(?:rpm|m/s)'
        ],

        # Whole-word acronyms and abbreviations.
        'ACRONYMS_ABBREVIATIONS': [
            r'\b(?:HVAC|IT|HSE|BOQ|LC|COB)\b',
            r'\b(?:YTD|NNN|EIN|SSN|FICO)\b',
            r'\bIP\s+Address\b',
            r'\bMAC\s+Address\b',
            r'\bURL\b',
            r'\b(?:LLC|Corp|Inc|Ltd)\b',
            r'\b(?:PST|GMT|UTC|EST)\b',
            r'\b(?:CO2|NOx|pH|UV)\b',
            r'\b(?:SCADA|PLC|HMI)\b',
            r'\b(?:GDP|CPI|ROI|NPV)\b',
            r'\b(?:FOB|CIF|DDP)\b',
            r'\b(?:ABA|SWIFT|IBAN)\b'
        ],

        # The word "شرکت" in several contexts, bank names, and capitalised
        # English names ending in a corporate suffix.
        'COMPANY': [
            r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به)',
            r'([آ-یa-zA-Z\s]+)\s+شرکت',
            r'این\s+شرکت(?=\s|$|،|\.)',
            r'(بانک\s+[آ-یa-zA-Z\s]+)',
            r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
        ],

        # Fixed Persian business-report phrases.
        'BUSINESS_TERMS': [
            r'تحلیل\s+عملکرد',
            r'گزارش\s+(?:فعالیت|عملکرد)\s+ماهانه',
            r'وضعیت\s+فروش',
            r'تولید\s+پایدار',
            r'سهم\s+بازار',
            r'صادرات\s+هدفمند',
            r'بهره‌وری',
            r'ظرفیت‌های\s+داخلی',
            r'شرکت‌های\s+پیشرو',
            r'صنعت\s+پتروشیمی',
            r'سرمایه‌گذاران\s+بنیادی',
            r'شاخص‌های\s+عملیاتی',
            r'برنامه‌ریزی\s+مناسب',
            r'واحد\s+فروش',
            r'موجودی\s+انبار',
            r'فاز\s+رشد\s+جدید',
            r'ترکیب\s+فروش',
            r'سهم\s+صادراتی',
            r'روند\s+عملکرد',
            r'اعداد\s+اعلام‌شده',
            r'داده‌های\s+ثبت‌شده'
        ],

        # Polymer / chemical abbreviations and Persian product names.
        'PRODUCT': [
            r'\b(?:VCM|PVC|PE|PP|PS|ABS|SAN|PC|PMMA|PET|PBT|PA|POM|TPU)\b',
            r'پلی\s*(?:اتیلن|پروپیلن|استایرن|کربنات|متیل)',
            r'\b(?:اتیلن|پروپیلن|بنزن|تولوئن|زایلن|متانول|اتانول|استون|فنول)\b',
            r'\b(?:کلر|هیدروژن|اکسیژن|نیتروژن|آمونیاک|اتان|پروپان|بوتان)\b',
            r'محصول(?:ات)?',
            r'تولیدات\s+شرکت'
        ],

        # Polymer grade abbreviations and spelled-out English compound names.
        'PETROCHEMICAL': [
            r'\b(?:LDPE|HDPE|LLDPE|PP|PS|EPS|ABS|SAN|PC|PMMA|PET|PBT|PA6|PA66|POM|TPU|EVA|EAA)\b',
            r'(?:Ethylene\s+Vinyl\s+Acetate|Ethyl\s+Acrylate|Methyl\s+Methacrylate|Polyethylene\s+Terephthalate)'
        ],

        # Percent figures in Persian ("درصد") and with "%", plus a few
        # qualitative growth phrases.
        'PERCENTAGE': [
            r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایین‌تر)?',
            r'\d+(?:\.\d+)?\s*%',
            r'معادل\s+\d+(?:\.\d+)?\s*درصد',
            r'حدود\s+\d+(?:\.\d+)?\s*درصد',
            r'با\s+\d+(?:\.\d+)?\s*درصد\s+افزایش',
            r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
            r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده)',
            r'میزان\s+رشد(?=\s+نسبت|\s+معادل)',
            r'افزایش\s+قابل‌توجهی',
            r'بهبود\s+نسبی',
            r'\d+(?:\.\d+)?\%\s*(?:increase|decrease|growth|improvement)',
            r'(?:approximately|about)\s+\d+(?:\.\d+)?\%'
        ],

        # Quantities with mass / volume units, and capacity phrases.
        'VOLUME': [
            r'\d+(?:,\d{3})*\s*تن',
            r'\d+(?:,\d{3})*\s*(?:کیلوگرم|لیتر|بشکه)',
            r'میزان\s+\d+(?:,\d{3})*\s*تن',
            r'مقدار\s+تولید',
            r'حجم\s+فروش',
            r'ظرفیت\s+(?:تولید|اسمی)',
            r'\d+(?:,\d{3})*\s*(?:tons|kg|liters|barrels)',
            r'\d+(?:,\d{3})*\s*(?:metric\s+tons|MT)',
            r'\d+(?:,\d{3})*\s*(?:thousand\s+tons|KT)'
        ],

        # Ratio and comparison phrasings in Persian.
        'RATIOS': [
            r'نسبت\s+(?:فروش|تولید)\s+به\s+[آ-ی\s]+',
            r'\d+(?:\.\d+)?\s*نزدیک',
            r'برابر\s+با\s+\d+(?:\.\d+)?',
            r'معادل\s+\d+(?:\.\d+)?',
            r'میزان\s+(?:رشد|افزایش)',
            r'شاخص\s+(?:مهم|عملیاتی)',
            r'\d+(?:\.\d+)?\s*درصد\s+کل\s+تولید'
        ],

        # Phone-like digit runs with optional Persian labels, plus
        # "+CC-NNN-NNN-NNNN" and "(NNN) NNN-NNNN" layouts.
        'PHONE': [
            r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
            r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
            r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
            r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}',
            r'[۰-۹0-9]{11}(?!\d)',
            r'(?:\+98|0098)?[۰-۹0-9]{10}',
            r'[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}',
            r'\+[0-9]{1,3}-[0-9]{3}-[0-9]{3}-[0-9]{4}(?:\s+ext\.\s+[0-9]{3,4})?',
            r'\([0-9]{3}\)\s+[0-9]{3}-[0-9]{4}'
        ],

        # E-mail addresses, bare or preceded by a Persian / English label.
        'EMAIL': [
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'نشانی[\s]*الکترونیکی[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'آدرس[\s]*ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'facility\.manager@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        ]
    }
542
-
543
def extract_entities_with_ner(self, text: str, confidence_threshold: float = 0.75) -> List[Dict]:
    """Extract entities from *text* with the NER pipeline.

    Args:
        text: Text handed to ``self.ner_pipeline``.
        confidence_threshold: Minimum ``score`` a result needs to be kept.

    Returns:
        One dict per kept result with keys ``text``, ``label``,
        ``confidence``, ``start``, ``end`` and ``source`` (always ``'ner'``).
        Results whose cleaned text is shorter than two characters are
        dropped. Returns ``[]`` when the model is not ready or when the
        pipeline raises (the error is logged).
    """
    if not (self.model_ready and self.ner_pipeline):
        return []

    try:
        kept = []
        # Process text with NER model
        for hit in self.ner_pipeline(text):
            if not (hit['score'] >= confidence_threshold):
                continue

            # Strip "##" continuation markers and surrounding whitespace.
            cleaned = hit['word'].replace('##', '').strip()
            if len(cleaned) < 2:  # Minimum length filter
                continue

            kept.append({
                'text': cleaned,
                'label': hit['entity_group'],
                'confidence': hit['score'],
                'start': hit['start'],
                'end': hit['end'],
                'source': 'ner'
            })
        return kept
    except Exception as e:
        logger.error(f"Error in NER extraction: {e}")
        return []
573
-
574
def map_ner_to_categories(self, ner_label: str) -> str:
    """Map an NER label to one of the system's pattern-type names.

    The lookup is case-insensitive. Labels that are not recognised map to
    ``'MIXED_NAMES'``.
    """
    wanted = ner_label.upper()
    labels_by_category = (
        ('PERSON', ('PER', 'PERSON')),
        ('COMPANY', ('ORG', 'ORGANIZATION')),
        ('LOCATION', ('LOC', 'LOCATION', 'GPE')),
        ('MIXED_NAMES', ('MISC',)),
        ('AMOUNT', ('MONEY',)),
        ('DATE', ('DATE', 'TIME')),
    )
    for category, labels in labels_by_category:
        if wanted in labels:
            return category
    return 'MIXED_NAMES'
590
-
591
- def extract_entities_with_regex(self, text: str, selected_categories: List[str] = None) -> List[Dict]:
592
- """استخراج موجودیت‌ها با Regex"""
593
- entities = []
594
- all_patterns = self.get_comprehensive_patterns()
595
-
596
- # Filter patterns based on selected categories
597
- if selected_categories:
598
- selected_pattern_types = self.get_selected_patterns(selected_categories, 'fa')
599
- patterns = {k: v for k, v in all_patterns.items() if k in selected_pattern_types}
600
- else:
601
- patterns = all_patterns
602
-
603
- processed_positions = set()
604
-
605
- # Process patterns with priority
606
- priority_order = [
607
- 'ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT',
608
- 'AMOUNT', 'DATE', 'LOCATION', 'COMPANY', 'PERSON'
609
- ]
610
-
611
- for category in priority_order:
612
- if category in patterns:
613
- pattern_list = patterns[category]
614
- for pattern in pattern_list:
615
- try:
616
- matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
617
- for match in matches:
618
- if match.groups():
619
- entity_text = match.group(1).strip()
620
- else:
621
- entity_text = match.group(0).strip()
622
-
623
- # Check for overlaps
624
- match_start, match_end = match.span()
625
- overlaps = any(
626
- not (match_end <= pos_start or match_start >= pos_end)
627
- for pos_start, pos_end in processed_positions
628
- )
629
-
630
- if (not overlaps and len(entity_text) >= 2):
631
- entities.append({
632
- 'text': entity_text,
633
- 'category': category,
634
- 'start': match_start,
635
- 'end': match_end,
636
- 'confidence': 0.9,
637
- 'source': 'regex'
638
- })
639
- processed_positions.add((match_start, match_end))
640
-
641
- except re.error as e:
642
- logger.error(f"Regex error in pattern {pattern}: {e}")
643
- continue
644
-
645
- return entities
646
-
647
- def fuse_entities(self, regex_entities: List[Dict], ner_entities: List[Dict],
648
- processing_mode: str) -> List[Dict]:
649
- """ترکیب هوشمندانه نتایج Regex و NER"""
650
-
651
- if processing_mode == 'regex_only' or not self.model_ready:
652
- return regex_entities
653
-
654
- final_entities = []
655
- processed_positions = set()
656
-
657
- if processing_mode == 'hybrid':
658
- # Regex priority for specific patterns
659
- priority_categories = ['PHONE', 'EMAIL', 'ID_NUMBER', 'ACCOUNT', 'AMOUNT']
660
-
661
- # Add high-priority regex entities first
662
- for entity in regex_entities:
663
- if entity['category'] in priority_categories:
664
- final_entities.append(entity)
665
- processed_positions.add((entity['start'], entity['end']))
666
-
667
- # Add NER entities for names and organizations
668
- for entity in ner_entities:
669
- if not self.has_overlap(entity, processed_positions):
670
- category = self.map_ner_to_categories(entity['label'])
671
- entity_copy = entity.copy()
672
- entity_copy['category'] = category
673
- final_entities.append(entity_copy)
674
- processed_positions.add((entity['start'], entity['end']))
675
-
676
- # Add remaining regex entities
677
- for entity in regex_entities:
678
- if (entity['category'] not in priority_categories and
679
- not self.has_overlap(entity, processed_positions)):
680
- final_entities.append(entity)
681
- processed_positions.add((entity['start'], entity['end']))
682
-
683
- elif processing_mode == 'ner_priority':
684
- # NER takes priority, regex as backup
685
- for entity in ner_entities:
686
- category = self.map_ner_to_categories(entity['label'])
687
- entity_copy = entity.copy()
688
- entity_copy['category'] = category
689
- final_entities.append(entity_copy)
690
- processed_positions.add((entity['start'], entity['end']))
691
-
692
- # Add non-overlapping regex entities
693
- for entity in regex_entities:
694
- if not self.has_overlap(entity, processed_positions):
695
- final_entities.append(entity)
696
- processed_positions.add((entity['start'], entity['end']))
697
-
698
- return final_entities
699
-
700
def has_overlap(self, entity: Dict, processed_positions: Set[Tuple[int, int]]) -> bool:
    """Report whether *entity* overlaps any already-claimed span.

    Spans are treated as half-open ``[start, end)`` intervals, so two spans
    that merely touch do not overlap.
    """
    begin = entity['start']
    finish = entity['end']
    return any(
        begin < taken_end and finish > taken_start
        for taken_start, taken_end in processed_positions
    )
708
-
709
def get_selected_patterns(self, selected_categories: List[str], language: str = 'fa') -> List[str]:
    """Translate selected category labels into a list of pattern types.

    Each category is rendered as ``"<icon> <name>"`` using the Persian name
    when *language* is ``'fa'`` and the English name otherwise. Categories
    whose rendered label is in *selected_categories* contribute their
    pattern types, in category definition order.
    """
    name_key = 'name_fa' if language == 'fa' else 'name_en'
    return [
        pattern_type
        for cat_info in self.pattern_categories.values()
        if f"{cat_info['icon']} {cat_info[name_key]}" in selected_categories
        for pattern_type in cat_info['patterns']
    ]
722
-
723
def get_category_choices(self, language='fa'):
    """Build the checkbox labels for all categories.

    Returns a list of ``"<icon> <name>"`` strings in category definition
    order, using the Persian name when *language* is ``'fa'`` and the
    English name otherwise.
    """
    name_key = 'name_fa' if language == 'fa' else 'name_en'
    return [
        f"{cat_info['icon']} {cat_info[name_key]}"
        for cat_info in self.pattern_categories.values()
    ]
731
-
732
def anonymize_text_enhanced(self, original_text: str, lang: str = 'fa',
                            selected_categories: List[str] = None,
                            processing_mode: str = 'hybrid') -> str:
    """Anonymize *original_text* by combining regex and NER detection.

    Args:
        original_text: Text to anonymize.
        lang: ``'en'`` for English messages, anything else for Persian.
        selected_categories: UI category labels limiting the regex patterns;
            ``None`` or empty means all patterns.
        processing_mode: ``'regex_only'``, ``'hybrid'`` or ``'ner_priority'``.
            Forced to ``'regex_only'`` when the model is not ready.

    Returns:
        The text with every detected entity replaced by a code of the form
        ``<CATEGORY>_<NNN>_<REG|HYB|ENH>``, or a message starting with "❌"
        for empty input or an internal error. Never raises for ordinary
        exceptions.

    Side effects:
        Rebuilds ``self.mapping_table`` (original text -> code), which
        ``deanonymize_response`` uses later, and resets ``self.counters``.
    """
    try:
        if not original_text or not original_text.strip():
            return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"

        # Force regex_only if model not ready
        if not self.model_ready and processing_mode != 'regex_only':
            processing_mode = 'regex_only'
            print(f"🔄 Forced to regex_only mode because model not ready")

        # Reset
        self.mapping_table = {}
        self.reset_counters()

        # Extract entities with regex
        regex_entities = self.extract_entities_with_regex(original_text, selected_categories)

        # Extract entities with NER (if available)
        ner_entities = []
        if processing_mode != 'regex_only' and self.model_ready:
            # Convert to standard format. 'label' is carried along because
            # fuse_entities resolves the category from it; dropping it made
            # every hybrid / ner_priority run fail with KeyError('label').
            for entity in self.extract_entities_with_ner(original_text):
                ner_entities.append({
                    'text': entity['text'],
                    'label': entity['label'],
                    'category': self.map_ner_to_categories(entity['label']),
                    'start': entity['start'],
                    'end': entity['end'],
                    'confidence': entity['confidence'],
                    'source': 'ner'
                })

        # Fuse entities
        final_entities = self.fuse_entities(regex_entities, ner_entities, processing_mode)

        # Sort by length (longer first to avoid partial replacements)
        final_entities.sort(key=lambda x: len(x['text']), reverse=True)

        # Source indicator appended to every code; identical for all
        # entities of one run, so it is computed once.
        if processing_mode == 'regex_only':
            source_suffix = "REG"
        elif processing_mode == 'hybrid':
            source_suffix = "HYB" if self.model_ready else "REG"
        else:
            source_suffix = "ENH" if self.model_ready else "REG"

        # Create anonymization mapping: one code per distinct entity text.
        for entity in final_entities:
            entity_text = entity['text'].strip()
            category = entity['category']

            if entity_text in self.mapping_table or len(entity_text) < 2:
                continue

            # Generate unique code
            self.counters[category] = self.counters.get(category, 0) + 1
            code = f"{category}_{self.counters[category]:03d}_{source_suffix}"
            self.mapping_table[entity_text] = code

        # Apply anonymization, longest originals first so that a short
        # entity cannot break up a longer one that contains it.
        anonymized = original_text
        sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
        for original_item, code in sorted_items:
            anonymized = anonymized.replace(original_item, code)

        # Statistics
        regex_count = len(regex_entities)
        ner_count = len(ner_entities)
        final_count = len(final_entities)

        logger.info(f"✅ Enhanced anonymization completed. Mode: {processing_mode}")
        logger.info(f"📊 Regex: {regex_count}, NER: {ner_count}, Final: {final_count}")

        return anonymized

    except Exception as e:
        logger.error(f"Enhanced anonymization error: {e}")
        return f"❌ Error in enhanced anonymization: {str(e)}"
824
-
825
def send_to_chatgpt(self, anonymized_text, lang='fa'):
    """Step 2: send the anonymized text to ChatGPT and return its reply.

    Posts *anonymized_text* as the user message to the OpenAI chat
    completions endpoint using ``self.api_key``, with a 15 second timeout.

    Returns:
        The assistant's reply text on HTTP 200. Otherwise a message starting
        with "❌": for empty input, a missing API key, an API error response,
        or any exception raised along the way. Messages are in English when
        *lang* is ``'en'`` and in Persian otherwise, except the API error
        message, which is always English.
    """
    try:
        in_english = lang == 'en'

        if not anonymized_text or not anonymized_text.strip():
            if in_english:
                return "❌ Anonymized text is empty!"
            return "❌ متن ناشناس‌شده خالی است!"

        if not self.api_key:
            if in_english:
                return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable."
            return "❌ کلید API تنظیم نشده است!"

        if in_english:
            system_msg = "You are a professional analyst. Answer questions accurately."
        else:
            system_msg = "شما یک تحلیلگر حرفه‌ای هستید. به سوالات با دقت پاسخ دهید."

        request_headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        conversation = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": anonymized_text}
        ]
        payload = {
            "model": "gpt-4o-mini",
            "messages": conversation,
            "max_tokens": 2000,
            "temperature": 0.7
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=request_headers,
            json=payload,
            timeout=15  # Reduced timeout for HF Spaces
        )

        if response.status_code != 200:
            # Prefer the API's own error message; fall back to the raw body.
            error_data = response.json() if response.content else {}
            error_message = error_data.get('error', {}).get('message', response.text)
            return f"❌ API Error: {error_message}"

        result = response.json()
        return result['choices'][0]['message']['content']

    except Exception as e:
        return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"
868
-
869
def deanonymize_response(self, gpt_response, lang='fa'):
    """Step 3: restore the original entities in a ChatGPT reply.

    Replaces every code recorded in ``self.mapping_table`` with the original
    text it stands for, longest codes first.

    Returns:
        The restored text, or a message starting with "❌" when the reply or
        the mapping table is empty, or when an exception occurs. Messages
        are in English when *lang* is ``'en'`` and in Persian otherwise.
    """
    try:
        if not gpt_response or not gpt_response.strip():
            return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!"

        if not self.mapping_table:
            return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"

        # Invert the table: code -> original text.
        code_to_original = {}
        for original, code in self.mapping_table.items():
            code_to_original[code] = original

        restored = gpt_response
        for code in sorted(code_to_original, key=len, reverse=True):
            restored = restored.replace(code, code_to_original[code])

        return restored

    except Exception as e:
        return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}"
889
-
890
def get_model_status(self):
    """Build the Markdown status report for the system.

    Returns:
        A multi-line string covering the model status, the number of regex
        patterns in the pattern table, the Python version, whether
        transformers is available, the available processing modes and the
        number of pattern categories.
    """
    # Count the patterns actually defined instead of quoting a fixed figure,
    # which no longer matched the table.
    pattern_count = sum(
        len(pattern_list) for pattern_list in self.get_comprehensive_patterns().values()
    )

    parts = [
        "🚀 **Enhanced Multi-Modal Anonymization System Status:**\n\n",
        f"🤖 **Model Status**: {self.model_status}\n",
        f"📝 **Regex Patterns**: ✅ {pattern_count} comprehensive patterns loaded\n",
        "🌍 **Language Support**: Persian, English, Mixed\n",
        f"🐍 **Python Version**: {sys.version.split()[0]}\n",
        f"📦 **Transformers Available**: {'✅ Yes' if TRANSFORMERS_AVAILABLE else '❌ No'}\n\n",
    ]

    if self.model_ready:
        parts += [
            "🎯 **Available Processing Modes:**\n",
            " • 🔥 Hybrid (Recommended): Regex priority + NER enhancement\n",
            " • 🎯 NER Priority: NER priority + Regex backup\n",
            " • ⚡ Regex Only: High-speed pattern matching\n\n",
            "📈 **Expected Accuracy:**\n",
            " • Regex Only: 70-75%\n",
            " • Hybrid Mode: 85-92%\n",
            " • NER Priority: 88-95%\n\n",
        ]
    else:
        parts += [
            "⚠️ **Current Mode: Regex Only**\n",
            " • Pure Regex processing (70-75% accuracy)\n",
        ]
        if not TRANSFORMERS_AVAILABLE:
            parts += [
                " • Install transformers library for enhanced accuracy\n",
                " • pip install transformers torch\n",
            ]
        parts.append("\n")

    parts += [
        f"🎯 **Pattern Categories**: {len(self.pattern_categories)} categories available\n",
        "🔧 **Configuration**: User-controlled category selection\n",
        "🛡️ **Privacy**: Local processing with optional ChatGPT integration\n",
    ]

    if TRANSFORMERS_AVAILABLE:
        parts.append("✅ **Transformers Library**: Ready for NER processing\n")
    else:
        parts.append("❌ **Transformers Library**: Not available - Add to requirements.txt\n")

    return "".join(parts)
928
-
929
# Build the single module-level anonymizer at import time. The callback
# functions and the Gradio widgets defined further down in this file all read
# this shared `anonymizer` instance (its mapping table, model status, ...).
print("🔄 Initializing Enhanced Data Anonymizer...")
anonymizer = EnhancedDataAnonymizer()
print(f"✅ Anonymizer initialized with status: {anonymizer.model_status}")
933
-
934
def process_all_steps_enhanced(input_text, language, selected_categories, processing_mode):
    """Run the full pipeline: anonymize, ask ChatGPT, then restore.

    Args:
        input_text: Raw user text; ``None`` or blank text is rejected.
        language: UI language label; ``'English'`` selects English
            messages, anything else selects Persian.
        selected_categories: Pattern categories forwarded to the anonymizer.
        processing_mode: Processing mode forwarded to the anonymizer.

    Returns:
        A 4-tuple ``(status, anonymized_text, gpt_response, final_result)``.
        Slots that could not be produced are empty strings. If the ChatGPT
        step fails, the status still reports the successful anonymization,
        the third slot carries the "❌" error text and the last slot is empty.
    """
    lang = 'en' if language == 'English' else 'fa'

    # `not input_text` also covers None, which would otherwise raise
    # AttributeError on .strip() before the try block below is entered.
    if not input_text or not input_text.strip():
        error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
        return error_msg, "", "", ""

    try:
        start_time = time.time()

        # Enhanced anonymization
        anonymized_text = anonymizer.anonymize_text_enhanced(
            input_text, lang, selected_categories, processing_mode
        )

        # The anonymizer reports failures as text starting with "❌".
        if anonymized_text.startswith("❌"):
            return anonymized_text, "", "", ""

        # ChatGPT processing
        gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
        if gpt_response.startswith("❌"):
            # Anonymization worked even though ChatGPT did not: keep the
            # anonymized text and surface the ChatGPT error in its own slot.
            entities_found = len(anonymizer.mapping_table)

            success_msg = (f"✅ Enhanced anonymization completed successfully!\n"
                           f"🎯 Processing mode: {processing_mode}\n"
                           f"📊 Protected entities: {entities_found}")
            return success_msg, anonymized_text, gpt_response, ""

        # Deanonymization
        final_result = anonymizer.deanonymize_response(gpt_response, lang)

        total_time = time.time() - start_time
        entities_found = len(anonymizer.mapping_table)

        model_indicator = 'XLM-RoBERTa + Regex' if anonymizer.model_ready else 'Regex Only'

        success_msg = (f"🎉 Complete enhanced anonymization & restoration successful!\n"
                       f"🎯 Mode: {processing_mode} | 📊 Entities: {entities_found}\n"
                       f"⏱️ Time: {total_time:.2f}s | 🤖 Model: {model_indicator}")

        return success_msg, anonymized_text, gpt_response, final_result

    except Exception as e:
        error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
        return error_msg, "", "", ""
980
-
981
def get_mapping_table_enhanced(language):
    """Render the anonymizer's mapping table as a Markdown summary.

    Entries are grouped by the part of the placeholder code before its
    first underscore, and at most three entries per group are listed.

    Args:
        language: UI language label; ``'English'`` selects English for the
            localized lines, anything else selects Persian.

    Returns:
        The formatted summary, or a "❌" message when the table is empty.
    """
    lang = 'en' if language == 'English' else 'fa'

    mapping_table = anonymizer.mapping_table
    if not mapping_table:
        return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"

    preview_limit = 3  # entries listed per category before the remainder is summarized

    lines = [
        "🔋 **Enhanced Mapping Table:**\n\n",
        f"📊 **Statistics**: {len(mapping_table)} total entities\n",
        f"🎯 **Method**: {'Hybrid Processing' if anonymizer.model_ready else 'Regex Only'}\n",
        f"🤖 **Model Status**: {anonymizer.model_status}\n\n",
    ]

    # Group by category (the code prefix before the first underscore).
    grouped = {}
    for original, code in mapping_table.items():
        grouped.setdefault(code.split('_')[0], []).append((original, code))

    # Display results by category
    for category, items in grouped.items():
        lines.append(f"📁 **{category}** ({len(items)} items):\n")
        for original, code in items[:preview_limit]:
            # Codes containing HYB or ENH get the 🧠 marker, all others 📝.
            source_indicator = "🧠" if any(tag in code for tag in ("HYB", "ENH")) else "📝"
            lines.append(f" {source_indicator} `{original}` → `{code}`\n")

        hidden = len(items) - preview_limit
        if hidden > 0:
            # Follow the selected UI language, like the empty-table message
            # above, instead of always emitting the Persian wording.
            if lang == 'en':
                lines.append(f" ... and {hidden} more\n")
            else:
                lines.append(f" ... و {hidden} مورد دیگر\n")
        lines.append("\n")

    lines.append("🔥 **Enhanced System**: Advanced Regex patterns with optional NER support!")

    return "".join(lines)
1016
-
1017
def clear_all_enhanced():
    """Reset the shared anonymizer state and blank the five bound UI fields.

    Returns:
        Five empty strings, one for each output component wired to the
        clear button.
    """
    # Rebind to a fresh dict (rather than clearing in place) and restart the
    # anonymizer's counters.
    anonymizer.mapping_table = dict()
    anonymizer.reset_counters()

    blank = ""
    return blank, blank, blank, blank, blank
1022
-
1023
# Custom stylesheet handed to gr.Blocks(css=...) when the interface is built.
# The selectors enhanced-header, mode-selector, model-status, workflow and rtl
# are referenced by the HTML snippets / elem_classes of the interface; the two
# @media rules collapse the four-column .workflow grid to two and then one
# column on narrower viewports.
enhanced_css = """
body, .gradio-container {
font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
min-height: 100vh !important;
padding: 20px !important;
}

.enhanced-header {
background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
border-radius: 20px !important;
padding: 20px !important;
margin-bottom: 20px !important;
text-align: center !important;
box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
}

.mode-selector {
background: linear-gradient(135deg, #74b9ff, #0984e3) !important;
border-radius: 15px !important;
padding: 20px !important;
margin: 15px 0 !important;
box-shadow: 0 8px 25px rgba(116, 185, 255, 0.3) !important;
}

.model-status {
background: linear-gradient(135deg, #00b894, #00a085) !important;
border-radius: 15px !important;
padding: 15px !important;
margin: 15px 0 !important;
color: white !important;
font-weight: bold !important;
text-align: center !important;
box-shadow: 0 6px 20px rgba(0, 184, 148, 0.4) !important;
}

.rtl {
direction: rtl !important;
text-align: right !important;
}

.ltr {
direction: ltr !important;
text-align: left !important;
}

.workflow {
display: grid !important;
grid-template-columns: 1fr 1fr 1fr 1fr !important;
gap: 25px !important;
padding: 30px !important;
align-items: start !important;
background: rgba(255, 255, 255, 0.1) !important;
border-radius: 20px !important;
backdrop-filter: blur(10px) !important;
}

.gradio-textbox {
border-radius: 10px !important;
box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
min-height: 380px !important;
max-height: 380px !important;
height: 380px !important;
}

.gradio-button {
border-radius: 25px !important;
font-weight: bold !important;
transition: all 0.3s ease !important;
margin: 5px 0 !important;
min-height: 50px !important;
background: linear-gradient(45deg, #667eea, #764ba2) !important;
border: none !important;
color: white !important;
}

.gradio-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(0,0,0,0.3) !important;
background: linear-gradient(45deg, #764ba2, #667eea) !important;
}

@media (max-width: 1200px) {
.workflow {
grid-template-columns: 1fr 1fr !important;
}
}

@media (max-width: 768px) {
.workflow {
grid-template-columns: 1fr !important;
}
}
"""
1118
-
1119
# Main Gradio interface: header, language/mode selectors, model status,
# category picker, the four-column workflow and two on-demand info panels.
with gr.Blocks(title="🚀 Enhanced Multi-Modal Anonymization", theme=gr.themes.Soft(), css=enhanced_css) as app:

    # Header
    with gr.Row():
        gr.HTML("""
<div class="enhanced-header">
<h1 style='color: white; font-size: 3em; margin: 0; text-shadow: 2px 2px 4px rgba(0,0,0,0.5);'>
🚀 Enhanced Multi-Modal Anonymization System
</h1>
<p style='color: white; font-size: 1.2em; margin: 10px 0 0 0; text-shadow: 1px 1px 2px rgba(0,0,0,0.5);'>
🤖 Advanced Regex + Optional NER = Maximum Accuracy
</p>
</div>
""")

    # Language and Mode Selection
    with gr.Row():
        with gr.Column(scale=1):
            language_selector = gr.Radio(
                choices=["فارسی", "English"],
                value="فارسی",
                label="Language / زبان",
                interactive=True
            )

        with gr.Column(scale=2, elem_classes="mode-selector"):
            processing_mode = gr.Radio(
                choices=[
                    ("⚡ Regex Only (Fast & Compatible)", "regex_only"),
                    ("🎯 Hybrid Mode (Recommended)", "hybrid"),
                    ("🔬 NER Priority (Highest Accuracy)", "ner_priority")
                ],
                # Default to the regex-only mode when no NER model is loaded.
                value="regex_only" if not anonymizer.model_ready else "hybrid",
                label="🎚️ Processing Mode",
                info="Choose processing complexity vs accuracy trade-off"
            )

    # Model Status Display
    with gr.Row():
        model_status_display = gr.HTML(
            f'<div class="model-status">🤖 Model Status: {anonymizer.model_status}</div>'
        )

    # Category Selection (every category is pre-selected)
    with gr.Row():
        with gr.Column():
            pattern_categories = gr.CheckboxGroup(
                choices=anonymizer.get_category_choices('fa'),
                value=anonymizer.get_category_choices('fa'),
                label="🎯 انتخاب دسته‌بندی‌های الگوی ناشناس‌سازی:",
                interactive=True
            )

    # Main Workflow: input -> anonymized text -> ChatGPT answer -> restored answer
    with gr.Row(elem_classes="workflow rtl") as workflow_row:
        with gr.Column():
            step1_title = gr.HTML('<h2 style="direction: rtl;">📝 متن ورودی</h2>')
            input_text = gr.Textbox(
                lines=15,
                placeholder="متن اصلی خود را اینجا وارد کنید...\n\n🚀 سیستم پیشرفته با الگوهای regex جامع\n✅ دقت بالا برای نام اشخاص، شرکت‌ها، مکان‌ها\n📱 شناسایی دقیق تلفن، ایمیل، حساب بانکی\n💰 تشخیص مبالغ مالی و درصدها\n🗓️ استخراج تاریخ‌ها و زمان‌ها",
                label="",
                rtl=True
            )

            process_btn = gr.Button("🚀 پردازش پیشرفته", variant="primary")
            clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")

            status = gr.Textbox(
                label="وضعیت پردازش",
                lines=4,
                interactive=False,
                rtl=True
            )

        with gr.Column():
            step2_title = gr.HTML('<h2 style="direction: rtl;">🎭 متن ناشناس‌شده</h2>')
            anonymized_output = gr.Textbox(
                lines=15,
                placeholder="متن ناشناس‌شده با کدهای محافظتی...",
                label="",
                interactive=False,
                rtl=True
            )

        with gr.Column():
            step3_title = gr.HTML('<h2 style="direction: rtl;">🤖 پاسخ ChatGPT</h2>')
            gpt_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ ChatGPT به متن ناشناس‌شده...",
                label="",
                interactive=False,
                rtl=True
            )

        with gr.Column():
            step4_title = gr.HTML('<h2 style="direction: rtl;">✅ پاسخ نهایی</h2>')
            final_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ نهایی با بازگردانی اطلاعات اصلی...",
                label="",
                interactive=False,
                rtl=True
            )

    # Additional Tools (both panels start hidden and are revealed on click)
    with gr.Row():
        with gr.Column():
            mapping_btn = gr.Button("📋 نمایش جدول نگاشت پیشرفته")
            mapping_output = gr.Textbox(
                lines=15,
                label="جدول نگاشت اطلاعات",
                interactive=False,
                visible=False,
                rtl=True
            )

        with gr.Column():
            system_status_btn = gr.Button("📊 نمایش وضعیت سیستم پیشرفته")
            system_status_output = gr.Textbox(
                lines=20,
                label="وضعیت سیستم",
                interactive=False,
                visible=False,
                rtl=True
            )

    # Event Handlers
    process_btn.click(
        fn=process_all_steps_enhanced,
        inputs=[input_text, language_selector, pattern_categories, processing_mode],
        outputs=[status, anonymized_output, gpt_output, final_output]
    )

    clear_btn.click(
        fn=clear_all_enhanced,
        outputs=[input_text, anonymized_output, gpt_output, final_output, status]
    )

    # One listener per button: the panel's text and its visibility are sent in
    # a single update, instead of two independent listeners writing to the
    # same output component.
    mapping_btn.click(
        fn=lambda language: gr.update(value=get_mapping_table_enhanced(language), visible=True),
        inputs=[language_selector],
        outputs=[mapping_output]
    )

    system_status_btn.click(
        fn=lambda: gr.update(value=anonymizer.get_model_status(), visible=True),
        outputs=[system_status_output]
    )
1278
-
1279
if __name__ == "__main__":
    # Startup banner in the application log.
    logger.info("🚀 Starting Enhanced Multi-Modal Anonymization System...")
    logger.info(f"🤖 XLM-RoBERTa Status: {anonymizer.model_status}")
    logger.info("✅ Ready for high-accuracy bilingual processing!")

    # Listen on every interface, port 7860, with no public share link;
    # show_error is passed through to Gradio as in the launch call's keywords.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    app.launch(**launch_options)