leilaghomashchi commited on
Commit
31cfccf
·
verified ·
1 Parent(s): 83f0287

Delete app1.py

Browse files
Files changed (1) hide show
  1. app1.py +0 -1801
app1.py DELETED
@@ -1,1801 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- 🚀 Enhanced Bilingual Data Anonymization Benchmark System
5
- ====================================================================
6
-
7
- نسخه ساده‌شده فقط با قابلیت بنچمارک پیشرفته
8
- """
9
-
10
- import gradio as gr
11
- import pandas as pd
12
- import numpy as np
13
- import json
14
- import time
15
- import os
16
- import re
17
- import logging
18
- import requests
19
- from datetime import datetime
20
- from functools import lru_cache
21
- from packaging import version
22
- from typing import Dict, List, Tuple, Any
23
- import matplotlib.pyplot as plt
24
- import plotly.express as px
25
- import plotly.graph_objects as go
26
- from plotly.subplots import make_subplots
27
- import warnings
28
- import gc
29
- import threading
30
- from collections import defaultdict
31
-
32
- # Enhanced metrics imports
33
# Optional-dependency probes: features degrade gracefully when a package is absent.
try:
    import psutil
except ImportError:
    PSUTIL_AVAILABLE = False
else:
    PSUTIL_AVAILABLE = True

try:
    from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
except ImportError:
    SKLEARN_AVAILABLE = False
else:
    SKLEARN_AVAILABLE = True

warnings.filterwarnings('ignore')

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Font stack capable of rendering Persian text in matplotlib figures
plt.rcParams['font.family'] = ['Arial Unicode MS', 'Tahoma', 'sans-serif']
-
54
- # =============================================================================
55
- # بخش 1: سیستم اصلی نام‌نشان‌سازی (برای بنچمارک)
56
- # =============================================================================
57
-
58
def auto_setup_models():
    """Download any required local NER models that are missing.

    Returns True when setup ran to completion (individual download
    failures are logged and their partial directories removed), False
    when transformers is unavailable or a fatal error occurs.
    """
    models_dir = "./models"
    required_models = {
        'bert-fa-ner': 'HooshvareLab/bert-fa-zwnj-base-ner',
        'bert-base-NER': 'dslim/bert-base-NER',
    }

    # A model counts as missing when its directory is absent or empty.
    missing_models = [
        name for name in required_models
        if not os.path.exists(os.path.join(models_dir, name))
        or not os.listdir(os.path.join(models_dir, name))
    ]

    if not missing_models:
        logger.info("✅ All models are already available")
        return True

    logger.info(f"📥 Auto-downloading missing models: {missing_models}")

    try:
        from transformers import AutoTokenizer, AutoModelForTokenClassification
        os.makedirs(models_dir, exist_ok=True)

        for model_name in missing_models:
            hf_repo = required_models[model_name]
            model_path = os.path.join(models_dir, model_name)
            logger.info(f"📥 Downloading {model_name} from {hf_repo}...")
            try:
                tokenizer = AutoTokenizer.from_pretrained(hf_repo)
                model = AutoModelForTokenClassification.from_pretrained(hf_repo)
                tokenizer.save_pretrained(model_path)
                model.save_pretrained(model_path)
                logger.info(f"✅ {model_name} downloaded successfully")
                # Free model weights immediately; they are only needed on disk.
                del tokenizer, model
            except Exception as e:
                logger.error(f"❌ Failed to download {model_name}: {e}")
                # Remove a half-written model directory so the next run retries.
                if os.path.exists(model_path):
                    import shutil
                    shutil.rmtree(model_path)

        logger.info("🎉 Auto-setup completed!")
        return True

    except ImportError:
        logger.error("❌ transformers library not available for auto-download")
        return False
    except Exception as e:
        logger.error(f"❌ Auto-setup failed: {e}")
        return False
108
-
109
# Run model auto-setup once at import time; any failure is non-fatal.
try:
    auto_setup_models()
except Exception as exc:
    logger.warning(f"⚠️ Auto-setup encountered an issue: {exc}")
    logger.info("ℹ️ Continuing with manual setup...")
115
-
116
class BilingualDataAnonymizer:
    """Core bilingual (Persian/English) anonymization system, used by the benchmark.

    Entities are detected with local HuggingFace NER pipelines (when
    available) plus a large regex pattern set, then replaced in the text
    by numbered placeholder codes recorded in ``mapping_table``.
    """

    def __init__(self):
        # original-text -> placeholder-code mapping, rebuilt per anonymize_text() call
        self.mapping_table = {}
        # per-category counters used to number placeholder codes (e.g. PERSON_001)
        self.counters = {
            'COMPANY': 0, 'PERSON': 0, 'AMOUNT': 0, 'ACCOUNT': 0,
            'DATE': 0, 'STOCK_SYMBOL': 0, 'PETROCHEMICAL': 0,
            'PRODUCT': 0, 'PERCENTAGE': 0, 'LOCATION': 0,
            'VOLUME': 0, 'PHONE': 0, 'EMAIL': 0, 'ID_NUMBER': 0,
            'FINANCIAL_TERMS': 0, 'BUSINESS_TERMS': 0, 'RATIOS': 0
        }

        # NOTE(review): read but never used in this class — presumably left over
        # from an LLM integration; confirm before removing.
        self.api_key = os.getenv("OPENAI_API_KEY", "")
        self.models_base_path = "./models"
        self.models_loaded = False
        # human-readable per-model load status strings keyed by model name
        self.model_status = {}
        self.load_local_ner_models()

    def ensure_models_directory(self):
        """Create the local models directory if needed; return False on failure."""
        if not os.path.exists(self.models_base_path):
            try:
                os.makedirs(self.models_base_path, exist_ok=True)
                logger.info(f"📁 Created models directory: {self.models_base_path}")
            except Exception as e:
                logger.error(f"❌ Failed to create models directory: {e}")
                return False
        return True

    def download_model_if_missing(self, local_name, hf_repo):
        """Fetch *hf_repo* into the local models dir unless already present.

        Returns a (success: bool, message: str) tuple.
        """
        model_path = os.path.join(self.models_base_path, local_name)
        if os.path.exists(model_path) and os.listdir(model_path):
            return True, f"Model {local_name} already exists"
        try:
            logger.info(f"📥 Auto-downloading {local_name} from {hf_repo}...")
            from transformers import AutoTokenizer, AutoModelForTokenClassification
            tokenizer = AutoTokenizer.from_pretrained(hf_repo)
            model = AutoModelForTokenClassification.from_pretrained(hf_repo)
            tokenizer.save_pretrained(model_path)
            model.save_pretrained(model_path)
            logger.info(f"✅ {local_name} auto-downloaded successfully")
            return True, f"Downloaded {local_name}"
        except Exception as e:
            logger.error(f"❌ Auto-download failed for {local_name}: {e}")
            return False, str(e)

    def _load_pipeline(self, task, model_path, tokenizer_path=None):
        """Load a transformers pipeline from local files, handling version differences.

        ``aggregation_strategy`` is only passed on transformers >= 4.11.0,
        where the parameter was introduced. Returns None on any failure.
        """
        try:
            from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, __version__ as tr_version

            supports_agg = version.parse(tr_version) >= version.parse("4.11.0")

            if tokenizer_path:
                tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

            model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)

            pipeline_kwargs = {
                "model": model,
                "tokenizer": tokenizer,
                "device": -1  # CPU only
            }

            if supports_agg:
                pipeline_kwargs["aggregation_strategy"] = "simple"

            return pipeline(task, **pipeline_kwargs)

        except Exception as e:
            logger.error(f"❌ Failed to load pipeline for {model_path}: {e}")
            return None

    def load_local_ner_models(self):
        """Load Persian and English NER pipelines, auto-downloading when missing.

        Sets ``self.persian_ner`` / ``self.english_ner`` (None on failure),
        populates ``self.model_status``, and sets ``self.models_loaded`` when
        at least one model loaded.
        """
        logger.info("📄 Loading local NER models with auto-download...")
        if not self.ensure_models_directory():
            self.models_loaded = False
            self.model_status['directory'] = "❌ Cannot create models directory"
            return

        try:
            # Verify the heavy dependencies exist before touching model files.
            try:
                import torch
                from transformers import AutoTokenizer, AutoModelForTokenClassification
                transformers_available = True
                logger.info("✅ Transformers library available")
            except ImportError as e:
                transformers_available = False
                self.model_status['transformers'] = f"❌ Transformers library not installed: {str(e)}"
                self.models_loaded = False
                return

            # Persian model
            persian_model_path = os.path.join(self.models_base_path, "bert-fa-ner")
            self.download_model_if_missing("bert-fa-ner", "HooshvareLab/bert-fa-zwnj-base-ner")
            if os.path.exists(persian_model_path) and os.listdir(persian_model_path):
                try:
                    self.persian_ner = self._load_pipeline("ner", persian_model_path)
                    if self.persian_ner:
                        self.model_status['persian'] = f"✅ Local Persian NER: {persian_model_path}"
                    else:
                        self.model_status['persian'] = f"❌ Failed to load Persian model: {persian_model_path}"
                except Exception as e:
                    self.persian_ner = None
                    self.model_status['persian'] = f"❌ Persian model loading error: {str(e)[:100]}"
            else:
                self.persian_ner = None
                self.model_status['persian'] = f"❌ Persian model not found: {persian_model_path}"

            # English model
            english_model_path = os.path.join(self.models_base_path, "bert-base-NER")
            self.download_model_if_missing("bert-base-NER", "dslim/bert-base-NER")
            if os.path.exists(english_model_path) and os.listdir(english_model_path):
                try:
                    self.english_ner = self._load_pipeline("ner", english_model_path)
                    if self.english_ner:
                        self.model_status['english'] = f"✅ Local English NER: {english_model_path}"
                    else:
                        self.model_status['english'] = f"❌ Failed to load English model: {english_model_path}"
                except Exception as e:
                    self.english_ner = None
                    self.model_status['english'] = f"❌ English model loading error: {str(e)[:100]}"
            else:
                self.english_ner = None
                self.model_status['english'] = f"❌ English model not found: {english_model_path}"

            # "loaded" means at least one pipeline came up; otherwise fall back to regex.
            loaded_models = sum(1 for status in self.model_status.values() if status.startswith("✅"))
            self.models_loaded = loaded_models > 0
            if loaded_models == 0:
                self.model_status['fallback'] = "⚠️ Using regex-only mode (no local models found)"

        except Exception as e:
            self.models_loaded = False
            self.model_status['critical'] = f"❌ Critical error: {str(e)[:100]}..."

    def detect_language(self, text):
        """Classify *text* as 'fa', 'en', or 'mixed' by script-character ratio."""
        if not text:
            return 'fa'

        persian_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        total = persian_chars + english_chars

        if total == 0:
            # no letters of either script: default to Persian
            return 'fa'

        if persian_chars / total > 0.6:
            return 'fa'
        elif english_chars / total > 0.6:
            return 'en'
        else:
            return 'mixed'

    def extract_entities_with_ner(self, text, lang='fa'):
        """Extract entities using the local NER pipelines.

        Runs the Persian and/or English pipeline depending on *lang*,
        normalizes both aggregated ('entity_group') and per-token
        ('entity') pipeline output shapes, and de-duplicates by
        (lowercased text, start, end). Returns a list of entity dicts.
        """
        entities = []

        if not self.models_loaded:
            logger.info("ℹ️ Local NER models not available - using regex only")
            return entities

        try:
            # Local Persian model
            if lang in ['fa', 'mixed'] and hasattr(self, 'persian_ner') and self.persian_ner:
                try:
                    persian_results = self.persian_ner(text)
                    for entity in persian_results:
                        if isinstance(entity, dict):
                            # aggregated pipelines emit 'entity_group'; raw ones emit 'entity'
                            if 'entity_group' in entity:
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity_group'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_persian_ner'
                                })
                            else:
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_persian_ner'
                                })
                    logger.info(f"Local Persian NER found {len(persian_results)} entities")
                except Exception as e:
                    logger.error(f"Local Persian NER extraction error: {e}")

            # Local English model
            if lang in ['en', 'mixed'] and hasattr(self, 'english_ner') and self.english_ner:
                try:
                    english_results = self.english_ner(text)
                    for entity in english_results:
                        if isinstance(entity, dict):
                            if 'entity_group' in entity:
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity_group'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_english_ner'
                                })
                            else:
                                entities.append({
                                    'text': entity['word'].strip(),
                                    'label': entity['entity'],
                                    'start': entity['start'],
                                    'end': entity['end'],
                                    'confidence': entity['score'],
                                    'source': 'local_english_ner'
                                })
                    logger.info(f"Local English NER found {len(english_results)} entities")
                except Exception as e:
                    logger.error(f"Local English NER extraction error: {e}")

        except Exception as e:
            logger.error(f"Local NER extraction general error: {e}")

        # Remove duplicates (same text span reported by both models/shapes)
        unique_entities = []
        seen = set()
        for entity in entities:
            key = (entity['text'].lower(), entity['start'], entity['end'])
            if key not in seen:
                seen.add(key)
                unique_entities.append(entity)

        logger.info(f"Total unique entities found by local models: {len(unique_entities)}")
        return unique_entities

    def map_ner_to_categories(self, ner_label, source=''):
        """Map an NER tag (with or without B-/I- prefix) to a system category.

        Unknown labels fall back to 'BUSINESS_TERMS'.
        """
        mapping = {
            'PER': 'PERSON', 'PERSON': 'PERSON',
            'ORG': 'COMPANY', 'ORGANIZATION': 'COMPANY',
            'LOC': 'LOCATION', 'LOCATION': 'LOCATION',
            'MISC': 'BUSINESS_TERMS', 'MISCELLANEOUS': 'BUSINESS_TERMS',
            'B-PER': 'PERSON', 'I-PER': 'PERSON',
            'B-ORG': 'COMPANY', 'I-ORG': 'COMPANY',
            'B-LOC': 'LOCATION', 'I-LOC': 'LOCATION',
            'B-MISC': 'BUSINESS_TERMS', 'I-MISC': 'BUSINESS_TERMS',
            'MONEY': 'AMOUNT', 'PERCENT': 'PERCENTAGE',
            'DATE': 'DATE', 'TIME': 'DATE'
        }
        return mapping.get(ner_label.upper(), 'BUSINESS_TERMS')

    def anonymize_text(self, original_text, lang='fa'):
        """Step 1: anonymize *original_text* — benchmark entry point.

        Pipeline: detect language -> NER extraction (if models loaded) ->
        prioritized regex extraction -> longest-first placeholder
        substitution. Returns the anonymized text, or an error string
        (localized by *lang*) prefixed with '❌' on failure.
        """
        try:
            if not original_text or not original_text.strip():
                return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"

            # Reset per-call state
            self.mapping_table = {}
            self.counters = {key: 0 for key in self.counters.keys()}

            anonymized = original_text
            found_entities = set()

            # Language detection
            detected_lang = self.detect_language(original_text)
            logger.info(f"Detected language: {detected_lang}")

            # Stage 1: local NER extraction
            if self.models_loaded:
                logger.info("🤖 Running local NER extraction...")
                ner_entities = self.extract_entities_with_ner(original_text, detected_lang)

                for entity in ner_entities:
                    # Keep only confident, multi-character, not-yet-seen entities
                    if (entity['text'] not in found_entities and
                        len(entity['text'].strip()) > 1 and
                        entity['confidence'] > 0.5):

                        category = self.map_ner_to_categories(entity['label'], entity['source'])

                        if entity['text'] not in self.mapping_table:
                            self.counters[category] += 1
                            code = f"{category}_{self.counters[category]:03d}_LOCAL_NER"
                            self.mapping_table[entity['text']] = code
                            found_entities.add(entity['text'])
                            logger.info(f"Local NER: {entity['text']} -> {code}")
            else:
                logger.info("ℹ️ Using regex-only mode")

            # Stage 2: regex patterns.
            # NOTE(review): `\a` inside the Persian character classes below is a
            # BEL escape, most likely intended as a literal 'a' — preserved as-is.
            patterns = {
                'STOCK_SYMBOL': [
                    r'نماد\s+([آ-ی\a-zA-Z0-9]+)',
                    r'(سبهان|غدیر|شتران|شپنا|پترول|فارس|خارک|پلاسکو|جم|کرمان|مارون|اراک|رازی|شازند|کاوه|بندر|پارس|خوزستان|ماهشهر|عسلویه)(?=\s|$|،|\.|\s+—)',
                    r'شرکت\s+([آ-ی\a-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+را|\s+به)',
                    r'پتروشیمی\s+([آ-ی\a-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+توان)',
                    r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)'
                ],
                'COMPANY': [
                    r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به|\s+طی)',
                    r'([آ-ی\a-zA-Z\s]+)\s+شرکت',
                    r'این\s+شرکت(?=\s|$|،|\.)',
                    r'(بانک\s+[آ-ی\a-zA-Z\s]+)',
                    r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
                ],
                'PERSON': [
                    r'آقای\s+([آ-ی\a-zA-Z]+(?:\s+[آ-ی\a-zA-Z]+)*)',
                    r'خانم\s+([آ-ی\a-zA-Z]+(?:\s+[آ-ی\a-zA-Z]+)*)',
                    r'مهندس\s+([آ-ی\a-zA-Z]+(?:\s+[آ-ی\a-zA-Z]+)*)',
                    r'دکتر\s+([آ-ی\a-zA-Z]+(?:\s+[آ-ی\a-zA-Z]+)*)',
                    r'([آ-ی\a-zA-Z]+\s+[آ-ی\a-zA-Z]+)(?=،\s+مدیرعامل|\s+مدیرعامل|\s+رئیس)',
                    r'مدیرعامل(?=\s|$|،|\.)',
                    r'سرپرست(?=\s+و|\s|$|،|\.)',
                    r'رئیس\s+هیأت‌مدیره',
                    r'وی(?=\s+ادامه|\s+اظهار|\s+گفت|\s+اعلام|\s+همچنین)'
                ],
                'AMOUNT': [
                    r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
                    r'\d+\s*تومان(?=\s+به\s+ازای|\s+فروش|\s+،)',
                    r'رقم\s+فعلی\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
                    r'رقم\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
                    r'به\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'از\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'برابر\s+با\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
                    r'\d+(?:میلیارد|میلیون)\s*تومان(?=\s+رسیده|\s+ثبت|\s+بوده|\s+،)',
                    r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
                    r'\d+(?:,\d{3})*\s*ریال',
                    r'€\d+(?:,\d{3})*(?:\.\d+)?'
                ],
                'PERCENTAGE': [
                    r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایین‌تر)?',
                    r'\d+(?:\.\d+)?\s*%',
                    r'معادل\s+\d+(?:\.\d+)?\s*درصد',
                    r'حدود\s+\d+(?:\.\d+)?\s*درصد',
                    r'با\s+\d+(?:\.\d+)?\s*درصد\s+افزایش',
                    r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
                    r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده)',
                    r'میزان\s+رشد(?=\s+نسبت|\s+معادل)',
                    r'افزایش\s+قابل‌توجهی',
                    r'بهبود\s+نسبی'
                ],
                'PHONE': [
                    r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
                    r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
                    r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
                    r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}',
                    r'[۰-۹0-9]{11}(?!\d)',
                    r'(?:\+98|0098)?[۰-۹0-9]{10}',
                    r'[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}'
                ],
                'EMAIL': [
                    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'نشانی[\s]*الکترونیک[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                    r'آدرس[\s]*ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
                ],
                'ACCOUNT': [
                    r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'شماره[\s]*حساب[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}',
                    r'[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{6,12}',
                    r'[۰-۹0-9]{2,4}[-\s]?[۰-۹0-9]{6,12}[-\s]?[۰-۹0-9]{2,4}',
                    r'واریز[\s]*(?:سود[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
                    r'سود[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}'
                ],
                'ID_NUMBER': [
                    r'IR[۰-۹0-9]{24}',
                    r'شبا[\s:]*IR[۰-۹0-9]{24}',
                    r'IBAN[\s:]*IR[۰-۹0-9]{24}',
                    r'شماره[\s]*شبا[\s:]*IR[۰-۹0-9]{24}',
                    r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
                    r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
                    r'National[\s]*(?:ID[\s:]*)?[0-9]{10}',
                    r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}',
                    r'(?:Passport[\s:]*)?[A-Z][0-9]{8}',
                    r'(?:کارت[\s:]*)?(?:[۰-۹0-9]{4}[-\s]?){3}[۰-۹0-9]{4}',
                    r'(?:Card[\s:]*)?(?:[0-9]{4}[-\s]?){3}[0-9]{4}'
                ],
                'DATE': [
                    r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
                    r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
                    r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
                    r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})',
                    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}'
                ]
            }

            # Process patterns with prioritization — specific categories first
            # (dict order above), rejecting matches that overlap accepted spans.
            logger.info("🔍 Running prioritized regex extraction...")

            processed_entities = set()

            for category, pattern_list in patterns.items():
                for pattern in pattern_list:
                    matches = re.finditer(pattern, original_text, re.IGNORECASE | re.MULTILINE)
                    for match in matches:
                        # NOTE(review): `item` (group 1) is computed but only
                        # `full_match` is ever used below.
                        if match.groups():
                            item = match.group(1).strip()
                            full_match = match.group(0).strip()
                        else:
                            item = match.group(0).strip()
                            full_match = item

                        # Overlap check against previously accepted spans
                        overlaps = False
                        match_start, match_end = match.span()

                        for proc_start, proc_end in processed_entities:
                            if not (match_end <= proc_start or match_start >= proc_end):
                                overlaps = True
                                break

                        if (not overlaps and
                            full_match not in found_entities and
                            full_match not in self.mapping_table and
                            len(full_match) >= 2):

                            self.counters[category] += 1
                            code = f"{category}_{self.counters[category]:03d}_REGEX"
                            self.mapping_table[full_match] = code
                            found_entities.add(full_match)
                            processed_entities.add((match_start, match_end))
                            logger.info(f"Regex ({category}): {full_match} -> {code}")

            # Substitute longest entities first so shorter substrings of an
            # already-replaced entity cannot corrupt its placeholder.
            sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
            for original_item, code in sorted_items:
                anonymized = anonymized.replace(original_item, code)

            logger.info(f"✅ Anonymization completed. Found {len(self.mapping_table)} entities.")
            return anonymized

        except Exception as e:
            return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در نام‌نشان‌سازی: {str(e)}"
554
-
555
- # =============================================================================
556
- # بخش 2: رابط کاربری Enhanced Benchmark
557
- # =============================================================================
558
-
559
# UI string table: per-language labels and messages for the Gradio interface.
# Keys are looked up via EnhancedGradioBenchmarkInterface.get_text().
TEXTS = {
    'en': {
        'title': '🚀 Enhanced Bilingual Data Anonymization Benchmark',
        'subtitle': 'Comprehensive Performance Analysis for Privacy Protection Systems with Advanced Metrics',
        'upload_label': 'Upload Your Dataset',
        'upload_info': 'Supported formats: CSV, TXT, JSON (Max 10MB)',
        'language_label': 'Interface Language',
        'sample_size_label': 'Sample Size for Analysis',
        'sample_size_info': 'Larger samples give more accurate results but take longer',
        'run_button': '🚀 Run Enhanced Benchmark Analysis',
        'download_button': '📥 Download Results',
        'processing': '⏳ Processing your dataset... Please wait.',
        'error_no_file': '❌ Please upload a dataset file first.',
        'error_processing': '❌ Error processing file: {}',
        'success_message': '✅ Enhanced benchmark completed successfully!',
        'results_tab': 'Results Overview',
        'charts_tab': 'Performance Charts',
        'entities_tab': 'Entity Analysis',
        'details_tab': 'Detailed Report',
        'no_results': 'No results yet. Please run the benchmark first.',
    },
    'fa': {
        'title': '🚀 بنچمارک سیستم نام‌نشان‌سازی دوزبانه پیشرفته',
        'subtitle': 'تحلیل جامع عملکرد سیستم‌های حفاظت از حریم خصوصی با متریک‌های پیشرفته',
        'upload_label': 'آپلود دیتاست شما',
        'upload_info': 'فرمت‌های پشتیبانی شده: CSV، TXT، JSON (حداکثر ۱۰ مگابایت)',
        'language_label': 'زبان رابط کاربری',
        'sample_size_label': 'اندازه نمونه برای تحلیل',
        'sample_size_info': 'نمونه‌های بزرگ‌تر نتایج دقیق‌تری می‌دهند اما بیشتر طول می‌کشند',
        'run_button': '🚀 اجرای تحلیل بنچمارک پیشرفته',
        'download_button': '📥 دانلود نتایج',
        'processing': '⏳ در حال پردازش دیتاست شما... لطفاً صبر کنید.',
        'error_no_file': '❌ لطفاً ابتدا فایل دیتاست را آپلود کنید.',
        'error_processing': '❌ خطا در پردازش فایل: {}',
        'success_message': '✅ بنچمارک پیشرفته با موفقیت تکمیل شد!',
        'results_tab': 'خلاصه نتایج',
        'charts_tab': 'نمودارهای عملکرد',
        'entities_tab': 'تحلیل موجودیت‌ها',
        'details_tab': 'گزارش تفصیلی',
        'no_results': 'هنوز نتیجه‌ای وجود ندارد. لطفاً ابتدا بنچمارک را اجرا کنید.',
    }
}
601
-
602
- class EnhancedGradioBenchmarkInterface:
603
- """رابط کاربری Gradio برای بنچمارک پیشرفته"""
604
-
605
- def __init__(self):
606
- self.current_results = None
607
- self.current_language = 'fa'
608
- self.memory_baseline = None
609
- self.performance_history = []
610
- self.stress_test_active = False
611
-
612
- # راه‌اندازی anonymizer
613
- try:
614
- self.anonymizer = BilingualDataAnonymizer()
615
- self.system_ready = True
616
- except Exception as e:
617
- print(f"Error initializing anonymizer: {e}")
618
- self.system_ready = False
619
-
620
- def get_text(self, key):
621
- """دریافت متن بر اساس زبان فعلی"""
622
- return TEXTS[self.current_language].get(key, key)
623
-
624
- def change_language(self, language):
625
- """تغییر زبان رابط کاربری"""
626
- self.current_language = 'en' if language == 'English' else 'fa'
627
- return self.update_interface_texts()
628
-
629
- def update_interface_texts(self):
630
- """به‌روزرسانی متن‌های رابط کاربری"""
631
- return [
632
- gr.update(label=f"{self.get_text('upload_label')} - {self.get_text('upload_info')}"),
633
- gr.update(label=f"{self.get_text('sample_size_label')} - {self.get_text('sample_size_info')}"),
634
- gr.update(value=self.get_text('run_button')),
635
- gr.update(value=self.get_text('download_button')),
636
- ]
637
-
638
- def start_memory_monitoring(self):
639
- """شروع مانیتورینگ حافظه"""
640
- if PSUTIL_AVAILABLE:
641
- try:
642
- process = psutil.Process()
643
- self.memory_baseline = process.memory_info().rss / 1024 / 1024 # MB
644
- except:
645
- self.memory_baseline = 0
646
- else:
647
- self.memory_baseline = 0
648
-
649
- def get_memory_usage(self):
650
- """دریافت مصرف حافظه فعلی"""
651
- if not PSUTIL_AVAILABLE:
652
- return 0
653
- try:
654
- process = psutil.Process()
655
- current_memory = process.memory_info().rss / 1024 / 1024 # MB
656
- return current_memory - (self.memory_baseline or 0)
657
- except:
658
- return 0
659
-
660
- def calculate_classification_metrics(self, results):
661
- """محاسبه متریک‌های دقت کلاسیفیکیشن"""
662
- # ساخت متریک‌های ساده بدون sklearn
663
- total_entities = 0
664
- detected_entities = 0
665
- correct_detections = 0
666
- total_sentences = len(results)
667
- successful_sentences = 0
668
-
669
- for result in results:
670
- if not result.get('success', False):
671
- continue
672
-
673
- successful_sentences += 1
674
- original_text = result.get('original_preview', '')
675
- entities_found = result.get('entity_categories', {})
676
-
677
- # محاسبه ground truth
678
- ground_truth_categories = self.generate_ground_truth(original_text)
679
- predicted_categories = list(entities_found.keys())
680
-
681
- # شمارش entities
682
- total_entities += len(ground_truth_categories)
683
- detected_entities += len(predicted_categories)
684
-
685
- # شمارش تشخیص‌های صحیح
686
- for category in predicted_categories:
687
- if category in ground_truth_categories:
688
- correct_detections += 1
689
-
690
- # محاسبه متریک‌ها
691
- if detected_entities == 0:
692
- precision = 0.0
693
- else:
694
- precision = (correct_detections / detected_entities) * 100
695
-
696
- if total_entities == 0:
697
- recall = 0.0
698
- else:
699
- recall = (correct_detections / total_entities) * 100
700
-
701
- if precision + recall == 0:
702
- f1_score = 0.0
703
- else:
704
- f1_score = 2 * (precision * recall) / (precision + recall)
705
-
706
- if total_sentences == 0:
707
- accuracy = 0.0
708
- else:
709
- accuracy = (successful_sentences / total_sentences) * 100
710
-
711
- return {
712
- 'precision': round(precision, 1),
713
- 'recall': round(recall, 1),
714
- 'f1_score': round(f1_score, 1),
715
- 'accuracy': round(accuracy, 1)
716
- }
717
-
718
- def generate_ground_truth(self, text):
719
- """تولید ground truth بر اساس patterns موجود در متن"""
720
- ground_truth = []
721
-
722
- # الگوهای به��ودیافته برای تشخیص دقیق‌تر
723
- patterns = {
724
- 'EMAIL': [
725
- r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
726
- r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
727
- ],
728
- 'PHONE': [
729
- r'(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
730
- r'تلفن[\s:]*(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
731
- r'موبایل[\s:]*(?:0)?9[۰-۹0-9]{9}',
732
- ],
733
- 'ID_NUMBER': [
734
- r'IR[۰-۹0-9]{24}',
735
- r'شبا[\s:]*IR[۰-۹0-9]{24}',
736
- r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
737
- r'(?:کارت[\s:]*)?(?:[۰-۹0-9]{4}[-\s]?){3}[۰-۹0-9]{4}',
738
- ],
739
- 'AMOUNT': [
740
- r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
741
- r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
742
- r'\$\d+(?:,\d{3})*(?:\.\d+)?',
743
- r'\d+(?:,\d{3})*\s*ریال',
744
- ],
745
- 'PERCENTAGE': [
746
- r'\d+(?:\.\d+)?\s*درصد',
747
- r'\d+(?:\.\d+)?\s*%',
748
- r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
749
- ],
750
- 'PERSON': [
751
- r'آقای\s+[آ-ی\a-zA-Z]+',
752
- r'خانم\s+[آ-ی\a-zA-Z]+',
753
- r'مهندس\s+[آ-ی\a-zA-Z]+',
754
- r'دکتر\s+[آ-ی\a-zA-Z]+',
755
- r'مدیرعامل',
756
- r'سرپرست',
757
- ],
758
- 'COMPANY': [
759
- r'شرکت\s+[آ-ی\a-zA-Z\s]+',
760
- r'بانک\s+[آ-ی\a-zA-Z\s]+',
761
- r'[A-Z][a-zA-Z\s]+(?:Inc|Corp|Company|Ltd)',
762
- ],
763
- 'ACCOUNT': [
764
- r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
765
- r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
766
- ],
767
- 'DATE': [
768
- r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
769
- r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
770
- r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)',
771
- ]
772
- }
773
-
774
- import re
775
- for category, pattern_list in patterns.items():
776
- found = False
777
- for pattern in pattern_list:
778
- if re.search(pattern, text, re.IGNORECASE):
779
- found = True
780
- break
781
- if found:
782
- ground_truth.append(category)
783
-
784
- return ground_truth
785
-
786
- def calculate_scalability_score(self, results):
787
- """محاسبه امتیاز مقیاس‌پذیری"""
788
- if len(results) < 10:
789
- return 50.0
790
-
791
- processing_times = [r['processing_time_ms'] for r in results if r.get('success', False)]
792
-
793
- if len(processing_times) < 2:
794
- return 50.0
795
-
796
- x = np.arange(len(processing_times))
797
- slope = np.polyfit(x, processing_times, 1)[0]
798
-
799
- if slope <= 0:
800
- return 100.0
801
- elif slope < 1:
802
- return 90.0
803
- elif slope < 5:
804
- return 70.0
805
- elif slope < 10:
806
- return 50.0
807
- else:
808
- return 30.0
809
-
810
- def calculate_performance_degradation(self, results):
811
- """محاسبه کاهش عملکرد در طول زمان"""
812
- processing_times = [r['processing_time_ms'] for r in results if r.get('success', False)]
813
-
814
- if len(processing_times) < 10:
815
- return 0.0
816
-
817
- first_10_percent = int(len(processing_times) * 0.1)
818
- last_10_percent = int(len(processing_times) * 0.1)
819
-
820
- if first_10_percent == 0:
821
- return 0.0
822
-
823
- avg_first = np.mean(processing_times[:first_10_percent])
824
- avg_last = np.mean(processing_times[-last_10_percent:])
825
-
826
- degradation = ((avg_last - avg_first) / avg_first) * 100 if avg_first > 0 else 0
827
- return max(0, degradation)
828
-
829
def run_stress_test(self, sample_text, iterations=50):
    """Run a repeated-anonymization stress test on a single sample text.

    Calls ``self.anonymizer.anonymize_text`` *iterations* times, recording
    per-call wall-clock latency and post-call memory readings, and counts
    successes vs. failures (a result starting with "❌" counts as a failure).

    Args:
        sample_text: text fed to the anonymizer on every iteration.
        iterations: number of repetitions (default 50).

    Returns:
        dict with iteration counts, avg/max/min response time (ms),
        peak/average memory, and up to one truncated error string per
        failed iteration.
    """
    # Flag consumed elsewhere on the instance; set for the duration of the run.
    self.stress_test_active = True
    stress_results = {
        'total_iterations': iterations,
        'successful_iterations': 0,
        'failed_iterations': 0,
        'avg_response_time': 0,
        'max_response_time': 0,
        'min_response_time': float('inf'),
        'memory_peak': 0,
        'memory_average': 0,
        'errors': []
    }

    memory_readings = []
    response_times = []

    for i in range(iterations):
        try:
            start_time = time.time()
            # NOTE(review): start_memory is captured but never used below —
            # presumably intended for a per-call delta; confirm before removing.
            start_memory = self.get_memory_usage()

            result = self.anonymizer.anonymize_text(sample_text)

            end_time = time.time()
            end_memory = self.get_memory_usage()

            response_time = (end_time - start_time) * 1000  # ms
            response_times.append(response_time)
            memory_readings.append(end_memory)

            # The anonymizer signals errors in-band with a "❌" prefix.
            if not result.startswith("❌"):
                stress_results['successful_iterations'] += 1
            else:
                stress_results['failed_iterations'] += 1
                stress_results['errors'].append(f"Iteration {i+1}: {result[:100]}")

        except Exception as e:
            stress_results['failed_iterations'] += 1
            stress_results['errors'].append(f"Iteration {i+1}: {str(e)}")

        # Periodic GC to keep memory readings comparable across iterations.
        if i % 10 == 0:
            gc.collect()

    # Aggregate latency stats; defaults above remain if every call raised.
    if response_times:
        stress_results['avg_response_time'] = np.mean(response_times)
        stress_results['max_response_time'] = max(response_times)
        stress_results['min_response_time'] = min(response_times)

    if memory_readings:
        stress_results['memory_peak'] = max(memory_readings)
        stress_results['memory_average'] = np.mean(memory_readings)

    self.stress_test_active = False
    return stress_results
885
-
886
def calculate_advanced_efficiency(self, base_summary, classification_metrics,
                                 scalability_score, performance_degradation, memory_stats):
    """Compute the weighted advanced-efficiency score (0..100).

    Combines success rate, speed, classification quality, scalability,
    memory efficiency and degradation into a single weighted score.

    Returns:
        float clamped to the [0, 100] range.
    """
    weights = {
        'success_rate': 0.25,
        'speed': 0.20,
        'accuracy': 0.15,
        'precision': 0.10,
        'scalability': 0.10,
        'memory_efficiency': 0.10,
        'degradation': 0.10
    }

    avg_ms = base_summary['avg_processing_time_ms']

    # Per-component scores, keyed identically to `weights`.
    components = {
        'success_rate': base_summary['success_rate'] * 100,
        'speed': min(100, 1000 / avg_ms) * 100 if avg_ms > 0 else 0,
        'accuracy': classification_metrics.get('accuracy', 0),
        'precision': classification_metrics.get('precision', 0),
        'scalability': min(100, scalability_score),
        # Lower memory / lower degradation score higher.
        'memory_efficiency': max(0, 100 - memory_stats['avg_memory_per_sentence']),
        'degradation': max(0, 100 - performance_degradation),
    }

    weighted_total = sum(weights[key] * components[key] for key in weights)

    # Clamp the final score into [0, 100].
    return min(100, max(0, weighted_total))
919
-
920
def load_dataset(self, file_path):
    """Load candidate sentences from a CSV, JSON or plain-text file.

    CSV: picks a well-known text column (or falls back to the first one).
    JSON: accepts a list of strings, or a dict whose values are strings
    or lists of strings. Anything else is read line-by-line.

    Args:
        file_path: path to the dataset file; falsy values yield [].

    Returns:
        list[str] of stripped sentences longer than 10 characters;
        [] when nothing could be loaded (errors are printed, not raised).
    """
    if not file_path:
        return []

    try:
        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.csv':
            frame = pd.read_csv(file_path, encoding='utf-8')
            # Preferred text-column names (English + Persian variants).
            candidates = ['text', 'sentence', 'content', 'data', 'متن', 'جمله']
            column = next((name for name in candidates if name in frame.columns),
                          frame.columns[0])
            raw = frame[column].dropna().astype(str).tolist()

        elif extension == '.json':
            with open(file_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            raw = []
            if isinstance(payload, list):
                raw = [str(entry) for entry in payload if isinstance(entry, str)]
            elif isinstance(payload, dict):
                for entry in payload.values():
                    if isinstance(entry, list):
                        raw.extend(str(item) for item in entry if isinstance(item, str))
                    elif isinstance(entry, str):
                        raw.append(entry)

        else:  # any other extension: treat as plain text, one sentence per line
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw = [line.strip() for line in handle.read().split('\n')
                       if len(line.strip()) > 10]

        # Final normalization pass shared by all formats.
        return [item.strip() for item in raw if len(item.strip()) > 10]

    except Exception as exc:
        print(f"Error loading dataset: {exc}")
        return []
969
-
970
def run_enhanced_benchmark(self, file_obj, sample_size, progress=gr.Progress()):
    """Run the full enhanced benchmark over an uploaded dataset.

    Loads sentences, anonymizes each one while measuring latency, memory
    and detected entities, then derives classification metrics, scalability,
    degradation, a stress test and the advanced efficiency score. Results
    are stored in ``self.current_results`` and returned as Gradio outputs.

    Args:
        file_obj: uploaded file object (uses ``.name`` as path).
        sample_size: maximum number of sentences to process.
        progress: Gradio progress tracker. NOTE(review): mutable default
            evaluated at definition time — standard Gradio idiom, but confirm.

    Returns:
        11-tuple: status message, 4 result widgets (overview plot,
        performance plot, entity plot, markdown report) and 6 visibility
        updates; on failure, ``get_error_response`` output of the same shape.
    """

    if not file_obj:
        return self.get_error_response("No file uploaded")

    if not self.system_ready:
        return self.get_error_response("System not ready")

    try:
        progress(0.05, desc="Initializing enhanced benchmark...")

        self.start_memory_monitoring()

        progress(0.1, desc="Loading dataset...")
        sentences = self.load_dataset(file_obj.name)

        if not sentences:
            return self.get_error_response("Could not load sentences")

        # Cap workload at the user-selected sample size.
        if len(sentences) > sample_size:
            sentences = sentences[:sample_size]

        progress(0.15, desc=f"Processing {len(sentences)} sentences with enhanced metrics...")

        results = []
        start_time = time.time()
        memory_readings = []

        for i, sentence in enumerate(sentences):
            # Progress spans 0.15 .. 0.80 across the sentence loop.
            progress(0.15 + (0.65 * i / len(sentences)),
                    desc=f"Processing sentence {i+1}/{len(sentences)}")

            # Reset anonymizer state so each sentence is measured in isolation.
            self.anonymizer.mapping_table = {}
            self.anonymizer.counters = {key: 0 for key in self.anonymizer.counters.keys()}

            sent_start = time.time()
            memory_before = self.get_memory_usage()

            try:
                result = self.anonymizer.anonymize_text(sentence)
                processing_time = time.time() - sent_start
                memory_after = self.get_memory_usage()
                memory_used = memory_after - memory_before

                entities_found = len(self.anonymizer.mapping_table)
                # In-band error convention: "❌"-prefixed output means failure.
                success = not result.startswith("❌")

                # Tally entities by category, taken from the code prefix
                # before '_' (e.g. "NAME_1" -> "NAME").
                entity_categories = {}
                for entity, code in self.anonymizer.mapping_table.items():
                    category = code.split('_')[0] if '_' in code else 'OTHER'
                    entity_categories[category] = entity_categories.get(category, 0) + 1

                results.append({
                    'index': i + 1,
                    'success': success,
                    'processing_time_ms': processing_time * 1000,
                    'input_length': len(sentence),
                    'output_length': len(result),
                    'entities_found': entities_found,
                    'entity_categories': entity_categories,
                    'speed_chars_per_sec': len(sentence) / processing_time if processing_time > 0 else 0,
                    'memory_used_mb': memory_used,
                    'original_preview': sentence[:100] + "..." if len(sentence) > 100 else sentence,
                    'anonymized_preview': result[:100] + "..." if len(result) > 100 else result,
                })

                memory_readings.append(memory_after)

            except Exception as e:
                # Record the failure but keep processing remaining sentences.
                results.append({
                    'index': i + 1,
                    'success': False,
                    'error': str(e),
                    'processing_time_ms': (time.time() - sent_start) * 1000,
                    'input_length': len(sentence),
                    'entities_found': 0,
                    'entity_categories': {},
                    'speed_chars_per_sec': 0,
                    'memory_used_mb': 0
                })

        total_time = time.time() - start_time

        progress(0.85, desc="Calculating advanced metrics...")

        successful_results = [r for r in results if r.get('success', False)]

        if not successful_results:
            return self.get_error_response("No successful results")

        # Core throughput/latency aggregates over successful sentences only.
        base_summary = {
            'total_sentences': len(sentences),
            'successful_sentences': len(successful_results),
            'success_rate': len(successful_results) / len(sentences),
            'avg_processing_time_ms': np.mean([r['processing_time_ms'] for r in successful_results]),
            'total_entities': sum(r['entities_found'] for r in successful_results),
            'avg_entities_per_sentence': np.mean([r['entities_found'] for r in successful_results]),
            'avg_speed_chars_per_sec': np.mean([r['speed_chars_per_sec'] for r in successful_results]),
            'sentences_per_minute': len(successful_results) / (total_time / 60) if total_time > 0 else 0,
            'total_time_seconds': total_time
        }

        progress(0.90, desc="Computing classification metrics...")

        classification_metrics = self.calculate_classification_metrics(successful_results)

        progress(0.93, desc="Analyzing performance patterns...")

        scalability_score = self.calculate_scalability_score(successful_results)
        performance_degradation = self.calculate_performance_degradation(successful_results)

        memory_stats = {
            'avg_memory_per_sentence': np.mean([r.get('memory_used_mb', 0) for r in successful_results]),
            'peak_memory_usage': max(memory_readings) if memory_readings else 0,
            'total_memory_used': sum([r.get('memory_used_mb', 0) for r in successful_results])
        }

        progress(0.96, desc="Running stress test...")

        # Stress-test with the first sentence as a representative sample.
        if len(sentences) > 0:
            stress_results = self.run_stress_test(sentences[0], iterations=20)
        else:
            stress_results = {'error': 'No sample text for stress test'}

        advanced_efficiency = self.calculate_advanced_efficiency(
            base_summary, classification_metrics, scalability_score,
            performance_degradation, memory_stats
        )

        enhanced_summary = {
            **base_summary,
            **classification_metrics,
            'scalability_score': scalability_score,
            'performance_degradation': performance_degradation,
            'memory_stats': memory_stats,
            'stress_test_results': stress_results,
            'advanced_efficiency_score': advanced_efficiency,
            'efficiency_score': base_summary['success_rate'] * 100
        }

        # Persist for chart/report builders and the download handler.
        self.current_results = {
            'summary': enhanced_summary,
            'detailed_results': results,
            'timestamp': datetime.now().isoformat(),
            'benchmark_version': 'enhanced_v2.0'
        }

        progress(1.0, desc="Enhanced benchmark complete!")

        overview_plot = self.create_enhanced_overview_chart()
        performance_plot = self.create_enhanced_performance_charts()
        entity_plot = self.create_entity_analysis()
        detailed_report = self.create_enhanced_detailed_report()

        return (
            self.get_text('success_message') + f" (Enhanced v2.0 - {len(enhanced_summary)} metrics)",
            overview_plot,
            performance_plot,
            entity_plot,
            detailed_report,
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True)
        )

    except Exception as e:
        return self.get_error_response(f"Enhanced benchmark error: {str(e)}")
1141
-
1142
def create_enhanced_overview_chart(self):
    """Build the 3x3 Plotly indicator dashboard from the latest summary.

    Rows 1-2 are gauges (efficiency, accuracy, speed, memory, scalability,
    degradation); row 3 shows plain numbers (precision, recall, F1).

    Returns:
        plotly Figure, or None when no benchmark results exist yet.
    """
    if not self.current_results:
        return None

    summary = self.current_results['summary']

    fig = make_subplots(
        rows=3, cols=3,
        subplot_titles=[
            'Advanced Efficiency Score',
            'Classification Accuracy',
            'Processing Speed',
            'Memory Usage',
            'Scalability Score',
            'Performance Degradation',
            'Precision Score',
            'Recall Score',
            'F1 Score'
        ],
        specs=[[{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}],
               [{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}],
               [{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}]]
    )

    # Advanced efficiency gauge with red/yellow/green bands and a 90 threshold.
    fig.add_trace(go.Indicator(
        mode = "gauge+number",
        value = summary['advanced_efficiency_score'],
        domain = {'x': [0, 1], 'y': [0, 1]},
        gauge = {
            'axis': {'range': [None, 100]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, 60], 'color': "lightcoral"},
                {'range': [60, 80], 'color': "yellow"},
                {'range': [80, 100], 'color': "lightgreen"}],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': 90}
        }
    ), row=1, col=1)

    # Accuracy
    fig.add_trace(go.Indicator(
        mode = "gauge+number",
        value = summary.get('accuracy', 0),
        domain = {'x': [0, 1], 'y': [0, 1]},
        gauge = {
            'axis': {'range': [None, 100]},
            'bar': {'color': "green"},
            'steps': [{'range': [0, 100], 'color': "lightgray"}],
        }
    ), row=1, col=2)

    # Processing speed, inverted so lower latency scores higher.
    speed_score = min(100, 1000 / summary['avg_processing_time_ms']) if summary['avg_processing_time_ms'] > 0 else 0
    fig.add_trace(go.Indicator(
        mode = "gauge+number",
        value = speed_score,
        domain = {'x': [0, 1], 'y': [0, 1]},
        gauge = {
            'axis': {'range': [None, 100]},
            'bar': {'color': "orange"},
        }
    ), row=1, col=3)

    # Memory usage, inverted: less memory per sentence scores higher.
    memory_score = max(0, 100 - summary['memory_stats']['avg_memory_per_sentence'])
    fig.add_trace(go.Indicator(
        mode = "gauge+number",
        value = memory_score,
        domain = {'x': [0, 1], 'y': [0, 1]},
        gauge = {
            'axis': {'range': [None, 100]},
            'bar': {'color': "purple"},
        }
    ), row=2, col=1)

    # Scalability score
    fig.add_trace(go.Indicator(
        mode = "gauge+number",
        value = summary['scalability_score'],
        domain = {'x': [0, 1], 'y': [0, 1]},
        gauge = {
            'axis': {'range': [None, 100]},
            'bar': {'color': "cyan"},
        }
    ), row=2, col=2)

    # Degradation, inverted: less degradation scores higher.
    degradation_score = max(0, 100 - summary['performance_degradation'])
    fig.add_trace(go.Indicator(
        mode = "gauge+number",
        value = degradation_score,
        domain = {'x': [0, 1], 'y': [0, 1]},
        gauge = {
            'axis': {'range': [None, 100]},
            'bar': {'color': "red"},
        }
    ), row=2, col=3)

    # Row 3: plain numeric indicators for the classification metrics.
    fig.add_trace(go.Indicator(
        mode = "number",
        value = summary.get('precision', 0),
        number = {'suffix': "%"},
        domain = {'x': [0, 1], 'y': [0, 1]},
    ), row=3, col=1)

    fig.add_trace(go.Indicator(
        mode = "number",
        value = summary.get('recall', 0),
        number = {'suffix': "%"},
        domain = {'x': [0, 1], 'y': [0, 1]},
    ), row=3, col=2)

    fig.add_trace(go.Indicator(
        mode = "number",
        value = summary.get('f1_score', 0),
        number = {'suffix': "%"},
        domain = {'x': [0, 1], 'y': [0, 1]},
    ), row=3, col=3)

    fig.update_layout(
        height=900,
        title_text="📊 Enhanced Benchmark Overview - Advanced Metrics",
        title_font_size=20
    )

    return fig
1276
-
1277
def create_enhanced_performance_charts(self):
    """Build the 3x2 Plotly grid of per-sentence performance charts.

    Uses only successful rows from the detailed results: scatter plots for
    time-vs-memory and detection efficiency, a latency trend, histograms
    for memory and speed, and a composite performance score.

    Returns:
        plotly Figure, or None when there are no results / no successes.
    """
    if not self.current_results:
        return None

    results = self.current_results['detailed_results']
    df = pd.DataFrame([r for r in results if r.get('success', False)])

    if df.empty:
        return None

    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=[
            'Processing Time vs Memory Usage',
            'Scalability Analysis',
            'Entity Detection Efficiency',
            'Memory Usage Distribution',
            'Speed Distribution',
            'Advanced Performance Matrix'
        ]
    )

    # 1. Processing time vs memory; marker size/color encode entity count.
    fig.add_trace(go.Scatter(
        x=df['processing_time_ms'],
        y=df['memory_used_mb'],
        mode='markers',
        name='Time vs Memory',
        marker=dict(
            size=df['entities_found'],
            color=df['entities_found'],
            colorscale='Viridis',
            showscale=True
        )
    ), row=1, col=1)

    # 2. Latency trend over processing order (scalability proxy).
    fig.add_trace(go.Scatter(
        x=df.index,
        y=df['processing_time_ms'],
        mode='lines+markers',
        name='Time Trend',
        line=dict(color='red')
    ), row=1, col=2)

    # 3. Entities found as a function of input length.
    fig.add_trace(go.Scatter(
        x=df['input_length'],
        y=df['entities_found'],
        mode='markers',
        name='Detection Efficiency'
    ), row=2, col=1)

    # 4. Memory usage histogram.
    fig.add_trace(go.Histogram(
        x=df['memory_used_mb'],
        name='Memory Distribution',
        nbinsx=20
    ), row=2, col=2)

    # 5. Throughput histogram.
    fig.add_trace(go.Histogram(
        x=df['speed_chars_per_sec'],
        name='Speed Distribution',
        nbinsx=20
    ), row=3, col=1)

    # 6. Composite score: 40% entities, 30% speed, 30% (inverted) memory,
    # each normalized by its column maximum.
    performance_score = (
        (df['entities_found'] / df['entities_found'].max() * 40) +
        (df['speed_chars_per_sec'] / df['speed_chars_per_sec'].max() * 30) +
        ((df['memory_used_mb'].max() - df['memory_used_mb']) / df['memory_used_mb'].max() * 30)
    )

    fig.add_trace(go.Scatter(
        x=df.index,
        y=performance_score,
        mode='lines+markers',
        name='Performance Score',
        line=dict(color='green')
    ), row=3, col=2)

    fig.update_layout(
        height=1000,
        title_text="📈 Enhanced Performance Charts",
        title_font_size=20,
        showlegend=False
    )

    return fig
1368
-
1369
def create_entity_analysis(self):
    """Build pie + bar charts of detected entity categories.

    Aggregates ``entity_categories`` counts across all successful results.

    Returns:
        plotly Figure, or None when there are no results or no entities.
    """
    if not self.current_results:
        return None

    results = self.current_results['detailed_results']

    # Sum category counts over every successful sentence.
    all_categories = {}
    for result in results:
        if result.get('success', False):
            for category, count in result.get('entity_categories', {}).items():
                all_categories[category] = all_categories.get(category, 0) + count

    if not all_categories:
        return None

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "pie"}, {"type": "bar"}]],
        subplot_titles=[
            'Entity Types Distribution',
            'Entity Categories Count'
        ]
    )

    categories = list(all_categories.keys())
    values = list(all_categories.values())

    # Pie chart: relative share of each entity category.
    fig.add_trace(go.Pie(
        labels=categories,
        values=values,
        name="Entity Types"
    ), row=1, col=1)

    # Bar chart: absolute counts per category.
    fig.add_trace(go.Bar(
        x=categories,
        y=values,
        name="Count"
    ), row=1, col=2)

    fig.update_layout(
        height=500,
        title_text="🔍 Entity Analysis",
        title_font_size=20
    )

    return fig
1418
-
1419
def create_enhanced_detailed_report(self):
    """Render the detailed Markdown report in the current UI language.

    Persian ('fa') gets the full report including stress-test results and
    efficiency-based recommendations; any other language gets the English
    summary. Report templates are user-facing and intentionally bilingual.

    Returns:
        Markdown string, or the localized 'no results' message.
    """
    if not self.current_results:
        return self.get_text('no_results')

    summary = self.current_results['summary']

    if self.current_language == 'fa':
        report = f"""
# 📊 گزارش بنچمارک پیشرفته - نسخه ۲.۰

## خلاصه نتایج اصلی
- **کل جملات پردازش شده**: {summary['total_sentences']:,}
- **جملات موفق**: {summary['successful_sentences']:,}
- **نرخ موفقیت**: {summary['success_rate']*100:.1f}%
- **امتیاز کارایی پیشرفته**: {summary['advanced_efficiency_score']:.1f}/100

## 🎯 متریک‌های دقت کلاسیفیکیشن
- **دقت (Precision)**: {summary.get('precision', 0):.1f}%
- **بازخوانی (Recall)**: {summary.get('recall', 0):.1f}%
- **امتیاز F1**: {summary.get('f1_score', 0):.1f}%
- **صحت کلی (Accuracy)**: {summary.get('accuracy', 0):.1f}%

## ⚡ آمار عملکرد پیشرفته
- **متوسط زمان پردازش**: {summary['avg_processing_time_ms']:.1f} میلی‌ثانیه
- **امتیاز مقیاس‌پذیری**: {summary['scalability_score']:.1f}/100
- **کاهش عملکرد**: {summary['performance_degradation']:.1f}%
- **سرعت پردازش**: {summary['avg_speed_chars_per_sec']:.0f} کاراکتر/ثانیه

## 💾 آمار مصرف حافظه
- **متوسط حافظه هر جمله**: {summary['memory_stats']['avg_memory_per_sentence']:.2f} MB
- **حداکثر مصرف حافظه**: {summary['memory_stats']['peak_memory_usage']:.2f} MB
- **کل حافظه استفاده شده**: {summary['memory_stats']['total_memory_used']:.2f} MB

## 🔥 نتایج تست استرس
"""

        # Append stress-test numbers, or the error if the test could not run.
        stress_results = summary.get('stress_test_results', {})
        if 'error' not in stress_results:
            report += f"""
- **کل تکرارها**: {stress_results.get('total_iterations', 0)}
- **تکرارهای موفق**: {stress_results.get('successful_iterations', 0)}
- **تکرارهای ناموفق**: {stress_results.get('failed_iterations', 0)}
- **متوسط زمان پاسخ**: {stress_results.get('avg_response_time', 0):.1f} ms
- **حداکثر زمان پاسخ**: {stress_results.get('max_response_time', 0):.1f} ms
- **حداقل زمان پاسخ**: {stress_results.get('min_response_time', 0):.1f} ms
"""
        else:
            report += f"- **خطا در تست استرس**: {stress_results.get('error', 'نامشخص')}\n"

        # Recommendations based on the efficiency-score band.
        efficiency = summary['advanced_efficiency_score']
        if efficiency >= 80:
            report += """
✅ **سیستم شما عملکرد خوب تا عالی دارد!**
- ادامه مانیتورینگ و نگهداری منظم
- در نظر گیری optimization های ریز
- آماده‌سازی برای production deployment
"""
        elif efficiency >= 60:
            report += """
⚠️ **سیستم نیاز به بهبودهایی دارد:**
- بهینه‌سازی الگوریتم‌های تشخیص
- بهبود مدیریت حافظه
- افزایش دقت کلاسیفیکیشن
- کاهش زمان پردازش
"""
        else:
            report += """
🔧 **سیستم نیاز به بازنگری اساسی دارد:**
- بازطراحی architecture
- بهبود پایه‌ای الگوریتم‌ها
- افزایش منابع سخت‌افزاری
- training مجدد مدل‌ها
- پیاده‌سازی caching mechanism
"""

    else:
        report = f"""
# 📊 Advanced Benchmark Report - Version 2.0

## Main Results Summary
- **Total Sentences Processed**: {summary['total_sentences']:,}
- **Successful Sentences**: {summary['successful_sentences']:,}
- **Success Rate**: {summary['success_rate']*100:.1f}%
- **Advanced Efficiency Score**: {summary['advanced_efficiency_score']:.1f}/100

## 🎯 Classification Accuracy Metrics
- **Precision**: {summary.get('precision', 0):.1f}%
- **Recall**: {summary.get('recall', 0):.1f}%
- **F1 Score**: {summary.get('f1_score', 0):.1f}%
- **Overall Accuracy**: {summary.get('accuracy', 0):.1f}%

## ⚡ Advanced Performance Statistics
- **Average Processing Time**: {summary['avg_processing_time_ms']:.1f} ms
- **Scalability Score**: {summary['scalability_score']:.1f}/100
- **Performance Degradation**: {summary['performance_degradation']:.1f}%
- **Processing Speed**: {summary['avg_speed_chars_per_sec']:.0f} chars/sec

## 💾 Memory Usage Statistics
- **Average Memory per Sentence**: {summary['memory_stats']['avg_memory_per_sentence']:.2f} MB
- **Peak Memory Usage**: {summary['memory_stats']['peak_memory_usage']:.2f} MB
- **Total Memory Used**: {summary['memory_stats']['total_memory_used']:.2f} MB

**This comprehensive benchmark analyzed {summary['total_sentences']} sentences with {len(summary)} different metrics.**
"""

    return report
1527
-
1528
def get_error_response(self, error_msg):
    """Build the standard 11-element error tuple for the benchmark UI.

    Args:
        error_msg: human-readable error text, prefixed with "❌".

    Returns:
        (status message, 4 x None for the result widgets, 6 hide-updates).
    """
    hidden_widgets = [gr.update(visible=False) for _ in range(6)]
    return (f"❌ {error_msg}", None, None, None, None, *hidden_widgets)
1540
-
1541
def download_results(self):
    """Serialize the latest benchmark results to a timestamped JSON file.

    Returns:
        The written filename, or None when no results are available.
    """
    if not self.current_results:
        return None

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = f"enhanced_benchmark_results_{stamp}.json"

    # default=str keeps non-JSON-native values (e.g. numpy scalars) dumpable.
    with open(out_path, 'w', encoding='utf-8') as handle:
        json.dump(self.current_results, handle, ensure_ascii=False, indent=2, default=str)

    return out_path
1553
-
1554
- # =============================================================================
1555
- # بخش 3: ایجاد رابط کاربری
1556
- # =============================================================================
1557
-
1558
def create_benchmark_interface():
    """Build the Gradio Blocks UI for the benchmark-only application.

    Wires a language selector, file upload, sample-size slider and the
    run/download buttons to an ``EnhancedGradioBenchmarkInterface``
    instance; results render in four tabs (overview, performance,
    entity analysis, report).

    Returns:
        The configured ``gr.Blocks`` app (not yet launched).
    """

    enhanced_benchmark = EnhancedGradioBenchmarkInterface()

    # Shared CSS: gradient background, RTL/LTR helpers, rounded widgets.
    custom_css = """
    body, .gradio-container {
        font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
        min-height: 100vh !important;
        padding: 20px !important;
    }

    .rtl {
        direction: rtl !important;
        text-align: right !important;
    }

    .ltr {
        direction: ltr !important;
        text-align: left !important;
    }

    .gradio-textbox {
        border-radius: 10px !important;
        box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
    }

    .gradio-button {
        border-radius: 25px !important;
        font-weight: bold !important;
        transition: all 0.3s ease !important;
        margin: 5px 0 !important;
        min-height: 50px !important;
    }

    .gradio-button:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
    }

    h1, h2, h3 {
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
        margin-top: 0 !important;
        margin-bottom: 10px !important;
        padding-top: 0 !important;
        line-height: 1.2 !important;
    }
    """

    with gr.Blocks(title="🚀 Enhanced Benchmark System", theme=gr.themes.Soft(), css=custom_css) as app:

        # Language selector (Persian / English)
        with gr.Row():
            language_selector = gr.Radio(
                choices=["فارسی", "English"],
                value="فارسی",
                label="Language / زبان",
                interactive=True
            )

        # Main bilingual header
        gr.HTML("""
        <div style="text-align: center; padding: 20px;">
            <h1>🚀 بنچمارک سیستم نام‌نشان‌سازی دوزبانه پیشرفته</h1>
            <h2>Enhanced Bilingual Data Anonymization Benchmark</h2>
            <p>تحلیل جامع عملکرد سیستم‌های حفاظت از حریم خصوصی با متریک‌های پیشرفته</p>
            <p>Comprehensive Performance Analysis with Advanced Metrics including Precision, Recall, F1-Score, Memory Usage, Scalability</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Input controls
                file_upload = gr.File(
                    label="آپلود دیتاست شما / Upload Your Dataset (CSV, TXT, JSON - Max 10MB)",
                    file_types=[".csv", ".txt", ".json"],
                    file_count="single",
                )

                sample_size = gr.Slider(
                    minimum=10,
                    maximum=1000,
                    value=200,
                    step=10,
                    label="اندازه نمونه برای تحلیل / Sample Size - Larger samples = more accurate results"
                )

                run_btn = gr.Button(
                    "🚀 اجرای تحلیل بنچمارک پیشرفته / Run Enhanced Benchmark",
                    variant="primary",
                    size="lg"
                )

                download_btn = gr.Button(
                    "📥 دانلود نتایج / Download Results",
                    variant="secondary",
                    visible=False
                )

                # Status display
                status_output = gr.Textbox(
                    label="وضعیت / Status",
                    interactive=False,
                    lines=2
                )

            with gr.Column(scale=2):
                # Result tabs
                with gr.Tabs():
                    with gr.Tab("خلاصه نتایج پیشرفته / Enhanced Overview"):
                        overview_plot = gr.Plot(
                            label="نمودار خلاصه کلی پیشرفته",
                            visible=False
                        )

                    with gr.Tab("نمودارهای عملکرد پیشرفته / Advanced Performance"):
                        performance_plot = gr.Plot(
                            label="نمودارهای عملکرد پیشرفته",
                            visible=False
                        )

                    with gr.Tab("تحلیل موجودیت‌ها / Entity Analysis"):
                        entity_plot = gr.Plot(
                            label="تحلیل موجودیت‌ها",
                            visible=False
                        )

                    with gr.Tab("گزارش تفصیلی پیشرفته / Enhanced Report"):
                        detailed_report = gr.Markdown(
                            "هنوز نتیجه‌ای وجود ندارد. لطفاً ابتدا بنچمارک پیشرفته را اجرا کنید.\n\nNo results yet. Please run the enhanced benchmark first.",
                            visible=False
                        )

        # System status banner (ready vs demo mode)
        system_status = "✅ سیستم بنچمارک پیشرفته آماده است / Enhanced benchmark system ready" if enhanced_benchmark.system_ready else "⚠️ سیستم در حالت نمایشی / Running in demo mode"

        gr.HTML(f"""
        <div style="text-align: center; margin-top: 20px; padding: 10px; background-color: #e8f4f8; border-radius: 5px;">
            <p><strong>وضعیت سیستم / System Status:</strong> {system_status}</p>
            <p><strong>ویژگی‌های جدید:</strong> Precision, Recall, F1-Score, Memory Usage, Scalability Analysis, Stress Testing</p>
        </div>
        """)

        # Usage guide. FIX: repaired U+FFFD mojibake in "آپلود کنید" (was "کن��د").
        gr.HTML("""
        <div style="margin-top: 30px; padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
            <h3>📋 راهنمای استفاده پیشرفته / Enhanced Usage Guide</h3>
            <div style="display: flex; gap: 20px;">
                <div style="flex: 1;">
                    <h4>🇮🇷 فارسی</h4>
                    <ul>
                        <li>فایل دیتاست خود را آپلود کنید</li>
                        <li>اندازه نمونه مورد نظر را انتخاب کنید</li>
                        <li>دکمه "اجرای بنچمارک پیشرفته" را بزنید</li>
                        <li>نتایج در تب‌های مختلف با متریک‌های جدید نمایش داده می‌شود</li>
                        <li><strong>جدید</strong>: متریک‌های Precision, Recall, F1-Score, Memory Usage</li>
                    </ul>
                </div>
                <div style="flex: 1;">
                    <h4>🇺🇸 English</h4>
                    <ul>
                        <li>Upload your dataset file</li>
                        <li>Select desired sample size</li>
                        <li>Click "Run Enhanced Benchmark"</li>
                        <li>Results displayed in different tabs with new metrics</li>
                        <li><strong>New</strong>: Precision, Recall, F1-Score, Memory Usage metrics</li>
                    </ul>
                </div>
            </div>
        </div>
        """)

        # Event handlers
        language_selector.change(
            fn=enhanced_benchmark.change_language,
            inputs=[language_selector],
            outputs=[file_upload, sample_size, run_btn, download_btn]
        )

        # The benchmark returns 11 values: status, 4 widget contents, then
        # 6 visibility updates (the output list repeats components for that).
        run_btn.click(
            fn=enhanced_benchmark.run_enhanced_benchmark,
            inputs=[file_upload, sample_size],
            outputs=[
                status_output,
                overview_plot,
                performance_plot,
                entity_plot,
                detailed_report,
                overview_plot,  # visibility
                performance_plot,  # visibility
                entity_plot,  # visibility
                detailed_report,  # visibility
                download_btn,  # visibility
                download_btn  # dummy for compatibility
            ],
            show_progress=True
        )

        download_btn.click(
            fn=enhanced_benchmark.download_results,
            outputs=gr.File()
        )

    return app
1763
-
1764
- # =============================================================================
1765
- # بخش 4: تابع اصلی
1766
- # =============================================================================
1767
-
1768
def main():
    """Entry point: report available optional features and launch the UI."""

    print("🚀 Starting Enhanced Benchmark System...")
    print("=" * 80)

    # List features whose availability depends on optional imports
    # (sklearn / psutil flags are set at module import time).
    features = []
    if SKLEARN_AVAILABLE:
        features.append("Precision/Recall/F1-Score")
    if PSUTIL_AVAILABLE:
        features.append("Memory Usage Monitoring")
    features.append("Scalability Analysis")
    features.append("Performance Degradation")
    features.append("Stress Testing")

    print(f"✨ Enhanced features: {', '.join(features)}")

    # Build and launch the Gradio interface.
    demo = create_benchmark_interface()

    # share=True exposes a public tunnel; 0.0.0.0 binds all interfaces.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        inbrowser=True,
        show_error=True,
        favicon_path=None,
        ssl_verify=False
    )
1799
-
1800
# Run the benchmark app only when executed as a script, not on import.
if __name__ == "__main__":
    main()