Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -68,267 +68,19 @@ except Exception as e:
|
|
| 68 |
logger.warning(f"⚠️ Auto-setup encountered an issue: {e}")
|
| 69 |
logger.info("ℹ️ Continuing with manual setup...")
|
| 70 |
|
| 71 |
-
class
|
| 72 |
def __init__(self):
|
| 73 |
self.mapping_table = {}
|
| 74 |
-
# counters بهروزرسانی شده با دستههای جدید
|
| 75 |
self.counters = {
|
| 76 |
-
'
|
| 77 |
-
'
|
| 78 |
-
'
|
|
|
|
| 79 |
}
|
| 80 |
-
|
| 81 |
self.api_key = os.getenv("OPENAI_API_KEY", "")
|
| 82 |
-
self.models_base_path = "./models"
|
| 83 |
-
self.models_loaded = False
|
| 84 |
-
self.model_status = {}
|
| 85 |
-
self.load_local_ner_models()
|
| 86 |
-
|
| 87 |
-
def ensure_models_directory(self):
|
| 88 |
-
if not os.path.exists(self.models_base_path):
|
| 89 |
-
try:
|
| 90 |
-
os.makedirs(self.models_base_path, exist_ok=True)
|
| 91 |
-
logger.info(f"📁 Created models directory: {self.models_base_path}")
|
| 92 |
-
except Exception as e:
|
| 93 |
-
logger.error(f"❌ Failed to create models directory: {e}")
|
| 94 |
-
return False
|
| 95 |
-
return True
|
| 96 |
-
|
| 97 |
-
def download_model_if_missing(self, local_name, hf_repo):
|
| 98 |
-
model_path = os.path.join(self.models_base_path, local_name)
|
| 99 |
-
if os.path.exists(model_path) and os.listdir(model_path):
|
| 100 |
-
return True, f"Model {local_name} already exists"
|
| 101 |
-
try:
|
| 102 |
-
logger.info(f"📥 Auto-downloading {local_name} from {hf_repo}...")
|
| 103 |
-
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 104 |
-
tokenizer = AutoTokenizer.from_pretrained(hf_repo)
|
| 105 |
-
model = AutoModelForTokenClassification.from_pretrained(hf_repo)
|
| 106 |
-
tokenizer.save_pretrained(model_path)
|
| 107 |
-
model.save_pretrained(model_path)
|
| 108 |
-
logger.info(f"✅ {local_name} auto-downloaded successfully")
|
| 109 |
-
return True, f"Downloaded {local_name}"
|
| 110 |
-
except Exception as e:
|
| 111 |
-
logger.error(f"❌ Auto-download failed for {local_name}: {e}")
|
| 112 |
-
return False, str(e)
|
| 113 |
-
|
| 114 |
-
def _load_pipeline(self, task, model_path, tokenizer_path=None):
|
| 115 |
-
"""لود مدل با مدیریت صحیح پارامترهای ورژن مختلف transformers"""
|
| 116 |
-
try:
|
| 117 |
-
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, __version__ as tr_version
|
| 118 |
-
|
| 119 |
-
# بررسی پشتیبانی از aggregation_strategy
|
| 120 |
-
supports_agg = version.parse(tr_version) >= version.parse("4.11.0")
|
| 121 |
-
|
| 122 |
-
# لود توکنایزر و مدل به صورت جداگانه
|
| 123 |
-
if tokenizer_path:
|
| 124 |
-
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
|
| 125 |
-
else:
|
| 126 |
-
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
|
| 127 |
-
|
| 128 |
-
model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)
|
| 129 |
-
|
| 130 |
-
# ایجاد pipeline با پارامترهای مناسب
|
| 131 |
-
pipeline_kwargs = {
|
| 132 |
-
"model": model,
|
| 133 |
-
"tokenizer": tokenizer,
|
| 134 |
-
"device": -1 # استفاده از CPU
|
| 135 |
-
}
|
| 136 |
-
|
| 137 |
-
# اضافه کردن aggregation_strategy اگر پشتیبانی میشود
|
| 138 |
-
if supports_agg:
|
| 139 |
-
pipeline_kwargs["aggregation_strategy"] = "simple"
|
| 140 |
-
|
| 141 |
-
return pipeline(task, **pipeline_kwargs)
|
| 142 |
-
|
| 143 |
-
except Exception as e:
|
| 144 |
-
logger.error(f"❌ Failed to load pipeline for {model_path}: {e}")
|
| 145 |
-
return None
|
| 146 |
-
|
| 147 |
-
def load_local_ner_models(self):
|
| 148 |
-
logger.info("📄 Loading local NER models with auto-download...")
|
| 149 |
-
if not self.ensure_models_directory():
|
| 150 |
-
self.models_loaded = False
|
| 151 |
-
self.model_status['directory'] = "❌ Cannot create models directory"
|
| 152 |
-
return
|
| 153 |
-
|
| 154 |
-
try:
|
| 155 |
-
try:
|
| 156 |
-
import torch
|
| 157 |
-
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 158 |
-
transformers_available = True
|
| 159 |
-
logger.info("✅ Transformers library available")
|
| 160 |
-
except ImportError as e:
|
| 161 |
-
transformers_available = False
|
| 162 |
-
self.model_status['transformers'] = f"❌ Transformers library not installed: {str(e)}"
|
| 163 |
-
self.models_loaded = False
|
| 164 |
-
return
|
| 165 |
-
|
| 166 |
-
# Persian model
|
| 167 |
-
persian_model_path = os.path.join(self.models_base_path, "bert-fa-ner")
|
| 168 |
-
self.download_model_if_missing("bert-fa-ner", "HooshvareLab/bert-fa-zwnj-base-ner")
|
| 169 |
-
if os.path.exists(persian_model_path) and os.listdir(persian_model_path):
|
| 170 |
-
try:
|
| 171 |
-
self.persian_ner = self._load_pipeline("ner", persian_model_path)
|
| 172 |
-
if self.persian_ner:
|
| 173 |
-
self.model_status['persian'] = f"✅ Local Persian NER: {persian_model_path}"
|
| 174 |
-
else:
|
| 175 |
-
self.model_status['persian'] = f"❌ Failed to load Persian model: {persian_model_path}"
|
| 176 |
-
except Exception as e:
|
| 177 |
-
self.persian_ner = None
|
| 178 |
-
self.model_status['persian'] = f"❌ Persian model loading error: {str(e)[:100]}"
|
| 179 |
-
else:
|
| 180 |
-
self.persian_ner = None
|
| 181 |
-
self.model_status['persian'] = f"❌ Persian model not found: {persian_model_path}"
|
| 182 |
-
|
| 183 |
-
# English model
|
| 184 |
-
english_model_path = os.path.join(self.models_base_path, "bert-base-NER")
|
| 185 |
-
self.download_model_if_missing("bert-base-NER", "dslim/bert-base-NER")
|
| 186 |
-
if os.path.exists(english_model_path) and os.listdir(english_model_path):
|
| 187 |
-
try:
|
| 188 |
-
self.english_ner = self._load_pipeline("ner", english_model_path)
|
| 189 |
-
if self.english_ner:
|
| 190 |
-
self.model_status['english'] = f"✅ Local English NER: {english_model_path}"
|
| 191 |
-
else:
|
| 192 |
-
self.model_status['english'] = f"❌ Failed to load English model: {english_model_path}"
|
| 193 |
-
except Exception as e:
|
| 194 |
-
self.english_ner = None
|
| 195 |
-
self.model_status['english'] = f"❌ English model loading error: {str(e)[:100]}"
|
| 196 |
-
else:
|
| 197 |
-
self.english_ner = None
|
| 198 |
-
self.model_status['english'] = f"❌ English model not found: {english_model_path}"
|
| 199 |
-
|
| 200 |
-
loaded_models = sum(1 for status in self.model_status.values() if status.startswith("✅"))
|
| 201 |
-
self.models_loaded = loaded_models > 0
|
| 202 |
-
if loaded_models == 0:
|
| 203 |
-
self.model_status['fallback'] = "⚠️ Using regex-only mode (no local models found)"
|
| 204 |
-
|
| 205 |
-
except Exception as e:
|
| 206 |
-
self.models_loaded = False
|
| 207 |
-
self.model_status['critical'] = f"❌ Critical error: {str(e)[:100]}..."
|
| 208 |
-
|
| 209 |
-
def detect_language(self, text):
|
| 210 |
-
"""تشخیص زبان متن"""
|
| 211 |
-
if not text:
|
| 212 |
-
return 'fa'
|
| 213 |
-
|
| 214 |
-
persian_chars = len(re.findall(r'[\u0600-\u06FF]', text))
|
| 215 |
-
english_chars = len(re.findall(r'[a-zA-Z]', text))
|
| 216 |
-
total = persian_chars + english_chars
|
| 217 |
-
|
| 218 |
-
if total == 0:
|
| 219 |
-
return 'fa'
|
| 220 |
-
|
| 221 |
-
if persian_chars / total > 0.6:
|
| 222 |
-
return 'fa'
|
| 223 |
-
elif english_chars / total > 0.6:
|
| 224 |
-
return 'en'
|
| 225 |
-
else:
|
| 226 |
-
return 'mixed'
|
| 227 |
-
|
| 228 |
-
def extract_entities_with_ner(self, text, lang='fa'):
|
| 229 |
-
"""استخراج entities با مدلهای NER محلی"""
|
| 230 |
-
entities = []
|
| 231 |
-
|
| 232 |
-
if not self.models_loaded:
|
| 233 |
-
logger.info("ℹ️ Local NER models not available - using regex only")
|
| 234 |
-
return entities
|
| 235 |
-
|
| 236 |
-
try:
|
| 237 |
-
# مدل فارسی محلی
|
| 238 |
-
if lang in ['fa', 'mixed'] and hasattr(self, 'persian_ner') and self.persian_ner:
|
| 239 |
-
try:
|
| 240 |
-
persian_results = self.persian_ner(text)
|
| 241 |
-
for entity in persian_results:
|
| 242 |
-
# بررسی فرمت خروجی بر اساس ورژن transformers
|
| 243 |
-
if isinstance(entity, dict):
|
| 244 |
-
if 'entity_group' in entity:
|
| 245 |
-
# ورژن جدید با aggregation_strategy
|
| 246 |
-
entities.append({
|
| 247 |
-
'text': entity['word'].strip(),
|
| 248 |
-
'label': entity['entity_group'],
|
| 249 |
-
'start': entity['start'],
|
| 250 |
-
'end': entity['end'],
|
| 251 |
-
'confidence': entity['score'],
|
| 252 |
-
'source': 'local_persian_ner'
|
| 253 |
-
})
|
| 254 |
-
else:
|
| 255 |
-
# ورژن قدیمی
|
| 256 |
-
entities.append({
|
| 257 |
-
'text': entity['word'].strip(),
|
| 258 |
-
'label': entity['entity'],
|
| 259 |
-
'start': entity['start'],
|
| 260 |
-
'end': entity['end'],
|
| 261 |
-
'confidence': entity['score'],
|
| 262 |
-
'source': 'local_persian_ner'
|
| 263 |
-
})
|
| 264 |
-
logger.info(f"Local Persian NER found {len(persian_results)} entities")
|
| 265 |
-
except Exception as e:
|
| 266 |
-
logger.error(f"Local Persian NER extraction error: {e}")
|
| 267 |
-
|
| 268 |
-
# مدل انگلیسی محلی
|
| 269 |
-
if lang in ['en', 'mixed'] and hasattr(self, 'english_ner') and self.english_ner:
|
| 270 |
-
try:
|
| 271 |
-
english_results = self.english_ner(text)
|
| 272 |
-
for entity in english_results:
|
| 273 |
-
# بررسی فرمت خروجی بر اساس ورژن transformers
|
| 274 |
-
if isinstance(entity, dict):
|
| 275 |
-
if 'entity_group' in entity:
|
| 276 |
-
# ورژن جدید با aggregation_strategy
|
| 277 |
-
entities.append({
|
| 278 |
-
'text': entity['word'].strip(),
|
| 279 |
-
'label': entity['entity_group'],
|
| 280 |
-
'start': entity['start'],
|
| 281 |
-
'end': entity['end'],
|
| 282 |
-
'confidence': entity['score'],
|
| 283 |
-
'source': 'local_english_ner'
|
| 284 |
-
})
|
| 285 |
-
else:
|
| 286 |
-
# ورژن قدیمی
|
| 287 |
-
entities.append({
|
| 288 |
-
'text': entity['word'].strip(),
|
| 289 |
-
'label': entity['entity'],
|
| 290 |
-
'start': entity['start'],
|
| 291 |
-
'end': entity['end'],
|
| 292 |
-
'confidence': entity['score'],
|
| 293 |
-
'source': 'local_english_ner'
|
| 294 |
-
})
|
| 295 |
-
logger.info(f"Local English NER found {len(english_results)} entities")
|
| 296 |
-
except Exception as e:
|
| 297 |
-
logger.error(f"Local English NER extraction error: {e}")
|
| 298 |
-
|
| 299 |
-
except Exception as e:
|
| 300 |
-
logger.error(f"Local NER extraction general error: {e}")
|
| 301 |
-
|
| 302 |
-
# حذف تکراریها
|
| 303 |
-
unique_entities = []
|
| 304 |
-
seen = set()
|
| 305 |
-
for entity in entities:
|
| 306 |
-
key = (entity['text'].lower(), entity['start'], entity['end'])
|
| 307 |
-
if key not in seen:
|
| 308 |
-
seen.add(key)
|
| 309 |
-
unique_entities.append(entity)
|
| 310 |
-
|
| 311 |
-
logger.info(f"Total unique entities found by local models: {len(unique_entities)}")
|
| 312 |
-
return unique_entities
|
| 313 |
-
|
| 314 |
-
def map_ner_to_categories(self, ner_label, source=''):
|
| 315 |
-
"""نگاشت برچسبهای NER به دستههای سیستم"""
|
| 316 |
-
mapping = {
|
| 317 |
-
'PER': 'PERSON', 'PERSON': 'PERSON',
|
| 318 |
-
'ORG': 'COMPANY', 'ORGANIZATION': 'COMPANY',
|
| 319 |
-
'LOC': 'LOCATION', 'LOCATION': 'LOCATION',
|
| 320 |
-
'MISC': 'BUSINESS_TERMS', 'MISCELLANEOUS': 'BUSINESS_TERMS',
|
| 321 |
-
'B-PER': 'PERSON', 'I-PER': 'PERSON',
|
| 322 |
-
'B-ORG': 'COMPANY', 'I-ORG': 'COMPANY',
|
| 323 |
-
'B-LOC': 'LOCATION', 'I-LOC': 'LOCATION',
|
| 324 |
-
'B-MISC': 'BUSINESS_TERMS', 'I-MISC': 'BUSINESS_TERMS',
|
| 325 |
-
'MONEY': 'AMOUNT', 'PERCENT': 'PERCENTAGE',
|
| 326 |
-
'DATE': 'DATE', 'TIME': 'DATE'
|
| 327 |
-
}
|
| 328 |
-
return mapping.get(ner_label.upper(), 'BUSINESS_TERMS')
|
| 329 |
|
| 330 |
def anonymize_text(self, original_text, lang='fa'):
|
| 331 |
-
"""
|
| 332 |
try:
|
| 333 |
if not original_text or not original_text.strip():
|
| 334 |
return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
|
|
@@ -338,296 +90,53 @@ class BilingualDataAnonymizer:
|
|
| 338 |
self.counters = {key: 0 for key in self.counters.keys()}
|
| 339 |
|
| 340 |
anonymized = original_text
|
| 341 |
-
found_entities = set()
|
| 342 |
-
|
| 343 |
-
# تشخیص زبان
|
| 344 |
-
detected_lang = self.detect_language(original_text)
|
| 345 |
-
logger.info(f"Detected language: {detected_lang}")
|
| 346 |
-
|
| 347 |
-
# مرحله 1: استخراج با Local NER
|
| 348 |
-
if self.models_loaded:
|
| 349 |
-
logger.info("🤖 Running local NER extraction...")
|
| 350 |
-
ner_entities = self.extract_entities_with_ner(original_text, detected_lang)
|
| 351 |
-
|
| 352 |
-
for entity in ner_entities:
|
| 353 |
-
if (entity['text'] not in found_entities and
|
| 354 |
-
len(entity['text'].strip()) > 1 and
|
| 355 |
-
entity['confidence'] > 0.5):
|
| 356 |
-
|
| 357 |
-
category = self.map_ner_to_categories(entity['label'], entity['source'])
|
| 358 |
-
|
| 359 |
-
if entity['text'] not in self.mapping_table:
|
| 360 |
-
self.counters[category] += 1
|
| 361 |
-
code = f"{category}_{self.counters[category]:03d}_LOCAL_NER"
|
| 362 |
-
self.mapping_table[entity['text']] = code
|
| 363 |
-
found_entities.add(entity['text'])
|
| 364 |
-
logger.info(f"Local NER: {entity['text']} -> {code}")
|
| 365 |
-
else:
|
| 366 |
-
logger.info("ℹ️ Using regex-only mode")
|
| 367 |
|
| 368 |
-
#
|
| 369 |
-
patterns =
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
# نمادهای خارجی
|
| 378 |
-
r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC|ARAMCO|ADNOC|QGPC|KNPC|SOCAR|LUKOIL|GAZPROM|ROSNEFT|TOTAL|BP|SHELL)(?=\s|$|,|\.)'
|
| 379 |
-
],
|
| 380 |
-
|
| 381 |
-
'COMPANY': [
|
| 382 |
-
# شرکتهای با مخفف در پرانتز
|
| 383 |
-
r'شرکت\s+[آ-ی\s\-]+\s*\([آ-یa-zA-Z\s]+\)',
|
| 384 |
-
|
| 385 |
-
# شرکتهای ساده
|
| 386 |
-
r'(?:شرکت|گروه|هلدینگ|موسسه|سازمان)\s+[آ-ی\s\-]+',
|
| 387 |
-
|
| 388 |
-
# بانکها و موسسات مالی
|
| 389 |
-
r'(?:بانک|موسسه|صندوق|بیمه)\s+[آ-ی\s\-]+',
|
| 390 |
-
|
| 391 |
-
# شرکتهای خارجی
|
| 392 |
-
r'[A-Za-z]+(?:\s+[A-Za-z]+)*\s+(?:Co\.|Company|Corp\.|Corporation|Inc\.|Limited|Ltd\.)',
|
| 393 |
-
|
| 394 |
-
# نامهای برند و پروژه
|
| 395 |
-
r'(?:آفتاب|آلفا\s+لیفت|ژنرالتورک|سپهرموتور|نِیپوش|تاپیکو|شپنا|شپدیس|والبر|شبندر)',
|
| 396 |
-
|
| 397 |
-
# الگوهای کلی
|
| 398 |
-
r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به|\s+طی)',
|
| 399 |
-
r'([آ-یa-zA-Z\s]+)\s+شرکت',
|
| 400 |
-
r'این\s+شرکت(?=\s|$|،|\.)',
|
| 401 |
-
r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
|
| 402 |
-
],
|
| 403 |
-
|
| 404 |
-
'PERSON': [
|
| 405 |
-
# نامهای با القاب
|
| 406 |
-
r'آقای\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
|
| 407 |
-
r'خانم\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
|
| 408 |
-
r'مهندس\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
|
| 409 |
-
r'دکتر\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
|
| 410 |
-
|
| 411 |
-
# نامهای با مقام اداری
|
| 412 |
-
r'([آ-یa-zA-Z]+\s+[آ-یa-zA-Z]+)(?=،\s+مدیرعامل|\s+مدیرعامل|\s+رئیس)',
|
| 413 |
-
r'مدیرعامل(?=\s|$|،|\.)',
|
| 414 |
-
r'سرپرست(?=\s+و|\s|$|،|\.)',
|
| 415 |
-
r'رئیس\s+هیأتمدیره',
|
| 416 |
-
r'معاون\s+(?:اجرایی|مالی|فروش|بازاریابی|تولید)',
|
| 417 |
-
|
| 418 |
-
# نامهای چند قسمتی
|
| 419 |
-
r'[آ-ی]+\s+[آ-ی]+\s+[آ-ی]+(?:\s+(?:فر|زاده|پور|نژاد|یان|لو))?',
|
| 420 |
-
r'[آ-ی]+\s+[آ-ی]+(?:\s+(?:فر|زاده|پور|نژاد|یان|لو))?',
|
| 421 |
-
|
| 422 |
-
# نامهای با نیمفاصله
|
| 423 |
-
r'[آ-ی]+[آ-ی]+(?:\s+[آ-ی]+)*',
|
| 424 |
-
|
| 425 |
-
# ضمایر و اشارات
|
| 426 |
-
r'وی(?=\s+ادامه|\s+اظهار|\s+گفت|\s+اعلام|\s+همچنین)',
|
| 427 |
-
r'ایشان(?=\s+گفت|\s+اعلام|\s+بیان)'
|
| 428 |
-
],
|
| 429 |
-
|
| 430 |
-
'AMOUNT': [
|
| 431 |
-
# مبالغ با ویرگول و واحدهای مالی
|
| 432 |
-
r'(?:منفی\s+|مثبت\s+|حدود\s+|بیش\s+از\s+|نزدیک\s+به\s+|کمتر\s+از\s+)?'
|
| 433 |
-
r'\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:میلیون|میلیارد|هزار)\s*(?:ریال|تومان|دلار|یورو|درهم)',
|
| 434 |
-
|
| 435 |
-
# مبالغ با نقطه اروپایی
|
| 436 |
-
r'(?:منفی\s+|مثبت\s+|حدود\s+|بیش\s+از\s+|نزدیک\s+به\s+|کمتر\s+از\s+)?'
|
| 437 |
-
r'\d{1,3}(?:\.\d{3})*(?:,\d+)?\s*(?:میلیون|میلیارد|هزار)\s*(?:ریال|تومان|دلار|یورو|درهم)',
|
| 438 |
-
|
| 439 |
-
# مبالغ اعشاری با واحدهای مختلف
|
| 440 |
-
r'(?:منفی\s+|مثبت\s+|حدود\s+|بیش\s+از\s+|نزدیک\s+به\s+|کمتر\s+از\s+)?'
|
| 441 |
-
r'\d+(?:\.\d+)?\s*(?:میلیون|میلیارد|هزار)\s*(?:ریال|تومان|همت|دلار|نفر|تن|دستگاه|واحد|بشکه)',
|
| 442 |
-
|
| 443 |
-
# مبالغ ساده
|
| 444 |
-
r'(?:منفی\s+|مثبت\s+|حدود\s+|بیش\s+از\s+|نزدیک\s+به\s+|کمتر\s+از\s+)?'
|
| 445 |
-
r'\d{1,3}(?:,\d{3})*\s*(?:ریال|تومان|همت|دلار|یورو|درهم)(?:ی)?',
|
| 446 |
-
|
| 447 |
-
# بازههای مقداری
|
| 448 |
-
r'\d+(?:\.\d+)?\s*(?:تا|الی|–|-)\s*\d+(?:\.\d+)?\s*(?:میلیون|میلیارد|هزار)?\s*(?:ریال|تومان|نفر|تن|دستگاه|ماه|سال|درصد)',
|
| 449 |
-
|
| 450 |
-
# مبالغ فارسی با "هزار و"
|
| 451 |
-
r'(?:منفی\s+|مثبت\s+|حدود\s+|بیش\s+از\s+|نزدیک\s+به\s+)?'
|
| 452 |
-
r'\d+\s*هزار\s*(?:و\s*)?\d*\s*(?:میلیارد|میلیون)?\s*(?:ریال|تومان)(?:ی)?',
|
| 453 |
-
|
| 454 |
-
# واحدهای تخصصی و انرژی
|
| 455 |
-
r'\d+(?:\.\d+)?\s*(?:Wh/kg|مگاوات|میلیثانیه|CFU/ml|تن-کیلومتر|مگابایت|گیگابایت|کیلووات|گیگاوات)',
|
| 456 |
-
|
| 457 |
-
# مبالغ با کلمات توضیحی
|
| 458 |
-
r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*(?:تومان|ریال)',
|
| 459 |
-
r'رقم\s+(?:فعلی\s+)?\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*(?:تومان|ریال)',
|
| 460 |
-
r'(?:به|از|برابر\s+با)\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*(?:تومان|ریال)',
|
| 461 |
-
r'\d+(?:میلیارد|میلیون)\s*(?:تومان|ریال)(?=\s+رسیده|\s+ثبت|\s+بوده|\s+،)',
|
| 462 |
-
|
| 463 |
-
# مبالغ خارجی
|
| 464 |
-
r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
|
| 465 |
-
r'€\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
|
| 466 |
-
r'AED\s*\d+(?:,\d{3})*(?:\.\d+)?',
|
| 467 |
-
r'SAR\s*\d+(?:,\d{3})*(?:\.\d+)?'
|
| 468 |
-
],
|
| 469 |
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
# درصدهای با کلمات توضیحی
|
| 479 |
-
r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایینتر|\s+سود|\s+ضرر)?',
|
| 480 |
-
r'معادل\s+\d+(?:\.\d+)?\s*درصد',
|
| 481 |
-
r'حدود\s+\d+(?:\.\d+)?\s*درصد',
|
| 482 |
-
r'با\s+\d+(?:\.\d+)?\s*درصد\s+(?:افزایش|کاهش|رشد)',
|
| 483 |
-
r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
|
| 484 |
-
r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده|\s+رشد|\s+کاهش)',
|
| 485 |
-
|
| 486 |
-
# نسبتها و ضرایب
|
| 487 |
-
r'نسبت\s+\d+(?:\.\d+)?\s*(?:به\s+\d+(?:\.\d+)?|\s*:|\s*برابر)',
|
| 488 |
-
r'ضریب\s+\d+(?:\.\d+)?',
|
| 489 |
-
r'میزان\s+رشد(?=\s+نسبت|\s+معادل)',
|
| 490 |
-
r'افزایش\s+قابلتوجهی',
|
| 491 |
-
r'بهبود\s+نسبی'
|
| 492 |
-
],
|
| 493 |
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
r'\d+(?:\.\d+)?\s*میلیون\s*تن(?=\s+در\s+سال|\s+سالانه|\s|$)',
|
| 498 |
-
r'\d+\s*هزار\s*بشکه(?=\s+در\s+روز|\s+روزانه|\s|$)',
|
| 499 |
-
r'\d+(?:,\d{3})*\s*دستگاه(?=\s+تولید|\s+فروش|\s+صادرات|\s|$)',
|
| 500 |
-
r'\d+(?:,\d{3})*\s*واحد(?=\s+مسکونی|\s+تجاری|\s+صنعتی|\s|$)',
|
| 501 |
-
|
| 502 |
-
# ظرفیتها
|
| 503 |
-
r'ظرفیت\s+\d+(?:,\d{3})*\s*(?:تن|دستگاه|واحد)',
|
| 504 |
-
r'تولید\s+\d+(?:,\d{3})*\s*(?:تن|دستگاه)',
|
| 505 |
-
r'فروش\s+\d+(?:,\d{3})*\s*(?:دستگاه|واحد)'
|
| 506 |
-
],
|
| 507 |
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
r'بازده\s+(?:سرمایه|دارایی|سهام)',
|
| 518 |
-
r'نرخ\s+(?:سود|بهره|تورم|رشد)',
|
| 519 |
-
|
| 520 |
-
# سامانهها و سیستمها
|
| 521 |
-
r'سامانه\s+(?:سجام|کدال|سپام|فرابورس)',
|
| 522 |
-
r'سیستم\s+(?:معاملاتی|بانکی|پرداخت)',
|
| 523 |
-
|
| 524 |
-
# اصطلاحات فنی
|
| 525 |
-
r'(?:RFID|DAP|CIF|FOB|API|SDK|CRM|ERP)',
|
| 526 |
-
r'Read-Replica'
|
| 527 |
-
],
|
| 528 |
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
|
| 532 |
-
r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
|
| 533 |
-
|
| 534 |
-
# تاریخ با نام ماه فارسی
|
| 535 |
-
r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)(?:ماه)?\s*(?:سال\s*)?(?:[۰-۹0-9]{4})',
|
| 536 |
-
|
| 537 |
-
# تاریخ انگلیسی
|
| 538 |
-
r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})',
|
| 539 |
-
r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}',
|
| 540 |
-
|
| 541 |
-
# کوارتال و دورههای مالی
|
| 542 |
-
r'Q[1-4]-\d{4}',
|
| 543 |
-
r'کوارتال\s+(?:اول|دوم|سوم|چهارم|\d)',
|
| 544 |
-
r'نیمسال\s+(?:اول|دوم)',
|
| 545 |
-
r'سال\s+مالی\s+\d{4}',
|
| 546 |
-
r'دوره\s+\d+\s*ماهه',
|
| 547 |
-
|
| 548 |
-
# زمانهای دقیق
|
| 549 |
-
r'\d+\s*(?:دقیقه|ساعت|روز|هفته|ماه|سال)(?:ه)?',
|
| 550 |
-
r'طی\s+\d+\s*(?:روز|ماه|سال)',
|
| 551 |
-
r'در\s+\d+\s*(?:ماه|سال)\s+گذشته'
|
| 552 |
-
],
|
| 553 |
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
r'(?:CFO|CEO|CTO|CMO|COO)(?=\s|$)',
|
| 557 |
-
r'مدیر\s+(?:عامل|اجرایی|فروش|بازاریابی|مالی|تولید|فناوری)',
|
| 558 |
-
r'رئیس\s+(?:هیأتمدیره|شورای\s+نظارت)',
|
| 559 |
-
r'معاون\s+(?:اجرایی|مالی|فروش|تولید)',
|
| 560 |
-
|
| 561 |
-
# ساختار شرکتی
|
| 562 |
-
r'هیأتمدیره',
|
| 563 |
-
r'مجمع\s+(?:عمومی|فوقالعاده)',
|
| 564 |
-
r'سهامداران\s+(?:عمده|خرد|اکثریت|اقلیت)',
|
| 565 |
-
r'شورای\s+نظارت',
|
| 566 |
-
r'حسابرس\s+(?:مستقل|قانونی)',
|
| 567 |
|
| 568 |
-
#
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
r'سهم\s+بازار',
|
| 576 |
-
r'موقعیت\s+رقابتی',
|
| 577 |
-
r'مزیت\s+رقابتی',
|
| 578 |
-
r'بازار\s+(?:هدف|محلی|جهانی)',
|
| 579 |
-
|
| 580 |
-
# عملکرد مالی
|
| 581 |
-
r'عملکرد\s+(?:مالی|عملیاتی)',
|
| 582 |
-
r'بازدهی\s+(?:سرمایه|فروش)',
|
| 583 |
-
r'حاشیه\s+(?:سود|فروش)',
|
| 584 |
-
r'نقطه\s+سربهسر'
|
| 585 |
-
]
|
| 586 |
-
}
|
| 587 |
-
|
| 588 |
-
# پردازش patterns با اولویتبندی - از خاص به عام
|
| 589 |
-
logger.info("🔍 Running prioritized regex extraction...")
|
| 590 |
-
|
| 591 |
-
# پردازش به ترتیب اولویت برای جلوگیری از تداخل
|
| 592 |
-
processed_entities = set() # برای جلوگیری از تکرار
|
| 593 |
-
|
| 594 |
-
for category, pattern_list in patterns.items():
|
| 595 |
-
for pattern in pattern_list:
|
| 596 |
-
matches = re.finditer(pattern, original_text, re.IGNORECASE | re.MULTILINE)
|
| 597 |
-
for match in matches:
|
| 598 |
-
if match.groups():
|
| 599 |
-
item = match.group(1).strip()
|
| 600 |
-
full_match = match.group(0).strip()
|
| 601 |
-
else:
|
| 602 |
-
item = match.group(0).strip()
|
| 603 |
-
full_match = item
|
| 604 |
-
|
| 605 |
-
# بررسی تداخل با entities قبلی
|
| 606 |
-
overlaps = False
|
| 607 |
-
match_start, match_end = match.span()
|
| 608 |
-
|
| 609 |
-
for proc_start, proc_end in processed_entities:
|
| 610 |
-
# بررسی تداخل موقعیت
|
| 611 |
-
if not (match_end <= proc_start or match_start >= proc_end):
|
| 612 |
-
overlaps = True
|
| 613 |
-
break
|
| 614 |
-
|
| 615 |
-
if (not overlaps and
|
| 616 |
-
full_match not in found_entities and
|
| 617 |
-
full_match not in self.mapping_table and
|
| 618 |
-
len(full_match) >= 2):
|
| 619 |
-
|
| 620 |
-
self.counters[category] += 1
|
| 621 |
-
code = f"{category}_{self.counters[category]:03d}_REGEX"
|
| 622 |
-
self.mapping_table[full_match] = code
|
| 623 |
-
found_entities.add(full_match)
|
| 624 |
-
processed_entities.add((match_start, match_end))
|
| 625 |
-
logger.info(f"Regex ({category}): {full_match} -> {code}")
|
| 626 |
-
|
| 627 |
-
# جایگزینی در متن با ترتیب طولانیترین اول
|
| 628 |
-
sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True)
|
| 629 |
-
for original_item, code in sorted_items:
|
| 630 |
-
anonymized = anonymized.replace(original_item, code)
|
| 631 |
|
| 632 |
logger.info(f"✅ Anonymization completed. Found {len(self.mapping_table)} entities.")
|
| 633 |
return anonymized
|
|
@@ -636,7 +145,7 @@ class BilingualDataAnonymizer:
|
|
| 636 |
return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در ناشناسسازی: {str(e)}"
|
| 637 |
|
| 638 |
def send_to_chatgpt(self, anonymized_text, lang='fa'):
|
| 639 |
-
"""
|
| 640 |
try:
|
| 641 |
if not anonymized_text or not anonymized_text.strip():
|
| 642 |
return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناسشده خالی است!"
|
|
@@ -686,7 +195,7 @@ class BilingualDataAnonymizer:
|
|
| 686 |
return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"
|
| 687 |
|
| 688 |
def deanonymize_response(self, gpt_response, lang='fa'):
|
| 689 |
-
"""
|
| 690 |
try:
|
| 691 |
if not gpt_response or not gpt_response.strip():
|
| 692 |
return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!"
|
|
@@ -697,59 +206,16 @@ class BilingualDataAnonymizer:
|
|
| 697 |
final_result = gpt_response
|
| 698 |
reverse_mapping = {code: original for original, code in self.mapping_table.items()}
|
| 699 |
|
|
|
|
| 700 |
sorted_codes = sorted(reverse_mapping.items(), key=lambda x: len(x[0]), reverse=True)
|
| 701 |
for code, original in sorted_codes:
|
| 702 |
final_result = final_result.replace(code, original)
|
| 703 |
-
escaped_code = code.replace('_', '\\_')
|
| 704 |
-
final_result = final_result.replace(escaped_code, original)
|
| 705 |
|
| 706 |
return final_result
|
| 707 |
|
| 708 |
except Exception as e:
|
| 709 |
return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}"
|
| 710 |
|
| 711 |
-
def get_model_status(self):
|
| 712 |
-
"""وضعیت مدلهای محلی"""
|
| 713 |
-
status = "🤖 **Local Model Status (Business & Financial Data Focus):**\n\n"
|
| 714 |
-
|
| 715 |
-
if hasattr(self, 'model_status') and self.model_status:
|
| 716 |
-
for model_type, model_status in self.model_status.items():
|
| 717 |
-
if model_type == 'persian':
|
| 718 |
-
status += f"• **Persian NER**: {model_status}\n"
|
| 719 |
-
elif model_type == 'english':
|
| 720 |
-
status += f"• **English NER**: {model_status}\n"
|
| 721 |
-
elif model_type == 'financial':
|
| 722 |
-
status += f"• **Financial NER**: {model_status}\n"
|
| 723 |
-
elif model_type == 'transformers':
|
| 724 |
-
status += f"• **Transformers**: {model_status}\n"
|
| 725 |
-
elif model_type == 'fallback':
|
| 726 |
-
status += f"• **Fallback Mode**: {model_status}\n"
|
| 727 |
-
elif model_type == 'critical':
|
| 728 |
-
status += f"• **Critical**: {model_status}\n"
|
| 729 |
-
elif model_type == 'directory':
|
| 730 |
-
status += f"• **Directory**: {model_status}\n"
|
| 731 |
-
|
| 732 |
-
loaded_count = sum(1 for status in getattr(self, 'model_status', {}).values()
|
| 733 |
-
if status.startswith("✅"))
|
| 734 |
-
status += f"\n📊 **Summary**: {loaded_count}/2 local models loaded"
|
| 735 |
-
|
| 736 |
-
status += f"\n📁 **Models Path**: {self.models_base_path}"
|
| 737 |
-
status += f"\n🔧 **Latest Features**: Business & Financial Data Detection"
|
| 738 |
-
|
| 739 |
-
status += f"\n\n🎯 **Business & Financial Data Detection:**"
|
| 740 |
-
status += f"\n 💼 **Company Data**: Stock symbols, company names, business terms"
|
| 741 |
-
status += f"\n 💰 **Financial Data**: Amounts, percentages, volumes, ratios"
|
| 742 |
-
status += f"\n 👔 **Executive Data**: Person names with business titles"
|
| 743 |
-
status += f"\n 📊 **Market Data**: Financial terms, dates, performance metrics"
|
| 744 |
-
|
| 745 |
-
status += f"\n\n✨ **Key Features:**"
|
| 746 |
-
status += f"\n 🎯 Overlap detection prevents double-matching"
|
| 747 |
-
status += f"\n 🏢 Focus on business and financial information"
|
| 748 |
-
status += f"\n 📈 Advanced financial pattern recognition"
|
| 749 |
-
status += f"\n 🔍 Length-based replacement order"
|
| 750 |
-
|
| 751 |
-
return status
|
| 752 |
-
|
| 753 |
def process_all_steps(input_text, language):
|
| 754 |
"""پردازش خودکار تمام مراحل"""
|
| 755 |
lang = 'en' if language == 'English' else 'fa'
|
|
@@ -768,12 +234,7 @@ def process_all_steps(input_text, language):
|
|
| 768 |
gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
|
| 769 |
if gpt_response.startswith("❌"):
|
| 770 |
entities_found = len(anonymizer.mapping_table)
|
| 771 |
-
|
| 772 |
-
regex_count = sum(1 for code in anonymizer.mapping_table.values() if '_REGEX' in code)
|
| 773 |
-
|
| 774 |
-
method = "Business-Focused Local NER + Regex" if anonymizer.models_loaded else "Business-Focused Regex Only"
|
| 775 |
-
success_msg = (f"✅ Anonymization completed with {method}!\n"
|
| 776 |
-
f"🏢 Business data: {entities_found} | 🤖 NER: {local_ner_count} | 🔍 Regex: {regex_count}\n"
|
| 777 |
f"📊 Total: {entities_found} entities protected")
|
| 778 |
return success_msg, anonymized_text, gpt_response, ""
|
| 779 |
|
|
@@ -781,25 +242,15 @@ def process_all_steps(input_text, language):
|
|
| 781 |
|
| 782 |
total_time = time.time() - start_time
|
| 783 |
entities_found = len(anonymizer.mapping_table)
|
| 784 |
-
local_ner_count = sum(1 for code in anonymizer.mapping_table.values() if '_LOCAL_NER' in code)
|
| 785 |
-
regex_count = sum(1 for code in anonymizer.mapping_table.values() if '_REGEX' in code)
|
| 786 |
|
| 787 |
# آمار تفصیلی
|
| 788 |
-
company_count =
|
| 789 |
-
amount_count =
|
| 790 |
-
percent_count =
|
| 791 |
-
|
| 792 |
|
| 793 |
-
business_details = []
|
| 794 |
-
if company_count > 0: business_details.append(f"🏢 Companies: {company_count}")
|
| 795 |
-
if amount_count > 0: business_details.append(f"💰 Amounts: {amount_count}")
|
| 796 |
-
if percent_count > 0: business_details.append(f"📊 Percentages: {percent_count}")
|
| 797 |
-
if stock_count > 0: business_details.append(f"📈 Stocks: {stock_count}")
|
| 798 |
-
|
| 799 |
-
method = "Business-Focused Local NER + Regex" if anonymizer.models_loaded else "Business-Focused Regex Only"
|
| 800 |
success_msg = (f"🎉 Complete anonymization & restoration successful!\n"
|
| 801 |
-
f"
|
| 802 |
-
f"🏢 Business data: {' | '.join(business_details) if business_details else '0'}\n"
|
| 803 |
f"📊 Total: {entities_found} entities | ⏱️ Time: {total_time:.2f}s")
|
| 804 |
|
| 805 |
return success_msg, anonymized_text, gpt_response, final_result
|
|
@@ -815,64 +266,26 @@ def get_mapping_table(language):
|
|
| 815 |
if not anonymizer.mapping_table:
|
| 816 |
return "❌ Mapping table is empty! Please process some text first." if lang == 'en' else "❌ جدول نگاشت خالی است! ابتدا متنی را پردازش کنید."
|
| 817 |
|
| 818 |
-
result = "📋 **
|
| 819 |
-
|
| 820 |
-
local_ner_items = {k: v for k, v in anonymizer.mapping_table.items() if '_LOCAL_NER' in v}
|
| 821 |
-
regex_items = {k: v for k, v in anonymizer.mapping_table.items() if '_REGEX' in v}
|
| 822 |
|
| 823 |
-
# گروهبندی بر اساس نوع
|
| 824 |
-
|
| 825 |
-
'
|
| 826 |
-
'
|
| 827 |
-
'
|
| 828 |
-
'
|
| 829 |
-
'PERSON': '👔 **Business Executives & Personnel**'
|
| 830 |
}
|
| 831 |
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
category_items = {k: v for k, v in anonymizer.mapping_table.items() if category in v}
|
| 835 |
if category_items:
|
| 836 |
-
business_found = True
|
| 837 |
result += f"{title}:\n"
|
| 838 |
-
for original, code in
|
| 839 |
result += f" • `{original}` → `{code}`\n"
|
| 840 |
-
if len(category_items) > 8:
|
| 841 |
-
result += f" ... و {len(category_items) - 8} مورد دیگر\n"
|
| 842 |
result += "\n"
|
| 843 |
|
| 844 |
-
if local_ner_items:
|
| 845 |
-
result += "🤖 **Local NER Detected**:\n"
|
| 846 |
-
for original, code in list(local_ner_items.items())[:8]:
|
| 847 |
-
result += f" • `{original}` → `{code}`\n"
|
| 848 |
-
if len(local_ner_items) > 8:
|
| 849 |
-
result += f" ... و {len(local_ner_items) - 8} مورد دیگر\n"
|
| 850 |
-
result += "\n"
|
| 851 |
-
|
| 852 |
-
# سایر موارد
|
| 853 |
-
other_categories = ['VOLUME', 'FINANCIAL_TERMS', 'BUSINESS_TERMS', 'DATE']
|
| 854 |
-
other_items = {k: v for k, v in regex_items.items()
|
| 855 |
-
if any(cat in v for cat in other_categories)}
|
| 856 |
-
|
| 857 |
-
if other_items:
|
| 858 |
-
result += "📋 **Other Business Data**:\n"
|
| 859 |
-
for original, code in list(other_items.items())[:8]:
|
| 860 |
-
result += f" • `{original}` → `{code}`\n"
|
| 861 |
-
if len(other_items) > 8:
|
| 862 |
-
result += f" ... و {len(other_items) - 8} مور�� دیگر\n"
|
| 863 |
-
|
| 864 |
# آمار کلی
|
| 865 |
-
|
| 866 |
-
for cat in business_categories.keys())
|
| 867 |
-
|
| 868 |
-
result += f"\n📊 **Statistics**:\n"
|
| 869 |
-
result += f"🏢 **Business Data**: {business_count} items\n"
|
| 870 |
-
result += f"🤖 **NER Detected**: {len(local_ner_items)} items\n"
|
| 871 |
-
result += f"📋 **Other Data**: {len(other_items)} items\n"
|
| 872 |
-
result += f"📈 **Total**: {len(anonymizer.mapping_table)} entities\n"
|
| 873 |
-
|
| 874 |
-
result += f"\n✨ **Focus**: Business & financial data protection without personal sensitive information\n"
|
| 875 |
-
result += f"🎯 **Success**: All business-critical data detected and anonymized!"
|
| 876 |
|
| 877 |
return result
|
| 878 |
|
|
@@ -886,30 +299,28 @@ def update_ui_text(language):
|
|
| 886 |
"""بهروزرسانی متنهای رابط کاربری"""
|
| 887 |
if language == 'English':
|
| 888 |
return {
|
| 889 |
-
'title': 'Business
|
| 890 |
'step1': 'Input Text & Settings',
|
| 891 |
'step2': 'Anonymized Text',
|
| 892 |
'step3': 'Raw ChatGPT Response',
|
| 893 |
'step4': 'Final Restored Response',
|
| 894 |
-
'input_placeholder': 'Enter your business text here...\nExample: Company
|
| 895 |
-
'process_btn': 'Process with
|
| 896 |
'clear_btn': 'Clear All',
|
| 897 |
-
'mapping_btn': 'Show
|
| 898 |
-
'copy_btn': 'Copy',
|
| 899 |
'direction': 'ltr'
|
| 900 |
}
|
| 901 |
else:
|
| 902 |
return {
|
| 903 |
-
'title': 'سیستم ناشناسسازی
|
| 904 |
'step1': 'متن ورودی و تنظیمات',
|
| 905 |
'step2': 'متن ناشناسشده',
|
| 906 |
'step3': 'پاسخ خام ChatGPT',
|
| 907 |
'step4': 'پاسخ نهایی بازگردانده شده',
|
| 908 |
-
'input_placeholder': 'متن تجاری خود را اینجا وارد کنید...\nمثال:
|
| 909 |
-
'process_btn': 'پردازش با تشخیص
|
| 910 |
'clear_btn': 'پاک کردن همه',
|
| 911 |
-
'mapping_btn': 'نمایش جدول نگاشت
|
| 912 |
-
'copy_btn': 'کپی',
|
| 913 |
'direction': 'rtl'
|
| 914 |
}
|
| 915 |
|
|
@@ -918,7 +329,6 @@ def update_interface(language):
|
|
| 918 |
ui_text = update_ui_text(language)
|
| 919 |
is_english = (language == 'English')
|
| 920 |
|
| 921 |
-
# تغییر direction برای workflow
|
| 922 |
workflow_css = "workflow ltr" if is_english else "workflow rtl"
|
| 923 |
|
| 924 |
return [
|
|
@@ -940,9 +350,9 @@ def update_interface(language):
|
|
| 940 |
]
|
| 941 |
|
| 942 |
# ایجاد instance
|
| 943 |
-
anonymizer =
|
| 944 |
|
| 945 |
-
# CSS اصلاح شده
|
| 946 |
custom_css = """
|
| 947 |
body, .gradio-container {
|
| 948 |
font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
|
|
@@ -1100,38 +510,10 @@ h1 {
|
|
| 1100 |
height: 300px !important;
|
| 1101 |
}
|
| 1102 |
}
|
| 1103 |
-
|
| 1104 |
-
[data-testid="textbox"]:dir(rtl) {
|
| 1105 |
-
text-align: right !important;
|
| 1106 |
-
direction: rtl !important;
|
| 1107 |
-
}
|
| 1108 |
-
|
| 1109 |
-
[data-testid="textbox"]:dir(ltr) {
|
| 1110 |
-
text-align: left !important;
|
| 1111 |
-
direction: ltr !important;
|
| 1112 |
-
}
|
| 1113 |
-
|
| 1114 |
-
.gradio-container .gradio-column {
|
| 1115 |
-
align-self: start !important;
|
| 1116 |
-
vertical-align: top !important;
|
| 1117 |
-
}
|
| 1118 |
-
|
| 1119 |
-
.gradio-container .gradio-row {
|
| 1120 |
-
align-items: flex-start !important;
|
| 1121 |
-
}
|
| 1122 |
-
|
| 1123 |
-
* {
|
| 1124 |
-
box-sizing: border-box !important;
|
| 1125 |
-
}
|
| 1126 |
-
|
| 1127 |
-
.gradio-container {
|
| 1128 |
-
align-items: start !important;
|
| 1129 |
-
justify-content: start !important;
|
| 1130 |
-
}
|
| 1131 |
"""
|
| 1132 |
|
| 1133 |
-
# رابط کاربری Gradio
|
| 1134 |
-
with gr.Blocks(title="📊
|
| 1135 |
|
| 1136 |
with gr.Row():
|
| 1137 |
language_selector = gr.Radio(
|
|
@@ -1142,7 +524,7 @@ with gr.Blocks(title="📊 Business-Focused Anonymization System", theme=gr.them
|
|
| 1142 |
)
|
| 1143 |
|
| 1144 |
with gr.Column():
|
| 1145 |
-
title = gr.HTML("<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 سیستم ناشناسسازی
|
| 1146 |
|
| 1147 |
with gr.Row(elem_classes="workflow rtl") as workflow_row:
|
| 1148 |
with gr.Column(elem_classes="workflow-column"):
|
|
@@ -1150,12 +532,12 @@ with gr.Blocks(title="📊 Business-Focused Anonymization System", theme=gr.them
|
|
| 1150 |
|
| 1151 |
input_text = gr.Textbox(
|
| 1152 |
lines=15,
|
| 1153 |
-
placeholder="متن تجاری خود را اینجا وارد کنید...\n
|
| 1154 |
label="",
|
| 1155 |
rtl=True
|
| 1156 |
)
|
| 1157 |
|
| 1158 |
-
process_btn = gr.Button("🚀 پردازش با تشخیص
|
| 1159 |
clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")
|
| 1160 |
|
| 1161 |
status = gr.Textbox(
|
|
@@ -1201,8 +583,8 @@ with gr.Blocks(title="📊 Business-Focused Anonymization System", theme=gr.them
|
|
| 1201 |
|
| 1202 |
with gr.Row():
|
| 1203 |
with gr.Column():
|
| 1204 |
-
mapping_title = gr.HTML('<h2>🗂️ جدول نگاشت
|
| 1205 |
-
mapping_btn = gr.Button("📋 نمایش جدول نگاشت
|
| 1206 |
|
| 1207 |
mapping_output = gr.Textbox(
|
| 1208 |
lines=10,
|
|
@@ -1244,4 +626,4 @@ with gr.Blocks(title="📊 Business-Focused Anonymization System", theme=gr.them
|
|
| 1244 |
)
|
| 1245 |
|
| 1246 |
if __name__ == "__main__":
|
| 1247 |
-
app.launch(
|
|
|
|
| 68 |
logger.warning(f"⚠️ Auto-setup encountered an issue: {e}")
|
| 69 |
logger.info("ℹ️ Continuing with manual setup...")
|
| 70 |
|
| 71 |
+
class SimpleAnonymizer:
|
| 72 |
def __init__(self):
|
| 73 |
self.mapping_table = {}
|
|
|
|
| 74 |
self.counters = {
|
| 75 |
+
'company': 0,
|
| 76 |
+
'person': 0,
|
| 77 |
+
'amount': 0,
|
| 78 |
+
'percent': 0
|
| 79 |
}
|
|
|
|
| 80 |
self.api_key = os.getenv("OPENAI_API_KEY", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
def anonymize_text(self, original_text, lang='fa'):
|
| 83 |
+
"""ناشناسسازی ساده و دقیق"""
|
| 84 |
try:
|
| 85 |
if not original_text or not original_text.strip():
|
| 86 |
return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
|
|
|
|
| 90 |
self.counters = {key: 0 for key in self.counters.keys()}
|
| 91 |
|
| 92 |
anonymized = original_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
# الگوهای ساده و دقیق
|
| 95 |
+
patterns = [
|
| 96 |
+
# شرکتها - فقط نامهای کامل شرکتها
|
| 97 |
+
(r'ایران\s+خودرو', 'company'),
|
| 98 |
+
(r'سایپا', 'company'),
|
| 99 |
+
(r'بانک\s+[آ-ی]+(?:\s+[آ-ی]+)?', 'company'),
|
| 100 |
+
(r'شرکت\s+[آ-ی]+(?:\s+[آ-ی]+)*', 'company'),
|
| 101 |
+
(r'گروه\s+[آ-ی]+(?:\s+[آ-ی]+)*', 'company'),
|
| 102 |
+
(r'موسسه\s+[آ-ی]+(?:\s+[آ-ی]+)*', 'company'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
+
# مبالغ مالی - فقط مبالغ کامل
|
| 105 |
+
(r'\d+\s*هزار\s*(?:و\s*)?\d*\s*(?:میلیارد|میلیون)\s*(?:ریال|تومان)', 'amount'),
|
| 106 |
+
(r'\d+(?:,\d{3})*\s*(?:میلیارد|میلیون|هزار)\s*(?:ریال|تومان)', 'amount'),
|
| 107 |
+
(r'\d+(?:\.\d+)?\s*(?:میلیارد|میلیون|هزار)\s*(?:ریال|تومان|همت)', 'amount'),
|
| 108 |
+
(r'\d+\s*همت', 'amount'),
|
| 109 |
+
(r'\d+\s*میلیون\s*تومان', 'amount'),
|
| 110 |
+
(r'بیش\s+از\s+\d+\s*همت', 'amount'),
|
| 111 |
+
(r'حدود\s+\d+\s*میلیون\s*تومان', 'amount'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
# درصدها - فقط درصدهای کامل
|
| 114 |
+
(r'\d+(?:\.\d+)?\s*درصد', 'percent'),
|
| 115 |
+
(r'\d+\s*٪', 'percent'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
# نام اشخاص - فقط با القاب یا عناوین مشخص
|
| 118 |
+
(r'(?:آقای|خانم|مهندس|دکتر)\s+[آ-ی]+(?:\s+[آ-ی]+)+', 'person'),
|
| 119 |
+
(r'[آ-ی]+\s+[آ-ی]+\s+مدیرعامل', 'person'),
|
| 120 |
+
(r'مدیرعامل\s+[آ-ی]+(?:\s+[آ-ی]+)+', 'person'),
|
| 121 |
+
]
|
| 122 |
+
|
| 123 |
+
# پردازش الگوها به ترتیب از طولانیترین به کوتاهترین
|
| 124 |
+
for pattern, category in patterns:
|
| 125 |
+
matches = list(re.finditer(pattern, anonymized, re.IGNORECASE))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
+
# مرتبسازی matches بر اساس طول (طولانیترین اول)
|
| 128 |
+
matches.sort(key=lambda x: len(x.group(0)), reverse=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
+
for match in matches:
|
| 131 |
+
matched_text = match.group(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
+
# بررسی که قبلاً جایگزین نشده باشد
|
| 134 |
+
if matched_text in anonymized and matched_text not in self.mapping_table:
|
| 135 |
+
self.counters[category] += 1
|
| 136 |
+
code = f"{category}-{self.counters[category]}"
|
| 137 |
+
self.mapping_table[matched_text] = code
|
| 138 |
+
anonymized = anonymized.replace(matched_text, code)
|
| 139 |
+
logger.info(f"Replaced: {matched_text} -> {code}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
logger.info(f"✅ Anonymization completed. Found {len(self.mapping_table)} entities.")
|
| 142 |
return anonymized
|
|
|
|
| 145 |
return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در ناشناسسازی: {str(e)}"
|
| 146 |
|
| 147 |
def send_to_chatgpt(self, anonymized_text, lang='fa'):
|
| 148 |
+
"""ارسال به ChatGPT"""
|
| 149 |
try:
|
| 150 |
if not anonymized_text or not anonymized_text.strip():
|
| 151 |
return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناسشده خالی است!"
|
|
|
|
| 195 |
return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"
|
| 196 |
|
| 197 |
def deanonymize_response(self, gpt_response, lang='fa'):
|
| 198 |
+
"""بازگردانی"""
|
| 199 |
try:
|
| 200 |
if not gpt_response or not gpt_response.strip():
|
| 201 |
return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!"
|
|
|
|
| 206 |
final_result = gpt_response
|
| 207 |
reverse_mapping = {code: original for original, code in self.mapping_table.items()}
|
| 208 |
|
| 209 |
+
# جایگزینی از طولانیترین کد اول
|
| 210 |
sorted_codes = sorted(reverse_mapping.items(), key=lambda x: len(x[0]), reverse=True)
|
| 211 |
for code, original in sorted_codes:
|
| 212 |
final_result = final_result.replace(code, original)
|
|
|
|
|
|
|
| 213 |
|
| 214 |
return final_result
|
| 215 |
|
| 216 |
except Exception as e:
|
| 217 |
return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}"
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
def process_all_steps(input_text, language):
|
| 220 |
"""پردازش خودکار تمام مراحل"""
|
| 221 |
lang = 'en' if language == 'English' else 'fa'
|
|
|
|
| 234 |
gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
|
| 235 |
if gpt_response.startswith("❌"):
|
| 236 |
entities_found = len(anonymizer.mapping_table)
|
| 237 |
+
success_msg = (f"✅ Anonymization completed!\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
f"📊 Total: {entities_found} entities protected")
|
| 239 |
return success_msg, anonymized_text, gpt_response, ""
|
| 240 |
|
|
|
|
| 242 |
|
| 243 |
total_time = time.time() - start_time
|
| 244 |
entities_found = len(anonymizer.mapping_table)
|
|
|
|
|
|
|
| 245 |
|
| 246 |
# آمار تفصیلی
|
| 247 |
+
company_count = anonymizer.counters['company']
|
| 248 |
+
amount_count = anonymizer.counters['amount']
|
| 249 |
+
percent_count = anonymizer.counters['percent']
|
| 250 |
+
person_count = anonymizer.counters['person']
|
| 251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
success_msg = (f"🎉 Complete anonymization & restoration successful!\n"
|
| 253 |
+
f"🏢 Companies: {company_count} | 💰 Amounts: {amount_count} | 📊 Percentages: {percent_count} | 👤 Persons: {person_count}\n"
|
|
|
|
| 254 |
f"📊 Total: {entities_found} entities | ⏱️ Time: {total_time:.2f}s")
|
| 255 |
|
| 256 |
return success_msg, anonymized_text, gpt_response, final_result
|
|
|
|
| 266 |
if not anonymizer.mapping_table:
|
| 267 |
return "❌ Mapping table is empty! Please process some text first." if lang == 'en' else "❌ جدول نگاشت خالی است! ابتدا متنی را پردازش کنید."
|
| 268 |
|
| 269 |
+
result = "📋 **Simple Mapping Table:**\n\n" if lang == 'en' else "📋 **جدول نگاشت ساده:**\n\n"
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
+
# گروهبندی بر اساس نوع
|
| 272 |
+
categories = {
|
| 273 |
+
'company': '🏢 **Companies**',
|
| 274 |
+
'amount': '💰 **Amounts**',
|
| 275 |
+
'percent': '📊 **Percentages**',
|
| 276 |
+
'person': '👤 **Persons**'
|
|
|
|
| 277 |
}
|
| 278 |
|
| 279 |
+
for category, title in categories.items():
|
| 280 |
+
category_items = {k: v for k, v in anonymizer.mapping_table.items() if v.startswith(category)}
|
|
|
|
| 281 |
if category_items:
|
|
|
|
| 282 |
result += f"{title}:\n"
|
| 283 |
+
for original, code in category_items.items():
|
| 284 |
result += f" • `{original}` → `{code}`\n"
|
|
|
|
|
|
|
| 285 |
result += "\n"
|
| 286 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
# آمار کلی
|
| 288 |
+
result += f"📊 **Summary**: {len(anonymizer.mapping_table)} total entities anonymized\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
|
| 290 |
return result
|
| 291 |
|
|
|
|
| 299 |
"""بهروزرسانی متنهای رابط کاربری"""
|
| 300 |
if language == 'English':
|
| 301 |
return {
|
| 302 |
+
'title': 'Simple Business Data Anonymization System',
|
| 303 |
'step1': 'Input Text & Settings',
|
| 304 |
'step2': 'Anonymized Text',
|
| 305 |
'step3': 'Raw ChatGPT Response',
|
| 306 |
'step4': 'Final Restored Response',
|
| 307 |
+
'input_placeholder': 'Enter your business text here...\nExample: Company names, financial amounts, percentages, executive names...',
|
| 308 |
+
'process_btn': 'Process with Simple Detection',
|
| 309 |
'clear_btn': 'Clear All',
|
| 310 |
+
'mapping_btn': 'Show Simple Mapping Table',
|
|
|
|
| 311 |
'direction': 'ltr'
|
| 312 |
}
|
| 313 |
else:
|
| 314 |
return {
|
| 315 |
+
'title': 'سیستم ناشناسسازی ساده اطلاعات تجاری',
|
| 316 |
'step1': 'متن ورودی و تنظیمات',
|
| 317 |
'step2': 'متن ناشناسشده',
|
| 318 |
'step3': 'پاسخ خام ChatGPT',
|
| 319 |
'step4': 'پاسخ نهایی بازگردانده شده',
|
| 320 |
+
'input_placeholder': 'متن تجاری خود را اینجا وارد کنید...\nمثال: نام شرکتها، مبالغ مالی، درصدها، نام مدیران...',
|
| 321 |
+
'process_btn': 'پردازش با تشخیص ساده',
|
| 322 |
'clear_btn': 'پاک کردن همه',
|
| 323 |
+
'mapping_btn': 'نمایش جدول نگاشت ساده',
|
|
|
|
| 324 |
'direction': 'rtl'
|
| 325 |
}
|
| 326 |
|
|
|
|
| 329 |
ui_text = update_ui_text(language)
|
| 330 |
is_english = (language == 'English')
|
| 331 |
|
|
|
|
| 332 |
workflow_css = "workflow ltr" if is_english else "workflow rtl"
|
| 333 |
|
| 334 |
return [
|
|
|
|
| 350 |
]
|
| 351 |
|
| 352 |
# ایجاد instance
|
| 353 |
+
anonymizer = SimpleAnonymizer()
|
| 354 |
|
| 355 |
+
# CSS اصلاح شده
|
| 356 |
custom_css = """
|
| 357 |
body, .gradio-container {
|
| 358 |
font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
|
|
|
|
| 510 |
height: 300px !important;
|
| 511 |
}
|
| 512 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
"""
|
| 514 |
|
| 515 |
+
# رابط کاربری Gradio
|
| 516 |
+
with gr.Blocks(title="📊 Simple Anonymization System", theme=gr.themes.Soft(), css=custom_css) as app:
|
| 517 |
|
| 518 |
with gr.Row():
|
| 519 |
language_selector = gr.Radio(
|
|
|
|
| 524 |
)
|
| 525 |
|
| 526 |
with gr.Column():
|
| 527 |
+
title = gr.HTML("<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 سیستم ناشناسسازی ساده اطلاعات تجاری</h1>")
|
| 528 |
|
| 529 |
with gr.Row(elem_classes="workflow rtl") as workflow_row:
|
| 530 |
with gr.Column(elem_classes="workflow-column"):
|
|
|
|
| 532 |
|
| 533 |
input_text = gr.Textbox(
|
| 534 |
lines=15,
|
| 535 |
+
placeholder="متن تجاری خود را اینجا وارد کنید...\nمثال: نام شرکتها، مبالغ مالی، درصدها، نام مدیران...",
|
| 536 |
label="",
|
| 537 |
rtl=True
|
| 538 |
)
|
| 539 |
|
| 540 |
+
process_btn = gr.Button("🚀 پردازش با تشخیص ساده", variant="primary")
|
| 541 |
clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")
|
| 542 |
|
| 543 |
status = gr.Textbox(
|
|
|
|
| 583 |
|
| 584 |
with gr.Row():
|
| 585 |
with gr.Column():
|
| 586 |
+
mapping_title = gr.HTML('<h2>🗂️ جدول نگاشت ساده</h2>')
|
| 587 |
+
mapping_btn = gr.Button("📋 نمایش جدول نگاشت ساده")
|
| 588 |
|
| 589 |
mapping_output = gr.Textbox(
|
| 590 |
lines=10,
|
|
|
|
| 626 |
)
|
| 627 |
|
| 628 |
if __name__ == "__main__":
|
| 629 |
+
app.launch()
|