|
|
import gradio as gr |
|
|
import re |
|
|
import os |
|
|
import requests |
|
|
import time |
|
|
import logging |
|
|
from packaging import version |
|
|
|
|
|
|
|
|
# Configure the root logger to emit INFO and above, then create the
# module-level logger used by every function and method in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
|
def auto_setup_models():
    """Download any required NER model that is missing from ``./models``.

    A model counts as present when its directory exists and is non-empty.
    Missing models are fetched with ``transformers`` and saved locally. When a
    download fails, the partially written directory is removed so a later run
    retries it instead of treating the leftovers as a valid model.

    Returns:
        bool: True when every required model is available locally afterwards.
        False when ``transformers`` cannot be imported, when at least one
        download failed, or when setup aborted with an unexpected error.
    """
    import shutil

    models_dir = "./models"
    required_models = {
        'bert-fa-ner': 'HooshvareLab/bert-fa-zwnj-base-ner',
        'bert-base-NER': 'dslim/bert-base-NER',
    }

    missing_models = []
    for model_name in required_models:
        model_path = os.path.join(models_dir, model_name)
        if not os.path.exists(model_path) or not os.listdir(model_path):
            missing_models.append(model_name)

    if not missing_models:
        logger.info("✅ All models are already available")
        return True

    logger.info(f"📥 Auto-downloading missing models: {missing_models}")

    try:
        from transformers import AutoTokenizer, AutoModelForTokenClassification
        os.makedirs(models_dir, exist_ok=True)

        failed_models = []
        for model_name in missing_models:
            hf_repo = required_models[model_name]
            model_path = os.path.join(models_dir, model_name)
            logger.info(f"📥 Downloading {model_name} from {hf_repo}...")
            try:
                tokenizer = AutoTokenizer.from_pretrained(hf_repo)
                model = AutoModelForTokenClassification.from_pretrained(hf_repo)
                tokenizer.save_pretrained(model_path)
                model.save_pretrained(model_path)
                logger.info(f"✅ {model_name} downloaded successfully")
                # Release the weights before fetching the next model.
                del tokenizer, model
            except Exception as e:
                logger.error(f"❌ Failed to download {model_name}: {e}")
                failed_models.append(model_name)
                # Best-effort cleanup: a failing rmtree must not abort the
                # remaining downloads.
                shutil.rmtree(model_path, ignore_errors=True)

        if failed_models:
            # Do not report success when some models are still unavailable.
            logger.warning(f"⚠️ Auto-setup finished with failed downloads: {failed_models}")
            return False

        logger.info("🎉 Auto-setup completed!")
        return True

    except ImportError:
        logger.error("❌ transformers library not available for auto-download")
        return False
    except Exception as e:
        logger.error(f"❌ Auto-setup failed: {e}")
        return False
|
|
|
|
|
|
|
|
# Best-effort model bootstrap at import time: any failure is logged and the
# module keeps loading.
try:
    auto_setup_models()
except Exception as setup_error:
    logger.warning(f"⚠️ Auto-setup encountered an issue: {setup_error}")
    logger.info("ℹ️ Continuing with manual setup...")
|
|
|
|
|
class ComprehensiveBilingualDataAnonymizer: |
|
|
def __init__(self):
    """Set up the category table, per-pattern counters and local NER models."""
    self.mapping_table = {}

    # (key, Persian name, English name, pattern types, icon) per category.
    category_rows = (
        ('personal_identity', 'اطلاعات شخصی و هویتی', 'Personal & Identity Information',
         ['PERSON', 'MIXED_NAMES', 'ID_NUMBER', 'ENGLISH_TITLES'], '👤'),
        ('financial', 'اطلاعات مالی', 'Financial Information',
         ['AMOUNT', 'INTERNATIONAL_CURRENCIES', 'ACCOUNT', 'FINANCIAL_TERMS', 'STOCK_SYMBOL'], '💰'),
        ('temporal', 'اطلاعات زمانی', 'Temporal Information',
         ['DATE', 'ADVANCED_DATE_FORMATS', 'TIME_RANGES'], '📅'),
        ('location', 'اطلاعات مکانی', 'Location Information',
         ['LOCATION', 'COMPLEX_ADDRESSES'], '📍'),
        ('technical', 'اطلاعات فنی و تکنولوژیکی', 'Technical & Technological',
         ['TECHNICAL_CODES', 'NETWORK_ADDRESSES', 'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS'], '⚙️'),
        ('business', 'اطلاعات کسبوکار', 'Business Information',
         ['COMPANY', 'BUSINESS_TERMS', 'PRODUCT', 'PETROCHEMICAL'], '🏢'),
        ('quantity', 'اطلاعات کمیت و واحد', 'Quantity & Unit Information',
         ['PERCENTAGE', 'VOLUME', 'RATIOS'], '📊'),
        ('communication', 'اطلاعات ارتباطی', 'Communication Information',
         ['PHONE', 'EMAIL'], '📞'),
    )
    self.pattern_categories = {
        cat_key: {
            'name_fa': fa_name,
            'name_en': en_name,
            'patterns': pattern_types,
            'icon': icon,
        }
        for cat_key, fa_name, en_name, pattern_types, icon in category_rows
    }

    # One zeroed counter per pattern type, in category order.
    self.counters = {
        pattern_type: 0
        for cat_info in self.pattern_categories.values()
        for pattern_type in cat_info['patterns']
    }

    self.api_key = os.getenv("OPENAI_API_KEY", "")
    self.models_base_path = "./models"
    self.models_loaded = False
    self.model_status = {}
    # Must run last: it reads models_base_path and writes model_status.
    self.load_local_ner_models()
|
|
|
|
|
def get_category_choices(self, language='fa'):
    """Return the checkbox labels ("<icon> <name>") for every category.

    The Persian name is used when ``language`` is ``'fa'``; any other value
    selects the English name.
    """
    name_field = 'name_fa' if language == 'fa' else 'name_en'
    return [
        f"{details['icon']} {details[name_field]}"
        for details in self.pattern_categories.values()
    ]
|
|
|
|
|
def get_selected_patterns(self, selected_categories, language='fa'):
    """Translate selected category labels into a flat list of pattern types.

    Args:
        selected_categories: Collection of "<icon> <name>" labels as produced
            by ``get_category_choices``. ``None`` or empty selects nothing.
        language: Kept for interface compatibility. Labels are matched in
            both Persian and English, so a mismatch between the language the
            labels were rendered in and the language passed here no longer
            yields an empty pattern list.

    Returns:
        list: Pattern type names of every selected category, in category order.
    """
    if not selected_categories:
        return []

    selected_patterns = []
    for cat_info in self.pattern_categories.values():
        icon = cat_info['icon']
        labels = (f"{icon} {cat_info['name_fa']}", f"{icon} {cat_info['name_en']}")
        if any(label in selected_categories for label in labels):
            selected_patterns.extend(cat_info['patterns'])
    return selected_patterns
|
|
|
|
|
def ensure_models_directory(self):
    """Make sure ``self.models_base_path`` exists.

    Returns True when the directory already exists or was created, and False
    when creating it raised an exception (the error is logged).
    """
    if os.path.exists(self.models_base_path):
        return True

    try:
        os.makedirs(self.models_base_path, exist_ok=True)
        logger.info(f"📁 Created models directory: {self.models_base_path}")
    except Exception as creation_error:
        logger.error(f"❌ Failed to create models directory: {creation_error}")
        return False
    return True
|
|
|
|
|
def download_model_if_missing(self, local_name, hf_repo):
    """Download ``hf_repo`` into ``<models_base_path>/<local_name>`` if absent.

    Args:
        local_name: Sub-directory name under ``self.models_base_path``.
        hf_repo: Repository identifier passed to ``from_pretrained``.

    Returns:
        tuple: ``(True, message)`` when the model directory already holds
        files or the download succeeded; ``(False, error_text)`` on failure.
    """
    model_path = os.path.join(self.models_base_path, local_name)
    # isdir (not exists) so a stray regular file at this path cannot make
    # os.listdir raise.
    if os.path.isdir(model_path) and os.listdir(model_path):
        return True, f"Model {local_name} already exists"
    try:
        logger.info(f"📥 Auto-downloading {local_name} from {hf_repo}...")
        from transformers import AutoTokenizer, AutoModelForTokenClassification
        tokenizer = AutoTokenizer.from_pretrained(hf_repo)
        model = AutoModelForTokenClassification.from_pretrained(hf_repo)
        tokenizer.save_pretrained(model_path)
        model.save_pretrained(model_path)
        logger.info(f"✅ {local_name} auto-downloaded successfully")
        return True, f"Downloaded {local_name}"
    except Exception as e:
        logger.error(f"❌ Auto-download failed for {local_name}: {e}")
        # Remove partial output: a half-written directory would otherwise be
        # reported as "already exists" on the next call and never re-fetched.
        import shutil
        shutil.rmtree(model_path, ignore_errors=True)
        return False, str(e)
|
|
|
|
|
def _load_pipeline(self, task, model_path, tokenizer_path=None):
    """Build a CPU ``transformers`` pipeline from local files only.

    The tokenizer is read from ``tokenizer_path`` when given, otherwise from
    ``model_path``. ``aggregation_strategy="simple"`` is passed only when the
    installed transformers version is at least 4.11.0.

    Returns the pipeline, or None when anything fails (the error is logged).
    """
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, __version__ as tr_version

        supports_agg = version.parse(tr_version) >= version.parse("4.11.0")

        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path or model_path, local_files_only=True
        )
        model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)

        # device=-1 keeps inference on the CPU.
        optional_kwargs = {"aggregation_strategy": "simple"} if supports_agg else {}
        return pipeline(task, model=model, tokenizer=tokenizer, device=-1, **optional_kwargs)

    except Exception as load_error:
        logger.error(f"❌ Failed to load pipeline for {model_path}: {load_error}")
        return None
|
|
|
|
|
def load_local_ner_models(self):
    """Load the Persian and English NER pipelines from the models directory.

    Each model is downloaded first when missing, then loaded from disk.
    Results are recorded in ``self.model_status`` (keys ``'persian'`` and
    ``'english'``, plus ``'directory'``, ``'transformers'``, ``'fallback'``
    or ``'critical'`` for the corresponding failure modes).
    ``self.models_loaded`` is True when at least one status starts with "✅".

    ``self.persian_ner`` and ``self.english_ner`` are always defined once
    this method has run (a pipeline or None), including on early exits.
    """
    logger.info("📄 Loading local NER models with auto-download...")

    # Define both attributes before any early return so later code can read
    # them without hasattr() guards.
    self.persian_ner = None
    self.english_ner = None

    if not self.ensure_models_directory():
        self.models_loaded = False
        self.model_status['directory'] = "❌ Cannot create models directory"
        return

    try:
        try:
            # Imported only to verify the runtime dependencies are present.
            import torch  # noqa: F401
            from transformers import AutoTokenizer, AutoModelForTokenClassification  # noqa: F401
            logger.info("✅ Transformers library available")
        except ImportError as e:
            self.model_status['transformers'] = f"❌ Transformers library not installed: {str(e)}"
            self.models_loaded = False
            return

        # (status key, display name, attribute, local directory, hub repo)
        model_specs = (
            ('persian', 'Persian', 'persian_ner', 'bert-fa-ner', 'HooshvareLab/bert-fa-zwnj-base-ner'),
            ('english', 'English', 'english_ner', 'bert-base-NER', 'dslim/bert-base-NER'),
        )
        for status_key, display_name, attr_name, local_name, hf_repo in model_specs:
            model_path = os.path.join(self.models_base_path, local_name)
            self.download_model_if_missing(local_name, hf_repo)

            if not (os.path.exists(model_path) and os.listdir(model_path)):
                setattr(self, attr_name, None)
                self.model_status[status_key] = f"❌ {display_name} model not found: {model_path}"
                continue

            try:
                ner_pipeline = self._load_pipeline("ner", model_path)
                setattr(self, attr_name, ner_pipeline)
                if ner_pipeline:
                    self.model_status[status_key] = f"✅ Local {display_name} NER: {model_path}"
                else:
                    self.model_status[status_key] = f"❌ Failed to load {display_name} model: {model_path}"
            except Exception as e:
                setattr(self, attr_name, None)
                self.model_status[status_key] = f"❌ {display_name} model loading error: {str(e)[:100]}"

        loaded_models = sum(1 for status in self.model_status.values() if status.startswith("✅"))
        self.models_loaded = loaded_models > 0
        if loaded_models == 0:
            self.model_status['fallback'] = "⚠️ Using regex-only mode (no local models found)"

    except Exception as e:
        self.models_loaded = False
        self.model_status['critical'] = f"❌ Critical error: {str(e)[:100]}..."
|
|
|
|
|
def detect_language(self, text):
    """Classify ``text`` as ``'fa'``, ``'en'`` or ``'mixed'``.

    Counts characters in U+0600–U+06FF against ASCII letters. A share above
    60% decides the language; otherwise the text is ``'mixed'``. Empty input
    or input with neither kind of character is reported as ``'fa'``.
    """
    if not text:
        return 'fa'

    fa_count = sum(1 for _ in re.finditer(r'[\u0600-\u06FF]', text))
    en_count = sum(1 for _ in re.finditer(r'[a-zA-Z]', text))
    letter_count = fa_count + en_count

    if letter_count == 0:
        return 'fa'
    if fa_count / letter_count > 0.6:
        return 'fa'
    if en_count / letter_count > 0.6:
        return 'en'
    return 'mixed'
|
|
|
|
|
def extract_entities_with_ner(self, text, lang='fa'):
    """Run the local NER pipelines over ``text`` and return unique entities.

    The Persian pipeline runs for ``lang`` in ``('fa', 'mixed')`` and the
    English one for ``('en', 'mixed')``, each only when it is loaded. Every
    result dict is normalised to the keys ``text``, ``label``, ``start``,
    ``end``, ``confidence`` and ``source``; the label comes from
    ``entity_group`` when present, otherwise from ``entity``. Duplicates are
    dropped by (lower-cased text, start, end). Returns an empty list when no
    models are loaded.
    """
    collected = []

    if not self.models_loaded:
        logger.info("ℹ️ Local NER models not available - using regex only")
        return collected

    # (languages served, pipeline attribute, source tag, name used in logs)
    backends = (
        (('fa', 'mixed'), 'persian_ner', 'local_persian_ner', 'Persian'),
        (('en', 'mixed'), 'english_ner', 'local_english_ner', 'English'),
    )

    try:
        for served_langs, attr_name, source_tag, display_name in backends:
            ner_pipeline = getattr(self, attr_name, None)
            if lang not in served_langs or not ner_pipeline:
                continue
            try:
                raw_results = ner_pipeline(text)
                for hit in raw_results:
                    if not isinstance(hit, dict):
                        continue
                    # Aggregated output uses 'entity_group'; raw token output uses 'entity'.
                    label_field = 'entity_group' if 'entity_group' in hit else 'entity'
                    collected.append({
                        'text': hit['word'].strip(),
                        'label': hit[label_field],
                        'start': hit['start'],
                        'end': hit['end'],
                        'confidence': hit['score'],
                        'source': source_tag
                    })
                logger.info(f"Local {display_name} NER found {len(raw_results)} entities")
            except Exception as backend_error:
                logger.error(f"Local {display_name} NER extraction error: {backend_error}")
    except Exception as general_error:
        logger.error(f"Local NER extraction general error: {general_error}")

    unique_entities = []
    seen_keys = set()
    for candidate in collected:
        dedupe_key = (candidate['text'].lower(), candidate['start'], candidate['end'])
        if dedupe_key in seen_keys:
            continue
        seen_keys.add(dedupe_key)
        unique_entities.append(candidate)

    logger.info(f"Total unique entities found by local models: {len(unique_entities)}")
    return unique_entities
|
|
|
|
|
def map_ner_to_categories(self, ner_label, source=''):
    """Map an NER label to one of this class's pattern-type names.

    Args:
        ner_label: Label emitted by an NER pipeline, with or without a
            tagging-scheme prefix such as ``B-`` or ``I-``. Case-insensitive.
        source: Unused; kept for interface compatibility.

    Returns:
        str: The pattern-type name, or ``'BUSINESS_TERMS'`` for any label
        that is unknown, empty or None.
    """
    mapping = {
        'PER': 'PERSON', 'PERSON': 'PERSON',
        'ORG': 'COMPANY', 'ORGANIZATION': 'COMPANY',
        'LOC': 'LOCATION', 'LOCATION': 'LOCATION',
        'MISC': 'BUSINESS_TERMS', 'MISCELLANEOUS': 'BUSINESS_TERMS',
        'MONEY': 'AMOUNT', 'PERCENT': 'PERCENTAGE',
        'DATE': 'DATE', 'TIME': 'DATE',
        # Short labels used by the Persian model bundled with this app
        # (HooshvareLab/bert-fa-zwnj-base-ner); previously these all fell
        # through to BUSINESS_TERMS.
        'DAT': 'DATE', 'TIM': 'DATE',
        'MON': 'AMOUNT', 'PCT': 'PERCENTAGE',
        'PRO': 'PRODUCT',
    }

    label = str(ner_label or '').strip().upper()
    # Drop a one-letter tagging-scheme prefix (B-, I-, E-, S-, L-, U-) so the
    # table only needs the bare entity type.
    if len(label) > 2 and label[1] == '-' and label[0] in 'BIESLU':
        label = label[2:]
    return mapping.get(label, 'BUSINESS_TERMS')
|
|
|
|
|
def get_comprehensive_patterns(self):
    """Return the regex patterns used for anonymization, keyed by pattern type.

    The result maps each pattern-type name (the same names used as counter
    keys) to a list of raw regex strings. The original docstring describes
    the table as based on 221 categorized patterns; that count is not
    verified here. A fresh dict is built on every call.
    """
    return {
        # ---- Personal & identity ----
        'PERSON': [
            # Persian honorific followed by one or more name words
            r'آقای\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'خانم\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'مهندس\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'دکتر\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            r'استاد\s+([آ-یa-zA-Z]+(?:\s+[آ-یa-zA-Z]+)*)',
            # Two words followed by a role word, and bare role words
            r'([آ-یa-zA-Z]+\s+[آ-یa-zA-Z]+)(?:، مدیرعامل|\s+مدیرعامل|\s+رئیس)',
            r'مدیرعامل(?=\s|$|،|\.)',
            r'سرپرست(?=\s+و|\s|$|،|\.)',
            r'رئیس\s+هیأتمدیره',
            # Pronoun followed by a reporting verb
            r'وی(?=\s+ادامه|\s+اظهار|\s+گفت|\s+اعلام|\s+همچنین)',
            # English honorifics
            r'Mr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'Ms\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            r'Dr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
            # Two words (3+ letters each) followed by a reporting verb
            r'([آ-یa-zA-Z]{3,}\s+[آ-یa-zA-Z]{3,})(?=\s+گفت|\s+اظهار|\s+اعلام)'
        ],

        'MIXED_NAMES': [
            # Persian/Latin mixtures
            r'([آ-ی]+[a-zA-Z\s]+[آ-ی]+)',
            r'Dr\.\s+([آ-یa-zA-Z\s]+)',
            # Any two adjacent words of 2+ letters (very broad)
            r'([آ-یa-zA-Z]{2,}\s+[آ-یa-zA-Z]{2,})',
            # Hyphenated and apostrophe names
            r'([A-Z][a-z]+-[A-Z][a-z]+)',
            r"([A-Z]'[A-Z][a-z]+)",
            # Name followed by a Roman-numeral suffix
            r'([A-Z][a-z]+\s+[A-Z][a-z]+\s+[IVX]+)',
            # Latin run followed by a Persian run
            r'([a-z\s]+)\s+([آ-ی\s]+)',
            # Name followed by a second name in parentheses
            r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s*\(([A-Z][a-z]+\s+[A-Z][a-z]+)\)'
        ],

        'ID_NUMBER': [
            # "IR" + 24 digits, optionally labelled
            r'IR[۰-۹0-9]{24}',
            r'شبا[\s:]*IR[۰-۹0-9]{24}',
            r'IBAN[\s:]*IR[۰-۹0-9]{24}',
            r'شماره[\s]*شبا[\s:]*IR[۰-۹0-9]{24}',
            # 10-digit identifiers, optionally labelled
            r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
            r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}',
            r'National[\s]*(?:ID[\s:]*)?[0-9]{10}',
            # One letter + 8 digits, optionally labelled as passport
            r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}',
            r'(?:Passport[\s:]*)?[A-Z][0-9]{8}',
            # 16 digits in groups of four, optionally labelled as card
            r'(?:کارت[\s:]*)?(?:[۰-۹0-9]{4}[-\s]?){3}[۰-۹0-9]{4}',
            r'(?:Card[\s:]*)?(?:[0-9]{4}[-\s]?){3}[0-9]{4}',
            # Labelled SSN / FICO values
            r'SSN[\s:]*[0-9]{3}-[0-9]{2}-[0-9]{4}',
            r'FICO[\s]*(?:score[\s:]*)?[0-9]{3}',
            # Labelled EIN / meeting ID values
            r'EIN[\s:]*[0-9]{2}-[0-9]{7}',
            r'Meeting[\s]*ID[\s:]*[0-9]{9,11}'
        ],

        'ENGLISH_TITLES': [
            # Fixed English role and activity phrases
            r'business\s+partner',
            r'team\s+lead',
            r'head\s+of\s+production',

            r'senior\s+architect',
            r'civil\s+engineer',
            r'quantity\s+surveyor',
            r'system\s+administrator',
            r'network\s+engineer',

            r'environmental\s+consultant',
            r'HSE\s+coordinator',

            r'senior\s+loan\s+officer',
            r'investment\s+advisor',
            r'Chief\s+Financial\s+Officer',

            r'facility\s+manager',
            r'quality\s+control\s+manager',
            r'maintenance\s+window',
            r'project\s+team',
            r'technical\s+support',

            r'supervision',
            r'troubleshooting',
            r'monitoring',
            r'compliance\s+certificate'
        ],

        # ---- Financial ----
        'AMOUNT': [
            # Toman amounts with optional scale word and lead-in word
            r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
            r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان',
            r'\d+\s*تومان(?=\s+به\s+ازای|\s+فروش|،)',
            r'رقم\s+فعلی\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
            r'رقم\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان',
            r'به\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
            r'از\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
            r'برابر\s+با\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان',
            r'\d+(?:میلیارد|میلیون)\s*تومان(?=\s+رسیده|\s+ثبت|\s+بوده|،)',
            # Dollar and rial amounts
            r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?',
            r'\d+(?:,\d{3})*\s*ریال',
            # Euro amounts
            r'€\d+(?:,\d{3})*(?:\.\d+)?',
            # AED amounts
            r'\d+(?:,\d{3})*\s*AED',
            # Compact K/M/B suffixed amounts
            r'\$\d+(?:\.\d+)?[KMB]',
            r'€\d+(?:\.\d+)?[KM]'
        ],

        'INTERNATIONAL_CURRENCIES': [
            # Euro
            r'\d+(?:,\d{3})*\s+euro',
            r'€\d+(?:\.\d+)?M',
            r'\d+\s+EUR',
            # AED
            r'\d+(?:,\d{3})*\s+AED',
            r'\d+(?:\.\d+)?M\s+AED',
            # Dollar with M/K suffix
            r'\$\d+(?:\.\d+)?M',
            r'\$\d+(?:\.\d+)?K',
            # Pound
            r'£\d+(?:,\d{3})*(?:\.\d+)?',
            r'\d+\s+GBP',
            # Swiss franc
            r'\d+\s+CHF',
            # Yen
            r'¥\d+(?:,\d{3})*',
            r'\d+\s+JPY'
        ],

        'ACCOUNT': [
            # Digit runs with optional Persian account labels
            r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'شماره[\s]*حساب[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            # English label and unlabelled grouped digit formats
            r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}',
            r'[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{6,12}',
            r'[۰-۹0-9]{2,4}[-\s]?[۰-۹0-9]{6,12}[-\s]?[۰-۹0-9]{2,4}',
            # Digit runs introduced by deposit/profit words
            r'واریز[\s]*(?:سود[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}',
            r'سود[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}'
        ],

        'FINANCIAL_TERMS': [
            # Fixed Persian financial phrases
            r'فروش\s+(?:ماهانه|تجمیعی|صادراتی)',
            r'درآمد\s+شرکت',
            r'سود\s+(?:خالص|نقدی)',
            r'صورتهای\s+مالی',
            r'بهای\s+تمامشده',
            r'سودآوری',
            r'عملکرد\s+مالی',
            r'میانگین\s+فروش',
            r'بالاترین\s+رقم\s+فروش',
            r'رقم\s+فروش',
            r'درآمدهای\s+عملیاتی'
        ],

        'STOCK_SYMBOL': [
            # Symbol keyword, fixed Persian names, and company/petrochemical name captures
            r'نماد\s+([آ-یa-zA-Z0-9]+)',
            r'(سبهان|غدیر|شتران|شپنا|پترول|فارس|خارک|پلاسکو|جم|کرمان|مارون|اراک|رازی|شازند|کاوه|بندر|پارس|خوزستان|ماهشهر|عسلویه)(?=\s|$|،|\.|\s+)',
            r'شرکت\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+را|\s+به)',
            r'پتروشیمی\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+توان)',
            # Fixed Latin tickers
            r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)'
        ],

        # ---- Temporal ----
        'DATE': [
            # Numeric dates and day + Persian month + year
            r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}',
            r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}',
            r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})',
            # Persian month + year
            r'(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s+[۰-۹0-9]{4}',
            # English month dates
            r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})',
            r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}',
            # Relative Persian time expressions
            r'سال\s+گذشته',
            r'سال\s+جاری',
            r'این\s+سال',
            r'ماه\s+قبل',
            r'ماه\s+اخیر',
            r'دومین\s+ماه\s+سال',
            r'ابتدای\s+سال\s+جاری',
            r'مدت\s+مشابه\s+سال\s+گذشته',
            r'چند\s+ماهه\s+اخیر',
            # Standalone four-digit years (13xx, 14xx, 19xx, 20xx)
            r'(?:13[0-9]{2}|14[0-9]{2}|20[0-9]{2}|19[0-9]{2})(?=\s|$|،|\.)'
        ],

        'ADVANCED_DATE_FORMATS': [
            # Month + ordinal day + year
            r'(?:March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th),?\s+\d{4}',
            r'(?:January|February)\s+\d{1,2}(?:st|nd|rd|th),?\s+\d{4}',
            # ISO-style timestamp ending in Z
            r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z',
            # Time-zone names and offsets
            r'(?:PST|EST|GMT|UTC)(?:[+-]\d{1,2}:\d{2})?',
            r'Eastern\s+Time',
            r'GMT[+-]\d{1,2}:\d{2}',
            # "<ordinal> of <month> <year>"
            r'\d{1,2}(?:st|nd|rd|th)\s+of\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}',
            # "ending <month> <ordinal>"
            r'ending\s+(?:December|January|February|March|April|May|June|July|August|September|October|November)\s+\d{1,2}(?:st|nd|rd|th)',
            # Fiscal year end
            r'end\s+of\s+fiscal\s+year\s+\d{4}/\d{2}/\d{2}',
            # Mixed-language "N business days after delivery"
            r'\d{1,2}\s+(?:روز|days?)\s+(?:کاری|business)\s+پس\s+از\s+(?:delivery|تحویل)',
            # "COB <weekday>"
            r'COB\s+(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'
        ],

        'TIME_RANGES': [
            # HH:MM ranges
            r'\d{2}:\d{2}-\d{2}:\d{2}',
            r'\d{2}:\d{2}\s+تا\s+\d{2}:\d{2}',
            # "<weekday> night H:MM AM/PM to H:MM AM/PM"
            r'(?:Saturday|Sunday|Monday|Tuesday|Wednesday|Thursday|Friday)\s+night\s+\d{1,2}:\d{2}\s+(?:AM|PM)\s+to\s+\d{1,2}:\d{2}\s+(?:AM|PM)',
            # Clock time followed by a zone
            r'\d{1,2}:\d{2}\s+(?:AM|PM)\s+(?:PST|EST|GMT|UTC)',
            r'\d{1,2}:\d{2}\s+(?:AM|PM)\s+Eastern\s+Time',
            # HH:MM:SS AM/PM
            r'\d{2}:\d{2}:\d{2}\s+(?:AM|PM)',
            # "COB (Close of Business)"
            r'COB\s*\(Close\s+of\s+Business\)',
            # Durations
            r'\d{1,3}\s+(?:business\s+days|روز\s+کاری)',
            r'warranty\s+period\s+(?:دو\s+سال|\d+\s+(?:years?|سال))'
        ],

        # ---- Location ----
        'LOCATION': [
            # Fixed list of city names
            r'(تهران|اصفهان|ماهشهر|عسلویه|بندرعباس|اهواز|شیراز|مشهد|تبریز|کرج|قم|رشت|کرمان|یزد|زاهدان|بوشهر|خرمشهر|آبادان|اراک|قزوین)',
            # "province <name>" / "city <name>"
            r'استان\s+([آ-ی\s]+)',
            r'شهر\s+([آ-ی\s]+)',
            # Fixed list of country names
            r'(ایران|عراق|کویت|عربستان|امارات|قطر|عمان|بحرین|ترکیه|پاکستان|افغانستان|آذربایجان|ارمنستان|گرجستان)',
            # Domestic / foreign (market) wording
            r'داخلی|بازار\s+داخلی',
            r'خارجی|بازارهای\s+خارجی',
            # Fixed list of international cities
            r'(London|Paris|Tokyo|New\s+York|Dubai|Singapore|Hong\s+Kong|Shanghai|Mumbai|Frankfurt|Amsterdam)'
        ],

        'COMPLEX_ADDRESSES': [
            # "kilometre N of the <a>-<b> road"
            r'کیلومتر\s+\d+\s+جاده\s+[آ-ی\s]+-[آ-ی\s]+',
            # "opposite <landmark> <name>"
            r'روبروی\s+(?:پمپ\s+بنزین|بانک|پارک|مسجد|بیمارستان)\s+[آ-یa-zA-Z\s]+',
            # Building / floor / unit
            r'Building-[A-Z],?\s+Floor-\d+,?\s+Unit-[A-Z0-9]+',
            # Rack / slot
            r'rack\s+number\s+R-\d+,?\s+slot\s+\d+',
            # Phase / block / plot
            r'phase\s+\d+\s+development,?\s+block\s+[A-Z],?\s+plot\s+\d+-[A-Z]',
            # Street address with floor and building
            r'\d{2,5}\s+[A-Z][a-z]+\s+(?:Street|Avenue|Boulevard|Road|Drive),?\s+Floor\s+\d+,?\s+Building\s+[A-Z]',
            # Industrial town + axis
            r'شهرک\s+صنعتی\s+[آ-ی\s]+،?\s+محور\s+[آ-ی\s]+',
            # "<name>-technology park <name>"
            r'[آ-ی\s]+-پارک\s+فناوری\s+[آ-ی\s]+'
        ],

        # ---- Technical ----
        'TECHNICAL_CODES': [
            # Serial numbers
            r'SN-\d{4}-[A-Z]{3}-\d{4}',
            r'Serial\s+Number[\s:]*[A-Z0-9-]+',
            # Reference / document codes
            r'REF-[A-Z]{3}-\d{4}-\d{3}',
            r'DOC-[A-Z]{2}-\d{4}-\d{4}',
            # INF / CTR codes
            r'INF-\d{4}-\d{4}',
            r'CTR/\d{4}/\d{3}',
            # Equipment identifiers
            r'HVAC-\d{7}',
            r'Generator-Model-[A-Z0-9]+',
            # LOI / BOQ codes
            r'LOI-\d{4}-[A-Z]{4}-\d{3}',
            r'BOQ-\d{4}-[A-Z]{3}-\d{3}',
            # Invoice codes
            r'#INV-\d{4}-Q\d-\d{4}',
            # ESC codes
            r'ESC-\d{4}-[A-Z]{3}-\d{3}',
            # BN codes
            r'BN-\d{6}-[A-Z]\d+'
        ],

        'NETWORK_ADDRESSES': [
            # Dotted-quad addresses and the literal masked form
            r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
            r'xxx\.xxx\.xxx\.xxx',
            # Six colon-separated hex pairs
            r'[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}',
            # Host names
            r'srv-[a-z]+-[a-z]+-\d{2}',
            r'[a-z]+-[a-z]+\d*\.[a-z]+\.[a-z]+',
            # Generic domain names
            r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,4}(?:\.[a-zA-Z]{2,4})?'
        ],

        'TECHNICAL_UNITS': [
            # Power / energy
            r'\d+(?:\.\d+)?\s*MW',
            r'\d+(?:\.\d+)?\s*kWh?',
            # Volume / area
            r'\d+(?:,\d{3})*\s*cubic\s+meters',
            r'\d+(?:,\d{3})*\s*m³',
            r'\d+(?:,\d{3})*\s*sq\s+ft',
            # Concentrations and gas names
            r'\d+(?:\.\d+)?\s*ppm',
            r'\d+(?:\.\d+)?\s*mg/m³',
            r'\b(?:CO2|NOx|SO2)\b',
            # Data sizes
            r'\d+(?:\.\d+)?\s*TB',
            r'\d+(?:\.\d+)?\s*GB',
            # Area and lease-rate wording
            r'\d+(?:,\d{3})*\s*square\s+meters',
            r'\d+(?:\.\d+)?\s*per\s+sq\s+ft\s+NNN',
            # Efficiency and scores
            r'\d+(?:\.\d+)?\%\s*efficiency',
            r'score:\s*\d+(?:\.\d+)?/10',
            # FICO score
            r'FICO\s+score:\s*\d{3}',
            # Pressure
            r'\d+(?:\.\d+)?\s*(?:bar|psi)',
            # Temperature
            r'\d+(?:\.\d+)?\s*°[CF]',
            # Rotational / linear speed
            r'\d+(?:\.\d+)?\s*(?:rpm|m/s)'
        ],

        'ACRONYMS_ABBREVIATIONS': [
            # Fixed acronym lists, grouped as in the original table
            r'\b(?:HVAC|IT|HSE|BOQ|LC|COB)\b',

            r'\b(?:YTD|NNN|EIN|SSN|FICO)\b',

            r'\bIP\s+Address\b',
            r'\bMAC\s+Address\b',
            r'\bURL\b',

            r'\b(?:LLC|Corp|Inc|Ltd)\b',

            r'\b(?:PST|GMT|UTC|EST)\b',

            r'\b(?:CO2|NOx|pH|UV)\b',

            r'\b(?:SCADA|PLC|HMI)\b',

            r'\b(?:GDP|CPI|ROI|NPV)\b',

            r'\b(?:FOB|CIF|DDP)\b',

            r'\b(?:ABA|SWIFT|IBAN)\b'
        ],

        # ---- Business ----
        'COMPANY': [
            # The word "company" in context, and words preceding it
            r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به|\s+طی)',
            r'([آ-یa-zA-Z\s]+)\s+شرکت',
            r'این\s+شرکت(?=\s|$|،|\.)',
            # "bank <name>"
            r'(بانک\s+[آ-یa-zA-Z\s]+)',
            # English company names ending in a legal suffix
            r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))'
        ],

        'BUSINESS_TERMS': [
            # Fixed Persian business phrases, grouped as in the original table
            r'تحلیل\s+عملکرد',
            r'گزارش\s+(?:فعالیت|عملکرد)\s+ماهانه',
            r'وضعیت\s+فروش',

            r'تولید\s+پایدار',
            r'سهم\s+بازار',
            r'صادرات\s+هدفمند',
            r'بهرهوری',
            r'ظرفیتهای\s+داخلی',

            r'شرکتهای\s+پیشرو',
            r'صنعت\s+پتروشیمی',
            r'سرمایهگذاران\s+بنیادی',

            r'شاخصهای\s+عملیاتی',
            r'برنامهریزی\s+مناسب',

            r'واحد\s+فروش',
            r'موجودی\s+انبار',

            r'فاز\s+رشد\s+جدید',
            r'ترکیب\s+فروش',
            r'سهم\s+صادراتی',

            r'روند\s+عملکرد',
            r'اعداد\s+اعلامشده',
            r'دادههای\s+ثبتشده'
        ],

        'PRODUCT': [
            # Polymer abbreviations
            r'\b(?:VCM|PVC|PE|PP|PS|ABS|SAN|PC|PMMA|PET|PBT|PA|POM|TPU)\b',
            # Persian "poly-" compounds
            r'پلی\s*(?:اتیلن|پروپیلن|استایرن|کربنات|متیل)',
            # Persian names of base chemicals
            r'\b(?:اتیلن|پروپیلن|بنزن|تولوئن|زایلن|متانول|اتانول|استون|فنول)\b',
            # Persian names of gases
            r'\b(?:کلر|هیدروژن|اکسیژن|نیتروژن|آمونیاک|اتان|پروپان|بوتان)\b',
            # Generic "product(s)" wording
            r'محصول(?:ات)?',
            r'تولیدات\s+شرکت'
        ],

        'PETROCHEMICAL': [
            # Polymer grade abbreviations
            r'\b(?:LDPE|HDPE|LLDPE|PP|PS|EPS|ABS|SAN|PC|PMMA|PET|PBT|PA6|PA66|POM|TPU|EVA|EAA)\b',
            # Full English chemical names
            r'(?:Ethylene\s+Vinyl\s+Acetate|Ethyl\s+Acrylate|Methyl\s+Methacrylate|Polyethylene\s+Terephthalate)'
        ],

        # ---- Quantity ----
        'PERCENTAGE': [
            # Numeric percentages with Persian wording or a % sign
            r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایینتر)?',
            r'\d+(?:\.\d+)?\s*%',
            r'معادل\s+\d+(?:\.\d+)?\s*درصد',
            r'حدود\s+\d+(?:\.\d+)?\s*درصد',
            r'با\s+\d+(?:\.\d+)?\s*درصد\s+افزایش',
            r'رشد\s+\d+(?:\.\d+)?\s*درصدی',
            r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده)',
            # Qualitative growth wording
            r'میزان\s+رشد(?=\s+نسبت|\s+معادل)',
            r'افزایش\s+قابلتوجهی',
            r'بهبود\s+نسبی',
            # English percentage wording
            r'\d+(?:\.\d+)?\%\s*(?:increase|decrease|growth|improvement)',
            r'(?:approximately|about)\s+\d+(?:\.\d+)?\%'
        ],

        'VOLUME': [
            # Persian quantities and capacity wording
            r'\d+(?:,\d{3})*\s*تن',
            r'\d+(?:,\d{3})*\s*(?:کیلوگرم|لیتر|بشکه)',
            r'میزان\s+\d+(?:,\d{3})*\s*تن',
            r'مقدار\s+تولید',
            r'حجم\s+فروش',
            r'ظرفیت\s+(?:تولید|اسمی)',
            # English quantities
            r'\d+(?:,\d{3})*\s*(?:tons|kg|liters|barrels)',
            r'\d+(?:,\d{3})*\s*(?:metric\s+tons|MT)',
            r'\d+(?:,\d{3})*\s*(?:thousand\s+tons|KT)'
        ],

        'RATIOS': [
            # Ratio / equivalence / index wording
            r'نسبت\s+(?:فروش|تولید)\s+به\s+[آ-ی\s]+',
            r'\d+(?:\.\d+)?\s*نزدیک',
            r'برابر\s+با\s+\d+(?:\.\d+)?',
            r'معادل\s+\d+(?:\.\d+)?',
            r'میزان\s+(?:رشد|افزایش)',
            r'شاخص\s+(?:مهم|عملیاتی)',
            r'\d+(?:\.\d+)?\s*درصد\s+کل\s+تولید'
        ],

        # ---- Communication ----
        'PHONE': [
            # Digit runs with optional Persian phone/contact/mobile labels
            r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
            r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}',
            r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}',
            r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}',
            r'[۰-۹0-9]{11}(?!\d)',
            r'(?:\+98|0098)?[۰-۹0-9]{10}',
            r'[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}',
            # International and parenthesised area-code formats
            r'\+[0-9]{1,3}-[0-9]{3}-[0-9]{3}-[0-9]{4}(?:\s+ext\.\s+[0-9]{3,4})?',
            r'\([0-9]{3}\)\s+[0-9]{3}-[0-9]{4}'
        ],

        'EMAIL': [
            # Bare e-mail address
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            # E-mail address preceded by a label
            r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'نشانی[\s]*الکترونیکی[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            r'آدرس[\s]*ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            # Specific "facility.manager@" mailbox
            r'facility\.manager@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        ]
    }
|
|
|
|
|
def anonymize_text(self, original_text, lang='fa', selected_categories=None):
    """Step 1: replace sensitive spans in ``original_text`` with codes.

    Resets ``self.mapping_table`` and ``self.counters``, then fills the
    mapping (original text -> code) from two sources: local NER entities
    with confidence above 0.5 (when models are loaded), then regex patterns
    applied in a fixed priority order, skipping any match that overlaps an
    earlier regex match.

    Args:
        original_text: Text to anonymize.
        lang: ``'en'`` selects English user messages; anything else Persian.
            Also forwarded to ``get_selected_patterns``.
        selected_categories: Optional category labels restricting which
            regex pattern types are used. Falsy means all pattern types.

    Returns:
        str: The anonymized text, or a message starting with "❌" when the
        input is empty or an exception occurred.
    """
    try:
        if not original_text or not original_text.strip():
            return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"

        # Every run starts from an empty mapping and zeroed counters.
        self.mapping_table = {}
        self.counters = {key: 0 for key in self.counters}
        found_entities = set()

        detected_lang = self.detect_language(original_text)
        logger.info(f"Detected language: {detected_lang}")

        if self.models_loaded:
            logger.info("🤖 Running comprehensive local NER extraction...")
            ner_entities = self.extract_entities_with_ner(original_text, detected_lang)

            for entity in ner_entities:
                if (entity['text'] not in found_entities and
                        len(entity['text'].strip()) > 1 and
                        entity['confidence'] > 0.5):
                    category = self.map_ner_to_categories(entity['label'], entity['source'])
                    if entity['text'] not in self.mapping_table:
                        self.counters[category] += 1
                        code = f"{category}_{self.counters[category]:03d}_LOCAL_NER"
                        self.mapping_table[entity['text']] = code
                        found_entities.add(entity['text'])
                        logger.info(f"Local NER: {entity['text']} -> {code}")
        else:
            logger.info("ℹ️ Using comprehensive regex-only mode")

        all_patterns = self.get_comprehensive_patterns()

        if selected_categories:
            selected_pattern_types = self.get_selected_patterns(selected_categories, lang)
            patterns = {k: v for k, v in all_patterns.items() if k in selected_pattern_types}
            logger.info(f"📋 Using selected pattern categories: {len(patterns)} types")
        else:
            patterns = all_patterns
            logger.info("📋 Using all available pattern categories")

        logger.info("🔍 Running selective priority-based regex extraction...")

        # Spans already claimed by an earlier (higher-priority) regex match.
        processed_entities = set()

        # Specific identifier-like types first, broad wording types last.
        priority_order = [
            'ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT', 'TECHNICAL_CODES',
            'NETWORK_ADDRESSES', 'INTERNATIONAL_CURRENCIES', 'AMOUNT',
            'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS', 'ADVANCED_DATE_FORMATS',
            'TIME_RANGES', 'COMPLEX_ADDRESSES', 'MIXED_NAMES', 'ENGLISH_TITLES',
            'STOCK_SYMBOL', 'COMPANY', 'PERSON', 'PERCENTAGE', 'VOLUME',
            'RATIOS', 'LOCATION', 'DATE', 'FINANCIAL_TERMS', 'BUSINESS_TERMS',
            'PRODUCT', 'PETROCHEMICAL'
        ]

        for category in priority_order:
            if category not in patterns:
                continue
            for pattern in patterns[category]:
                for match in re.finditer(pattern, original_text, re.IGNORECASE | re.MULTILINE):
                    # The whole match is anonymized, even when the pattern
                    # has capture groups.
                    full_match = match.group(0).strip()
                    match_start, match_end = match.span()

                    overlaps = any(
                        not (match_end <= proc_start or match_start >= proc_end)
                        for proc_start, proc_end in processed_entities
                    )

                    if (not overlaps and
                            full_match not in found_entities and
                            full_match not in self.mapping_table and
                            len(full_match) >= 2):
                        self.counters[category] += 1
                        code = f"{category}_{self.counters[category]:03d}_REGEX"
                        self.mapping_table[full_match] = code
                        found_entities.add(full_match)
                        processed_entities.add((match_start, match_end))
                        logger.info(f"Regex ({category}): {full_match} -> {code}")

        # Substitute in ONE pass over the original text. Chained
        # str.replace() calls re-scan codes inserted by earlier replacements,
        # so a short entity such as "PA" would corrupt "COMPANY_001_REGEX".
        # Longest keys come first so the longer entity wins at a position.
        if self.mapping_table:
            ordered_keys = sorted(self.mapping_table, key=len, reverse=True)
            substitution = re.compile('|'.join(re.escape(key) for key in ordered_keys))
            anonymized = substitution.sub(
                lambda found: self.mapping_table[found.group(0)], original_text
            )
        else:
            anonymized = original_text

        logger.info(f"✅ Selective anonymization completed. Found {len(self.mapping_table)} entities.")
        return anonymized

    except Exception as e:
        return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در ناشناسسازی: {str(e)}"
|
|
|
|
|
def send_to_chatgpt(self, anonymized_text, lang='fa'):
    """Step 2: send the anonymized text to the OpenAI chat-completions API.

    Args:
        anonymized_text: Text whose sensitive values were already replaced
            by placeholder codes.
        lang: 'en' selects English status messages; any other value selects
            the Persian ones.

    Returns:
        The assistant reply on success. On any failure a human-readable
        message starting with "❌" is returned instead of raising.
    """
    is_en = lang == 'en'

    try:
        if not anonymized_text or not anonymized_text.strip():
            return "❌ Anonymized text is empty!" if is_en else "❌ متن ناشناسشده خالی است!"

        if not self.api_key:
            return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable." if is_en else "❌ کلید API تنظیم نشده است! لطفاً OPENAI_API_KEY را در متغیرهای محیطی تنظیم کنید."

        system_msg = "You are a professional financial analyst. The text contains anonymous codes. Answer questions accurately." if is_en else "شما یک تحلیلگر مالی حرفهای هستید. متن حاوی کدهای ناشناس است. به سوالات با دقت پاسخ دهید."

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        data = {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": anonymized_text},
            ],
            "max_tokens": 2000,
            "temperature": 0.7,
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=30,
        )

        if response.status_code == 200:
            result = response.json()
            return result['choices'][0]['message']['content']

        # Error bodies are not guaranteed to be JSON (a proxy or gateway may
        # answer with HTML); fall back to the raw text instead of letting the
        # decode error be reported as a connection failure.
        try:
            error_data = response.json() if response.content else {}
        except ValueError:
            error_data = {}

        error_info = error_data.get('error') if isinstance(error_data, dict) else None
        if isinstance(error_info, dict):
            # `message` may be missing or null; never substring-test None.
            error_message = str(error_info.get('message') or response.text)
        else:
            error_message = response.text

        if 'Incorrect API key' in error_message:
            return "❌ Invalid API key." if is_en else "❌ کلید API نامعتبر است."
        if 'quota' in error_message:
            return "❌ API quota exceeded." if is_en else "❌ سهمیه API تمام شده است."
        return f"❌ API Error: {error_message}"

    except Exception as e:
        # Best-effort boundary: the UI expects a string, never an exception.
        return f"❌ Error connecting to ChatGPT: {str(e)}" if is_en else f"❌ خطا در ارتباط با ChatGPT: {str(e)}"
|
|
|
|
|
def deanonymize_response(self, gpt_response, lang='fa'):
    """Step 3: put the original values back into the model's reply.

    Every placeholder code stored in ``self.mapping_table`` is replaced by
    the text it stands for. The Markdown-escaped spelling of each code
    (underscores written as ``\\_``) is replaced as well.

    Args:
        gpt_response: Reply text that may contain placeholder codes.
        lang: 'en' for English error messages, anything else for Persian.

    Returns:
        The restored text, or a message starting with "❌" when the reply or
        the mapping table is empty or an exception occurs.
    """
    english = lang == 'en'

    try:
        if not (gpt_response and gpt_response.strip()):
            if english:
                return "❌ ChatGPT response is empty!"
            return "❌ پاسخ ChatGPT خالی است!"

        if not self.mapping_table:
            if english:
                return "❌ Mapping table is empty!"
            return "❌ جدول نگاشت خالی است!"

        # Invert original -> code into code -> original.
        code_to_original = {}
        for source_text, placeholder in self.mapping_table.items():
            code_to_original[placeholder] = source_text

        # Longest codes first.
        ordered_pairs = sorted(
            code_to_original.items(),
            key=lambda pair: len(pair[0]),
            reverse=True,
        )

        restored = gpt_response
        for placeholder, source_text in ordered_pairs:
            for spelling in (placeholder, placeholder.replace('_', '\\_')):
                restored = restored.replace(spelling, source_text)

        return restored

    except Exception as e:
        if english:
            return f"❌ Deanonymization error: {str(e)}"
        return f"❌ خطا در بازگردانی: {str(e)}"
|
|
|
|
|
def get_model_status(self):
    """Build the Markdown status report for the local models and categories.

    Reads ``self.model_status`` (optional mapping of model type to status
    text), ``self.models_base_path`` and ``self.pattern_categories`` (mapping
    whose values provide 'icon', 'name_fa', 'name_en' and 'patterns').

    Returns:
        A single multi-line string.
    """
    # Display label for each recognised model_status key; other keys are
    # not listed individually.
    labels = {
        'persian': 'Persian NER',
        'english': 'English NER',
        'transformers': 'Transformers',
        'fallback': 'Fallback Mode',
        'critical': 'Critical',
        'directory': 'Directory',
    }

    model_status = getattr(self, 'model_status', None) or {}

    parts = ["🤖 **Selective Anonymization System Status (Enhanced with Category Selection):**\n\n"]

    for model_type, state in model_status.items():
        if model_type in labels:
            parts.append(f"• **{labels[model_type]}**: {state}\n")

    # A status text starting with the check mark counts as a loaded model.
    loaded_count = sum(
        1 for state in model_status.values()
        if isinstance(state, str) and state.startswith("✅")
    )
    parts.append(f"\n📊 **Summary**: {loaded_count}/2 local models loaded")

    parts.append(f"\n🔍 **Models Path**: {self.models_base_path}")
    parts.append("\n🔧 **Latest Features**: Selective pattern categories with user control")

    parts.append("\n\n🎯 **Selective Sensitive Data Detection Categories:**")

    for cat_info in self.pattern_categories.values():
        patterns = cat_info['patterns']
        preview = ', '.join(patterns[:3])
        ellipsis = '...' if len(patterns) > 3 else ''

        parts.append(f"\n\n{cat_info['icon']} **{cat_info['name_fa']} ({cat_info['name_en']})**:")
        parts.append(f"\n 📋 الگوها: {len(patterns)} نوع")
        parts.append(f"\n 🔍 شامل: {preview}{ellipsis}")

    parts.append("\n\n✨ **Key Improvements with Category Selection:**")
    parts.append("\n 🎯 **User Control**: انتخاب دقیق دستهبندیهای مورد نیاز")
    parts.append("\n 🛡️ **Flexible Protection**: محافظت انتخابی از دادههای حساس")
    # This line was previously written as a call to an undefined name `f`,
    # which raised NameError on every invocation.
    parts.append("\n 📊 **Efficiency**: پردازش سریعتر با الگوهای انتخاب شده")
    parts.append("\n 🔍 **Precision**: کاهش false positives با فیلترینگ هدفمند")
    parts.append("\n ⚡ **Performance**: بهینهسازی بر اساس نیاز کاربر")

    parts.append("\n\nℹ️ **Usage**: انتخاب دستهبندیهای مورد نظر از چکباکسها برای ناشناسسازی هدفمند!")

    return "".join(parts)
|
|
|
|
|
def process_all_steps(input_text, language, selected_categories):
    """Run the whole pipeline: anonymize, query ChatGPT, restore the reply.

    Args:
        input_text: Raw user text. ``None``, empty or whitespace-only input
            yields an error message instead of being processed.
        language: UI language label; 'English' selects English messages,
            anything else selects Persian.
        selected_categories: Category selection forwarded unchanged to
            ``anonymizer.anonymize_text``.

    Returns:
        A 4-tuple ``(status_message, anonymized_text, gpt_response,
        final_result)``. Slots that were not produced are empty strings.
    """
    lang = 'en' if language == 'English' else 'fa'

    # Guard against None as well as blank text; `.strip()` on None used to
    # raise here, outside the try block below.
    if not input_text or not input_text.strip():
        error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
        return error_msg, "", "", ""

    def count_codes(marker):
        """Count mapping-table codes that contain `marker` as a substring."""
        return sum(1 for code in anonymizer.mapping_table.values() if marker in code)

    try:
        start_time = time.time()

        anonymized_text = anonymizer.anonymize_text(input_text, lang, selected_categories)
        if anonymized_text.startswith("❌"):
            return anonymized_text, "", "", ""

        gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
        method = "Selective Local NER + Regex" if anonymizer.models_loaded else "Selective Regex Only"

        if gpt_response.startswith("❌"):
            # The ChatGPT step failed: still report what was anonymized and
            # pass the error text through in the GPT slot.
            entities_found = len(anonymizer.mapping_table)
            ner_count = count_codes('_NER')
            regex_count = count_codes('_REGEX')

            # NOTE(review): an empty selection is reported as 0 here but as
            # 8 in the success path below - confirm which one matches what
            # anonymize_text does with an empty selection.
            selected_count = len(selected_categories) if selected_categories else 0

            critical_categories = ['ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT', 'AMOUNT', 'DATE']
            critical_count = sum(
                1 for code in anonymizer.mapping_table.values()
                if any(cat in code for cat in critical_categories)
            )

            success_msg = (f"✅ Selective anonymization completed with {method}!\n"
                           f"📋 Selected categories: {selected_count} | 🔍 Critical data: {critical_count}\n"
                           f"🤖 NER: {ner_count} | 🔍 Regex: {regex_count} | 📊 Total: {entities_found}")
            return success_msg, anonymized_text, gpt_response, ""

        final_result = anonymizer.deanonymize_response(gpt_response, lang)

        total_time = time.time() - start_time
        entities_found = len(anonymizer.mapping_table)
        selected_count = len(selected_categories) if selected_categories else 8

        # (substring searched in the code, label shown in the summary)
        summary_rows = [
            ('ID_NUMBER', "🆔 IDs"),
            ('PHONE', "📞 Phones"),
            ('EMAIL', "📧 Emails"),
            ('ACCOUNT', "🏦 Accounts"),
            ('AMOUNT', "💰 Amounts"),
            ('PERSON', "👤 Names"),
        ]
        critical_details = []
        for marker, label in summary_rows:
            found = count_codes(marker)
            if found > 0:
                critical_details.append(f"{label}: {found}")

        success_msg = (f"🎉 Complete selective anonymization & restoration successful!\n"
                       f"🔧 Method: {method} | 📋 Categories: {selected_count}/8\n"
                       f"🔍 Protected: {' | '.join(critical_details) if critical_details else '0'}\n"
                       f"📊 Total: {entities_found} entities | ⏱️ Time: {total_time:.2f}s | 🎯 User-controlled selection")

        return success_msg, anonymized_text, gpt_response, final_result

    except Exception as e:
        error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
        return error_msg, "", "", ""
|
|
|
|
|
def get_mapping_table(language):
    """Render the current original -> code mapping as a Markdown report.

    Args:
        language: 'English' selects the English header and empty-table
            message; anything else selects the Persian ones.

    Returns:
        The report text, or an error message starting with "❌" when the
        shared anonymizer holds no mappings.
    """
    english = language == 'English'
    table = anonymizer.mapping_table

    if not table:
        if english:
            return "❌ Mapping table is empty! Please process some text first."
        return "❌ جدول نگاشت خالی است! ابتدا متنی را پردازش کنید."

    def render_section(heading, pairs, limit):
        """Format up to `limit` pairs under `heading`; '' when there are none."""
        if not pairs:
            return ""
        lines = [f"{heading}:\n"]
        lines.extend(f" • `{original}` → `{code}`\n" for original, code in pairs[:limit])
        if len(pairs) > limit:
            lines.append(f" ... و {len(pairs) - limit} مورد دیگر\n")
        lines.append("\n")
        return "".join(lines)

    def pairs_containing(source, markers):
        """Pairs of `source` whose code contains any of `markers` as a substring."""
        return [(original, code) for original, code in source
                if any(marker in code for marker in markers)]

    all_pairs = list(table.items())
    ner_pairs = [pair for pair in all_pairs if '_NER' in pair[1]]
    regex_pairs = [pair for pair in all_pairs if '_REGEX' in pair[1]]

    critical_titles = {
        'ID_NUMBER': '🆔 **Identity Codes (Critical)**',
        'PHONE': '📞 **Phone Numbers**',
        'EMAIL': '📧 **Email Addresses**',
        'ACCOUNT': '🏦 **Bank Accounts**',
        'AMOUNT': '💰 **Financial Amounts**',
        'DATE': '📅 **Dates**',
    }
    business_markers = ('PERSON', 'COMPANY', 'LOCATION', 'PERCENTAGE', 'VOLUME', 'STOCK_SYMBOL')
    technical_markers = ('TECHNICAL_CODES', 'NETWORK_ADDRESSES')

    if english:
        report = "📋 **Selective High-Precision Sensitive Data Mapping Table:**\n\n"
    else:
        report = "📋 **جدول نگاشت انتخابی اطلاعات حساس:**\n\n"

    # One section per critical category, five entries each at most.
    critical_total = 0
    for marker, heading in critical_titles.items():
        matching = pairs_containing(all_pairs, (marker,))
        critical_total += len(matching)
        report += render_section(heading, matching, 5)

    report += render_section("🤖 **Local NER Detected**", ner_pairs, 5)

    business_pairs = pairs_containing(regex_pairs, business_markers)
    report += render_section("💼 **Business Data**", business_pairs, 8)

    technical_pairs = pairs_containing(regex_pairs, technical_markers)
    report += render_section("⚙️ **Technical Data**", technical_pairs, 5)

    report += "".join([
        "📊 **Selective Statistics**:\n",
        f"🔍 **Critical Sensitive Data**: {critical_total} items\n",
        f"🤖 **NER Detected**: {len(ner_pairs)} items\n",
        f"💼 **Business Data**: {len(business_pairs)} items\n",
        f"⚙️ **Technical Data**: {len(technical_pairs)} items\n",
        f"📋 **Total Protected**: {len(table)} entities\n",
        "\n✨ **System Enhancement**: User-controlled selective detection\n",
        "🎯 **Accuracy**: Targeted processing based on user selection\n",
        "🛡️ **Protection Level**: Flexible sensitive data security with category control!",
    ])

    return report
|
|
|
|
|
def clear_all():
    """Reset the shared anonymizer and blank the five bound UI fields.

    Returns:
        Five empty strings, one per output component wired to this callback.
    """
    zeroed_counters = dict.fromkeys(anonymizer.counters, 0)
    anonymizer.mapping_table = {}
    anonymizer.counters = zeroed_counters
    return ("",) * 5
|
|
|
|
|
def update_ui_text(language):
    """Return the interface strings for the requested language.

    Args:
        language: 'English' selects the English set; any other value
            selects the Persian set.

    Returns:
        A new dict with the keys title, step1-step4, input_placeholder,
        process_btn, clear_btn, mapping_btn, status_btn, categories_label,
        copy_btn and direction ('ltr' or 'rtl').
    """
    english_strings = dict(
        title='Selective High-Precision Bilingual Data Anonymization System',
        step1='Input Text & Category Selection',
        step2='Anonymized Text',
        step3='Raw ChatGPT Response',
        step4='Final Restored Response',
        input_placeholder='Enter your original text here...\nExample: Company reports, person names, financial amounts, phone numbers, emails, IBAN codes, bank accounts, etc.\n\n✨ Selective system with category-based control!',
        process_btn='Process with Selected Categories',
        clear_btn='Clear All',
        mapping_btn='Show Selective Mapping Table',
        status_btn='Show System Status',
        categories_label='Select Pattern Categories:',
        copy_btn='Copy',
        direction='ltr',
    )
    persian_strings = dict(
        title='سیستم ناشناسسازی انتخابی دقیق دوزبانه',
        step1='متن ورودی و انتخاب دستهبندی',
        step2='متن ناشناسشده',
        step3='پاسخ خام ChatGPT',
        step4='پاسخ نهایی بازگردانده شده',
        input_placeholder='متن اصلی خود را اینجا وارد کنید...\nمثال: گزارشهای شرکت، نام اشخاص، مبالغ مالی، شماره تلفن، ایمیل، شماره شبا، حساب بانکی و غیره\n\n✨ سیستم انتخابی با کنترل دستهبندی!',
        process_btn='پردازش با دستهبندیهای انتخاب شده',
        clear_btn='پاک کردن همه',
        mapping_btn='نمایش جدول نگاشت انتخابی',
        status_btn='نمایش وضعیت سیستم',
        categories_label='انتخاب دستهبندیهای الگو:',
        copy_btn='کپی',
        direction='rtl',
    )
    return english_strings if language == 'English' else persian_strings
|
|
|
|
|
def update_interface(language):
    """Produce the component updates that switch the UI language.

    Args:
        language: Value of the language selector; 'English' gives the
            left-to-right English layout, anything else the right-to-left
            Persian one.

    Returns:
        A list of 17 ``gr.update`` objects, in the order of the ``outputs``
        list of the language selector's change event.
    """
    labels = update_ui_text(language)
    english = language == 'English'
    right_to_left = not english
    direction = labels['direction']

    row_classes = "workflow rtl" if right_to_left else "workflow ltr"
    choices = anonymizer.get_category_choices('en' if english else 'fa')

    def heading(icon, key):
        """Update for one <h2> step heading."""
        return gr.update(value=f"<h2 style='direction: {direction};'>{icon} {labels[key]}</h2>")

    def caption(icon, key):
        """Update for a button caption."""
        return gr.update(value=f"{icon} {labels[key]}")

    def text_direction():
        """Update that only flips a textbox's text direction."""
        return gr.update(rtl=right_to_left)

    title_html = (
        "<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; "
        "text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; "
        "background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; "
        "-webkit-text-fill-color: transparent; background-clip: text;'>"
        f"📊 {labels['title']}</h1>"
    )

    return [
        gr.update(value=title_html),
        heading("🔍", 'step1'),
        gr.update(placeholder=labels['input_placeholder'], rtl=right_to_left),
        caption("🚀", 'process_btn'),
        caption("🗑️", 'clear_btn'),
        text_direction(),
        heading("🎭", 'step2'),
        text_direction(),
        heading("🤖", 'step3'),
        text_direction(),
        heading("✅", 'step4'),
        text_direction(),
        caption("📋", 'mapping_btn'),
        caption("📊", 'status_btn'),
        text_direction(),
        gr.update(elem_classes=row_classes),
        gr.update(label=labels['categories_label'], choices=choices, value=choices),
    ]
|
|
|
|
|
|
|
|
# Shared anonymizer instance; the UI callbacks in this module
# (process_all_steps, get_mapping_table, clear_all, update_interface)
# read and mutate it through this module-level name.
anonymizer = ComprehensiveBilingualDataAnonymizer()
|
|
|
|
|
|
|
|
# Stylesheet passed to gr.Blocks(css=...) below. Rules, in order:
#   - page background and font
#   - .rtl / .ltr direction helpers
#   - .workflow grid (four columns, two below 1200px, one below 768px)
#   - fixed-height textboxes, headings, the pulsing .status-box
#   - .category-selection panel, checkbox group and buttons
#   - direction overrides for textboxes and alignment resets
custom_css = """
body, .gradio-container {
font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
min-height: 100vh !important;
padding: 20px !important;
}

.rtl {
direction: rtl !important;
text-align: right !important;
}

.ltr {
direction: ltr !important;
text-align: left !important;
}

.workflow {
display: grid !important;
grid-template-columns: 1fr 1fr 1fr 1fr !important;
gap: 25px !important;
padding: 30px !important;
align-items: start !important;
align-content: start !important;
grid-auto-rows: auto !important;
}

.workflow > * {
align-self: start !important;
vertical-align: top !important;
margin-top: 0 !important;
}

.workflow .gradio-column,
.workflow-column {
display: flex !important;
flex-direction: column !important;
align-items: stretch !important;
justify-content: flex-start !important;
height: auto !important;
min-height: 0 !important;
margin-top: 0 !important;
padding-top: 0 !important;
}

.gradio-textbox {
border-radius: 10px !important;
box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
flex-grow: 1 !important;
min-height: 380px !important;
max-height: 380px !important;
height: 380px !important;
}

.gradio-textbox textarea {
min-height: 350px !important;
max-height: 350px !important;
height: 350px !important;
resize: vertical !important;
}

.workflow.rtl {
direction: rtl !important;
}

.workflow.ltr {
direction: ltr !important;
}

h1, h2, h3 {
text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
margin-top: 0 !important;
margin-bottom: 10px !important;
padding-top: 0 !important;
line-height: 1.2 !important;
}

h2 {
min-height: 40px !important;
max-height: 40px !important;
display: flex !important;
align-items: center !important;
margin-bottom: 15px !important;
}

.status-box {
background: linear-gradient(135deg, #4CAF50, #45a049) !important;
border: 3px solid #2E7D32 !important;
border-radius: 15px !important;
padding: 15px !important;
margin: 10px 0 !important;
box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3) !important;
animation: pulse 2s infinite !important;
min-height: 120px !important;
max-height: 120px !important;
}

.status-box textarea {
background: rgba(255, 255, 255, 0.95) !important;
border: none !important;
border-radius: 10px !important;
font-weight: bold !important;
font-size: 1.1em !important;
color: #1B5E20 !important;
text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.8) !important;
min-height: 80px !important;
max-height: 80px !important;
}

.category-selection {
background: linear-gradient(135deg, #E3F2FD, #BBDEFB) !important;
border: 2px solid #1976D2 !important;
border-radius: 15px !important;
padding: 20px !important;
margin: 15px 0 !important;
box-shadow: 0 6px 20px rgba(25, 118, 210, 0.2) !important;
}

.gradio-checkboxgroup {
background: rgba(255, 255, 255, 0.9) !important;
border-radius: 10px !important;
padding: 15px !important;
margin: 10px 0 !important;
}

@keyframes pulse {
0% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
50% { box-shadow: 0 8px 40px rgba(76, 175, 80, 0.6); }
100% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
}

.gradio-button {
border-radius: 25px !important;
font-weight: bold !important;
transition: all 0.3s ease !important;
margin: 5px 0 !important;
min-height: 50px !important;
max-height: 50px !important;
}

.gradio-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
}

h1 {
background: linear-gradient(45deg, #FFD700, #FFA500) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
background-clip: text !important;
min-height: 80px !important;
}

@media (max-width: 1200px) {
.workflow {
grid-template-columns: 1fr 1fr !important;
gap: 20px !important;
}
}

@media (max-width: 768px) {
.workflow {
grid-template-columns: 1fr !important;
gap: 15px !important;
}

.gradio-textbox {
min-height: 300px !important;
max-height: 300px !important;
height: 300px !important;
}
}

[data-testid="textbox"]:dir(rtl) {
text-align: right !important;
direction: rtl !important;
}

[data-testid="textbox"]:dir(ltr) {
text-align: left !important;
direction: ltr !important;
}

.gradio-container .gradio-column {
align-self: start !important;
vertical-align: top !important;
}

.gradio-container .gradio-row {
align-items: flex-start !important;
}

* {
box-sizing: border-box !important;
}

.gradio-container {
align-items: start !important;
justify-content: start !important;
}
"""
|
|
|
|
|
|
|
|
# Gradio UI: language selector, category check boxes, the four-column
# workflow (input -> anonymized -> raw reply -> restored reply) and two
# on-demand report panels.
# NOTE(review): indentation was not recoverable from the source this was
# rebuilt from; the title column is assumed to be a sibling of the
# language-selector row rather than nested inside it - confirm the layout.
with gr.Blocks(title="📊 Selective High-Precision Anonymization System", theme=gr.themes.Soft(), css=custom_css) as app:

    with gr.Row():
        language_selector = gr.Radio(
            choices=["فارسی", "English"],
            value="فارسی",
            label="Language / زبان",
            interactive=True
        )

    with gr.Column():
        title = gr.HTML("<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 سیستم ناشناسسازی انتخابی دقیق دوزبانه</h1>")

    # Category selection panel.
    with gr.Row(elem_classes="category-selection"):
        with gr.Column():
            category_title = gr.HTML("<h3 style='text-align: center; color: #1976D2; margin-bottom: 15px;'>🎯 انتخاب دستهبندیهای الگوی ناشناسسازی</h3>")

            pattern_categories = gr.CheckboxGroup(
                choices=anonymizer.get_category_choices('fa'),
                value=anonymizer.get_category_choices('fa'),
                label="انتخاب دستهبندیهای الگو:",
                interactive=True,
                elem_classes=["gradio-checkboxgroup"]
            )

            category_info = gr.HTML("""
<div style='background: rgba(255, 255, 255, 0.9); padding: 15px; border-radius: 10px; margin-top: 10px;'>
<p style='margin: 0; color: #666; font-size: 0.9em; text-align: center;'>
💡 <strong>راهنمایی:</strong> فقط دستهبندیهایی که نیاز دارید را انتخاب کنید تا ناشناسسازی دقیقتر و سریعتر انجام شود
</p>
</div>
""")

    # Four-step workflow; the row's direction class is swapped on language change.
    with gr.Row(elem_classes="workflow rtl") as workflow_row:
        with gr.Column(elem_classes="workflow-column"):
            step1_title = gr.HTML('<h2 style="direction: rtl;">🔍 متن ورودی و انتخاب دستهبندی</h2>')

            input_text = gr.Textbox(
                lines=15,
                placeholder="متن اصلی خود را اینجا وارد کنید...\nمثال: گزارشهای شرکت، نام اشخاص، مبالغ مالی، شماره تلفن، ایمیل، شماره شبا، حساب بانکی و غیره\n\n✨ سیستم انتخابی با کنترل دستهبندی!",
                label="",
                rtl=True
            )

            process_btn = gr.Button("🚀 پردازش با دستهبندیهای انتخاب شده", variant="primary")
            clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")

            status = gr.Textbox(
                label="وضعیت",
                lines=4,
                interactive=False,
                rtl=True,
                elem_classes=["status-box"]
            )

        with gr.Column(elem_classes="workflow-column"):
            step2_title = gr.HTML('<h2 style="direction: rtl;">🎭 متن ناشناسشده</h2>')

            anonymized_output = gr.Textbox(
                lines=15,
                placeholder="متن ناشناسشده اینجا نمایش داده میشود...",
                label="",
                interactive=False,
                rtl=True
            )

        with gr.Column(elem_classes="workflow-column"):
            step3_title = gr.HTML('<h2 style="direction: rtl;">🤖 پاسخ خام ChatGPT</h2>')

            gpt_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ خام ChatGPT اینجا نمایش داده میشود...",
                label="",
                interactive=False,
                rtl=True
            )

        with gr.Column(elem_classes="workflow-column"):
            step4_title = gr.HTML('<h2 style="direction: rtl;">✅ پاسخ نهایی بازگردانده شده</h2>')

            final_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ نهایی اینجا نمایش داده میشود...",
                label="",
                interactive=False,
                rtl=True
            )

    with gr.Row():
        with gr.Column():
            mapping_title = gr.HTML('<h2>🗂️ جدول نگاشت انتخابی</h2>')
            mapping_btn = gr.Button("📋 نمایش جدول نگاشت انتخابی")

            mapping_output = gr.Textbox(
                lines=15,
                label="جدول نگاشت اطلاعات",
                interactive=False,
                visible=False,
                rtl=True
            )

    with gr.Row():
        with gr.Column():
            status_title = gr.HTML('<h2>⚙️ وضعیت سیستم و قابلیتها</h2>')
            system_status_btn = gr.Button("📊 نمایش وضعیت سیستم انتخابی")

            system_status_output = gr.Textbox(
                lines=20,
                label="وضعیت سیستم",
                interactive=False,
                visible=False,
                rtl=True
            )

    # ---- Event wiring ----

    # The outputs order must match the list returned by update_interface.
    language_selector.change(
        fn=update_interface,
        inputs=[language_selector],
        outputs=[title, step1_title, input_text, process_btn, clear_btn,
                 status, step2_title, anonymized_output, step3_title, gpt_output,
                 step4_title, final_output, mapping_btn, system_status_btn,
                 mapping_output, workflow_row, pattern_categories]
    )

    process_btn.click(
        fn=process_all_steps,
        inputs=[input_text, language_selector, pattern_categories],
        outputs=[status, anonymized_output, gpt_output, final_output]
    )

    clear_btn.click(
        fn=clear_all,
        outputs=[input_text, anonymized_output, gpt_output, final_output, status]
    )

    def _show_mapping_table(language):
        """Fill the mapping panel and reveal it in a single update."""
        return gr.update(value=get_mapping_table(language), visible=True)

    def _show_system_status():
        """Fill the system-status panel and reveal it in a single update."""
        return gr.update(value=anonymizer.get_model_status(), visible=True)

    # Each panel was previously driven by two independent click listeners on
    # the same button (one set the text, one set visible=True), both writing
    # to the same component; one listener per button sets both together.
    mapping_btn.click(
        fn=_show_mapping_table,
        inputs=[language_selector],
        outputs=[mapping_output]
    )

    system_status_btn.click(
        fn=_show_system_status,
        outputs=[system_status_output]
    )
|
|
|
|
|
if __name__ == "__main__":
    # Startup banner, printed in three parts around the category listing.
    rule = "=" * 80

    intro_lines = (
        "\n" + rule,
        "🚀 SELECTIVE HIGH-PRECISION BILINGUAL DATA ANONYMIZATION SYSTEM",
        rule,
        "📊 System Features:",
        " • User-controlled category selection with checkboxes",
        " • Selective pattern processing for efficiency",
        " • High-precision detection with validation system",
        " • Blacklist filtering for common words",
        " • Priority-based sensitive data protection",
        " • Bilingual support (Persian/English)",
        " • Local NER + Advanced Regex processing",
        " • OpenAI ChatGPT integration",
        " • Complete anonymization-restoration workflow",
        "\n🎯 Available Pattern Categories:",
    )
    for banner_line in intro_lines:
        print(banner_line)

    # One line per configured pattern category.
    for info in anonymizer.pattern_categories.values():
        print(f" {info['icon']} {info['name_fa']} ({info['name_en']}) - {len(info['patterns'])} patterns")

    closing_lines = (
        "\n🔍 Protected Data Types (High Priority):",
        " • Identity Codes (کد ملی، شبا، کارت بانکی)",
        " • Contact Information (تلفن، ایمیل)",
        " • Financial Data (مبالغ، حسابها)",
        " • Personal Names (با عناوین مشخص)",
        " • Business Information (شرکتها، آدرسها)",
        " • Technical Codes (کدهای سریال، شبکه)",
        "\n⚙️ Enhanced Features:",
        " • Category-based selective processing",
        " • User control with checkbox interface",
        " • Improved efficiency with targeted detection",
        " • Validation system prevents false positives",
        " • Common word blacklist filtering",
        " • Context-aware pattern matching",
        " • Overlap detection system",
        " • Persian/Arabic digit support",
        " • Refined accuracy with readable output",
        rule,
        "🎯 Now you can select exactly which data types to anonymize!",
    )
    for banner_line in closing_lines:
        print(banner_line)

    # NOTE(review): share=True together with server_name="0.0.0.0" makes an
    # app that handles sensitive text reachable from outside this machine -
    # confirm that this is intended for the deployment.
    launch_options = dict(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        favicon_path=None,
        ssl_verify=False,
    )
    app.launch(**launch_options)