Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import re | |
| import os | |
| import requests | |
| import time | |
| import logging | |
# Configure logging: INFO level on the root logger, plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
| class LightweightDataAnonymizer: | |
| def __init__(self): | |
| self.mapping_table = {} | |
| # دستهبندیهای الگوها برای UI | |
| self.pattern_categories = { | |
| 'personal_identity': { | |
| 'name_fa': 'اطلاعات شخصی و هویتی', | |
| 'name_en': 'Personal & Identity Information', | |
| 'patterns': ['PERSON', 'MIXED_NAMES', 'ID_NUMBER', 'ENGLISH_TITLES'], | |
| 'icon': '👤' | |
| }, | |
| 'financial': { | |
| 'name_fa': 'اطلاعات مالی', | |
| 'name_en': 'Financial Information', | |
| 'patterns': ['AMOUNT', 'INTERNATIONAL_CURRENCIES', 'ACCOUNT', 'FINANCIAL_TERMS', 'STOCK_SYMBOL'], | |
| 'icon': '💰' | |
| }, | |
| 'temporal': { | |
| 'name_fa': 'اطلاعات زمانی', | |
| 'name_en': 'Temporal Information', | |
| 'patterns': ['DATE', 'ADVANCED_DATE_FORMATS', 'TIME_RANGES'], | |
| 'icon': '📅' | |
| }, | |
| 'location': { | |
| 'name_fa': 'اطلاعات مکانی', | |
| 'name_en': 'Location Information', | |
| 'patterns': ['LOCATION', 'COMPLEX_ADDRESSES'], | |
| 'icon': '📍' | |
| }, | |
| 'technical': { | |
| 'name_fa': 'اطلاعات فنی و تکنولوژیکی', | |
| 'name_en': 'Technical & Technological', | |
| 'patterns': ['TECHNICAL_CODES', 'NETWORK_ADDRESSES', 'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS'], | |
| 'icon': '⚙️' | |
| }, | |
| 'business': { | |
| 'name_fa': 'اطلاعات کسبوکار', | |
| 'name_en': 'Business Information', | |
| 'patterns': ['COMPANY', 'BUSINESS_TERMS', 'PRODUCT', 'PETROCHEMICAL'], | |
| 'icon': '🏢' | |
| }, | |
| 'quantity': { | |
| 'name_fa': 'اطلاعات کمیت و واحد', | |
| 'name_en': 'Quantity & Unit Information', | |
| 'patterns': ['PERCENTAGE', 'VOLUME', 'RATIOS'], | |
| 'icon': '📊' | |
| }, | |
| 'communication': { | |
| 'name_fa': 'اطلاعات ارتباطی', | |
| 'name_en': 'Communication Information', | |
| 'patterns': ['PHONE', 'EMAIL'], | |
| 'icon': '📞' | |
| } | |
| } | |
| # counters | |
| self.counters = { | |
| 'PERSON': 0, 'MIXED_NAMES': 0, 'ID_NUMBER': 0, 'ENGLISH_TITLES': 0, | |
| 'AMOUNT': 0, 'INTERNATIONAL_CURRENCIES': 0, 'ACCOUNT': 0, | |
| 'FINANCIAL_TERMS': 0, 'STOCK_SYMBOL': 0, | |
| 'DATE': 0, 'ADVANCED_DATE_FORMATS': 0, 'TIME_RANGES': 0, | |
| 'LOCATION': 0, 'COMPLEX_ADDRESSES': 0, | |
| 'TECHNICAL_CODES': 0, 'NETWORK_ADDRESSES': 0, 'TECHNICAL_UNITS': 0, | |
| 'ACRONYMS_ABBREVIATIONS': 0, | |
| 'COMPANY': 0, 'BUSINESS_TERMS': 0, 'PRODUCT': 0, 'PETROCHEMICAL': 0, | |
| 'PERCENTAGE': 0, 'VOLUME': 0, 'RATIOS': 0, | |
| 'PHONE': 0, 'EMAIL': 0 | |
| } | |
| self.api_key = os.getenv("OPENAI_API_KEY", "") | |
| def get_category_choices(self, language='fa'): | |
| """دریافت لیست دستهبندیها برای چکباکس""" | |
| choices = [] | |
| for cat_key, cat_info in self.pattern_categories.items(): | |
| name = cat_info['name_fa'] if language == 'fa' else cat_info['name_en'] | |
| icon = cat_info['icon'] | |
| choices.append(f"{icon} {name}") | |
| return choices | |
| def get_selected_patterns(self, selected_categories, language='fa'): | |
| """تبدیل دستهبندیهای انتخاب شده به لیست الگوها""" | |
| selected_patterns = [] | |
| for cat_key, cat_info in self.pattern_categories.items(): | |
| name = cat_info['name_fa'] if language == 'fa' else cat_info['name_en'] | |
| icon = cat_info['icon'] | |
| category_display = f"{icon} {name}" | |
| if category_display in selected_categories: | |
| selected_patterns.extend(cat_info['patterns']) | |
| return selected_patterns | |
| def detect_language(self, text): | |
| """تشخیص زبان متن""" | |
| if not text: | |
| return 'fa' | |
| persian_chars = len(re.findall(r'[\u0600-\u06FF]', text)) | |
| english_chars = len(re.findall(r'[a-zA-Z]', text)) | |
| total = persian_chars + english_chars | |
| if total == 0: | |
| return 'fa' | |
| if persian_chars / total > 0.6: | |
| return 'fa' | |
| elif english_chars / total > 0.6: | |
| return 'en' | |
| else: | |
| return 'mixed' | |
| def get_comprehensive_patterns(self): | |
| """الگوهای جامع ناشناسسازی - نسخه کامل""" | |
| return { | |
| 'PERSON': [ | |
| r'آقای\s+([آ-ی۰-۹a-zA-Z]+(?:\s+[آ-ی۰-۹a-zA-Z]+)*)', | |
| r'خانم\s+([آ-ی۰-۹a-zA-Z]+(?:\s+[آ-ی۰-۹a-zA-Z]+)*)', | |
| r'مهندس\s+([آ-ی۰-۹a-zA-Z]+(?:\s+[آ-ی۰-۹a-zA-Z]+)*)', | |
| r'دکتر\s+([آ-ی۰-۹a-zA-Z]+(?:\s+[آ-ی۰-۹a-zA-Z]+)*)', | |
| r'استاد\s+([آ-ی۰-۹a-zA-Z]+(?:\s+[آ-ی۰-۹a-zA-Z]+)*)', | |
| r'Mr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)', | |
| r'Ms\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)', | |
| r'Dr\.\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)', | |
| r'([آ-ی۰-۹a-zA-Z]+\s+[آ-ی۰-۹a-zA-Z]+)(?:، مدیرعامل|\s+مدیرعامل|\s+رئیس)', | |
| r'مدیرعامل(?=\s|$|،|\.)', | |
| r'سرپرست(?=\s+و|\s|$|،|\.)', | |
| r'رئیس\s+هیأتمدیره' | |
| ], | |
| 'MIXED_NAMES': [ | |
| r'([آ-ی۰-۹a-zA-Z]{2,}\s+[آ-ی۰-۹a-zA-Z]{2,})', | |
| r'([A-Z][a-z]+-[A-Z][a-z]+)', | |
| r"([A-Z]'[A-Z][a-z]+)", | |
| r'Dr\.\s+([آ-ی۰-۹a-zA-Z\s]+)' | |
| ], | |
| 'ID_NUMBER': [ | |
| r'IR[۰-۹0-9]{24}', | |
| r'شبا[\s:]*IR[۰-۹0-9]{24}', | |
| r'IBAN[\s:]*IR[۰-۹0-9]{24}', | |
| r'شماره[\s]*شبا[\s:]*IR[۰-۹0-9]{24}', | |
| r'(?:کد[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}', | |
| r'(?:شناسه[\s]*)?(?:ملی[\s:]*)?[۰-۹0-9]{10}', | |
| r'National[\s]*(?:ID[\s:]*)?[0-9]{10}', | |
| r'(?:پاسپورت[\s:]*)?[A-Z][0-9]{8}', | |
| r'(?:Passport[\s:]*)?[A-Z][0-9]{8}', | |
| r'(?:کارت[\s:]*)?(?:[۰-۹0-9]{4}[-\s]?){3}[۰-۹0-9]{4}', | |
| r'(?:Card[\s:]*)?(?:[0-9]{4}[-\s]?){3}[0-9]{4}', | |
| r'SSN[\s:]*[0-9]{3}-[0-9]{2}-[0-9]{4}', | |
| r'FICO[\s]*(?:score[\s:]*)?[0-9]{3}', | |
| r'EIN[\s:]*[0-9]{2}-[0-9]{7}', | |
| r'Meeting[\s]*ID[\s:]*[0-9]{9,11}' | |
| ], | |
| 'ENGLISH_TITLES': [ | |
| r'business\s+partner', | |
| r'team\s+lead', | |
| r'head\s+of\s+production', | |
| r'senior\s+architect', | |
| r'civil\s+engineer', | |
| r'system\s+administrator', | |
| r'network\s+engineer', | |
| r'environmental\s+consultant', | |
| r'senior\s+loan\s+officer', | |
| r'facility\s+manager', | |
| r'project\s+team', | |
| r'technical\s+support' | |
| ], | |
| 'AMOUNT': [ | |
| r'\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان', | |
| r'مبلغ\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)?\s*تومان', | |
| r'\d+\s*تومان(?=\s+به\s+ازای|\s+فروش|،)', | |
| r'رقم\s+فعلی\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان', | |
| r'رقم\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد)\s*تومان', | |
| r'به\s+\d+(?:,\d{3})*\s*(?:میلیون|میلیارد|هزار)\s*تومان', | |
| r'\$\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)?', | |
| r'\d+(?:,\d{3})*\s*ریال', | |
| r'€\d+(?:,\d{3})*(?:\.\d+)?', | |
| r'\d+(?:,\d{3})*\s*AED', | |
| r'\$\d+(?:\.\d+)?[KMB]', | |
| r'€\d+(?:\.\d+)?[KM]' | |
| ], | |
| 'INTERNATIONAL_CURRENCIES': [ | |
| r'\d+(?:,\d{3})*\s+euro', | |
| r'€\d+(?:\.\d+)?M', | |
| r'\d+\s+EUR', | |
| r'\d+(?:,\d{3})*\s+AED', | |
| r'\d+(?:\.\d+)?M\s+AED', | |
| r'\$\d+(?:\.\d+)?M', | |
| r'\$\d+(?:\.\d+)?K', | |
| r'£\d+(?:,\d{3})*(?:\.\d+)?', | |
| r'\d+\s+GBP', | |
| r'\d+\s+CHF', | |
| r'¥\d+(?:,\d{3})*', | |
| r'\d+\s+JPY' | |
| ], | |
| 'ACCOUNT': [ | |
| r'(?:شماره[\s]*)?(?:حساب[\s]*)?(?:بانکی[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}', | |
| r'حساب[\s]*(?:شماره[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}', | |
| r'شماره[\s]*حساب[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}', | |
| r'Account[\s]*(?:Number[\s:]*)?(?:[0-9]{1,3}[-\s]?)*[0-9]{8,20}', | |
| r'[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{3}[-\s]?[۰-۹0-9]{6,12}', | |
| r'[۰-۹0-9]{2,4}[-\s]?[۰-۹0-9]{6,12}[-\s]?[۰-۹0-9]{2,4}', | |
| r'واریز[\s]*(?:سود[\s:]*)?(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}', | |
| r'سود[\s:]*(?:[۰-۹0-9]{1,3}[-\s]?)*[۰-۹0-9]{8,20}' | |
| ], | |
| 'FINANCIAL_TERMS': [ | |
| r'فروش\s+(?:ماهانه|تجمیعی|صادراتی)', | |
| r'درآمد\s+شرکت', | |
| r'سود\s+(?:خالص|نقدی)', | |
| r'صورتهای\s+مالی', | |
| r'بهای\s+تمامشده', | |
| r'سودآوری', | |
| r'عملکرد\s+مالی', | |
| r'میانگین\s+فروش', | |
| r'بالاترین\s+رقم\s+فروش', | |
| r'رقم\s+فروش', | |
| r'درآمدهای\s+عملیاتی' | |
| ], | |
| 'STOCK_SYMBOL': [ | |
| r'نماد\s+([آ-یa-zA-Z0-9]+)', | |
| r'(سبهان|غدیر|شتران|شپنا|پترول|فارس|خارک|پلاسکو|جم|کرمان|مارون|اراک|رازی|شازند|کاوه|بندر|پارس|خوزستان|ماهشهر|عسلویه)(?=\s|$|،|\.|\s+)', | |
| r'شرکت\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+را|\s+به)', | |
| r'پتروشیمی\s+([آ-یa-zA-Z\s]+?)(?=\s+در|\s+که|\s+با|،|\.|\s+$|\s+توان)', | |
| r'(AAPL|GOOGL|MSFT|AMZN|TSLA|META|NVDA|SABIC)(?=\s|$|,|\.)' | |
| ], | |
| 'DATE': [ | |
| r'[۰-۹0-9]{4}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}', | |
| r'[۰-۹0-9]{1,2}[/-][۰-۹0-9]{1,2}[/-][۰-۹0-9]{4}', | |
| r'(?:[۰-۹0-9]{1,2})\s*(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s*(?:[۰-۹0-9]{4})', | |
| r'(?:فروردین|اردیبهشت|خرداد|تیر|مرداد|شهریور|مهر|آبان|آذر|دی|بهمن|اسفند)\s+[۰-۹0-9]{4}', | |
| r'(?:[0-9]{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(?:[0-9]{4})', | |
| r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s*[0-9]{1,2},?\s*[0-9]{4}', | |
| r'سال\s+گذشته', | |
| r'سال\s+جاری', | |
| r'این\s+سال', | |
| r'ماه\s+قبل', | |
| r'ماه\s+اخیر', | |
| r'(?:13[0-9]{2}|14[0-9]{2}|20[0-9]{2}|19[0-9]{2})(?=\s|$|،|\.)' | |
| ], | |
| 'ADVANCED_DATE_FORMATS': [ | |
| r'(?:March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th),?\s+\d{4}', | |
| r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z', | |
| r'(?:PST|EST|GMT|UTC)(?:[+-]\d{1,2}:\d{2})?', | |
| r'Eastern\s+Time', | |
| r'GMT[+-]\d{1,2}:\d{2}', | |
| r'end\s+of\s+fiscal\s+year\s+\d{4}/\d{2}/\d{2}' | |
| ], | |
| 'TIME_RANGES': [ | |
| r'\d{2}:\d{2}-\d{2}:\d{2}', | |
| r'\d{2}:\d{2}\s+تا\s+\d{2}:\d{2}', | |
| r'\d{1,2}:\d{2}\s+(?:AM|PM)\s+(?:PST|EST|GMT|UTC)', | |
| r'\d{2}:\d{2}:\d{2}\s+(?:AM|PM)', | |
| r'COB\s*\(Close\s+of\s+Business\)', | |
| r'\d{1,3}\s+(?:business\s+days|روز\s+کاری)' | |
| ], | |
| 'LOCATION': [ | |
| r'(تهران|اصفهان|ماهشهر|عسلویه|بندرعباس|اهواز|شیراز|مشهد|تبریز|کرج|قم|رشت|کرمان|یزد|زاهدان|بوشهر|خرمشهر|آبادان|اراک|قزوین)', | |
| r'استان\s+([آ-ی\s]+)', | |
| r'شهر\s+([آ-ی\s]+)', | |
| r'(ایران|عراق|کویت|عربستان|امارات|قطر|عمان|بحرین|ترکیه|پاکستان|افغانستان)', | |
| r'داخلی|بازار\s+داخلی', | |
| r'خارجی|بازارهای\s+خارجی', | |
| r'(London|Paris|Tokyo|New\s+York|Dubai|Singapore|Hong\s+Kong|Shanghai|Mumbai|Frankfurt|Amsterdam)' | |
| ], | |
| 'COMPLEX_ADDRESSES': [ | |
| r'کیلومتر\s+\d+\s+جاده\s+[آ-ی\s]+-[آ-ی\s]+', | |
| r'روبروی\s+(?:پمپ\s+بنزین|بانک|پارک|مسجد|بیمارستان)\s+[آ-یa-zA-Z\s]+', | |
| r'Building-[A-Z],?\s+Floor-\d+,?\s+Unit-[A-Z0-9]+', | |
| r'rack\s+number\s+R-\d+,?\s+slot\s+\d+', | |
| r'phase\s+\d+\s+development,?\s+block\s+[A-Z],?\s+plot\s+\d+-[A-Z]', | |
| r'\d{2,5}\s+[A-Z][a-z]+\s+(?:Street|Avenue|Boulevard|Road|Drive),?\s+Floor\s+\d+,?\s+Building\s+[A-Z]', | |
| r'شهرک\s+صنعتی\s+[آ-ی\s]+،?\s+محور\s+[آ-ی\s]+' | |
| ], | |
| 'TECHNICAL_CODES': [ | |
| r'SN-\d{4}-[A-Z]{3}-\d{4}', | |
| r'Serial\s+Number[\s:]*[A-Z0-9-]+', | |
| r'REF-[A-Z]{3}-\d{4}-\d{3}', | |
| r'DOC-[A-Z]{2}-\d{4}-\d{4}', | |
| r'INF-\d{4}-\d{4}', | |
| r'CTR/\d{4}/\d{3}', | |
| r'HVAC-\d{7}', | |
| r'Generator-Model-[A-Z0-9]+', | |
| r'LOI-\d{4}-[A-Z]{4}-\d{3}', | |
| r'BOQ-\d{4}-[A-Z]{3}-\d{3}', | |
| r'#INV-\d{4}-Q\d-\d{4}', | |
| r'ESC-\d{4}-[A-Z]{3}-\d{3}', | |
| r'BN-\d{6}-[A-Z]\d+' | |
| ], | |
| 'NETWORK_ADDRESSES': [ | |
| r'\b(?:\d{1,3}\.){3}\d{1,3}\b', | |
| r'xxx\.xxx\.xxx\.xxx', | |
| r'[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}:[A-F0-9]{2}', | |
| r'srv-[a-z]+-[a-z]+-\d{2}', | |
| r'[a-z]+-[a-z]+\d*\.[a-z]+\.[a-z]+', | |
| r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,4}(?:\.[a-zA-Z]{2,4})?' | |
| ], | |
| 'TECHNICAL_UNITS': [ | |
| r'\d+(?:\.\d+)?\s*MW', | |
| r'\d+(?:\.\d+)?\s*kWh?', | |
| r'\d+(?:,\d{3})*\s*cubic\s+meters', | |
| r'\d+(?:,\d{3})*\s*m³', | |
| r'\d+(?:,\d{3})*\s*sq\s+ft', | |
| r'\d+(?:\.\d+)?\s*ppm', | |
| r'\d+(?:\.\d+)?\s*mg/m³', | |
| r'\b(?:CO2|NOx|SO2)\b', | |
| r'\d+(?:\.\d+)?\s*TB', | |
| r'\d+(?:\.\d+)?\s*GB', | |
| r'\d+(?:,\d{3})*\s*square\s+meters', | |
| r'\d+(?:\.\d+)?\%\s*efficiency', | |
| r'FICO\s+score:\s*\d{3}', | |
| r'\d+(?:\.\d+)?\s*(?:bar|psi)', | |
| r'\d+(?:\.\d+)?\s*°[CF]', | |
| r'\d+(?:\.\d+)?\s*(?:rpm|m/s)' | |
| ], | |
| 'ACRONYMS_ABBREVIATIONS': [ | |
| r'\b(?:HVAC|IT|HSE|BOQ|LC|COB)\b', | |
| r'\b(?:YTD|NNN|EIN|SSN|FICO)\b', | |
| r'\bIP\s+Address\b', | |
| r'\bMAC\s+Address\b', | |
| r'\bURL\b', | |
| r'\b(?:LLC|Corp|Inc|Ltd)\b', | |
| r'\b(?:PST|GMT|UTC|EST)\b', | |
| r'\b(?:CO2|NOx|pH|UV)\b', | |
| r'\b(?:SCADA|PLC|HMI)\b', | |
| r'\b(?:GDP|CPI|ROI|NPV)\b', | |
| r'\b(?:FOB|CIF|DDP)\b', | |
| r'\b(?:ABA|SWIFT|IBAN)\b' | |
| ], | |
| 'COMPANY': [ | |
| r'شرکت(?=\s+در|\s+که|\s+با|\s+را|\s+به|\s+طی)', | |
| r'([آ-یa-zA-Z\s]+)\s+شرکت', | |
| r'این\s+شرکت(?=\s|$|،|\.)', | |
| r'(بانک\s+[آ-یa-zA-Z\s]+)', | |
| r'([A-Z][a-zA-Z\s]+(?:Inc|Corp|Corporation|Company|Ltd|Limited|LLC))' | |
| ], | |
| 'BUSINESS_TERMS': [ | |
| r'تحلیل\s+عملکرد', | |
| r'گزارش\s+(?:فعالیت|عملکرد)\s+ماهانه', | |
| r'وضعیت\s+فروش', | |
| r'تولید\s+پایدار', | |
| r'سهم\s+بازار', | |
| r'صادرات\s+هدفمند', | |
| r'بهرهوری', | |
| r'ظرفیتهای\s+داخلی', | |
| r'شرکتهای\s+پیشرو', | |
| r'صنعت\s+پتروشیمی', | |
| r'سرمایهگذاران\s+بنیادی', | |
| r'شاخصهای\s+عملیاتی', | |
| r'برنامهریزی\s+مناسب', | |
| r'واحد\s+فروش', | |
| r'موجودی\s+انبار', | |
| r'فاز\s+رشد\s+جدید', | |
| r'ترکیب\s+فروش', | |
| r'سهم\s+صادراتی', | |
| r'روند\s+عملکرد', | |
| r'اعداد\s+اعلامشده', | |
| r'دادههای\s+ثبتشده' | |
| ], | |
| 'PRODUCT': [ | |
| r'\b(?:VCM|PVC|PE|PP|PS|ABS|SAN|PC|PMMA|PET|PBT|PA|POM|TPU)\b', | |
| r'پلی\s*(?:اتیلن|پروپیلن|استایرن|کربنات|متیل)', | |
| r'\b(?:اتیلن|پروپیلن|بنزن|تولوئن|زایلن|متانول|اتانول|استون|فنول)\b', | |
| r'\b(?:کلر|هیدروژن|اکسیژن|نیتروژن|آمونیاک|اتان|پروپان|بوتان)\b', | |
| r'محصول(?:ات)?', | |
| r'تولیدات\s+شرکت' | |
| ], | |
| 'PETROCHEMICAL': [ | |
| r'\b(?:LDPE|HDPE|LLDPE|PP|PS|EPS|ABS|SAN|PC|PMMA|PET|PBT|PA6|PA66|POM|TPU|EVA|EAA)\b', | |
| r'(?:Ethylene\s+Vinyl\s+Acetate|Ethyl\s+Acrylate|Methyl\s+Methacrylate|Polyethylene\s+Terephthalate)' | |
| ], | |
| 'PERCENTAGE': [ | |
| r'\d+(?:\.\d+)?\s*درصد(?:\s+افزایش|\s+رشد|\s+کاهش|\s+بالاتر|\s+پایینتر)?', | |
| r'\d+(?:\.\d+)?\s*%', | |
| r'معادل\s+\d+(?:\.\d+)?\s*درصد', | |
| r'حدود\s+\d+(?:\.\d+)?\s*درصد', | |
| r'با\s+\d+(?:\.\d+)?\s*درصد\s+افزایش', | |
| r'رشد\s+\d+(?:\.\d+)?\s*درصدی', | |
| r'\d+(?:\.\d+)?\s*درصدی(?=\s+همراه|\s+بوده)', | |
| r'میزان\s+رشد(?=\s+نسبت|\s+معادل)', | |
| r'افزایش\s+قابلتوجهی', | |
| r'بهبود\s+نسبی', | |
| r'\d+(?:\.\d+)?\%\s*(?:increase|decrease|growth|improvement)', | |
| r'(?:approximately|about)\s+\d+(?:\.\d+)?\%' | |
| ], | |
| 'VOLUME': [ | |
| r'\d+(?:,\d{3})*\s*تن', | |
| r'\d+(?:,\d{3})*\s*(?:کیلوگرم|لیتر|بشکه)', | |
| r'میزان\s+\d+(?:,\d{3})*\s*تن', | |
| r'مقدار\s+تولید', | |
| r'حجم\s+فروش', | |
| r'ظرفیت\s+(?:تولید|اسمی)', | |
| r'\d+(?:,\d{3})*\s*(?:tons|kg|liters|barrels)', | |
| r'\d+(?:,\d{3})*\s*(?:metric\s+tons|MT)', | |
| r'\d+(?:,\d{3})*\s*(?:thousand\s+tons|KT)' | |
| ], | |
| 'RATIOS': [ | |
| r'نسبت\s+(?:فروش|تولید)\s+به\s+[آ-ی\s]+', | |
| r'\d+(?:\.\d+)?\s*نزدیک', | |
| r'برابر\s+با\s+\d+(?:\.\d+)?', | |
| r'معادل\s+\d+(?:\.\d+)?', | |
| r'میزان\s+(?:رشد|افزایش)', | |
| r'شاخص\s+(?:مهم|عملیاتی)', | |
| r'\d+(?:\.\d+)?\s*درصد\s+کل\s+تولید' | |
| ], | |
| 'PHONE': [ | |
| r'(?:تلفن[\s:]*)?(?:شماره[\s:]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}', | |
| r'(?:تماس[\s:]*)?(?:شماره[\s:]*)?(?:با[\s]*)?(?:0)?(?:[۰-۹0-9]{2,3}[-\s]?)?[۰-۹0-9]{7,8}', | |
| r'(?:موبایل[\s:]*)?(?:شماره[\s:]*)?(?:0)?9[۰-۹0-9]{9}', | |
| r'[۰-۹0-9]{3,4}[-\s][۰-۹0-9]{7,8}', | |
| r'[۰-۹0-9]{11}(?!\d)', | |
| r'(?:\+98|0098)?[۰-۹0-9]{10}', | |
| r'[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}[-\s]?[۰-۹0-9]{3,4}', | |
| r'\+[0-9]{1,3}-[0-9]{3}-[0-9]{3}-[0-9]{4}(?:\s+ext\.\s+[0-9]{3,4})?', | |
| r'\([0-9]{3}\)\s+[0-9]{3}-[0-9]{4}' | |
| ], | |
| 'EMAIL': [ | |
| r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', | |
| r'ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', | |
| r'email[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', | |
| r'نشانی[\s]*الکترونیکی[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', | |
| r'آدرس[\s]*ایمیل[\s:]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', | |
| r'facility\.manager@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' | |
| ] | |
| } | |
| def anonymize_text(self, original_text, lang='fa', selected_categories=None): | |
| """گام 1: ناشناسسازی متن با الگوهای انتخاب شده - نسخه Lightweight""" | |
| try: | |
| if not original_text or not original_text.strip(): | |
| return "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!" | |
| # ریست متغیرها | |
| self.mapping_table = {} | |
| self.counters = {key: 0 for key in self.counters.keys()} | |
| anonymized = original_text | |
| found_entities = set() | |
| # تشخیص زبان | |
| detected_lang = self.detect_language(original_text) | |
| logger.info(f"Detected language: {detected_lang}") | |
| # استخراج با الگوهای Regex - Lightweight mode | |
| all_patterns = self.get_comprehensive_patterns() | |
| # فیلتر کردن الگوها بر اساس انتخاب کاربر | |
| if selected_categories: | |
| selected_pattern_types = self.get_selected_patterns(selected_categories, lang) | |
| patterns = {k: v for k, v in all_patterns.items() if k in selected_pattern_types} | |
| logger.info(f"📋 Using selected pattern categories: {len(patterns)} types") | |
| else: | |
| patterns = all_patterns | |
| logger.info("📋 Using all available pattern categories") | |
| # پردازش patterns | |
| logger.info("🔍 Running lightweight regex extraction...") | |
| processed_entities = set() | |
| # اولویتبندی دستهها | |
| priority_order = [ | |
| 'ID_NUMBER', 'EMAIL', 'PHONE', 'ACCOUNT', 'TECHNICAL_CODES', | |
| 'NETWORK_ADDRESSES', 'INTERNATIONAL_CURRENCIES', 'AMOUNT', | |
| 'TECHNICAL_UNITS', 'ACRONYMS_ABBREVIATIONS', 'ADVANCED_DATE_FORMATS', | |
| 'TIME_RANGES', 'COMPLEX_ADDRESSES', 'MIXED_NAMES', 'ENGLISH_TITLES', | |
| 'STOCK_SYMBOL', 'COMPANY', 'PERSON', 'PERCENTAGE', 'VOLUME', | |
| 'RATIOS', 'LOCATION', 'DATE', 'FINANCIAL_TERMS', 'BUSINESS_TERMS', | |
| 'PRODUCT', 'PETROCHEMICAL' | |
| ] | |
| for category in priority_order: | |
| if category in patterns: | |
| pattern_list = patterns[category] | |
| for pattern in pattern_list: | |
| try: | |
| matches = re.finditer(pattern, original_text, re.IGNORECASE | re.MULTILINE) | |
| for match in matches: | |
| if match.groups(): | |
| item = match.group(1).strip() | |
| full_match = match.group(0).strip() | |
| else: | |
| item = match.group(0).strip() | |
| full_match = item | |
| # بررسی تداخل | |
| overlaps = False | |
| match_start, match_end = match.span() | |
| for proc_start, proc_end in processed_entities: | |
| if not (match_end <= proc_start or match_start >= proc_end): | |
| overlaps = True | |
| break | |
| if (not overlaps and | |
| full_match not in found_entities and | |
| full_match not in self.mapping_table and | |
| len(full_match) >= 2): | |
| self.counters[category] += 1 | |
| code = f"{category}_{self.counters[category]:03d}_REGEX" | |
| self.mapping_table[full_match] = code | |
| found_entities.add(full_match) | |
| processed_entities.add((match_start, match_end)) | |
| except re.error as e: | |
| logger.error(f"Regex error in pattern {pattern}: {e}") | |
| continue | |
| # جایگزینی در متن | |
| sorted_items = sorted(self.mapping_table.items(), key=lambda x: len(x[0]), reverse=True) | |
| for original_item, code in sorted_items: | |
| anonymized = anonymized.replace(original_item, code) | |
| logger.info(f"✅ Lightweight anonymization completed. Found {len(self.mapping_table)} entities.") | |
| return anonymized | |
| except Exception as e: | |
| logger.error(f"Anonymization error: {e}") | |
| return f"❌ Error in anonymization: {str(e)}" if lang == 'en' else f"❌ خطا در ناشناسسازی: {str(e)}" | |
| def send_to_chatgpt(self, anonymized_text, lang='fa'): | |
| """گام 2: ارسال به ChatGPT""" | |
| try: | |
| if not anonymized_text or not anonymized_text.strip(): | |
| return "❌ Anonymized text is empty!" if lang == 'en' else "❌ متن ناشناسشده خالی است!" | |
| if not self.api_key: | |
| return "❌ API Key not configured! Please set OPENAI_API_KEY environment variable." if lang == 'en' else "❌ کلید API تنظیم نشده است!" | |
| system_msg = "You are a professional analyst. Answer questions accurately." if lang == 'en' else "شما یک تحلیلگر حرفهای هستید. به سوالات با دقت پاسخ دهید." | |
| headers = { | |
| "Authorization": f"Bearer {self.api_key}", | |
| "Content-Type": "application/json" | |
| } | |
| data = { | |
| "model": "gpt-4o-mini", | |
| "messages": [ | |
| {"role": "system", "content": system_msg}, | |
| {"role": "user", "content": anonymized_text} | |
| ], | |
| "max_tokens": 2000, | |
| "temperature": 0.7 | |
| } | |
| response = requests.post( | |
| "https://api.openai.com/v1/chat/completions", | |
| headers=headers, | |
| json=data, | |
| timeout=30 | |
| ) | |
| if response.status_code == 200: | |
| result = response.json() | |
| return result['choices'][0]['message']['content'] | |
| else: | |
| error_data = response.json() if response.content else {} | |
| error_message = error_data.get('error', {}).get('message', response.text) | |
| return f"❌ API Error: {error_message}" | |
| except Exception as e: | |
| return f"❌ Error connecting to ChatGPT: {str(e)}" if lang == 'en' else f"❌ خطا در ارتباط با ChatGPT: {str(e)}" | |
| def deanonymize_response(self, gpt_response, lang='fa'): | |
| """گام 3: بازگردانی""" | |
| try: | |
| if not gpt_response or not gpt_response.strip(): | |
| return "❌ ChatGPT response is empty!" if lang == 'en' else "❌ پاسخ ChatGPT خالی است!" | |
| if not self.mapping_table: | |
| return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!" | |
| final_result = gpt_response | |
| reverse_mapping = {code: original for original, code in self.mapping_table.items()} | |
| sorted_codes = sorted(reverse_mapping.items(), key=lambda x: len(x[0]), reverse=True) | |
| for code, original in sorted_codes: | |
| final_result = final_result.replace(code, original) | |
| return final_result | |
| except Exception as e: | |
| return f"❌ Deanonymization error: {str(e)}" if lang == 'en' else f"❌ خطا در بازگردانی: {str(e)}" | |
| def get_model_status(self): | |
| """وضعیت سیستم""" | |
| status = "🚀 **Lightweight Anonymization System Status (No Dependencies):**\n\n" | |
| status += "• **Mode**: Pure Regex Processing (No torch/transformers required)\n" | |
| status += "• **Performance**: High-speed lightweight processing\n" | |
| status += "• **Compatibility**: Works on any Python environment\n" | |
| status += "• **Memory Usage**: Minimal (< 100MB)\n" | |
| status += f"\n🎯 **Available Pattern Categories:**" | |
| for cat_key, cat_info in self.pattern_categories.items(): | |
| icon = cat_info['icon'] | |
| name_fa = cat_info['name_fa'] | |
| pattern_count = len(cat_info['patterns']) | |
| status += f"\n {icon} {name_fa}: {pattern_count} patterns" | |
| status += f"\n\n✨ **Lightweight Features:**" | |
| status += f"\n 🎯 User-controlled category selection" | |
| status += f"\n 🛡️ High-precision regex patterns (221 total)" | |
| status += f"\n 📊 Efficient targeted processing" | |
| status += f"\n ⚡ Zero external model dependencies" | |
| status += f"\n 🔥 Works perfectly in HuggingFace Spaces" | |
| status += f"\n\n💡 **Advantages of Lightweight Mode:**" | |
| status += f"\n ✅ No dependency issues" | |
| status += f"\n ✅ Fast processing speed" | |
| status += f"\n ✅ Low memory usage" | |
| status += f"\n ✅ Consistent results" | |
| status += f"\n ✅ Easy deployment anywhere" | |
| return status | |
# Create the module-level anonymizer instance used by the functions below.
anonymizer = LightweightDataAnonymizer()
def process_all_steps(input_text, language, selected_categories):
    """Run anonymize -> ChatGPT -> restore and report the outcome.

    Args:
        input_text: raw user text; ``None`` or blank yields an error message.
        language: 'English' selects English messages; anything else Persian.
        selected_categories: category labels chosen in the UI; a falsy value
            means every category is applied.

    Returns:
        A 4-tuple ``(status_message, anonymized_text, gpt_response,
        final_result)``. Fields that were not produced are empty strings.
    """
    lang = 'en' if language == 'English' else 'fa'
    # Guard against None as well as blank input (``None.strip()`` would raise
    # here, outside the try block).
    if not input_text or not input_text.strip():
        error_msg = "❌ Please enter input text!" if lang == 'en' else "❌ لطفاً متن ورودی را وارد کنید!"
        return error_msg, "", "", ""
    try:
        start_time = time.time()
        anonymized_text = anonymizer.anonymize_text(input_text, lang, selected_categories)
        if anonymized_text.startswith("❌"):
            return anonymized_text, "", "", ""

        # An empty selection makes anonymize_text apply every category, so
        # report the full count in both outcomes (one branch used to say 0).
        total_categories = len(anonymizer.pattern_categories)
        selected_count = len(selected_categories) if selected_categories else total_categories
        entities_found = len(anonymizer.mapping_table)

        gpt_response = anonymizer.send_to_chatgpt(anonymized_text, lang)
        if gpt_response.startswith("❌"):
            # Anonymization worked but the API step failed: still return the
            # anonymized text together with the API error.
            success_msg = (f"✅ Lightweight anonymization completed successfully!\n"
                           f"📋 Selected categories: {selected_count} | 🔍 Pure Regex Processing\n"
                           f"📊 Total protected entities: {entities_found} | ⚡ High-speed lightweight mode")
            return success_msg, anonymized_text, gpt_response, ""

        final_result = anonymizer.deanonymize_response(gpt_response, lang)
        total_time = time.time() - start_time
        success_msg = (f"🎉 Complete lightweight anonymization & restoration successful!\n"
                       f"🔧 Method: Pure Regex Processing | 📋 Categories: {selected_count}/{total_categories}\n"
                       f"📊 Total: {entities_found} entities | ⏱️ Time: {total_time:.2f}s\n"
                       f"⚡ Zero dependencies - Maximum compatibility!")
        return success_msg, anonymized_text, gpt_response, final_result
    except Exception as e:
        # UI boundary: surface the failure as a message instead of raising.
        error_msg = f"❌ Processing error: {str(e)}" if lang == 'en' else f"❌ خطا در پردازش: {str(e)}"
        return error_msg, "", "", ""
def get_mapping_table(language):
    """Render the current entity -> code mapping as a Markdown report.

    Entries are grouped by pattern type, with at most three samples shown
    per group.

    Args:
        language: 'English' selects English messages; anything else Persian.

    Returns:
        The report, or a "❌" message when the mapping table is empty.
    """
    lang = 'en' if language == 'English' else 'fa'
    if not anonymizer.mapping_table:
        return "❌ Mapping table is empty!" if lang == 'en' else "❌ جدول نگاشت خالی است!"

    # Count the regexes actually defined instead of quoting a fixed number.
    total_patterns = sum(len(p) for p in anonymizer.get_comprehensive_patterns().values())

    parts = [
        "📋 **Lightweight Mapping Table (Pure Regex):**\n\n",
        # Overall statistics.
        f"📊 **Statistics**: {len(anonymizer.mapping_table)} total entities\n",
        f"🔍 **Method**: Pure Regex Processing ({total_patterns} patterns)\n",
        "⚡ **Mode**: Lightweight (No external dependencies)\n\n",
    ]

    # Group entries by pattern type. Codes look like "<TYPE>_<NNN>_REGEX" and
    # TYPE may itself contain underscores (e.g. ID_NUMBER), so strip the two
    # trailing fields; taking only the first "_" field truncated such names
    # and merged e.g. TECHNICAL_CODES with TECHNICAL_UNITS.
    category_stats = {}
    for original, code in anonymizer.mapping_table.items():
        category = code.rsplit('_', 2)[0]
        category_stats.setdefault(category, []).append((original, code))

    for category, items in category_stats.items():
        parts.append(f"🔍 **{category}** ({len(items)} items):\n")
        for original, code in items[:3]:  # show the first three samples
            parts.append(f" • `{original}` → `{code}`\n")
        if len(items) > 3:
            remaining = len(items) - 3
            if lang == 'en':
                parts.append(f" ... and {remaining} more\n")
            else:
                parts.append(f" ... و {remaining} مورد دیگر\n")
        parts.append("\n")

    parts.append("✨ **Lightweight System**: Maximum efficiency with minimal dependencies!")
    return "".join(parts)
def clear_all():
    """Reset the shared anonymizer state and return five empty strings."""
    anonymizer.mapping_table = {}
    # Zero every counter while keeping the same keys in the same order.
    anonymizer.counters = dict.fromkeys(anonymizer.counters, 0)
    return ("",) * 5
def update_ui_text(language):
    """Return the UI strings for the given language as a new dict.

    'English' selects the English texts (direction 'ltr'); any other value
    selects the Persian texts (direction 'rtl').
    """
    if language != 'English':
        persian = dict(
            title='سیستم ناشناسسازی سبک و دقیق دادهها',
            step1='متن ورودی و انتخاب دستهبندی',
            step2='متن ناشناسشده',
            step3='پاسخ خام ChatGPT',
            step4='پاسخ نهایی بازگردانده شده',
            input_placeholder='متن اصلی خود را اینجا وارد کنید...\nمثال: گزارشهای شرکت، نام اشخاص، مبالغ مالی، شماره تلفن، ایمیل، شماره شبا، حساب بانکی و غیره\n\n✨ سیستم سبک با کنترل دستهبندی!',
            process_btn='پردازش با دستهبندیهای انتخاب شده',
            clear_btn='پاک کردن همه',
            mapping_btn='نمایش جدول نگاشت سبک',
            status_btn='نمایش وضعیت سیستم سبک',
            categories_label='انتخاب دستهبندیهای الگو:',
            direction='rtl',
        )
        return persian

    english = dict(
        title='Lightweight High-Precision Data Anonymization System',
        step1='Input Text & Category Selection',
        step2='Anonymized Text',
        step3='Raw ChatGPT Response',
        step4='Final Restored Response',
        input_placeholder='Enter your original text here...\nExample: Company reports, person names, financial amounts, phone numbers, emails, IBAN codes, bank accounts, etc.\n\n✨ Lightweight system with category-based control!',
        process_btn='Process with Selected Categories',
        clear_btn='Clear All',
        mapping_btn='Show Lightweight Mapping Table',
        status_btn='Show Lightweight System Status',
        categories_label='Select Pattern Categories:',
        direction='ltr',
    )
    return english
def update_interface(language):
    """Build the list of ``gr.update`` objects that re-localise the UI.

    The list has 17 entries in a fixed order (title, step-1 heading, input
    box, two buttons, ..., category checkbox group); the order must stay
    aligned with the outputs this callback is wired to.
    """
    texts = update_ui_text(language)
    english = language == 'English'
    rtl = not english
    direction = texts['direction']

    def heading(icon, key):
        # Section heading rendered in the active text direction.
        return gr.update(value=f"<h2 style='direction: {direction};'>{icon} {texts[key]}</h2>")

    def button(icon, key):
        return gr.update(value=f"{icon} {texts[key]}")

    # Direction class for the workflow container.
    workflow_css = "workflow rtl" if rtl else "workflow ltr"
    # Category labels in the active language.
    category_choices = anonymizer.get_category_choices('en' if english else 'fa')

    title_html = f"<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 {texts['title']}</h1>"

    updates = [gr.update(value=title_html)]
    updates.append(heading("🔍", 'step1'))
    updates.append(gr.update(placeholder=texts['input_placeholder'], rtl=rtl))
    updates.append(button("🚀", 'process_btn'))
    updates.append(button("🗑️", 'clear_btn'))
    updates.append(gr.update(rtl=rtl))
    updates.append(heading("🎭", 'step2'))
    updates.append(gr.update(rtl=rtl))
    updates.append(heading("🤖", 'step3'))
    updates.append(gr.update(rtl=rtl))
    updates.append(heading("✅", 'step4'))
    updates.append(gr.update(rtl=rtl))
    updates.append(button("📋", 'mapping_btn'))
    updates.append(button("📊", 'status_btn'))
    updates.append(gr.update(rtl=rtl))
    updates.append(gr.update(elem_classes=workflow_css))
    # Every category is selected by default.
    updates.append(gr.update(label=texts['categories_label'], choices=category_choices, value=category_choices))
    return updates
# Stylesheet handed to gr.Blocks(css=...): page background, RTL/LTR helpers,
# the four-column ".workflow" grid with top-aligned columns, fixed-height
# text boxes, the pulsing status box, the category-selection panel, buttons,
# headings, and responsive breakpoints at 1200px and 768px.
#
# The textarea height is pinned with !important, so a resize grip could never
# take effect; resize is therefore disabled rather than advertised.  The
# <=768px breakpoint shrinks the textarea together with its 300px wrapper so
# the inner control does not overflow it.
#
# NOTE(review): the selectors .gradio-textbox, .gradio-button, .gradio-column
# and .gradio-row are not assigned through elem_classes anywhere in the layout
# below (only "workflow", "workflow-column", "status-box",
# "category-selection" and "gradio-checkboxgroup" are) -- confirm that the
# installed Gradio version emits them, otherwise those rules are inert.
custom_css = """
body, .gradio-container {
    font-family: 'Segoe UI', Tahoma, Arial, sans-serif !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    min-height: 100vh !important;
    padding: 20px !important;
}
.rtl {
    direction: rtl !important;
    text-align: right !important;
}
.ltr {
    direction: ltr !important;
    text-align: left !important;
}
.workflow {
    display: grid !important;
    grid-template-columns: 1fr 1fr 1fr 1fr !important;
    gap: 25px !important;
    padding: 30px !important;
    align-items: start !important;
    align-content: start !important;
    grid-auto-rows: auto !important;
}
.workflow > * {
    align-self: start !important;
    vertical-align: top !important;
    margin-top: 0 !important;
}
.workflow .gradio-column,
.workflow-column {
    display: flex !important;
    flex-direction: column !important;
    align-items: stretch !important;
    justify-content: flex-start !important;
    height: auto !important;
    min-height: 0 !important;
    margin-top: 0 !important;
    padding-top: 0 !important;
}
.gradio-textbox {
    border-radius: 10px !important;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
    flex-grow: 1 !important;
    min-height: 380px !important;
    max-height: 380px !important;
    height: 380px !important;
}
.gradio-textbox textarea {
    min-height: 350px !important;
    max-height: 350px !important;
    height: 350px !important;
    resize: none !important;
}
.workflow.rtl {
    direction: rtl !important;
}
.workflow.ltr {
    direction: ltr !important;
}
h1, h2, h3 {
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
    margin-top: 0 !important;
    margin-bottom: 10px !important;
    padding-top: 0 !important;
    line-height: 1.2 !important;
}
h2 {
    min-height: 40px !important;
    max-height: 40px !important;
    display: flex !important;
    align-items: center !important;
    margin-bottom: 15px !important;
}
.status-box {
    background: linear-gradient(135deg, #4CAF50, #45a049) !important;
    border: 3px solid #2E7D32 !important;
    border-radius: 15px !important;
    padding: 15px !important;
    margin: 10px 0 !important;
    box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3) !important;
    animation: pulse 2s infinite !important;
    min-height: 120px !important;
    max-height: 120px !important;
}
.status-box textarea {
    background: rgba(255, 255, 255, 0.95) !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
    color: #1B5E20 !important;
    text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.8) !important;
    min-height: 80px !important;
    max-height: 80px !important;
}
.category-selection {
    background: linear-gradient(135deg, #E3F2FD, #BBDEFB) !important;
    border: 2px solid #1976D2 !important;
    border-radius: 15px !important;
    padding: 20px !important;
    margin: 15px 0 !important;
    box-shadow: 0 6px 20px rgba(25, 118, 210, 0.2) !important;
}
.gradio-checkboxgroup {
    background: rgba(255, 255, 255, 0.9) !important;
    border-radius: 10px !important;
    padding: 15px !important;
    margin: 10px 0 !important;
}
@keyframes pulse {
    0% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
    50% { box-shadow: 0 8px 40px rgba(76, 175, 80, 0.6); }
    100% { box-shadow: 0 8px 32px rgba(76, 175, 80, 0.3); }
}
.gradio-button {
    border-radius: 25px !important;
    font-weight: bold !important;
    transition: all 0.3s ease !important;
    margin: 5px 0 !important;
    min-height: 50px !important;
    max-height: 50px !important;
}
.gradio-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
}
h1 {
    background: linear-gradient(45deg, #FFD700, #FFA500) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    background-clip: text !important;
    min-height: 80px !important;
}
@media (max-width: 1200px) {
    .workflow {
        grid-template-columns: 1fr 1fr !important;
        gap: 20px !important;
    }
}
@media (max-width: 768px) {
    .workflow {
        grid-template-columns: 1fr !important;
        gap: 15px !important;
    }
    .gradio-textbox {
        min-height: 300px !important;
        max-height: 300px !important;
        height: 300px !important;
    }
    .gradio-textbox textarea {
        min-height: 270px !important;
        max-height: 270px !important;
        height: 270px !important;
    }
}
[data-testid="textbox"]:dir(rtl) {
    text-align: right !important;
    direction: rtl !important;
}
[data-testid="textbox"]:dir(ltr) {
    text-align: left !important;
    direction: ltr !important;
}
.gradio-container .gradio-column {
    align-self: start !important;
    vertical-align: top !important;
}
.gradio-container .gradio-row {
    align-items: flex-start !important;
}
* {
    box-sizing: border-box !important;
}
.gradio-container {
    align-items: start !important;
    justify-content: start !important;
}
"""
# Gradio user interface, with the corrected alignment matching
# enhanced_anonymization_selective.py.
#
# Layout, top to bottom: language selector, page title, category check boxes,
# the four-column workflow row (input -> anonymized -> raw ChatGPT reply ->
# restored reply), then two collapsible panels (mapping table, system status)
# that stay hidden until their button is pressed.
#
# NOTE(review): the nesting of the `with` blocks was reconstructed from a copy
# of this file whose indentation had been stripped.  Confirm in particular
# that the title Column is a sibling of (not nested inside) the language
# selector Row, and that `category_info` belongs inside the category Column.


def _show_component():
    """Return a Gradio update that makes the target component visible."""
    return gr.update(visible=True)


with gr.Blocks(title="⚡ Lightweight Anonymization System", theme=gr.themes.Soft(), css=custom_css) as app:
    with gr.Row():
        language_selector = gr.Radio(
            choices=["فارسی", "English"],
            value="فارسی",
            label="Language / زبان",
            interactive=True,
        )
    with gr.Column():
        title = gr.HTML("<h1 style='text-align: center; color: #FFD700; font-size: 3.5em; font-weight: bold; text-shadow: 3px 3px 6px rgba(0,0,0,0.5); margin: 20px 0; background: linear-gradient(45deg, #FFD700, #FFA500); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;'>📊 سیستم ناشناسسازی سبک و دقیق دادهها</h1>")

    # Category selection section
    with gr.Row(elem_classes="category-selection"):
        with gr.Column():
            category_title = gr.HTML("<h3 style='text-align: center; color: #1976D2; margin-bottom: 15px;'>🎯 انتخاب دستهبندیهای الگوی ناشناسسازی</h3>")
            pattern_categories = gr.CheckboxGroup(
                choices=anonymizer.get_category_choices('fa'),
                value=anonymizer.get_category_choices('fa'),  # select all by default
                label="انتخاب دستهبندیهای الگو:",
                interactive=True,
                elem_classes=["gradio-checkboxgroup"],
            )
            category_info = gr.HTML("""
<div style='background: rgba(255, 255, 255, 0.9); padding: 15px; border-radius: 10px; margin-top: 10px;'>
<p style='margin: 0; color: #666; font-size: 0.9em; text-align: center;'>
💡 <strong>راهنمایی:</strong> فقط دستهبندیهایی که نیاز دارید را انتخاب کنید تا ناشناسسازی دقیقتر و سریعتر انجام شود
</p>
</div>
""")

    # Four-step workflow; the row's rtl/ltr class is swapped by update_interface.
    with gr.Row(elem_classes="workflow rtl") as workflow_row:
        with gr.Column(elem_classes="workflow-column"):
            step1_title = gr.HTML('<h2 style="direction: rtl;">🔍 متن ورودی و انتخاب دستهبندی</h2>')
            input_text = gr.Textbox(
                lines=15,
                placeholder="متن اصلی خود را اینجا وارد کنید...\nمثال: گزارشهای شرکت، نام اشخاص، مبالغ مالی، شماره تلفن، ایمیل، شماره شبا، حساب بانکی و غیره\n\n✨ سیستم سبک با کنترل دستهبندی!",
                label="",
                rtl=True,
            )
            process_btn = gr.Button("🚀 پردازش با دستهبندیهای انتخاب شده", variant="primary")
            clear_btn = gr.Button("🗑️ پاک کردن همه", variant="stop")
            status = gr.Textbox(
                label="وضعیت",
                lines=4,
                interactive=False,
                rtl=True,
                elem_classes=["status-box"],
            )
        with gr.Column(elem_classes="workflow-column"):
            step2_title = gr.HTML('<h2 style="direction: rtl;">🎭 متن ناشناسشده</h2>')
            anonymized_output = gr.Textbox(
                lines=15,
                placeholder="متن ناشناسشده اینجا نمایش داده میشود...",
                label="",
                interactive=False,
                rtl=True,
            )
        with gr.Column(elem_classes="workflow-column"):
            step3_title = gr.HTML('<h2 style="direction: rtl;">🤖 پاسخ خام ChatGPT</h2>')
            gpt_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ خام ChatGPT اینجا نمایش داده میشود...",
                label="",
                interactive=False,
                rtl=True,
            )
        with gr.Column(elem_classes="workflow-column"):
            step4_title = gr.HTML('<h2 style="direction: rtl;">✅ پاسخ نهایی بازگردانده شده</h2>')
            final_output = gr.Textbox(
                lines=15,
                placeholder="پاسخ نهایی اینجا نمایش داده میشود...",
                label="",
                interactive=False,
                rtl=True,
            )

    # Mapping-table panel (output hidden until the button is pressed).
    with gr.Row():
        with gr.Column():
            mapping_title = gr.HTML('<h2>🗂️ جدول نگاشت سبک</h2>')
            mapping_btn = gr.Button("📋 نمایش جدول نگاشت سبک")
            mapping_output = gr.Textbox(
                lines=15,
                label="جدول نگاشت اطلاعات",
                interactive=False,
                visible=False,
                rtl=True,
            )

    # System-status panel (output hidden until the button is pressed).
    with gr.Row():
        with gr.Column():
            status_title = gr.HTML('<h2>⚙️ وضعیت سیستم و قابلیتها</h2>')
            system_status_btn = gr.Button("📊 نمایش وضعیت سیستم سبک")
            system_status_output = gr.Textbox(
                lines=20,
                label="وضعیت سیستم",
                interactive=False,
                visible=False,
                rtl=True,
            )

    # Event handlers
    # The order of `outputs` must match the order of the list returned by
    # update_interface.
    language_selector.change(
        fn=update_interface,
        inputs=[language_selector],
        outputs=[title, step1_title, input_text, process_btn, clear_btn,
                 status, step2_title, anonymized_output, step3_title, gpt_output,
                 step4_title, final_output, mapping_btn, system_status_btn,
                 mapping_output, workflow_row, pattern_categories],
    )
    process_btn.click(
        fn=process_all_steps,
        inputs=[input_text, language_selector, pattern_categories],
        outputs=[status, anonymized_output, gpt_output, final_output],
    )
    clear_btn.click(
        fn=clear_all,
        outputs=[input_text, anonymized_output, gpt_output, final_output, status],
    )
    # Fill the panel first, then reveal it.  One ordered chain per button
    # instead of two independent listeners writing to the same component.
    mapping_btn.click(
        fn=get_mapping_table,
        inputs=[language_selector],
        outputs=[mapping_output],
    ).then(
        fn=_show_component,
        outputs=[mapping_output],
    )
    system_status_btn.click(
        fn=anonymizer.get_model_status,
        outputs=[system_status_output],
    ).then(
        fn=_show_component,
        outputs=[system_status_output],
    )
if __name__ == "__main__":
    # Script entry point: log the startup banner, then start the Gradio server.
    for banner_line in (
        "⚡ Starting Lightweight Anonymization System...",
        "🔥 No torch/transformers dependencies required!",
        "✅ Ready for any environment including HuggingFace Spaces!",
    ):
        logger.info(banner_line)

    # No public share link; listen on every interface at port 7860 and
    # surface handler errors in the UI.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    app.launch(**launch_options)