|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import re |
|
|
from collections import defaultdict |
|
|
import io |
|
|
|
|
|
class TextAnonymizer:
    """Regex-based anonymizer for mixed Persian/English text.

    Detects person names, company names, financial amounts and percentages
    and replaces each with a numbered placeholder (person-01, company-01,
    amount-01, percent-01, ...). The original->placeholder mappings are kept
    on the instance so a human-readable summary can be rendered afterwards.
    """

    def __init__(self):
        # Running counters used to number placeholders per category.
        self.person_counter = 0
        self.company_counter = 0
        self.amount_counter = 0
        self.percent_counter = 0
        # Original surface string -> placeholder, e.g. "John Smith" -> "person-01".
        self.person_mapping = {}
        self.company_mapping = {}
        self.amount_mapping = {}
        self.percent_mapping = {}

    def reset_counters(self):
        """Reset all counters and mappings for a fresh processing run."""
        self.person_counter = 0
        self.company_counter = 0
        self.amount_counter = 0
        self.percent_counter = 0
        self.person_mapping.clear()
        self.company_mapping.clear()
        self.amount_mapping.clear()
        self.percent_mapping.clear()

    def detect_financial_amounts(self, text):
        """Detect financial amounts (English and Persian).

        Returns a list of ``(start, end, matched_text)`` tuples; overlapping
        matches from different patterns are NOT de-duplicated here —
        remove_overlaps() handles that downstream.
        """
        # \u06F0-\u06F9 are Extended-Arabic (Persian) digits; \u060C is the
        # Arabic comma used as a thousands separator.
        # NOTE(review): the digit range is duplicated inside several character
        # classes ([\u06F0-\u06F9\u06F0-\u06F9...]) — harmless but redundant.
        patterns = [
            # English: currency symbols, currency words, magnitude words/suffixes.
            r'\$[\d,]+(?:\.\d{2})?',
            r'[\d,]+\s*(?:dollars?|USD|usd|Dollars?)',
            r'[\d,]+\s*(?:million|billion|thousand|Million|Billion|Thousand)',
            r'[\d,]+(?:\.\d+)?\s*(?:M|B|K|m|b|k)',
            r'€[\d,]+(?:\.\d{2})?',
            r'£[\d,]+(?:\.\d{2})?',
            # Persian: digits + magnitude word(s) + currency unit, with an
            # optional "و ..." ("and ...") continuation clause.
            r'[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]+\s*(?:هزار|میلیون|میلیارد|تریلیون)\s*(?:و\s*[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]+\s*(?:هزار|میلیون|میلیارد)?)?\s*(?:تومان|ریال|دلار|یورو|درهم)',
            r'[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]+\s*(?:همت)',
            r'[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]+\s*(?:هزار|میلیون|میلیارد)\s*(?:تومان|ریال|دلار)',
            # Bare magnitude expressions and approximations
            # ("بیش از" = more than, "حدود" = about, "نزدیک به" = close to).
            r'[\u06F0-\u06F9\u06F0-\u06F9\d]+[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]*\s*(?:هزار|میلیون|میلیارد)',
            r'بیش\s*از\s*[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]+\s*(?:هزار|میلیون|میلیارد|همت)',
            r'حدود\s*[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]+\s*(?:هزار|میلیون|میلیارد|همت)',
            r'نزدیک\s*به\s*[\u06F0-\u06F9\u06F0-\u06F9\d,\u060C]+\s*(?:هزار|میلیون|میلیارد|همت)',
        ]

        amounts = []
        for pattern in patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                amounts.append((match.start(), match.end(), match.group()))

        return amounts

    def detect_percentages(self, text):
        """Detect percentages in English and Persian digits.

        Returns a list of ``(start, end, matched_text)`` tuples.
        """
        patterns = [
            r'\d+(?:\.\d+)?%',
            # Persian digits followed by the word "درصد" (percent) or '%'.
            r'[\u06F0-\u06F9\u06F0-\u06F9]+(?:\.[\u06F0-\u06F9\u06F0-\u06F9]+)?\s*درصد',
            r'[\u06F0-\u06F9\u06F0-\u06F9]+(?:\.[\u06F0-\u06F9\u06F0-\u06F9]+)?%',
        ]
        percentages = []
        for pattern in patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                percentages.append((match.start(), match.end(), match.group()))

        return percentages

    def detect_names_regex(self, text):
        """Detect person names (English and Persian).

        Returns a list of ``(start, end, cleaned_name)`` tuples. The span
        covers the full match (including any title such as "دکتر"/"آقای"),
        while the stored name has titles and role suffixes stripped — so
        replacement removes the title text along with the name.
        """
        patterns = [
            # English: capitalized word sequences and common titles.
            r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',
            r'\b[A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+\b',
            r'\b[A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+\b',
            r'\bMr\. [A-Z][a-z]+\b',
            r'\bMs\. [A-Z][a-z]+\b',
            r'\bDr\. [A-Z][a-z]+\b',
            # NOTE(review): this pattern matches ANY run of 2-4 consecutive
            # Persian words, so many non-name phrases will be flagged as
            # person names — confirm this aggressive behavior is intended.
            r'[\u0600-\u06FF]+\s+[\u0600-\u06FF]+(?:\s+[\u0600-\u06FF]+)?(?:\s+[\u0600-\u06FF]+)?',
            # Persian title (Dr./Mr./Ms./Eng./Prof.) followed by 1-3 name words.
            r'(?:دکتر|آقای|خانم|مهندس|استاد)\s+[\u0600-\u06FF]+(?:\s+[\u0600-\u06FF]+){1,3}',
            # Name followed by a role ("، مدیرعامل" = ", CEO"; مدیر/رئیس/نایب
            # = manager/chair/deputy).
            r'[\u0600-\u06FF]+(?:\s+[\u0600-\u06FF]+){1,3}،\s*مدیرعامل',
            r'[\u0600-\u06FF]+(?:\s+[\u0600-\u06FF]+){1,3}،\s*(?:مدیر|رئیس|نایب)',
            # Contextual lead-ins: "با حضور/سکانداری ..." (in the presence
            # of / led by ...) and "امضای (مشترک) ..." ((joint) signature of ...).
            r'(?:با\s+(?:حضور|سکانداری)\s+)[\u0600-\u06FF]+(?:\s+[\u0600-\u06FF]+){1,3}',
            r'(?:امضای\s+(?:مشترک\s+)?(?:«)?)[\u0600-\u06FF]+(?:\s+[\u0600-\u06FF]+){1,3}(?:»)?',
        ]

        names = []
        # Spans already claimed by an earlier (higher-priority) pattern.
        processed_spans = set()

        for pattern in patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                start, end = match.start(), match.end()
                name = match.group().strip()

                # Strip titles, role suffixes and contextual lead-ins so the
                # mapping key is just the bare name.
                name = re.sub(r'^(?:دکتر|آقای|خانم|مهندس|استاد)\s+', '', name)
                name = re.sub(r'،\s*(?:مدیرعامل|مدیر|رئیس|نایب).*', '', name)
                name = re.sub(r'^(?:با\s+(?:حضور|سکانداری)\s+)', '', name)
                name = re.sub(r'^(?:امضای\s+(?:مشترک\s+)?(?:«)?)', '', name)
                name = name.strip('،» ()')

                # Keep only multi-word candidates that do not overlap a span
                # accepted earlier.
                if (len(name.split()) >= 2 and
                    len(name) > 3 and
                    not any(start < existing_end and end > existing_start
                            for existing_start, existing_end in processed_spans)):

                    names.append((start, end, name))
                    processed_spans.add((start, end))

        return names

    def detect_companies_regex(self, text):
        """Detect company/organization names (English and Persian).

        Returns a list of ``(start, end, matched_text)`` tuples.
        """
        # English: generic "<Name> <legal suffix>" shapes.
        general_patterns = [
            r'\b[A-Z][a-z]+ (?:Inc|Corp|LLC|Ltd|Company|Co|Corporation|Group|Technologies|Tech|Systems|Solutions|Services|International|Global|Enterprises)\.?\b',
            r'\b[A-Z][A-Z]+ (?:Inc|Corp|LLC|Ltd|Company|Co|Corporation)\.?\b',
            r'\b[A-Z][a-z]+ [A-Z][a-z]+ (?:Inc|Corp|LLC|Ltd|Company|Co|Corporation)\.?\b',
        ]

        # Hard-coded lists of well-known companies grouped by sector.
        tech_companies = r'\b(?:Apple|Microsoft|Google|Amazon|Facebook|Meta|Netflix|Tesla|Oracle|IBM|Intel|Cisco|Adobe|Salesforce|PayPal|Uber|Airbnb|Twitter|LinkedIn|NVIDIA|AMD|Zoom|Slack|Dropbox|Spotify)\b'
        auto_companies = r'\b(?:Toyota|Honda|Ford|BMW|Mercedes|Audi|Volkswagen|Nissan|Hyundai|Kia|Mazda|Subaru|Volvo|Porsche|Ferrari|Lamborghini)\b'
        finance_companies = r'\b(?:JPMorgan|Goldman Sachs|Morgan Stanley|Bank of America|Wells Fargo|Chase|Citibank|American Express|Visa|Mastercard|PayPal)\b'
        retail_companies = r'\b(?:Walmart|Target|Costco|Amazon|eBay|Alibaba|Nike|Adidas|Zara|H&M|IKEA|Starbucks|McDonalds|KFC|Subway)\b'

        persian_company_patterns = [
            # Legal-form prefixes followed by a Persian name: شرکت=company,
            # گروه=group, هلدینگ=holding, بانک=bank, بیمه=insurance,
            # پتروشیمی=petrochemical, صنایع=industries, فولاد=steel,
            # سازمان=organization, موسسه=institute.
            r'شرکت\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'گروه\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'هلدینگ\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'بانک\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'بیمه\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'پتروشیمی\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'صنایع\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'فولاد\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'سازمان\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            r'موسسه\s+[\u0600-\u06FF\s]+(?:[\u0600-\u06FF])',
            # Well-known Iranian companies by name.
            r'ایران\s*خودرو',
            r'همراه\s*اول',
            r'فولاد\s*مبارکه(?:\s+اصفهان)?',
            r'بانک\s+(?:ملت|پاسارگاد|سرمایه|مرکزی|کشاورزی)',
            r'بیمه\s+(?:پارسیان|سامان)',
            r'پتروشیمی\s+(?:پارس|بوعلی\s*سینا|اروند)',
            # Persian phrase followed by a parenthesized Persian phrase.
            r'[\u0600-\u06FF\s]+\s*\([\u0600-\u06FF\s]+\)',
            # NOTE(review): this pattern matches almost any run of 2-5 Persian
            # words, so it will over-match heavily — confirm intended.
            r'[\u0600-\u06FF]+(?:\s+[\u0600-\u06FF]+){1,4}(?:\s+(?:شرکت|گروه|بانک|بیمه|صنایع))?',
        ]

        # Pattern order defines priority: specific English shapes first, then
        # known-company lists, then the (broad) Persian patterns.
        all_patterns = general_patterns + [
            tech_companies,
            auto_companies,
            finance_companies,
            retail_companies
        ] + persian_company_patterns

        companies = []
        # Spans already claimed by an earlier pattern.
        processed_spans = set()

        for pattern in all_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                start, end = match.start(), match.end()
                company = match.group().strip()

                # Sanity length bounds plus no overlap with accepted spans.
                if (len(company) > 2 and len(company) < 100 and
                    not any(start < existing_end and end > existing_start
                            for existing_start, existing_end in processed_spans)):

                    companies.append((start, end, company))
                    processed_spans.add((start, end))

        return companies

    def anonymize_text(self, text):
        """Anonymize a single text using the regex detectors.

        Detection order (names -> companies -> amounts -> percentages)
        doubles as priority when spans overlap, because remove_overlaps()
        sorts stably by start offset and keeps the first non-overlapping span.
        """
        # Empty cells / NaN coming from pandas pass through untouched.
        if not text or pd.isna(text):
            return text

        replacements = []  # (start, end, placeholder) triples

        # Person names: assign a new placeholder the first time a cleaned
        # name is seen, reuse it afterwards.
        names = self.detect_names_regex(text)
        for start, end, name in names:
            if name not in self.person_mapping:
                self.person_counter += 1
                self.person_mapping[name] = f"person-{self.person_counter:02d}"
            replacements.append((start, end, self.person_mapping[name]))

        # Company names.
        companies = self.detect_companies_regex(text)
        for start, end, company in companies:
            if company not in self.company_mapping:
                self.company_counter += 1
                self.company_mapping[company] = f"company-{self.company_counter:02d}"
            replacements.append((start, end, self.company_mapping[company]))

        # Financial amounts.
        amounts = self.detect_financial_amounts(text)
        for start, end, amount in amounts:
            if amount not in self.amount_mapping:
                self.amount_counter += 1
                self.amount_mapping[amount] = f"amount-{self.amount_counter:02d}"
            replacements.append((start, end, self.amount_mapping[amount]))

        # Percentages.
        percentages = self.detect_percentages(text)
        for start, end, percent in percentages:
            if percent not in self.percent_mapping:
                self.percent_counter += 1
                self.percent_mapping[percent] = f"percent-{self.percent_counter:02d}"
            replacements.append((start, end, self.percent_mapping[percent]))

        # Drop overlapping spans, then apply replacements right-to-left so
        # earlier offsets remain valid while the string is being edited.
        replacements = self.remove_overlaps(replacements)
        replacements.sort(key=lambda x: x[0], reverse=True)

        result = text
        for start, end, replacement in replacements:
            result = result[:start] + replacement + result[end:]

        return result

    def remove_overlaps(self, replacements):
        """Remove overlapping replacement spans.

        Spans are sorted by start offset; a span is kept only if it begins at
        or after the end of the last kept span. Ties on start offset resolve
        by insertion order (stable sort), i.e. detection priority.
        """
        if not replacements:
            return []

        replacements.sort(key=lambda x: x[0])

        filtered = []
        for start, end, replacement in replacements:
            # Keep only if it does not overlap the previously kept span.
            if not filtered or start >= filtered[-1][1]:
                filtered.append((start, end, replacement))

        return filtered

    def get_mapping_summary(self):
        """Render a Markdown summary of all substitutions.

        Headings and the empty-result message are Persian UI strings.
        """
        summary = []

        if self.person_mapping:
            summary.append("**اسامی اشخاص:**")
            for original, anonymized in self.person_mapping.items():
                summary.append(f"- {original} → {anonymized}")
            summary.append("")

        if self.company_mapping:
            summary.append("**نام شرکتها:**")
            for original, anonymized in self.company_mapping.items():
                summary.append(f"- {original} → {anonymized}")
            summary.append("")

        if self.amount_mapping:
            summary.append("**مبالغ مالی:**")
            for original, anonymized in self.amount_mapping.items():
                summary.append(f"- {original} → {anonymized}")
            summary.append("")

        if self.percent_mapping:
            summary.append("**درصدها:**")
            for original, anonymized in self.percent_mapping.items():
                summary.append(f"- {original} → {anonymized}")

        return "\n".join(summary) if summary else "هیچ موجودیت حساسی یافت نشد."
|
|
|
|
|
|
|
|
# Module-level singleton shared by both handlers; state is wiped per request
# via reset_counters(), so placeholder numbering restarts on every call.
anonymizer = TextAnonymizer()
|
|
|
|
|
def process_csv(file):
    """Anonymize the 'original_text' column of an uploaded CSV file.

    Args:
        file: The uploaded file. ``gr.File(type="filepath")`` passes a plain
            string path; older Gradio versions pass a tempfile-like object
            with a ``.name`` attribute — both are accepted.

    Returns:
        Tuple of (output_file_path, status_message, mapping_summary,
        sample_df). On failure the first and last elements are None and the
        status message describes the error (Persian UI strings).
    """
    try:
        # Fresh counters/mappings so placeholder numbering is consistent
        # within (but not across) uploads.
        anonymizer.reset_counters()

        if file is None:
            return None, "لطفاً فایل CSV آپلود کنید.", "", None

        # BUG FIX: with type="filepath" Gradio hands this function a plain
        # str, which has no .name attribute; the old `file.name` raised
        # AttributeError and every upload fell into the generic error path.
        file_path = file if isinstance(file, str) else file.name

        if not file_path.endswith('.csv'):
            return None, "لطفاً فایل CSV آپلود کنید.", "", None

        df = pd.read_csv(file_path)

        # The anonymizer reads exactly one input column.
        if 'original_text' not in df.columns:
            available_columns = ', '.join(df.columns.tolist())
            return None, f"ستون 'original_text' در فایل یافت نشد. ستونهای موجود: {available_columns}", "", None

        result_df = df.copy()
        result_df['anonymized_text'] = df['original_text'].apply(anonymizer.anonymize_text)

        # Write the result directly to disk (the previous StringIO round-trip
        # was redundant).
        output_file = "anonymized_data.csv"
        result_df.to_csv(output_file, index=False, encoding='utf-8')

        # First 10 rows as a preview for the UI.
        sample_df = result_df[['original_text', 'anonymized_text']].head(10)

        mapping_summary = anonymizer.get_mapping_summary()

        return output_file, f"✅ پردازش کامل شد! {len(df)} ردیف پردازش شد.", mapping_summary, sample_df

    except Exception as e:
        # Boundary handler for the UI: surface the error as a status string
        # rather than crashing the Gradio worker.
        return None, f"❌ خطا در پردازش فایل: {str(e)}", "", None
|
|
|
|
|
def process_single_text(text):
    """Anonymize a single text snippet for the interactive tab.

    Args:
        text: Raw input from the textbox; may be None or whitespace-only.

    Returns:
        Tuple of (anonymized_text, mapping_summary_markdown).
    """
    # ROBUSTNESS FIX: guard against None as well as blank input; the original
    # called text.strip() unconditionally and raised AttributeError on None.
    if not text or not text.strip():
        return "", "لطفاً متنی وارد کنید."

    # Fresh state so placeholder numbering starts at 01 for each request.
    anonymizer.reset_counters()
    anonymized = anonymizer.anonymize_text(text)
    mapping_summary = anonymizer.get_mapping_summary()

    return anonymized, mapping_summary
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: one tab for bulk CSV anonymization, one for single-text testing.
# All user-facing labels and messages are Persian runtime strings and are
# left exactly as written.
# ---------------------------------------------------------------------------
with gr.Blocks(title="ناشناسسازی متن", theme=gr.themes.Soft()) as demo:
    # Header: feature overview and v1 limitations (addresses/places/dates
    # are not anonymized).
    gr.Markdown("""
    # 🔒 برنامه ناشناسسازی متن (نسخه Regex)

    ⚡ **وضعیت:** حالت سریع - بدون نیاز به spaCy

    این برنامه متنهای شما را ناشناس میکند و اطلاعات حساس زیر را جایگزین میکند:
    - 👤 **اسامی اشخاص** → person-01, person-02, ...
    - 🏢 **نام شرکتها** → company-01, company-02, ...
    - 💰 **مبالغ مالی** → amount-01, amount-02, ...
    - 📊 **درصدها** → percent-01, percent-02, ...

    **نسخه ۱:** آدرسها، مکانها و تاریخها ناشناسسازی نمیشوند.
    """)

    with gr.Tabs():
        # --- Tab 1: CSV upload -> batch anonymization -> download ----------
        with gr.TabItem("📁 پردازش فایل CSV"):
            gr.Markdown("### آپلود فایل CSV با ستون 'original_text'")

            with gr.Row():
                with gr.Column():
                    # Upload restricted to .csv; delivered as a filepath
                    # string to process_csv.
                    file_input = gr.File(
                        label="فایل CSV را انتخاب کنید",
                        file_types=[".csv"],
                        type="filepath"
                    )
                    process_btn = gr.Button("🚀 شروع پردازش", variant="primary")

                with gr.Column():
                    status_output = gr.Textbox(
                        label="وضعیت",
                        interactive=False
                    )
                    # Download slot for the generated anonymized_data.csv.
                    download_file = gr.File(
                        label="دانلود فایل ناشناسسازی شده",
                        interactive=False
                    )

            with gr.Row():
                with gr.Column():
                    # Markdown summary of original -> placeholder mappings.
                    mapping_output = gr.Markdown(
                        label="خلاصه تبدیلها",
                        value="خلاصه تبدیلها اینجا نمایش داده میشود..."
                    )

                with gr.Column():
                    # Preview of the first 10 processed rows.
                    sample_output = gr.Dataframe(
                        label="نمونه نتایج (۱۰ ردیف اول)",
                        interactive=False
                    )

        # --- Tab 2: interactive single-text anonymization -------------------
        with gr.TabItem("📝 تست تک متن"):
            gr.Markdown("### تست ناشناسسازی روی یک متن")

            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="متن اصلی",
                        placeholder="متن خود را اینجا وارد کنید...",
                        lines=5
                    )
                    test_btn = gr.Button("🔍 ناشناسسازی", variant="primary")

                with gr.Column():
                    text_output = gr.Textbox(
                        label="متن ناشناسسازی شده",
                        lines=5,
                        interactive=False
                    )
                    text_mapping = gr.Markdown(
                        label="تبدیلهای انجام شده"
                    )

    # Wire the buttons to their handler functions.
    process_btn.click(
        fn=process_csv,
        inputs=[file_input],
        outputs=[download_file, status_output, mapping_output, sample_output]
    )

    test_btn.click(
        fn=process_single_text,
        inputs=[text_input],
        outputs=[text_output, text_mapping]
    )

    # English sample sentences for the single-text tab.
    gr.Examples(
        examples=[
            ["John Smith works at Microsoft and earned $50,000 with a 15% bonus."],
            ["Sarah Johnson from Google Inc. reported revenues of $2.5 million, representing a 25% increase."],
            ["The CEO of Apple, Tim Cook, announced profits of $1.2B with 18.5% growth rate."],
            ["Dr. Michael Brown from IBM Corp. received €75,000 salary increase of 12%."],
            ["Ms. Lisa Wilson at Amazon reported quarterly results of £500K with 8.7% margin."]
        ],
        inputs=[text_input],
        label="نمونه متنها"
    )
|
|
|
|
|
|
|
|
# Launch the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()