Spaces:

leilaghomashchi
/

Data-anonymization

Sleeping

App Files Files Community

Data-anonymization / app_anonymizer_v2 (1).py

leilaghomashchi

Upload app_anonymizer_v2 (1).py

a5fd5e3 verified 14 days ago

raw

history blame

25.5 kB

	"""
	سیستم ناشناس‌سازی متون فارسی با پرامپت بهبود یافته
	بر اساس تحلیل 340 نمونه بنچمارک - نسخه 2.0
	"""

	import requests
	import json
	import gradio as gr
	from typing import Dict, Any, List, Generator
	import os
	from dataclasses import dataclass
	import re
	import pandas as pd
	import time
	from datetime import datetime
	import threading

	# ============================================
	# پرامپت بهبود یافته
	# ============================================

	# System prompt (Persian) used as the "system" message of each chat request.
	# It tells the model to replace proper names and numeric values with
	# company-XX / person-XX / amount-XX / percent-XX identifiers and to return
	# only the anonymized text. This is runtime text: edit it only to change
	# model behavior.
	IMPROVED_SYSTEM_PROMPT = """شما یک «ناشناس‌ساز متون مالی/خبری فارسی» هستید. وظیفه‌تان جایگزینی اسامی خاص و مقادیر عددی با شناسه‌های بی‌معناست.

	## قوانین اندیس‌گذاری - CRITICAL
	### 1. ترتیب شماره‌گذاری الزامی:
	- شرکت‌ها: company-01, company-02, company-03, ... (پیوسته و بدون گپ)
	- اشخاص: person-01, person-02, person-03, ... (پیوسته و بدون گپ)
	- اعداد/مبالغ: amount-01, amount-02, amount-03, ... (پیوسته و بدون گپ)
	- درصدها: percent-01, percent-02, percent-03, ... (پیوسته و بدون گپ)

	### 2. ثبات شناسه‌ها در متن:
	- اگر "همراه اول" اول‌بار company-01 شد، در تمام متن همان باشد

	## ⚠️ قوانین حیاتی برای واحدها و مبالغ:

	### قانون 1: مبالغ کامل را یکجا جایگزین کن (بدون واحد)
	- "23 هزار و 296 میلیارد تومان" → `amount-01` ✅
	- "23 هزار و 296 میلیارد تومان" → `amount-01 تومان` ❌
	- "500 میلیون دلار" → `amount-01` ✅
	- "681,667 میلیارد ریال" → `amount-01` ✅

	### قانون 2: پسوندهای صفتی (-ی) را حفظ کن
	- "155 هزار میلیارد ریالی" → `amount-01 ریالی` ✅
	- "2700 میلیارد تومانی" → `amount-01 تومانی` ✅

	### قانون 3: کلمه "درصد" را حذف کن
	- "4.58 درصد" → `percent-01` ✅
	- "4.58 درصد" → `percent-01 درصد` ❌
	- "37 درصدی" → `percent-01` ✅

	## ⚠️ موارد حفظ شده (CRITICAL):

	### 1. سامانه کدال - حفظ شود!
	- "سامانه کدال" → `سامانه کدال` ✅ (تغییر نکند!)
	- "سامانه کدال" → `company-XX` ❌ (اشتباه!)

	### 2. تاریخ‌ها و سال‌ها
	- "سال 1402" → `سال 1402` ✅
	- "1404/04/29" → `1404/04/29` ✅
	- "پاییز ۱۴۰۱" → `پاییز ۱۴۰۱` ✅

	### 3. دوره‌های زمانی
	- "۵ ماهه سال"، "سه‌ماهه نخست"، "۹ ماهه" → حفظ شوند ✅

	### 4. کلمات عمومی بدون نام خاص
	- "سه شرکت دارویی"، "چند بانک"، "12 بانک کشور" → حفظ شوند ✅

	## تشخیص صحیح انواع موجودیت‌ها:

	### شرکت/سازمان (company-XX):
	- نام‌های خاص شرکت: ایران خودرو، بانک ملی، همراه اول
	- سازمان‌های دولتی: سازمان تامین اجتماعی، وزارت نفت
	- گروه‌ها: "گروه همراه اول" → company-XX ✅
	- بازرس/حسابرس: "شرکت وانیا نیک تدبیر" → company-XX ✅

	### شخص (person-XX):
	- نام و نام‌خانوادگی: مهدی اخوان بهابادی، فرج‌اله قدمی

	### مبلغ/عدد (amount-XX):
	- مبالغ مالی، تعداد، اعداد (⚠️ سال‌ها amount نیستند!)

	### درصد (percent-XX):
	- "4.58 درصد"، "37 درصدی" → percent-XX (بدون کلمه درصد)

	## مثال‌های صحیح:

	مثال 1:
	ورودی: ایران خودرو در اسفندماه سال 1402 حدود 23 هزار و 296 میلیارد تومان درآمد کسب کرد که در مقایسه با بهمن 4.58 درصد افزایش داشت.
	خروجی: company-01 در اسفندماه سال 1402 حدود amount-01 درآمد کسب کرد که در مقایسه با بهمن percent-01 افزایش داشت.

	مثال 2:
	ورودی: بانک پاسارگاد با شناسایی سود خالص 155 هزار میلیارد ریالی در رده دوم قرار گرفت.
	خروجی: company-01 با شناسایی سود خالص amount-01 ریالی در رده دوم قرار گرفت.

	مثال 3:
	ورودی: شرکت تیپیکو گزارش خود را در سامانه کدال منتشر کرد.
	خروجی: company-01 گزارش خود را در سامانه کدال منتشر کرد.

	مثال 4:
	ورودی: رشد 14 درصدی سرمایه‌گذاری‌ها به 5000 میلیارد تومان رسید.
	خروجی: رشد percent-01 سرمایه‌گذاری‌ها به amount-01 رسید.

	مثال 5:
	ورودی: زیان خالص 2700 میلیارد تومانی در سه‌ماهه نخست 1404 گزارش کرد.
	خروجی: زیان خالص amount-01 تومانی در سه‌ماهه نخست 1404 گزارش کرد.

	مثال 6:
	ورودی: سازمان تامین اجتماعی دارای سه شرکت دارویی است.
	خروجی: company-01 دارای سه شرکت دارویی است.

	## خلاصه قوانین:
	1. مبالغ کامل → amount-XX (بدون واحد: تومان، ریال، دلار، همت)
	2. پسوند صفتی (-ی) → حفظ شود (ریالی، تومانی)
	3. درصد/درصدی → percent-XX (بدون کلمه درصد)
	4. سامانه کدال → حفظ شود (company نشود)
	5. سال‌ها/تاریخ‌ها → حفظ شوند
	6. کلمات عمومی → حفظ شوند
	7. گروه‌ها → company-XX

	فقط متن ناشناس‌شده را برگردان - هیچ توضیح اضافی نیاز نیست.
	"""

	# ============================================
	# تنظیمات
	# ============================================

	@dataclass
	class CerebrasConfig:
	    """Connection and generation settings for the Cerebras API."""

	    # API key; the only field without a default, so it must be supplied.
	    api_key: str
	    # Base URL that request paths are appended to.
	    base_url: str = "https://api.cerebras.ai/v1"
	    # Identifier of the chat model to query.
	    model: str = "llama-3.3-70b"
	    # Value sent as "max_tokens" in the request payload.
	    max_tokens: int = 2000
	    # Value sent as "temperature" in the request payload.
	    temperature: float = 0.1

	@dataclass
	class RateLimitConfig:
	    """Request rate-limit and retry/backoff settings."""

	    # Maximum number of requests allowed within a 60-second window.
	    requests_per_minute: int = 30
	    # Minimum gap, in seconds, between two consecutive requests.
	    min_delay_between_requests: float = 2.5
	    # Number of attempts made for a single request before giving up.
	    max_retries: int = 5
	    # Base delay, in seconds, used for backoff computations.
	    initial_backoff: float = 5.0
	    # Upper bound, in seconds, for any computed backoff delay.
	    max_backoff: float = 120.0
	    # Factor the backoff grows by per failure/attempt.
	    backoff_multiplier: float = 2.0

	# ============================================
	# Rate Limiter
	# ============================================

	class RateLimiter:
	    """Client-side throttle for outgoing API requests.

	    Before each request the longest of three waits is applied: a sliding
	    60-second window capped at ``requests_per_minute``, a minimum gap between
	    consecutive requests, and an exponential backoff driven by the number of
	    consecutive failures reported by the caller.
	    """

	    def __init__(self, config: "RateLimitConfig"):
	        """Store the limits and start with an empty request history.

	        Args:
	            config: Object providing ``requests_per_minute``,
	                ``min_delay_between_requests``, ``initial_backoff``,
	                ``backoff_multiplier`` and ``max_backoff``.
	        """
	        self.config = config
	        # Monotonic timestamps (seconds) of the requests issued in the last minute.
	        self.request_times: List[float] = []
	        self.lock = threading.Lock()
	        # May become fractional: non-rate-limit failures add 0.5.
	        self.consecutive_failures = 0

	    def wait_if_needed(self) -> float:
	        """Sleep as long as the limits require, then record a new request.

	        Returns:
	            The number of seconds slept (0.0 when no wait was needed).

	        Note:
	            The sleep happens while ``self.lock`` is held, so other threads
	            calling any method of this object block until it finishes.
	        """
	        with self.lock:
	            # A monotonic clock cannot jump when the wall clock is adjusted,
	            # so the interval arithmetic below always stays valid.
	            now = time.monotonic()
	            self.request_times = [t for t in self.request_times if now - t < 60]

	            wait_time = 0.0

	            # Window full: wait until the oldest request leaves it (+1 s margin).
	            if len(self.request_times) >= self.config.requests_per_minute:
	                oldest_request = min(self.request_times)
	                wait_time = max(wait_time, 60 - (now - oldest_request) + 1)

	            # Enforce the minimum spacing after the most recent request.
	            if self.request_times:
	                time_since_last = now - max(self.request_times)
	                remaining_gap = self.config.min_delay_between_requests - time_since_last
	                if remaining_gap > 0:
	                    wait_time = max(wait_time, remaining_gap)

	            # Exponential backoff while failures are outstanding, capped at max_backoff.
	            if self.consecutive_failures > 0:
	                failure_wait = min(
	                    self.config.initial_backoff
	                    * (self.config.backoff_multiplier ** self.consecutive_failures),
	                    self.config.max_backoff,
	                )
	                wait_time = max(wait_time, failure_wait)

	            if wait_time > 0:
	                time.sleep(wait_time)

	            self.request_times.append(time.monotonic())
	            return wait_time

	    def report_success(self):
	        """Reset the failure counter after a successful request."""
	        with self.lock:
	            self.consecutive_failures = 0

	    def report_failure(self, is_rate_limit: bool = False):
	        """Record a failed request so that later waits back off.

	        Args:
	            is_rate_limit: True for a rate-limit rejection, which adds a full
	                step. Other failures add half a step and stop contributing
	                once the counter has reached 3.
	        """
	        with self.lock:
	            if is_rate_limit:
	                self.consecutive_failures += 1
	            else:
	                softened = min(self.consecutive_failures + 0.5, 3)
	                # A failure must never lower the level: keep a counter that
	                # rate-limit failures already pushed above the soft cap of 3.
	                self.consecutive_failures = max(self.consecutive_failures, softened)

	# ============================================
	# Anonymizer با پرامپت بهبود یافته
	# ============================================

	class ImprovedCerebrasAnonymizer:
	    """Anonymize text by sending it to the Cerebras chat API with the improved prompt."""

	    def __init__(self, api_key: "str | None" = None, rate_limit_config: "RateLimitConfig | None" = None):
	        """Set up API settings, the rate limiter and the system prompt.

	        Args:
	            api_key: Cerebras API key. When None, the ``CEREBRAS_API_KEY``
	                environment variable is used.
	            rate_limit_config: Throttling/retry settings; defaults to
	                ``RateLimitConfig()``.

	        Raises:
	            ValueError: If no API key is given and the environment variable
	                is unset or empty.
	        """
	        if api_key is None:
	            api_key = os.getenv("CEREBRAS_API_KEY")
	            if not api_key:
	                raise ValueError("کلید API یافت نشد")

	        self.config = CerebrasConfig(api_key=api_key)
	        self.rate_limit_config = rate_limit_config or RateLimitConfig()
	        self.rate_limiter = RateLimiter(self.rate_limit_config)
	        self.system_prompt = IMPROVED_SYSTEM_PROMPT

	    def _retry_delay(self, retry_after, attempt: int) -> float:
	        """Return the number of seconds to wait after a 429 response.

	        Args:
	            retry_after: Raw ``Retry-After`` header value, or None.
	            attempt: Zero-based index of the attempt that was rejected.

	        Returns:
	            The header value when it is a non-negative finite number of
	            seconds; otherwise an exponential backoff capped at
	            ``max_backoff``.
	        """
	        if retry_after:
	            try:
	                seconds = float(retry_after)
	            except (TypeError, ValueError):
	                # Not a number (the header may also be an HTTP-date):
	                # fall through to the computed backoff.
	                pass
	            else:
	                # Rejects negative values, NaN and infinity.
	                if 0 <= seconds < float("inf"):
	                    return seconds
	        limits = self.rate_limit_config
	        return min(
	            limits.initial_backoff * (limits.backoff_multiplier ** attempt),
	            limits.max_backoff,
	        )

	    def _make_api_request_with_retry(self, text: str) -> Dict[str, Any]:
	        """POST the chat-completion request, retrying on transient failures.

	        Args:
	            text: User text sent as the "user" message.

	        Returns:
	            The decoded JSON body of the first successful response.

	        Raises:
	            Exception: When every attempt failed, or immediately when the
	                server answers with a client error (4xx other than 408/429)
	                that a retry cannot fix.
	        """
	        headers = {
	            "Authorization": f"Bearer {self.config.api_key}",
	            "Content-Type": "application/json"
	        }

	        payload = {
	            "messages": [
	                {"role": "system", "content": self.system_prompt},
	                {"role": "user", "content": text}
	            ],
	            "model": self.config.model,
	            "temperature": self.config.temperature,
	            "max_tokens": self.config.max_tokens
	        }

	        max_retries = self.rate_limit_config.max_retries
	        last_error = None

	        for attempt in range(max_retries):
	            # No point sleeping after the final attempt: we raise right away.
	            is_last_attempt = attempt == max_retries - 1
	            self.rate_limiter.wait_if_needed()

	            try:
	                response = requests.post(
	                    f"{self.config.base_url}/chat/completions",
	                    headers=headers,
	                    json=payload,
	                    timeout=60
	                )

	                if response.status_code == 429:
	                    self.rate_limiter.report_failure(is_rate_limit=True)
	                    last_error = f"Rate limit (429). تلاش {attempt + 1}/{max_retries}"
	                    if not is_last_attempt:
	                        time.sleep(self._retry_delay(response.headers.get('Retry-After'), attempt))
	                    continue

	                response.raise_for_status()
	                self.rate_limiter.report_success()
	                return response.json()

	            except requests.exceptions.Timeout:
	                self.rate_limiter.report_failure(is_rate_limit=False)
	                last_error = f"Timeout. تلاش {attempt + 1}/{max_retries}"
	                if not is_last_attempt:
	                    time.sleep(self.rate_limit_config.initial_backoff)

	            except requests.exceptions.RequestException as e:
	                status = getattr(getattr(e, "response", None), "status_code", None)
	                if status is not None and 400 <= status < 500 and status not in (408, 429):
	                    # Bad request / bad key / unknown model: the same request
	                    # would be rejected again, so fail without retrying.
	                    raise Exception(f"خطا: {str(e)}") from e
	                self.rate_limiter.report_failure(is_rate_limit=False)
	                last_error = f"خطا: {str(e)}"
	                if not is_last_attempt:
	                    time.sleep(self.rate_limit_config.initial_backoff)

	        raise Exception(f"ناموفق پس از {max_retries} تلاش: {last_error}")

	    def anonymize_text(self, text: str) -> Dict[str, Any]:
	        """Anonymize ``text`` and summarize the placeholders in the result.

	        Args:
	            text: Input text; empty or whitespace-only text is rejected.

	        Returns:
	            On success a dict with ``success=True``, ``anonymized_text``,
	            ``entities``, ``statistics`` and ``usage``. On failure a dict
	            with ``success=False``, ``error`` and an empty
	            ``anonymized_text``. This method does not raise.
	        """
	        if not text or not text.strip():
	            return {"success": False, "error": "متن خالی", "anonymized_text": ""}

	        try:
	            response = self._make_api_request_with_retry(text)

	            if "choices" not in response or not response["choices"]:
	                return {"success": False, "error": "پاسخ نامعتبر", "anonymized_text": ""}

	            content = response["choices"][0]["message"]["content"]
	            if not isinstance(content, str):
	                # e.g. a null content field: report it as an invalid response.
	                return {"success": False, "error": "پاسخ نامعتبر", "anonymized_text": ""}
	            content = self._clean_markdown(content).strip()

	            analysis = self._analyze_anonymized_text(content)

	            return {
	                "success": True,
	                "anonymized_text": content,
	                "entities": analysis["entities"],
	                "statistics": analysis["statistics"],
	                "usage": response.get("usage", {})
	            }

	        except Exception as e:
	            # Boundary for callers (UI / batch loop): report instead of raising.
	            return {"success": False, "error": str(e), "anonymized_text": ""}

	    def _clean_markdown(self, content: str) -> str:
	        """Remove Markdown code-fence lines while keeping the text inside them.

	        Only lines that start with three backticks (after stripping
	        whitespace) are dropped; all other lines are returned unchanged.
	        """
	        if "```" not in content:
	            return content
	        kept_lines = [
	            line for line in content.split('\n')
	            if not line.strip().startswith('```')
	        ]
	        return '\n'.join(kept_lines)

	    def _analyze_anonymized_text(self, text: str) -> Dict[str, Any]:
	        """Count the placeholder identifiers present in anonymized text.

	        Returns:
	            ``{"statistics": ..., "entities": ...}``. In ``statistics`` the
	            per-type values count distinct numbers, while ``total`` counts
	            every occurrence. ``entities`` lists the distinct number strings
	            per type, sorted numerically.
	        """
	        # (statistics key / placeholder prefix, entities key)
	        kinds = (
	            ("company", "companies"),
	            ("person", "persons"),
	            ("amount", "amounts"),
	            ("percent", "percents"),
	        )
	        matches = {
	            prefix: re.findall(rf'{prefix}-(\d+)', text) for prefix, _ in kinds
	        }

	        statistics = {prefix: len(set(found)) for prefix, found in matches.items()}
	        statistics["total"] = sum(len(found) for found in matches.values())

	        entities = {
	            plural: sorted(set(matches[prefix]), key=int) for prefix, plural in kinds
	        }

	        return {"statistics": statistics, "entities": entities}

	# ============================================
	# Batch Processor
	# ============================================

	class BatchProcessor:
	    """Anonymize one text column of a CSV file row by row, reporting progress."""

	    # Encodings tried, in order, when loading the input CSV.
	    _CSV_ENCODINGS = ('utf-8', 'utf-8-sig', 'cp1256')

	    def __init__(self, api_key: str, rate_limit_config: "RateLimitConfig | None" = None):
	        """Store credentials and limits; the anonymizer is created per run.

	        Args:
	            api_key: Key passed to the anonymizer.
	            rate_limit_config: Throttling settings; defaults to
	                ``RateLimitConfig()``.
	        """
	        self.api_key = api_key
	        self.rate_limit_config = rate_limit_config or RateLimitConfig()
	        self.anonymizer = None
	        self.is_cancelled = False
	        self.processed_rows = 0
	        self.failed_rows = 0
	        self.start_time = None

	    def cancel(self):
	        """Ask a running ``process_csv`` to stop before its next row."""
	        self.is_cancelled = True

	    def reset(self):
	        """Clear the cancel flag, the counters and the start time."""
	        self.is_cancelled = False
	        self.processed_rows = 0
	        self.failed_rows = 0
	        self.start_time = None

	    @classmethod
	    def _read_csv(cls, file_path: str) -> "pd.DataFrame":
	        """Load the CSV, falling back to the next encoding on decode errors only.

	        Errors other than ``UnicodeDecodeError`` (missing file, malformed CSV,
	        ...) propagate immediately instead of being masked by a retry.
	        """
	        *fallbacks, last_encoding = cls._CSV_ENCODINGS
	        for encoding in fallbacks:
	            try:
	                return pd.read_csv(file_path, encoding=encoding)
	            except UnicodeDecodeError:
	                continue
	        return pd.read_csv(file_path, encoding=last_encoding)

	    @staticmethod
	    def _output_path_for(file_path: str) -> str:
	        """Return the path of the result file written next to ``file_path``.

	        Only a trailing ``.csv`` extension (any letter case) is replaced, so
	        the result can never equal the input path and ".csv" occurring
	        elsewhere in the path is left alone.
	        """
	        root, extension = os.path.splitext(file_path)
	        if extension.lower() != '.csv':
	            root = file_path
	        return f"{root}_anonymized_v2.csv"

	    def process_csv(
	        self, file_path: str, text_column: str, output_column: str = "anonymized_text"
	    ) -> Generator[Dict[str, Any], None, None]:
	        """Anonymize ``text_column`` of the CSV and yield status events.

	        Args:
	            file_path: Path of the input CSV.
	            text_column: Name of the column holding the text to anonymize.
	            output_column: Name of the column that receives the results.

	        Yields:
	            Dicts tagged by ``"type"``: ``"error"`` (column missing),
	            ``"info"`` (start), ``"progress"`` (after each non-empty row),
	            ``"cancelled"`` (stopped via ``cancel``) or ``"complete"`` (result
	            file written; includes ``output_path`` and the DataFrame).
	            Nothing is written when the run is cancelled.
	        """
	        self.reset()
	        self.start_time = time.time()

	        df = self._read_csv(file_path)

	        if text_column not in df.columns:
	            yield {"type": "error", "message": f"ستون '{text_column}' یافت نشد"}
	            return

	        total_rows = len(df)
	        self.anonymizer = ImprovedCerebrasAnonymizer(
	            api_key=self.api_key,
	            rate_limit_config=self.rate_limit_config
	        )

	        df[output_column] = ""
	        df["status"] = ""

	        yield {"type": "info", "message": f"🚀 شروع پردازش {total_rows} ردیف..."}

	        # ``position`` is the 1-based row number; ``idx`` is only an index
	        # label and is used solely for writing back into the DataFrame.
	        for position, (idx, row) in enumerate(df.iterrows(), start=1):
	            if self.is_cancelled:
	                yield {"type": "cancelled", "processed": self.processed_rows}
	                break

	            text = str(row[text_column]) if pd.notna(row[text_column]) else ""

	            if not text.strip():
	                # Empty cells are counted as processed without an API call.
	                df.at[idx, output_column] = ""
	                df.at[idx, "status"] = "خالی"
	                self.processed_rows += 1
	                continue

	            result = self.anonymizer.anonymize_text(text)

	            if result["success"]:
	                df.at[idx, output_column] = result["anonymized_text"]
	                df.at[idx, "status"] = "✅"
	                self.processed_rows += 1
	            else:
	                df.at[idx, output_column] = f"خطا: {result.get('error', '')}"
	                df.at[idx, "status"] = "❌"
	                self.failed_rows += 1

	            yield {
	                "type": "progress",
	                "current": position,
	                "total": total_rows,
	                "progress": position / total_rows * 100,
	                "processed": self.processed_rows,
	                "failed": self.failed_rows,
	                "elapsed": time.time() - self.start_time
	            }

	        if not self.is_cancelled:
	            output_path = self._output_path_for(file_path)
	            # utf-8-sig writes a BOM at the start of the file.
	            df.to_csv(output_path, index=False, encoding='utf-8-sig')

	            yield {
	                "type": "complete",
	                "output_path": output_path,
	                "total": total_rows,
	                "processed": self.processed_rows,
	                "failed": self.failed_rows,
	                "time": time.time() - self.start_time,
	                "dataframe": df
	            }

	# ============================================
	# رابط کاربری Gradio
	# ============================================

	def create_interface():
	    """Build the Gradio UI and return it without launching.

	    The UI has two tabs: single-text anonymization and batch processing of a
	    CSV file. An API-key textbox is shown only when the ``CEREBRAS_API_KEY``
	    environment variable is not set.

	    Returns:
	        The ``gr.Blocks`` object with all event handlers attached.
	    """

	    api_key_available = bool(os.getenv("CEREBRAS_API_KEY"))
	    # Holder for the most recently started batch run so the cancel button can
	    # reach it. NOTE(review): the holder is created once per interface, so it
	    # looks shared by every session of this app - confirm that is acceptable.
	    batch_processor = {"instance": None}

	    css = """
	.gradio-container { direction: rtl; font-family: Tahoma, Arial; }
	.success-box { background: #d4edda; padding: 15px; border-radius: 10px; color: #155724; }
	.warning-box { background: #fff3cd; padding: 15px; border-radius: 10px; color: #856404; }
	.info-box { background: #d1ecf1; padding: 15px; border-radius: 10px; color: #0c5460; }
	"""

	    with gr.Blocks(css=css, title="ناشناس‌ساز بهبود یافته v2.0", theme=gr.themes.Soft()) as interface:

	        gr.Markdown("""
	# 🔒 سیستم ناشناس‌سازی متون فارسی - نسخه بهبود یافته 2.0
	### ⚡ با پرامپت بهینه‌شده بر اساس تحلیل 340 نمونه بنچمارک
	""")

	        gr.Markdown("""
	<div class="info-box">
	📌 <strong>بهبودهای نسخه 2.0:</strong><br>
	• حذف واحدها از مبالغ (تومان، ریال، دلار → amount-XX)<br>
	• حفظ پسوندهای صفتی (ریالی، تومانی)<br>
	• حذف کلمه "درصد" (37 درصد → percent-01)<br>
	• حفظ "سامانه کدال" (company نمی‌شود)<br>
	• حفظ سال‌ها و تاریخ‌ها
	</div>
	""")

	        with gr.Tabs():
	            # Single-text tab
	            with gr.Tab("📝 پردازش تکی"):
	                if not api_key_available:
	                    api_key = gr.Textbox(label="🔑 کلید API", type="password")
	                else:
	                    # Hidden placeholder keeps the handler signature the same.
	                    api_key = gr.Textbox(visible=False, value="")

	                with gr.Row():
	                    input_text = gr.Textbox(label="📝 متن ورودی", lines=8)
	                    output_text = gr.Textbox(label="🎯 متن ناشناس‌شده", lines=8)

	                process_btn = gr.Button("🔒 ناشناس‌سازی", variant="primary")
	                stats_output = gr.Markdown()

	            # Batch CSV tab
	            with gr.Tab("📁 پردازش دسته‌ای CSV"):
	                if not api_key_available:
	                    batch_api_key = gr.Textbox(label="🔑 کلید API", type="password")
	                else:
	                    batch_api_key = gr.Textbox(visible=False, value="")

	                csv_file = gr.File(label="📂 فایل CSV", file_types=[".csv"])

	                with gr.Row():
	                    text_column = gr.Dropdown(label="📑 ستون متن", choices=[], interactive=True)
	                    delay_slider = gr.Slider(1, 10, value=2.5, label="⏱️ تأخیر (ثانیه)")

	                with gr.Row():
	                    start_btn = gr.Button("🚀 شروع", variant="primary")
	                    cancel_btn = gr.Button("⏹️ لغو", variant="stop")

	                progress_bar = gr.Slider(0, 100, value=0, label="📊 پیشرفت", interactive=False)
	                progress_text = gr.Markdown("در انتظار...")
	                output_file = gr.File(label="📥 دانلود", visible=False)

	        # Handlers
	        # The status strings below are shown in gr.Markdown components; the
	        # backslash before each pipe is written as "\\|" so the emitted text
	        # stays backslash + pipe without an invalid "\|" escape sequence.
	        def process_single(text, key):
	            """Anonymize one text; return (anonymized text, status line)."""
	            if not text or not text.strip():
	                return "", "⚠️ متن خالی"

	            api = key if key else os.getenv("CEREBRAS_API_KEY")
	            if not api:
	                return "", "❌ کلید API وارد نشده"

	            try:
	                anonymizer = ImprovedCerebrasAnonymizer(api_key=api)
	                result = anonymizer.anonymize_text(text)

	                if result["success"]:
	                    stats = result.get("statistics", {})
	                    return result["anonymized_text"], f"✅ موفق \\| شرکت: {stats.get('company',0)} \\| شخص: {stats.get('person',0)} \\| مبلغ: {stats.get('amount',0)} \\| درصد: {stats.get('percent',0)}"
	                return "", f"❌ {result.get('error', 'خطا')}"
	            except Exception as e:
	                return "", f"❌ {str(e)}"

	        def update_columns(file):
	            """Fill the column dropdown from the header of the uploaded CSV."""
	            if file is None:
	                return gr.update(choices=[])
	            encodings = ('utf-8', 'utf-8-sig', 'cp1256')
	            for position, encoding in enumerate(encodings):
	                try:
	                    df = pd.read_csv(file.name, encoding=encoding, nrows=1)
	                except UnicodeDecodeError:
	                    # Only a decoding problem justifies trying another
	                    # encoding; once all are exhausted, surface the error.
	                    if position == len(encodings) - 1:
	                        raise
	                else:
	                    break
	            return gr.update(choices=list(df.columns), value=df.columns[0])

	        def start_batch(file, text_col, delay, key):
	            """Run the batch job, yielding (progress, status text, file update)."""
	            if file is None:
	                yield 0, "❌ فایل انتخاب نشده", gr.update(visible=False)
	                return

	            api = key if key else os.getenv("CEREBRAS_API_KEY")
	            if not api:
	                yield 0, "❌ کلید API وارد نشده", gr.update(visible=False)
	                return

	            config = RateLimitConfig(min_delay_between_requests=float(delay))
	            processor = BatchProcessor(api_key=api, rate_limit_config=config)
	            batch_processor["instance"] = processor

	            # "info" events carry no UI change and are intentionally skipped.
	            for update in processor.process_csv(file.name, text_col):
	                if update["type"] == "error":
	                    yield 0, f"❌ {update['message']}", gr.update(visible=False)
	                elif update["type"] == "progress":
	                    yield update["progress"], f"📊 {update['current']}/{update['total']} \\| ✅ {update['processed']} \\| ❌ {update['failed']}", gr.update(visible=False)
	                elif update["type"] == "complete":
	                    yield 100, f"✅ تکمیل! \\| کل: {update['total']} \\| موفق: {update['processed']} \\| ناموفق: {update['failed']} \\| زمان: {update['time']/60:.1f} دقیقه", gr.update(value=update['output_path'], visible=True)
	                elif update["type"] == "cancelled":
	                    yield 0, f"⏹️ لغو شد \\| پردازش شده: {update['processed']}", gr.update(visible=False)

	        def cancel_batch():
	            """Flag the current batch run, if any, for cancellation."""
	            if batch_processor["instance"]:
	                batch_processor["instance"].cancel()
	            return "⏹️ درخواست لغو..."

	        # Event wiring
	        process_btn.click(process_single, [input_text, api_key], [output_text, stats_output])
	        csv_file.change(update_columns, [csv_file], [text_column])
	        start_btn.click(start_batch, [csv_file, text_column, delay_slider, batch_api_key], [progress_bar, progress_text, output_file])
	        cancel_btn.click(cancel_batch, outputs=[progress_text])

	    return interface

	# ============================================
	# اجرا
	# ============================================

	if __name__ == "__main__":
	    # Build the UI, then serve it on all network interfaces at port 7860.
	    # share=True additionally requests a public Gradio share link.
	    interface = create_interface()
	    interface.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=True,
	    )