Spaces:

leilaghomashchi
/

Data-anonymization

Sleeping

App Files Files Community

leilaghomashchi committed on Nov 30, 2025

Commit

419e1e7

verified ·

1 Parent(s): f099dda

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -834

app.py DELETED Viewed

@@ -1,834 +0,0 @@
import hashlib
import json
import os
import random
import re
import tempfile
import time
from dataclasses import dataclass
from io import StringIO
from pathlib import Path
from typing import Any, Dict, List, Optional

import gradio as gr
import pandas as pd
import requests
@dataclass
class CerebrasConfig:
    """Settings for talking to the Cerebras chat-completions API.

    Only ``api_key`` is required; the remaining fields default to the
    endpoint, model and sampling parameters used by this application.
    """

    # Bearer token sent in the Authorization header.
    api_key: str
    # API root; "/chat/completions" is appended by the caller.
    base_url: str = "https://api.cerebras.ai/v1"
    model: str = "llama-3.3-70b"
    max_tokens: int = 2000
    temperature: float = 0.1

    def __repr__(self) -> str:
        """Return a repr that masks the API key.

        The generated dataclass repr would print the credential verbatim,
        which leaks it into logs and tracebacks. ``@dataclass`` keeps an
        explicitly defined ``__repr__`` instead of generating its own.
        """
        return (
            f"{type(self).__name__}(api_key='***', "
            f"base_url={self.base_url!r}, model={self.model!r}, "
            f"max_tokens={self.max_tokens!r}, "
            f"temperature={self.temperature!r})"
        )
# ============= Rate-limit settings =============
# NOTE: the Cerebras free tier allows roughly 30 requests per minute.

# Minimum spacing, in seconds, between two API requests (8 s is about 7.5 req/min).
DELAY_BETWEEN_REQUESTS = 8.0
# Maximum number of attempts made for a single request.
MAX_RETRIES = 5
# First back-off wait, in seconds, after a rate-limit response.
INITIAL_BACKOFF = 30.0
# Growth factor applied to the back-off on each further attempt.
BACKOFF_MULTIPLIER = 1.5
# Upper bound for a single back-off wait, in seconds (2 minutes).
MAX_BACKOFF = 120.0
# Number of rows between two checkpoint saves during batch processing.
CHECKPOINT_INTERVAL = 5
class AdvancedCerebrasAnonymizer:
    """Anonymizer for Persian financial/news text backed by the Cerebras API.

    The model is instructed (see the system prompt) to replace companies,
    persons, amounts and percentages with ``company-NN`` / ``person-NN`` /
    ``amount-NN`` / ``percent-NN`` placeholders. This class sends the
    request, retries transient failures, and analyses the placeholders found
    in the reply.
    """

    # Placeholder families the model is asked to emit, with the key under
    # which each family is listed in the "entities" part of the analysis.
    _ENTITY_KEYS = (
        ("company", "companies"),
        ("person", "persons"),
        ("amount", "amounts"),
        ("percent", "percents"),
    )

    def __init__(self, api_key: Optional[str] = None):
        """Create an anonymizer.

        Args:
            api_key: Cerebras API key. When ``None`` the key is read from the
                ``CEREBRAS_API_KEY`` environment variable.

        Raises:
            ValueError: if ``api_key`` is ``None`` and the environment
                variable is unset or empty.
        """
        if api_key is None:
            api_key = os.getenv("CEREBRAS_API_KEY")
            if not api_key:
                raise ValueError("کلید API یافت نشد")
        self.config = CerebrasConfig(api_key=api_key)
        self.system_prompt = self._create_advanced_system_prompt()
        # Wall-clock time of the last successful request (0 = none yet).
        self.last_request_time = 0
        # Number of successful requests made by this instance.
        self.request_count = 0

    def _create_advanced_system_prompt(self) -> str:
        """Return the (Persian) system prompt that defines the anonymization rules."""
        return """شما یک «ناشناس‌ساز متون مالی/خبری فارسی» هستید. وظیفه‌تان جایگزینی اسامی خاص و مقادیر عددی با شناسه‌های بی‌معناست.
## **قوانین اندیس‌گذاری - CRITICAL**
### **1. ترتیب شماره‌گذاری الزامی:**
- شرکت‌ها: company-01, company-02, company-03, company-04, ... (پیوسته و بدون گپ)
- اشخاص: person-01, person-02, person-03, ... (پیوسته و بدون گپ)
- اعداد: amount-01, amount-02, amount-03, ... (پیوسته و بدون گپ)
- درصدها: percent-01, percent-02, percent-03, ... (پیوسته و بدون گپ)
### **2. ثبات شناسه‌ها در متن:**
- اگر "همراه اول" اول‌بار company-01 شد، در تمام متن همان باشد
- اگر "مهدی احمدی" اول‌بار person-01 شد، در تمام متن همان باشد
### **3. تشخیص صحیح انواع:**
**شرکت/سازمان:** همراه اول، بانک ملی، ایران‌خودرو، سایپا، بانک مرکزی، سامانه کدال، وزارت نفت، سازمان تنظیم مقررات رادیویی، سازمان تامین اجتماعی
**⚠️ CRITICAL - گروه‌ها:** "گروه همراه اول"، "گروه اقتصادی آزادگان"، "گروه مالی صبا" → همه company-XX هستند (نه group-XX)
**⚠️ CRITICAL - کلمات عمومی:** "سه شرکت دارویی"، "چند بانک"، "یک شرکت" → کلمات عمومی هستند، موجودیت نیستند (حفظ شوند)
**⚠️ CRITICAL - نام‌های مستعار:** "فاما" همان "فولاد مبارکه اصفهان" است → هر دو company-01
**شخص:** مهدی اخوان بهابادی، محمدرضا فرزین، ابوالفضل نجارزاده
**عدد:** 37، 70، 677، 73.7، 178 (هر عددی)
**درصد:** 37 درصدی، 15 درصدی، 53 درصد، 43%
## **مثال‌های صحیح:**
### **مثال 1 (الگوی کامل):**
**ورودی:** مهدی اخوان بهابادی، مدیرعامل همراه اول، اعلام کرد درآمد عملیاتی شرکت با رشد 37 درصدی به 70 هزار و 677 میلیارد تومان رسیده است. سود خالص 7101 میلیارد تومان و تلفیقی گروه همراه اول 8003 میلیارد تومان شد.
**خروجی صحیح:** person-01، مدیرعامل company-01، اعلام کرد درآمد عملیاتی شرکت با رشد percent-01 به amount-01 رسیده است. سود خالص amount-02 و تلفیقی گروه company-01 amount-03 شد.
### **مثال 2:**
**ورودی:** بانک مرکزی و بانک ملی با همکاری محمدرضا فرزین، 60 درصد سپرده‌ها را مدیریت کردند.
**خروجی:** company-01 و company-02 با همکاری person-01، percent-01 سپرده‌ها را مدیریت کردند.
## **موارد حفظ شده:**
- تاریخ‌ها: 1404/04/23، 30 آذر 1403، پاییز 1401
- فصل‌های سال: پاییز، بهار، تابستان، زمستان
- عناوین شغلی: مدیرعامل، رئیس کل، مدیرکل
- واحدها: میلیارد تومان، همت، ریال، ماه، سال
- مکان‌ها: تهران، اصفهان، ایران
- کلمات عمومی: "سه شرکت دارویی"، "چند بانک"، "یک شرکت"، "مراکز درمانی"
- دوره‌های زمانی: "۵ ماهه سال"، "۹ ماهه"، "۳ ماهه اول"
## **ممنوع:**
- کلمات انگلیسی اضافی
- تغییر ساختار جمله
- حذف یا اضافه کردن کلمات
- استفاده از group-XX - همه گروه‌ها باید company-XX باشند
**فقط متن ناشناس‌شده را برگردان - هیچ توضیح اضافی نیاز نیست.**
"""

    def _wait_for_rate_limit(self):
        """Sleep until DELAY_BETWEEN_REQUESTS seconds have passed since the last successful request."""
        elapsed = time.time() - self.last_request_time
        if elapsed < DELAY_BETWEEN_REQUESTS:
            sleep_time = DELAY_BETWEEN_REQUESTS - elapsed
            print(f"   ⏳ انتظار {sleep_time:.1f} ثانیه برای رعایت rate limit...")
            time.sleep(sleep_time)

    def _make_api_request(self, text: str) -> Dict[str, Any]:
        """POST ``text`` to the chat-completions endpoint, retrying transient failures.

        Up to MAX_RETRIES attempts are made. Rate-limit (429) responses use
        exponential back-off with jitter, 503 responses a linear back-off,
        timeouts a short linear wait and any other error a fixed wait.
        Client errors other than 408/429 are not retried, because repeating
        the same request (e.g. with an invalid key) cannot succeed.

        Args:
            text: user text sent as the ``user`` message.

        Returns:
            The decoded JSON body of the successful response.

        Raises:
            Exception: when all attempts fail or the error is not retryable;
                the original ``requests`` exception is chained as the cause.
        """
        headers = {
            "Authorization": f"Bearer {self.config.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "messages": [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": text}
            ],
            "model": self.config.model,
            "temperature": self.config.temperature,
            "max_tokens": self.config.max_tokens
        }
        url = f"{self.config.base_url}/chat/completions"

        for attempt in range(MAX_RETRIES):
            is_last_attempt = attempt == MAX_RETRIES - 1
            try:
                # Keep the minimum spacing between requests.
                self._wait_for_rate_limit()
                response = requests.post(
                    url,
                    headers=headers,
                    json=payload,
                    timeout=90
                )
                # Surface 429 explicitly, carrying the response so the
                # handler below can read the status code from the exception.
                if response.status_code == 429:
                    raise requests.exceptions.HTTPError(
                        "429 Too Many Requests", response=response
                    )
                response.raise_for_status()
                self.last_request_time = time.time()
                self.request_count += 1
                return response.json()
            except requests.exceptions.RequestException as e:
                error_str = str(e)
                # Read the status from the exception, never from the local
                # ``response``: that name is unbound when the connection
                # itself failed and stale when an earlier attempt set it.
                # getattr is used (not a truth test) because a Response with
                # an error status evaluates as False.
                status_code = getattr(
                    getattr(e, "response", None), "status_code", None
                )

                if status_code == 429 or "429" in error_str or "Too Many Requests" in error_str:
                    print(f"   ⚠️ Rate Limit! تلاش {attempt + 1}/{MAX_RETRIES}")
                    # Give up before sleeping: waiting out a back-off that is
                    # followed by no further attempt only wastes time.
                    if is_last_attempt:
                        raise Exception(f"خطا در ارتباط با Cerebras API پس از {MAX_RETRIES} تلاش: Rate Limit") from e
                    backoff = min(
                        INITIAL_BACKOFF * (BACKOFF_MULTIPLIER ** attempt),
                        MAX_BACKOFF
                    )
                    # Random jitter (up to 20%) so retries do not synchronise.
                    wait_time = backoff + random.uniform(0, backoff * 0.2)
                    print(f"   ⏳ انتظار {wait_time:.1f} ثانیه...")
                    time.sleep(wait_time)
                elif status_code == 503 or "503" in error_str or "Service Unavailable" in error_str:
                    print(f"   ⚠️ سرویس موقتاً در دسترس نیست. تلاش {attempt + 1}/{MAX_RETRIES}")
                    if is_last_attempt:
                        raise Exception(f"خطا: سرویس در دسترس نیست پس از {MAX_RETRIES} تلاش") from e
                    wait_time = INITIAL_BACKOFF * (attempt + 1)
                    print(f"   ⏳ انتظار {wait_time:.1f} ثانیه...")
                    time.sleep(wait_time)
                elif (
                    isinstance(e, requests.exceptions.Timeout)
                    or "timeout" in error_str.lower()
                    or "timed out" in error_str.lower()
                ):
                    print(f"   ⚠️ Timeout! تلاش {attempt + 1}/{MAX_RETRIES}")
                    if is_last_attempt:
                        raise Exception(f"خطا: Timeout پس از {MAX_RETRIES} تلاش") from e
                    time.sleep(5 * (attempt + 1))
                else:
                    # 4xx other than 408/429 means the request itself is
                    # wrong (bad key, bad payload): retrying cannot help.
                    not_retryable = (
                        status_code is not None
                        and 400 <= status_code < 500
                        and status_code != 408
                    )
                    if is_last_attempt or not_retryable:
                        raise Exception(f"خطا در ارتباط با Cerebras API: {error_str}") from e
                    print(f"   ⚠️ خطا: {error_str[:80]}... تلاش مجدد...")
                    time.sleep(INITIAL_BACKOFF)
        # Only reachable when MAX_RETRIES is not positive.
        raise Exception(f"ناموفق پس از {MAX_RETRIES} تلاش")

    def _clean_markdown(self, content: str) -> str:
        """Remove Markdown code-fence marker lines from ``content``.

        Only the lines that start with ``` are dropped; the text between
        the fences is kept. Dropping the fenced text as well would turn a
        reply that the model wrapped in a code block into an empty result.
        """
        if "```" not in content:
            return content
        kept_lines = [
            line for line in content.split('\n')
            if not line.strip().startswith('```')
        ]
        return '\n'.join(kept_lines)

    def _find_placeholders(self, text: str) -> Dict[str, List[str]]:
        """Return, per placeholder family, the numeric suffixes found in ``text`` (with repeats)."""
        return {
            family: re.findall(rf'{family}-(\d+)', text)
            for family, _ in self._ENTITY_KEYS
        }

    def _analyze_anonymized_text(self, text: str) -> Dict[str, Any]:
        """Count placeholders and preserved tokens in an anonymized text.

        Returns:
            A dict with three entries:
            ``statistics`` - distinct placeholders per family plus
            ``total_replacements`` (every occurrence counted);
            ``entities`` - sorted distinct suffixes per family;
            ``detailed_analysis`` - regex counts of dates, times, financial
            indicators and unit words still present in the text.
        """
        found = self._find_placeholders(text)

        statistics = {family: len(set(ids)) for family, ids in found.items()}
        statistics["total_replacements"] = sum(len(ids) for ids in found.values())

        entities = {
            plural: sorted(set(found[family]), key=int)
            for family, plural in self._ENTITY_KEYS
        }

        detailed_analysis = {
            "preserved_dates": len(re.findall(r'\d{4}/\d{1,2}/\d{1,2}|\d{1,2}\s+\w+\s+\d{4}', text)),
            "preserved_times": len(re.findall(r'\d{1,2}:\d{2}', text)),
            "financial_indicators": len(re.findall(r'\b(EPS|P/E|ARPU|NPL|ROE|ROA)\b', text)),
            "units_preserved": len(re.findall(r'(میلیارد|میلیون|هزار|تومان|ریال|درهم|دلار|یورو|تن|کیلوگرم)', text))
        }
        return {
            "statistics": statistics,
            "entities": entities,
            "detailed_analysis": detailed_analysis
        }

    def _validate_anonymized_text(self, text: str) -> Dict[str, Any]:
        """Check that each placeholder family is numbered 1..k without gaps.

        Returns:
            A dict with ``is_valid`` (no issues found), ``issues`` (list of
            messages) and ``entity_counts`` (distinct placeholders per
            family).
        """
        found = self._find_placeholders(text)
        validation_issues = []
        for entity_type, indices in found.items():
            if not indices:
                continue
            unique_indices = sorted({int(x) for x in indices})
            if unique_indices[0] != 1:
                validation_issues.append(f"اندیس {entity_type} از 01 شروع نشده")
            if unique_indices != list(range(1, len(unique_indices) + 1)):
                validation_issues.append(f"اندیس‌های {entity_type} پیوسته نیستند")
        return {
            "is_valid": not validation_issues,
            "issues": validation_issues,
            "entity_counts": {family: len(set(ids)) for family, ids in found.items()}
        }

    def anonymize_text(self, text: str) -> Dict[str, Any]:
        """Anonymize one text through the Cerebras API.

        Never raises: every failure is reported in the returned dict.

        Returns:
            On success ``{"success": True, "anonymized_text", "entities",
            "statistics", "detailed_analysis", "usage", "quality_check"}``;
            otherwise ``{"success": False, "error": <message>}``.
        """
        if not text or not text.strip():
            return {
                "success": False,
                "error": "متن ورودی خالی است"
            }
        try:
            response = self._make_api_request(text)
            if "choices" not in response or not response["choices"]:
                return {
                    "success": False,
                    "error": "پاسخ نامعتبر از API"
                }
            content = response["choices"][0]["message"]["content"]
            content = self._clean_markdown(content).strip()
            analysis = self._analyze_anonymized_text(content)
            return {
                "success": True,
                "anonymized_text": content,
                "entities": analysis["entities"],
                "statistics": analysis["statistics"],
                "detailed_analysis": analysis["detailed_analysis"],
                "usage": response.get("usage", {}),
                "quality_check": self._validate_anonymized_text(content)
            }
        except Exception as e:
            # Boundary handler: callers rely on a result dict, not exceptions.
            return {
                "success": False,
                "error": f"خطا در پردازش: {str(e)}"
            }

    def anonymize_batch(self, texts: List[str], progress_callback=None) -> List[Dict[str, Any]]:
        """Anonymize ``texts`` one by one and return the result dicts in order.

        Args:
            texts: input texts.
            progress_callback: optional callable invoked before each item as
                ``progress_callback(fraction, message)``.
        """
        results = []
        total = len(texts)
        for idx, text in enumerate(texts):
            if progress_callback:
                progress_callback((idx + 1) / total, f"پردازش سطر {idx + 1} از {total}")
            results.append(self.anonymize_text(text))
            # Pause between items (not after the last) to avoid rate limiting.
            if idx < total - 1:
                time.sleep(DELAY_BETWEEN_REQUESTS)
        return results
# ============= Checkpoint helpers =============
def save_checkpoint(checkpoint_path: str, data: dict):
    """Write ``data`` as UTF-8 JSON to ``checkpoint_path``.

    The JSON is written to a sibling temporary file and then moved over the
    target with ``os.replace``. A checkpoint exists to survive interruptions,
    so an interrupted save must not leave a truncated, unreadable file in
    place of the previous good one.

    Args:
        checkpoint_path: destination file path.
        data: JSON-serialisable checkpoint content.
    """
    tmp_path = f"{checkpoint_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, checkpoint_path)
    finally:
        # After a successful replace the temp file no longer exists; after a
        # failure it is removed so no partial file is left behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"   💾 Checkpoint ذخیره شد")
def load_checkpoint(checkpoint_path: str) -> Optional[dict]:
    """Load a checkpoint written as JSON.

    Args:
        checkpoint_path: path of the checkpoint file.

    Returns:
        The stored dict, or ``None`` when the file does not exist, cannot be
        decoded as JSON, or does not contain a JSON object. An unusable
        checkpoint is treated like a missing one so that processing starts
        from the beginning instead of failing.
    """
    try:
        with open(checkpoint_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
        # the file is truncated or otherwise corrupt.
        return None
    # Callers use dict methods on the result; reject any other JSON value.
    return data if isinstance(data, dict) else None
# Encodings tried, in order, when reading an uploaded CSV. "utf-8-sig" reads
# plain UTF-8 too and strips a BOM if present; cp1256 is the fallback.
_CSV_ENCODINGS = ("utf-8-sig", "cp1256")


def _read_csv_with_fallback(file_path):
    """Read a CSV with pandas, trying each encoding in _CSV_ENCODINGS in turn."""
    last_error = None
    for encoding in _CSV_ENCODINGS:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc
    raise last_error


def _batch_fingerprint(text_col, texts):
    """Return a SHA-256 hex digest identifying a column name plus its texts."""
    digest = hashlib.sha256()
    digest.update(str(text_col).encode("utf-8", errors="replace"))
    for text in texts:
        # Separator byte so ["ab"] and ["a", "b"] hash differently.
        digest.update(b"\x00")
        digest.update(text.encode("utf-8", errors="replace"))
    return digest.hexdigest()


def create_advanced_interface():
    """Build the Gradio UI: single-text tab, batch CSV tab and their handlers.

    Returns:
        The ``gr.Blocks`` interface, ready to be launched.
    """
    api_key_available = bool(os.getenv("CEREBRAS_API_KEY"))
    custom_css = """
    .gradio-container {
        font-family: 'Tahoma', 'Arial', sans-serif !important;
        direction: rtl;
        max-width: 1400px;
        margin: 0 auto;
    }
    .result-box {
        background-color: #f8f9fa;
        border: 2px solid #e9ecef;
        border-radius: 12px;
        padding: 20px;
        margin: 10px 0;
    }
    .warning-box {
        background-color: #fff3cd;
        border: 2px solid #ffeaa7;
        border-radius: 12px;
        padding: 15px;
        color: #856404;
        margin: 10px 0;
    }
    .success-box {
        background-color: #d4edda;
        border: 2px solid #c3e6cb;
        border-radius: 12px;
        padding: 15px;
        color: #155724;
        margin: 10px 0;
    }
    .batch-progress {
        background-color: #e3f2fd;
        border: 2px solid #90caf9;
        border-radius: 12px;
        padding: 15px;
        margin: 10px 0;
    }
    """
    with gr.Blocks(css=custom_css, title="ناشناس‌ساز پیشرفته متن فارسی با Cerebras", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🔒 سیستم پیشرفته ناشناس‌سازی متون مالی/خبری فارسی
        ### ⚡ قدرت‌گرفته از Cerebras AI - با مدیریت هوشمند Rate Limit
        """)
        # API key input: hidden when the key is already set in the environment.
        if api_key_available:
            gr.Markdown("""
            <div class="success-box">
            ✅ <strong>سیستم آماده است</strong> - کلید API تنظیم شده
            </div>
            """)
            api_key_input = gr.Textbox(visible=False, value="")
        else:
            gr.Markdown("""
            <div class="warning-box">
            ⚠️ <strong>کلید API تنظیم نشده</strong><br>
            لطفاً کلید Cerebras API خود را در زیر وارد کنید
            </div>
            """)
            api_key_input = gr.Textbox(
                label="🔑 کلید Cerebras API",
                placeholder="csk-...",
                type="password"
            )
        # Main tabs
        with gr.Tabs() as tabs:
            # ===============================
            # Tab 1: single text
            # ===============================
            with gr.TabItem("📝 پردازش تکی"):
                with gr.Row():
                    with gr.Column(scale=1):
                        input_text = gr.Textbox(
                            label="📝 متن ورودی",
                            placeholder="متن مالی یا خبری خود را اینجا وارد کنید...",
                            lines=12,
                            rtl=True
                        )
                        with gr.Row():
                            anonymize_btn = gr.Button("🚀 ناشناس‌سازی", variant="primary", size="lg")
                            clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")
                    with gr.Column(scale=1):
                        output_text = gr.Textbox(
                            label="📤 متن ناشناس‌شده",
                            lines=12,
                            rtl=True,
                            interactive=False
                        )
                        copy_btn = gr.Button("📋 کپی نتیجه", variant="secondary")
                        copy_output = gr.Textbox(visible=False)
                # Statistics and analysis panels
                with gr.Row():
                    with gr.Column():
                        statistics_output = gr.Markdown(label="📊 آمار")
                    with gr.Column():
                        quality_output = gr.Markdown(label="✅ کیفیت")
                with gr.Row():
                    with gr.Column():
                        entities_output = gr.Markdown(label="🏷️ موجودیت‌ها")
                    with gr.Column():
                        detailed_analysis_output = gr.Markdown(label="📈 تحلیل تفصیلی")
                usage_output = gr.Markdown(label="⚡ مصرف")
            # ===============================
            # Tab 2: batch processing
            # ===============================
            with gr.TabItem("📁 پردازش دسته‌ای"):
                # The delay shown to the user is taken from the constant that
                # is actually enforced, so the two cannot drift apart.
                gr.Markdown(f"""
                ### 📁 پردازش دسته‌ای فایل CSV
                ⚠️ **توجه:** برای جلوگیری از خطای Rate Limit، بین هر درخواست **{DELAY_BETWEEN_REQUESTS:g} ثانیه** تاخیر اعمال می‌شود.
                💾 **قابلیت Checkpoint:** اگر پردازش قطع شود، می‌توانید از همان نقطه ادامه دهید.
                """)
                with gr.Row():
                    with gr.Column():
                        csv_file = gr.File(
                            label="📂 آپلود فایل CSV",
                            file_types=[".csv"]
                        )
                        text_column = gr.Dropdown(
                            label="📝 ستون متن",
                            choices=[],
                            interactive=True
                        )
                        output_column_name = gr.Textbox(
                            label="📤 نام ستون خروجی",
                            value="anonymized_text",
                            placeholder="anonymized_text"
                        )
                        # Option: resume from an earlier checkpoint.
                        continue_from_checkpoint = gr.Checkbox(
                            label="📍 ادامه از checkpoint قبلی (اگر موجود باشد)",
                            value=True
                        )
                        batch_btn = gr.Button("🚀 شروع پردازش", variant="primary", size="lg")
                    with gr.Column():
                        preview_df = gr.Dataframe(
                            label="👁️ پیش‌نمایش فایل",
                            interactive=False,
                            wrap=True
                        )
                batch_progress = gr.Markdown("📊 آماده پردازش...")
                batch_stats = gr.Markdown("")
                with gr.Row():
                    result_df = gr.Dataframe(
                        label="📊 نتایج پردازش",
                        interactive=False,
                        wrap=True
                    )
                download_btn = gr.File(
                    label="📥 دانلود نتایج",
                    visible=False
                )
                error_log = gr.Markdown("")

        # ===============================
        # Handlers
        # ===============================
        def load_csv_columns(file_path):
            """Read the uploaded CSV and return (column dropdown, 5-row preview, status text)."""
            if file_path is None:
                return gr.Dropdown(choices=[]), None, "📊 فایل انتخاب نشده"
            try:
                df = _read_csv_with_fallback(file_path)
                columns = df.columns.tolist()
                preview = df.head(5)
                return (
                    gr.Dropdown(choices=columns, value=columns[0] if columns else None),
                    preview,
                    f"✅ فایل بارگذاری شد | {len(df)} سطر | {len(columns)} ستون"
                )
            except Exception as e:
                # UI boundary: show the problem instead of crashing the app.
                return gr.Dropdown(choices=[]), None, f"❌ خطا در بارگذاری: {str(e)}"

        def process_single_text(text, api_key):
            """Anonymize one text.

            Returns six values in the order of the click handler's outputs:
            anonymized text, statistics, quality, entities, detailed
            analysis and usage. On failure the message is placed in the
            statistics slot and the others are empty.
            """
            if not text or not text.strip():
                return "", "⚠️ لطفاً متن وارد کنید", "", "", "", ""
            try:
                key = api_key if api_key else os.getenv("CEREBRAS_API_KEY")
                if not key:
                    return "", "❌ کلید API تنظیم نشده", "", "", "", ""
                anonymizer = AdvancedCerebrasAnonymizer(api_key=key)
                result = anonymizer.anonymize_text(text)
                if not result["success"]:
                    return "", f"❌ خطا: {result['error']}", "", "", "", ""
                # Statistics
                stats = result["statistics"]
                stats_md = f"""
### 📊 آمار جایگزینی:
| نوع | تعداد |
|-----|-------|
| شرکت | {stats['company']} |
| شخص | {stats['person']} |
| مبلغ | {stats['amount']} |
| درصد | {stats['percent']} |
| **کل** | **{stats['total_replacements']}** |
"""
                # Quality
                quality = result["quality_check"]
                quality_status = "✅ معتبر" if quality["is_valid"] else "⚠️ مشکل دارد"
                quality_md = f"### {quality_status}\n"
                if quality["issues"]:
                    quality_md += "\n".join([f"- {issue}" for issue in quality["issues"]])
                # Entities
                entities = result["entities"]
                entities_md = f"""
### 🏷️ موجودیت‌های شناسایی شده:
- **شرکت‌ها:** {', '.join([f'company-{x}' for x in entities['companies']]) or '-'}
- **اشخاص:** {', '.join([f'person-{x}' for x in entities['persons']]) or '-'}
- **مبالغ:** {', '.join([f'amount-{x}' for x in entities['amounts']]) or '-'}
- **درصدها:** {', '.join([f'percent-{x}' for x in entities['percents']]) or '-'}
"""
                # Detailed analysis
                detailed = result["detailed_analysis"]
                detailed_md = f"""
| شاخص | مقدار |
|------|-------|
| تاریخ‌های حفظ شده | {detailed['preserved_dates']} |
| شاخص‌های مالی | {detailed['financial_indicators']} |
| واحدهای حفظ شده | {detailed['units_preserved']} |
"""
                # Token usage
                usage = result.get("usage", {})
                usage_md = f"⚡ **توکن‌ها:** ورودی: {usage.get('prompt_tokens', '-')} | خروجی: {usage.get('completion_tokens', '-')}"
                return (
                    result["anonymized_text"],
                    stats_md,
                    quality_md,
                    entities_md,
                    detailed_md,
                    usage_md
                )
            except Exception as e:
                return "", f"❌ خطا: {str(e)}", "", "", "", ""

        def process_batch_csv(file_path, text_col, output_col, api_key, use_checkpoint, progress=gr.Progress()):
            """Anonymize one CSV column row by row, with resumable checkpoints.

            Returns five values in the order of the click handler's outputs:
            result preview, status text, statistics, download file and
            error log.
            """
            if file_path is None:
                return None, "❌ لطفاً فایل CSV آپلود کنید", "", gr.File(visible=False), None
            if not text_col:
                return None, "❌ لطفاً ستون متن را انتخاب کنید", "", gr.File(visible=False), None
            try:
                df = _read_csv_with_fallback(file_path)
                if text_col not in df.columns:
                    return None, f"❌ ستون '{text_col}' در فایل یافت نشد", "", gr.File(visible=False), None
                # Row limit
                max_rows = 1000
                if len(df) > max_rows:
                    return None, f"❌ تعداد سطرها ({len(df)}) از حداکثر مجاز ({max_rows}) بیشتر است", "", gr.File(visible=False), None
                total = len(df)
                # An empty file would otherwise fail below with a division
                # by zero in the progress and success-rate computations.
                if total == 0:
                    return None, "❌ فایل CSV خالی است", "", gr.File(visible=False), None
                key = api_key if api_key else os.getenv("CEREBRAS_API_KEY")
                if not key:
                    return None, "❌ کلید API تنظیم نشده", "", gr.File(visible=False), None
                anonymizer = AdvancedCerebrasAnonymizer(api_key=key)

                texts = [str(value) for value in df[text_col]]
                # The checkpoint is tied to the data it was made from: its
                # file name and a stored fingerprint are derived from the
                # column name and texts. A checkpoint from another upload
                # (or another user) is therefore never resumed against this
                # file, which previously gave wrong or mismatched results.
                fingerprint = _batch_fingerprint(text_col, texts)
                checkpoint_path = str(
                    Path(tempfile.gettempdir()) / f"anonymizer_checkpoint_{fingerprint[:16]}.json"
                )
                start_index = 0
                anonymized_texts = [""] * total
                error_rows = []
                success_count = 0
                if use_checkpoint:
                    checkpoint = load_checkpoint(checkpoint_path)
                    saved_results = checkpoint.get('results') if checkpoint else None
                    last_processed = checkpoint.get('last_processed', -1) if checkpoint else -1
                    resumable = (
                        bool(checkpoint)
                        and checkpoint.get('fingerprint') == fingerprint
                        and isinstance(saved_results, list)
                        and len(saved_results) == total
                        and isinstance(last_processed, int)
                    )
                    if resumable:
                        start_index = min(max(last_processed + 1, 0), total)
                        anonymized_texts = saved_results
                        success_count = checkpoint.get('success_count', 0)
                        error_rows = checkpoint.get('errors', [])
                        print(f"✅ ادامه از checkpoint - ردیف {start_index}")
                        progress(start_index / total, desc=f"ادامه از ردیف {start_index}...")

                # Process the rows
                progress(start_index / total, desc="شروع پردازش...")
                for idx in range(start_index, total):
                    text = texts[idx]
                    progress((idx + 1) / total, desc=f"پردازش سطر {idx + 1} از {total} (⏱️ ~{DELAY_BETWEEN_REQUESTS}s/row)")
                    # Empty cells (pandas NaN stringifies to "nan") are skipped.
                    if not text or text.strip() == '' or text.lower() == 'nan':
                        anonymized_texts[idx] = ""
                        continue
                    result = anonymizer.anonymize_text(text)
                    if result["success"]:
                        anonymized_texts[idx] = result["anonymized_text"]
                        success_count += 1
                    else:
                        anonymized_texts[idx] = f"[خطا: {result['error']}]"
                        error_rows.append(f"سطر {idx + 1}: {result['error']}")
                    # Periodic checkpoint. Request spacing itself is enforced
                    # inside the anonymizer's request method.
                    if (idx + 1) % CHECKPOINT_INTERVAL == 0:
                        save_checkpoint(checkpoint_path, {
                            'fingerprint': fingerprint,
                            'last_processed': idx,
                            'results': anonymized_texts,
                            'success_count': success_count,
                            'errors': error_rows,
                            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
                        })
                # Final checkpoint
                save_checkpoint(checkpoint_path, {
                    'fingerprint': fingerprint,
                    'last_processed': total - 1,
                    'results': anonymized_texts,
                    'success_count': success_count,
                    'errors': error_rows,
                    'completed': True,
                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
                })
                # Add the output column
                output_col_name = output_col if output_col else "anonymized_text"
                df[output_col_name] = anonymized_texts
                # Each run writes to its own file so concurrent users cannot
                # overwrite (or download) each other's results.
                fd, output_path = tempfile.mkstemp(prefix="anonymized_output_", suffix=".csv")
                os.close(fd)
                df.to_csv(output_path, index=False, encoding='utf-8-sig')
                # Statistics
                stats_md = f"""
### 📊 آمار پردازش:
| شاخص | مقدار |
|------|-------|
| کل سطرها | {total} |
| پردازش موفق | {success_count} |
| خطا | {len(error_rows)} |
| درصد موفقیت | {(success_count/total*100):.1f}% |
| تاخیر بین درخواست‌ها | {DELAY_BETWEEN_REQUESTS} ثانیه |
"""
                # Error report (first 20 entries)
                error_md = ""
                if error_rows:
                    error_md = "### ⚠️ خطاهای مشاهده شده:\n" + "\n".join([f"- {e}" for e in error_rows[:20]])
                    if len(error_rows) > 20:
                        error_md += f"\n... و {len(error_rows) - 20} خطای دیگر"
                # Result preview
                result_preview = df[[text_col, output_col_name]].head(10)
                return (
                    result_preview,
                    f"✅ **پردازش کامل شد!** | {success_count} سطر با موفقیت",
                    stats_md,
                    gr.File(value=output_path, visible=True),
                    error_md
                )
            except Exception as e:
                # UI boundary: report the failure in the status and error panels.
                return None, f"❌ خطا در پردازش: {str(e)}", "", gr.File(visible=False), str(e)

        def copy_text(text_to_copy):
            """Show the result in the copy box and return a status message."""
            if not text_to_copy or not text_to_copy.strip():
                return gr.Textbox(visible=False), "⚠️ متنی برای کپی وجود ندارد"
            return gr.Textbox(value=text_to_copy, visible=True), "✅ متن کپی شد"

        def clear_all():
            """Reset every field of the single-text tab."""
            return "", "", "", "", "", "", "", gr.Textbox(visible=False)

        # ===============================
        # Event wiring
        # ===============================
        # Single text
        anonymize_btn.click(
            fn=process_single_text,
            inputs=[input_text, api_key_input],
            outputs=[output_text, statistics_output, quality_output, entities_output, detailed_analysis_output, usage_output]
        )
        copy_btn.click(
            fn=copy_text,
            inputs=[output_text],
            outputs=[copy_output, statistics_output]
        )
        clear_btn.click(
            fn=clear_all,
            outputs=[input_text, output_text, statistics_output, quality_output, entities_output, detailed_analysis_output, usage_output, copy_output]
        )
        # Batch processing
        csv_file.change(
            fn=load_csv_columns,
            inputs=[csv_file],
            outputs=[text_column, preview_df, batch_progress]
        )
        batch_btn.click(
            fn=process_batch_csv,
            inputs=[csv_file, text_column, output_column_name, api_key_input, continue_from_checkpoint],
            outputs=[result_df, batch_progress, batch_stats, download_btn, error_log]
        )
        # Examples
        gr.Examples(
            examples=[
                ["مهدی اخوان بهابادی، مدیرعامل همراه اول، اعلام کرد درآمد عملیاتی شرکت با رشد 37 درصدی به 70 هزار و 677 میلیارد تومان رسیده است."],
                ["بانک مرکزی و بانک ملی با همکاری محمدرضا فرزین، 60 درصد سپرده‌ها را مدیریت کردند."],
                ["سازمان تامین اجتماعی دارای سه شرکت دارویی است که از مراکز درمانی وابسته به وزارت بهداشت مطالباتی دارند."]
            ],
            inputs=input_text,
            label="📚 مثال‌ها"
        )
        return interface
# Application entry point
if __name__ == "__main__":
    # Listen on all interfaces on port 7860, request a public share link
    # and surface handler errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "show_error": True,
    }
    interface = create_advanced_interface()
    interface.launch(**launch_options)