Spaces:

leilaghomashchi
/

Data-anonymization

Sleeping

App Files Files Community

leilaghomashchi committed on Oct 28

Commit

440ba32

verified ·

1 Parent(s): 72a098e

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -390

app.py DELETED Viewed

@@ -1,390 +0,0 @@
-import json
-import gradio as gr
-from typing import Dict, Any
-import os
-from dataclasses import dataclass
-import re
-import requests
-@dataclass
-class QwenConfig:
-    """Settings for Qwen 2.5-32B via the HF Inference API."""
-    # Model identifier; nothing else in this file reads it (api_url embeds the same id).
-    model_id: str = "Qwen/Qwen2.5-32B-Instruct"
-    # Endpoint that QwenAnonymizer.anonymize_text POSTs the generation request to.
-    api_url: str = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-32B-Instruct"
-    # Sent to the API as the `max_new_tokens` generation parameter.
-    max_tokens: int = 1024
-    # Sampling parameters forwarded unchanged in the request payload.
-    temperature: float = 0.3
-    top_p: float = 0.8
-class QwenAnonymizer:
-    """Anonymization system for Persian financial texts.
-    The input text is sent to the HF Inference API together with a system prompt
-    asking the model to replace companies, persons, amounts and percentages with
-    indexed placeholders (company-01, person-01, amount-01, percent-01). The
-    returned text is then cleaned, analysed and sanity-checked locally.
-    """
-    # Placeholder families the system prompt asks the model to emit.
-    _ENTITY_TYPES = ("company", "person", "amount", "percent")
-    # Substrings that mark a line as model chatter rather than anonymized text.
-    _CHATTER_MARKERS = ('okay', 'let me', 'here is', 'خروجی', 'نتیجه', 'پاسخ:', 'assistant')
-    # Any indexed placeholder such as "company-01"; a line carrying one is real output.
-    _PLACEHOLDER_RE = re.compile(r'(?:company|person|amount|percent)-\d+')
-    # A leading "خروجی:" / "نتیجه:" / "پاسخ:" label placed in front of the actual output.
-    _LABEL_PREFIX_RE = re.compile(r'^\s*(?:خروجی|نتیجه|پاسخ)\s*:\s*')
-    def __init__(self, hf_token: str = None):
-        """Store the API token, falling back to the HF_TOKEN environment variable.
-        Args:
-            hf_token: Hugging Face API token; when falsy, os.getenv("HF_TOKEN") is used.
-        """
-        self.config = QwenConfig()
-        self.hf_token = hf_token or os.getenv("HF_TOKEN")
-        # No model is loaded locally; this flag only records that a token is available.
-        self.model_loaded = bool(self.hf_token)
-    def anonymize_text(self, text: str) -> Dict[str, Any]:
-        """Anonymize *text* through the remote model.
-        Args:
-            text: The raw Persian text to anonymize.
-        Returns:
-            On failure: {"success": False, "error": <message>}.
-            On success: a dict with "success", "anonymized_text", "entities",
-            "statistics", "detailed_analysis" and "quality_check".
-            Errors are never raised; every exception is converted to a failure dict.
-        """
-        if not self.hf_token:
-            return {"success": False, "error": "HF_TOKEN یافت نشد"}
-        # Guard against None as well as blank strings so the caller gets the
-        # empty-text error instead of an AttributeError.
-        if not text or not text.strip():
-            return {"success": False, "error": "متن ورودی خالی است"}
-        try:
-            print("⏳ پردازش متن...")
-            system_prompt = self._create_system_prompt()
-            # Build the request payload.
-            # NOTE(review): the [INST] ... [/INST] wrapper looks like the Llama/Mistral
-            # instruction format; Qwen2.5-Instruct is documented as using ChatML
-            # (<|im_start|> ... <|im_end|>) — confirm whether the prompt should be switched.
-            payload = {
-                "inputs": f"""[INST] {system_prompt}
-متن ورودی:
-{text}
-فقط متن ناشناس‌سازی شده را برگردان: [/INST]""",
-                "parameters": {
-                    "max_new_tokens": self.config.max_tokens,
-                    "temperature": self.config.temperature,
-                    "top_p": self.config.top_p,
-                    "do_sample": True,
-                    "return_full_text": False,
-                }
-            }
-            # Send the API request.
-            headers = {"Authorization": f"Bearer {self.hf_token}"}
-            response = requests.post(
-                self.config.api_url,
-                headers=headers,
-                json=payload,
-                timeout=120
-            )
-            if response.status_code != 200:
-                return {
-                    "success": False,
-                    "error": f"خطا از API: {response.status_code} - {response.text}"
-                }
-            result = response.json()
-            if isinstance(result, list) and len(result) > 0:
-                content = result[0].get("generated_text", "").strip()
-            else:
-                # Unexpected response shape: fall back to its string form.
-                content = str(result).strip()
-            # Strip model chatter around the anonymized text.
-            content = self._clean_explanations(content)
-            content = content.strip()
-            analysis = self._analyze_anonymized_text(content)
-            return {
-                "success": True,
-                "anonymized_text": content,
-                "entities": analysis["entities"],
-                "statistics": analysis["statistics"],
-                "detailed_analysis": analysis["detailed_analysis"],
-                "quality_check": self._validate_anonymized_text(content)
-            }
-        except requests.exceptions.Timeout:
-            return {"success": False, "error": "⏱️ مدل درحال بارگذاری است (۳۰-۶۰ ثانیه صبر کنید)"}
-        except Exception as e:
-            # Top-level boundary: the UI expects a failure dict, never an exception.
-            return {"success": False, "error": f"خطا: {str(e)}"}
-    def _create_system_prompt(self) -> str:
-        """Return the (Persian) system instructions sent ahead of the user text."""
-        return """شما یک سیستم ناشناس‌سازی متون مالی فارسی هستید.
-قوانین اندیس‌گذاری:
-1. ترتیب پیوسته: company-01, company-02, ... | person-01, person-02, ... | amount-01, amount-02, ... | percent-01, percent-02, ...
-2. ثبات: اگر "همراه اول" → company-01 شد، در تمام متن همان باشد
-انواع موجودیت:
-- company-XX: شرکت‌ها، بانک‌ها، سازمان‌ها
-- person-XX: نام و نام خانوادگی اشخاص
-- amount-XX: مبالغ - واحد را حفظ کن
-- percent-XX: درصدها
-مثال:
-ورودی: ایران خودرو در اسفند 1402 حدود 23 هزار میلیارد درآمد کسب کرد که 4.58 درصد افزایش داشت.
-خروجی: company-01 در اسفند 1402 حدود amount-01 درآمد کسب کرد که percent-01 افزایش داشت."""
-    def _clean_explanations(self, content: str) -> str:
-        """Remove explanatory chatter lines from the model output.
-        A line containing one of the chatter markers is dropped, unless it also
-        contains an entity placeholder. Such a line is the anonymized text itself
-        (the prompt's own example is formatted as "خروجی: company-01 ..."), so it is
-        kept and only a leading "خروجی:" / "نتیجه:" / "پاسخ:" label is stripped.
-        Args:
-            content: Raw text generated by the model.
-        Returns:
-            The remaining lines joined with newlines, stripped of outer whitespace.
-        """
-        clean_lines = []
-        for line in content.split('\n'):
-            lowered = line.lower()
-            if any(marker in lowered for marker in self._CHATTER_MARKERS):
-                if not self._PLACEHOLDER_RE.search(line):
-                    continue
-                # Real output behind a label: keep the text, drop only the label.
-                line = self._LABEL_PREFIX_RE.sub('', line, count=1)
-            clean_lines.append(line)
-        return '\n'.join(clean_lines).strip()
-    def _find_indices(self, text: str) -> Dict[str, Any]:
-        """Map each entity type to the list of index strings found in *text*."""
-        return {kind: re.findall(rf'{kind}-(\d+)', text) for kind in self._ENTITY_TYPES}
-    def _analyze_anonymized_text(self, text: str) -> Dict[str, Any]:
-        """Summarize the placeholders and preserved context found in *text*.
-        Returns:
-            A dict with:
-            "statistics": distinct indices per entity type, plus "total", which
-                counts every placeholder occurrence (repeats included);
-            "entities": the distinct index strings per type, sorted numerically;
-            "detailed_analysis": counts of date patterns, financial indicator
-                abbreviations and currency/scale unit words still present.
-        """
-        found = self._find_indices(text)
-        statistics = {kind: len(set(indices)) for kind, indices in found.items()}
-        statistics["total"] = sum(len(indices) for indices in found.values())
-        entities = {
-            "companies": sorted(set(found["company"]), key=int),
-            "persons": sorted(set(found["person"]), key=int),
-            "amounts": sorted(set(found["amount"]), key=int),
-            "percents": sorted(set(found["percent"]), key=int)
-        }
-        detailed_analysis = {
-            "preserved_dates": len(re.findall(r'\d{4}/\d{1,2}/\d{1,2}|\d{1,2}\s+\w+\s+\d{4}', text)),
-            "financial_indicators": len(re.findall(r'\b(EPS|P/E|ARPU|NPL|ROE|ROA)\b', text)),
-            "units_preserved": len(re.findall(r'(میلیارد|میلیون|هزار|تومان|ریال|درهم|دلار)', text))
-        }
-        return {
-            "statistics": statistics,
-            "entities": entities,
-            "detailed_analysis": detailed_analysis
-        }
-    def _validate_anonymized_text(self, text: str) -> Dict[str, Any]:
-        """Check the placeholder numbering against the prompt's indexing rule.
-        For every entity type present, the indices must start at 01 and form a
-        continuous sequence (01, 02, 03, ...), as rule 1 of the system prompt asks.
-        Returns:
-            {"is_valid": bool, "issues": [<warning message>, ...]}.
-        """
-        validation_issues = []
-        for kind, indices in self._find_indices(text).items():
-            if not indices:
-                continue
-            unique_indices = sorted({int(x) for x in indices})
-            if unique_indices[0] != 1:
-                validation_issues.append(f"⚠️ {kind} از 01 شروع نشده")
-            # A gap exists when the covered span is wider than the number of distinct indices.
-            if unique_indices[-1] - unique_indices[0] + 1 != len(unique_indices):
-                validation_issues.append(f"⚠️ شماره‌گذاری {kind} پیوسته نیست")
-        return {
-            "is_valid": len(validation_issues) == 0,
-            "issues": validation_issues
-        }
-# ========== User interface ==========
-# Most recently created QwenAnonymizer; process_text() rebinds it on every request.
-anonymizer = None
-def create_interface():
-    """Build the Gradio Blocks UI for the anonymizer.
-    The page holds a token field, an input/output text pair, four Markdown
-    result panels (statistics, quality, entities, detailed analysis), a status
-    box, two examples and a collapsible help section.
-    Returns:
-        The constructed gr.Blocks instance; the caller is responsible for launching it.
-    """
-    global anonymizer
-    custom_css = """
-    .gradio-container {
-        font-family: 'Tahoma', 'Arial', sans-serif !important;
-        direction: rtl;
-        max-width: 1200px;
-        margin: 0 auto;
-    }
-    .info-box {
-        background-color: #e3f2fd;
-        border: 2px solid #2196F3;
-        border-radius: 12px;
-        padding: 15px;
-        color: #0d47a1;
-        margin: 10px 0;
-    }
-    .success-box {
-        background-color: #e8f5e9;
-        border: 2px solid #4caf50;
-        border-radius: 12px;
-        padding: 15px;
-        color: #1b5e20;
-        margin: 10px 0;
-    }
-    .warning-box {
-        background-color: #fff3cd;
-        border: 2px solid #ffc107;
-        border-radius: 12px;
-        padding: 15px;
-        color: #856404;
-        margin: 10px 0;
-    }
-    .result-box {
-        background-color: #f8f9fa;
-        border: 2px solid #e9ecef;
-        border-radius: 12px;
-        padding: 20px;
-    }
-    """
-    with gr.Blocks(css=custom_css, title="ناشناس‌ساز Qwen2.5", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("""
-        # 🔒 سیستم ناشناس‌سازی متون مالی فارسی
-        ### 🚀 Qwen 2.5-32B (HuggingFace Inference API)
-        """)
-        hf_token_input = gr.Textbox(
-            label="🔑 HuggingFace API Token",
-            placeholder="hf_...",
-            type="password",
-            info="از https://huggingface.co/settings/tokens بگیرید"
-        )
-        gr.Markdown("""
-        <div class="info-box">
-        📊 <strong>مدل:</strong> Qwen2.5-32B-Instruct<br>
-        🌐 <strong>منبع:</strong> HuggingFace Inference API<br>
-        ✅ <strong>مزیت:</strong> بدون نیاز به نصب • سریع • رایگان<br>
-        ⚡ <strong>وضعیت:</strong> آماده برای استفاده فوری
-        </div>
-        """)
-        status_box = gr.Textbox(label="📋 وضعیت", interactive=False, value="✅ آماده")
-        with gr.Row():
-            with gr.Column(scale=1):
-                input_text = gr.Textbox(
-                    label="📝 متن ورودی",
-                    placeholder="متن خود را اینجا وارد کنید...",
-                    lines=10,
-                    max_lines=20
-                )
-                with gr.Row():
-                    anonymize_btn = gr.Button("🔒 ناشناس‌سازی", variant="primary", size="lg")
-                    clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")
-            with gr.Column(scale=1):
-                output_text = gr.Textbox(
-                    label="🎯 متن ناشناس‌سازی شده",
-                    lines=10,
-                    max_lines=20,
-                    elem_classes=["result-box"]
-                )
-        with gr.Row():
-            with gr.Column():
-                statistics_output = gr.Markdown(label="📊 آمار")
-            with gr.Column():
-                quality_output = gr.Markdown(label="✅ کیفیت")
-        with gr.Row():
-            entities_output = gr.Markdown(label="🏷️ موجودیت‌ها")
-            detailed_output = gr.Markdown(label="🔍 تحلیل")
-        def process_text(text, token):
-            """Anonymize *text* with the user's token and format every result panel.
-            Returns a 6-tuple in the order of the click handler's `outputs`:
-            (anonymized text, statistics, quality, entities, detailed analysis, status).
-            """
-            global anonymizer
-            def failure(message):
-                # Errors belong in the status box (last output), not in the statistics panel.
-                return ("", "", "", "", "", message)
-            if not token or not token.strip():
-                return failure("❌ HF Token الزامی است")
-            if not text or not text.strip():
-                return failure("❌ متن خالی است")
-            # Create an anonymizer bound to this request's token. The call goes through
-            # the local reference so that a concurrent request rebinding the module-level
-            # global cannot swap in a different user's token mid-request.
-            current = QwenAnonymizer(hf_token=token.strip())
-            anonymizer = current
-            result = current.anonymize_text(text)
-            if not result["success"]:
-                return failure(f"❌ {result['error']}")
-            stats = result.get("statistics", {})
-            stats_md = f"""📊 **آمار:**
-🏢 شرکت: {stats.get('company', 0)}
-👤 اشخاص: {stats.get('person', 0)}
-💰 مبالغ: {stats.get('amount', 0)}
-📊 درصدها: {stats.get('percent', 0)}
-🔢 کل: {stats.get('total', 0)}"""
-            quality = result.get("quality_check", {})
-            quality_md = f"""✅ **کنترل کیفیت:**
-{'✅ موفق' if quality.get('is_valid') else '⚠️ هشدار'}
-"""
-            if quality.get("issues"):
-                quality_md += "\n**نکات:**\n"
-                quality_md += "".join(f"• {issue}\n" for issue in quality["issues"])
-            entities = result.get("entities", {})
-            entities_md = "🏷️ **موجودیت‌ها:**\n"
-            # (result key, icon, placeholder prefix) for each entity family, in display order.
-            for key, icon, prefix in (
-                ("companies", "🏢", "company"),
-                ("persons", "👤", "person"),
-                ("amounts", "💰", "amount"),
-                ("percents", "📊", "percent"),
-            ):
-                if entities.get(key):
-                    entities_md += f"\n{icon} {prefix}-" + f", {prefix}-".join(entities[key])
-            detailed = result.get("detailed_analysis", {})
-            detailed_md = f"""🔍 **تحلیل:**
-📅 تاریخ: {detailed.get('preserved_dates', 0)}
-📈 شاخص: {detailed.get('financial_indicators', 0)}
-📏 واحد: {detailed.get('units_preserved', 0)}"""
-            return (
-                result["anonymized_text"],
-                stats_md,
-                quality_md,
-                entities_md,
-                detailed_md,
-                "✅ موفق"
-            )
-        def clear_all():
-            """Return empty values for the input box, the output box and the four panels."""
-            return "", "", "", "", "", ""
-        anonymize_btn.click(
-            fn=process_text,
-            inputs=[input_text, hf_token_input],
-            outputs=[output_text, statistics_output, quality_output, entities_output, detailed_output, status_box]
-        )
-        clear_btn.click(
-            fn=clear_all,
-            outputs=[input_text, output_text, statistics_output, quality_output, entities_output, detailed_output]
-        )
-        gr.Examples(
-            examples=[
-                ["ایران خودرو در اسفندماه حدود 23 هزار میلیارد تومان درآمد کسب کرد که 4.58 درصد افزایش داشت."],
-                ["بانک ملی ایران و حسن روحانی در جلسه امروز بحث کردند."],
-            ],
-            inputs=input_text,
-            label="📚 مثال‌ها"
-        )
-        with gr.Accordion("📖 راهنما", open=False):
-            gr.Markdown("""
-            ## 🔑 چگونه HF Token بگیرید:
-            1. به https://huggingface.co/settings/tokens بروید
-            2. **New token** کلیک کنید
-            3. نام انتخاب کنید (مثلاً: qwen-anonymizer)
-            4. **Type: Read** انتخاب کنید
-            5. **Generate** کلیک کنید
-            6. Token رو کپی کنید
-            ## 🚀 چگونه استفاده کنید:
-            1. Token را در بالا وارد کنید
-            2. متن خود را در جعبه "متن ورودی" بنویسید
-            3. دکمه "🔒 ناشناس‌سازی" را کلیک کنید
-            4. نتیجه در جعبه "متن ناشناس‌سازی شده" نمایش داده می‌شود
-            """)
-        return interface
-if __name__ == "__main__":
-    # Build the Gradio Blocks UI and start serving it when run as a script.
-    interface = create_interface()
-    interface.launch()