Spaces:

leilaghomashchi
/

Data-anonymization

Sleeping

File size: 16,725 Bytes

6ceefb7
 
 
 
 
 
 
bec85f3
6ceefb7
 
 
 
bec85f3
 
6ceefb7
 
 
bec85f3
 
 
6ceefb7
 
 
 
bec85f3
 
 
 
 
 
 
6ceefb7
bec85f3
 
 
 
 
 
 
 
 
 
 
 
 
 
6ceefb7
 
bec85f3
6ceefb7
 
 
 
 
bec85f3
 
6ceefb7
bec85f3
 
6ceefb7
bec85f3
 
 
6ceefb7
bec85f3
 
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bec85f3
6ceefb7
 
bec85f3
 
 
 
6ceefb7
bec85f3
6ceefb7
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
bec85f3
6ceefb7
 
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bec85f3
6ceefb7
 
bec85f3
6ceefb7
 
 
 
 
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bec85f3
 
6ceefb7
 
 
bec85f3
6ceefb7
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
bec85f3
 
 
 
 
6ceefb7
 
 
 
 
 
 
bec85f3
 
 
6ceefb7
bec85f3
 
6ceefb7
 
 
 
 
 
 
 
 
 
bec85f3
 
 
6ceefb7
bec85f3
6ceefb7
 
 
bec85f3
6ceefb7
 
 
bec85f3
6ceefb7
 
bec85f3
6ceefb7
bec85f3
 
 
 
6ceefb7
 
 
bec85f3
 
 
6ceefb7
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bec85f3
 
 
 
 
6ceefb7
bec85f3
6ceefb7
bec85f3
6ceefb7
bec85f3
 
 
 
 
 
 
 
 
6ceefb7
bec85f3
6ceefb7
bec85f3
 
6ceefb7
bec85f3
6ceefb7
bec85f3
 
6ceefb7
bec85f3
 
6ceefb7
 
 
 
bec85f3
 
 
 
 
 
6ceefb7
bec85f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ceefb7
 
bec85f3
 
 
 
 
 
 
 
 
 
6ceefb7
 
 
 
bec85f3
 
 
 
 
6ceefb7
bec85f3
6ceefb7
bec85f3
6ceefb7
 
 
 
bec85f3
6ceefb7
 
 
 
bec85f3
6ceefb7
 
 
 
 
 
 
 
 
bec85f3

import json
import gradio as gr
from typing import Dict, Any
import os
from dataclasses import dataclass
import re
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

@dataclass
class LocalModelConfig:
    """Settings for the local GGUF model (Qwen2.5-32B).

    ``repo_id`` and ``filename`` identify the file to download from the
    HuggingFace Hub, the ``n_*`` fields are handed to the ``Llama``
    constructor, and ``max_tokens`` / ``temperature`` / ``top_p`` are passed
    on every completion call.
    """
    repo_id: str = "Qwen/Qwen2.5-32B-Instruct-GGUF"  # Hub repository holding the weights
    filename: str = "qwen2.5-32b-instruct-q4_k_m.gguf"  # file requested from that repository
    # NOTE(review): max_tokens is larger than n_ctx below — confirm this is intended.
    max_tokens: int = 8000
    temperature: float = 0.3
    top_p: float = 0.8
    n_ctx: int = 4096  # context window size given to Llama
    n_threads: int = 4  # kept low for Spaces
    n_gpu_layers: int = 50

class LocalCerebrasAnonymizer:
    """Anonymize Persian financial text with a locally loaded GGUF model.

    The model is asked to replace companies, persons, amounts and percentages
    with indexed placeholders (``company-01``, ``person-02``, ...). The raw
    completion is then cleaned up and inspected so the caller gets the
    anonymized text together with placeholder statistics and a numbering
    sanity check.
    """

    # Placeholder families, mapped to the plural key used in the "entities" report.
    _ENTITY_PLURALS = {
        "company": "companies",
        "person": "persons",
        "amount": "amounts",
        "percent": "percents",
    }

    # A Persian answer label ("output" / "result" / "answer") at the very start
    # of a line, either followed by a colon or standing alone on the line.
    _LABEL_PREFIX = re.compile(r'^\s*(?:خروجی|نتیجه|پاسخ)\s*(?:[::]\s*|$)')

    # Line openers that mark conversational filler or leaked prompt markup.
    _CHATTER_PREFIXES = ('okay', 'let me', 'here is', 'assistant', '[inst]')

    def __init__(self):
        self.config = LocalModelConfig()
        self.llm = None
        self.model_loaded = False

    def load_model(self) -> str:
        """Download the GGUF file from the HuggingFace Hub and load it.

        Calling this again after a successful load is a no-op, so a second
        click on the load button does not download and allocate the model
        twice.

        Returns:
            A user-facing status string: the "ready" message on success, or
            an error message embedding the exception text on failure.
        """
        if self.model_loaded and self.llm is not None:
            return "✅ مدل آماده است"

        try:
            print("🤖 درحال دانلود مدل از HuggingFace...")
            print(f"📦 Repo: {self.config.repo_id}")
            print(f"📄 Filename: {self.config.filename}")

            # Fetch the model file into ./models
            model_path = hf_hub_download(
                repo_id=self.config.repo_id,
                filename=self.config.filename,
                local_dir="./models",
                local_dir_use_symlinks=False
            )

            print(f"✅ مدل دانلود شد: {model_path}")
            print("🤖 درحال بارگذاری مدل...")

            self.llm = Llama(
                model_path=model_path,
                n_ctx=self.config.n_ctx,
                n_threads=self.config.n_threads,
                n_gpu_layers=self.config.n_gpu_layers,
                verbose=False
            )

            self.model_loaded = True
            print("✅ مدل با موفقیت بارگذاری شد\n")
            return "✅ مدل آماده است"

        except Exception as e:
            # UI boundary: report the failure as a status string instead of raising.
            error_msg = f"❌ خطا: {str(e)}"
            print(error_msg)
            return error_msg

    def _create_system_prompt(self) -> str:
        """Return the (Persian) system instruction sent with every request."""
        return """شما یک سیستم ناشناس‌سازی متون مالی فارسی هستید.

⚠️ CRITICAL: در پاسخ نهایی خود، فقط و فقط متن ناشناس‌سازی شده را برگردانید، بدون هیچ توضیح، تحلیل، یا تگ اضافی.

## قوانین اندیس‌گذاری:
1. **ترتیب پیوسته**: company-01, company-02, ... | person-01, person-02, ... | amount-01, amount-02, ... | percent-01, percent-02, ...
2. **ثبات**: اگر "همراه اول" → company-01 شد، در تمام متن همان باشد
3. **نام مستعار**: "فاما" = "فولاد مبارکه" → هر دو company-01

## انواع موجودیت:
- **company-XX**: شرکت‌ها، بانک‌ها، سازمان‌ها
- **person-XX**: نام و نام خانوادگی اشخاص  
- **amount-XX**: مبالغ - واحد را حفظ کن
- **percent-XX**: درصدها

## مثال:
ورودی: ایران خودرو در اسفند 1402 حدود 23 هزار میلیارد درآمد کسب کرد که 4.58 درصد افزایش داشت.
خروجی: company-01 در اسفند 1402 حدود amount-01 درآمد کسب کرد که percent-01 افزایش داشت.

⚠️ یادآوری: فقط متن ناشناس‌شده."""

    def anonymize_text(self, text: str) -> Dict[str, Any]:
        """Anonymize ``text`` and report on the placeholders that were produced.

        Args:
            text: The raw input text.

        Returns:
            On success, a dict with ``success=True`` plus ``anonymized_text``,
            ``entities``, ``statistics``, ``detailed_analysis`` and
            ``quality_check``. On any failure (model not loaded, empty input,
            or an exception during generation) a dict with ``success=False``
            and an ``error`` message.
        """
        if not self.model_loaded or self.llm is None:
            return {"success": False, "error": "مدل بارگذاری نشده است"}

        # ``not text`` also covers None, which would otherwise crash on .strip().
        if not text or not text.strip():
            return {"success": False, "error": "متن ورودی خالی است"}

        try:
            messages = [
                {"role": "system", "content": self._create_system_prompt()},
                {"role": "user", "content": text}
            ]

            prompt = self._format_prompt(messages)

            print(f"⏳ پردازش متن... (طول: {len(text)} کاراکتر)")

            response = self.llm(
                prompt,
                max_tokens=self.config.max_tokens,
                temperature=self.config.temperature,
                top_p=self.config.top_p,
                # ChatML turn delimiters, matching the format built by _format_prompt.
                stop=["<|im_end|>", "<|endoftext|>", "<|im_start|>"]
            )

            content = response["choices"][0]["text"].strip()

            # Clean-up passes, applied in order
            content = self._remove_thinking_tags(content)
            content = self._clean_markdown(content)
            content = self._clean_explanations(content)
            content = content.strip()

            analysis = self._analyze_anonymized_text(content)

            return {
                "success": True,
                "anonymized_text": content,
                "entities": analysis["entities"],
                "statistics": analysis["statistics"],
                "detailed_analysis": analysis["detailed_analysis"],
                "quality_check": self._validate_anonymized_text(content)
            }

        except Exception as e:
            # UI boundary: surface the failure as an error result instead of raising.
            return {"success": False, "error": f"خطا: {str(e)}"}

    def _format_prompt(self, messages: list) -> str:
        """Render chat messages in the ChatML layout used by Qwen2.5-Instruct.

        Each system/user/assistant message becomes
        ``<|im_start|>{role}\\n{content}<|im_end|>\\n``; messages with any other
        role are skipped. The prompt ends with an opened assistant turn so the
        model continues with its reply.
        """
        turns = [
            f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
            for message in messages
            if message["role"] in ("system", "user", "assistant")
        ]
        turns.append("<|im_start|>assistant\n")
        return "".join(turns)

    def _remove_thinking_tags(self, content: str) -> str:
        """Drop ``<think>...</think>`` sections and any unpaired think tag."""
        content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
        content = re.sub(r'</?think>', '', content)
        return content.strip()

    def _clean_markdown(self, content: str) -> str:
        """Remove Markdown code-fence marker lines, keeping the fenced text.

        If the model wraps its answer in a fence, the answer is what sits
        between the markers, so only the marker lines themselves are dropped.
        """
        if "```" not in content:
            return content
        return '\n'.join(
            line for line in content.split('\n')
            if not line.strip().startswith('```')
        )

    def _clean_explanations(self, content: str) -> str:
        """Strip conversational filler and answer labels from the completion.

        Lines that open with English filler or leaked prompt markup are
        dropped. A leading Persian answer label (e.g. ``خروجی:``) is removed
        while the rest of the line is kept, because the answer often follows
        the label on the same line. Lines that merely contain those words
        elsewhere are left untouched.
        """
        kept_lines = []
        for line in content.split('\n'):
            if line.strip().lower().startswith(self._CHATTER_PREFIXES):
                continue
            remainder, label_found = self._LABEL_PREFIX.subn('', line, count=1)
            if label_found and not remainder.strip():
                # The line held nothing but the label itself.
                continue
            kept_lines.append(remainder)
        return '\n'.join(kept_lines).strip()

    def _extract_indices(self, text: str) -> Dict[str, list]:
        """Return, per placeholder family, every index string found in ``text``."""
        return {
            name: re.findall(rf'{name}-(\d+)', text)
            for name in self._ENTITY_PLURALS
        }

    def _analyze_anonymized_text(self, text: str) -> Dict[str, Any]:
        """Summarize the placeholders and preserved details in ``text``.

        Returns:
            A dict with three entries:
            ``statistics`` – distinct index count per family, plus ``total``,
            which counts every occurrence (repeats included);
            ``entities`` – distinct index strings per family, sorted
            numerically;
            ``detailed_analysis`` – counts of date-like patterns, a fixed list
            of financial indicator abbreviations, and currency/unit words.
        """
        found = self._extract_indices(text)

        statistics = {name: len(set(indices)) for name, indices in found.items()}
        statistics["total"] = sum(len(indices) for indices in found.values())

        entities = {
            plural: sorted(set(found[name]), key=int)
            for name, plural in self._ENTITY_PLURALS.items()
        }

        detailed_analysis = {
            "preserved_dates": len(re.findall(r'\d{4}/\d{1,2}/\d{1,2}|\d{1,2}\s+\w+\s+\d{4}', text)),
            "financial_indicators": len(re.findall(r'\b(EPS|P/E|ARPU|NPL|ROE|ROA)\b', text)),
            "units_preserved": len(re.findall(r'(میلیارد|میلیون|هزار|تومان|ریال|درهم|دلار)', text))
        }

        return {
            "statistics": statistics,
            "entities": entities,
            "detailed_analysis": detailed_analysis
        }

    def _validate_anonymized_text(self, text: str) -> Dict[str, Any]:
        """Check that each placeholder family is numbered 1, 2, 3, ... without gaps.

        Returns:
            A dict with ``is_valid`` (True when no issue was found), ``issues``
            (a list of warning strings) and ``entity_counts`` (distinct index
            count per family). A family that does not start at 1 is reported
            both as not starting at 01 and as not contiguous.
        """
        found = self._extract_indices(text)
        validation_issues = []

        for entity_type, indices in found.items():
            if not indices:
                continue
            unique_indices = sorted({int(index) for index in indices})
            if unique_indices[0] != 1:
                validation_issues.append(f"⚠️ {entity_type} از 01 شروع نشده")
            if unique_indices != list(range(1, len(unique_indices) + 1)):
                validation_issues.append(f"⚠️ {entity_type} پیوسته نیست")

        return {
            "is_valid": not validation_issues,
            "issues": validation_issues,
            "entity_counts": {
                name: len(set(indices)) for name, indices in found.items()
            }
        }

# ========== User interface ==========

# Single module-level anonymizer shared by the Gradio callbacks below.
# Constructing it does not load the model; that happens in load_model().
anonymizer = LocalCerebrasAnonymizer()

def create_interface():
    """Build the Gradio Blocks UI for the anonymizer and return it.

    The page shows a status box and a "load model" button. The input/output
    sections start hidden and are revealed only once the model has actually
    been loaded. Anonymization results are split across an output textbox and
    four Markdown panels (statistics, quality, entities, detailed analysis).

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    custom_css = """
    .gradio-container {
        font-family: 'Tahoma', 'Arial', sans-serif !important;
        direction: rtl;
        max-width: 1400px;
        margin: 0 auto;
    }
    .info-box {
        background-color: #e3f2fd;
        border: 2px solid #2196F3;
        border-radius: 12px;
        padding: 15px;
        color: #0d47a1;
        margin: 10px 0;
    }
    .local-box {
        background-color: #e8f5e9;
        border: 2px solid #4caf50;
        border-radius: 12px;
        padding: 15px;
        color: #1b5e20;
        margin: 10px 0;
    }
    .result-box {
        background-color: #f8f9fa;
        border: 2px solid #e9ecef;
        border-radius: 12px;
        padding: 20px;
    }
    """
    
    with gr.Blocks(css=custom_css, title="ناشناس‌ساز Qwen2.5", theme=gr.themes.Soft()) as interface:
        
        gr.Markdown("""
        # 🔒 سیستم ناشناس‌سازی متون مالی فارسی
        ### 🚀 Qwen 2.5-32B (HuggingFace Spaces)
        """)
        
        gr.Markdown("""
        <div class="info-box">
        📊 <strong>مدل:</strong> Qwen2.5-32B-Instruct-Q4_K_M<br>
        🌐 <strong>منبع:</strong> HuggingFace Hub<br>
        💾 <strong>حجم:</strong> ~20 GB (Q4 quantization)<br>
        ⚡ <strong>سرعت:</strong> بستگی به GPU Spaces دارد
        </div>
        """)
        
        # NOTE(review): the initial text says the model is loading, but nothing
        # is loaded until the button below is clicked — confirm the wording.
        status_box = gr.Textbox(label="📋 وضعیت", interactive=False, value="⏳ درحال بارگذاری مدل...")
        
        load_btn = gr.Button("🤖 بارگذاری مدل", variant="primary", size="lg")
        
        with gr.Row(visible=False) as input_section:
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="📝 متن ورودی",
                    placeholder="متن خود را اینجا وارد کنید...",
                    lines=12,
                    max_lines=25
                )
                
                with gr.Row():
                    anonymize_btn = gr.Button("🔒 ناشناس‌سازی", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")
            
            with gr.Column(scale=1):
                output_text = gr.Textbox(
                    label="🎯 متن ناشناس‌سازی شده",
                    lines=12,
                    max_lines=25,
                    elem_classes=["result-box"]
                )
        
        with gr.Row(visible=False) as output_section:
            with gr.Column():
                statistics_output = gr.Markdown(label="📊 آمار")
            with gr.Column():
                quality_output = gr.Markdown(label="✅ کیفیت")
        
        with gr.Row(visible=False) as output_section2:
            entities_output = gr.Markdown(label="🏷️ موجودیت‌ها")
            detailed_output = gr.Markdown(label="🔍 تحلیل")
        
        def load_model_action():
            """Load the model and reveal the working sections only on success."""
            msg = anonymizer.load_model()
            # Keep the sections hidden when loading failed, so the user is not
            # offered an anonymize button that can only report an error.
            loaded = anonymizer.model_loaded
            return (
                gr.Textbox(value=msg),
                gr.Row(visible=loaded),
                gr.Row(visible=loaded),
                gr.Row(visible=loaded)
            )
        
        def error_outputs(message):
            """Blank every result panel and put ``message`` in the status box."""
            return ("", "", "", "", "", message)
        
        def process_text(text):
            """Anonymize ``text`` and format the result for the six outputs.

            The returned tuple follows the order wired up below: output text,
            statistics, quality, entities, detailed analysis, status box.
            """
            if not text or not text.strip():
                return error_outputs("❌ متن خالی است")
            
            result = anonymizer.anonymize_text(text)
            
            if not result["success"]:
                return error_outputs(f"❌ {result['error']}")
            
            stats = result.get("statistics", {})
            stats_md = f"""📊 **آمار:**
🏢 شرکت: {stats.get('company', 0)}
👤 اشخاص: {stats.get('person', 0)}
💰 مبالغ: {stats.get('amount', 0)}
📊 درصدها: {stats.get('percent', 0)}
🔢 کل: {stats.get('total', 0)}"""
            
            quality = result.get("quality_check", {})
            quality_md = f"""✅ **کنترل کیفیت:**

{'✅ موفق' if quality.get('is_valid') else '❌ مشکل'}
"""
            if quality.get("issues"):
                quality_md += "\n**مشکلات:**\n"
                for issue in quality["issues"]:
                    quality_md += f"• {issue}\n"
            
            entities = result.get("entities", {})
            entities_md = "🏷️ **موجودیت‌ها:**\n"
            # One line per placeholder family that actually occurs in the text.
            for key, icon, prefix in (
                ("companies", "🏢", "company"),
                ("persons", "👤", "person"),
                ("amounts", "💰", "amount"),
                ("percents", "📊", "percent"),
            ):
                indices = entities.get(key)
                if indices:
                    joined = f", {prefix}-".join(indices)
                    entities_md += f"\n{icon} {prefix}-{joined}"
            
            detailed = result.get("detailed_analysis", {})
            detailed_md = f"""🔍 **تحلیل:**
📅 تاریخ: {detailed.get('preserved_dates', 0)}
📈 شاخص: {detailed.get('financial_indicators', 0)}
📏 واحد: {detailed.get('units_preserved', 0)}"""
            
            return (
                result["anonymized_text"],
                stats_md,
                quality_md,
                entities_md,
                detailed_md,
                "✅ موفق"
            )
        
        def clear_all():
            """Reset the input box and every result panel to empty."""
            return "", "", "", "", "", ""
        
        load_btn.click(
            fn=load_model_action,
            outputs=[status_box, input_section, output_section, output_section2]
        )
        
        anonymize_btn.click(
            fn=process_text,
            inputs=[input_text],
            outputs=[output_text, statistics_output, quality_output, entities_output, detailed_output, status_box]
        )
        
        clear_btn.click(
            fn=clear_all,
            outputs=[input_text, output_text, statistics_output, quality_output, entities_output, detailed_output]
        )
        
        gr.Examples(
            examples=[
                ["ایران خودرو در اسفندماه حدود 23 هزار میلیارد تومان درآمد کسب کرد که 4.58 درصد افزایش داشت."],
                ["مجمع پتروشیمی برگزار شد. وانیا نیک تدبیر را بازرس انتخاب کردند."],
            ],
            inputs=input_text,
            label="📚 مثال‌ها"
        )
        
        return interface

if __name__ == "__main__":
    # Script entry point: build the Gradio UI and start serving it.
    create_interface().launch()