Spaces:

leilaghomashchi
/

Data-anonymization

Sleeping

App Files Files Community

leilaghomashchi committed on Oct 21

Commit

5ef727d

verified ·

1 Parent(s): f224893

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -112

app.py CHANGED Viewed

@@ -4,58 +4,63 @@ from typing import Dict, Any
 import os
 from dataclasses import dataclass
 import re
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
 @dataclass
 class LocalModelConfig:
-    """تنظیمات مدل محلی GGUF - Qwen2.5-32B"""
-    repo_id: str = "Qwen/Qwen2.5-32B-Instruct-GGUF"
-    filename: str = "qwen2.5-32b-instruct-q4_k_m.gguf"
-    max_tokens: int = 8000
     temperature: float = 0.3
     top_p: float = 0.8
-    n_ctx: int = 4096
-    n_threads: int = 4  # کمتر برای Spaces
-    n_gpu_layers: int = 50
-class LocalCerebrasAnonymizer:
-    """سیستم ناشناس‌سازی متون مالی فارسی با مدل محلی"""
     def __init__(self):
         self.config = LocalModelConfig()
-        self.llm = None
         self.model_loaded = False
     def load_model(self) -> str:
         """بارگذاری مدل از HuggingFace"""
         try:
-            print(f"🤖 درحال دانلود مدل از HuggingFace...")
-            print(f"📦 Repo: {self.config.repo_id}")
-            print(f"📄 Filename: {self.config.filename}")
-            # دانلود مدل
-            model_path = hf_hub_download(
-                repo_id=self.config.repo_id,
-                filename=self.config.filename,
-                local_dir="./models",
-                local_dir_use_symlinks=False
-            )
-            print(f"✅ مدل دانلود شد: {model_path}")
-            print(f"🤖 درحال بارگذاری مدل...")
-            self.llm = Llama(
-                model_path=model_path,
-                n_ctx=self.config.n_ctx,
-                n_threads=self.config.n_threads,
-                n_gpu_layers=self.config.n_gpu_layers,
-                verbose=False
-            )
             self.model_loaded = True
             print("✅ مدل با موفقیت بارگذاری شد\n")
-            return "✅ مدل آماده است"
         except Exception as e:
             error_msg = f"❌ خطا: {str(e)}"
@@ -69,21 +74,20 @@ class LocalCerebrasAnonymizer:
 ⚠️ CRITICAL: در پاسخ نهایی خود، فقط و فقط متن ناشناس‌سازی شده را برگردانید، بدون هیچ توضیح، تحلیل، یا تگ اضافی.
 ## قوانین اندیس‌گذاری:
-1. **ترتیب پیوسته**: company-01, company-02, ... | person-01, person-02, ... | amount-01, amount-02, ... | percent-01, percent-02, ...
-2. **ثبات**: اگر "همراه اول" → company-01 شد، در تمام متن همان باشد
-3. **نام مستعار**: "فاما" = "فولاد مبارکه" → هر دو company-01
 ## انواع موجودیت:
-- **company-XX**: شرکت‌ها، بانک‌ها، سازمان‌ها
-- **person-XX**: نام و نام خانوادگی اشخاص
-- **amount-XX**: مبالغ - واحد را حفظ کن
-- **percent-XX**: درصدها
 ## مثال:
 ورودی: ایران خودرو در اسفند 1402 حدود 23 هزار میلیارد درآمد کسب کرد که 4.58 درصد افزایش داشت.
 خروجی: company-01 در اسفند 1402 حدود amount-01 درآمد کسب کرد که percent-01 افزایش داشت.
-⚠️ یادآوری: فقط متن ناشناس‌شده."""
     def anonymize_text(self, text: str) -> Dict[str, Any]:
         """ناشناس‌سازی متن"""
@@ -94,28 +98,49 @@ class LocalCerebrasAnonymizer:
             return {"success": False, "error": "متن ورودی خالی است"}
         try:
             messages = [
-                {"role": "system", "content": self._create_system_prompt()},
-                {"role": "user", "content": text}
             ]
-            prompt = self._format_prompt(messages)
-            print(f"⏳ پردازش متن... (طول: {len(text)} کاراکتر)")
-            response = self.llm(
-                prompt,
-                max_tokens=self.config.max_tokens,
-                temperature=self.config.temperature,
-                top_p=self.config.top_p,
-                stop=["</s>", "[/INST]", "### User:"]
-            )
-            content = response["choices"][0]["text"].strip()
             # پاک‌سازی
-            content = self._remove_thinking_tags(content)
-            content = self._clean_markdown(content)
             content = self._clean_explanations(content)
             content = content.strip()
@@ -133,40 +158,8 @@ class LocalCerebrasAnonymizer:
         except Exception as e:
             return {"success": False, "error": f"خطا: {str(e)}"}
-    def _format_prompt(self, messages: list) -> str:
-        """فرمت prompt برای Qwen2.5"""
-        formatted = ""
-        for message in messages:
-            role = message["role"]
-            content = message["content"]
-            if role == "system":
-                formatted += f"{content}\n\n"
-            elif role == "user":
-                formatted += f"[INST] {content} [/INST]\n"
-            elif role == "assistant":
-                formatted += f"{content}\n\n"
-        return formatted
-    def _remove_thinking_tags(self, content: str) -> str:
-        content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
-        content = re.sub(r'</?think>', '', content)
-        return content.strip()
-    def _clean_markdown(self, content: str) -> str:
-        if "```" in content:
-            lines = content.split('\n')
-            clean_lines = []
-            skip = False
-            for line in lines:
-                if line.strip().startswith('```'):
-                    skip = not skip
-                    continue
-                if not skip:
-                    clean_lines.append(line)
-            content = '\n'.join(clean_lines)
-        return content
     def _clean_explanations(self, content: str) -> str:
         lines = content.split('\n')
         clean_lines = []
         for line in lines:
@@ -223,32 +216,22 @@ class LocalCerebrasAnonymizer:
                 unique_indices = sorted(list(set([int(x) for x in indices])))
                 if unique_indices[0] != 1:
                     validation_issues.append(f"⚠️ {entity_type} از 01 شروع نشده")
-                expected = list(range(1, len(unique_indices) + 1))
-                if unique_indices != expected:
-                    validation_issues.append(f"⚠️ {entity_type} پیوسته نیست")
         return {
             "is_valid": len(validation_issues) == 0,
-            "issues": validation_issues,
-            "entity_counts": {
-                "company": len(set(companies)),
-                "person": len(set(persons)),
-                "amount": len(set(amounts)),
-                "percent": len(set(percents))
-            }
         }
 # ========== رابط کاربری ==========
-anonymizer = LocalCerebrasAnonymizer()
 def create_interface():
     custom_css = """
     .gradio-container {
         font-family: 'Tahoma', 'Arial', sans-serif !important;
         direction: rtl;
-        max-width: 1400px;
         margin: 0 auto;
     }
     .info-box {
@@ -279,19 +262,19 @@ def create_interface():
         gr.Markdown("""
         # 🔒 سیستم ناشناس‌سازی متون مالی فارسی
-        ### 🚀 Qwen 2.5-32B (HuggingFace Spaces)
         """)
         gr.Markdown("""
         <div class="info-box">
-        📊 <strong>مدل:</strong> Qwen2.5-32B-Instruct-Q4_K_M<br>
         🌐 <strong>منبع:</strong> HuggingFace Hub<br>
-        💾 <strong>حجم:</strong> ~20 GB (Q4 quantization)<br>
-        ⚡ <strong>سرعت:</strong> بستگی به GPU Spaces دارد
         </div>
         """)
-        status_box = gr.Textbox(label="📋 وضعیت", interactive=False, value="⏳ درحال بارگذاری مدل...")
         load_btn = gr.Button("🤖 بارگذاری مدل", variant="primary", size="lg")
@@ -300,8 +283,8 @@ def create_interface():
                 input_text = gr.Textbox(
                     label="📝 متن ورودی",
                     placeholder="متن خود را اینجا وارد کنید...",
-                    lines=12,
-                    max_lines=25
                 )
                 with gr.Row():
@@ -311,8 +294,8 @@ def create_interface():
             with gr.Column(scale=1):
                 output_text = gr.Textbox(
                     label="🎯 متن ناشناس‌سازی شده",
-                    lines=12,
-                    max_lines=25,
                     elem_classes=["result-box"]
                 )
@@ -357,10 +340,10 @@ def create_interface():
             quality = result.get("quality_check", {})
             quality_md = f"""✅ **کنترل کیفیت:**
-{'✅ موفق' if quality.get('is_valid') else '❌ مشکل'}
 """
             if quality.get("issues"):
-                quality_md += "\n**مشکلات:**\n"
                 for issue in quality["issues"]:
                     quality_md += f"• {issue}\n"
@@ -412,7 +395,7 @@ def create_interface():
         gr.Examples(
             examples=[
                 ["ایران خودرو در اسفندماه حدود 23 هزار میلیارد تومان درآمد کسب کرد که 4.58 درصد افزایش داشت."],
-                ["مجمع پتروشیمی برگزار شد. وانیا نیک تدبیر را بازرس انتخاب کردند."],
             ],
             inputs=input_text,
             label="📚 مثال‌ها"

 import os
 from dataclasses import dataclass
 import re
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 @dataclass
 class LocalModelConfig:
+    """تنظیمات مدل Qwen2.5-32B"""
+    model_id: str = "Qwen/Qwen2.5-32B-Instruct"
+    max_tokens: int = 2048
     temperature: float = 0.3
     top_p: float = 0.8
+class QwenAnonymizer:
+    """سیستم ناشناس‌سازی متون مالی فارسی"""
     def __init__(self):
         self.config = LocalModelConfig()
+        self.tokenizer = None
+        self.model = None
         self.model_loaded = False
     def load_model(self) -> str:
         """بارگذاری مدل از HuggingFace"""
         try:
+            print(f"🤖 درحال دانلود و بارگذاری مدل...")
+            print(f"📦 Model: {self.config.model_id}")
+            # بررسی GPU
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f"💻 دستگاه: {device}")
+            # بارگذاری tokenizer
+            print("📝 بارگذاری tokenizer...")
+            self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_id)
+            # بارگذاری مدل
+            print("🧠 بارگذاری مدل...")
+            if device == "cuda":
+                # برای GPU
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.config.model_id,
+                    torch_dtype=torch.float16,
+                    device_map="auto",
+                    load_in_4bit=True,  # 4-bit quantization
+                )
+            else:
+                # برای CPU
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.config.model_id,
+                    torch_dtype=torch.float32,
+                    device_map="cpu",
+                )
+            self.model.eval()
             self.model_loaded = True
             print("✅ مدل با موفقیت بارگذاری شد\n")
+            return f"✅ مدل آماده است\n💻 دستگاه: {device}\n🧠 پارامترها: 32B"
         except Exception as e:
             error_msg = f"❌ خطا: {str(e)}"
 ⚠️ CRITICAL: در پاسخ نهایی خود، فقط و فقط متن ناشناس‌سازی شده را برگردانید، بدون هیچ توضیح، تحلیل، یا تگ اضافی.
 ## قوانین اندیس‌گذاری:
+1. ترتیب پیوسته: company-01, company-02, ... | person-01, person-02, ... | amount-01, amount-02, ... | percent-01, percent-02, ...
+2. ثبات: اگر "همراه اول" → company-01 شد، در تمام متن همان باشد
 ## انواع موجودیت:
+- company-XX: شرکت‌ها، بانک‌ها، سازمان‌ها
+- person-XX: نام و نام خانوادگی اشخاص
+- amount-XX: مبالغ - واحد را حفظ کن
+- percent-XX: درصدها
 ## مثال:
 ورودی: ایران خودرو در اسفند 1402 حدود 23 هزار میلیارد درآمد کسب کرد که 4.58 درصد افزایش داشت.
 خروجی: company-01 در اسفند 1402 حدود amount-01 درآمد کسب کرد که percent-01 افزایش داشت.
+⚠️ فقط متن ناشناس‌شده، بدون هیچ توضیح اضافی."""
     def anonymize_text(self, text: str) -> Dict[str, Any]:
         """ناشناس‌سازی متن"""
             return {"success": False, "error": "متن ورودی خالی است"}
         try:
+            print(f"⏳ پردازش متن...")
+            # ایجاد prompt
+            system_prompt = self._create_system_prompt()
+            user_prompt = text
+            # فرمت پیام برای Qwen
             messages = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
             ]
+            # تبدیل به متن
+            text_input = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            # Tokenize
+            inputs = self.tokenizer(text_input, return_tensors="pt").to(self.model.device)
+            # Generate
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=self.config.max_tokens,
+                    temperature=self.config.temperature,
+                    top_p=self.config.top_p,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                )
+            # Decode
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # استخراج جواب (بعد از assistant:)
+            if "assistant" in response:
+                content = response.split("assistant")[-1].strip()
+            else:
+                content = response.strip()
             # پاک‌سازی
             content = self._clean_explanations(content)
             content = content.strip()
         except Exception as e:
             return {"success": False, "error": f"خطا: {str(e)}"}
     def _clean_explanations(self, content: str) -> str:
+        """حذف توضیحات اضافی"""
         lines = content.split('\n')
         clean_lines = []
         for line in lines:
                 unique_indices = sorted(list(set([int(x) for x in indices])))
                 if unique_indices[0] != 1:
                     validation_issues.append(f"⚠️ {entity_type} از 01 شروع نشده")
         return {
             "is_valid": len(validation_issues) == 0,
+            "issues": validation_issues
         }
 # ========== رابط کاربری ==========
+anonymizer = QwenAnonymizer()
 def create_interface():
     custom_css = """
     .gradio-container {
         font-family: 'Tahoma', 'Arial', sans-serif !important;
         direction: rtl;
+        max-width: 1200px;
         margin: 0 auto;
     }
     .info-box {
         gr.Markdown("""
         # 🔒 سیستم ناشناس‌سازی متون مالی فارسی
+        ### 🚀 Qwen 2.5-32B (HuggingFace)
         """)
         gr.Markdown("""
         <div class="info-box">
+        📊 <strong>مدل:</strong> Qwen2.5-32B-Instruct<br>
         🌐 <strong>منبع:</strong> HuggingFace Hub<br>
+        💾 <strong>حجم:</strong> 32B Parameters<br>
+        ⚡ <strong>بهینه‌سازی:</strong> Transformers + PyTorch
         </div>
         """)
+        status_box = gr.Textbox(label="📋 وضعیت", interactive=False, value="⏳ آماده برای بارگذاری...")
         load_btn = gr.Button("🤖 بارگذاری مدل", variant="primary", size="lg")
                 input_text = gr.Textbox(
                     label="📝 متن ورودی",
                     placeholder="متن خود را اینجا وارد کنید...",
+                    lines=10,
+                    max_lines=20
                 )
                 with gr.Row():
             with gr.Column(scale=1):
                 output_text = gr.Textbox(
                     label="🎯 متن ناشناس‌سازی شده",
+                    lines=10,
+                    max_lines=20,
                     elem_classes=["result-box"]
                 )
             quality = result.get("quality_check", {})
             quality_md = f"""✅ **کنترل کیفیت:**
+{'✅ موفق' if quality.get('is_valid') else '⚠️ هشدار'}
 """
             if quality.get("issues"):
+                quality_md += "\n**نکات:**\n"
                 for issue in quality["issues"]:
                     quality_md += f"• {issue}\n"
         gr.Examples(
             examples=[
                 ["ایران خودرو در اسفندماه حدود 23 هزار میلیارد تومان درآمد کسب کرد که 4.58 درصد افزایش داشت."],
+                ["بانک ملی ایران و حسن روحانی در جلسه امروز بحث کردند."],
             ],
             inputs=input_text,
             label="📚 مثال‌ها"