Spaces:
Sleeping
Sleeping
| import json | |
| import gradio as gr | |
| from typing import Dict, Any | |
| import os | |
| from dataclasses import dataclass | |
| import re | |
| from llama_cpp import Llama | |
| from huggingface_hub import hf_hub_download | |
@dataclass
class LocalModelConfig:
    """Settings for the local GGUF model (Qwen2.5-32B).

    Declared as a dataclass so individual settings can be overridden per
    instance, e.g. ``LocalModelConfig(n_gpu_layers=0)``, while
    ``LocalModelConfig()`` keeps yielding the defaults below.
    """

    # HuggingFace Hub repository and file to download.
    repo_id: str = "Qwen/Qwen2.5-32B-Instruct-GGUF"
    filename: str = "qwen2.5-32b-instruct-q4_k_m.gguf"
    # Sampling settings passed to the completion call.
    # NOTE(review): max_tokens is larger than n_ctx; the effective generation
    # length is presumably bounded by the context window — confirm.
    max_tokens: int = 8000
    temperature: float = 0.3
    top_p: float = 0.8
    # Model-loading settings passed to the Llama constructor.
    n_ctx: int = 4096
    n_threads: int = 4  # kept low for Spaces
    n_gpu_layers: int = 50
class LocalCerebrasAnonymizer:
    """Anonymize Persian financial text with a locally loaded GGUF model.

    The model is asked to replace companies, persons, amounts and percentages
    with indexed placeholders (``company-01``, ``person-02``, ...). The raw
    completion is then cleaned up and the placeholders are counted and
    checked for contiguous numbering.
    """

    # Placeholder kinds the system prompt asks the model to emit.
    _ENTITY_KINDS = ("company", "person", "amount", "percent")

    # Substrings that mark a line as model chatter or a leaked role marker;
    # any line containing one of them is dropped entirely.
    _CHATTER_MARKERS = ('okay', 'let me', 'here is', 'assistant', '[inst]')

    # Persian "output:" / "result:" / "answer:" labels at the start of a line.
    # Only the label is stripped; the words themselves are common in financial
    # prose, so a plain substring match would delete legitimate content.
    _LABEL_PREFIX = re.compile(r'^\s*(?:خروجی|نتیجه|پاسخ)\s*:\s*')

    def __init__(self):
        """Create the anonymizer with default settings and no model loaded."""
        self.config = LocalModelConfig()
        self.llm = None
        self.model_loaded = False

    def load_model(self) -> str:
        """Download the GGUF file from HuggingFace and load it.

        Returns:
            A user-facing status message: a ready message on success (or when
            the model is already loaded), otherwise the error text. Errors
            are reported through the return value, not raised.
        """
        # Loading is expensive; a repeated click must not load a second copy.
        if self.model_loaded:
            return "✅ مدل آماده است"
        try:
            print("🤖 درحال دانلود مدل از HuggingFace...")
            print(f"📦 Repo: {self.config.repo_id}")
            print(f"📄 Filename: {self.config.filename}")
            # Download the model file.
            model_path = hf_hub_download(
                repo_id=self.config.repo_id,
                filename=self.config.filename,
                local_dir="./models",
                local_dir_use_symlinks=False
            )
            print(f"✅ مدل دانلود شد: {model_path}")
            print("🤖 درحال بارگذاری مدل...")
            self.llm = Llama(
                model_path=model_path,
                n_ctx=self.config.n_ctx,
                n_threads=self.config.n_threads,
                n_gpu_layers=self.config.n_gpu_layers,
                verbose=False
            )
            self.model_loaded = True
            print("✅ مدل با موفقیت بارگذاری شد\n")
            return "✅ مدل آماده است"
        except Exception as e:
            # UI boundary: surface the failure as a status string.
            error_msg = f"❌ خطا: {str(e)}"
            print(error_msg)
            return error_msg

    def _create_system_prompt(self) -> str:
        """Return the system instructions (in Persian) sent with every request."""
        return """شما یک سیستم ناشناسسازی متون مالی فارسی هستید.
⚠️ CRITICAL: در پاسخ نهایی خود، فقط و فقط متن ناشناسسازی شده را برگردانید، بدون هیچ توضیح، تحلیل، یا تگ اضافی.
## قوانین اندیسگذاری:
1. **ترتیب پیوسته**: company-01, company-02, ... | person-01, person-02, ... | amount-01, amount-02, ... | percent-01, percent-02, ...
2. **ثبات**: اگر "همراه اول" → company-01 شد، در تمام متن همان باشد
3. **نام مستعار**: "فاما" = "فولاد مبارکه" → هر دو company-01
## انواع موجودیت:
- **company-XX**: شرکتها، بانکها، سازمانها
- **person-XX**: نام و نام خانوادگی اشخاص
- **amount-XX**: مبالغ - واحد را حفظ کن
- **percent-XX**: درصدها
## مثال:
ورودی: ایران خودرو در اسفند 1402 حدود 23 هزار میلیارد درآمد کسب کرد که 4.58 درصد افزایش داشت.
خروجی: company-01 در اسفند 1402 حدود amount-01 درآمد کسب کرد که percent-01 افزایش داشت.
⚠️ یادآوری: فقط متن ناشناسشده."""

    def anonymize_text(self, text: str) -> Dict[str, Any]:
        """Anonymize ``text`` and return the result with its analysis.

        Args:
            text: The raw input text.

        Returns:
            On success a dict with ``success=True``, ``anonymized_text``,
            ``entities``, ``statistics``, ``detailed_analysis`` and
            ``quality_check``. On failure ``{"success": False, "error": ...}``;
            exceptions raised during generation are reported this way too.
        """
        if not self.model_loaded:
            return {"success": False, "error": "مدل بارگذاری نشده است"}
        # ``not text`` also covers None, which would otherwise crash on .strip().
        if not text or not text.strip():
            return {"success": False, "error": "متن ورودی خالی است"}
        try:
            messages = [
                {"role": "system", "content": self._create_system_prompt()},
                {"role": "user", "content": text}
            ]
            prompt = self._format_prompt(messages)
            print(f"⏳ پردازش متن... (طول: {len(text)} کاراکتر)")
            response = self.llm(
                prompt,
                max_tokens=self.config.max_tokens,
                temperature=self.config.temperature,
                top_p=self.config.top_p,
                # ChatML turn delimiters, matching _format_prompt.
                stop=["<|im_end|>", "<|im_start|>", "<|endoftext|>"]
            )
            content = response["choices"][0]["text"].strip()
            # Clean-up passes.
            content = self._remove_thinking_tags(content)
            content = self._clean_markdown(content)
            content = self._clean_explanations(content)
            content = content.strip()
            analysis = self._analyze_anonymized_text(content)
            return {
                "success": True,
                "anonymized_text": content,
                "entities": analysis["entities"],
                "statistics": analysis["statistics"],
                "detailed_analysis": analysis["detailed_analysis"],
                "quality_check": self._validate_anonymized_text(content)
            }
        except Exception as e:
            return {"success": False, "error": f"خطا: {str(e)}"}

    def _format_prompt(self, messages: list) -> str:
        """Render chat messages as a ChatML prompt for Qwen2.5.

        Each system/user/assistant message becomes
        ``<|im_start|>{role}\\n{content}<|im_end|>\\n``; messages with any
        other role are ignored. The prompt ends with an open assistant turn
        so the model continues with its answer.
        """
        turns = [
            f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
            for message in messages
            if message["role"] in ("system", "user", "assistant")
        ]
        turns.append("<|im_start|>assistant\n")
        return "".join(turns)

    def _remove_thinking_tags(self, content: str) -> str:
        """Remove ``<think>...</think>`` sections and any stray think tags."""
        content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
        content = re.sub(r'</?think>', '', content)
        return content.strip()

    def _clean_markdown(self, content: str) -> str:
        """Strip Markdown code-fence lines while keeping the fenced text.

        Models often wrap the whole answer in a fence; removing the fenced
        body as well would throw the answer away.
        """
        if "```" not in content:
            return content
        return '\n'.join(
            line for line in content.split('\n')
            if not line.strip().startswith('```')
        )

    def _clean_explanations(self, content: str) -> str:
        """Drop chatter lines and strip leading Persian answer labels.

        A line containing any of ``_CHATTER_MARKERS`` (case-insensitive) is
        removed. A leading label such as ``خروجی:`` is stripped and the rest
        of the line kept; a line consisting only of a label is removed.
        """
        kept = []
        for line in content.split('\n'):
            lowered = line.lower()
            if any(marker in lowered for marker in self._CHATTER_MARKERS):
                continue
            unlabeled = self._LABEL_PREFIX.sub('', line, count=1)
            if unlabeled != line and not unlabeled.strip():
                # The line was nothing but a label.
                continue
            kept.append(unlabeled)
        return '\n'.join(kept).strip()

    def _extract_indices(self, text: str) -> Dict[str, list]:
        """Return, per entity kind, every placeholder index found in ``text``.

        Indices are the digit strings as written (e.g. ``"01"``), in order of
        appearance and including repeats.
        """
        return {
            kind: re.findall(rf'{kind}-(\d+)', text)
            for kind in self._ENTITY_KINDS
        }

    def _analyze_anonymized_text(self, text: str) -> Dict[str, Any]:
        """Count the placeholders and a few preserved features in ``text``.

        Returns:
            A dict with ``statistics`` (distinct indices per kind, plus
            ``total`` which counts every occurrence including repeats),
            ``entities`` (distinct index strings per kind, sorted
            numerically) and ``detailed_analysis`` (regex match counts for
            dates, financial indicators and units).
        """
        found = self._extract_indices(text)
        statistics = {kind: len(set(indices)) for kind, indices in found.items()}
        statistics["total"] = sum(len(indices) for indices in found.values())
        entities = {
            "companies": sorted(set(found["company"]), key=int),
            "persons": sorted(set(found["person"]), key=int),
            "amounts": sorted(set(found["amount"]), key=int),
            "percents": sorted(set(found["percent"]), key=int)
        }
        detailed_analysis = {
            "preserved_dates": len(re.findall(r'\d{4}/\d{1,2}/\d{1,2}|\d{1,2}\s+\w+\s+\d{4}', text)),
            "financial_indicators": len(re.findall(r'\b(EPS|P/E|ARPU|NPL|ROE|ROA)\b', text)),
            "units_preserved": len(re.findall(r'(میلیارد|میلیون|هزار|تومان|ریال|درهم|دلار)', text))
        }
        return {
            "statistics": statistics,
            "entities": entities,
            "detailed_analysis": detailed_analysis
        }

    def _validate_anonymized_text(self, text: str) -> Dict[str, Any]:
        """Check that each kind's indices start at 1 and have no gaps.

        Returns:
            A dict with ``is_valid``, ``issues`` (warning strings) and
            ``entity_counts`` (distinct indices per kind). A kind that does
            not start at 1 yields both the "start" and the "not contiguous"
            warning.
        """
        found = self._extract_indices(text)
        validation_issues = []
        for entity_type in self._ENTITY_KINDS:
            indices = found[entity_type]
            if not indices:
                continue
            unique_indices = sorted({int(index) for index in indices})
            if unique_indices[0] != 1:
                validation_issues.append(f"⚠️ {entity_type} از 01 شروع نشده")
            if unique_indices != list(range(1, len(unique_indices) + 1)):
                validation_issues.append(f"⚠️ {entity_type} پیوسته نیست")
        return {
            "is_valid": len(validation_issues) == 0,
            "issues": validation_issues,
            "entity_counts": {
                kind: len(set(found[kind])) for kind in self._ENTITY_KINDS
            }
        }
# ========== User interface ==========
# Module-level instance used by the Gradio callbacks below. Constructing it
# does not load the model (llm starts as None); that happens in load_model().
anonymizer = LocalCerebrasAnonymizer()
def create_interface():
    """Build the Gradio Blocks UI for the anonymizer.

    The page shows a status box and a "load model" button. The input/output
    rows start hidden and are revealed only after the model has actually
    loaded. Returns the ``gr.Blocks`` object; the caller launches it.
    """
    custom_css = """
.gradio-container {
font-family: 'Tahoma', 'Arial', sans-serif !important;
direction: rtl;
max-width: 1400px;
margin: 0 auto;
}
.info-box {
background-color: #e3f2fd;
border: 2px solid #2196F3;
border-radius: 12px;
padding: 15px;
color: #0d47a1;
margin: 10px 0;
}
.local-box {
background-color: #e8f5e9;
border: 2px solid #4caf50;
border-radius: 12px;
padding: 15px;
color: #1b5e20;
margin: 10px 0;
}
.result-box {
background-color: #f8f9fa;
border: 2px solid #e9ecef;
border-radius: 12px;
padding: 20px;
}
"""
    with gr.Blocks(css=custom_css, title="ناشناسساز Qwen2.5", theme=gr.themes.Soft()) as interface:
        # Markdown bodies stay at column 0: indented Markdown lines would be
        # rendered as code blocks.
        gr.Markdown("""
# 🔒 سیستم ناشناسسازی متون مالی فارسی
### 🚀 Qwen 2.5-32B (HuggingFace Spaces)
""")
        gr.Markdown("""
<div class="info-box">
📊 <strong>مدل:</strong> Qwen2.5-32B-Instruct-Q4_K_M<br>
🌐 <strong>منبع:</strong> HuggingFace Hub<br>
💾 <strong>حجم:</strong> ~20 GB (Q4 quantization)<br>
⚡ <strong>سرعت:</strong> بستگی به GPU Spaces دارد
</div>
""")
        status_box = gr.Textbox(label="📋 وضعیت", interactive=False, value="⏳ درحال بارگذاری مدل...")
        load_btn = gr.Button("🤖 بارگذاری مدل", variant="primary", size="lg")

        with gr.Row(visible=False) as input_section:
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="📝 متن ورودی",
                    placeholder="متن خود را اینجا وارد کنید...",
                    lines=12,
                    max_lines=25
                )
                with gr.Row():
                    anonymize_btn = gr.Button("🔒 ناشناسسازی", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")
            with gr.Column(scale=1):
                output_text = gr.Textbox(
                    label="🎯 متن ناشناسسازی شده",
                    lines=12,
                    max_lines=25,
                    elem_classes=["result-box"]
                )

        with gr.Row(visible=False) as output_section:
            with gr.Column():
                statistics_output = gr.Markdown(label="📊 آمار")
            with gr.Column():
                quality_output = gr.Markdown(label="✅ کیفیت")

        with gr.Row(visible=False) as output_section2:
            entities_output = gr.Markdown(label="🏷️ موجودیتها")
            detailed_output = gr.Markdown(label="🔍 تحلیل")

        def load_model_action():
            """Load the model and reveal the work area only if loading succeeded."""
            msg = anonymizer.load_model()
            # On failure the rows stay hidden; the status box carries the error.
            ready = anonymizer.model_loaded
            return (
                gr.Textbox(value=msg),
                gr.Row(visible=ready),
                gr.Row(visible=ready),
                gr.Row(visible=ready)
            )

        def process_text(text):
            """Anonymize ``text`` and render the result panels.

            Returns a 6-tuple in the order of the ``anonymize_btn`` outputs:
            anonymized text, statistics, quality, entities, detailed analysis
            and status. On failure the error is shown in the statistics panel
            and mirrored to the status box instead of blanking it.
            """
            if not text or not text.strip():
                error_md = "❌ متن خالی است"
                return ("", error_md, "", "", "", error_md)
            result = anonymizer.anonymize_text(text)
            if not result["success"]:
                error_md = f"❌ {result['error']}"
                return ("", error_md, "", "", "", error_md)

            stats = result.get("statistics", {})
            stats_md = f"""📊 **آمار:**
🏢 شرکت: {stats.get('company', 0)}
👤 اشخاص: {stats.get('person', 0)}
💰 مبالغ: {stats.get('amount', 0)}
📊 درصدها: {stats.get('percent', 0)}
🔢 کل: {stats.get('total', 0)}"""

            quality = result.get("quality_check", {})
            quality_md = f"""✅ **کنترل کیفیت:**
{'✅ موفق' if quality.get('is_valid') else '❌ مشکل'}
"""
            if quality.get("issues"):
                quality_md += "\n**مشکلات:**\n"
                for issue in quality["issues"]:
                    quality_md += f"• {issue}\n"

            # Entity lists hold bare index strings ("01", "02", ...); the kind
            # prefix is re-attached here for display.
            entities = result.get("entities", {})
            entities_md = "🏷️ **موجودیتها:**\n"
            if entities.get("companies"):
                entities_md += f"\n🏢 company-{', company-'.join(entities['companies'])}"
            if entities.get("persons"):
                entities_md += f"\n👤 person-{', person-'.join(entities['persons'])}"
            if entities.get("amounts"):
                entities_md += f"\n💰 amount-{', amount-'.join(entities['amounts'])}"
            if entities.get("percents"):
                entities_md += f"\n📊 percent-{', percent-'.join(entities['percents'])}"

            detailed = result.get("detailed_analysis", {})
            detailed_md = f"""🔍 **تحلیل:**
📅 تاریخ: {detailed.get('preserved_dates', 0)}
📈 شاخص: {detailed.get('financial_indicators', 0)}
📏 واحد: {detailed.get('units_preserved', 0)}"""

            return (
                result["anonymized_text"],
                stats_md,
                quality_md,
                entities_md,
                detailed_md,
                "✅ موفق"
            )

        def clear_all():
            """Reset the input box and every result panel to empty."""
            return "", "", "", "", "", ""

        load_btn.click(
            fn=load_model_action,
            outputs=[status_box, input_section, output_section, output_section2]
        )
        anonymize_btn.click(
            fn=process_text,
            inputs=[input_text],
            outputs=[output_text, statistics_output, quality_output, entities_output, detailed_output, status_box]
        )
        clear_btn.click(
            fn=clear_all,
            outputs=[input_text, output_text, statistics_output, quality_output, entities_output, detailed_output]
        )
        gr.Examples(
            examples=[
                ["ایران خودرو در اسفندماه حدود 23 هزار میلیارد تومان درآمد کسب کرد که 4.58 درصد افزایش داشت."],
                ["مجمع پتروشیمی برگزار شد. وانیا نیک تدبیر را بازرس انتخاب کردند."],
            ],
            inputs=input_text,
            label="📚 مثالها"
        )
    return interface
# Build and launch the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()