Spaces:

leilaghomashchi
/

Benchmark-data-anonymization

Sleeping

App Files Files Community

leilaghomashchi commited on Sep 22, 2025

Commit

a8a1e42

verified ·

1 Parent(s): 2e7fc4a

Upload anonymization_benchmark (3).py

Browse files

Files changed (1) hide show

anonymization_benchmark (3).py +728 -0

anonymization_benchmark (3).py ADDED Viewed

	@@ -0,0 +1,728 @@

+import pandas as pd
+import re
+import numpy as np
+import json
+from typing import Dict, List, Any, Tuple
+import gradio as gr
+from pathlib import Path
+import plotly.graph_objects as go
+import plotly.express as px
+from dataclasses import dataclass
+from datetime import datetime
+@dataclass
+class BenchmarkMetrics:
+    """کلاس متریک‌های بنچمارک"""
+    model_name: str
+    total_texts: int
+    avg_original_length: float
+    avg_anonymized_length: float
+    company_entities: int
+    person_entities: int
+    amount_entities: int
+    percent_entities: int
+    group_entities: int
+    total_entities: int
+    correct_indexing_rate: float
+    consistency_score: float
+    structure_preservation_score: float
+    entity_coverage_rate: float
+    quality_score: float
+class AnonymizationBenchmark:
+    """کلاس اصلی بنچمارک ناشناس‌سازی"""
+    def __init__(self):
+        self.models_data = {}
+        self.benchmark_results = {}
+    def load_csv_files(self, chatgpt_file, grok_file, llama_file):
+        """بارگذاری فایل‌های CSV"""
+        try:
+            # بارگذاری فایل‌ها
+            chatgpt_df = pd.read_csv(chatgpt_file)
+            grok_df = pd.read_csv(grok_file)
+            llama_df = pd.read_csv(llama_file)
+            # بررسی ستون‌ها
+            required_columns = ['original_text', 'anonymized_text']
+            for df_name, df in [('ChatGPT', chatgpt_df), ('Grok', grok_df), ('Llama', llama_df)]:
+                if not all(col in df.columns for col in required_columns):
+                    raise ValueError(f"فایل {df_name} فاقد ستون‌های مورد نیاز است")
+            self.models_data = {
+                'ChatGPT': chatgpt_df,
+                'Grok': grok_df,
+                'Llama-3.1-8B': llama_df
+            }
+            return True, "فایل‌ها با موفقیت بارگذاری شدند"
+        except Exception as e:
+            return False, f"خطا در بارگذاری فایل‌ها: {str(e)}"
+    def extract_entities_from_text(self, text: str) -> Dict[str, List[str]]:
+        """استخراج موجودیت‌ها از متن"""
+        entities = {
+            'companies': re.findall(r'company-(\d+)', text),
+            'persons': re.findall(r'person-(\d+)', text),
+            'amounts': re.findall(r'amount-(\d+)', text),
+            'percents': re.findall(r'percent-(\d+)', text),
+            'groups': re.findall(r'group-(\d+)', text)
+        }
+        return entities
+    def count_original_entities(self, text: str) -> int:
+        """تخمین تعداد موجودیت‌های قابل ناشناس‌سازی در متن اصلی"""
+        # الگوهای شناسایی موجودیت‌ها در متن فارسی
+        patterns = [
+            r'[۰-۹]+(?:\.[۰-۹]+)?\s*(?:میلیارد|میلیون|هزار)?\s*(?:تومان|ریال|دلار|یورو)',  # اعداد پولی
+            r'[۰-۹]+(?:\.[۰-۹]+)?\s*درصد',  # درصدها
+            r'\b[آ-ی\s]{2,30}\b(?:\s*(?:شرکت|بانک|گروه|سازمان))',  # شرکت‌ها
+            r'\b[آ-ی\s]{2,20}\b(?:\s*(?:مدیرعامل|رئیس|مدیر))',  # اشخاص
+            r'[۰-۹]+(?:\.[۰-۹]+)?(?:\s*(?:میلیون|میلیارد|هزار))?',  # سایر اعداد
+        ]
+        total_entities = 0
+        for pattern in patterns:
+            matches = re.findall(pattern, text)
+            total_entities += len(matches)
+        return max(total_entities, 1)  # حداقل 1 برای جلوگیری از تقسیم بر صفر
+    def check_indexing_correctness(self, entities: Dict[str, List[str]]) -> float:
+        """بررسی درستی اندیس‌گذاری"""
+        total_checks = 0
+        passed_checks = 0
+        for entity_type, indices in entities.items():
+            if not indices:
+                continue
+            total_checks += 1
+            unique_indices = sorted([int(x) for x in set(indices)])
+            # بررسی شروع از 1
+            if unique_indices[0] == 1:
+                passed_checks += 0.5
+            # بررسی پیوستگی
+            expected = list(range(1, len(unique_indices) + 1))
+            if unique_indices == expected:
+                passed_checks += 0.5
+        return passed_checks / total_checks if total_checks > 0 else 0.0
+    def calculate_consistency_score(self, anonymized_texts: List[str]) -> float:
+        """محاسبه امتیاز ثبات در استفاده از شناسه‌ها"""
+        # این متریک پیچیده‌تر است و نیاز به تحلیل عمیق‌تری دارد
+        # در اینجا یک تقریب ساده ارائه می‌دهم
+        consistency_scores = []
+        for text in anonymized_texts:
+            entities = self.extract_entities_from_text(text)
+            total_entities = sum(len(v) for v in entities.values())
+            unique_entities = sum(len(set(v)) for v in entities.values())
+            if total_entities > 0:
+                consistency = unique_entities / total_entities
+                consistency_scores.append(consistency)
+        return np.mean(consistency_scores) if consistency_scores else 0.0
+    def calculate_structure_preservation(self, original_text: str, anonymized_text: str) -> float:
+        """محاسبه امتیاز حفظ ساختار"""
+        # بررسی حفظ کلمات کلیدی و ساختار جمله
+        # کلمات مهم که باید حفظ شوند
+        important_words = [
+            'میلیارد', 'میلیون', 'تومان', 'ریال', 'درصد', 'سود', 'زیان',
+            'مدیرعامل', 'شرکت', 'بانک', 'درآمد', 'سال', 'ماه'
+        ]
+        score = 0.0
+        total_checks = len(important_words)
+        for word in important_words:
+            if word in original_text and word in anonymized_text:
+                score += 1.0
+            elif word not in original_text:
+                total_checks -= 1
+        # بررسی حفظ تعداد کلمات (تقریبی)
+        original_words = len(original_text.split())
+        anonymized_words = len(anonymized_text.split())
+        if original_words > 0:
+            word_ratio = min(anonymized_words / original_words, 1.0)
+            score += word_ratio * 2  # وزن بیشتر برای حفظ تعداد کلمات
+            total_checks += 2
+        return score / total_checks if total_checks > 0 else 0.0
+    def calculate_entity_coverage(self, original_text: str, anonymized_text: str) -> float:
+        """محاسبه پوشش موجودیت‌ها"""
+        original_entity_count = self.count_original_entities(original_text)
+        entities = self.extract_entities_from_text(anonymized_text)
+        anonymized_entity_count = sum(len(set(v)) for v in entities.values())
+        return min(anonymized_entity_count / original_entity_count, 1.0)
+    def calculate_overall_quality(self, metrics: Dict[str, float]) -> float:
+        """محاسبه امتیاز کلی کیفیت"""
+        weights = {
+            'correct_indexing_rate': 0.3,
+            'consistency_score': 0.2,
+            'structure_preservation_score': 0.25,
+            'entity_coverage_rate': 0.25
+        }
+        quality_score = 0.0
+        for metric, weight in weights.items():
+            quality_score += metrics.get(metric, 0.0) * weight
+        return quality_score
+    def analyze_model(self, model_name: str, df: pd.DataFrame) -> BenchmarkMetrics:
+        """تحلیل یک مدل"""
+        print(f"تحلیل مدل {model_name}...")
+        total_texts = len(df)
+        # محاسبه طول متن‌ها
+        avg_original_length = df['original_text'].str.len().mean()
+        avg_anonymized_length = df['anonymized_text'].str.len().mean()
+        # استخراج موجودیت‌ها
+        all_entities = {'companies': [], 'persons': [], 'amounts': [], 'percents': [], 'groups': []}
+        indexing_scores = []
+        consistency_scores = []
+        structure_scores = []
+        coverage_scores = []
+        for _, row in df.iterrows():
+            original = str(row['original_text'])
+            anonymized = str(row['anonymized_text'])
+            # استخراج موجودیت‌ها
+            entities = self.extract_entities_from_text(anonymized)
+            for key in all_entities.keys():
+                all_entities[key].extend(entities[key])
+            # محاسبه متریک‌ها
+            indexing_scores.append(self.check_indexing_correctness(entities))
+            structure_scores.append(self.calculate_structure_preservation(original, anonymized))
+            coverage_scores.append(self.calculate_entity_coverage(original, anonymized))
+        # محاسبه ثبات کلی
+        consistency_score = self.calculate_consistency_score(df['anonymized_text'].tolist())
+        # آمار موجودیت‌ها
+        entity_counts = {
+            'company_entities': len(set(all_entities['companies'])),
+            'person_entities': len(set(all_entities['persons'])),
+            'amount_entities': len(set(all_entities['amounts'])),
+            'percent_entities': len(set(all_entities['percents'])),
+            'group_entities': len(set(all_entities['groups']))
+        }
+        # محاسبه امتیازهای میانگین
+        avg_metrics = {
+            'correct_indexing_rate': np.mean(indexing_scores),
+            'consistency_score': consistency_score,
+            'structure_preservation_score': np.mean(structure_scores),
+            'entity_coverage_rate': np.mean(coverage_scores)
+        }
+        # امتیاز کلی کیفیت
+        quality_score = self.calculate_overall_quality(avg_metrics)
+        return BenchmarkMetrics(
+            model_name=model_name,
+            total_texts=total_texts,
+            avg_original_length=round(avg_original_length, 2),
+            avg_anonymized_length=round(avg_anonymized_length, 2),
+            total_entities=sum(entity_counts.values()),
+            quality_score=round(quality_score, 3),
+            **entity_counts,
+            **{k: round(v, 3) for k, v in avg_metrics.items()}
+        )
+    def run_benchmark(self) -> Tuple[bool, str, str]:
+        """اجرای بنچمارک کامل"""
+        if not self.models_data:
+            return False, "ابتدا فایل‌ها را بارگذاری کنید", ""
+        try:
+            results = {}
+            # تحلیل هر مدل
+            for model_name, df in self.models_data.items():
+                results[model_name] = self.analyze_model(model_name, df)
+            self.benchmark_results = results
+            # تولید HTML
+            html_report = self.generate_html_report()
+            return True, "بنچمارک با موفقیت انجام شد", html_report
+        except Exception as e:
+            return False, f"خطا در اجرای بنچمارک: {str(e)}", ""
+    def generate_comparison_table(self) -> str:
+        """تولید جدول مقایسه"""
+        if not self.benchmark_results:
+            return "<p>هنوز بنچمارکی انجام نشده است</p>"
+        # آماده‌سازی داده‌ها برای جدول
+        table_data = []
+        for model_name, metrics in self.benchmark_results.items():
+            table_data.append({
+                'مدل': model_name,
+                'تعداد متن‌ها': metrics.total_texts,
+                'میانگین طول اصلی': f"{metrics.avg_original_length:.0f}",
+                'میانگین طول ناشناس': f"{metrics.avg_anonymized_length:.0f}",
+                'شرکت‌ها': metrics.company_entities,
+                'اشخاص': metrics.person_entities,
+                'مبالغ': metrics.amount_entities,
+                'درصدها': metrics.percent_entities,
+                'گروه‌ها': metrics.group_entities,
+                'کل موجودیت‌ها': metrics.total_entities,
+                'درستی اندیس (%)': f"{metrics.correct_indexing_rate*100:.1f}",
+                'ثبات (%)': f"{metrics.consistency_score*100:.1f}",
+                'حفظ ساختار (%)': f"{metrics.structure_preservation_score*100:.1f}",
+                'پوشش موجودیت (%)': f"{metrics.entity_coverage_rate*100:.1f}",
+                '🏆 امتیاز کلی': f"{metrics.quality_score:.3f}"
+            })
+        # تولید HTML جدول
+        html = """
+        <div style="overflow-x: auto; margin: 20px 0;">
+            <table style="width: 100%; border-collapse: collapse; font-family: 'Tahoma', sans-serif;">
+                <thead>
+                    <tr style="background-color: #4CAF50; color: white;">
+        """
+        # سرستون‌ها
+        headers = list(table_data[0].keys())
+        for header in headers:
+            html += f"<th style='border: 1px solid #ddd; padding: 12px; text-align: center;'>{header}</th>"
+        html += "</tr></thead><tbody>"
+        # ردیف‌ها
+        for i, row in enumerate(table_data):
+            bg_color = "#f2f2f2" if i % 2 == 0 else "white"
+            html += f"<tr style='background-color: {bg_color};'>"
+            for j, (key, value) in enumerate(row.items()):
+                # رنگ‌بندی ستون امتیاز کلی
+                if key == '🏆 امتیاز کلی':
+                    score = float(value)
+                    if score >= 0.8:
+                        color = "#4CAF50"  # سبز
+                    elif score >= 0.6:
+                        color = "#FF9800"  # نارنجی
+                    else:
+                        color = "#F44336"  # قرمز
+                    html += f"<td style='border: 1px solid #ddd; padding: 12px; text-align: center; font-weight: bold; color: {color};'>{value}</td>"
+                else:
+                    html += f"<td style='border: 1px solid #ddd; padding: 12px; text-align: center;'>{value}</td>"
+            html += "</tr>"
+        html += "</tbody></table></div>"
+        return html
+    def generate_charts(self) -> str:
+        """تولید نمودارها"""
+        if not self.benchmark_results:
+            return ""
+        models = list(self.benchmark_results.keys())
+        quality_scores = [self.benchmark_results[model].quality_score for model in models]
+        # نم��دار امتیاز کلی
+        chart_html = """
+        <div style="margin: 20px 0;">
+            <h3 style="text-align: center; color: #333;">مقایسه امتیاز کلی مدل‌ها</h3>
+            <div style="display: flex; justify-content: center; align-items: end; height: 300px; gap: 50px; background-color: #f9f9f9; padding: 20px; border-radius: 10px;">
+        """
+        colors = ['#4CAF50', '#2196F3', '#FF9800']
+        for i, (model, score) in enumerate(zip(models, quality_scores)):
+            height = score * 200  # ارتفاع بر اساس امتیاز
+            chart_html += f"""
+                <div style="text-align: center;">
+                    <div style="background-color: {colors[i]}; width: 80px; height: {height}px; border-radius: 5px; margin-bottom: 10px; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold;">
+                        {score:.3f}
+                    </div>
+                    <div style="font-weight: bold; color: #333;">{model}</div>
+                </div>
+            """
+        chart_html += "</div></div>"
+        return chart_html
+    def generate_html_report(self) -> str:
+        """تولید گزارش HTML کامل"""
+        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        html = f"""
+        <!DOCTYPE html>
+        <html lang="fa" dir="rtl">
+        <head>
+            <meta charset="UTF-8">
+            <meta name="viewport" content="width=device-width, initial-scale=1.0">
+            <title>گزارش بنچمارک ناشناس‌سازی</title>
+            <style>
+                * {{
+                    margin: 0;
+                    padding: 0;
+                    box-sizing: border-box;
+                }}
+                body {{
+                    font-family: 'Tahoma', 'Arial', sans-serif;
+                    line-height: 1.6;
+                    color: #333;
+                    background-color: #f5f5f5;
+                    padding: 20px;
+                }}
+                .container {{
+                    max-width: 1400px;
+                    margin: 0 auto;
+                    background-color: white;
+                    border-radius: 10px;
+                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+                    overflow: hidden;
+                }}
+                .header {{
+                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+                    color: white;
+                    padding: 30px;
+                    text-align: center;
+                }}
+                .header h1 {{
+                    font-size: 2.5em;
+                    margin-bottom: 10px;
+                }}
+                .header p {{
+                    font-size: 1.2em;
+                    opacity: 0.9;
+                }}
+                .content {{
+                    padding: 30px;
+                }}
+                .summary {{
+                    background-color: #e8f5e8;
+                    border-right: 5px solid #4CAF50;
+                    padding: 20px;
+                    margin-bottom: 30px;
+                    border-radius: 5px;
+                }}
+                .section {{
+                    margin-bottom: 40px;
+                }}
+                .section h2 {{
+                    color: #333;
+                    border-bottom: 2px solid #4CAF50;
+                    padding-bottom: 10px;
+                    margin-bottom: 20px;
+                }}
+                .metrics-grid {{
+                    display: grid;
+                    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
+                    gap: 20px;
+                    margin-bottom: 30px;
+                }}
+                .metric-card {{
+                    background-color: #f8f9fa;
+                    border: 1px solid #dee2e6;
+                    border-radius: 8px;
+                    padding: 20px;
+                    text-align: center;
+                    transition: transform 0.2s;
+                }}
+                .metric-card:hover {{
+                    transform: translateY(-5px);
+                    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
+                }}
+                .metric-number {{
+                    font-size: 2em;
+                    font-weight: bold;
+                    color: #4CAF50;
+                    margin-bottom: 5px;
+                }}
+                .metric-label {{
+                    color: #666;
+                    font-size: 0.9em;
+                }}
+                .footer {{
+                    background-color: #f8f9fa;
+                    padding: 20px;
+                    text-align: center;
+                    color: #666;
+                    border-top: 1px solid #dee2e6;
+                }}
+            </style>
+        </head>
+        <body>
+            <div class="container">
+                <div class="header">
+                    <h1>🏆 گزارش بنچمارک ناشنا��‌سازی</h1>
+                    <p>مقایسه عملکرد مدل‌های ChatGPT، Grok و Llama-3.1-8B</p>
+                </div>
+                <div class="content">
+                    <div class="summary">
+                        <h3>📋 خلاصه نتایج</h3>
+                        <p>این گزارش نتایج بنچمارک سه مدل مختلف برای ناشناس‌سازی متون فارسی را نشان می‌دهد.
+                        متریک‌های ارزیابی شامل درستی اندیس‌گذاری، ثبات استفاده از شناسه‌ها، حفظ ساختار متن و پوشش موجودیت‌ها می‌باشد.</p>
+                    </div>
+                    <div class="section">
+                        <h2>📊 جدول مقایسه کامل</h2>
+                        {self.generate_comparison_table()}
+                    </div>
+                    <div class="section">
+                        <h2>📈 نمودار مقایسه</h2>
+                        {self.generate_charts()}
+                    </div>
+                    <div class="section">
+                        <h2>🔍 تحلیل تفصیلی</h2>
+                        {self.generate_detailed_analysis()}
+                    </div>
+                </div>
+                <div class="footer">
+                    <p>گزارش تولید شده در تاریخ: {current_time}</p>
+                    <p>ابزار بنچمارک ناشناس‌سازی متون فارسی</p>
+                </div>
+            </div>
+        </body>
+        </html>
+        """
+        return html
+    def generate_detailed_analysis(self) -> str:
+        """تولید تحلیل تفصیلی"""
+        if not self.benchmark_results:
+            return "<p>داده‌ای برای تحلیل یافت نشد</p>"
+        # یافتن بهترین مدل
+        best_model = max(self.benchmark_results.keys(),
+                        key=lambda k: self.benchmark_results[k].quality_score)
+        best_score = self.benchmark_results[best_model].quality_score
+        analysis = f"""
+        <div class="metrics-grid">
+            <div class="metric-card">
+                <div class="metric-number">🥇</div>
+                <div class="metric-label">بهترین مدل: {best_model}</div>
+            </div>
+            <div class="metric-card">
+                <div class="metric-number">{best_score:.3f}</div>
+                <div class="metric-label">بالاترین امتیاز کلی</div>
+            </div>
+            <div class="metric-card">
+                <div class="metric-number">{len(self.models_data)}</div>
+                <div class="metric-label">تعداد مدل‌های مقایسه شده</div>
+            </div>
+        </div>
+        <div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; padding: 20px; margin-top: 20px;">
+            <h4>💡 نتیجه‌گیری:</h4>
+            <ul style="margin-top: 10px; padding-right: 20px;">
+        """
+        # تحلیل نقاط قوت و ضعف هر مدل
+        for model_name, metrics in self.benchmark_results.items():
+            strong_points = []
+            weak_points = []
+            if metrics.correct_indexing_rate > 0.8:
+                strong_points.append("اندیس‌گذاری دقیق")
+            else:
+                weak_points.append("مشکل در اندیس‌گذاری")
+            if metrics.structure_preservation_score > 0.8:
+                strong_points.append("حفظ ساختار متن")
+            else:
+                weak_points.append("ضعف در حفظ ساختار")
+            if metrics.entity_coverage_rate > 0.8:
+                strong_points.append("پوشش مناسب موجودیت‌ها")
+            else:
+                weak_points.append("پوشش ناکافی موجودیت‌ها")
+            analysis += f"""
+                <li><strong>{model_name}:</strong>
+                    نقاط قوت: {', '.join(strong_points) if strong_points else 'ندارد'} |
+                    نقاط ضعف: {', '.join(weak_points) if weak_points else 'ندارد'}
+                </li>
+            """
+        analysis += """
+            </ul>
+        </div>
+        """
+        return analysis
+# رابط کاربری Gradio
+def create_benchmark_interface():
+    """ایجاد رابط کاربری بنچمارک"""
+    benchmark = AnonymizationBenchmark()
+    with gr.Blocks(
+        title="بنچمارک ناشناس‌سازی",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {
+            font-family: 'Tahoma', 'Arial', sans-serif !important;
+            direction: rtl;
+            max-width: 1400px;
+            margin: 0 auto;
+        }
+        .upload-box {
+            border: 2px dashed #4CAF50;
+            border-radius: 10px;
+            padding: 20px;
+            text-align: center;
+            background-color: #f8f9fa;
+            margin: 10px 0;
+        }
+        """
+    ) as interface:
+        gr.Markdown("""
+        # 🏆 ابزار بنچمارک ناشناس‌سازی متون فارسی
+        ### مقایسه عملکرد مدل‌های ChatGPT، Grok و Llama-3.1-8B در ناشناس‌سازی متون مالی/خبری
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### 📁 بارگذاری فایل‌های CSV")
+                chatgpt_file = gr.File(
+                    label="📄 فایل ChatGPT",
+                    file_types=[".csv"],
+                    elem_classes=["upload-box"]
+                )
+                grok_file = gr.File(
+                    label="📄 فایل Grok",
+                    file_types=[".csv"],
+                    elem_classes=["upload-box"]
+                )
+                llama_file = gr.File(
+                    label="📄 فایل Llama-3.1-8B",
+                    file_types=[".csv"],
+                    elem_classes=["upload-box"]
+                )
+                load_btn = gr.Button(
+                    "📂 بارگذاری فایل‌ها",
+                    variant="primary",
+                    size="lg"
+                )
+                benchmark_btn = gr.Button(
+                    "🚀 اجرای بنچمارک",
+                    variant="secondary",
+                    size="lg",
+                    interactive=False
+                )
+            with gr.Column(scale=2):
+                status_output = gr.Markdown("وضعیت: آماده بارگذاری فایل‌ها")
+                results_html = gr.HTML(
+                    label="📊 نتایج بنچمارک",
+                    visible=False
+                )
+        def load_files(chatgpt, grok, llama):
+            if not all([chatgpt, grok, llama]):
+                return "❌ لطفاً هر سه فایل را انتخاب کنید", gr.Button(interactive=False), gr.HTML(visible=False)
+            success, message = benchmark.load_csv_files(
+                chatgpt.name, grok.name, llama.name
+            )
+            if success:
+                return (
+                    f"✅ {message}",
+                    gr.Button(interactive=True),
+                    gr.HTML(visible=False)
+                )
+            else:
+                return (
+                    f"❌ {message}",
+                    gr.Button(interactive=False),
+                    gr.HTML(visible=False)
+                )
+        def run_benchmark():
+            success, message, html_report = benchmark.run_benchmark()
+            if success:
+                return (
+                    f"✅ {message}",
+                    gr.HTML(value=html_report, visible=True)
+                )
+            else:
+                return (
+                    f"❌ {message}",
+                    gr.HTML(visible=False)
+                )
+        # اتصال رویدادها
+        load_btn.click(
+            fn=load_files,
+            inputs=[chatgpt_file, grok_file, llama_file],
+            outputs=[status_output, benchmark_btn, results_html]
+        )
+        benchmark_btn.click(
+            fn=run_benchmark,
+            outputs=[status_output, results_html]
+        )
+        # راهنمای استفاده
+        with gr.Accordion("📖 راهنمای استفاده", open=False):
+            gr.Markdown("""
+            ### نحوه استفاده:
+            1. **بارگذاری فایل‌ها:** سه فایل CSV مربوط به نتایج ناشناس‌سازی مدل‌های مختلف را انتخاب کنید
+            2. **بررسی فرمت:** هر فایل باید دارای ستون‌های `original_text` و `anonymized_text` باشد
+            3. **اجرای بنچمارک:** روی دکمه "اجرای بنچمارک" کلیک کنید
+            4. **مشاهده نتایج:** گزارش HTML کامل با جداول و نمودارها نمایش داده می‌شود
+            ### متریک‌های ارزیابی:
+            - **درستی اندیس‌گذاری:** بررسی شروع از 01 و پیوستگی شماره‌ها
+            - **ثبات شناسه‌ها:** استفاده مداوم از یک شناسه برای یک موجودیت
+            - **حفظ ساختار:** حفظ واژگان مهم و ساختار جمله
+            - **پوشش موجودیت‌ها:** درصد موجودیت‌های شناسایی و ناشناس شده
+            - **امتیاز کلی:** ترکیب وزنی همه متریک‌ها
+            """)
+    return interface
+# اجرای برنامه
+if __name__ == "__main__":
+    interface = create_benchmark_interface()
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7861,
+        share=True,
+        show_error=True
+    )