# Hugging Face Space: leilaghomashchi/Benchmark-data-anonymization
# (Space status when captured: Sleeping; file size: 30,288 bytes)

import pandas as pd
import re
import numpy as np
import json
from typing import Dict, List, Any, Tuple
import gradio as gr
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from dataclasses import dataclass
from datetime import datetime

@dataclass
class BenchmarkMetrics:
    """Aggregated benchmark scores for a single model."""
    model_name: str
    total_texts: int
    total_entities: int
    accuracy: float    # overall anonymization correctness
    recall: float      # entity coverage
    precision: float   # identification precision

class AnonymizationBenchmark:
    """کلاس اصلی بنچمارک ناشناس‌سازی"""
    
    def __init__(self):
        """Create an empty benchmark with no loaded data and no results."""
        # Filled by load_csv_files: model display name -> DataFrame.
        self.models_data = {}
        # Filled by run_benchmark: model display name -> BenchmarkMetrics.
        self.benchmark_results = {}
        
    def load_csv_files(self, chatgpt_file, grok_file, llama_file):
        """بارگذاری فایل‌های CSV"""
        try:
            # بارگذاری فایل‌ها
            chatgpt_df = pd.read_csv(chatgpt_file)
            grok_df = pd.read_csv(grok_file)
            llama_df = pd.read_csv(llama_file)
            
            # بررسی ستون‌ها
            required_columns = ['original_text', 'anonymized_text']
            
            for df_name, df in [('ChatGPT', chatgpt_df), ('Grok', grok_df), ('Llama', llama_df)]:
                if not all(col in df.columns for col in required_columns):
                    raise ValueError(f"فایل {df_name} فاقد ستون‌های مورد نیاز است")
            
            self.models_data = {
                'ChatGPT': chatgpt_df,
                'Grok': grok_df,
                'Llama-3.1-8B': llama_df
            }
            
            return True, "فایل‌ها با موفقیت بارگذاری شدند"
            
        except Exception as e:
            return False, f"خطا در بارگذاری فایل‌ها: {str(e)}"
    
    def extract_entities_from_text(self, text: str) -> Dict[str, List[str]]:
        """استخراج موجودیت‌ها از متن"""
        entities = {
            'companies': re.findall(r'company-(\d+)', text),
            'persons': re.findall(r'person-(\d+)', text),
            'amounts': re.findall(r'amount-(\d+)', text),
            'percents': re.findall(r'percent-(\d+)', text),
            'groups': re.findall(r'group-(\d+)', text)
        }
        return entities
    
    def count_original_entities(self, text: str) -> int:
        """تخمین تعداد موجودیت‌های قابل ناشناس‌سازی در متن اصلی"""
        # الگوهای شناسایی موجودیت‌ها در متن فارسی
        patterns = [
            r'[۰-۹]+(?:\.[۰-۹]+)?\s*(?:میلیارد|میلیون|هزار)?\s*(?:تومان|ریال|دلار|یورو)',  # اعداد پولی
            r'[۰-۹]+(?:\.[۰-۹]+)?\s*درصد',  # درصدها
            r'\b[آ-ی\s]{2,30}\b(?:\s*(?:شرکت|بانک|گروه|سازمان))',  # شرکت‌ها
            r'\b[آ-ی\s]{2,20}\b(?:\s*(?:مدیرعامل|رئیس|مدیر))',  # اشخاص
            r'[۰-۹]+(?:\.[۰-۹]+)?(?:\s*(?:میلیون|میلیارد|هزار))?',  # سایر اعداد
        ]
        
        total_entities = 0
        for pattern in patterns:
            matches = re.findall(pattern, text)
            total_entities += len(matches)
            
        return max(total_entities, 1)  # حداقل 1 برای جلوگیری از تقسیم بر صفر
    
    def check_indexing_correctness(self, entities: Dict[str, List[str]]) -> float:
        """بررسی درستی اندیس‌گذاری"""
        total_checks = 0
        passed_checks = 0
        
        for entity_type, indices in entities.items():
            if not indices:
                continue
                
            total_checks += 1
            unique_indices = sorted([int(x) for x in set(indices)])
            
            # بررسی شروع از 1
            if unique_indices[0] == 1:
                passed_checks += 0.5
            
            # بررسی پیوستگی
            expected = list(range(1, len(unique_indices) + 1))
            if unique_indices == expected:
                passed_checks += 0.5
        
        return passed_checks / total_checks if total_checks > 0 else 0.0
    
    def calculate_structure_preservation(self, original_text: str, anonymized_text: str) -> float:
        """محاسبه امتیاز حفظ ساختار"""
        # کلمات مهم که باید حفظ شوند
        important_words = [
            'میلیارد', 'میلیون', 'تومان', 'ریال', 'درصد', 'سود', 'زیان',
            'مدیرعامل', 'شرکت', 'بانک', 'درآمد', 'سال', 'ماه'
        ]
        
        score = 0.0
        total_checks = len(important_words)
        
        for word in important_words:
            if word in original_text and word in anonymized_text:
                score += 1.0
            elif word not in original_text:
                total_checks -= 1
        
        # بررسی حفظ تعداد کلمات (تقریبی)
        original_words = len(original_text.split())
        anonymized_words = len(anonymized_text.split())
        
        if original_words > 0:
            word_ratio = min(anonymized_words / original_words, 1.0)
            score += word_ratio * 2
            total_checks += 2
        
        return score / total_checks if total_checks > 0 else 0.0
    
    def calculate_accuracy(self, original_text: str, anonymized_text: str) -> float:
        """محاسبه درستی کلی ناشناس‌سازی"""
        entities = self.extract_entities_from_text(anonymized_text)
        
        # بررسی درستی اندیس‌گذاری
        indexing_score = self.check_indexing_correctness(entities)
        
        # بررسی حفظ ساختار
        structure_score = self.calculate_structure_preservation(original_text, anonymized_text)
        
        # میانگین وزنی
        accuracy = (indexing_score * 0.6) + (structure_score * 0.4)
        return accuracy
    
    def calculate_recall(self, original_text: str, anonymized_text: str) -> float:
        """محاسبه پوشش موجودیت‌ها (Recall)"""
        original_entity_count = self.count_original_entities(original_text)
        entities = self.extract_entities_from_text(anonymized_text)
        anonymized_entity_count = sum(len(set(v)) for v in entities.values())
        
        return min(anonymized_entity_count / original_entity_count, 1.0)
    
    def calculate_precision(self, anonymized_text: str) -> float:
        """محاسبه دقت شناسایی (Precision)"""
        entities = self.extract_entities_from_text(anonymized_text)
        
        # بررسی کیفیت موجودیت‌های شناسایی شده
        total_entities = sum(len(v) for v in entities.values())
        if total_entities == 0:
            return 0.0
        
        # بررسی درستی فرمت شناسه‌ها
        correct_entities = 0
        for entity_type, indices in entities.items():
            for idx in indices:
                if idx.isdigit() and int(idx) > 0:
                    correct_entities += 1
        
        # بررسی عدم تکرار غیرضروری
        unique_entities = sum(len(set(v)) for v in entities.values())
        consistency_bonus = unique_entities / total_entities if total_entities > 0 else 0
        
        base_precision = correct_entities / total_entities if total_entities > 0 else 0
        return (base_precision * 0.7) + (consistency_bonus * 0.3)
    
    def analyze_model(self, model_name: str, df: pd.DataFrame) -> BenchmarkMetrics:
        """Score every row of one model's output and average the metrics.

        Args:
            model_name: Display name stored in the returned metrics.
            df: Frame with ``original_text`` and ``anonymized_text`` columns.

        Returns:
            A ``BenchmarkMetrics`` whose accuracy, recall and precision are
            the per-row means rounded to three decimals, and whose
            ``total_entities`` is the sum over rows of distinct placeholders
            per entity kind. An empty frame yields 0.0 for all three metrics
            instead of NaN.
        """
        print(f"تحلیل مدل {model_name}...")

        def _as_text(value) -> str:
            # Missing cells become "" instead of the literal string "nan".
            return "" if pd.isna(value) else str(value)

        def _mean(scores) -> float:
            # np.mean([]) is NaN (with a RuntimeWarning); report 0.0 instead.
            return round(float(np.mean(scores)), 3) if scores else 0.0

        accuracy_scores = []
        recall_scores = []
        precision_scores = []
        total_entities = 0

        # Iterate the two columns directly; the row index is not needed.
        for raw_original, raw_anonymized in zip(df['original_text'], df['anonymized_text']):
            original = _as_text(raw_original)
            anonymized = _as_text(raw_anonymized)

            accuracy_scores.append(self.calculate_accuracy(original, anonymized))
            recall_scores.append(self.calculate_recall(original, anonymized))
            precision_scores.append(self.calculate_precision(anonymized))

            # Count distinct placeholders per entity kind for this row.
            entities = self.extract_entities_from_text(anonymized)
            total_entities += sum(len(set(ids)) for ids in entities.values())

        return BenchmarkMetrics(
            model_name=model_name,
            total_texts=len(df),
            total_entities=total_entities,
            accuracy=_mean(accuracy_scores),
            recall=_mean(recall_scores),
            precision=_mean(precision_scores),
        )
    
    def run_benchmark(self) -> Tuple[bool, str, str]:
        """اجرای بنچمارک کامل"""
        if not self.models_data:
            return False, "ابتدا فایل‌ها را بارگذاری کنید", ""
        
        try:
            results = {}
            
            # تحلیل هر مدل
            for model_name, df in self.models_data.items():
                results[model_name] = self.analyze_model(model_name, df)
            
            self.benchmark_results = results
            
            # تولید HTML
            html_report = self.generate_html_report()
            
            return True, "بنچمارک با موفقیت انجام شد", html_report
            
        except Exception as e:
            return False, f"خطا در اجرای بنچمارک: {str(e)}", ""
    
    def generate_comparison_table(self) -> str:
        """تولید جدول مقایسه"""
        if not self.benchmark_results:
            return "<p>هنوز بنچمارکی انجام نشده است</p>"
        
        # آماده‌سازی داده‌ها برای جدول
        table_data = []
        for model_name, metrics in self.benchmark_results.items():
            table_data.append({
                'مدل': model_name,
                'تعداد متن‌ها': metrics.total_texts,
                'کل موجودیت‌ها': metrics.total_entities,
                '🎯 دقت (Accuracy)': f"{metrics.accuracy:.3f}",
                '📊 بازیابی (Recall)': f"{metrics.recall:.3f}",
                '✅ دقت شناسایی (Precision)': f"{metrics.precision:.3f}"
            })
        
        # تولید HTML جدول
        html = """
        <div style="overflow-x: auto; margin: 20px 0;">
            <table style="width: 100%; border-collapse: collapse; font-family: 'Tahoma', sans-serif;">
                <thead>
                    <tr style="background-color: #4CAF50; color: white;">
        """
        
        # سرستون‌ها
        headers = list(table_data[0].keys())
        for header in headers:
            html += f"<th style='border: 1px solid #ddd; padding: 12px; text-align: center;'>{header}</th>"
        
        html += "</tr></thead><tbody>"
        
        # ردیف‌ها
        for i, row in enumerate(table_data):
            bg_color = "#f2f2f2" if i % 2 == 0 else "white"
            html += f"<tr style='background-color: {bg_color};'>"
            
            for j, (key, value) in enumerate(row.items()):
                # رنگ‌بندی ستون‌های متریک‌ها
                if key in ['🎯 دقت (Accuracy)', '📊 بازیابی (Recall)', '✅ دقت شناسایی (Precision)']:
                    score = float(value)
                    if score >= 0.8:
                        color = "#4CAF50"  # سبز
                    elif score >= 0.6:
                        color = "#FF9800"  # نارنجی
                    else:
                        color = "#F44336"  # قرمز
                    html += f"<td style='border: 1px solid #ddd; padding: 12px; text-align: center; font-weight: bold; color: {color};'>{value}</td>"
                else:
                    html += f"<td style='border: 1px solid #ddd; padding: 12px; text-align: center;'>{value}</td>"
            
            html += "</tr>"
        
        html += "</tbody></table></div>"
        
        return html
    
    def generate_charts(self) -> str:
        """تولید نمودارها"""
        if not self.benchmark_results:
            return ""
        
        models = list(self.benchmark_results.keys())
        accuracy_scores = [self.benchmark_results[model].accuracy for model in models]
        recall_scores = [self.benchmark_results[model].recall for model in models]
        precision_scores = [self.benchmark_results[model].precision for model in models]
        
        # نمودار مقایسه سه متریک
        chart_html = """
        <div style="margin: 20px 0;">
            <h3 style="text-align: center; color: #333;">مقایسه متریک‌های عملکرد مدل‌ها</h3>
            <div style="display: flex; justify-content: center; gap: 40px; background-color: #f9f9f9; padding: 30px; border-radius: 15px;">
        """
        
        colors = ['#4CAF50', '#2196F3', '#FF9800']
        
        for i, model in enumerate(models):
            accuracy = accuracy_scores[i]
            recall = recall_scores[i] 
            precision = precision_scores[i]
            
            chart_html += f"""
                <div style="text-align: center; min-width: 200px;">
                    <h4 style="margin-bottom: 15px; color: #333;">{model}</h4>
                    
                    <div style="margin-bottom: 10px;">
                        <div style="font-size: 12px; color: #666; margin-bottom: 5px;">Accuracy</div>
                        <div style="background-color: {colors[0]}; width: 60px; height: {accuracy*100}px; margin: 0 auto; border-radius: 3px; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 10px;">
                            {accuracy:.3f}
                        </div>
                    </div>
                    
                    <div style="margin-bottom: 10px;">
                        <div style="font-size: 12px; color: #666; margin-bottom: 5px;">Recall</div>
                        <div style="background-color: {colors[1]}; width: 60px; height: {recall*100}px; margin: 0 auto; border-radius: 3px; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 10px;">
                            {recall:.3f}
                        </div>
                    </div>
                    
                    <div style="margin-bottom: 10px;">
                        <div style="font-size: 12px; color: #666; margin-bottom: 5px;">Precision</div>
                        <div style="background-color: {colors[2]}; width: 60px; height: {precision*100}px; margin: 0 auto; border-radius: 3px; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 10px;">
                            {precision:.3f}
                        </div>
                    </div>
                </div>
            """
        
        chart_html += "</div></div>"
        
        return chart_html
    
    def generate_html_report(self) -> str:
        """Build the complete standalone HTML report.

        The page embeds, in order, the comparison table, the bar charts and
        the detailed analysis produced by the sibling ``generate_*`` methods,
        and stamps the footer with the current local time.

        Returns:
            The full HTML document as a string.
        """
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        # The template is an f-string, so literal CSS braces are doubled.
        html = f"""
        <!DOCTYPE html>
        <html lang="fa" dir="rtl">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>گزارش بنچمارک ناشناس‌سازی</title>
            <style>
                * {{
                    margin: 0;
                    padding: 0;
                    box-sizing: border-box;
                }}
                body {{
                    font-family: 'Tahoma', 'Arial', sans-serif;
                    line-height: 1.6;
                    color: #333;
                    background-color: #f5f5f5;
                    padding: 20px;
                }}
                .container {{
                    max-width: 1400px;
                    margin: 0 auto;
                    background-color: white;
                    border-radius: 10px;
                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                    overflow: hidden;
                }}
                .header {{
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    color: white;
                    padding: 30px;
                    text-align: center;
                }}
                .header h1 {{
                    font-size: 2.5em;
                    margin-bottom: 10px;
                }}
                .header p {{
                    font-size: 1.2em;
                    opacity: 0.9;
                }}
                .content {{
                    padding: 30px;
                }}
                .summary {{
                    background-color: #e8f5e8;
                    border-right: 5px solid #4CAF50;
                    padding: 20px;
                    margin-bottom: 30px;
                    border-radius: 5px;
                }}
                .section {{
                    margin-bottom: 40px;
                }}
                .section h2 {{
                    color: #333;
                    border-bottom: 2px solid #4CAF50;
                    padding-bottom: 10px;
                    margin-bottom: 20px;
                }}
                .metrics-grid {{
                    display: grid;
                    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
                    gap: 20px;
                    margin-bottom: 30px;
                }}
                .metric-card {{
                    background-color: #f8f9fa;
                    border: 1px solid #dee2e6;
                    border-radius: 8px;
                    padding: 20px;
                    text-align: center;
                    transition: transform 0.2s;
                }}
                .metric-card:hover {{
                    transform: translateY(-5px);
                    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
                }}
                .metric-number {{
                    font-size: 2em;
                    font-weight: bold;
                    color: #4CAF50;
                    margin-bottom: 5px;
                }}
                .metric-label {{
                    color: #666;
                    font-size: 0.9em;
                }}
                .footer {{
                    background-color: #f8f9fa;
                    padding: 20px;
                    text-align: center;
                    color: #666;
                    border-top: 1px solid #dee2e6;
                }}
            </style>
        </head>
        <body>
            <div class="container">
                <div class="header">
                    <h1>🏆 گزارش بنچمارک ناشناس‌سازی</h1>
                    <p>مقایسه عملکرد مدل‌های ChatGPT، Grok و Llama-3.1-8B</p>
                </div>
                
                <div class="content">
                    <div class="summary">
                        <h3>📋 خلاصه نتایج</h3>
                        <p>این گزارش نتایج بنچمارک سه مدل مختلف برای ناشناس‌سازی متون فارسی را نشان می‌دهد. 
                        متریک‌های ارزیابی شامل درستی اندیس‌گذاری، ثبات استفاده از شناسه‌ها، حفظ ساختار متن و پوشش موجودیت‌ها می‌باشد.</p>
                    </div>
                    
                    <div class="section">
                        <h2>📊 جدول مقایسه کامل</h2>
                        {self.generate_comparison_table()}
                    </div>
                    
                    <div class="section">
                        <h2>📈 نمودار مقایسه</h2>
                        {self.generate_charts()}
                    </div>
                    
                    <div class="section">
                        <h2>🔍 تحلیل تفصیلی</h2>
                        {self.generate_detailed_analysis()}
                    </div>
                </div>
                
                <div class="footer">
                    <p>گزارش تولید شده در تاریخ: {current_time}</p>
                    <p>ابزار بنچمارک ناشناس‌سازی متون فارسی</p>
                </div>
            </div>
        </body>
        </html>
        """
        
        return html
    
    def generate_detailed_analysis(self) -> str:
        """تولید تحلیل تفصیلی"""
        if not self.benchmark_results:
            return "<p>داده‌ای برای تحلیل یافت نشد</p>"
        
        # یافتن بهترین مدل در هر متریک
        best_accuracy = max(self.benchmark_results.keys(), 
                           key=lambda k: self.benchmark_results[k].accuracy)
        best_recall = max(self.benchmark_results.keys(), 
                         key=lambda k: self.benchmark_results[k].recall)
        best_precision = max(self.benchmark_results.keys(), 
                            key=lambda k: self.benchmark_results[k].precision)
        
        # محاسبه میانگین
        avg_accuracy = np.mean([m.accuracy for m in self.benchmark_results.values()])
        avg_recall = np.mean([m.recall for m in self.benchmark_results.values()])
        avg_precision = np.mean([m.precision for m in self.benchmark_results.values()])
        
        analysis = f"""
        <div class="metrics-grid">
            <div class="metric-card">
                <div class="metric-number">🎯</div>
                <div class="metric-label">بهترین Accuracy: {best_accuracy}</div>
                <div style="color: #4CAF50; font-weight: bold;">
                    {self.benchmark_results[best_accuracy].accuracy:.3f}
                </div>
            </div>
            <div class="metric-card">
                <div class="metric-number">📊</div>
                <div class="metric-label">بهترین Recall: {best_recall}</div>
                <div style="color: #2196F3; font-weight: bold;">
                    {self.benchmark_results[best_recall].recall:.3f}
                </div>
            </div>
            <div class="metric-card">
                <div class="metric-number">✅</div>
                <div class="metric-label">بهترین Precision: {best_precision}</div>
                <div style="color: #FF9800; font-weight: bold;">
                    {self.benchmark_results[best_precision].precision:.3f}
                </div>
            </div>
        </div>
        
        <div style="background-color: #e3f2fd; border: 1px solid #2196F3; border-radius: 8px; padding: 20px; margin-top: 20px;">
            <h4>📈 آمار کلی:</h4>
            <ul style="margin-top: 10px; padding-right: 20px;">
                <li><strong>میانگین Accuracy:</strong> {avg_accuracy:.3f}</li>
                <li><strong>میانگین Recall:</strong> {avg_recall:.3f}</li>
                <li><strong>میانگین Precision:</strong> {avg_precision:.3f}</li>
            </ul>
        </div>
        
        <div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; padding: 20px; margin-top: 20px;">
            <h4>💡 تفسیر نتایج:</h4>
            <ul style="margin-top: 10px; padding-right: 20px;">
                <li><strong>Accuracy:</strong> دقت کلی ناشناس‌سازی (ترکیب اندیس‌گذاری صحیح و حفظ ساختار)</li>
                <li><strong>Recall:</strong> پوشش موجودیت‌ها (چه درصدی از موجودیت‌ها شناسایی شدند)</li>
                <li><strong>Precision:</strong> دقت شناسایی (چه درصدی از شناسه‌ها صحیح هستند)</li>
            </ul>
        </div>
        """
        
        return analysis

# Gradio user interface
def create_benchmark_interface():
    """Build and return the Gradio Blocks UI for the benchmark.

    The layout has three CSV upload widgets, a "load" button and a "run"
    button on the left, and a status line plus the HTML report area on the
    right. A single AnonymizationBenchmark instance is created here and
    shared by both button callbacks through closure.
    """
    benchmark = AnonymizationBenchmark()
    
    with gr.Blocks(
        title="بنچمارک ناشناس‌سازی",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            font-family: 'Tahoma', 'Arial', sans-serif !important;
            direction: rtl;
            max-width: 1400px;
            margin: 0 auto;
        }
        .upload-box {
            border: 2px dashed #4CAF50;
            border-radius: 10px;
            padding: 20px;
            text-align: center;
            background-color: #f8f9fa;
            margin: 10px 0;
        }
        """
    ) as interface:
        
        gr.Markdown("""
        # 🏆 ابزار بنچمارک ناشناس‌سازی متون فارسی
        ### مقایسه عملکرد مدل‌های ChatGPT، Grok و Llama-3.1-8B در ناشناس‌سازی متون مالی/خبری
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📁 بارگذاری فایل‌های CSV")
                
                chatgpt_file = gr.File(
                    label="📄 فایل ChatGPT",
                    file_types=[".csv"],
                    elem_classes=["upload-box"]
                )
                
                grok_file = gr.File(
                    label="📄 فایل Grok", 
                    file_types=[".csv"],
                    elem_classes=["upload-box"]
                )
                
                llama_file = gr.File(
                    label="📄 فایل Llama-3.1-8B",
                    file_types=[".csv"],
                    elem_classes=["upload-box"]
                )
                
                load_btn = gr.Button(
                    "📂 بارگذاری فایل‌ها",
                    variant="primary",
                    size="lg"
                )
                
                # Starts disabled; load_files enables it after a successful load.
                benchmark_btn = gr.Button(
                    "🚀 اجرای بنچمارک",
                    variant="secondary", 
                    size="lg",
                    interactive=False
                )
            
            with gr.Column(scale=2):
                status_output = gr.Markdown("وضعیت: آماده بارگذاری فایل‌ها")
                
                # Hidden until run_benchmark produces a report.
                results_html = gr.HTML(
                    label="📊 نتایج بنچمارک",
                    visible=False
                )
        
        def load_files(chatgpt, grok, llama):
            """Load the three uploads; return (status text, run-button update, report update)."""
            if not all([chatgpt, grok, llama]):
                return "❌ لطفاً هر سه فایل را انتخاب کنید", gr.Button(interactive=False), gr.HTML(visible=False)
            
            # NOTE(review): relies on each upload value exposing a `.name` path — confirm against the installed Gradio version.
            success, message = benchmark.load_csv_files(
                chatgpt.name, grok.name, llama.name
            )
            
            if success:
                return (
                    f"✅ {message}",
                    gr.Button(interactive=True),
                    gr.HTML(visible=False)
                )
            else:
                return (
                    f"❌ {message}",
                    gr.Button(interactive=False), 
                    gr.HTML(visible=False)
                )
        
        def run_benchmark():
            """Run the benchmark; return (status text, report update)."""
            success, message, html_report = benchmark.run_benchmark()
            
            if success:
                return (
                    f"✅ {message}",
                    gr.HTML(value=html_report, visible=True)
                )
            else:
                return (
                    f"❌ {message}",
                    gr.HTML(visible=False)
                )
        
        # Wire up events
        load_btn.click(
            fn=load_files,
            inputs=[chatgpt_file, grok_file, llama_file],
            outputs=[status_output, benchmark_btn, results_html]
        )
        
        benchmark_btn.click(
            fn=run_benchmark,
            outputs=[status_output, results_html]
        )
        
        # Usage guide
        with gr.Accordion("📖 راهنمای استفاده", open=False):
            gr.Markdown("""
            ### نحوه استفاده:
            1. **بارگذاری فایل‌ها:** سه فایل CSV مربوط به نتایج ناشناس‌سازی مدل‌های مختلف را انتخاب کنید
            2. **بررسی فرمت:** هر فایل باید دارای ستون‌های `original_text` و `anonymized_text` باشد
            3. **اجرای بنچمارک:** روی دکمه "اجرای بنچمارک" کلیک کنید
            4. **مشاهده نتایج:** گزارش HTML کامل با جداول و نمودارها نمایش داده می‌شود
            
            ### متریک‌های ارزیابی:
            - **درستی اندیس‌گذاری:** بررسی شروع از 01 و پیوستگی شماره‌ها
            - **ثبات شناسه‌ها:** استفاده مداوم از یک شناسه برای یک موجودیت
            - **حفظ ساختار:** حفظ واژگان مهم و ساختار جمله
            - **پوشش موجودیت‌ها:** درصد موجودیت‌های شناسایی و ناشناس شده
            - **امتیاز کلی:** ترکیب وزنی همه متریک‌ها
            """)
    
    return interface

# Run the application: build the UI and start the Gradio server when executed as a script.
if __name__ == "__main__":
    interface = create_benchmark_interface()
    interface.launch()