import pandas as pd
import re
import numpy as np
import json
from typing import Dict, List, Any, Tuple
import gradio as gr
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from dataclasses import dataclass
from datetime import datetime

@dataclass
class BenchmarkMetrics:
    """Aggregate benchmark scores for one anonymization model.

    Instances are built by ``AnonymizationBenchmark.analyze_model`` and read
    by the HTML report generators.
    """
    model_name: str        # key of the model in ``models_data`` (e.g. 'ChatGPT')
    total_texts: int       # number of rows in the model's DataFrame
    total_entities: int    # sum over all texts of distinct placeholder indices per entity type
    accuracy: float    # overall anonymization correctness (mean over texts)
    recall: float      # entity coverage (mean over texts)
    precision: float   # identification precision (mean over texts)

class AnonymizationBenchmark:
    """کلاس اصلی بنچمارک ناشناس‌سازی"""
    
    def __init__(self) -> None:
        """Create a benchmark with no loaded data and no computed results."""
        # Model name -> DataFrame; filled by load_csv_files().
        self.models_data = {}
        # Model name -> BenchmarkMetrics; filled by run_benchmark().
        self.benchmark_results = {}
        
    def load_csv_files(self, chatgpt_file, grok_file, llama_file):
        """بارگذاری فایل‌های CSV"""
        try:
            # بارگذاری فایل‌ها
            chatgpt_df = pd.read_csv(chatgpt_file)
            grok_df = pd.read_csv(grok_file)
            llama_df = pd.read_csv(llama_file)
            
            # بررسی ستون‌ها
            required_columns = ['original_text', 'anonymized_text']
            
            for df_name, df in [('ChatGPT', chatgpt_df), ('Grok', grok_df), ('Llama', llama_df)]:
                if not all(col in df.columns for col in required_columns):
                    raise ValueError(f"فایل {df_name} فاقد ستون‌های مورد نیاز است")
            
            self.models_data = {
                'ChatGPT': chatgpt_df,
                'Grok': grok_df,
                'Llama-3.1-8B': llama_df
            }
            
            return True, "فایل‌ها با موفقیت بارگذاری شدند"
            
        except Exception as e:
            return False, f"خطا در بارگذاری فایل‌ها: {str(e)}"
    
    def extract_entities_from_text(self, text: str) -> Dict[str, List[str]]:
        """استخراج موجودیت‌ها از متن"""
        entities = {
            'companies': re.findall(r'company-(\d+)', text),
            'persons': re.findall(r'person-(\d+)', text),
            'amounts': re.findall(r'amount-(\d+)', text),
            'percents': re.findall(r'percent-(\d+)', text),
            'groups': re.findall(r'group-(\d+)', text)
        }
        return entities
    
    def count_original_entities(self, text: str) -> int:
        """تخمین تعداد موجودیت‌های قابل ناشناس‌سازی در متن اصلی"""
        # الگوهای شناسایی موجودیت‌ها در متن فارسی
        patterns = [
            r'[۰-۹]+(?:\.[۰-۹]+)?\s*(?:میلیارد|میلیون|هزار)?\s*(?:تومان|ریال|دلار|یورو)',  # اعداد پولی
            r'[۰-۹]+(?:\.[۰-۹]+)?\s*درصد',  # درصدها
            r'\b[آ-ی\s]{2,30}\b(?:\s*(?:شرکت|بانک|گروه|سازمان))',  # شرکت‌ها
            r'\b[آ-ی\s]{2,20}\b(?:\s*(?:مدیرعامل|رئیس|مدیر))',  # اشخاص
            r'[۰-۹]+(?:\.[۰-۹]+)?(?:\s*(?:میلیون|میلیارد|هزار))?',  # سایر اعداد
        ]
        
        total_entities = 0
        for pattern in patterns:
            matches = re.findall(pattern, text)
            total_entities += len(matches)
            
        return max(total_entities, 1)  # حداقل 1 برای جلوگیری از تقسیم بر صفر
    
    def check_indexing_correctness(self, entities: Dict[str, List[str]]) -> float:
        """بررسی درستی اندیس‌گذاری"""
        total_checks = 0
        passed_checks = 0
        
        for entity_type, indices in entities.items():
            if not indices:
                continue
                
            total_checks += 1
            unique_indices = sorted([int(x) for x in set(indices)])
            
            # بررسی شروع از 1
            if unique_indices[0] == 1:
                passed_checks += 0.5
            
            # بررسی پیوستگی
            expected = list(range(1, len(unique_indices) + 1))
            if unique_indices == expected:
                passed_checks += 0.5
        
        return passed_checks / total_checks if total_checks > 0 else 0.0
    
    def calculate_structure_preservation(self, original_text: str, anonymized_text: str) -> float:
        """محاسبه امتیاز حفظ ساختار"""
        # کلمات مهم که باید حفظ شوند
        important_words = [
            'میلیارد', 'میلیون', 'تومان', 'ریال', 'درصد', 'سود', 'زیان',
            'مدیرعامل', 'شرکت', 'بانک', 'درآمد', 'سال', 'ماه'
        ]
        
        score = 0.0
        total_checks = len(important_words)
        
        for word in important_words:
            if word in original_text and word in anonymized_text:
                score += 1.0
            elif word not in original_text:
                total_checks -= 1
        
        # بررسی حفظ تعداد کلمات (تقریبی)
        original_words = len(original_text.split())
        anonymized_words = len(anonymized_text.split())
        
        if original_words > 0:
            word_ratio = min(anonymized_words / original_words, 1.0)
            score += word_ratio * 2
            total_checks += 2
        
        return score / total_checks if total_checks > 0 else 0.0
    
    def calculate_accuracy(self, original_text: str, anonymized_text: str) -> float:
        """محاسبه درستی کلی ناشناس‌سازی"""
        entities = self.extract_entities_from_text(anonymized_text)
        
        # بررسی درستی اندیس‌گذاری
        indexing_score = self.check_indexing_correctness(entities)
        
        # بررسی حفظ ساختار
        structure_score = self.calculate_structure_preservation(original_text, anonymized_text)
        
        # میانگین وزنی
        accuracy = (indexing_score * 0.6) + (structure_score * 0.4)
        return accuracy
    
    def calculate_recall(self, original_text: str, anonymized_text: str) -> float:
        """محاسبه پوشش موجودیت‌ها (Recall)"""
        original_entity_count = self.count_original_entities(original_text)
        entities = self.extract_entities_from_text(anonymized_text)
        anonymized_entity_count = sum(len(set(v)) for v in entities.values())
        
        return min(anonymized_entity_count / original_entity_count, 1.0)
    
    def calculate_precision(self, anonymized_text: str) -> float:
        """محاسبه دقت شناسایی (Precision)"""
        entities = self.extract_entities_from_text(anonymized_text)
        
        # بررسی کیفیت موجودیت‌های شناسایی شده
        total_entities = sum(len(v) for v in entities.values())
        if total_entities == 0:
            return 0.0
        
        # بررسی درستی فرمت شناسه‌ها
        correct_entities = 0
        for entity_type, indices in entities.items():
            for idx in indices:
                if idx.isdigit() and int(idx) > 0:
                    correct_entities += 1
        
        # بررسی عدم تکرار غیرضروری
        unique_entities = sum(len(set(v)) for v in entities.values())
        consistency_bonus = unique_entities / total_entities if total_entities > 0 else 0
        
        base_precision = correct_entities / total_entities if total_entities > 0 else 0
        return (base_precision * 0.7) + (consistency_bonus * 0.3)
    
    def analyze_model(self, model_name: str, df: pd.DataFrame) -> BenchmarkMetrics:
        """تحلیل یک مدل"""
        print(f"تحلیل مدل {model_name}...")
        
        total_texts = len(df)
        
        # محاسبه متریک‌ها برای هر متن
        accuracy_scores = []
        recall_scores = []
        precision_scores = []
        total_entities = 0
        
        for _, row in df.iterrows():
            original = str(row['original_text'])
            anonymized = str(row['anonymized_text'])
            
            # محاسبه متریک‌ها
            accuracy_scores.append(self.calculate_accuracy(original, anonymized))
            recall_scores.append(self.calculate_recall(original, anonymized))
            precision_scores.append(self.calculate_precision(anonymized))
            
            # شمارش موجودیت‌ها
            entities = self.extract_entities_from_text(anonymized)
            total_entities += sum(len(set(v)) for v in entities.values())
        
        return BenchmarkMetrics(
            model_name=model_name,
            total_texts=total_texts,
            total_entities=total_entities,
            accuracy=round(np.mean(accuracy_scores), 3),
            recall=round(np.mean(recall_scores), 3),
            precision=round(np.mean(precision_scores), 3)
        )
    
    def run_benchmark(self) -> Tuple[bool, str, str]:
        """اجرای بنچمارک کامل"""
        if not self.models_data:
            return False, "ابتدا فایل‌ها را بارگذاری کنید", ""
        
        try:
            results = {}
            
            # تحلیل هر مدل
            for model_name, df in self.models_data.items():
                results[model_name] = self.analyze_model(model_name, df)
            
            self.benchmark_results = results
            
            # تولید HTML
            html_report = self.generate_html_report()
            
            return True, "بنچمارک با موفقیت انجام شد", html_report
            
        except Exception as e:
            return False, f"خطا در اجرای بنچمارک: {str(e)}", ""
    
    def generate_comparison_table(self) -> str:
        """تولید جدول مقایسه"""
        if not self.benchmark_results:
            return "<p>هنوز بنچمارکی انجام نشده است</p>"
        
        # آماده‌سازی داده‌ها برای جدول
        table_data = []
        for model_name, metrics in self.benchmark_results.items():
            table_data.append({
                'مدل': model_name,
                'تعداد متن‌ها': metrics.total_texts,
                'کل موجودیت‌ها': metrics.total_entities,
                '🎯 دقت (Accuracy)': f"{metrics.accuracy:.3f}",
                '📊 بازیابی (Recall)': f"{metrics.recall:.3f}",
                '✅ دقت شناسایی (Precision)': f"{metrics.precision:.3f}"
            })
        
        # تولید HTML جدول
        html = """
        <div style="overflow-x: auto; margin: 20px 0;">
            <table style="width: 100%; border-collapse: collapse; font-family: 'Tahoma', sans-serif;">
                <thead>
                    <tr style="background-color: #4CAF50; color: white;">
        """
        
        # سرستون‌ها
        headers = list(table_data[0].keys())
        for header in headers:
            html += f"<th style='border: 1px solid #ddd; padding: 12px; text-align: center;'>{header}</th>"
        
        html += "</tr></thead><tbody>"
        
        # ردیف‌ها
        for i, row in enumerate(table_data):
            bg_color = "#f2f2f2" if i % 2 == 0 else "white"
            html += f"<tr style='background-color: {bg_color};'>"
            
            for j, (key, value) in enumerate(row.items()):
                # رنگ‌بندی ستون‌های متریک‌ها
                if key in ['🎯 دقت (Accuracy)', '📊 بازیابی (Recall)', '✅ دقت شناسایی (Precision)']:
                    score = float(value)
                    if score >= 0.8:
                        color = "#4CAF50"  # سبز
                    elif score >= 0.6:
                        color = "#FF9800"  # نارنجی
                    else:
                        color = "#F44336"  # قرمز
                    html += f"<td style='border: 1px solid #ddd; padding: 12px; text-align: center; font-weight: bold; color: {color};'>{value}</td>"
                else:
                    html += f"<td style='border: 1px solid #ddd; padding: 12px; text-align: center;'>{value}</td>"
            
            html += "</tr>"
        
        html += "</tbody></table></div>"
        
        return html
    
    def generate_charts(self) -> str:
        """تولید نمودارها"""
        if not self.benchmark_results:
            return ""
        
        models = list(self.benchmark_results.keys())
        accuracy_scores = [self.benchmark_results[model].accuracy for model in models]
        recall_scores = [self.benchmark_results[model].recall for model in models]
        precision_scores = [self.benchmark_results[model].precision for model in models]
        
        # نمودار مقایسه سه متریک
        chart_html = """
        <div style="margin: 20px 0;">
            <h3 style="text-align: center; color: #333;">مقایسه متریک‌های عملکرد مدل‌ها</h3>
            <div style="display: flex; justify-content: center; gap: 40px; background-color: #f9f9f9; padding: 30px; border-radius: 15px;">
        """
        
        colors = ['#4CAF50', '#2196F3', '#FF9800']
        
        for i, model in enumerate(models):
            accuracy = accuracy_scores[i]
            recall = recall_scores[i] 
            precision = precision_scores[i]
            
            chart_html += f"""
                <div style="text-align: center; min-width: 200px;">
                    <h4 style="margin-bottom: 15px; color: #333;">{model}</h4>
                    
                    <div style="margin-bottom: 10px;">
                        <div style="font-size: 12px; color: #666; margin-bottom: 5px;">Accuracy</div>
                        <div style="background-color: {colors[0]}; width: 60px; height: {accuracy*100}px; margin: 0 auto; border-radius: 3px; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 10px;">
                            {accuracy:.3f}
                        </div>
                    </div>
                    
                    <div style="margin-bottom: 10px;">
                        <div style="font-size: 12px; color: #666; margin-bottom: 5px;">Recall</div>
                        <div style="background-color: {colors[1]}; width: 60px; height: {recall*100}px; margin: 0 auto; border-radius: 3px; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 10px;">
                            {recall:.3f}
                        </div>
                    </div>
                    
                    <div style="margin-bottom: 10px;">
                        <div style="font-size: 12px; color: #666; margin-bottom: 5px;">Precision</div>
                        <div style="background-color: {colors[2]}; width: 60px; height: {precision*100}px; margin: 0 auto; border-radius: 3px; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 10px;">
                            {precision:.3f}
                        </div>
                    </div>
                </div>
            """
        
        chart_html += "</div></div>"
        
        return chart_html
    
    def generate_html_report(self) -> str:
        """Build the complete HTML report document.

        The report is a full ``<!DOCTYPE html>`` page (right-to-left, with
        embedded CSS) that contains, in order, the comparison table, the
        bar charts and the detailed analysis produced by the corresponding
        ``generate_*`` methods, followed by a footer carrying the local
        generation timestamp.

        Returns:
            The HTML document as a single string.
        """
        # Local wall-clock time, shown in the report footer.
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        # Doubled braces are literal CSS braces inside the f-string; the
        # single-brace fields call the section generators.
        html = f"""
        <!DOCTYPE html>
        <html lang="fa" dir="rtl">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>گزارش بنچمارک ناشناس‌سازی</title>
            <style>
                * {{
                    margin: 0;
                    padding: 0;
                    box-sizing: border-box;
                }}
                body {{
                    font-family: 'Tahoma', 'Arial', sans-serif;
                    line-height: 1.6;
                    color: #333;
                    background-color: #f5f5f5;
                    padding: 20px;
                }}
                .container {{
                    max-width: 1400px;
                    margin: 0 auto;
                    background-color: white;
                    border-radius: 10px;
                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                    overflow: hidden;
                }}
                .header {{
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    color: white;
                    padding: 30px;
                    text-align: center;
                }}
                .header h1 {{
                    font-size: 2.5em;
                    margin-bottom: 10px;
                }}
                .header p {{
                    font-size: 1.2em;
                    opacity: 0.9;
                }}
                .content {{
                    padding: 30px;
                }}
                .summary {{
                    background-color: #e8f5e8;
                    border-right: 5px solid #4CAF50;
                    padding: 20px;
                    margin-bottom: 30px;
                    border-radius: 5px;
                }}
                .section {{
                    margin-bottom: 40px;
                }}
                .section h2 {{
                    color: #333;
                    border-bottom: 2px solid #4CAF50;
                    padding-bottom: 10px;
                    margin-bottom: 20px;
                }}
                .metrics-grid {{
                    display: grid;
                    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
                    gap: 20px;
                    margin-bottom: 30px;
                }}
                .metric-card {{
                    background-color: #f8f9fa;
                    border: 1px solid #dee2e6;
                    border-radius: 8px;
                    padding: 20px;
                    text-align: center;
                    transition: transform 0.2s;
                }}
                .metric-card:hover {{
                    transform: translateY(-5px);
                    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
                }}
                .metric-number {{
                    font-size: 2em;
                    font-weight: bold;
                    color: #4CAF50;
                    margin-bottom: 5px;
                }}
                .metric-label {{
                    color: #666;
                    font-size: 0.9em;
                }}
                .footer {{
                    background-color: #f8f9fa;
                    padding: 20px;
                    text-align: center;
                    color: #666;
                    border-top: 1px solid #dee2e6;
                }}
            </style>
        </head>
        <body>
            <div class="container">
                <div class="header">
                    <h1>🏆 گزارش بنچمارک ناشناس‌سازی</h1>
                    <p>مقایسه عملکرد مدل‌های ChatGPT، Grok و Llama-3.1-8B</p>
                </div>
                
                <div class="content">
                    <div class="summary">
                        <h3>📋 خلاصه نتایج</h3>
                        <p>این گزارش نتایج بنچمارک سه مدل مختلف برای ناشناس‌سازی متون فارسی را نشان می‌دهد. 
                        متریک‌های ارزیابی شامل درستی اندیس‌گذاری، ثبات استفاده از شناسه‌ها، حفظ ساختار متن و پوشش موجودیت‌ها می‌باشد.</p>
                    </div>
                    
                    <div class="section">
                        <h2>📊 جدول مقایسه کامل</h2>
                        {self.generate_comparison_table()}
                    </div>
                    
                    <div class="section">
                        <h2>📈 نمودار مقایسه</h2>
                        {self.generate_charts()}
                    </div>
                    
                    <div class="section">
                        <h2>🔍 تحلیل تفصیلی</h2>
                        {self.generate_detailed_analysis()}
                    </div>
                </div>
                
                <div class="footer">
                    <p>گزارش تولید شده در تاریخ: {current_time}</p>
                    <p>ابزار بنچمارک ناشناس‌سازی متون فارسی</p>
                </div>
            </div>
        </body>
        </html>
        """
        
        return html
    
    def generate_detailed_analysis(self) -> str:
        """تولید تحلیل تفصیلی"""
        if not self.benchmark_results:
            return "<p>داده‌ای برای تحلیل یافت نشد</p>"
        
        # یافتن بهترین مدل در هر متریک
        best_accuracy = max(self.benchmark_results.keys(), 
                           key=lambda k: self.benchmark_results[k].accuracy)
        best_recall = max(self.benchmark_results.keys(), 
                         key=lambda k: self.benchmark_results[k].recall)
        best_precision = max(self.benchmark_results.keys(), 
                            key=lambda k: self.benchmark_results[k].precision)
        
        # محاسبه میانگین
        avg_accuracy = np.mean([m.accuracy for m in self.benchmark_results.values()])
        avg_recall = np.mean([m.recall for m in self.benchmark_results.values()])
        avg_precision = np.mean([m.precision for m in self.benchmark_results.values()])
        
        analysis = f"""
        <div class="metrics-grid">
            <div class="metric-card">
                <div class="metric-number">🎯</div>
                <div class="metric-label">بهترین Accuracy: {best_accuracy}</div>
                <div style="color: #4CAF50; font-weight: bold;">
                    {self.benchmark_results[best_accuracy].accuracy:.3f}
                </div>
            </div>
            <div class="metric-card">
                <div class="metric-number">📊</div>
                <div class="metric-label">بهترین Recall: {best_recall}</div>
                <div style="color: #2196F3; font-weight: bold;">
                    {self.benchmark_results[best_recall].recall:.3f}
                </div>
            </div>
            <div class="metric-card">
                <div class="metric-number">✅</div>
                <div class="metric-label">بهترین Precision: {best_precision}</div>
                <div style="color: #FF9800; font-weight: bold;">
                    {self.benchmark_results[best_precision].precision:.3f}
                </div>
            </div>
        </div>
        
        <div style="background-color: #e3f2fd; border: 1px solid #2196F3; border-radius: 8px; padding: 20px; margin-top: 20px;">
            <h4>📈 آمار کلی:</h4>
            <ul style="margin-top: 10px; padding-right: 20px;">
                <li><strong>میانگین Accuracy:</strong> {avg_accuracy:.3f}</li>
                <li><strong>میانگین Recall:</strong> {avg_recall:.3f}</li>
                <li><strong>میانگین Precision:</strong> {avg_precision:.3f}</li>
            </ul>
        </div>
        
        <div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; padding: 20px; margin-top: 20px;">
            <h4>💡 تفسیر نتایج:</h4>
            <ul style="margin-top: 10px; padding-right: 20px;">
                <li><strong>Accuracy:</strong> دقت کلی ناشناس‌سازی (ترکیب اندیس‌گذاری صحیح و حفظ ساختار)</li>
                <li><strong>Recall:</strong> پوشش موجودیت‌ها (چه درصدی از موجودیت‌ها شناسایی شدند)</li>
                <li><strong>Precision:</strong> دقت شناسایی (چه درصدی از شناسه‌ها صحیح هستند)</li>
            </ul>
        </div>
        """
        
        return analysis

# Gradio user interface
def create_benchmark_interface():
    """Build the Gradio Blocks UI for the benchmark and return it.

    The layout has a left column with three CSV upload fields and the
    "load" / "run" buttons, and a right column with a status line and the
    (initially hidden) HTML report. A single ``AnonymizationBenchmark``
    instance is shared by both button handlers through the closure.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    benchmark = AnonymizationBenchmark()
    
    with gr.Blocks(
        title="بنچمارک ناشناس‌سازی",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            font-family: 'Tahoma', 'Arial', sans-serif !important;
            direction: rtl;
            max-width: 1400px;
            margin: 0 auto;
        }
        .upload-box {
            border: 2px dashed #4CAF50;
            border-radius: 10px;
            padding: 20px;
            text-align: center;
            background-color: #f8f9fa;
            margin: 10px 0;
        }
        """
    ) as interface:
        
        gr.Markdown("""
        # 🏆 ابزار بنچمارک ناشناس‌سازی متون فارسی
        ### مقایسه عملکرد مدل‌های ChatGPT، Grok و Llama-3.1-8B در ناشناس‌سازی متون مالی/خبری
        """)
        
        with gr.Row():
            # Left column: file inputs and action buttons.
            with gr.Column(scale=1):
                gr.Markdown("### 📁 بارگذاری فایل‌های CSV")
                
                chatgpt_file = gr.File(
                    label="📄 فایل ChatGPT",
                    file_types=[".csv"],
                    elem_classes=["upload-box"]
                )
                
                grok_file = gr.File(
                    label="📄 فایل Grok", 
                    file_types=[".csv"],
                    elem_classes=["upload-box"]
                )
                
                llama_file = gr.File(
                    label="📄 فایل Llama-3.1-8B",
                    file_types=[".csv"],
                    elem_classes=["upload-box"]
                )
                
                load_btn = gr.Button(
                    "📂 بارگذاری فایل‌ها",
                    variant="primary",
                    size="lg"
                )
                
                # Stays disabled until load_files reports success.
                benchmark_btn = gr.Button(
                    "🚀 اجرای بنچمارک",
                    variant="secondary", 
                    size="lg",
                    interactive=False
                )
            
            # Right column: status text and the report.
            with gr.Column(scale=2):
                status_output = gr.Markdown("وضعیت: آماده بارگذاری فایل‌ها")
                
                results_html = gr.HTML(
                    label="📊 نتایج بنچمارک",
                    visible=False
                )
        
        def load_files(chatgpt, grok, llama):
            """Load the three uploads; return (status, run button, report) outputs."""
            if not all([chatgpt, grok, llama]):
                return "❌ لطفاً هر سه فایل را انتخاب کنید", gr.Button(interactive=False), gr.HTML(visible=False)
            
            # NOTE(review): relies on each uploaded value exposing `.name`,
            # presumably the temp-file path -- confirm against the installed
            # Gradio version.
            success, message = benchmark.load_csv_files(
                chatgpt.name, grok.name, llama.name
            )
            
            # The report is hidden in both cases; only the run button differs.
            if success:
                return (
                    f"✅ {message}",
                    gr.Button(interactive=True),
                    gr.HTML(visible=False)
                )
            else:
                return (
                    f"❌ {message}",
                    gr.Button(interactive=False), 
                    gr.HTML(visible=False)
                )
        
        def run_benchmark():
            """Run the benchmark; return (status, report) outputs."""
            success, message, html_report = benchmark.run_benchmark()
            
            if success:
                return (
                    f"✅ {message}",
                    gr.HTML(value=html_report, visible=True)
                )
            else:
                return (
                    f"❌ {message}",
                    gr.HTML(visible=False)
                )
        
        # Wire up the button events
        load_btn.click(
            fn=load_files,
            inputs=[chatgpt_file, grok_file, llama_file],
            outputs=[status_output, benchmark_btn, results_html]
        )
        
        benchmark_btn.click(
            fn=run_benchmark,
            outputs=[status_output, results_html]
        )
        
        # Usage guide (collapsed by default)
        with gr.Accordion("📖 راهنمای استفاده", open=False):
            gr.Markdown("""
            ### نحوه استفاده:
            1. **بارگذاری فایل‌ها:** سه فایل CSV مربوط به نتایج ناشناس‌سازی مدل‌های مختلف را انتخاب کنید
            2. **بررسی فرمت:** هر فایل باید دارای ستون‌های `original_text` و `anonymized_text` باشد
            3. **اجرای بنچمارک:** روی دکمه "اجرای بنچمارک" کلیک کنید
            4. **مشاهده نتایج:** گزارش HTML کامل با جداول و نمودارها نمایش داده می‌شود
            
            ### متریک‌های ارزیابی:
            - **درستی اندیس‌گذاری:** بررسی شروع از 01 و پیوستگی شماره‌ها
            - **ثبات شناسه‌ها:** استفاده مداوم از یک شناسه برای یک موجودیت
            - **حفظ ساختار:** حفظ واژگان مهم و ساختار جمله
            - **پوشش موجودیت‌ها:** درصد موجودیت‌های شناسایی و ناشناس شده
            - **امتیاز کلی:** ترکیب وزنی همه متریک‌ها
            """)
    
    return interface

# Script entry point: build the benchmark UI and launch it.
if __name__ == "__main__":
    interface = create_benchmark_interface()
    interface.launch()