Spaces:

MrSimple01
/

RuSimulBench_arena

Sleeping

App Files Files Community

MrSimple01 committed on Mar 20, 2025

Commit

5465a38

verified ·

1 Parent(s): 908acb6

Create evaluate_stability.py

Browse files

Files changed (1) hide show

evaluate_stability.py +175 -0

evaluate_stability.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from typing import Dict
+import matplotlib.pyplot as plt
+import seaborn as sns
+import os
+def evaluate_stability(df: pd.DataFrame, prompt_col: str, answer_col: str,
+                       model_name: str = 'paraphrase-MiniLM-L6-v2',
+                       progress=None) -> Dict:
+    if progress:
+        progress(0, desc="Loading sentence transformer model...")
+    model = SentenceTransformer(model_name)
+    prompts = df[prompt_col].tolist()
+    outputs = df[answer_col].tolist()
+    if progress:
+        progress(0.3, desc="Encoding prompts...")
+    prompt_embeddings = model.encode(prompts)
+    if progress:
+        progress(0.6, desc="Encoding outputs...")
+    output_embeddings = model.encode(outputs)
+    if progress:
+        progress(0.9, desc="Computing similarities...")
+    similarities = cosine_similarity(prompt_embeddings, output_embeddings)
+    stability_coefficients = np.diag(similarities)
+    if progress:
+        progress(1.0, desc="Done!")
+    return {
+        'stability_score': np.mean(stability_coefficients) * 100,
+        'stability_std': np.std(stability_coefficients) * 100,
+        'individual_similarities': stability_coefficients
+    }
+def evaluate_combined_score(creativity_df: pd.DataFrame, stability_results: Dict,
+                           model_name: str) -> Dict:
+    creative_score = creativity_df["Среднее"].mean()
+    stability_score = stability_results['stability_score']
+    combined_score = (creative_score + stability_score) / 2
+    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
+    return {
+        'model': model_name,
+        'creativity_score': creative_score,
+        'stability_score': stability_score,
+        'combined_score': combined_score,
+        'evaluation_timestamp': timestamp,
+        'creative_details': {
+            'creativity': creativity_df["Креативность"].mean(),
+            'diversity': creativity_df["Разнообразие"].mean(),
+            'relevance': creativity_df["Релевантность"].mean(),
+        },
+        'stability_details': stability_results
+    }
+def create_radar_chart(all_results):
+    os.makedirs('results', exist_ok=True)
+    # Extract data for radar chart
+    categories = ['Креативность', 'Разнообразие', 'Релевантность', 'Стабильность']
+    models = list(all_results.keys())
+    # Create figure and polar axis
+    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
+    # Number of variables
+    N = len(categories)
+    # Angle of each axis
+    angles = [n / float(N) * 2 * np.pi for n in range(N)]
+    angles += angles[:1]  # Close the polygon
+    # Set the labels
+    ax.set_xticks(angles[:-1])
+    ax.set_xticklabels(categories)
+    # Draw the polygons for each model
+    for i, model in enumerate(models):
+        values = [
+            all_results[model]['creative_details']['creativity'],
+            all_results[model]['creative_details']['diversity'],
+            all_results[model]['creative_details']['relevance'],
+            all_results[model]['stability_score']
+        ]
+        # Add the first value again to close the polygon
+        values += values[:1]
+        # Plot values
+        ax.plot(angles, values, linewidth=2, linestyle='solid', label=model)
+        ax.fill(angles, values, alpha=0.1)
+    # Add legend
+    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
+    # Add title
+    plt.title('Model Performance Comparison', size=15, pad=20)
+    # Save the chart
+    radar_chart_path = 'results/radar_chart.png'
+    plt.savefig(radar_chart_path, dpi=300, bbox_inches='tight')
+    plt.close()
+    return radar_chart_path
+def create_bar_chart(all_results):
+    # Extract data for bar chart
+    models = list(all_results.keys())
+    creative_scores = [all_results[model]['creativity_score'] for model in models]
+    stability_scores = [all_results[model]['stability_score'] for model in models]
+    combined_scores = [all_results[model]['combined_score'] for model in models]
+    # Create figure
+    fig, ax = plt.subplots(figsize=(12, 8))
+    # Set bar width
+    bar_width = 0.25
+    # Set bar positions
+    r1 = np.arange(len(models))
+    r2 = [x + bar_width for x in r1]
+    r3 = [x + bar_width for x in r2]
+    # Create bars
+    ax.bar(r1, creative_scores, width=bar_width, label='Креативность', color='skyblue')
+    ax.bar(r2, stability_scores, width=bar_width, label='Стабильность', color='orange')
+    ax.bar(r3, combined_scores, width=bar_width, label='Общий балл', color='green')
+    # Add labels and title
+    ax.set_xlabel('Модели')
+    ax.set_ylabel('Оценка')
+    ax.set_title('Сра��нение моделей по креативности и стабильности')
+    ax.set_xticks([r + bar_width for r in range(len(models))])
+    ax.set_xticklabels(models)
+    # Add legend
+    ax.legend()
+    # Save the chart
+    bar_chart_path = 'results/bar_chart.png'
+    plt.savefig(bar_chart_path, dpi=300, bbox_inches='tight')
+    plt.close()
+    return bar_chart_path
+def get_leaderboard_data():
+    benchmark_file = 'results/benchmark_results.csv'
+    if not os.path.exists(benchmark_file):
+        return pd.DataFrame(columns=[
+            "Model", "Креативность", "Разнообразие", "Релевантность", "Стабильность", "Общий балл"
+        ])
+    try:
+        df = pd.read_csv(benchmark_file)
+        # Format the dataframe for display
+        formatted_df = pd.DataFrame({
+            "Model": df['model'],
+            "Креативность": df['creativity_score'].round(2),
+            "Стабильность": df['stability_score'].round(2),
+            "Общий балл": df['combined_score'].round(2)
+        })
+        return formatted_df.sort_values(by="Общий балл", ascending=False)
+    except Exception as e:
+        print(f"Error loading leaderboard data: {str(e)}")
+        return pd.DataFrame(columns=[
+            "Model", "Креативность", "Разнообразие", "Релевантность", "Стабильность", "Общий балл"
+        ])