Spaces:
Running
Running
| import numpy as np | |
| import pandas as pd | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from typing import Dict | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import os | |
def _rowwise_cosine(left, right):
    """Return the cosine similarity between matching rows of two 2-D arrays."""
    left = np.asarray(left)
    right = np.asarray(right)
    left_norms = np.linalg.norm(left, axis=1)
    right_norms = np.linalg.norm(right, axis=1)
    # A zero vector has no direction: divide by 1 instead of 0 so the pair
    # scores 0 rather than NaN.
    left_norms[left_norms == 0] = 1
    right_norms[right_norms == 0] = 1
    # Row-by-row dot product; avoids building the full N x N matrix when only
    # the prompt[i] / answer[i] pairs are needed.
    return np.einsum('ij,ij->i', left, right) / (left_norms * right_norms)


def evaluate_stability(df: pd.DataFrame, prompt_col: str, answer_col: str,
                       model_name: str = 'paraphrase-MiniLM-L6-v2',
                       progress=None) -> Dict:
    """Score how closely each answer follows its prompt in embedding space.

    The prompt and the answer of every row are embedded with a
    ``SentenceTransformer`` model and compared by cosine similarity.

    Args:
        df: Frame with one prompt/answer pair per row.
        prompt_col: Name of the column holding the prompts.
        answer_col: Name of the column holding the answers.
        model_name: Model identifier handed to ``SentenceTransformer``.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            at each stage; ignored when falsy.

    Returns:
        Dict with ``stability_score`` (mean similarity times 100),
        ``stability_std`` (standard deviation times 100) and
        ``individual_similarities`` (one similarity per row).

    Raises:
        ValueError: If ``df`` contains no rows.
    """
    def report(fraction, desc):
        if progress:
            progress(fraction, desc=desc)

    report(0, "Loading sentence transformer model...")
    prompts = df[prompt_col].tolist()
    outputs = df[answer_col].tolist()
    # Fail before the (expensive) model load when there is nothing to score.
    if not prompts:
        raise ValueError("df has no rows to evaluate")
    model = SentenceTransformer(model_name)

    report(0.3, "Encoding prompts...")
    prompt_embeddings = model.encode(prompts)

    report(0.6, "Encoding outputs...")
    output_embeddings = model.encode(outputs)

    report(0.9, "Computing similarities...")
    stability_coefficients = _rowwise_cosine(prompt_embeddings, output_embeddings)

    report(1.0, "Done!")
    return {
        'stability_score': np.mean(stability_coefficients) * 100,
        'stability_std': np.std(stability_coefficients) * 100,
        'individual_similarities': stability_coefficients,
    }
def evaluate_combined_score(creativity_df: pd.DataFrame, stability_results: Dict,
                            model_name: str) -> Dict:
    """Fold creativity and stability measurements into one summary record.

    Args:
        creativity_df: Frame with the columns "Среднее", "Креативность",
            "Разнообразие" and "Релевантность"; each one is averaged.
        stability_results: Dict containing at least ``'stability_score'``;
            it is also embedded unchanged under ``'stability_details'``.
        model_name: Label stored under ``'model'``.

    Returns:
        Dict with the model name, the two scores, their arithmetic mean
        (``'combined_score'``), a ``'%Y-%m-%d %H:%M:%S'`` timestamp taken from
        ``pd.Timestamp.now()``, and the per-column creativity means.
    """
    def column_mean(column):
        return creativity_df[column].mean()

    creativity = column_mean("Среднее")
    stability = stability_results['stability_score']
    average = (creativity + stability) / 2
    evaluated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    summary = {
        'model': model_name,
        'creativity_score': creativity,
        'stability_score': stability,
        'combined_score': average,
        'evaluation_timestamp': evaluated_at,
    }
    summary['creative_details'] = {
        'creativity': column_mean("Креативность"),
        'diversity': column_mean("Разнообразие"),
        'relevance': column_mean("Релевантность"),
    }
    summary['stability_details'] = stability_results
    return summary
def create_radar_chart(all_results):
    """Draw a radar chart comparing models and save it as a PNG.

    Args:
        all_results: Mapping of model name to a result dict that provides
            ``['creative_details']`` (keys ``'creativity'``, ``'diversity'``,
            ``'relevance'``) and ``['stability_score']``.

    Returns:
        Path of the written image, ``'results/radar_chart.png'``.
    """
    os.makedirs('results', exist_ok=True)
    radar_chart_path = 'results/radar_chart.png'

    categories = ['Креативность', 'Разнообразие', 'Релевантность', 'Стабильность']
    axis_count = len(categories)
    # One spoke per category, evenly spaced around the circle.
    angles = [n / float(axis_count) * 2 * np.pi for n in range(axis_count)]
    # Repeat the first angle so every outline ends where it started.
    closed_angles = angles + angles[:1]

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    try:
        ax.set_xticks(angles)
        ax.set_xticklabels(categories)

        for model, result in all_results.items():
            details = result['creative_details']
            values = [
                details['creativity'],
                details['diversity'],
                details['relevance'],
                result['stability_score'],
            ]
            values += values[:1]  # close the polygon
            ax.plot(closed_angles, values, linewidth=2, linestyle='solid', label=model)
            ax.fill(closed_angles, values, alpha=0.1)

        ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
        ax.set_title('Model Performance Comparison', size=15, pad=20)
        fig.savefig(radar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails; pyplot keeps
        # every open figure alive until it is closed explicitly.
        plt.close(fig)
    return radar_chart_path
def create_bar_chart(all_results):
    """Draw a grouped bar chart of per-model scores and save it as a PNG.

    Args:
        all_results: Mapping of model name to a result dict that provides
            ``'creativity_score'``, ``'stability_score'`` and
            ``'combined_score'``.

    Returns:
        Path of the written image, ``'results/bar_chart.png'``.
    """
    # Create the output directory here as well, so this function works even
    # when nothing else has created 'results/' beforehand.
    os.makedirs('results', exist_ok=True)
    bar_chart_path = 'results/bar_chart.png'

    models = list(all_results.keys())
    bar_width = 0.25
    # (result key, legend label, bar colour) for each bar within a group.
    series = [
        ('creativity_score', 'Креативность', 'skyblue'),
        ('stability_score', 'Стабильность', 'orange'),
        ('combined_score', 'Общий балл', 'green'),
    ]
    group_starts = np.arange(len(models))

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        for slot, (key, label, color) in enumerate(series):
            scores = [all_results[model][key] for model in models]
            ax.bar(group_starts + slot * bar_width, scores,
                   width=bar_width, label=label, color=color)

        ax.set_xlabel('Модели')
        ax.set_ylabel('Оценка')
        ax.set_title('Сравнение моделей по креативности и стабильности')
        # Tick under the middle bar of each three-bar group.
        ax.set_xticks(group_starts + bar_width)
        ax.set_xticklabels(models)
        ax.legend()

        fig.savefig(bar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
    return bar_chart_path
def get_leaderboard_data():
    """Load the saved benchmark results as a leaderboard table.

    Reads ``results/benchmark_results.csv`` and returns the columns "Model",
    "Креативность", "Стабильность" and "Общий балл" (scores rounded to two
    decimals), sorted by "Общий балл" in descending order.

    When the file is missing, or reading/formatting it raises, an empty frame
    is returned instead; in the failure case the error is printed.

    NOTE(review): the empty fallback frame declares six columns (including
    "Разнообразие" and "Релевантность") while the populated frame has four;
    confirm which schema the consumer expects.
    """
    benchmark_file = 'results/benchmark_results.csv'

    def empty_board():
        return pd.DataFrame(columns=[
            "Model", "Креативность", "Разнообразие", "Релевантность", "Стабильность", "Общий балл"
        ])

    if not os.path.exists(benchmark_file):
        return empty_board()

    score_columns = (
        ("Креативность", 'creativity_score'),
        ("Стабильность", 'stability_score'),
        ("Общий балл", 'combined_score'),
    )
    try:
        raw = pd.read_csv(benchmark_file)
        display = {"Model": raw['model']}
        for label, source in score_columns:
            display[label] = raw[source].round(2)
        ranked = pd.DataFrame(display).sort_values(by="Общий балл", ascending=False)
    except Exception as e:
        print(f"Error loading leaderboard data: {str(e)}")
        return empty_board()
    else:
        return ranked