"""Statistics calculation and visualization functionality""" import matplotlib.pyplot as plt import numpy as np from typing import List, Dict, Optional, Tuple class StatisticsCalculator: """Handles statistical calculations and visualization generation""" def get_score_statistics(self, scores): """Calculate mean and standard deviation of scores.""" if not scores: return None, None return np.mean(scores), np.std(scores) def create_violin_plot(self, prompt_results, user_score, user_tokens): """Create horizontal violin plots stacked vertically by token count.""" token_counts = [1, 2, 3, 4, 5] # Filter to only include token counts that have data token_data = [] for token_count in token_counts: token_scores = [r["cosine_distance"] for r in prompt_results if r["num_user_tokens"] == token_count] if token_scores: token_data.append((token_count, token_scores)) if not token_data: fig, ax = plt.subplots(figsize=(10, 4)) ax.text(0.5, 0.5, 'No data available for visualization', ha='center', va='center', transform=ax.transAxes, fontsize=14) ax.set_title('Score Distribution by Token Count', fontsize=14, fontweight='bold') return fig # Create subplots - one for each token count fig, axes = plt.subplots(len(token_data), 1, figsize=(10, 1 * len(token_data)), sharex=True) # Handle single subplot case if len(token_data) == 1: axes = [axes] for i, (token_count, scores) in enumerate(token_data): ax = axes[i] # Create horizontal violin plot parts = ax.violinplot([scores], positions=[0], vert=False, showmeans=True, showextrema=True) # Color based on whether this is user's token count - reuse existing gradient colors color = '#667eea' if token_count == user_tokens else '#764ba2' for pc in parts['bodies']: pc.set_facecolor(color) pc.set_alpha(0.7) pc.set_edgecolor('black') pc.set_linewidth(1) # Highlight user's score if this is their token count if token_count == user_tokens: ax.scatter(user_score, 0, color='red', s=150, zorder=5, marker='*', label=f'Your Score: {user_score:.3f}') ax.legend(loc='upper right') # Styling for each subplot ax.set_ylabel(f'{token_count} token{"s" if token_count != 1 else ""}\n(n={len(scores)})', fontsize=11, fontweight='bold') ax.set_yticks([]) ax.grid(True, alpha=0.3, axis='x') ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) # Set consistent y-limits for visual consistency ax.set_ylim(-0.4, 0.4) # Set common x-label only on bottom subplot axes[-1].set_xlabel('Creativity Score (Cosine Distance)', fontsize=12) # Overall title fig.suptitle('Score Distribution by Token Count', fontsize=14, fontweight='bold', y=0.98) plt.tight_layout() plt.subplots_adjust(top=0.92) return fig def calculate_session_ranking_stats(self, session_results, data_manager, scorer): """Calculate comprehensive ranking statistics for the session.""" all_results = data_manager.get_results() ranking_stats = { "best_rank": None, "best_percentile": None, "average_percentile": 0.0, "total_ranked_attempts": 0, "ranking_trend": "stable", # up, down, stable "recent_percentiles": [] } if not session_results: return ranking_stats percentiles = [] ranks = [] # Calculate rankings for each session attempt for result in session_results: # Get all results for this specific prompt prompt_results = data_manager.filter_results_by_partial_response( all_results, result["prompt"], result["llm_partial_response"] ) if len(prompt_results) >= 2: # Need at least 2 results to rank rank, percentile = scorer.calculate_rank_and_percentile( result["cosine_distance"], prompt_results, result["num_user_tokens"] ) if rank and percentile is not None: percentiles.append(percentile) ranks.append(rank) ranking_stats["recent_percentiles"].append({ "percentile": percentile, "rank": rank, "total": len([r for r in prompt_results if r["num_user_tokens"] == result["num_user_tokens"]]), "timestamp": result["timestamp"] }) if percentiles: ranking_stats["total_ranked_attempts"] = len(percentiles) ranking_stats["average_percentile"] = sum(percentiles) / len(percentiles) ranking_stats["best_percentile"] = max(percentiles) ranking_stats["best_rank"] = min(ranks) if ranks else None # Determine trend (compare first half vs second half) if len(percentiles) >= 4: mid_point = len(percentiles) // 2 first_half_avg = sum(percentiles[:mid_point]) / mid_point second_half_avg = sum(percentiles[mid_point:]) / (len(percentiles) - mid_point) if second_half_avg > first_half_avg + 10: ranking_stats["ranking_trend"] = "up" elif second_half_avg < first_half_avg - 10: ranking_stats["ranking_trend"] = "down" return ranking_stats