Spaces:

alon-albalak
/

collaborative-decoding

Sleeping

collaborative-decoding / src /scoring /statistics.py

Alon Albalak

major update: data saved to hf, user sessions maintain separation, fixed scoring bug

36d5e94 3 months ago

6.25 kB

	"""Statistics calculation and visualization functionality"""

	import matplotlib.pyplot as plt
	import numpy as np
	from typing import List, Dict, Optional, Tuple


	class StatisticsCalculator:
	    """Handles statistical calculations and visualization generation."""

	    # Token counts that get their own row in the violin plot; results whose
	    # "num_user_tokens" is not listed here are not plotted.
	    TOKEN_COUNTS = (1, 2, 3, 4, 5)

	    # Minimum gap, in percentile points, between the average of the first and
	    # second half of a session before the trend is reported as "up"/"down".
	    TREND_THRESHOLD = 10

	    def get_score_statistics(self, scores) -> Tuple[Optional[float], Optional[float]]:
	        """Calculate mean and standard deviation of scores.

	        Args:
	            scores: Sized collection of numeric scores (e.g. list or NumPy
	                array), or None.

	        Returns:
	            ``(mean, std)`` as computed by ``np.mean`` and ``np.std`` (NumPy's
	            default ``ddof=0``), or ``(None, None)`` when ``scores`` is None
	            or empty.
	        """
	        # Use len() rather than truthiness: ``not array`` raises ValueError
	        # for NumPy arrays holding more than one element.
	        if scores is None or len(scores) == 0:
	            return None, None
	        return np.mean(scores), np.std(scores)

	    def create_violin_plot(self, prompt_results: List[Dict],
	                           user_score: Optional[float], user_tokens: int):
	        """Create horizontal violin plots stacked vertically by token count.

	        One subplot row is drawn per entry of ``TOKEN_COUNTS`` that has at
	        least one score. The row matching ``user_tokens`` uses a different
	        fill color and, when ``user_score`` is given, a red star marker.

	        Args:
	            prompt_results: Result dicts; each must provide the keys
	                "num_user_tokens" and "cosine_distance".
	            user_score: Score to highlight; the marker is skipped when None.
	            user_tokens: Token count of the row to highlight.

	        Returns:
	            The matplotlib figure. When no result matches any plotted token
	            count, a placeholder figure with a "no data" message is returned.
	            The figure is created through pyplot and is not closed here.
	        """
	        # Group scores in a single pass; insertion order of the dict keeps the
	        # rows sorted the same way as TOKEN_COUNTS.
	        scores_by_count = {count: [] for count in self.TOKEN_COUNTS}
	        for result in prompt_results:
	            bucket = scores_by_count.get(result["num_user_tokens"])
	            if bucket is not None:
	                bucket.append(result["cosine_distance"])

	        # Filter to only include token counts that have data
	        token_data = [(count, scores) for count, scores in scores_by_count.items() if scores]

	        if not token_data:
	            fig, ax = plt.subplots(figsize=(10, 4))
	            ax.text(0.5, 0.5, 'No data available for visualization',
	                    ha='center', va='center', transform=ax.transAxes, fontsize=14)
	            ax.set_title('Score Distribution by Token Count', fontsize=14, fontweight='bold')
	            return fig

	        # Create subplots - one for each token count. squeeze=False always
	        # yields a 2-D array, so a single row needs no special handling.
	        fig, axes_grid = plt.subplots(len(token_data), 1,
	                                      figsize=(10, 1 * len(token_data)),
	                                      sharex=True, squeeze=False)
	        axes = axes_grid[:, 0]

	        for ax, (token_count, scores) in zip(axes, token_data):
	            is_user_row = token_count == user_tokens

	            # Create horizontal violin plot
	            parts = ax.violinplot([scores], positions=[0], vert=False,
	                                  showmeans=True, showextrema=True)

	            # Color based on whether this is user's token count
	            color = '#667eea' if is_user_row else '#764ba2'
	            for pc in parts['bodies']:
	                pc.set_facecolor(color)
	                pc.set_alpha(0.7)
	                pc.set_edgecolor('black')
	                pc.set_linewidth(1)

	            # Highlight user's score if this is their token count. Without a
	            # score there is nothing to place or format, so skip the marker.
	            if is_user_row and user_score is not None:
	                ax.scatter(user_score, 0, color='red', s=150, zorder=5,
	                           marker='*', label=f'Your Score: {user_score:.3f}')
	                ax.legend(loc='upper right')

	            # Styling for each subplot
	            ax.set_ylabel(f'{token_count} token{"s" if token_count != 1 else ""}\n(n={len(scores)})',
	                          fontsize=11, fontweight='bold')
	            ax.set_yticks([])
	            ax.grid(True, alpha=0.3, axis='x')
	            ax.spines['top'].set_visible(False)
	            ax.spines['right'].set_visible(False)
	            ax.spines['left'].set_visible(False)

	            # Set consistent y-limits for visual consistency
	            ax.set_ylim(-0.4, 0.4)

	        # Set common x-label only on bottom subplot
	        axes[-1].set_xlabel('Creativity Score (Cosine Distance)', fontsize=12)

	        # Overall title
	        fig.suptitle('Score Distribution by Token Count', fontsize=14, fontweight='bold', y=0.98)

	        # Lay out through the figure itself instead of pyplot's implicit
	        # "current figure", which may be a different figure if another one is
	        # created in the meantime.
	        fig.tight_layout()
	        fig.subplots_adjust(top=0.92)
	        return fig

	    def calculate_session_ranking_stats(self, session_results: List[Dict],
	                                        data_manager, scorer) -> Dict:
	        """Calculate comprehensive ranking statistics for the session.

	        Args:
	            session_results: Result dicts of the current session; each must
	                provide "prompt", "llm_partial_response", "cosine_distance",
	                "num_user_tokens" and "timestamp".
	            data_manager: Object providing ``get_results()`` and
	                ``filter_results_by_partial_response(results, prompt,
	                partial_response)``.
	            scorer: Object providing ``calculate_rank_and_percentile(score,
	                prompt_results, num_user_tokens)`` returning
	                ``(rank, percentile)``.

	        Returns:
	            Dict with the keys "best_rank", "best_percentile",
	            "average_percentile", "total_ranked_attempts", "ranking_trend"
	            ("up", "down" or "stable") and "recent_percentiles" (one entry
	            per ranked attempt). Attempts with fewer than two comparable
	            results, or for which the scorer returns None, are not counted.
	        """
	        ranking_stats = {
	            "best_rank": None,
	            "best_percentile": None,
	            "average_percentile": 0.0,
	            "total_ranked_attempts": 0,
	            "ranking_trend": "stable",  # up, down, stable
	            "recent_percentiles": [],
	        }

	        if not session_results:
	            return ranking_stats

	        # Loaded only after the guard above so an empty session does not
	        # trigger a fetch of every stored result.
	        all_results = data_manager.get_results()

	        percentiles = []
	        ranks = []

	        # Calculate rankings for each session attempt
	        for result in session_results:
	            # Get all results for this specific prompt
	            prompt_results = data_manager.filter_results_by_partial_response(
	                all_results, result["prompt"], result["llm_partial_response"]
	            )

	            # Need at least 2 results to rank
	            if len(prompt_results) < 2:
	                continue

	            num_tokens = result["num_user_tokens"]
	            rank, percentile = scorer.calculate_rank_and_percentile(
	                result["cosine_distance"], prompt_results, num_tokens
	            )
	            # Explicit None checks: a truthiness test would also discard a
	            # rank of 0.
	            if rank is None or percentile is None:
	                continue

	            percentiles.append(percentile)
	            ranks.append(rank)
	            ranking_stats["recent_percentiles"].append({
	                "percentile": percentile,
	                "rank": rank,
	                # Number of results for this prompt with the same token count
	                "total": sum(1 for r in prompt_results if r["num_user_tokens"] == num_tokens),
	                "timestamp": result["timestamp"],
	            })

	        if not percentiles:
	            return ranking_stats

	        ranking_stats["total_ranked_attempts"] = len(percentiles)
	        ranking_stats["average_percentile"] = sum(percentiles) / len(percentiles)
	        ranking_stats["best_percentile"] = max(percentiles)
	        ranking_stats["best_rank"] = min(ranks)

	        # Determine trend (compare first half vs second half). Halves follow
	        # the order of session_results, which is not re-sorted here.
	        # NOTE(review): presumably session_results is chronological - confirm.
	        if len(percentiles) >= 4:
	            mid_point = len(percentiles) // 2
	            first_half_avg = sum(percentiles[:mid_point]) / mid_point
	            second_half_avg = sum(percentiles[mid_point:]) / (len(percentiles) - mid_point)

	            if second_half_avg > first_half_avg + self.TREND_THRESHOLD:
	                ranking_stats["ranking_trend"] = "up"
	            elif second_half_avg < first_half_avg - self.TREND_THRESHOLD:
	                ranking_stats["ranking_trend"] = "down"

	        return ranking_stats