Alex
zalupa1
982b341
raw
history blame
10.9 kB
"""
Display utilities for the CodeReview Leaderboard
"""
from typing import List, Dict, Any, Optional, Tuple
import json
from datetime import datetime, timezone
from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
from src.display.formatting import format_table_cell, format_timestamp
def filter_leaderboard_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1",
    sort_order: str = "desc"
) -> List[Dict]:
    """Filter and sort leaderboard data based on criteria.

    Args:
        data: Leaderboard entries (dicts). The input sequence is not mutated;
            a new list holding the same entry objects is returned.
        programming_language: Keep only entries whose "programming_language"
            matches case-insensitively; "All" disables the filter.
        comment_language: Same, for the "comment_language" field.
        taxonomy_category: Same, for the "taxonomy_category" field.
        sort_by: Field to sort on. Score fields ("bleu", "llm_pass_1",
            "llm_pass_5", "llm_pass_10") and names in QUALITY_METRICS (looked
            up under the entry's "metrics" dict) are compared by value, with
            missing or None values treated as 0. Any other field is compared
            by its string form.
        sort_order: "desc" (case-insensitive) sorts descending; anything else
            sorts ascending.

    Returns:
        The filtered, sorted list of entries ([] when ``data`` is empty).
    """
    if not data:
        return []

    def _none_as_zero(value):
        # A key that is present but None must sort like a missing one.
        return 0 if value is None else value

    def _numeric_or_zero(value):
        # Used only by the fallback sort so that it can never raise.
        return value if isinstance(value, (int, float)) else 0

    # Apply filters
    filtered_data = list(data)
    requested_filters = (
        ("programming_language", programming_language),
        ("comment_language", comment_language),
        ("taxonomy_category", taxonomy_category),
    )
    for field, wanted in requested_filters:
        if wanted == "All":
            continue
        wanted_lower = wanted.lower()
        filtered_data = [
            entry for entry in filtered_data
            # `or ""` tolerates an explicit None stored under the key.
            if str(entry.get(field) or "").lower() == wanted_lower
        ]

    # Pick the sort key
    if sort_by in ("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"):
        def sort_key(entry):
            return _none_as_zero(entry.get(sort_by))
    elif sort_by in QUALITY_METRICS:
        def sort_key(entry):
            return _none_as_zero((entry.get("metrics") or {}).get(sort_by))
    else:
        def sort_key(entry):
            return str(entry.get(sort_by, ""))

    # Sort data
    reverse = sort_order.lower() == "desc"
    try:
        filtered_data.sort(key=sort_key, reverse=reverse)
    except (TypeError, AttributeError) as e:
        # Values of mixed, incomparable types (or a malformed "metrics"
        # object): report and fall back to the default ordering.
        print(f"Error sorting data: {e}")
        # Default sort by pass@1; the key is coerced so this cannot fail too.
        filtered_data.sort(
            key=lambda entry: _numeric_or_zero(entry.get("llm_pass_1")),
            reverse=True,
        )
    return filtered_data
def get_main_leaderboard_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1"
) -> List[List[str]]:
    """Build the rows of the main leaderboard table.

    The entries are first narrowed and ordered by ``filter_leaderboard_data``
    (using its default sort order), then each entry becomes one row of eight
    cells: model, programming language, comment language, taxonomy, BLEU and
    pass@1/5/10. Every cell is produced by ``format_table_cell``.

    Args:
        data: Leaderboard entries (dicts).
        programming_language: Programming-language filter ("All" = no filter).
        comment_language: Comment-language filter ("All" = no filter).
        taxonomy_category: Taxonomy filter ("All" = no filter).
        sort_by: Field name forwarded to ``filter_leaderboard_data``.

    Returns:
        One list of formatted cells per remaining entry.
    """
    # (entry key, value used when the key is absent, label given to the formatter)
    column_spec = (
        ("model_name", "", "model"),
        ("programming_language", "", "programming language"),
        ("comment_language", "", "comment language"),
        ("taxonomy_category", "", "taxonomy"),
        ("bleu", 0, "bleu"),
        ("llm_pass_1", 0, "pass@1"),
        ("llm_pass_5", 0, "pass@5"),
        ("llm_pass_10", 0, "pass@10"),
    )
    selected = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, sort_by
    )
    rows = []
    for record in selected:
        rows.append([
            format_table_cell(record.get(key, fallback), label)
            for key, fallback, label in column_spec
        ])
    return rows
def get_quality_metrics_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1"
) -> List[List[str]]:
    """Build the rows of the quality-metrics table.

    Entries are narrowed and ordered by ``filter_leaderboard_data``. Each row
    starts with the formatted model name, followed by one formatted cell per
    name in ``QUALITY_METRICS`` (read from the entry's "metrics" dict, 0 when
    absent). The label handed to ``format_table_cell`` is the metric name with
    underscores replaced by spaces.

    Args:
        data: Leaderboard entries (dicts).
        programming_language: Programming-language filter ("All" = no filter).
        comment_language: Comment-language filter ("All" = no filter).
        taxonomy_category: Taxonomy filter ("All" = no filter).
        sort_by: Field name forwarded to ``filter_leaderboard_data``.

    Returns:
        One list of formatted cells per remaining entry.
    """
    selected = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, sort_by
    )
    rows = []
    for record in selected:
        scores = record.get("metrics", {})
        cells = [format_table_cell(record.get("model_name", ""), "model")]
        cells.extend(
            format_table_cell(scores.get(name, 0), name.replace("_", " "))
            for name in QUALITY_METRICS
        )
        rows.append(cells)
    return rows
def get_submission_history_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    limit: int = 50
) -> List[List[str]]:
    """Get formatted submission history data.

    Entries are filtered by ``filter_leaderboard_data``, which is asked to
    sort on "submission_date" in "desc" order, and the first ``limit`` of
    them are turned into rows.

    Args:
        data: Leaderboard entries (dicts).
        programming_language: Programming-language filter ("All" = no filter).
        comment_language: Comment-language filter ("All" = no filter).
        taxonomy_category: Taxonomy filter ("All" = no filter).
        limit: Maximum number of rows (applied as a slice bound).

    Returns:
        One row per entry: model, programming language, comment language,
        taxonomy, pass@1, formatted submission timestamp and the masked
        submitter IP ("Unknown" when no IP is stored).
    """
    filtered_data = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
    )

    table_rows = []
    # Limit results
    for entry in filtered_data[:limit]:
        row = [
            format_table_cell(entry.get("model_name", ""), "model"),
            format_table_cell(entry.get("programming_language", ""), "programming language"),
            format_table_cell(entry.get("comment_language", ""), "comment language"),
            format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
            format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
            format_timestamp(entry.get("submission_date", "")),
            _mask_submission_ip(entry.get("submission_ip")),
        ]
        table_rows.append(row)
    return table_rows


def _mask_submission_ip(ip: Any) -> str:
    """Return ``ip`` with everything after its first group replaced by placeholders."""
    if not ip:
        return "Unknown"
    text = str(ip)
    if ":" in text and "." not in text:
        # Pure IPv6 has no "." to split on; splitting on "." would return
        # the whole address and publish it unmasked.
        return text.split(":")[0] + ":xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx"
    # IPv4 (and IPv4-mapped IPv6): keep only what precedes the first dot.
    return text.split(".")[0] + ".xxx.xxx.xxx"
def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
    """Get summary statistics for the leaderboard.

    Args:
        data: Leaderboard entries (dicts).

    Returns:
        A dict with:
          - "total_models": number of distinct "model_name" values
            (entries without a name count together as "").
          - "total_submissions": number of entries.
          - "avg_pass_1": mean "llm_pass_1" over entries that have a
            non-None score, 0 when there are none.
          - "best_model": "model_name" of the entry with the highest
            "llm_pass_1" (a missing or None score counts as 0), or "None"
            when that entry has no name.
          - "languages_covered" / "categories_covered": number of distinct
            non-empty "programming_language" / "taxonomy_category" values.
        For empty ``data`` all counts are 0 and "best_model" is "None".
    """
    if not data:
        return {
            "total_models": 0,
            "total_submissions": 0,
            "avg_pass_1": 0,
            "best_model": "None",
            "languages_covered": 0,
            "categories_covered": 0
        }

    def _pass_1(entry):
        # An explicit None score must not reach max(): None cannot be
        # compared with numbers and would raise TypeError.
        score = entry.get("llm_pass_1")
        return 0 if score is None else score

    # Calculate statistics
    model_names = {entry.get("model_name", "") for entry in data}
    pass_1_scores = [
        entry["llm_pass_1"] for entry in data if entry.get("llm_pass_1") is not None
    ]
    avg_pass_1 = sum(pass_1_scores) / len(pass_1_scores) if pass_1_scores else 0
    best_entry = max(data, key=_pass_1)
    languages = {
        entry["programming_language"] for entry in data if entry.get("programming_language")
    }
    categories = {
        entry["taxonomy_category"] for entry in data if entry.get("taxonomy_category")
    }
    return {
        "total_models": len(model_names),
        "total_submissions": len(data),
        "avg_pass_1": avg_pass_1,
        "best_model": best_entry.get("model_name", "None"),
        "languages_covered": len(languages),
        "categories_covered": len(categories)
    }
def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
    """Validate submission data.

    Checks run in this order and stop at the first failure:
      1. "model_name", "programming_language", "comment_language" and
         "taxonomy_category" must be present and truthy.
      2. "bleu", "llm_pass_1", "llm_pass_5" and "llm_pass_10" must be real
         numbers (int or float, not bool) within [0, 1].
      3. Every name in QUALITY_METRICS must be present in ``data["metrics"]``
         as a real number within [0, 10].
      4. The language and category values must appear in
         PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES and TAXONOMY_CATEGORIES.

    Args:
        data: The submission as a dict.

    Returns:
        ``(True, "Valid submission")`` on success, otherwise
        ``(False, <message naming the first problem>)``.
    """
    def _is_number(value):
        # bool is a subclass of int; True/False are not meaningful scores.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # Check required fields
    for field in ("model_name", "programming_language", "comment_language", "taxonomy_category"):
        if not data.get(field):
            return False, f"Missing required field: {field}"

    # Validate scores
    for field in ("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"):
        value = data.get(field)
        if value is None:
            return False, f"Missing score: {field}"
        if not _is_number(value):
            return False, f"Invalid score format: {field}"
        if not 0 <= value <= 1:
            return False, f"Score out of range (0-1): {field}"

    # Validate metrics
    metrics = data.get("metrics")
    if not isinstance(metrics, dict):
        # A missing, None or malformed "metrics" value is treated as empty so
        # the loop below reports the first missing metric instead of raising.
        metrics = {}
    for metric in QUALITY_METRICS:
        value = metrics.get(metric)
        if value is None:
            return False, f"Missing metric: {metric}"
        if not _is_number(value):
            return False, f"Invalid metric format: {metric}"
        if not 0 <= value <= 10:
            return False, f"Metric out of range (0-10): {metric}"

    # Validate language and category choices
    if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
        return False, "Invalid programming language"
    if data.get("comment_language") not in COMMENT_LANGUAGES:
        return False, "Invalid comment language"
    if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
        return False, "Invalid taxonomy category"
    return True, "Valid submission"
def get_leaderboard_insights(data: List[Dict]) -> Dict[str, Any]:
    """Get insights and trends from leaderboard data.

    Args:
        data: Leaderboard entries (dicts).

    Returns:
        ``{}`` for empty ``data``; otherwise a dict with:
          - "language_performance": per programming language, the average
            "llm_pass_1", the number of entries and the best entry's model.
          - "category_performance": the same, per taxonomy category.
          - "top_performers": up to five entries with the highest
            "llm_pass_1".
        A missing or None "llm_pass_1" counts as 0 throughout.
    """
    if not data:
        return {}
    return {
        # The first element of each choice list is skipped; per the original
        # comment it is the "All" placeholder rather than a real value.
        "language_performance": _summarize_by_field(
            data, "programming_language", PROGRAMMING_LANGUAGES[1:]
        ),
        "category_performance": _summarize_by_field(
            data, "taxonomy_category", TAXONOMY_CATEGORIES[1:]
        ),
        "top_performers": sorted(data, key=_pass_1_score, reverse=True)[:5]
    }


def _pass_1_score(entry: Dict) -> Any:
    """Return the entry's "llm_pass_1", with missing or None mapped to 0."""
    score = entry.get("llm_pass_1")
    return 0 if score is None else score


def _summarize_by_field(data: List[Dict], field: str, values: List[str]) -> Dict[str, Dict[str, Any]]:
    """Summarize pass@1 for each of ``values`` found (exact match) under ``field``."""
    summary = {}
    for value in values:
        group = [entry for entry in data if entry.get(field) == value]
        if not group:
            # Values with no entries are left out of the summary entirely.
            continue
        summary[value] = {
            "avg_score": sum(_pass_1_score(entry) for entry in group) / len(group),
            "model_count": len(group),
            "best_model": max(group, key=_pass_1_score).get("model_name", "")
        }
    return summary
def export_leaderboard_data(data: List[Dict], format_type: str = "json") -> str:
    """Export leaderboard data in specified format.

    Args:
        data: Leaderboard entries (dicts).
        format_type: "json" or "csv" (case-insensitive).

    Returns:
        - "json": ``data`` serialized with two-space indentation and
          non-ASCII characters kept as-is.
        - "csv": a header line followed by one line per entry, joined by
          "\\n" with no trailing newline; "" when ``data`` is empty. Columns
          are the identity and score fields followed by every name in
          QUALITY_METRICS (read from the entry's "metrics" dict). Missing
          values become empty cells. Cells containing commas, quotes or
          line breaks are quoted according to the usual CSV rules.
        - anything else: the string "Unsupported format".
    """
    fmt = format_type.lower()
    if fmt == "json":
        return json.dumps(data, indent=2, ensure_ascii=False)
    if fmt != "csv":
        return "Unsupported format"
    if not data:
        return ""

    # Imported here so the block does not depend on a module-level import.
    import csv
    import io

    # Get headers
    headers = ["model_name", "programming_language", "comment_language", "taxonomy_category",
               "bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
    headers.extend(QUALITY_METRICS)

    buffer = io.StringIO()
    # csv.writer quotes cells that contain the delimiter, quotes or line
    # breaks; a bare ",".join would silently shift columns for such values.
    writer = csv.writer(buffer, lineterminator="\n")
    writer.writerow(headers)
    for entry in data:
        metrics = entry.get("metrics") or {}
        row = []
        for header in headers:
            if header in QUALITY_METRICS:
                value = metrics.get(header, "")
            else:
                value = entry.get(header, "")
            # str() keeps the previous rendering of non-string values
            # (csv.writer on its own would turn None into an empty cell).
            row.append(str(value))
        writer.writerow(row)
    # Drop the terminator after the last row: lines are separated, not ended.
    return buffer.getvalue()[:-1]