Spaces:
Sleeping
Sleeping
File size: 10,946 Bytes
982b341 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 | """
Display utilities for the CodeReview Leaderboard
"""
from typing import List, Dict, Any, Optional, Tuple
import json
from datetime import datetime, timezone
from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
from src.display.formatting import format_table_cell, format_timestamp
def filter_leaderboard_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1",
    sort_order: str = "desc"
) -> List[Dict]:
    """Filter leaderboard entries and return them sorted.

    Args:
        data: Leaderboard entries (dicts). The list itself is not modified;
            the returned list holds the same entry objects.
        programming_language: Keep only entries whose "programming_language"
            matches (case-insensitive). "All" disables this filter.
        comment_language: Same, for the "comment_language" field.
        taxonomy_category: Same, for the "taxonomy_category" field.
        sort_by: Sort key. "bleu" and the "llm_pass_*" scores are read from
            the entry itself, names listed in QUALITY_METRICS are read from
            the entry's "metrics" dict, and anything else is compared as a
            string.
        sort_order: "desc" (case-insensitive) sorts descending; any other
            value sorts ascending.

    Returns:
        A new list with the matching entries in the requested order.
    """
    if not data:
        return []

    def _field_matches(entry, field, wanted):
        # A missing or None field is treated as an empty string so that one
        # incomplete entry cannot crash the whole filter.
        return str(entry.get(field) or "").lower() == wanted.lower()

    # Only the filters that are actually narrowing the result are applied.
    active_filters = [
        (field, wanted)
        for field, wanted in (
            ("programming_language", programming_language),
            ("comment_language", comment_language),
            ("taxonomy_category", taxonomy_category),
        )
        if wanted != "All"
    ]
    filtered_data = [
        entry for entry in data
        if all(_field_matches(entry, field, wanted) for field, wanted in active_filters)
    ]

    def _or_zero(value):
        # An explicit None score sorts like a missing score (0); comparing
        # None with a number would raise TypeError.
        return 0 if value is None else value

    def _text(value):
        # None sorts as an empty string rather than as the text "None".
        return "" if value is None else str(value)

    def _fallback_pass_1(entry):
        # Key for the fallback sort: anything that is not a plain number
        # counts as 0, so the fallback itself cannot fail on mixed types.
        value = entry.get("llm_pass_1")
        if isinstance(value, (int, float)):
            return value
        return 0

    reverse = sort_order.lower() == "desc"
    try:
        if sort_by in ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]:
            filtered_data.sort(key=lambda x: _or_zero(x.get(sort_by)), reverse=reverse)
        elif sort_by in QUALITY_METRICS:
            filtered_data.sort(
                key=lambda x: _or_zero((x.get("metrics") or {}).get(sort_by)),
                reverse=reverse,
            )
        else:
            filtered_data.sort(key=lambda x: _text(x.get(sort_by)), reverse=reverse)
    except Exception as e:
        # Best effort: a malformed entry must not break the display, so
        # report the problem and fall back to pass@1, best first.
        print(f"Error sorting data: {e}")
        filtered_data.sort(key=_fallback_pass_1, reverse=True)
    return filtered_data
def get_main_leaderboard_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1"
) -> List[List[str]]:
    """Build the rows of the main leaderboard table.

    The entries are filtered and sorted by ``filter_leaderboard_data``
    (using its default sort order). Each remaining entry becomes one row of
    eight cells: model, programming language, comment language, taxonomy,
    BLEU, pass@1, pass@5 and pass@10, each passed through
    ``format_table_cell``.
    """
    # (entry key, value used when the key is absent, cell type for the formatter)
    column_specs = (
        ("model_name", "", "model"),
        ("programming_language", "", "programming language"),
        ("comment_language", "", "comment language"),
        ("taxonomy_category", "", "taxonomy"),
        ("bleu", 0, "bleu"),
        ("llm_pass_1", 0, "pass@1"),
        ("llm_pass_5", 0, "pass@5"),
        ("llm_pass_10", 0, "pass@10"),
    )
    selected = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, sort_by
    )
    return [
        [
            format_table_cell(entry.get(key, fallback), cell_type)
            for key, fallback, cell_type in column_specs
        ]
        for entry in selected
    ]
def get_quality_metrics_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1"
) -> List[List[str]]:
    """Build the rows of the quality-metrics table.

    The entries are filtered and sorted by ``filter_leaderboard_data``.
    Each row starts with the formatted model name, followed by one formatted
    cell per name in QUALITY_METRICS, read from the entry's "metrics" dict
    (0 when the metric is absent). The cell type handed to
    ``format_table_cell`` is the metric name with underscores replaced by
    spaces.
    """
    selected = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, sort_by
    )
    rows = []
    for entry in selected:
        scores = entry.get("metrics", {})
        name_cell = format_table_cell(entry.get("model_name", ""), "model")
        metric_cells = [
            format_table_cell(scores.get(name, 0), name.replace("_", " "))
            for name in QUALITY_METRICS
        ]
        rows.append([name_cell, *metric_cells])
    return rows
def _mask_submission_ip(ip_address: Any) -> str:
    """Return a display-safe form of a submitter address.

    Only the first group of the address is kept. A falsy value yields
    "Unknown".
    """
    if not ip_address:
        return "Unknown"
    text = str(ip_address)
    if ":" in text:
        # IPv6 addresses contain no dots, so splitting on "." would return
        # (and therefore expose) the complete address. Split on ":" instead.
        return text.split(":")[0] + ":xxxx:xxxx:xxxx"
    return text.split(".")[0] + ".xxx.xxx.xxx"


def get_submission_history_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    limit: int = 50
) -> List[List[str]]:
    """Build the rows of the submission-history table.

    The entries are filtered by ``filter_leaderboard_data`` and sorted by
    "submission_date" in descending order, then cut to the first ``limit``
    entries. Each row holds: model, programming language, comment language,
    taxonomy, pass@1, the formatted submission timestamp, and the masked
    submitter address ("Unknown" when the entry has no "submission_ip").
    """
    filtered_data = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
    )
    # Limit results
    filtered_data = filtered_data[:limit]
    table_rows = []
    for entry in filtered_data:
        row = [
            format_table_cell(entry.get("model_name", ""), "model"),
            format_table_cell(entry.get("programming_language", ""), "programming language"),
            format_table_cell(entry.get("comment_language", ""), "comment language"),
            format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
            format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
            format_timestamp(entry.get("submission_date", "")),
            _mask_submission_ip(entry.get("submission_ip")),
        ]
        table_rows.append(row)
    return table_rows
def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
    """Compute summary statistics for the leaderboard.

    Args:
        data: Leaderboard entries (dicts).

    Returns:
        A dict with these keys:
        - "total_models": number of distinct "model_name" values
        - "total_submissions": number of entries
        - "avg_pass_1": mean "llm_pass_1" over the entries that have a
          non-None score (0 when no entry has one)
        - "best_model": "model_name" of the entry with the highest
          "llm_pass_1" ("None" for empty input)
        - "languages_covered": number of distinct non-empty
          "programming_language" values
        - "categories_covered": number of distinct non-empty
          "taxonomy_category" values
    """
    if not data:
        return {
            "total_models": 0,
            "total_submissions": 0,
            "avg_pass_1": 0,
            "best_model": "None",
            "languages_covered": 0,
            "categories_covered": 0
        }

    def _pass_1_or_zero(entry):
        # A missing or explicit None score ranks as 0; comparing None with
        # a number inside max() would raise TypeError.
        score = entry.get("llm_pass_1")
        return 0 if score is None else score

    # Entries without a score are left out of the average entirely.
    known_scores = [
        entry.get("llm_pass_1") for entry in data
        if entry.get("llm_pass_1") is not None
    ]
    avg_pass_1 = sum(known_scores) / len(known_scores) if known_scores else 0

    # data is non-empty here, so max() always has something to pick.
    best_entry = max(data, key=_pass_1_or_zero)

    languages = {
        entry.get("programming_language") for entry in data
        if entry.get("programming_language")
    }
    categories = {
        entry.get("taxonomy_category") for entry in data
        if entry.get("taxonomy_category")
    }
    return {
        "total_models": len({entry.get("model_name", "") for entry in data}),
        "total_submissions": len(data),
        "avg_pass_1": avg_pass_1,
        "best_model": best_entry.get("model_name", "None"),
        "languages_covered": len(languages),
        "categories_covered": len(categories)
    }
def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
    """Validate a submission dict.

    Checks run in this order and stop at the first failure:
    1. the required text fields are present and non-empty;
    2. "bleu" and the "llm_pass_*" scores are numbers in [0, 1];
    3. every name in QUALITY_METRICS has a number in [0, 10] in the
       "metrics" dict;
    4. the language and category values are members of
       PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES and TAXONOMY_CATEGORIES.

    Returns:
        ``(True, "Valid submission")`` on success, otherwise ``(False,
        reason)`` where ``reason`` describes the first failed check.
    """
    def _is_number(value):
        # bool is a subclass of int, so True/False would otherwise be
        # accepted as the scores 1 and 0.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # Check required fields
    for field in ("model_name", "programming_language", "comment_language", "taxonomy_category"):
        if not data.get(field):
            return False, f"Missing required field: {field}"

    # Validate scores
    for field in ("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"):
        value = data.get(field)
        if value is None:
            return False, f"Missing score: {field}"
        if not _is_number(value):
            return False, f"Invalid score format: {field}"
        if not 0 <= value <= 1:
            return False, f"Score out of range (0-1): {field}"

    # Validate metrics. An explicit None is handled like an absent dict so
    # that the per-metric "Missing metric" message is produced instead of an
    # AttributeError.
    metrics = data.get("metrics") or {}
    if not isinstance(metrics, dict):
        return False, "Invalid metrics format"
    for metric in QUALITY_METRICS:
        value = metrics.get(metric)
        if value is None:
            return False, f"Missing metric: {metric}"
        if not _is_number(value):
            return False, f"Invalid metric format: {metric}"
        if not 0 <= value <= 10:
            return False, f"Metric out of range (0-10): {metric}"

    # Validate language and category choices
    if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
        return False, "Invalid programming language"
    if data.get("comment_language") not in COMMENT_LANGUAGES:
        return False, "Invalid comment language"
    if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
        return False, "Invalid taxonomy category"
    return True, "Valid submission"
def _insight_pass_1(entry: Dict) -> Any:
    """Return the entry's "llm_pass_1" score, counting missing or None as 0."""
    score = entry.get("llm_pass_1")
    return 0 if score is None else score


def _performance_by_field(data: List[Dict], field: str, values) -> Dict[str, Any]:
    """Summarise pass@1 per value of ``field``, skipping values with no entries."""
    summary = {}
    for value in values:
        group = [entry for entry in data if entry.get(field) == value]
        if not group:
            continue
        summary[value] = {
            "avg_score": sum(_insight_pass_1(entry) for entry in group) / len(group),
            "model_count": len(group),
            "best_model": max(group, key=_insight_pass_1).get("model_name", ""),
        }
    return summary


def get_leaderboard_insights(data: List[Dict]) -> Dict[str, Any]:
    """Compute per-language and per-category insights from leaderboard data.

    Args:
        data: Leaderboard entries (dicts).

    Returns:
        An empty dict for empty input. Otherwise a dict with:
        - "language_performance": for each programming language that has at
          least one entry, its average pass@1 ("avg_score"), number of
          entries ("model_count") and the "model_name" of the highest-scoring
          entry ("best_model")
        - "category_performance": the same summary per taxonomy category
        - "top_performers": the five entries with the highest pass@1

        A missing or None "llm_pass_1" counts as 0 throughout.
    """
    if not data:
        return {}
    return {
        # The first element of each choice list is skipped ("All").
        "language_performance": _performance_by_field(
            data, "programming_language", PROGRAMMING_LANGUAGES[1:]
        ),
        "category_performance": _performance_by_field(
            data, "taxonomy_category", TAXONOMY_CATEGORIES[1:]
        ),
        "top_performers": sorted(data, key=_insight_pass_1, reverse=True)[:5]
    }
def export_leaderboard_data(data: List[Dict], format_type: str = "json") -> str:
    """Export leaderboard data as text in the requested format.

    Args:
        data: Leaderboard entries (dicts).
        format_type: "json" or "csv" (case-insensitive).

    Returns:
        - "json": the entries serialised with a 2-space indent and non-ASCII
          characters left unescaped.
        - "csv": a header line followed by one line per entry, with no
          trailing newline; an empty string when ``data`` is empty. Columns
          are the model/language/category fields, the four scores, then
          every name in QUALITY_METRICS (read from the entry's "metrics"
          dict). Absent values are written as empty cells.
        - anything else: the string "Unsupported format".
    """
    kind = format_type.lower()
    if kind == "json":
        return json.dumps(data, indent=2, ensure_ascii=False)

    if kind == "csv":
        if not data:
            return ""

        # Imported here so the block does not depend on the file's import list.
        import csv
        import io

        headers = ["model_name", "programming_language", "comment_language", "taxonomy_category",
                   "bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
        headers.extend(QUALITY_METRICS)

        buffer = io.StringIO()
        # csv.writer quotes cells that contain commas, quotes or newlines;
        # joining with "," by hand would silently corrupt such rows.
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerow(headers)
        for entry in data:
            metrics = entry.get("metrics") or {}
            writer.writerow([
                str(metrics.get(header, "") if header in QUALITY_METRICS else entry.get(header, ""))
                for header in headers
            ])

        text = buffer.getvalue()
        # The writer terminates every row; drop the final terminator so the
        # result has no trailing newline.
        return text[:-1] if text.endswith("\n") else text

    return "Unsupported format"