CodeReviewBench / app.py
Alex
zalupa1
982b341
raw
history blame
12.8 kB
"""
CodeReview Leaderboard - Inspired by circle-guard-bench
A comprehensive leaderboard for code review generation models
"""
import gradio as gr
from typing import List, Dict, Any
from datetime import datetime, timezone
# Import our modules
from src.envs import (
PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES,
MAIN_HEADERS, QUALITY_HEADERS
)
from src.about import TITLE, INTRODUCTION_TEXT
from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML
from src.display.utils import (
get_main_leaderboard_data, get_quality_metrics_data,
get_submission_history_data, get_statistics_summary
)
from src.leaderboard.processor import LeaderboardProcessor
from src.submission.submit import SubmissionHandler
# Initialize processors
processor = LeaderboardProcessor()
submission_handler = SubmissionHandler()
# Global state
current_filters = {
"programming_language": "All",
"comment_language": "All",
"taxonomy_category": "All"
}
def update_leaderboard_tables(
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
):
    """Rebuild both leaderboard tables and the statistics summary.

    Also records the chosen filters in the module-level ``current_filters``
    so that the refresh and submission handlers can reuse them.

    Args:
        programming_language: Programming-language filter ("All" = no filter).
        comment_language: Comment-language filter ("All" = no filter).
        taxonomy_category: Taxonomy-category filter ("All" = no filter).

    Returns:
        Tuple of (main leaderboard table, quality metrics table,
        statistics markdown string).
    """
    global current_filters
    current_filters = {
        "programming_language": programming_language,
        "comment_language": comment_language,
        "taxonomy_category": taxonomy_category,
    }

    # Reload from storage so filter changes always see fresh data.
    snapshot = processor.load_leaderboard_data()

    # Both tables take the same (data, *filters) arguments.
    selected = (programming_language, comment_language, taxonomy_category)
    main_table = get_main_leaderboard_data(snapshot, *selected)
    quality_table = get_quality_metrics_data(snapshot, *selected)

    # Summary block rendered above the tables (markdown).
    stats = get_statistics_summary(snapshot)
    stats_text = f"""
## 📊 Current Statistics
- **Total Models**: {stats['total_models']}
- **Total Submissions**: {stats['total_submissions']}
- **Average Pass@1**: {stats['avg_pass_1']:.3f}
- **Best Model**: {stats['best_model']}
- **Languages Covered**: {stats['languages_covered']}
- **Categories Covered**: {stats['categories_covered']}
"""
    return main_table, quality_table, stats_text
def refresh_data():
    """Reload everything from storage using the last-applied filters.

    ``current_filters`` keys deliberately mirror the parameter names of
    ``update_leaderboard_tables``, so keyword unpacking is equivalent to
    passing each value positionally.
    """
    return update_leaderboard_tables(**current_filters)
def handle_submission(
    request: gr.Request,
    *args
):
    """Record a model submission and refresh the tables on success.

    Args:
        request: Gradio request object (injected by Gradio because of the
            ``gr.Request`` annotation; passed through to the submission
            handler).
        *args: The current leaderboard state followed by the form field
            values, in the order wired up in the submit button's ``inputs``.

    Returns:
        Tuple of (leaderboard state, main table, quality table,
        status message, stats markdown or no-op update).
    """
    # Load the authoritative data rather than trusting client-side state.
    current_data = processor.load_leaderboard_data()

    # Delegate validation/persistence; result[0] is the (possibly updated)
    # data, result[1..2] table placeholders, result[3] the status message.
    result = submission_handler.submit_model(request, current_data, *args)

    # NOTE(review): this assumes submit_model returns a NEW data object on
    # success; if it mutates current_data in place, the refresh below is
    # skipped — confirm against SubmissionHandler.submit_model.
    if result[0] != current_data:  # Data was updated
        main_table, quality_table, stats_text = update_leaderboard_tables(
            current_filters["programming_language"],
            current_filters["comment_language"],
            current_filters["taxonomy_category"],
        )
        return result[0], main_table, quality_table, result[3], stats_text

    # Fix: returning None here cleared the statistics Markdown whenever a
    # submission failed or changed nothing; gr.update() with no arguments
    # leaves the component untouched instead.
    return result[0], result[1], result[2], result[3], gr.update()
# Create the Gradio interface.  Everything UI-related — components, event
# wiring, and the two tab-local callbacks — lives inside this Blocks context.
with gr.Blocks(
    theme=gr.themes.Base(),
    css=DARK_THEME_CSS,
    js=CUSTOM_JS,
    title=TITLE,
    head="<meta name='viewport' content='width=device-width, initial-scale=1'>"
) as demo:
    # Static page header.
    gr.HTML(HEADER_HTML)

    # State to store leaderboard data; threaded through the submission
    # callback so the handler can compare old vs. new data.
    leaderboard_state = gr.State(value=processor.load_leaderboard_data())

    # Main content tabs.
    with gr.Tabs():
        # ---- Leaderboard Tab: filter dropdowns + two ranked tables ----
        with gr.Tab("🏆 Leaderboard"):
            # Filters; "All" (the default) disables the corresponding filter.
            with gr.Row():
                prog_lang_filter = gr.Dropdown(
                    choices=PROGRAMMING_LANGUAGES,
                    value="All",
                    label="🔍 Programming Language",
                    info="Filter by programming language"
                )
                comment_lang_filter = gr.Dropdown(
                    choices=COMMENT_LANGUAGES,
                    value="All",
                    label="🌍 Comment Language",
                    info="Filter by comment language"
                )
                taxonomy_filter = gr.Dropdown(
                    choices=TAXONOMY_CATEGORIES,
                    value="All",
                    label="🏷️ Taxonomy Category",
                    info="Filter by review category"
                )
                refresh_btn = gr.Button("🔄 Refresh", variant="secondary")

            # Statistics summary (markdown), filled in by demo.load below.
            stats_display = gr.Markdown("")

            # Main leaderboard table.
            with gr.Row():
                main_leaderboard = gr.Dataframe(
                    headers=MAIN_HEADERS,
                    label="🏅 Main Leaderboard",
                    interactive=False,
                    wrap=True,
                    max_height=600
                )

            # Quality metrics table.
            with gr.Row():
                quality_metrics = gr.Dataframe(
                    headers=QUALITY_HEADERS,
                    label="📊 Quality Metrics",
                    interactive=False,
                    wrap=True,
                    max_height=600
                )

        # ---- Submission Tab ----
        with gr.Tab("📝 Submit Model"):
            # The submission handler owns the form layout; we only receive
            # the components by name and wire up the submit button here.
            form_components = submission_handler.get_submission_form_components()

            # Connect submission handler.  Gradio injects the gr.Request
            # argument automatically; the listed inputs arrive as *args.
            form_components["submit_btn"].click(
                fn=handle_submission,
                inputs=[
                    leaderboard_state,
                    form_components["model_name"],
                    form_components["programming_language"],
                    form_components["comment_language"],
                    form_components["taxonomy_category"],
                    form_components["bleu"],
                    form_components["pass1"],
                    form_components["pass5"],
                    form_components["pass10"],
                    form_components["readability"],
                    form_components["relevance"],
                    form_components["explanation_clarity"],
                    form_components["problem_identification"],
                    form_components["actionability"],
                    form_components["completeness"],
                    form_components["specificity"],
                    form_components["contextual_adequacy"],
                    form_components["consistency"],
                    form_components["brevity"],
                ],
                outputs=[
                    leaderboard_state,
                    main_leaderboard,
                    quality_metrics,
                    form_components["status_msg"],
                    stats_display
                ]
            )

        # ---- Analytics Tab ----
        with gr.Tab("📈 Analytics"):
            with gr.Row():
                analytics_prog_lang = gr.Dropdown(
                    choices=PROGRAMMING_LANGUAGES,
                    value="All",
                    label="Programming Language"
                )
                analytics_comment_lang = gr.Dropdown(
                    choices=COMMENT_LANGUAGES,
                    value="All",
                    label="Comment Language"
                )
                analytics_taxonomy = gr.Dropdown(
                    choices=TAXONOMY_CATEGORIES,
                    value="All",
                    label="Taxonomy Category"
                )

            # Submission history table.
            submission_history = gr.Dataframe(
                headers=["Model", "Programming Language", "Comment Language", "Taxonomy", "Pass@1", "Date", "IP"],
                label="📋 Recent Submissions",
                interactive=False,
                max_height=400
            )

            # Per-language / per-category performance, side by side.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 🗣️ Language Performance Analysis")
                    language_analysis = gr.Dataframe(
                        headers=["Language", "Avg Pass@1", "Model Count", "Best Model"],
                        label="Programming Language Performance",
                        interactive=False
                    )
                with gr.Column():
                    gr.Markdown("### 🏷️ Category Performance Analysis")
                    category_analysis = gr.Dataframe(
                        headers=["Category", "Avg Pass@1", "Model Count", "Best Model"],
                        label="Taxonomy Category Performance",
                        interactive=False
                    )

        # ---- About Tab ----
        with gr.Tab("ℹ️ About"):
            gr.Markdown(INTRODUCTION_TEXT)

            # Export functionality: pick a format, dump to a copyable textbox.
            with gr.Row():
                export_format = gr.Dropdown(
                    choices=["JSON", "CSV"],
                    value="JSON",
                    label="Export Format"
                )
                export_btn = gr.Button("📥 Export Data")
            export_output = gr.Textbox(
                label="Export Output",
                lines=10,
                max_lines=20,
                show_copy_button=True
            )

    # Static page footer.
    gr.HTML(FOOTER_HTML)

    # Initialize with data.  Computed once at build time; pushed into the
    # components on page load via demo.load at the bottom of this block.
    initial_main, initial_quality, initial_stats = update_leaderboard_tables()

    # Update tables when any filter changes; all three dropdown values are
    # always passed together so the callback sees the full filter set.
    filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
    filter_outputs = [main_leaderboard, quality_metrics, stats_display]
    for filter_input in filter_inputs:
        filter_input.change(
            fn=update_leaderboard_tables,
            inputs=filter_inputs,
            outputs=filter_outputs
        )

    # Refresh button: re-runs with the last-applied filters (no inputs).
    refresh_btn.click(
        fn=refresh_data,
        outputs=filter_outputs
    )

    # Analytics updates.
    analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]

    def update_analytics(prog_lang, comment_lang, taxonomy):
        """Recompute the three analytics tables for the given filters.

        Returns:
            Tuple of (submission history rows, per-language performance
            rows, per-category performance rows).
        """
        data = processor.load_leaderboard_data()

        # Get submission history, filtered by the analytics dropdowns.
        history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)

        # Get language performance.  [1:] skips the first choice entry —
        # presumably the "All" sentinel used by the dropdowns; TODO confirm
        # against PROGRAMMING_LANGUAGES in src.envs.
        lang_perf = []
        for lang in PROGRAMMING_LANGUAGES[1:]:
            lang_data = [d for d in data if d.get("programming_language") == lang]
            if lang_data:
                avg_score = sum(d.get("llm_pass_1", 0) for d in lang_data) / len(lang_data)
                best_model = max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
                lang_perf.append([lang, f"{avg_score:.3f}", len(lang_data), best_model])

        # Get category performance (same aggregation keyed on taxonomy).
        cat_perf = []
        for cat in TAXONOMY_CATEGORIES[1:]:
            cat_data = [d for d in data if d.get("taxonomy_category") == cat]
            if cat_data:
                avg_score = sum(d.get("llm_pass_1", 0) for d in cat_data) / len(cat_data)
                best_model = max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
                cat_perf.append([cat, f"{avg_score:.3f}", len(cat_data), best_model])

        return history, lang_perf, cat_perf

    # Any analytics dropdown change recomputes all three tables.
    for analytics_input in analytics_inputs:
        analytics_input.change(
            fn=update_analytics,
            inputs=analytics_inputs,
            outputs=[submission_history, language_analysis, category_analysis]
        )

    # Export functionality.
    def export_data(format_type):
        """Serialize the leaderboard data in the chosen format ("JSON"/"CSV")."""
        return processor.export_data(format_type.lower())

    export_btn.click(
        fn=export_data,
        inputs=[export_format],
        outputs=[export_output]
    )

    # Set initial values on page load from the snapshot computed above.
    demo.load(
        fn=lambda: (initial_main, initial_quality, initial_stats),
        outputs=[main_leaderboard, quality_metrics, stats_display]
    )
def _serve() -> None:
    """Run the app locally with a bounded request queue.

    NOTE(review): show_error/debug are development settings — confirm they
    are intended for the deployed configuration.
    """
    demo.queue(max_size=20).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        debug=True,
    )


if __name__ == "__main__":
    _serve()

# Exported handle for deployment platforms (HuggingFace Spaces, etc.).
app = demo