"""
CodeReview Leaderboard - Inspired by circle-guard-bench
A comprehensive leaderboard for code review generation models
"""
| import gradio as gr | |
| from typing import List, Dict, Any | |
| from datetime import datetime, timezone | |
| # Import our modules | |
| from src.envs import ( | |
| PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, | |
| MAIN_HEADERS, QUALITY_HEADERS | |
| ) | |
| from src.about import TITLE, INTRODUCTION_TEXT | |
| from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML | |
| from src.display.utils import ( | |
| get_main_leaderboard_data, get_quality_metrics_data, | |
| get_submission_history_data, get_statistics_summary | |
| ) | |
| from src.leaderboard.processor import LeaderboardProcessor | |
| from src.submission.submit import SubmissionHandler | |
# Initialize processors (shared by every callback below).
processor = LeaderboardProcessor()
submission_handler = SubmissionHandler()

# Global state: the most recently applied filter selection.
# update_leaderboard_tables() rewrites this dict and refresh_data()
# replays it, so a refresh keeps the user's current filters.
current_filters = {
    "programming_language": "All",
    "comment_language": "All",
    "taxonomy_category": "All"
}
def update_leaderboard_tables(
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All"
):
    """Rebuild both leaderboard tables plus the statistics summary.

    Records the selection in the module-level ``current_filters`` dict so
    that refresh_data() can replay the same filters later.

    Returns:
        Tuple of (main_table, quality_table, stats_markdown).
    """
    global current_filters
    current_filters = {
        "programming_language": programming_language,
        "comment_language": comment_language,
        "taxonomy_category": taxonomy_category
    }

    # Pull the latest data from storage, then apply the same filter
    # triple to both table builders.
    data = processor.load_leaderboard_data()
    filters = (programming_language, comment_language, taxonomy_category)
    main_table = get_main_leaderboard_data(data, *filters)
    quality_table = get_quality_metrics_data(data, *filters)

    # Render the aggregate statistics as a markdown bullet list.
    stats = get_statistics_summary(data)
    stats_text = f"""
## π Current Statistics
- **Total Models**: {stats['total_models']}
- **Total Submissions**: {stats['total_submissions']}
- **Average Pass@1**: {stats['avg_pass_1']:.3f}
- **Best Model**: {stats['best_model']}
- **Languages Covered**: {stats['languages_covered']}
- **Categories Covered**: {stats['categories_covered']}
"""
    return main_table, quality_table, stats_text
def refresh_data():
    """Reload storage and rebuild the tables with the last-applied filters.

    The keys of ``current_filters`` match the parameter names of
    update_leaderboard_tables(), so the dict can be unpacked directly.
    """
    return update_leaderboard_tables(**current_filters)
def handle_submission(
    request: gr.Request,
    *args
):
    """Persist a model submission and refresh the tables when it succeeds.

    Args:
        request: Gradio request object (injected automatically).
        *args: Raw form component values, forwarded to the handler.

    Returns:
        Tuple of (leaderboard_state, main_table, quality_table,
        status_message, stats_markdown_or_None).
    """
    current_data = processor.load_leaderboard_data()
    result = submission_handler.submit_model(request, current_data, *args)
    updated_data = result[0]

    # NOTE(review): this comparison assumes submit_model returns a *new*
    # object on success rather than mutating current_data in place —
    # confirm against SubmissionHandler.submit_model.
    if updated_data == current_data:
        # Submission rejected or a no-op: pass the handler's outputs
        # through untouched and leave the stats display alone.
        return updated_data, result[1], result[2], result[3], None

    # Data changed — rebuild tables with the currently active filters.
    main_table, quality_table, stats_text = update_leaderboard_tables(
        current_filters["programming_language"],
        current_filters["comment_language"],
        current_filters["taxonomy_category"]
    )
    return updated_data, main_table, quality_table, result[3], stats_text
def _performance_rows(data: List[Dict[str, Any]], field: str, labels: List[str]) -> List[list]:
    """Build one [label, avg Pass@1, entry count, best model] row per label.

    Labels with no matching entries (where ``d[field] == label``) are
    skipped, mirroring the original per-language/per-category loops that
    this helper replaces.
    """
    rows = []
    for label in labels:
        subset = [d for d in data if d.get(field) == label]
        if subset:
            avg_score = sum(d.get("llm_pass_1", 0) for d in subset) / len(subset)
            best_model = max(subset, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
            rows.append([label, f"{avg_score:.3f}", len(subset), best_model])
    return rows


# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Base(),
    css=DARK_THEME_CSS,
    js=CUSTOM_JS,
    title=TITLE,
    head="<meta name='viewport' content='width=device-width, initial-scale=1'>"
) as demo:
    # Header
    gr.HTML(HEADER_HTML)

    # Server-side copy of the leaderboard data, threaded through submissions.
    leaderboard_state = gr.State(value=processor.load_leaderboard_data())

    # Main content tabs
    with gr.Tabs():
        # ------------------------------------------------------------------
        # Leaderboard tab: three filter dropdowns + stats + two tables
        # ------------------------------------------------------------------
        with gr.Tab("π Leaderboard"):
            with gr.Row():
                prog_lang_filter = gr.Dropdown(
                    choices=PROGRAMMING_LANGUAGES,
                    value="All",
                    label="π Programming Language",
                    info="Filter by programming language"
                )
                comment_lang_filter = gr.Dropdown(
                    choices=COMMENT_LANGUAGES,
                    value="All",
                    label="π Comment Language",
                    info="Filter by comment language"
                )
                taxonomy_filter = gr.Dropdown(
                    choices=TAXONOMY_CATEGORIES,
                    value="All",
                    label="π·οΈ Taxonomy Category",
                    info="Filter by review category"
                )
                refresh_btn = gr.Button("π Refresh", variant="secondary")

            # Statistics summary (filled by update_leaderboard_tables)
            stats_display = gr.Markdown("")

            with gr.Row():
                main_leaderboard = gr.Dataframe(
                    headers=MAIN_HEADERS,
                    label="π Main Leaderboard",
                    interactive=False,
                    wrap=True,
                    max_height=600
                )
            with gr.Row():
                quality_metrics = gr.Dataframe(
                    headers=QUALITY_HEADERS,
                    label="π Quality Metrics",
                    interactive=False,
                    wrap=True,
                    max_height=600
                )

        # ------------------------------------------------------------------
        # Submission tab: form built by the handler, wired to handle_submission
        # ------------------------------------------------------------------
        with gr.Tab("π Submit Model"):
            form_components = submission_handler.get_submission_form_components()

            form_components["submit_btn"].click(
                fn=handle_submission,
                inputs=[
                    leaderboard_state,
                    form_components["model_name"],
                    form_components["programming_language"],
                    form_components["comment_language"],
                    form_components["taxonomy_category"],
                    form_components["bleu"],
                    form_components["pass1"],
                    form_components["pass5"],
                    form_components["pass10"],
                    form_components["readability"],
                    form_components["relevance"],
                    form_components["explanation_clarity"],
                    form_components["problem_identification"],
                    form_components["actionability"],
                    form_components["completeness"],
                    form_components["specificity"],
                    form_components["contextual_adequacy"],
                    form_components["consistency"],
                    form_components["brevity"],
                ],
                outputs=[
                    leaderboard_state,
                    main_leaderboard,
                    quality_metrics,
                    form_components["status_msg"],
                    stats_display
                ]
            )

        # ------------------------------------------------------------------
        # Analytics tab: submission history + per-language/per-category stats
        # ------------------------------------------------------------------
        with gr.Tab("π Analytics"):
            with gr.Row():
                analytics_prog_lang = gr.Dropdown(
                    choices=PROGRAMMING_LANGUAGES,
                    value="All",
                    label="Programming Language"
                )
                analytics_comment_lang = gr.Dropdown(
                    choices=COMMENT_LANGUAGES,
                    value="All",
                    label="Comment Language"
                )
                analytics_taxonomy = gr.Dropdown(
                    choices=TAXONOMY_CATEGORIES,
                    value="All",
                    label="Taxonomy Category"
                )

            submission_history = gr.Dataframe(
                headers=["Model", "Programming Language", "Comment Language", "Taxonomy", "Pass@1", "Date", "IP"],
                label="π Recent Submissions",
                interactive=False,
                max_height=400
            )

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### π£οΈ Language Performance Analysis")
                    language_analysis = gr.Dataframe(
                        headers=["Language", "Avg Pass@1", "Model Count", "Best Model"],
                        label="Programming Language Performance",
                        interactive=False
                    )
                with gr.Column():
                    gr.Markdown("### π·οΈ Category Performance Analysis")
                    category_analysis = gr.Dataframe(
                        headers=["Category", "Avg Pass@1", "Model Count", "Best Model"],
                        label="Taxonomy Category Performance",
                        interactive=False
                    )

        # ------------------------------------------------------------------
        # About tab, with data export
        # ------------------------------------------------------------------
        with gr.Tab("βΉοΈ About"):
            gr.Markdown(INTRODUCTION_TEXT)

            with gr.Row():
                export_format = gr.Dropdown(
                    choices=["JSON", "CSV"],
                    value="JSON",
                    label="Export Format"
                )
                export_btn = gr.Button("π₯ Export Data")
            export_output = gr.Textbox(
                label="Export Output",
                lines=10,
                max_lines=20,
                show_copy_button=True
            )

    # Footer
    gr.HTML(FOOTER_HTML)

    # ----------------------------------------------------------------------
    # Event wiring
    # ----------------------------------------------------------------------
    filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
    filter_outputs = [main_leaderboard, quality_metrics, stats_display]

    # Any filter change rebuilds both tables and the stats block.
    for filter_input in filter_inputs:
        filter_input.change(
            fn=update_leaderboard_tables,
            inputs=filter_inputs,
            outputs=filter_outputs
        )

    refresh_btn.click(
        fn=refresh_data,
        outputs=filter_outputs
    )

    analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]

    def update_analytics(prog_lang, comment_lang, taxonomy):
        """Refresh the analytics tab: history plus performance breakdowns."""
        data = processor.load_leaderboard_data()
        history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)
        # [1:] skips the leading "All" sentinel in each choice list.
        lang_perf = _performance_rows(data, "programming_language", PROGRAMMING_LANGUAGES[1:])
        cat_perf = _performance_rows(data, "taxonomy_category", TAXONOMY_CATEGORIES[1:])
        return history, lang_perf, cat_perf

    for analytics_input in analytics_inputs:
        analytics_input.change(
            fn=update_analytics,
            inputs=analytics_inputs,
            outputs=[submission_history, language_analysis, category_analysis]
        )

    def export_data(format_type):
        """Export leaderboard data in the chosen format ("JSON"/"CSV")."""
        return processor.export_data(format_type.lower())

    export_btn.click(
        fn=export_data,
        inputs=[export_format],
        outputs=[export_output]
    )

    # BUGFIX: the original computed the tables once at import time and served
    # that frozen snapshot on every page load; load fresh data instead.
    demo.load(
        fn=refresh_data,
        outputs=filter_outputs
    )
# Launch configuration (local development entry point).
if __name__ == "__main__":
    # queue() bounds concurrent event processing; debug/show_error aid
    # local troubleshooting and are not intended for production.
    demo.queue(max_size=20).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        debug=True
    )

# For deployment (HuggingFace Spaces, etc.) — runners import this name.
app = demo