# CodeReview Leaderboard — Gradio app for Hugging Face Spaces
| import json | |
| from pathlib import Path | |
| from typing import List, Dict | |
| import gradio as gr | |
| from pydantic import BaseModel, Field, field_validator | |
# --------------- Configuration ---------------
# JSON file that persists the leaderboard between app restarts (written to
# the working directory of the Space).
LEADERBOARD_PATH = Path("leaderboard_data.json")
# NOTE(review): appears unused in this file — the example seeding below
# hard-codes "example/model" instead; confirm before removing.
DEFAULT_MODEL_NAME = "example/model"
| # --------------- Data models --------------- | |
class Metrics(BaseModel):
    """Per-dimension subjective quality scores, each on an inclusive 0-10 scale."""

    readability: int
    relevance: int
    # BUG FIX: the original declared Field(alias="explanation_clarity"),
    # an alias identical to the field name — a no-op, removed.
    explanation_clarity: int
    problem_identification: int
    actionability: int
    completeness: int
    specificity: int
    contextual_adequacy: int
    consistency: int
    brevity: int

    # BUG FIX: the original method was a plain (undecorated) method, so
    # pydantic never ran it and out-of-range values were silently accepted.
    # Registering it with @field_validator("*") enforces the range on every
    # field; all fields here are ints, so "*" is safe.
    @field_validator("*")
    @classmethod
    def metric_range(cls, v: int) -> int:
        """Raise ValueError when a metric falls outside [0, 10]."""
        if not 0 <= v <= 10:
            raise ValueError("Multi-metrics should be between 0 and 10")
        return v
class LeaderboardEntry(BaseModel):
    """One leaderboard row: a model identifier plus its aggregate scores."""

    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    # BUG FIX: the original method was a plain (undecorated) method, so
    # pydantic never ran it and out-of-range scores were silently accepted.
    # The validator is restricted to the float score fields; "*" would also
    # hit model_name (str) and metrics (Metrics) and break validation.
    @field_validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    @classmethod
    def score_range(cls, v: float) -> float:
        """Raise ValueError when a score falls outside [0.0, 1.0]."""
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores should be between 0 and 1")
        return v
| # --------------- Persistence helpers --------------- | |
def _load_leaderboard() -> List[Dict]:
    """Read the persisted leaderboard entries from LEADERBOARD_PATH.

    Returns:
        The list stored under the "leaderboard" key, or [] when the file is
        missing, is not valid JSON, or does not hold the expected mapping.
    """
    if not LEADERBOARD_PATH.exists():
        return []
    try:
        with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # ROBUSTNESS: a hand-edited or truncated file previously crashed the
        # whole app at load time; treat it as an empty leaderboard instead.
        return []
    if not isinstance(data, dict):
        # ROBUSTNESS: the original assumed a dict and would raise
        # AttributeError on e.g. a bare JSON list.
        return []
    return data.get("leaderboard", [])
def _save_leaderboard(data: List[Dict]):
    """Persist the given entries to LEADERBOARD_PATH under a "leaderboard" key."""
    with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
        json.dump({"leaderboard": data}, f, indent=2)
| # --------------- Utility --------------- | |
| def _flatten_entry(entry: Dict) -> Dict: | |
| """Flatten nested metrics so that every metric is a column.""" | |
| flat = { | |
| "Model": entry["model_name"], | |
| "BLEU": entry["bleu"], | |
| "Pass@1": entry["llm_pass_1"], | |
| "Pass@5": entry["llm_pass_5"], | |
| "Pass@10": entry["llm_pass_10"], | |
| "Readability": entry["metrics"]["readability"], | |
| "Relevance": entry["metrics"]["relevance"], | |
| "Explanation Clarity": entry["metrics"]["explanation_clarity"], | |
| "Problem Identification": entry["metrics"]["problem_identification"], | |
| "Actionability": entry["metrics"]["actionability"], | |
| "Completeness": entry["metrics"]["completeness"], | |
| "Specificity": entry["metrics"]["specificity"], | |
| "Contextual Adequacy": entry["metrics"]["contextual_adequacy"], | |
| "Consistency": entry["metrics"]["consistency"], | |
| "Brevity": entry["metrics"]["brevity"], | |
| } | |
| return flat | |
def _table_data() -> List[List]:
    """Load the leaderboard and shape it into rows for the Gradio Dataframe.

    Returns:
        One list per model, columns in header order, sorted descending by
        Pass@1; [] when no data is stored.
    """
    data = _load_leaderboard()
    if not data:
        return []
    # Sort descending by pass@1 as requested.
    data.sort(key=lambda x: x["llm_pass_1"], reverse=True)
    # CONSISTENCY: the original duplicated the whole column mapping that
    # _flatten_entry already defines (leaving that helper dead code). Dicts
    # preserve insertion order, so the values come out in the same column
    # order the Dataframe headers declare.
    return [list(_flatten_entry(entry).values()) for entry in data]
| # --------------- Gradio callbacks --------------- | |
def submit_model(
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: int,
    relevance: int,
    explanation_clarity: int,
    problem_identification: int,
    actionability: int,
    completeness: int,
    specificity: int,
    contextual_adequacy: int,
    consistency: int,
    brevity: int,
):
    """Validate and append a new model entry to the leaderboard.

    Returns:
        (table_rows, status_message) feeding the Dataframe and the status
        Markdown component respectively.
    """
    # ROBUSTNESS: the original crashed on a None model name (.strip() on
    # None) and happily stored a blank row for an empty one.
    model_name = (model_name or "").strip()
    if not model_name:
        return _table_data(), "❌ Submission failed: model name is required"
    try:
        entry = LeaderboardEntry(
            model_name=model_name,
            bleu=bleu,
            llm_pass_1=llm_pass_1,
            llm_pass_5=llm_pass_5,
            llm_pass_10=llm_pass_10,
            metrics={
                "readability": readability,
                "relevance": relevance,
                "explanation_clarity": explanation_clarity,
                "problem_identification": problem_identification,
                "actionability": actionability,
                "completeness": completeness,
                "specificity": specificity,
                "contextual_adequacy": contextual_adequacy,
                "consistency": consistency,
                "brevity": brevity,
            },
        )
    except Exception as e:
        # UI boundary: surface any validation failure as a status message
        # rather than letting it propagate into the Gradio handler.
        return _table_data(), f"❌ Submission failed: {e}"
    data = _load_leaderboard()
    # Replace existing model entry if any (resubmission overwrites).
    data = [d for d in data if d["model_name"] != entry.model_name]
    # BUG FIX: .dict() is deprecated in pydantic v2 (the field_validator
    # import at the top of the file implies v2); model_dump() is the
    # supported replacement with identical output here.
    data.append(entry.model_dump())
    _save_leaderboard(data)
    return _table_data(), "✅ Submission recorded!"
# --------------- Interface ---------------
# Build the Gradio UI. Everything below runs at import time so that the
# `demo` object exists for the HF Spaces runtime (see `app = demo` at the
# bottom of the file).
with gr.Blocks(title="CodeReview Leaderboard") as demo:
    gr.Markdown("""# 🏆 CodeReview Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)
    # Create initial example data if file doesn't exist
    # NOTE(review): this seeding happens at import time, inside the Blocks
    # context — it writes to the working directory; confirm that directory
    # is writable on the deployment target.
    if not LEADERBOARD_PATH.exists():
        example_data = {
            "leaderboard": [
                {
                    "model_name": "example/model",
                    "bleu": 0.5,
                    "llm_pass_1": 0.5,
                    "llm_pass_5": 0.5,
                    "llm_pass_10": 0.5,
                    "metrics": {
                        "readability": 5,
                        "relevance": 5,
                        "explanation_clarity": 5,
                        "problem_identification": 5,
                        "actionability": 5,
                        "completeness": 5,
                        "contextual_adequacy": 5,
                        "specificity": 5,
                        "consistency": 5,
                        "brevity": 5
                    }
                }
            ]
        }
        with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
            json.dump(example_data, f, indent=2)
    # Initialize table data
    initial_data = _table_data()
    # Read-only leaderboard table; header order must match the row order
    # produced by _table_data().
    leaderboard_df = gr.Dataframe(
        headers=["Model", "BLEU", "Pass@1", "Pass@5", "Pass@10", "Readability", "Relevance", "Explanation Clarity", "Problem Identification", "Actionability", "Completeness", "Specificity", "Contextual Adequacy", "Consistency", "Brevity"],
        value=initial_data,
        label="Current Leaderboard",
        interactive=False,
    )
    gr.Markdown("## 🔄 Submit new model results")
    # Submission form, collapsed by default.
    with gr.Accordion("Submission form", open=False):
        with gr.Row():
            model_name_inp = gr.Text(label="Model name (org/model)", value="")
            bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0)
            pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0)
            pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0)
            pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0)
        gr.Markdown("### Multi-metric subjective scores (0 – 10)")
        with gr.Row():
            readability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Readability")
            relevance_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Relevance")
            explanation_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Explanation Clarity")
            problem_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Problem Identification")
            actionability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Actionability")
            completeness_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Completeness")
            specificity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Specificity")
            contextual_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Contextual Adequacy")
            consistency_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Consistency")
            brevity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Brevity")
        submit_btn = gr.Button("Submit")
        status_markdown = gr.Markdown("")
        # Wire the button to submit_model; input order must match the
        # function's positional parameter order exactly.
        submit_btn.click(
            fn=submit_model,
            inputs=[
                model_name_inp,
                bleu_inp,
                pass1_inp,
                pass5_inp,
                pass10_inp,
                readability_inp,
                relevance_inp,
                explanation_inp,
                problem_inp,
                actionability_inp,
                completeness_inp,
                specificity_inp,
                contextual_inp,
                consistency_inp,
                brevity_inp,
            ],
            outputs=[leaderboard_df, status_markdown],
            api_name="submit_model",
        )
# ----------------- Launch -----------------
if __name__ == "__main__":
    # Local execution: enable the request queue, then start the server.
    demo.queue().launch()
# For HF Spaces runtime (gradio SDK) expose `demo`
app = demo