# app.py
import gradio as gr
import pandas as pd

from ecoeval.config import EcoEvalConfig
from ecoeval.datasets import load_dataset_by_name, list_available_datasets
from ecoeval.core import run_benchmark
from ecoeval.energy import run_with_energy
from ecoeval.logging_utils import append_run_to_csv, load_leaderboard

RUNS_CSV = "runs.csv"
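# NOTE: runs.csv is stored on the Space's local disk, which is ephemeral on
# Hugging Face Spaces unless persistent storage is enabled, so the leaderboard
# history may reset when the Space restarts or is rebuilt.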


def run_ecoeval(model_id: str, dataset_name: str, max_tasks: int):
    dataset = load_dataset_by_name(dataset_name)
    if max_tasks is not None and max_tasks > 0:
        # Gradio sliders may deliver floats even with step=1; normalize to int.
        max_tasks = min(int(max_tasks), len(dataset))
    else:
        max_tasks = len(dataset)

    cfg = EcoEvalConfig(
        model_id=model_id,
        max_new_tokens=128,
        temperature=0.2,
        top_p=0.95,
    )

    def bench_fn():
        return run_benchmark(dataset, cfg, limit=max_tasks)

    metrics = run_with_energy(bench_fn, project_name="EcoEval-LLM")
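    # run_with_energy executes bench_fn under an energy/emissions tracker
    # (CodeCarbon, per the UI text below) and returns a flat metrics dict:
    # "tasks", "passed", "accuracy", "runtime_seconds", "per_task", plus
    # "energy_kwh" / "emissions_kg", which may be None if tracking fails.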

    # Build the single-run summary table.
    run_row = {
        "Model": model_id,
        "Dataset": dataset_name,
        "Tasks": metrics["tasks"],
        "Passed": metrics["passed"],
        "Accuracy": round(metrics["accuracy"], 3),
        "Runtime (s)": round(metrics["runtime_seconds"], 2),
        "Energy (kWh)": (
            round(metrics["energy_kwh"], 5) if metrics.get("energy_kwh") is not None else None
        ),
        "CO2eq (kg)": (
            round(metrics["emissions_kg"], 5) if metrics.get("emissions_kg") is not None else None
        ),
        "Energy / Task (kWh)": (
            round(metrics["energy_kwh"] / metrics["tasks"], 6)
            if metrics.get("energy_kwh") is not None and metrics["tasks"] > 0
            else None
        ),
        "CO2eq / Passed (kg)": (
            round(metrics["emissions_kg"] / metrics["passed"], 6)
            if metrics.get("emissions_kg") is not None and metrics["passed"] > 0
            else None
        ),
    }
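    # The derived columns are efficiency metrics: energy per attempted task and
    # emissions per *passing* task. Both guard against missing tracker data and
    # division by zero, falling back to None so the table still renders.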
    summary_df = pd.DataFrame([run_row])

    # Persist the run to the leaderboard CSV.
    append_run_to_csv(RUNS_CSV, run_row)

    # Format energy/emissions explicitly: metrics.get(..., "N/A") would still
    # yield None when the key exists with a None value.
    energy_str = (
        f"{metrics['energy_kwh']:.5f}" if metrics.get("energy_kwh") is not None else "N/A"
    )
    emissions_str = (
        f"{metrics['emissions_kg']:.5f}" if metrics.get("emissions_kg") is not None else "N/A"
    )
    summary_text = (
        f"### Run summary\n"
        f"- **Model**: `{model_id}`\n"
        f"- **Dataset**: `{dataset_name}`\n"
        f"- **Tasks**: {metrics['tasks']}\n"
        f"- **Passed**: {metrics['passed']}\n"
        f"- **Accuracy**: {metrics['accuracy']:.3f}\n"
        f"- **Runtime**: {metrics['runtime_seconds']:.2f} s\n"
        f"- **Energy**: {energy_str} kWh\n"
        f"- **CO₂eq**: {emissions_str} kg\n"
    )

    per_task_df = pd.DataFrame(metrics["per_task"])
    return summary_df, summary_text, per_task_df


def refresh_leaderboard():
    df = load_leaderboard(RUNS_CSV)
    if df is None or df.empty:
        return pd.DataFrame()
    # Sort by accuracy descending, then energy ascending. Build the column and
    # direction lists together so a CSV missing either column still sorts.
    sort_cols, ascending = [], []
    if "Accuracy" in df.columns:
        sort_cols.append("Accuracy")
        ascending.append(False)
    if "Energy (kWh)" in df.columns:
        sort_cols.append("Energy (kWh)")
        ascending.append(True)
    if sort_cols:
        df = df.sort_values(by=sort_cols, ascending=ascending)
    return df.reset_index(drop=True)


def build_app():
    dataset_options = list_available_datasets()

    with gr.Blocks(title="EcoEval-LLM: Energy & Carbon Benchmarking for LLM Code Generation") as demo:
        gr.Markdown(
            """
            # 🌱 EcoEval-LLM

            Evaluate code generation models on **correctness**, **runtime**, **energy usage**, and **carbon emissions**.

            This Space runs a small code-generation benchmark, executes unit tests, and tracks
            energy & CO₂ with [CodeCarbon](https://github.com/mlco2/codecarbon).
            """
        )
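
        # Tab 1: configure a model and dataset, then launch a single benchmark run.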
        with gr.Tab("Run Benchmark"):
            with gr.Row():
                model_in = gr.Textbox(
                    label="Model ID (Hugging Face Hub)",
                    value="Salesforce/codegen-350M-mono",
                    info="Any causal LM checkpoint that can generate Python code.",
                )
                dataset_in = gr.Dropdown(
                    choices=dataset_options,
                    value=dataset_options[0] if dataset_options else None,
                    label="Dataset",
                )
                max_tasks_in = gr.Slider(
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=5,
                    label="Max tasks to evaluate",
                    info="For heavy models, start small.",
                )

            run_btn = gr.Button("🚀 Run EcoEval Benchmark", variant="primary")
| gr.Markdown("### Run-level metrics") | |
| summary_table = gr.Dataframe( | |
| headers=[ | |
| "Model", | |
| "Dataset", | |
| "Tasks", | |
| "Passed", | |
| "Accuracy", | |
| "Runtime (s)", | |
| "Energy (kWh)", | |
| "CO2eq (kg)", | |
| "Energy / Task (kWh)", | |
| "CO2eq / Passed (kg)", | |
| ], | |
| interactive=False, | |
| wrap=True, | |
| ) | |
| summary_md = gr.Markdown() | |
| gr.Markdown("### Per-task results") | |
| per_task_table = gr.Dataframe( | |
| headers=[ | |
| "task_id", | |
| "prompt_preview", | |
| "passed", | |
| "runtime_s", | |
| ], | |
| interactive=False, | |
| wrap=True, | |
| ) | |
| run_btn.click( | |
| fn=run_ecoeval, | |
| inputs=[model_in, dataset_in, max_tasks_in], | |
| outputs=[summary_table, summary_md, per_task_table], | |
| ) | |
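            # outputs must match run_ecoeval's return order:
            # (summary_df, summary_text, per_task_df).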
| with gr.Tab("Leaderboard"): | |
| gr.Markdown( | |
| "Global history of runs in this Space (sorted by accuracy, then energy)." | |
| ) | |
| refresh_btn = gr.Button("🔄 Refresh leaderboard") | |
| leaderboard_table = gr.Dataframe(interactive=False, wrap=True) | |
| refresh_btn.click( | |
| fn=refresh_leaderboard, | |
| inputs=None, | |
| outputs=leaderboard_table, | |
| ) | |
| return demo | |


if __name__ == "__main__":
    demo = build_app()
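    # Benchmark runs can take minutes; demo.queue().launch() would route them
    # through Gradio's request queue and help avoid HTTP timeouts on Spaces.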
    demo.launch()