# app.py
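"""EcoEval-LLM Gradio app.

Benchmarks code-generation LLMs for correctness, runtime, energy use, and CO2
emissions (tracked with CodeCarbon), and keeps a run history that feeds the
leaderboard tab.
"""
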
import gradio as gr
import pandas as pd

from ecoeval.config import EcoEvalConfig
from ecoeval.core import run_benchmark
from ecoeval.datasets import load_dataset_by_name, list_available_datasets
from ecoeval.energy import run_with_energy
from ecoeval.logging_utils import append_run_to_csv, load_leaderboard

RUNS_CSV = "runs.csv"
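# NOTE: the CSV lives on the Space's local disk, which is typically ephemeral;
# without persistent storage the leaderboard history may not survive a restart.
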
def run_ecoeval(model_id: str, dataset_name: str, max_tasks: int):
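    """Run one benchmark pass for `model_id` on `dataset_name`.

    Returns the (summary_df, summary_markdown, per_task_df) triple consumed by
    the three Gradio outputs wired up in build_app().
    """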
    dataset = load_dataset_by_name(dataset_name)
    if max_tasks is not None and max_tasks > 0:
        max_tasks = min(max_tasks, len(dataset))
    else:
        max_tasks = len(dataset)

    cfg = EcoEvalConfig(
        model_id=model_id,
        max_new_tokens=128,
        temperature=0.2,
        top_p=0.95,
    )

    def bench_fn():
        return run_benchmark(dataset, cfg, limit=max_tasks)

    metrics = run_with_energy(bench_fn, project_name="EcoEval-LLM")
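    # run_with_energy is expected to return a dict with at least: "tasks",
    # "passed", "accuracy", "runtime_seconds", "per_task", and (when the
    # CodeCarbon tracker is available) "energy_kwh" and "emissions_kg".
    # The energy/emissions values may be None, which the table below tolerates.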
    # Build single-run summary table
    run_row = {
        "Model": model_id,
        "Dataset": dataset_name,
        "Tasks": metrics["tasks"],
        "Passed": metrics["passed"],
        "Accuracy": round(metrics["accuracy"], 3),
        "Runtime (s)": round(metrics["runtime_seconds"], 2),
        "Energy (kWh)": (
            round(metrics["energy_kwh"], 5) if metrics.get("energy_kwh") is not None else None
        ),
        "CO2eq (kg)": (
            round(metrics["emissions_kg"], 5) if metrics.get("emissions_kg") is not None else None
        ),
        "Energy / Task (kWh)": (
            round(metrics["energy_kwh"] / metrics["tasks"], 6)
            if metrics.get("energy_kwh") is not None and metrics["tasks"] > 0
            else None
        ),
        "CO2eq / Passed (kg)": (
            round(metrics["emissions_kg"] / metrics["passed"], 6)
            if metrics.get("emissions_kg") is not None and metrics["passed"] > 0
            else None
        ),
    }
    summary_df = pd.DataFrame([run_row])

    # Persist run to leaderboard CSV
    append_run_to_csv(RUNS_CSV, run_row)
    energy = metrics.get("energy_kwh")
    emissions = metrics.get("emissions_kg")
    summary_text = (
        f"### Run summary\n"
        f"- **Model**: `{model_id}`\n"
        f"- **Dataset**: `{dataset_name}`\n"
        f"- **Tasks**: {metrics['tasks']}\n"
        f"- **Passed**: {metrics['passed']}\n"
        f"- **Accuracy**: {metrics['accuracy']:.3f}\n"
        f"- **Runtime**: {metrics['runtime_seconds']:.2f} s\n"
        f"- **Energy**: {f'{energy:.5f} kWh' if energy is not None else 'N/A'}\n"
        f"- **CO₂eq**: {f'{emissions:.5f} kg' if emissions is not None else 'N/A'}\n"
    )
    per_task_df = pd.DataFrame(metrics["per_task"])
    return summary_df, summary_text, per_task_df

def refresh_leaderboard():
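    """Load the persisted run history and order it for display."""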
    df = load_leaderboard(RUNS_CSV)
    if df is None or df.empty:
        return pd.DataFrame()

    # Sort by accuracy descending, then energy ascending, skipping any column
    # missing from older CSVs.
    sort_cols = [col for col in ("Accuracy", "Energy (kWh)") if col in df.columns]
    if sort_cols:
        df = df.sort_values(by=sort_cols, ascending=[col != "Accuracy" for col in sort_cols])
    return df.reset_index(drop=True)

def build_app():
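    """Assemble the two-tab Gradio Blocks UI: benchmark runner and leaderboard."""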
    dataset_options = list_available_datasets()

    with gr.Blocks(title="EcoEval-LLM: Energy & Carbon Benchmarking for LLM Code Generation") as demo:
        gr.Markdown(
            """
# 🌱 EcoEval-LLM

Evaluate code generation models on **correctness**, **runtime**, **energy usage**, and **carbon emissions**.

This Space runs a small code-generation benchmark, executes unit tests, and tracks energy & CO₂ with [CodeCarbon](https://github.com/mlco2/codecarbon).
"""
        )
with gr.Tab("Run Benchmark"):
with gr.Row():
model_in = gr.Textbox(
label="Model ID (Hugging Face Hub)",
value="Salesforce/codegen-350M-mono",
info="Any causal LM checkpoint that can generate Python code.",
)
dataset_in = gr.Dropdown(
choices=dataset_options,
value=dataset_options[0],
label="Dataset",
)
max_tasks_in = gr.Slider(
minimum=1,
maximum=50,
step=1,
value=5,
label="Max tasks to evaluate",
info="For heavy models, start small.",
)
run_btn = gr.Button("🚀 Run EcoEval Benchmark", variant="primary")
gr.Markdown("### Run-level metrics")
summary_table = gr.Dataframe(
headers=[
"Model",
"Dataset",
"Tasks",
"Passed",
"Accuracy",
"Runtime (s)",
"Energy (kWh)",
"CO2eq (kg)",
"Energy / Task (kWh)",
"CO2eq / Passed (kg)",
],
interactive=False,
wrap=True,
)
summary_md = gr.Markdown()
gr.Markdown("### Per-task results")
per_task_table = gr.Dataframe(
headers=[
"task_id",
"prompt_preview",
"passed",
"runtime_s",
],
interactive=False,
wrap=True,
)
run_btn.click(
fn=run_ecoeval,
inputs=[model_in, dataset_in, max_tasks_in],
outputs=[summary_table, summary_md, per_task_table],
)
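            # The click handler runs the whole benchmark synchronously, so the
            # UI blocks until run_ecoeval returns; for heavier checkpoints this
            # can take several minutes on CPU hardware.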
with gr.Tab("Leaderboard"):
gr.Markdown(
"Global history of runs in this Space (sorted by accuracy, then energy)."
)
refresh_btn = gr.Button("🔄 Refresh leaderboard")
leaderboard_table = gr.Dataframe(interactive=False, wrap=True)
refresh_btn.click(
fn=refresh_leaderboard,
inputs=None,
outputs=leaderboard_table,
)
    return demo

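# When run directly (`python app.py`), and likewise on Hugging Face Spaces,
# build the interface and serve it; Gradio binds to port 7860 by default.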
if __name__ == "__main__":
    demo = build_app()
    demo.launch()