# app.py
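"""Gradio app for the EcoEval-LLM Space.

Runs a small code-generation benchmark against a Hugging Face model,
executes unit tests on the generated code, tracks energy use and CO2
emissions with CodeCarbon, and logs every run to a CSV-backed leaderboard.
"""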
import os
import gradio as gr
import pandas as pd
from ecoeval.config import EcoEvalConfig
from ecoeval.datasets import load_dataset_by_name, list_available_datasets
from ecoeval.core import run_benchmark
from ecoeval.energy import run_with_energy
from ecoeval.logging_utils import append_run_to_csv, load_leaderboard
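
# CSV file where each run is appended; it backs the "Leaderboard" tab.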
RUNS_CSV = "runs.csv"
def run_ecoeval(model_id: str, dataset_name: str, max_tasks: int):
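    """Run the benchmark once for the given model/dataset and collect eco metrics.

    Returns the three values wired to the UI: a one-row summary DataFrame,
    a Markdown run summary, and a per-task results DataFrame.
    """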
    dataset = load_dataset_by_name(dataset_name)
    if max_tasks is not None and max_tasks > 0:
        max_tasks = min(max_tasks, len(dataset))
    else:
        max_tasks = len(dataset)

    cfg = EcoEvalConfig(
        model_id=model_id,
        max_new_tokens=128,
        temperature=0.2,
        top_p=0.95,
    )

    def bench_fn():
        return run_benchmark(dataset, cfg, limit=max_tasks)

    # run_with_energy wraps the benchmark call with CodeCarbon tracking and
    # returns correctness, runtime, and energy/emission metrics.
    metrics = run_with_energy(bench_fn, project_name="EcoEval-LLM")

    # Build single-run summary table
    run_row = {
        "Model": model_id,
        "Dataset": dataset_name,
        "Tasks": metrics["tasks"],
        "Passed": metrics["passed"],
        "Accuracy": round(metrics["accuracy"], 3),
        "Runtime (s)": round(metrics["runtime_seconds"], 2),
        "Energy (kWh)": (
            round(metrics["energy_kwh"], 5) if metrics.get("energy_kwh") is not None else None
        ),
        "CO2eq (kg)": (
            round(metrics["emissions_kg"], 5) if metrics.get("emissions_kg") is not None else None
        ),
        # Efficiency metrics: energy per evaluated task and emissions per passing task.
        "Energy / Task (kWh)": (
            round(metrics["energy_kwh"] / metrics["tasks"], 6)
            if metrics.get("energy_kwh") is not None and metrics["tasks"] > 0
            else None
        ),
        "CO2eq / Passed (kg)": (
            round(metrics["emissions_kg"] / metrics["passed"], 6)
            if metrics.get("emissions_kg") is not None and metrics["passed"] > 0
            else None
        ),
    }
    summary_df = pd.DataFrame([run_row])

    # Persist run to leaderboard CSV
    append_run_to_csv(RUNS_CSV, run_row)

    # Energy/emissions may be None when measurement is unavailable; show "N/A" instead.
    energy_str = f"{metrics['energy_kwh']:.5f}" if metrics.get("energy_kwh") is not None else "N/A"
    emissions_str = f"{metrics['emissions_kg']:.5f}" if metrics.get("emissions_kg") is not None else "N/A"

    summary_text = (
        f"### Run summary\n"
        f"- **Model**: `{model_id}`\n"
        f"- **Dataset**: `{dataset_name}`\n"
        f"- **Tasks**: {metrics['tasks']}\n"
        f"- **Passed**: {metrics['passed']}\n"
        f"- **Accuracy**: {metrics['accuracy']:.3f}\n"
        f"- **Runtime**: {metrics['runtime_seconds']:.2f} s\n"
        f"- **Energy**: {energy_str} kWh\n"
        f"- **CO₂eq**: {emissions_str} kg\n"
    )

    per_task_df = pd.DataFrame(metrics["per_task"])
    return summary_df, summary_text, per_task_df
def refresh_leaderboard():
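    """Load all logged runs and sort them by accuracy (desc), then energy (asc)."""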
    df = load_leaderboard(RUNS_CSV)
    if df is None or df.empty:
        return pd.DataFrame()

    # Sort by accuracy descending, then energy ascending, using whichever of
    # those columns actually exist in the logged runs.
    sort_cols = []
    ascending = []
    if "Accuracy" in df.columns:
        sort_cols.append("Accuracy")
        ascending.append(False)
    if "Energy (kWh)" in df.columns:
        sort_cols.append("Energy (kWh)")
        ascending.append(True)
    if sort_cols:
        df = df.sort_values(by=sort_cols, ascending=ascending)
    return df.reset_index(drop=True)
def build_app():
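    """Assemble the Gradio Blocks UI: a "Run Benchmark" tab and a "Leaderboard" tab."""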
    dataset_options = list_available_datasets()

    with gr.Blocks(title="EcoEval-LLM: Energy & Carbon Benchmarking for LLM Code Generation") as demo:
        gr.Markdown(
            """
            # 🌱 EcoEval-LLM

            Evaluate code generation models on **correctness**, **runtime**, **energy usage**, and **carbon emissions**.

            This Space runs a small code-generation benchmark, executes unit tests, and tracks energy & CO₂ with [CodeCarbon](https://github.com/mlco2/codecarbon).
            """
        )

        with gr.Tab("Run Benchmark"):
            with gr.Row():
                model_in = gr.Textbox(
                    label="Model ID (Hugging Face Hub)",
                    value="Salesforce/codegen-350M-mono",
                    info="Any causal LM checkpoint that can generate Python code.",
                )
                dataset_in = gr.Dropdown(
                    choices=dataset_options,
                    value=dataset_options[0],
                    label="Dataset",
                )
                max_tasks_in = gr.Slider(
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=5,
                    label="Max tasks to evaluate",
                    info="For heavy models, start small.",
                )

            run_btn = gr.Button("🚀 Run EcoEval Benchmark", variant="primary")

            gr.Markdown("### Run-level metrics")
            summary_table = gr.Dataframe(
                headers=[
                    "Model",
                    "Dataset",
                    "Tasks",
                    "Passed",
                    "Accuracy",
                    "Runtime (s)",
                    "Energy (kWh)",
                    "CO2eq (kg)",
                    "Energy / Task (kWh)",
                    "CO2eq / Passed (kg)",
                ],
                interactive=False,
                wrap=True,
            )
            summary_md = gr.Markdown()

            gr.Markdown("### Per-task results")
            per_task_table = gr.Dataframe(
                headers=[
                    "task_id",
                    "prompt_preview",
                    "passed",
                    "runtime_s",
                ],
                interactive=False,
                wrap=True,
            )

            run_btn.click(
                fn=run_ecoeval,
                inputs=[model_in, dataset_in, max_tasks_in],
                outputs=[summary_table, summary_md, per_task_table],
            )

        with gr.Tab("Leaderboard"):
            gr.Markdown(
                "Global history of runs in this Space (sorted by accuracy, then energy)."
            )
            refresh_btn = gr.Button("🔄 Refresh leaderboard")
            leaderboard_table = gr.Dataframe(interactive=False, wrap=True)
            refresh_btn.click(
                fn=refresh_leaderboard,
                inputs=None,
                outputs=leaderboard_table,
            )

    return demo
if __name__ == "__main__":
    demo = build_app()
    demo.launch()