singhalamaan116 committed on
Commit
ddb8dc2
·
verified ·
1 Parent(s): 5ff27d1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +191 -0
app.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import gradio as gr
4
+ import pandas as pd
5
+
6
+ from ecoeval.config import EcoEvalConfig
7
+ from ecoeval.datasets import load_dataset_by_name, list_available_datasets
8
+ from ecoeval.core import run_benchmark
9
+ from ecoeval.energy import run_with_energy
10
+ from ecoeval.logging_utils import append_run_to_csv, load_leaderboard
11
+
12
+
13
# Path of the CSV file that run_ecoeval appends to and refresh_leaderboard reads.
RUNS_CSV: str = "runs.csv"
14
+
15
+
16
def run_ecoeval(model_id: str, dataset_name: str, max_tasks: int):
    """Benchmark one model on one dataset under energy tracking and log the run.

    Loads the dataset, executes ``run_benchmark`` through ``run_with_energy``,
    appends a summary row to ``RUNS_CSV`` and returns the values for the three
    Gradio output components.

    Args:
        model_id: Model identifier handed to ``EcoEvalConfig``.
        dataset_name: Name handed to ``load_dataset_by_name``.
        max_tasks: Upper bound on the number of tasks to run. ``None`` or a
            non-positive value means "the whole dataset". Fractional values
            are truncated to an integer.

    Returns:
        A ``(summary_df, summary_text, per_task_df)`` tuple: a one-row
        DataFrame of run-level metrics, a Markdown summary string, and a
        DataFrame built from ``metrics["per_task"]``.
    """
    dataset = load_dataset_by_name(dataset_name)
    dataset_size = len(dataset)

    # UI sliders may deliver the value as a float; normalise it so the
    # benchmark always receives an integer limit.
    requested = int(max_tasks) if max_tasks is not None else 0
    max_tasks = min(requested, dataset_size) if requested > 0 else dataset_size

    cfg = EcoEvalConfig(
        model_id=model_id,
        max_new_tokens=128,
        temperature=0.2,
        top_p=0.95,
    )

    def bench_fn():
        return run_benchmark(dataset, cfg, limit=max_tasks)

    metrics = run_with_energy(bench_fn, project_name="EcoEval-LLM")

    tasks = metrics["tasks"]
    passed = metrics["passed"]
    # Energy/emission figures are optional: the key may be absent or None.
    energy_kwh = metrics.get("energy_kwh")
    emissions_kg = metrics.get("emissions_kg")

    # Build single-run summary table
    run_row = {
        "Model": model_id,
        "Dataset": dataset_name,
        "Tasks": tasks,
        "Passed": passed,
        "Accuracy": round(metrics["accuracy"], 3),
        "Runtime (s)": round(metrics["runtime_seconds"], 2),
        "Energy (kWh)": round(energy_kwh, 5) if energy_kwh is not None else None,
        "CO2eq (kg)": round(emissions_kg, 5) if emissions_kg is not None else None,
        # Per-unit ratios are only defined when the denominator is positive.
        "Energy / Task (kWh)": (
            round(energy_kwh / tasks, 6)
            if energy_kwh is not None and tasks > 0
            else None
        ),
        "CO2eq / Passed (kg)": (
            round(emissions_kg / passed, 6)
            if emissions_kg is not None and passed > 0
            else None
        ),
    }

    summary_df = pd.DataFrame([run_row])

    # Persist run to leaderboard CSV
    append_run_to_csv(RUNS_CSV, run_row)

    # Show "N/A" for a missing *or* None measurement, matching the table's
    # treatment of None as "not measured" instead of printing "None kWh".
    energy_text = "N/A" if energy_kwh is None else energy_kwh
    emissions_text = "N/A" if emissions_kg is None else emissions_kg

    summary_text = (
        f"### Run summary\n"
        f"- **Model**: `{model_id}`\n"
        f"- **Dataset**: `{dataset_name}`\n"
        f"- **Tasks**: {tasks}\n"
        f"- **Passed**: {passed} \n"
        f"- **Accuracy**: {metrics['accuracy']:.3f}\n"
        f"- **Runtime**: {metrics['runtime_seconds']:.2f} s\n"
        f"- **Energy**: {energy_text} kWh\n"
        f"- **CO₂eq**: {emissions_text} kg\n"
    )

    per_task_df = pd.DataFrame(metrics["per_task"])

    return summary_df, summary_text, per_task_df
81
+
82
+
83
def refresh_leaderboard():
    """Load the stored runs and return them ordered for the leaderboard view.

    Rows are sorted by "Accuracy" descending, then "Energy (kWh)" ascending.
    A sort key is used only if its column is present, so a CSV that lacks one
    of the two columns is still displayed instead of raising ``KeyError``.

    Returns:
        A DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        ``load_leaderboard`` yields ``None`` or no rows.
    """
    df = load_leaderboard(RUNS_CSV)
    if df is None or df.empty:
        return pd.DataFrame()

    # (column, ascending) pairs in priority order; keep only existing columns.
    preferred_order = (("Accuracy", False), ("Energy (kWh)", True))
    sort_spec = [(col, asc) for col, asc in preferred_order if col in df.columns]

    if sort_spec:
        sort_cols = [col for col, _ in sort_spec]
        directions = [asc for _, asc in sort_spec]
        df = df.sort_values(by=sort_cols, ascending=directions)

    return df.reset_index(drop=True)
96
+
97
+
98
def build_app():
    """Assemble the Gradio Blocks UI and return it without launching.

    The UI has two tabs: "Run Benchmark", whose button calls ``run_ecoeval``
    with the model id, dataset and task limit and fills the summary table,
    summary Markdown and per-task table; and "Leaderboard", whose button calls
    ``refresh_leaderboard`` and fills the leaderboard table.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    dataset_options = list_available_datasets()
    # Guard against an empty registry: indexing [0] would raise IndexError
    # and prevent the whole app from starting.
    default_dataset = dataset_options[0] if dataset_options else None

    # NOTE(review): component nesting below was reconstructed from a listing
    # without indentation — confirm the slider belongs under the row.
    with gr.Blocks(title="EcoEval-LLM: Energy & Carbon Benchmarking for LLM Code Generation") as demo:
        gr.Markdown(
            """
            # 🌱 EcoEval-LLM
            Evaluate code generation models on **correctness**, **runtime**, **energy usage**, and **carbon emissions**.

            This Space runs a small code-generation benchmark, executes unit tests, and tracks energy & CO₂ with [CodeCarbon](https://github.com/mlco2/codecarbon).
            """
        )

        with gr.Tab("Run Benchmark"):
            with gr.Row():
                model_in = gr.Textbox(
                    label="Model ID (Hugging Face Hub)",
                    value="Salesforce/codegen-350M-multi",
                    info="Any causal LM checkpoint that can generate Python code.",
                )
                dataset_in = gr.Dropdown(
                    choices=dataset_options,
                    value=default_dataset,
                    label="Dataset",
                )

            max_tasks_in = gr.Slider(
                minimum=1,
                maximum=50,
                step=1,
                value=5,
                label="Max tasks to evaluate",
                info="For heavy models, start small.",
            )

            run_btn = gr.Button("🚀 Run EcoEval Benchmark", variant="primary")

            gr.Markdown("### Run-level metrics")
            # Headers mirror the keys of the row dict built by run_ecoeval.
            summary_table = gr.Dataframe(
                headers=[
                    "Model",
                    "Dataset",
                    "Tasks",
                    "Passed",
                    "Accuracy",
                    "Runtime (s)",
                    "Energy (kWh)",
                    "CO2eq (kg)",
                    "Energy / Task (kWh)",
                    "CO2eq / Passed (kg)",
                ],
                interactive=False,
                wrap=True,
            )

            summary_md = gr.Markdown()

            gr.Markdown("### Per-task results")
            per_task_table = gr.Dataframe(
                headers=[
                    "task_id",
                    "prompt_preview",
                    "passed",
                    "runtime_s",
                ],
                interactive=False,
                wrap=True,
            )

            # Output order must match run_ecoeval's return tuple.
            run_btn.click(
                fn=run_ecoeval,
                inputs=[model_in, dataset_in, max_tasks_in],
                outputs=[summary_table, summary_md, per_task_table],
            )

        with gr.Tab("Leaderboard"):
            gr.Markdown(
                "Global history of runs in this Space (sorted by accuracy, then energy)."
            )
            refresh_btn = gr.Button("🔄 Refresh leaderboard")
            leaderboard_table = gr.Dataframe(interactive=False, wrap=True)

            refresh_btn.click(
                fn=refresh_leaderboard,
                inputs=None,
                outputs=leaderboard_table,
            )

    return demo
187
+
188
+
189
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    demo = build_app()
    demo.launch()