Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update Evaluation tab
Browse files- yourbench_space/app.py +66 -20
- yourbench_space/evaluation.py +3 -1
yourbench_space/app.py
CHANGED
|
@@ -9,7 +9,7 @@ from loguru import logger
|
|
| 9 |
|
| 10 |
import gradio as gr
|
| 11 |
from datasets import load_dataset
|
| 12 |
-
from huggingface_hub import whoami
|
| 13 |
from yourbench_space import PATH
|
| 14 |
from yourbench_space.utils import (
|
| 15 |
STAGES,
|
|
@@ -136,23 +136,26 @@ def enable_button(files):
|
|
| 136 |
return gr.update(interactive=bool(files))
|
| 137 |
|
| 138 |
|
| 139 |
-
def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
|
| 140 |
-
# Test dataset existence
|
| 141 |
eval_ds_name = f"{org_name}/{eval_name}"
|
| 142 |
-
|
|
|
|
|
|
|
| 143 |
try:
|
| 144 |
-
load_dataset(eval_ds_name, streaming=True, token=oauth_token.token)
|
| 145 |
except Exception as e:
|
| 146 |
-
|
| 147 |
-
return
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
|
| 155 |
api = HfApi()
|
|
|
|
| 156 |
|
| 157 |
try:
|
| 158 |
api.create_repo(
|
|
@@ -161,10 +164,30 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
|
|
| 161 |
space_sdk="gradio",
|
| 162 |
token=oauth_token.token,
|
| 163 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
api.upload_folder(
|
| 165 |
repo_id=repo_id,
|
| 166 |
repo_type="space",
|
| 167 |
-
folder_path=
|
| 168 |
token=oauth_token.token,
|
| 169 |
)
|
| 170 |
api.add_space_secret(
|
|
@@ -176,8 +199,12 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
|
|
| 176 |
api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
|
| 177 |
api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
|
| 178 |
except Exception as e:
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
|
| 183 |
def init_session(profile: gr.OAuthProfile | None):
|
|
@@ -338,11 +365,30 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
| 338 |
outputs=[log_output, stages_table],
|
| 339 |
)
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
with gr.Tab("Evaluate", id=2):
|
| 342 |
-
with gr.
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
app.load(init_session, outputs=session_state)
|
| 348 |
|
|
|
|
| 9 |
|
| 10 |
import gradio as gr
|
| 11 |
from datasets import load_dataset
|
| 12 |
+
from huggingface_hub import whoami, HfApi
|
| 13 |
from yourbench_space import PATH
|
| 14 |
from yourbench_space.utils import (
|
| 15 |
STAGES,
|
|
|
|
| 136 |
return gr.update(interactive=bool(files))
|
| 137 |
|
| 138 |
|
| 139 |
+
def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name, config_name="lighteval"):
|
|
|
|
| 140 |
eval_ds_name = f"{org_name}/{eval_name}"
|
| 141 |
+
repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
|
| 142 |
+
folder_path = str(Path(PATH) / "yourbench_space" / "leaderboard_space")
|
| 143 |
+
|
| 144 |
try:
|
| 145 |
+
load_dataset(eval_ds_name, name=config_name, streaming=True, token=oauth_token.token)
|
| 146 |
except Exception as e:
|
| 147 |
+
logger.error(f"Failed to load dataset '{eval_ds_name}': {e}")
|
| 148 |
+
return "❌ Failed: Dataset loading error"
|
| 149 |
+
|
| 150 |
+
try:
|
| 151 |
+
create_eval_file(eval_ds_name)
|
| 152 |
+
status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
|
| 153 |
+
except Exception as e:
|
| 154 |
+
logger.error(f"Evaluation error: {e}")
|
| 155 |
+
return f"❌ Failed: Evaluation error\n{e}"
|
| 156 |
|
|
|
|
| 157 |
api = HfApi()
|
| 158 |
+
space_was_regenerated = False
|
| 159 |
|
| 160 |
try:
|
| 161 |
api.create_repo(
|
|
|
|
| 164 |
space_sdk="gradio",
|
| 165 |
token=oauth_token.token,
|
| 166 |
)
|
| 167 |
+
except Exception as e:
|
| 168 |
+
if "409" in str(e) and "already created this space repo" in str(e):
|
| 169 |
+
logger.info(f"Space '{repo_id}' already exists. Deleting and regenerating it.")
|
| 170 |
+
try:
|
| 171 |
+
api.delete_repo(repo_id=repo_id, repo_type="space", token=oauth_token.token)
|
| 172 |
+
api.create_repo(
|
| 173 |
+
repo_id=repo_id,
|
| 174 |
+
repo_type="space",
|
| 175 |
+
space_sdk="gradio",
|
| 176 |
+
token=oauth_token.token,
|
| 177 |
+
)
|
| 178 |
+
space_was_regenerated = True
|
| 179 |
+
except Exception as delete_err:
|
| 180 |
+
logger.error(f"Failed to delete and recreate space '{repo_id}': {delete_err}")
|
| 181 |
+
return f"✅ Evaluation succeeded\n❌ Failed: Could not recreate space\n{delete_err}"
|
| 182 |
+
else:
|
| 183 |
+
logger.error(f"Space creation error: {e}")
|
| 184 |
+
return f"✅ Evaluation succeeded\n❌ Failed: Space creation error\n{e}"
|
| 185 |
+
|
| 186 |
+
try:
|
| 187 |
api.upload_folder(
|
| 188 |
repo_id=repo_id,
|
| 189 |
repo_type="space",
|
| 190 |
+
folder_path=folder_path,
|
| 191 |
token=oauth_token.token,
|
| 192 |
)
|
| 193 |
api.add_space_secret(
|
|
|
|
| 199 |
api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
|
| 200 |
api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
|
| 201 |
except Exception as e:
|
| 202 |
+
logger.error(f"Failed during space setup: {e}")
|
| 203 |
+
return f"✅ Evaluation succeeded\n❌ Failed: Space setup error\n{e}"
|
| 204 |
+
|
| 205 |
+
if space_was_regenerated:
|
| 206 |
+
return f"✅ Evaluation succeeded\n🔁 Space '{repo_id}' was regenerated successfully"
|
| 207 |
+
return f"✅ Evaluation and Space creation completed successfully for: {repo_id}"
|
| 208 |
|
| 209 |
|
| 210 |
def init_session(profile: gr.OAuthProfile | None):
|
|
|
|
| 365 |
outputs=[log_output, stages_table],
|
| 366 |
)
|
| 367 |
|
| 368 |
+
# with gr.Tab("Evaluate", id=2):
|
| 369 |
+
# with gr.Row():
|
| 370 |
+
# btn_launch_evals = gr.Button("Launch evaluations")
|
| 371 |
+
# status = gr.Textbox(label="Status")
|
| 372 |
+
# btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")], status)
|
| 373 |
+
|
| 374 |
with gr.Tab("Evaluate", id=2):
|
| 375 |
+
with gr.Column():
|
| 376 |
+
gr.Markdown("### 🧪 Run YourBench Evaluation")
|
| 377 |
+
gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
|
| 378 |
+
|
| 379 |
+
with gr.Row():
|
| 380 |
+
btn_launch_evals = gr.Button("🚀 Launch Evaluation", variant="primary")
|
| 381 |
+
clear_status_btn = gr.Button("Clear", variant="secondary")
|
| 382 |
+
|
| 383 |
+
with gr.Accordion("Evaluation Log", open=True):
|
| 384 |
+
eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
|
| 385 |
+
|
| 386 |
+
btn_launch_evals.click(
|
| 387 |
+
run_evaluation_pipeline,
|
| 388 |
+
[hf_org_dropdown, hf_dataset_name, gr.State("lighteval")],
|
| 389 |
+
eval_status,
|
| 390 |
+
)
|
| 391 |
+
clear_status_btn.click(lambda: "", outputs=eval_status)
|
| 392 |
|
| 393 |
app.load(init_session, outputs=session_state)
|
| 394 |
|
yourbench_space/evaluation.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import subprocess
|
| 3 |
import asyncio
|
|
|
|
| 4 |
|
| 5 |
from yourbench_space.leaderboard_space.env import INIT_MODELS
|
| 6 |
|
|
@@ -11,7 +12,8 @@ OUTPUT_DIR = "/data" if ON_SPACES else "."
|
|
| 11 |
|
| 12 |
def create_eval_file(eval_ds_name: str):
|
| 13 |
task_name = eval_ds_name.replace("/", "_")
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
async def run_process(args: list) -> dict:
|
| 17 |
process = await asyncio.create_subprocess_exec(
|
|
|
|
| 1 |
import os
|
| 2 |
import subprocess
|
| 3 |
import asyncio
|
| 4 |
+
from pathlib import Path
|
| 5 |
|
| 6 |
from yourbench_space.leaderboard_space.env import INIT_MODELS
|
| 7 |
|
|
|
|
| 12 |
|
| 13 |
def create_eval_file(eval_ds_name: str):
|
| 14 |
task_name = eval_ds_name.replace("/", "_")
|
| 15 |
+
template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
|
| 16 |
+
subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])
|
| 17 |
|
| 18 |
async def run_process(args: list) -> dict:
|
| 19 |
process = await asyncio.create_subprocess_exec(
|