Spaces:

yourbench
/

advanced

Running on CPU Upgrade

App Files Files Community

alozowski HF Staff committed on Mar 24

Commit

9562cba

1 Parent(s): 8ff0670

Update Evaluation tab

Browse files

Files changed (2) hide show

yourbench_space/app.py +66 -20
yourbench_space/evaluation.py +3 -1

yourbench_space/app.py CHANGED Viewed

@@ -9,7 +9,7 @@ from loguru import logger
 import gradio as gr
 from datasets import load_dataset
-from huggingface_hub import whoami
 from yourbench_space import PATH
 from yourbench_space.utils import (
     STAGES,
@@ -136,23 +136,26 @@ def enable_button(files):
     return gr.update(interactive=bool(files))
-def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
-    # Test dataset existence
     eval_ds_name = f"{org_name}/{eval_name}"
-    # Test dataset existence
     try:
-        load_dataset(eval_ds_name, streaming=True, token=oauth_token.token)
     except Exception as e:
-        print(f"Error while loading the dataset: {e}")
-        return
-    # Run evaluations
-    create_eval_file(eval_ds_name)
-    status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
-    # Create space
-    from huggingface_hub import HfApi
-    repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
     api = HfApi()
     try:
         api.create_repo(
@@ -161,10 +164,30 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
             space_sdk="gradio",
             token=oauth_token.token,
         )
         api.upload_folder(
             repo_id=repo_id,
             repo_type="space",
-            folder_path="src/",
             token=oauth_token.token,
         )
         api.add_space_secret(
@@ -176,8 +199,12 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
         api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
         api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
     except Exception as e:
-        status = "Evaluation" + status + "\nLeaderboard creation:" + e
-    return status
 def init_session(profile: gr.OAuthProfile | None):
@@ -338,11 +365,30 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
                     outputs=[log_output, stages_table],
                 )
         with gr.Tab("Evaluate", id=2):
-            with gr.Row():
-                btn_launch_evals = gr.Button("Launch evaluations")
-                status = gr.Textbox(label="Status")
-            btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name], status)
     app.load(init_session, outputs=session_state)

 import gradio as gr
 from datasets import load_dataset
+from huggingface_hub import whoami, HfApi
 from yourbench_space import PATH
 from yourbench_space.utils import (
     STAGES,
     return gr.update(interactive=bool(files))
+def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name, config_name="lighteval"):
     eval_ds_name = f"{org_name}/{eval_name}"
+    repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
+    folder_path = str(Path(PATH) / "yourbench_space" / "leaderboard_space")
     try:
+        load_dataset(eval_ds_name, name=config_name, streaming=True, token=oauth_token.token)
     except Exception as e:
+        logger.error(f"Failed to load dataset '{eval_ds_name}': {e}")
+        return "❌ Failed: Dataset loading error"
+    try:
+        create_eval_file(eval_ds_name)
+        status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
+    except Exception as e:
+        logger.error(f"Evaluation error: {e}")
+        return f"❌ Failed: Evaluation error\n{e}"
     api = HfApi()
+    space_was_regenerated = False
     try:
         api.create_repo(
             space_sdk="gradio",
             token=oauth_token.token,
         )
+    except Exception as e:
+        if "409" in str(e) and "already created this space repo" in str(e):
+            logger.info(f"Space '{repo_id}' already exists. Deleting and regenerating it.")
+            try:
+                api.delete_repo(repo_id=repo_id, repo_type="space", token=oauth_token.token)
+                api.create_repo(
+                    repo_id=repo_id,
+                    repo_type="space",
+                    space_sdk="gradio",
+                    token=oauth_token.token,
+                )
+                space_was_regenerated = True
+            except Exception as delete_err:
+                logger.error(f"Failed to delete and recreate space '{repo_id}': {delete_err}")
+                return f"✅ Evaluation succeeded\n❌ Failed: Could not recreate space\n{delete_err}"
+        else:
+            logger.error(f"Space creation error: {e}")
+            return f"✅ Evaluation succeeded\n❌ Failed: Space creation error\n{e}"
+    try:
         api.upload_folder(
             repo_id=repo_id,
             repo_type="space",
+            folder_path=folder_path,
             token=oauth_token.token,
         )
         api.add_space_secret(
         api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
         api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
     except Exception as e:
+        logger.error(f"Failed during space setup: {e}")
+        return f"✅ Evaluation succeeded\n❌ Failed: Space setup error\n{e}"
+    if space_was_regenerated:
+        return f"✅ Evaluation succeeded\n🔁 Space '{repo_id}' was regenerated successfully"
+    return f"✅ Evaluation and Space creation completed successfully for: {repo_id}"
 def init_session(profile: gr.OAuthProfile | None):
                     outputs=[log_output, stages_table],
                 )
+        # with gr.Tab("Evaluate", id=2):
+        #     with gr.Row():
+        #         btn_launch_evals = gr.Button("Launch evaluations")
+        #         status = gr.Textbox(label="Status")
+        #     btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")], status)
         with gr.Tab("Evaluate", id=2):
+            with gr.Column():
+                gr.Markdown("### 🧪 Run YourBench Evaluation")
+                gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
+                with gr.Row():
+                    btn_launch_evals = gr.Button("🚀 Launch Evaluation", variant="primary")
+                    clear_status_btn = gr.Button("Clear", variant="secondary")
+                with gr.Accordion("Evaluation Log", open=True):
+                    eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
+                btn_launch_evals.click(
+                    run_evaluation_pipeline,
+                    [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")],
+                    eval_status,
+                )
+                clear_status_btn.click(lambda: "", outputs=eval_status)
     app.load(init_session, outputs=session_state)

yourbench_space/evaluation.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import subprocess
 import asyncio
 from yourbench_space.leaderboard_space.env import INIT_MODELS
@@ -11,7 +12,8 @@ OUTPUT_DIR = "/data" if ON_SPACES else "."
 def create_eval_file(eval_ds_name: str):
     task_name = eval_ds_name.replace("/", "_")
-    subprocess.run(["lighteval", "tasks", "create", "examples/custom_tasks_templates/custom_yourbench_task.py", task_name, eval_ds_name])
 async def run_process(args: list) -> dict:
     process = await asyncio.create_subprocess_exec(

 import os
 import subprocess
 import asyncio
+from pathlib import Path
 from yourbench_space.leaderboard_space.env import INIT_MODELS
 def create_eval_file(eval_ds_name: str):
     task_name = eval_ds_name.replace("/", "_")
+    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
+    subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])
 async def run_process(args: list) -> dict:
     process = await asyncio.create_subprocess_exec(