Spaces:

OpenEvals
/

InferenceProviderTesting

Running

File size: 5,763 Bytes

import gradio as gr
import time
from apscheduler.schedulers.background import BackgroundScheduler
import threading
import globals
from utils.io import initialize_models_providers_file, save_results, load_results, load_models_providers, get_results_table, load_models_providers_str
from utils.jobs import run_single_job, launch_jobs, update_job_statuses, relaunch_failed_jobs
from typing import List, Optional


def status_monitor(poll_interval_seconds: float = 240) -> None:
    """Poll job statuses forever; meant to run in a background thread.

    Repeatedly calls ``update_job_statuses()`` and then sleeps. The loop has
    no exit condition, so this function never returns.

    Args:
        poll_interval_seconds: Pause between two consecutive polls, in
            seconds. Defaults to 240 (4 minutes).
    """
    while True:
        try:
            update_job_statuses()
        except Exception as exc:
            # An uncaught exception would terminate this thread permanently,
            # and statuses would silently stop refreshing. Report the failure
            # and try again on the next cycle instead.
            print(f"[Monitor] Job status update failed: {exc!r}")
        time.sleep(poll_interval_seconds)


def daily_checkpoint() -> None:
    """Announce the checkpoint on stdout, then persist the current results."""
    announcement = "Daily checkpoint - saving current state"
    print(announcement)
    save_results()


# Create Gradio interface
def create_app() -> gr.Blocks:
    """Build the dashboard UI and wire its event handlers.

    The interface has two tabs:

    * "Main": four action buttons, a read-only status textbox, a collapsed
      accordion showing the models/providers configuration, and the job
      results table.
    * "About": static Markdown describing the evaluation setup and how to
      run the same job locally.

    The Blocks object is only constructed here; launching it is left to the
    caller.

    Returns:
        The assembled ``gr.Blocks`` application.
    """
    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

            # All action buttons in one row
            with gr.Row():
                init_btn = gr.Button("Fetch and Initialize Models/Providers", variant="secondary")
                launch_btn = gr.Button("Launch All Jobs", variant="primary")
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop")
                refresh_btn = gr.Button("Refresh Results", variant="secondary")

            # Read-only textbox that receives the message returned by the
            # button callbacks wired below.
            output = gr.Textbox(label="Status", interactive=False)

            # Accordion for viewing models/providers list
            # (collapsed by default; the initial content is read once, when
            # the UI is built, and replaced by the init button's callback).
            with gr.Accordion("Models/Providers Configuration", open=False):
                models_providers_display = gr.Code(
                    label="Current Models and Providers",
                    value=load_models_providers_str(),
                    interactive=False,
                )

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    # Eight columns are declared in `datatype`; columns 0-6
                    # are listed as static, which leaves column 7 as the one
                    # the select handler below treats as the "rerun" cell.
                    # Column 6 is rendered as HTML.
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
                        static_columns=list(range(7)),
                        datatype=["str", "str", "str", "str", "str", "str", "html", "str"],
                        elem_id="results_table"
                    )


            # Event handlers
            # The init callback feeds two components: the status textbox and
            # the configuration viewer.
            init_btn.click(
                fn=initialize_models_providers_file,
                outputs=[output, models_providers_display]
            )

            launch_btn.click(
                fn=launch_jobs,
                outputs=output
            )

            relaunch_failed_btn.click(
                fn=relaunch_failed_jobs,
                outputs=output
            )

            # Manual refresh: re-reads the results and replaces the table.
            refresh_btn.click(
                fn=get_results_table,
                outputs=results_table
            )

            # Handle dataframe cell selection for relaunch
            def handle_table_select(evt: gr.SelectData):
                """Relaunch a job when its "rerun" cell is clicked, then refresh the table.

                Args:
                    evt: Selection event; ``evt.index`` is read as
                        ``[row, column]`` and ``evt.value`` as the clicked
                        cell's value.

                Returns:
                    The result of a fresh ``get_results_table()`` call, used
                    to replace the table's content whichever cell was
                    clicked.
                """
                print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")

                # If we selected a "rerun" cell, we relaunch a job
                # (7 is the index of the only column not listed in
                # `static_columns` above).
                if evt.index[1] == 7:
                    # Get the full row data from the dataframe
                    # NOTE(review): the row index comes from the table as
                    # displayed in the browser, but it is applied to a freshly
                    # fetched table. If the two can differ in order or length
                    # (results changed since the last refresh, or the view is
                    # filtered), a different job may be relaunched - confirm.
                    df = get_results_table()
                    # `.data` is assumed to expose the underlying pandas
                    # DataFrame of the returned object - TODO confirm against
                    # utils.io.get_results_table.
                    row_data = df.data.iloc[evt.index[0]]

                    model = row_data['Model']
                    provider = row_data['Provider']
                    print(f"[Relaunch] Relaunching job - Model: {model}, Provider: {provider}")

                    run_single_job(model, provider, globals.TASKS)

                # Then update the table
                return get_results_table()

            # No component inputs are needed: the handler only uses the
            # event payload that Gradio injects through the `evt` parameter.
            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=results_table
            )
        # Static documentation tab; the Markdown below is displayed as-is.
        with gr.Tab("About"):
            gr.Markdown("""
In this demo, we run 10 samples for ifeval (instruction following), gsm_plus (grade school math problems, less contaminated than gsm8k) and gpqa, diamond subset (knowledge), 
for all models and providers combinations.

To run any of these locally, you can use the following
```python
from huggingface_hub import run_job, inspect_job, whoami
job = run_job(
    image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
    command=[
        "lighteval", "endpoint", "inference-providers", 
        "model_name=MODEL,provider=PROVIDER", 
        "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0", 
        "--max-samples", "10", 
        "--push-to-hub", "--save-details", 
        "--results-org", "YOURORG"
    ],
    namespace="huggingface",
    secrets={"HF_TOKEN": YOURTOKEN},
    token=YOURTOKEN
)
```
""")

    return demo


if __name__ == "__main__":
    # Bring back the results saved by an earlier run before anything else
    # starts reading or updating them.
    load_results()
    print("Starting Inference Provider Testing Dashboard")

    # Job status polling runs on a daemon thread, so it does not keep the
    # process alive once the main thread exits.
    monitor_thread = threading.Thread(daemon=True, target=status_monitor)
    monitor_thread.start()
    print("Job status monitor started")

    # Cron-style schedule: one checkpoint per day, at midnight.
    scheduler = BackgroundScheduler()
    scheduler.add_job(daily_checkpoint, trigger="cron", minute=0, hour=0)
    scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    # Build the UI last and serve it on all interfaces, port 7860.
    demo = create_app()
    demo.launch(server_port=7860, server_name="0.0.0.0")