# robo-eval / app.py
import gradio as gr
import gymnasium as gym
from stable_baselines3 import PPO
from huggingface_hub import HfApi, snapshot_download, login
import pandas as pd
import os
import shutil
import time
# --- CONFIGURATION ---
HF_TOKEN = os.environ.get("HF_TOKEN")
REQUESTS_DATASET = "gberseth/rl-leaderboard-requests" # REPLACE THIS
RESULTS_DATASET = "gberseth/rl-leaderboard-results" # REPLACE THIS
ENV_NAME = "CartPole-v1" # The Gym environment to evaluate
EVAL_EPISODES = 10 # How many times to run the agent
# Authenticate
login(token=HF_TOKEN)
api = HfApi()
def evaluate_policy(model_id):
"""
Downloads a PPO model from HF Hub, runs it in Gym, returns mean reward.
"""
print(f"Starting evaluation for: {model_id}")
try:
# 1. Download the model repository
# We look for a file named "ppo_cartpole.zip" or just standard "model.zip"
# Adjust 'allow_patterns' to match what you require users to submit.
repo_path = snapshot_download(repo_id=model_id, allow_patterns=["*.zip"])
# Find the .zip file in the downloaded folder
model_file = None
for root, dirs, files in os.walk(repo_path):
for file in files:
if file.endswith(".zip"):
model_file = os.path.join(root, file)
break
if not model_file:
return None, "Error: No .zip model file found in repo."
# 2. Load the PPO Agent
# custom_objects map may be needed if python versions differ, but usually fine for PPO
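        # Hedged alternative (an assumption, not part of the original code): if loading
        # fails because of a Python/SB3 version mismatch, Stable-Baselines3 documents
        # passing placeholder custom_objects to skip unpickling the schedules, e.g.:
        # model = PPO.load(model_file, custom_objects={
        #     "learning_rate": 0.0,
        #     "lr_schedule": lambda _: 0.0,
        #     "clip_range": lambda _: 0.0,
        # })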
        model = PPO.load(model_file)

        # 3. Run Evaluation Loop
        env = gym.make(ENV_NAME)
        total_rewards = []
        for i in range(EVAL_EPISODES):
            obs, _ = env.reset()
            terminated = False
            truncated = False
            episode_reward = 0
            while not (terminated or truncated):
                # PPO prediction
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward
            total_rewards.append(episode_reward)

        mean_reward = sum(total_rewards) / len(total_rewards)
        env.close()
        return mean_reward, "Success"

    except Exception as e:
        print(f"Evaluation failed: {e}")
        return None, str(e)
def run_evaluation_loop():
"""
Main loop: Pulls requests, checks for 'Pending', evaluates, updates datasets.
"""
print("Checking for new submissions...")
# 1. Load the Requests Dataset
# We use pandas to read the CSV directly from the Hub
try:
requests_df = pd.read_csv(f"hf://datasets/{REQUESTS_DATASET}/requests.csv")
except Exception:
# If dataset doesn't exist yet, create an empty one locally (for testing)
print("Requests dataset not found or empty.")
return "No requests found."
    # 2. Filter for Pending Submissions
    # Assuming columns: [model_id, status, submitted_by]
    pending_rows = requests_df[requests_df["status"] == "Pending"]
    if len(pending_rows) == 0:
        return "No pending submissions."

    # 3. Process the first pending submission
    row_index = pending_rows.index[0]
    model_id = pending_rows.loc[row_index, "model_id"]
    print(f"Evaluating {model_id}...")

    # Run the Eval
    score, status_msg = evaluate_policy(model_id)

    # 4. Update the Dataframes
    # Update Requests (Mark as Done or Failed)
    requests_df.loc[row_index, "status"] = "Done" if score is not None else "Failed"

    # Prepare Results Row
    if score is not None:
        new_result = {
            "model_id": model_id,
            "mean_reward": score,
            "status": "Success"
        }
        # Load Results Dataset
        try:
            results_df = pd.read_csv(f"hf://datasets/{RESULTS_DATASET}/results.csv")
        except Exception:
            results_df = pd.DataFrame(columns=["model_id", "mean_reward", "status"])
        # Append new result
        results_df = pd.concat([results_df, pd.DataFrame([new_result])], ignore_index=True)

        # Save Results to Hub
        results_df.to_csv("results.csv", index=False)
        api.upload_file(
            path_or_fileobj="results.csv",
            path_in_repo="results.csv",
            repo_id=RESULTS_DATASET,
            repo_type="dataset"
        )

    # Save Requests Updates to Hub
    requests_df.to_csv("requests.csv", index=False)
    api.upload_file(
        path_or_fileobj="requests.csv",
        path_in_repo="requests.csv",
        repo_id=REQUESTS_DATASET,
        repo_type="dataset"
    )

    return f"Processed {model_id}: Score {score}"
# --- GRADIO UI (To keep the Space running) ---
with gr.Blocks() as demo:
    gr.Markdown("# RL Evaluation Backend")
    gr.Markdown("This space runs in the background to evaluate new submissions.")

    # A button to manually trigger eval (useful for debugging)
    eval_btn = gr.Button("Run Evaluator Now")
    output = gr.Textbox(label="Logs")
    eval_btn.click(fn=run_evaluation_loop, outputs=output)
    # Auto-running every 60 seconds requires either an external scheduler or a Gradio timer.
    # In a real deployment, you might use a simplified cron loop (see the sketch above the
    # UI section) or `gr.Timer`, as sketched below.
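    # Hedged sketch (an assumption, not part of the original code): `gr.Timer` is available
    # in recent Gradio releases; the 60-second interval is an arbitrary choice. Uncomment to enable.
    # timer = gr.Timer(60)
    # timer.tick(fn=run_evaluation_loop, outputs=output)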
demo.queue().launch()