# robo-eval / app.py
import gradio as gr
import gymnasium as gym
from stable_baselines3 import PPO
from huggingface_hub import HfApi, snapshot_download, login
import pandas as pd
import os
import shutil
import time
# --- CONFIGURATION ---
HF_TOKEN = os.environ.get("HF_TOKEN")
REQUESTS_DATASET = "gberseth/rl-leaderboard-requests" # REPLACE THIS
RESULTS_DATASET = "gberseth/rl-leaderboard-results" # REPLACE THIS
ENV_NAME = "CartPole-v1" # The Gym environment to evaluate
EVAL_EPISODES = 10 # How many times to run the agent
# Authenticate
login(token=HF_TOKEN)
api = HfApi()
def evaluate_policy(model_id):
"""
Downloads a PPO model from HF Hub, runs it in Gym, returns mean reward.
"""
print(f"Starting evaluation for: {model_id}")
try:
# 1. Download the model repository
# We look for a file named "ppo_cartpole.zip" or just standard "model.zip"
# Adjust 'allow_patterns' to match what you require users to submit.
repo_path = snapshot_download(repo_id=model_id, allow_patterns=["*.zip"])
# Find the .zip file in the downloaded folder
model_file = None
for root, dirs, files in os.walk(repo_path):
for file in files:
if file.endswith(".zip"):
model_file = os.path.join(root, file)
break
if not model_file:
return None, "Error: No .zip model file found in repo."
# 2. Load the PPO Agent
# custom_objects map may be needed if python versions differ, but usually fine for PPO
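        # Hedged alternative (an assumption, not part of the original code): if loading
        # fails because of a Python/SB3 version mismatch, Stable-Baselines3 documents
        # passing placeholder custom_objects to skip unpickling the schedules, e.g.:
        # model = PPO.load(model_file, custom_objects={
        #     "learning_rate": 0.0,
        #     "lr_schedule": lambda _: 0.0,
        #     "clip_range": lambda _: 0.0,
        # })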
        model = PPO.load(model_file)

        # 3. Run Evaluation Loop
        env = gym.make(ENV_NAME)
        total_rewards = []
        for i in range(EVAL_EPISODES):
            obs, _ = env.reset()
            terminated = False
            truncated = False
            episode_reward = 0
            while not (terminated or truncated):
                # PPO prediction
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward
            total_rewards.append(episode_reward)

        mean_reward = sum(total_rewards) / len(total_rewards)
        env.close()
        return mean_reward, "Success"

    except Exception as e:
        print(f"Evaluation failed: {e}")
        return None, str(e)
def run_evaluation_loop():
"""
Main loop: Pulls requests, checks for 'Pending', evaluates, updates datasets.
"""
print("Checking for new submissions...")
# 1. Load the Requests Dataset
# We use pandas to read the CSV directly from the Hub
try:
requests_df = pd.read_csv(f"hf://datasets/{REQUESTS_DATASET}/requests.csv")
except Exception:
# If dataset doesn't exist yet, create an empty one locally (for testing)
print("Requests dataset not found or empty.")
return "No requests found."
    # 2. Filter for Pending Submissions
    # Assuming columns: [model_id, status, submitted_by]
    pending_rows = requests_df[requests_df["status"] == "Pending"]
    if len(pending_rows) == 0:
        return "No pending submissions."

    # 3. Process the first pending submission
    row_index = pending_rows.index[0]
    model_id = pending_rows.loc[row_index, "model_id"]
    print(f"Evaluating {model_id}...")

    # Run the Eval
    score, status_msg = evaluate_policy(model_id)

    # 4. Update the Dataframes
    # Update Requests (Mark as Done or Failed)
    requests_df.loc[row_index, "status"] = "Done" if score is not None else "Failed"

    # Prepare Results Row
    if score is not None:
        new_result = {
            "model_id": model_id,
            "mean_reward": score,
            "status": "Success"
        }
        # Load Results Dataset
        try:
            results_df = pd.read_csv(f"hf://datasets/{RESULTS_DATASET}/results.csv")
        except Exception:
            results_df = pd.DataFrame(columns=["model_id", "mean_reward", "status"])
        # Append new result
        results_df = pd.concat([results_df, pd.DataFrame([new_result])], ignore_index=True)

        # Save Results to Hub
        results_df.to_csv("results.csv", index=False)
        api.upload_file(
            path_or_fileobj="results.csv",
            path_in_repo="results.csv",
            repo_id=RESULTS_DATASET,
            repo_type="dataset"
        )

    # Save Requests Updates to Hub
    requests_df.to_csv("requests.csv", index=False)
    api.upload_file(
        path_or_fileobj="requests.csv",
        path_in_repo="requests.csv",
        repo_id=REQUESTS_DATASET,
        repo_type="dataset"
    )

    return f"Processed {model_id}: Score {score}"
# --- GRADIO UI (To keep the Space running) ---
with gr.Blocks() as demo:
    gr.Markdown("# RL Evaluation Backend")
    gr.Markdown("This space runs in the background to evaluate new submissions.")

    # A button to manually trigger eval (useful for debugging)
    eval_btn = gr.Button("Run Evaluator Now")
    output = gr.Textbox(label="Logs")
    eval_btn.click(fn=run_evaluation_loop, outputs=output)
    # Auto-running every 60 seconds requires either an external scheduler or a Gradio timer.
    # In a real deployment, you might use a simplified cron loop (see the sketch above the
    # UI section) or `gr.Timer`, as sketched below.
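    # Hedged sketch (an assumption, not part of the original code): `gr.Timer` is available
    # in recent Gradio releases; the 60-second interval is an arbitrary choice. Uncomment to enable.
    # timer = gr.Timer(60)
    # timer.tick(fn=run_evaluation_loop, outputs=output)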
demo.queue().launch()