Spaces:

Pranavkk
/

AntiAtropos

Running

App Files Files Community

AntiAtropos / training /eval.py

Pranavkk

Upload folder using huggingface_hub

0e94d58 verified about 1 month ago

raw

history blame contribute delete

6.22 kB

	"""
	eval.py — Evaluate the fine-tuned model against the heuristic baseline on the OpenEnv.

	Runs episodes with:
	1. The fine-tuned model (current LoRA adapter)
	2. The heuristic baseline

	Compares average rewards across tasks. Pushes results to Hub metrics dataset.
	"""

	from __future__ import annotations

	import json
	import time
	from typing import Any, Dict, List

	import torch

	try:
	from .model_utils import push_to_hub
	from .openenv_loop import (
	OpenEnvClient,
	rollout_episode,
	rollout_heuristic_episode,
	)
	except ImportError:
	from model_utils import push_to_hub
	from openenv_loop import (
	OpenEnvClient,
	rollout_episode,
	rollout_heuristic_episode,
	)


	def evaluate(
	    client: OpenEnvClient,
	    model,
	    tokenizer,
	    cfg: Dict[str, Any],
	    output_dir: str = "/tmp/antiatropos_eval",
	) -> Dict[str, Any]:
	    """Run evaluation: fine-tuned model vs heuristic baseline.

	    For each task, rolls out ``eval_episodes`` episodes with the model and
	    the same number with the heuristic policy. Both policies use the same
	    seed for a given episode index, and the mean of the per-episode
	    ``avg_reward`` values is compared. A tie counts as a win for the
	    fine-tuned model (the comparison is ``>=``).

	    Args:
	        client: Environment client, forwarded to the rollout helpers.
	        model: Model under evaluation; switched to inference mode first.
	        tokenizer: Tokenizer forwarded to ``rollout_episode``.
	        cfg: Configuration dict. Reads ``tasks``, ``eval_episodes`` and
	            ``eval_max_steps``; the whole dict is also forwarded to
	            ``rollout_episode``.
	        output_dir: Directory that receives ``eval_results.json``
	            (created if missing).

	    Returns:
	        A dict with ``per_task`` results plus ``overall_ft_avg``,
	        ``overall_heuristic_avg``, ``tasks_won_by_ft``, ``total_tasks`` and
	        ``ft_overall_wins``. Averages are reported as 0.0 when no episodes
	        were run.
	    """
	    import os

	    def _mean(values: List[float]) -> float:
	        # An empty list happens when eval_episodes <= 0 (or no tasks are
	        # configured); report 0.0 instead of raising ZeroDivisionError.
	        return sum(values) / len(values) if values else 0.0

	    tasks = cfg.get("tasks", ["task-1", "task-2", "task-3"])
	    eval_episodes = cfg.get("eval_episodes", 3)
	    eval_max_steps = cfg.get("eval_max_steps", 60)

	    # Enable inference mode: prefer unsloth's fast path, fall back to the
	    # plain ``model.eval()`` when unsloth is not installed.
	    try:
	        from unsloth import FastLanguageModel
	        FastLanguageModel.for_inference(model)
	    except ImportError:
	        model.eval()

	    results: Dict[str, Any] = {}
	    all_ft_rewards: List[float] = []
	    all_heur_rewards: List[float] = []

	    print(f"\n{'='*70}")
	    print(f"EVALUATION — {eval_episodes} episodes per task, {eval_max_steps} steps")
	    print(f"{'='*70}")

	    for task_id in tasks:
	        ft_rewards: List[float] = []
	        heur_rewards: List[float] = []
	        ft_invalid = 0

	        for ep in range(eval_episodes):
	            # Deterministic eval seeds, shared by both policies so they face
	            # the same episode.
	            seed = 1000 + ep

	            # Fine-tuned model episode
	            ft_ep = rollout_episode(
	                client, model, tokenizer, task_id,
	                eval_max_steps, cfg, seed=seed,
	            )
	            ft_rewards.append(ft_ep.avg_reward)
	            ft_invalid += ft_ep.num_invalid

	            # Heuristic baseline episode
	            heur_ep = rollout_heuristic_episode(
	                client, task_id, eval_max_steps, seed=seed,
	            )
	            heur_rewards.append(heur_ep.avg_reward)

	        ft_avg = _mean(ft_rewards)
	        heur_avg = _mean(heur_rewards)
	        all_ft_rewards.extend(ft_rewards)
	        all_heur_rewards.extend(heur_rewards)

	        ft_wins = ft_avg >= heur_avg
	        winner = "FT WINS" if ft_wins else "HEURISTIC WINS"
	        results[task_id] = {
	            "ft_avg_reward": ft_avg,
	            "heuristic_avg_reward": heur_avg,
	            "ft_wins": ft_wins,
	            "ft_invalid_actions": ft_invalid,
	        }

	        print(f"\n {task_id}:")
	        print(f" FT model avg reward: {ft_avg:.4f}")
	        print(f" Heuristic avg reward: {heur_avg:.4f}")
	        print(f" Result: {winner}")
	        print(f" Invalid actions (FT): {ft_invalid}")

	    # Overall summary
	    tasks_won = sum(1 for r in results.values() if r["ft_wins"])
	    ft_overall = _mean(all_ft_rewards)
	    heur_overall = _mean(all_heur_rewards)

	    summary = {
	        "per_task": results,
	        "overall_ft_avg": ft_overall,
	        "overall_heuristic_avg": heur_overall,
	        "tasks_won_by_ft": tasks_won,
	        "total_tasks": len(tasks),
	        "ft_overall_wins": ft_overall >= heur_overall,
	    }

	    print(f"\n{'='*70}")
	    print("EVALUATION SUMMARY")
	    print(f"{'='*70}")
	    print(f" FT model overall avg: {ft_overall:.4f}")
	    print(f" Heuristic overall avg: {heur_overall:.4f}")
	    print(f" FT wins on: {tasks_won}/{len(tasks)} tasks")
	    print(f" Overall: {'FT WINS' if ft_overall >= heur_overall else 'HEURISTIC WINS'}")

	    # Save eval results
	    os.makedirs(output_dir, exist_ok=True)
	    results_path = os.path.join(output_dir, "eval_results.json")
	    with open(results_path, "w", encoding="utf-8") as f:
	        json.dump(summary, f, indent=2)

	    return summary


	def push_eval_results(
	    results: Dict[str, Any],
	    hub_dataset: str,
	    run_id: str,
	    iteration: int,
	) -> None:
	    """Flatten an eval summary into one row and append it to the Hub dataset.

	    Does nothing when ``hub_dataset`` is empty. Top-level non-dict entries of
	    ``results`` become ``eval_<key>`` columns; each metric under
	    ``results["per_task"]`` becomes an ``eval_<task>_<metric>`` column.
	    """
	    if hub_dataset:
	        record: Dict[str, Any] = {
	            "run_id": run_id,
	            "step": iteration,
	            "type": "eval",
	        }

	        # Scalar summary fields first; nested dicts are flattened below.
	        for key, value in results.items():
	            if isinstance(value, dict):
	                continue
	            record[f"eval_{key}"] = value

	        # One column per (task, metric) pair.
	        per_task = results.get("per_task", {})
	        for task_name in per_task:
	            for metric_name, metric_value in per_task[task_name].items():
	                record[f"eval_{task_name}_{metric_name}"] = metric_value

	        _append_to_dataset(record, hub_dataset)


	def _append_to_dataset(row: Dict[str, Any], hub_dataset: str) -> None:
	    """Append a row to ``metrics.jsonl`` in a Hub dataset repo (best effort).

	    Downloads the current ``metrics.jsonl`` (if the repo has one), appends
	    ``row`` as a single JSON line and uploads the file back. Any failure is
	    reported with a printed message instead of being raised.

	    Args:
	        row: JSON-serializable metrics row; ``run_id`` and ``step`` are used
	            in the commit message when present.
	        hub_dataset: Repo id of the target dataset on the Hub.
	    """
	    try:
	        import os
	        import tempfile

	        from huggingface_hub import HfApi
	        from huggingface_hub.utils import (
	            EntryNotFoundError,
	            LocalEntryNotFoundError,
	        )

	        api = HfApi()

	        # The scratch directory is removed automatically, even on failure.
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            jsonl_path = os.path.join(tmp_dir, "metrics.jsonl")

	            # Download existing data or start fresh
	            try:
	                api.hf_hub_download(
	                    repo_id=hub_dataset,
	                    filename="metrics.jsonl",
	                    repo_type="dataset",
	                    local_dir=tmp_dir,
	                )
	            except LocalEntryNotFoundError:
	                # Raised on connectivity problems; the file may well exist on
	                # the Hub. Abort so the upload below cannot replace the
	                # existing history with a single row.
	                raise
	            except EntryNotFoundError:
	                pass  # File doesn't exist yet — that's fine

	            # Append row
	            with open(jsonl_path, "a", encoding="utf-8") as f:
	                f.write(json.dumps(row) + "\n")

	            # Upload back
	            api.upload_file(
	                path_or_fileobj=jsonl_path,
	                path_in_repo="metrics.jsonl",
	                repo_id=hub_dataset,
	                repo_type="dataset",
	                commit_message=f"AntiAtropos metrics — {row.get('run_id', 'unknown')} step {row.get('step', '?')}",
	            )
	        print(f"[eval] Metrics pushed to {hub_dataset}")

	    except Exception as e:
	        # Metrics upload is best effort: never let it break the caller.
	        print(f"[eval] Failed to push metrics: {e}")