Spaces:
Running
Running
| """ | |
| eval.py — Evaluate the fine-tuned model against the heuristic baseline on the OpenEnv. | |
| Runs episodes with: | |
| 1. The fine-tuned model (current LoRA adapter) | |
| 2. The heuristic baseline | |
| Compares average rewards across tasks. Pushes results to Hub metrics dataset. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import time | |
| from typing import Any, Dict, List | |
| import torch | |
| try: | |
| from .model_utils import push_to_hub | |
| from .openenv_loop import ( | |
| OpenEnvClient, | |
| rollout_episode, | |
| rollout_heuristic_episode, | |
| ) | |
| except ImportError: | |
| from model_utils import push_to_hub | |
| from openenv_loop import ( | |
| OpenEnvClient, | |
| rollout_episode, | |
| rollout_heuristic_episode, | |
| ) | |
def evaluate(
    client: OpenEnvClient,
    model,
    tokenizer,
    cfg: Dict[str, Any],
    output_dir: str = "/tmp/antiatropos_eval",
) -> Dict[str, Any]:
    """Run evaluation: fine-tuned model vs heuristic baseline.

    For every task, ``eval_episodes`` episodes are rolled out with the model
    (``rollout_episode``) and with the heuristic (``rollout_heuristic_episode``),
    both using the same seed per episode, and their ``avg_reward`` values are
    averaged. A tie counts as a win for the fine-tuned model.

    Args:
        client: Environment client passed through to the rollout functions.
        model: Model to evaluate. It is switched to inference mode (Unsloth's
            ``FastLanguageModel.for_inference`` when Unsloth is importable,
            ``model.eval()`` otherwise); this function does not switch it back.
        tokenizer: Tokenizer passed through to ``rollout_episode``.
        cfg: Configuration dict. Reads ``tasks`` (default
            ``["task-1", "task-2", "task-3"]``), ``eval_episodes`` (default 3)
            and ``eval_max_steps`` (default 60); the whole dict is also passed
            to ``rollout_episode``.
        output_dir: Directory that receives ``eval_results.json``; created if
            it does not exist.

    Returns:
        A summary dict with the keys ``per_task``, ``overall_ft_avg``,
        ``overall_heuristic_avg``, ``tasks_won_by_ft``, ``total_tasks`` and
        ``ft_overall_wins``. ``per_task`` maps each task id to
        ``ft_avg_reward``, ``heuristic_avg_reward``, ``ft_wins`` and
        ``ft_invalid_actions``. Averages over zero episodes are reported as 0.
    """
    import os

    tasks = cfg.get("tasks", ["task-1", "task-2", "task-3"])
    eval_episodes = cfg.get("eval_episodes", 3)
    eval_max_steps = cfg.get("eval_max_steps", 60)

    # Enable inference mode
    try:
        from unsloth import FastLanguageModel
        FastLanguageModel.for_inference(model)
    except ImportError:
        model.eval()

    results: Dict[str, Any] = {}
    all_ft_rewards: List[float] = []
    all_heur_rewards: List[float] = []

    print(f"\n{'='*70}")
    print(f"EVALUATION — {eval_episodes} episodes per task, {eval_max_steps} steps")
    print(f"{'='*70}")

    for task_id in tasks:
        ft_rewards: List[float] = []
        heur_rewards: List[float] = []
        ft_invalid = 0

        for ep in range(eval_episodes):
            seed = 1000 + ep  # Deterministic eval seeds, shared by both policies

            # Fine-tuned model episode
            ft_ep = rollout_episode(
                client, model, tokenizer, task_id,
                eval_max_steps, cfg, seed=seed,
            )
            ft_rewards.append(ft_ep.avg_reward)
            ft_invalid += ft_ep.num_invalid

            # Heuristic baseline episode
            heur_ep = rollout_heuristic_episode(
                client, task_id, eval_max_steps, seed=seed,
            )
            heur_rewards.append(heur_ep.avg_reward)

        # Guard against zero episodes, matching the overall averages below,
        # instead of raising ZeroDivisionError.
        ft_avg = sum(ft_rewards) / len(ft_rewards) if ft_rewards else 0
        heur_avg = sum(heur_rewards) / len(heur_rewards) if heur_rewards else 0
        all_ft_rewards.extend(ft_rewards)
        all_heur_rewards.extend(heur_rewards)

        ft_wins = ft_avg >= heur_avg
        winner = "FT WINS" if ft_wins else "HEURISTIC WINS"
        results[task_id] = {
            "ft_avg_reward": ft_avg,
            "heuristic_avg_reward": heur_avg,
            "ft_wins": ft_wins,
            "ft_invalid_actions": ft_invalid,
        }

        print(f"\n {task_id}:")
        print(f" FT model avg reward: {ft_avg:.4f}")
        print(f" Heuristic avg reward: {heur_avg:.4f}")
        print(f" Result: {winner}")
        print(f" Invalid actions (FT): {ft_invalid}")

    # Overall summary
    tasks_won = sum(1 for r in results.values() if r["ft_wins"])
    ft_overall = sum(all_ft_rewards) / len(all_ft_rewards) if all_ft_rewards else 0
    heur_overall = sum(all_heur_rewards) / len(all_heur_rewards) if all_heur_rewards else 0

    summary = {
        "per_task": results,
        "overall_ft_avg": ft_overall,
        "overall_heuristic_avg": heur_overall,
        "tasks_won_by_ft": tasks_won,
        "total_tasks": len(tasks),
        "ft_overall_wins": ft_overall >= heur_overall,
    }

    print(f"\n{'='*70}")
    print("EVALUATION SUMMARY")
    print(f"{'='*70}")
    print(f" FT model overall avg: {ft_overall:.4f}")
    print(f" Heuristic overall avg: {heur_overall:.4f}")
    print(f" FT wins on: {tasks_won}/{len(tasks)} tasks")
    print(f" Overall: {'FT WINS' if ft_overall >= heur_overall else 'HEURISTIC WINS'}")

    # Save eval results
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, "eval_results.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    return summary
def push_eval_results(
    results: Dict[str, Any],
    hub_dataset: str,
    run_id: str,
    iteration: int,
) -> None:
    """Flatten an evaluation summary into one row and append it to the Hub metrics dataset.

    Args:
        results: Evaluation summary. Non-dict top-level values become
            ``eval_<key>`` columns; entries under ``per_task`` become
            ``eval_<task_id>_<metric>`` columns.
        hub_dataset: Target dataset repo id. When empty, nothing is pushed.
        run_id: Stored in the row as ``run_id``.
        iteration: Stored in the row as ``step``.
    """
    if not hub_dataset:
        return

    record: Dict[str, Any] = {"run_id": run_id, "step": iteration, "type": "eval"}

    # Top-level values: nested dicts (such as "per_task") are skipped here
    # and flattened separately below.
    for key, value in results.items():
        if isinstance(value, dict):
            continue
        record[f"eval_{key}"] = value

    # Per-task metrics, one column per (task, metric) pair.
    per_task = results.get("per_task", {})
    for task_name, task_metrics in per_task.items():
        for metric_name, metric_value in task_metrics.items():
            record[f"eval_{task_name}_{metric_name}"] = metric_value

    _append_to_dataset(record, hub_dataset)
def _append_to_dataset(row: Dict[str, Any], hub_dataset: str) -> None:
    """Append a row to ``metrics.jsonl`` in a Hub dataset repo (creates the file if missing).

    The existing file is downloaded into a temporary directory, the row is
    appended as one JSON line, and the file is uploaded back. The push is
    best-effort: any failure is printed and swallowed, never raised.

    Args:
        row: JSON-serialisable mapping written as a single line. Its
            ``run_id`` and ``step`` entries, when present, are used in the
            commit message.
        hub_dataset: Dataset repo id to download from and upload to.
    """
    try:
        import os
        import tempfile

        from huggingface_hub import HfApi
        from huggingface_hub.utils import EntryNotFoundError

        api = HfApi()

        # The temporary directory is removed once the upload has finished
        # (or failed), so repeated pushes do not accumulate directories.
        with tempfile.TemporaryDirectory() as tmp_dir:
            jsonl_path = os.path.join(tmp_dir, "metrics.jsonl")

            # Download existing data or start fresh
            try:
                api.hf_hub_download(
                    repo_id=hub_dataset,
                    filename="metrics.jsonl",
                    repo_type="dataset",
                    local_dir=tmp_dir,
                )
            except EntryNotFoundError:
                # File doesn't exist yet — that's fine. Any other download
                # failure (network, auth, ...) must NOT be treated as "no
                # file": uploading a fresh one-row file would overwrite the
                # metrics history already stored on the Hub.
                pass

            # Append row
            with open(jsonl_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(row) + "\n")

            # Upload back
            api.upload_file(
                path_or_fileobj=jsonl_path,
                path_in_repo="metrics.jsonl",
                repo_id=hub_dataset,
                repo_type="dataset",
                commit_message=f"AntiAtropos metrics — {row.get('run_id', 'unknown')} step {row.get('step', '?')}",
            )

        print(f"[eval] Metrics pushed to {hub_dataset}")
    except Exception as e:
        # Metrics pushing is best-effort and must not interrupt the caller.
        print(f"[eval] Failed to push metrics: {e}")