Spaces:
Sleeping
Sleeping
| """ | |
| Evaluation script — runs a baseline Detective (GPT-4o-mini or local model) | |
| across all 5 tasks and produces evals/baseline_results.md. | |
| Usage: | |
| python -m training.eval --baseline --n 5 | |
| python -m training.eval --model path/to/finetuned --n 5 | |
| """ | |
| import argparse | |
| import json | |
| import statistics | |
| import time | |
| from pathlib import Path | |
| from datetime import datetime | |
| import httpx | |
| from dotenv import load_dotenv | |
| load_dotenv(Path(__file__).parent.parent / ".env") | |
| from training.mindread_grpo_env import MindReadGRPOEnv | |
| TASK_IDS = ["factual_easy", "factual_hard", "belief_inference", "goal_inference", "second_order"] | |
| ENV_URL = "http://localhost:8000" | |
| def run_llm_detective(obs: dict, env: MindReadGRPOEnv, openai_model: str = "gpt-4o-mini") -> tuple[float, int]: | |
| import os | |
| import re | |
| from groq import Groq | |
| client = Groq(api_key=os.getenv("GROQ_API_KEY")) | |
| groq_model = "llama-3.1-8b-instant" | |
| system = f"""\ | |
| You are a Detective. Infer the Oracle's hidden secret by asking strategic questions. | |
| Task: {obs['task_description']} | |
| Context: {obs['context']} | |
| Oracle: {obs['oracle_persona']} | |
| Max questions: {obs['max_steps']} | |
| After asking questions (one per message), submit your hypothesis using: | |
| SUBMIT: Category: <factual|belief|goal|second_order> | |
| <your hypothesis text>\ | |
| """ | |
| messages = [{"role": "system", "content": system}] | |
| episode_id = obs["episode_id"] | |
| questions_asked = 0 | |
| max_q = obs["max_steps"] | |
| for _ in range(max_q): | |
| resp = client.chat.completions.create( | |
| model=groq_model, | |
| messages=messages, | |
| temperature=0.7, | |
| max_tokens=200, | |
| ) | |
| answer = resp.choices[0].message.content.strip() | |
| messages.append({"role": "assistant", "content": answer}) | |
| if answer.upper().startswith("SUBMIT:"): | |
| break | |
| try: | |
| result = env.step(episode_id, answer) | |
| oracle_resp = result["info"].get("oracle_response", "") | |
| messages.append({"role": "user", "content": oracle_resp}) | |
| questions_asked += 1 | |
| if result["done"]: | |
| break | |
| except Exception as e: | |
| print(f" [step error] {e}") | |
| break | |
| hyp_text = "" | |
| category = None | |
| for msg in reversed(messages): | |
| if msg["role"] == "assistant": | |
| text = msg["content"] | |
| if text.upper().startswith("SUBMIT:"): | |
| body = text[7:].strip() | |
| cat_match = re.match(r"Category:\s*(\w+)", body, re.IGNORECASE) | |
| if cat_match: | |
| category = cat_match.group(1).lower() | |
| hyp_text = body[cat_match.end():].strip() | |
| else: | |
| hyp_text = body | |
| break | |
| else: | |
| hyp_text = text | |
| break | |
| if not hyp_text: | |
| hyp_text = "Unable to determine the secret." | |
| try: | |
| result = env.submit(episode_id, hyp_text, category) | |
| return result["reward"], questions_asked | |
| except Exception as e: | |
| print(f" [submit error] {e}") | |
| return 0.0, questions_asked | |
| def evaluate_task(task_id: str, env: MindReadGRPOEnv, n_episodes: int, use_baseline: bool) -> dict: | |
| rewards = [] | |
| questions_counts = [] | |
| for i in range(n_episodes): | |
| print(f" Episode {i+1}/{n_episodes} ...", end=" ", flush=True) | |
| try: | |
| obs = env.reset(task_id=task_id) | |
| if use_baseline: | |
| reward, n_q = run_llm_detective(obs, env) | |
| else: | |
| reward, n_q = 0.3, 5 # placeholder | |
| rewards.append(reward) | |
| questions_counts.append(n_q) | |
| print(f"reward={reward:.3f} q={n_q}") | |
| time.sleep(0.5) | |
| except Exception as e: | |
| print(f"ERROR: {e}") | |
| rewards.append(0.0) | |
| questions_counts.append(0) | |
| return { | |
| "task_id": task_id, | |
| "n_episodes": n_episodes, | |
| "avg_reward": round(statistics.mean(rewards), 4), | |
| "std_reward": round(statistics.stdev(rewards) if len(rewards) > 1 else 0.0, 4), | |
| "min_reward": round(min(rewards), 4), | |
| "max_reward": round(max(rewards), 4), | |
| "avg_questions": round(statistics.mean(questions_counts), 2), | |
| "rewards": rewards, | |
| } | |
| def write_markdown(results: list[dict], output_path: Path, label: str): | |
| lines = [ | |
| f"# MindRead Evaluation Results — {label}", | |
| f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}", | |
| "", | |
| "| Task | Avg Reward | Std | Min | Max | Avg Questions |", | |
| "|------|-----------|-----|-----|-----|---------------|", | |
| ] | |
| for r in results: | |
| lines.append( | |
| f"| {r['task_id']} | {r['avg_reward']:.4f} | " | |
| f"{r['std_reward']:.4f} | {r['min_reward']:.4f} | " | |
| f"{r['max_reward']:.4f} | {r['avg_questions']:.1f} |" | |
| ) | |
| lines += ["", "## Raw Rewards", ""] | |
| for r in results: | |
| lines.append(f"**{r['task_id']}**: {r['rewards']}") | |
| output_path.write_text("\n".join(lines), encoding="utf-8") | |
| print(f"\n[eval] Written to {output_path}") | |
| def main(): | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--baseline", action="store_true", help="Run GPT-4o-mini as baseline detective") | |
| parser.add_argument("--model", default=None, help="Path to finetuned model for evaluation") | |
| parser.add_argument("--n", type=int, default=5, help="Episodes per task") | |
| parser.add_argument("--tasks", nargs="+", default=TASK_IDS) | |
| parser.add_argument("--env-url", default=ENV_URL) | |
| parser.add_argument("--output", default=None) | |
| args = parser.parse_args() | |
| env = MindReadGRPOEnv(base_url=args.env_url) | |
| results = [] | |
| for task_id in args.tasks: | |
| print(f"\n[eval] Task: {task_id} ({args.n} episodes)") | |
| r = evaluate_task(task_id, env, n_episodes=args.n, use_baseline=args.baseline) | |
| results.append(r) | |
| print(f" => avg_reward={r['avg_reward']:.4f} avg_q={r['avg_questions']:.1f}") | |
| evals_dir = Path(__file__).parent.parent / "evals" | |
| evals_dir.mkdir(exist_ok=True) | |
| if args.output: | |
| out_path = Path(args.output) | |
| elif args.baseline: | |
| out_path = evals_dir / "baseline_results.md" | |
| else: | |
| out_path = evals_dir / "trained_results.md" | |
| write_markdown(results, out_path, label="Baseline" if args.baseline else "Trained") | |
| print("\n[eval] Summary:") | |
| for r in results: | |
| print(f" {r['task_id']}: {r['avg_reward']:.4f} (±{r['std_reward']:.4f}), q={r['avg_questions']:.1f}") | |
| if __name__ == "__main__": | |
| main() | |