#!/usr/bin/env python3 """ baseline_inference.py -- Baseline agent using the OpenAI API. Reads OPENAI_API_KEY from environment variables. Runs all 3 tasks (easy, medium, hard) and prints reproducible scores. Usage: # Start the server first: uvicorn server.app:app --port 8000 # Then run the baseline: export OPENAI_API_KEY=sk-... python baseline_inference.py """ from __future__ import annotations import json import os import re import sys from typing import Any, Optional from openai import OpenAI from server.hypothesis_lab_environment import HypothesisLabEnvironment from models import ActionType, ExperimentType, HypLabAction, NoiseLevelTag from tasks import ALL_TASKS from tasks.task_easy import grade_easy from tasks.task_medium import grade_medium from tasks.task_hard import grade_hard SYSTEM_PROMPT_RL = """You are a scientific AI assistant. You must discover hidden causal rules between variables through experimentation. You can take these actions (respond with valid JSON): EXPERIMENT -- probe the system: {"action_type": "experiment", "experiment_type": "", "control_variable": "", "target_variable": "", ...} Experiment types: "intervention" -- set control_variable to control_value, observe target "correlation" -- sweep control_variable over control_range [min, max, n_points], observe target "counterfactual" -- ask what happens if control_variable changes by control_value (delta) "passive" -- observe target_variable in its resting state SUBMIT -- end the episode with your hypothesis: {"action_type": "submit", "hypothesis_text": "", "hypothesis_equations": [""], "confidence": <0.0-1.0>} Discover the rules. Submit when ready.""" SYSTEM_PROMPT_BASELINE = SYSTEM_PROMPT_RL + """ Strategy tips (for baseline evaluation only -- remove for RL training): - Run interventions first to discover which variables are causally connected - Vary the control variable widely (e.g. 1, 5, 10) to detect nonlinearity - Don't repeat the same experiment -- redundant experiments are penalised - Submit early with confidence if you have strong evidence (efficiency bonus) - Include numerical values (slopes, thresholds) in your hypothesis for precision bonus """ GRADERS = { "easy": grade_easy, "medium": grade_medium, "hard": grade_hard, } MAX_TURNS = 8 def parse_action(text: str, obs_vars: list[str], turn: int) -> Optional[HypLabAction]: """Parse a HypLabAction from LLM-generated text.""" if turn >= MAX_TURNS - 1: return HypLabAction( action_type=ActionType.SUBMIT, hypothesis_text=text[:1000], confidence=0.5, ) json_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL) raw = json_match.group(1) if json_match else text.strip() brace_match = re.search(r"\{[^{}]*\}", raw, re.DOTALL) if brace_match: raw = brace_match.group(0) try: data = json.loads(raw) return HypLabAction(**data) except Exception: pass text_l = text.lower() if any(w in text_l for w in ["submit", "hypothesis:", "my hypothesis", "i conclude"]): hyp_match = re.search( r"(?:hypothesis|conclude|rule)[:\s]+(.{10,500})", text, re.IGNORECASE ) hyp_text = hyp_match.group(1) if hyp_match else text[:500] return HypLabAction( action_type=ActionType.SUBMIT, hypothesis_text=hyp_text.strip(), confidence=0.6, ) return None def run_episode( client: OpenAI, model: str, task: dict[str, Any], use_hints: bool = True, ) -> dict[str, Any]: """Run a single episode and return the grading result dict.""" env = HypothesisLabEnvironment() reset_kwargs = dict(task["reset_kwargs"]) seed = reset_kwargs.pop("seed", None) obs = env.reset(seed=seed, **reset_kwargs) prompt = SYSTEM_PROMPT_BASELINE if use_hints else SYSTEM_PROMPT_RL messages = [ {"role": "system", "content": prompt}, {"role": "user", "content": obs.system_message}, ] last_obs = obs for turn in range(MAX_TURNS): if last_obs.done: break response = client.chat.completions.create( model=model, messages=messages, temperature=0.3, max_tokens=512, ) assistant_text = response.choices[0].message.content or "" messages.append({"role": "assistant", "content": assistant_text}) action = parse_action(assistant_text, last_obs.available_variables, turn) if action is None: messages.append({ "role": "user", "content": "Invalid action format. Please respond with a valid JSON action.", }) continue last_obs = env.step(action) messages.append({"role": "user", "content": last_obs.system_message}) if not last_obs.done: submit = HypLabAction( action_type=ActionType.SUBMIT, hypothesis_text="Unable to determine -- insufficient experiments.", confidence=0.1, ) last_obs = env.step(submit) return { "accuracy_score": last_obs.accuracy_score or 0.0, "precision_bonus": last_obs.precision_bonus or 0.0, "calibration_score": last_obs.calibration_score or 0.0, "efficiency_bonus": last_obs.efficiency_bonus or 0.0, "contradiction_penalty": last_obs.contradiction_penalty or 0.0, "total_episode_reward": last_obs.total_episode_reward or 0.0, "ground_truth": last_obs.ground_truth_revealed or "", } def run_all_tasks() -> dict[str, Any]: """Run baseline agent on all tasks and return scores. Callable from both the CLI and the /baseline endpoint. Requires OPENAI_API_KEY in environment. """ api_key = os.environ.get("OPENAI_API_KEY") if not api_key: raise RuntimeError("OPENAI_API_KEY environment variable not set.") model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini") client = OpenAI(api_key=api_key) results: dict[str, Any] = {} for task in ALL_TASKS: task_id = task["id"] episode_result = run_episode(client, model, task) grader = GRADERS[task_id] score = grader(episode_result) results[task_id] = { "score": score, "episode_result": episode_result, } avg = sum(r["score"] for r in results.values()) / max(len(results), 1) results["average_score"] = round(avg, 4) return results def main(): api_key = os.environ.get("OPENAI_API_KEY") if not api_key: print("ERROR: Set OPENAI_API_KEY environment variable.") sys.exit(1) model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini") client = OpenAI(api_key=api_key) print("=" * 60) print(" Scientific Hypothesis Lab -- Baseline Inference") print(f" Model: {model}") print("=" * 60) print() results = {} for task in ALL_TASKS: task_id = task["id"] print(f"--- Task: {task['name']} ---") print(f" {task['description']}") episode_result = run_episode(client, model, task) grader = GRADERS[task_id] score = grader(episode_result) results[task_id] = { "score": score, "episode_result": episode_result, } print(f" Total episode reward: {episode_result['total_episode_reward']:+.4f}") print(f" Graded score: {score:.4f}") print() print("=" * 60) print(" SUMMARY") print("=" * 60) for task_id, r in results.items(): print(f" {task_id:8s}: {r['score']:.4f}") avg = sum(r["score"] for r in results.values()) / len(results) print(f" {'average':8s}: {avg:.4f}") print() if __name__ == "__main__": main()