Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| baseline_inference.py -- Baseline agent using the OpenAI API. | |
| Reads OPENAI_API_KEY from environment variables. | |
| Runs all 3 tasks (easy, medium, hard) and prints reproducible scores. | |
| Usage: | |
| # Start the server first: | |
| uvicorn server.app:app --port 8000 | |
| # Then run the baseline: | |
| export OPENAI_API_KEY=sk-... | |
| python baseline_inference.py | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import sys | |
| from typing import Any, Optional | |
| from openai import OpenAI | |
| from server.hypothesis_lab_environment import HypothesisLabEnvironment | |
| from models import ActionType, ExperimentType, HypLabAction, NoiseLevelTag | |
| from tasks import ALL_TASKS | |
| from tasks.task_easy import grade_easy | |
| from tasks.task_medium import grade_medium | |
| from tasks.task_hard import grade_hard | |
# Base system prompt: describes the JSON action schema (experiment / submit)
# the agent must answer with. run_episode() uses it as-is when use_hints=False.
SYSTEM_PROMPT_RL = """You are a scientific AI assistant. You must discover hidden causal rules between variables through experimentation.
You can take these actions (respond with valid JSON):
EXPERIMENT -- probe the system:
{"action_type": "experiment", "experiment_type": "<type>", "control_variable": "<var>", "target_variable": "<var>", ...}
Experiment types:
"intervention" -- set control_variable to control_value, observe target
"correlation" -- sweep control_variable over control_range [min, max, n_points], observe target
"counterfactual" -- ask what happens if control_variable changes by control_value (delta)
"passive" -- observe target_variable in its resting state
SUBMIT -- end the episode with your hypothesis:
{"action_type": "submit", "hypothesis_text": "<your hypothesis>", "hypothesis_equations": ["<equation>"], "confidence": <0.0-1.0>}
Discover the rules. Submit when ready."""
# Base prompt plus strategy hints; run_episode() selects it when use_hints=True
# (the default). The hint text itself says it is for baseline evaluation only.
SYSTEM_PROMPT_BASELINE = SYSTEM_PROMPT_RL + """
Strategy tips (for baseline evaluation only -- remove for RL training):
- Run interventions first to discover which variables are causally connected
- Vary the control variable widely (e.g. 1, 5, 10) to detect nonlinearity
- Don't repeat the same experiment -- redundant experiments are penalised
- Submit early with confidence if you have strong evidence (efficiency bonus)
- Include numerical values (slopes, thresholds) in your hypothesis for precision bonus
"""
# Grader lookup keyed by task id; each grader is called with the result dict
# that run_episode() returns for that task.
GRADERS = dict(
    easy=grade_easy,
    medium=grade_medium,
    hard=grade_hard,
)

# Upper bound on LLM turns per episode. parse_action() forces a submission once
# the turn index reaches MAX_TURNS - 1.
MAX_TURNS = 8
def _iter_json_objects(text: str):
    """Yield every JSON object that decodes starting at some '{' in *text*.

    Outer objects are yielded before the objects nested inside them, so a
    complete action is tried before any of its sub-objects.
    """
    decoder = json.JSONDecoder()
    pos = text.find("{")
    while pos != -1:
        try:
            obj, _end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not valid JSON from this brace (e.g. "{my plan}"); keep scanning.
            pass
        else:
            if isinstance(obj, dict):
                yield obj
        pos = text.find("{", pos + 1)


def _action_from_json(text: str) -> Optional[HypLabAction]:
    """Return the first JSON object in *text* that builds a HypLabAction, else None."""
    sources = []
    # A fenced ``` / ```json block is the model's most explicit answer: try it first.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if fenced:
        sources.append(fenced.group(1))
    sources.append(text)

    for source in sources:
        for data in _iter_json_objects(source):
            try:
                return HypLabAction(**data)
            except Exception:
                # Deliberately broad: HypLabAction is defined elsewhere and its
                # validation errors are not visible here. A rejected candidate
                # just means "try the next one", never a crash.
                continue
    return None


def parse_action(text: str, obs_vars: list[str], turn: int) -> Optional[HypLabAction]:
    """Parse a HypLabAction from LLM-generated text.

    Resolution order:
      1. JSON objects found in ``text`` (fenced block first, then anywhere in
         the text, nested objects included) are tried as ``HypLabAction(**obj)``;
         the first one that constructs wins.
      2. Otherwise, if the text contains submission wording ("submit",
         "hypothesis:", "my hypothesis", "i conclude"), a SUBMIT action is
         synthesised from the text with confidence 0.6.
      3. Otherwise ``None`` is returned so the caller can ask for a retry.

    On the final turn (``turn >= MAX_TURNS - 1``) the result is always a SUBMIT:
    the model's own well-formed submit action is used when present, so its
    confidence and equations are kept. Anything else is replaced by a forced
    submission carrying the first 1000 characters of the reply at confidence 0.5.

    Args:
        text: Raw assistant reply.
        obs_vars: Variables available in the current observation. Currently
            unused; kept so existing callers keep working.
        turn: Zero-based turn index within the episode.

    Returns:
        The parsed or synthesised action, or ``None`` if nothing usable was found.
    """
    action = _action_from_json(text)

    if turn >= MAX_TURNS - 1:
        # Last chance to answer: never return an experiment or None here.
        if action is not None and getattr(action, "action_type", None) == ActionType.SUBMIT:
            return action
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=text[:1000],
            confidence=0.5,
        )

    if action is not None:
        return action

    # Free-text fallback: the model described a conclusion without valid JSON.
    text_l = text.lower()
    if any(w in text_l for w in ["submit", "hypothesis:", "my hypothesis", "i conclude"]):
        hyp_match = re.search(
            r"(?:hypothesis|conclude|rule)[:\s]+(.{10,500})", text, re.IGNORECASE
        )
        hyp_text = hyp_match.group(1) if hyp_match else text[:500]
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=hyp_text.strip(),
            confidence=0.6,
        )
    return None
def run_episode(
    client: OpenAI,
    model: str,
    task: dict[str, Any],
    use_hints: bool = True,
) -> dict[str, Any]:
    """Play one episode of *task* with the LLM agent and return its final metrics.

    A fresh HypothesisLabEnvironment is reset with the task's ``reset_kwargs``
    (``seed`` is passed separately). The model is then queried for at most
    MAX_TURNS turns; each reply is parsed with parse_action() and stepped
    through the environment. An unparseable reply costs a turn and triggers a
    corrective user message. If the environment is still not done afterwards,
    a low-confidence placeholder hypothesis is submitted.

    Args:
        client: OpenAI client used for chat completions.
        model: Chat model name.
        task: Task definition; must contain ``reset_kwargs``.
        use_hints: Use SYSTEM_PROMPT_BASELINE when true, else SYSTEM_PROMPT_RL.

    Returns:
        Dict with the final observation's score components, total episode
        reward and revealed ground truth; missing values become 0.0 / "".
    """
    env = HypothesisLabEnvironment()

    # Work on a copy so that popping the seed leaves the task definition intact.
    env_options = dict(task["reset_kwargs"])
    episode_seed = env_options.pop("seed", None)
    current = env.reset(seed=episode_seed, **env_options)

    if use_hints:
        system_prompt = SYSTEM_PROMPT_BASELINE
    else:
        system_prompt = SYSTEM_PROMPT_RL
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": current.system_message},
    ]

    turn = 0
    while turn < MAX_TURNS and not current.done:
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.3,
            max_tokens=512,
        )
        reply = completion.choices[0].message.content or ""
        messages.append({"role": "assistant", "content": reply})

        action = parse_action(reply, current.available_variables, turn)
        if action is None:
            # Nothing usable in the reply: the turn is spent, the env is not stepped.
            feedback = "Invalid action format. Please respond with a valid JSON action."
        else:
            current = env.step(action)
            feedback = current.system_message
        messages.append({"role": "user", "content": feedback})
        turn += 1

    if not current.done:
        # Turn budget ran out without a submission; close the episode explicitly.
        fallback = HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text="Unable to determine -- insufficient experiments.",
            confidence=0.1,
        )
        current = env.step(fallback)

    metric_names = (
        "accuracy_score",
        "precision_bonus",
        "calibration_score",
        "efficiency_bonus",
        "contradiction_penalty",
        "total_episode_reward",
    )
    summary: dict[str, Any] = {
        name: getattr(current, name) or 0.0 for name in metric_names
    }
    summary["ground_truth"] = current.ground_truth_revealed or ""
    return summary
def run_all_tasks() -> dict[str, Any]:
    """Run the baseline agent on every task in ALL_TASKS and return the scores.

    Callable from both the CLI and the /baseline endpoint.

    Reads OPENAI_API_KEY (required) and OPENAI_MODEL (optional, defaults to
    "gpt-4o-mini") from the environment.

    Returns:
        A dict mapping each task id to ``{"score", "episode_result"}``, plus an
        ``"average_score"`` entry holding the mean score rounded to 4 decimals.

    Raises:
        RuntimeError: If OPENAI_API_KEY is unset or empty.
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError("OPENAI_API_KEY environment variable not set.")

    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
    llm = OpenAI(api_key=key)

    results: dict[str, Any] = {}
    for spec in ALL_TASKS:
        tid = spec["id"]
        outcome = run_episode(llm, model_name, spec)
        grade = GRADERS[tid]
        results[tid] = {
            "score": grade(outcome),
            "episode_result": outcome,
        }

    # Average over the per-task entries only; the summary key is added afterwards.
    total = sum(entry["score"] for entry in results.values())
    mean_score = total / max(len(results), 1)
    results["average_score"] = round(mean_score, 4)
    return results
def main() -> None:
    """CLI entry point: run every task in ALL_TASKS and print a score report.

    Reads OPENAI_API_KEY (required) and OPENAI_MODEL (optional, defaults to
    "gpt-4o-mini") from the environment. If the key is missing, an error is
    written to stderr and the process exits with status 1.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # Diagnostics go to stderr so stdout carries only the score report.
        print("ERROR: Set OPENAI_API_KEY environment variable.", file=sys.stderr)
        sys.exit(1)
    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
    client = OpenAI(api_key=api_key)

    print("=" * 60)
    print(" Scientific Hypothesis Lab -- Baseline Inference")
    print(f" Model: {model}")
    print("=" * 60)
    print()

    results: dict[str, Any] = {}
    for task in ALL_TASKS:
        task_id = task["id"]
        print(f"--- Task: {task['name']} ---")
        print(f" {task['description']}")
        episode_result = run_episode(client, model, task)
        grader = GRADERS[task_id]
        score = grader(episode_result)
        results[task_id] = {
            "score": score,
            "episode_result": episode_result,
        }
        print(f" Total episode reward: {episode_result['total_episode_reward']:+.4f}")
        print(f" Graded score: {score:.4f}")
        print()

    print("=" * 60)
    print(" SUMMARY")
    print("=" * 60)
    for task_id, r in results.items():
        print(f" {task_id:8s}: {r['score']:.4f}")
    # Same guard as run_all_tasks(): an empty task list yields 0.0, not ZeroDivisionError.
    avg = sum(r["score"] for r in results.values()) / max(len(results), 1)
    print(f" {'average':8s}: {avg:.4f}")
    print()


if __name__ == "__main__":
    main()