Labexperiment / baseline_inference.py
Sbhimraj's picture
Add application file
aab0192
Raw
History Blame Contribute Delete
7.91 kB
#!/usr/bin/env python3
"""
baseline_inference.py -- Baseline agent using the OpenAI API.

Reads OPENAI_API_KEY from environment variables; OPENAI_MODEL optionally
overrides the default model (gpt-4o-mini).
Runs all 3 tasks (easy, medium, hard) and prints reproducible scores.

Usage:
    # Start the server first:
    uvicorn server.app:app --port 8000

    # Then run the baseline:
    export OPENAI_API_KEY=sk-...
    python baseline_inference.py
"""
from __future__ import annotations
import json
import os
import re
import sys
from typing import Any, Optional
from openai import OpenAI
from server.hypothesis_lab_environment import HypothesisLabEnvironment
from models import ActionType, ExperimentType, HypLabAction, NoiseLevelTag
from tasks import ALL_TASKS
from tasks.task_easy import grade_easy
from tasks.task_medium import grade_medium
from tasks.task_hard import grade_hard
# Core system prompt: describes the two JSON action shapes (experiment / submit)
# the model may reply with. run_episode() sends this one on its own when
# use_hints is False. The text inside the literal is passed to the model
# verbatim, so edits here change agent behaviour.
SYSTEM_PROMPT_RL = """You are a scientific AI assistant. You must discover hidden causal rules between variables through experimentation.
You can take these actions (respond with valid JSON):
EXPERIMENT -- probe the system:
{"action_type": "experiment", "experiment_type": "<type>", "control_variable": "<var>", "target_variable": "<var>", ...}
Experiment types:
"intervention" -- set control_variable to control_value, observe target
"correlation" -- sweep control_variable over control_range [min, max, n_points], observe target
"counterfactual" -- ask what happens if control_variable changes by control_value (delta)
"passive" -- observe target_variable in its resting state
SUBMIT -- end the episode with your hypothesis:
{"action_type": "submit", "hypothesis_text": "<your hypothesis>", "hypothesis_equations": ["<equation>"], "confidence": <0.0-1.0>}
Discover the rules. Submit when ready."""
# Baseline variant: the core prompt with strategy tips appended. This is what
# run_episode() uses by default (use_hints=True).
SYSTEM_PROMPT_BASELINE = SYSTEM_PROMPT_RL + """
Strategy tips (for baseline evaluation only -- remove for RL training):
- Run interventions first to discover which variables are causally connected
- Vary the control variable widely (e.g. 1, 5, 10) to detect nonlinearity
- Don't repeat the same experiment -- redundant experiments are penalised
- Submit early with confidence if you have strong evidence (efficiency bonus)
- Include numerical values (slopes, thresholds) in your hypothesis for precision bonus
"""
# Maps a task's "id" to the grading function applied to the dict returned by
# run_episode(). A task id missing from this table raises KeyError at lookup.
GRADERS = {
    "easy": grade_easy,
    "medium": grade_medium,
    "hard": grade_hard,
}
# Maximum number of LLM calls per episode. parse_action() forces a submit on
# the last of these turns (turn index MAX_TURNS - 1).
MAX_TURNS = 8
def _json_objects(text: str) -> list[dict[str, Any]]:
    """Return every JSON object that decodes starting at some ``{`` in *text*.

    Candidates are listed in the order of their opening brace, so an outer
    object comes before any object nested inside it. Unlike a brace-matching
    regex this copes with nested objects and with braces inside strings.
    """
    decoder = json.JSONDecoder()
    found: list[dict[str, Any]] = []
    start = text.find("{")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            # Not valid JSON from this brace; keep scanning.
            value = None
        if isinstance(value, dict):
            found.append(value)
        start = text.find("{", start + 1)
    return found


def parse_action(text: str, obs_vars: list[str], turn: int) -> Optional[HypLabAction]:
    """Parse a HypLabAction from LLM-generated text.

    Resolution order:
      1. A JSON object in the reply (inside a ``` fence if one is present,
         otherwise anywhere in the text) that HypLabAction accepts.
      2. On the final turn (``turn >= MAX_TURNS - 1``) anything other than a
         parsed submit is replaced by a forced submit that carries the first
         1000 characters of the reply with confidence 0.5.
      3. A free-text submit when the reply contains a submit/hypothesis
         keyword (confidence 0.6).

    Args:
        text: Raw assistant reply.
        obs_vars: Variables available in the current observation. Currently
            unused; kept so existing callers keep working.
        turn: Zero-based index of the current turn.

    Returns:
        The parsed action, or None when nothing usable was found and the
        caller should ask the model to try again.
    """
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    raw = fenced.group(1) if fenced else text.strip()

    # Try each decodable object until one is a valid action.
    action: Optional[HypLabAction] = None
    for data in _json_objects(raw):
        try:
            action = HypLabAction(**data)
        except (TypeError, ValueError):
            # Unknown fields or failed validation: this object is not an
            # action, so move on to the next candidate.
            continue
        break

    is_final_turn = turn >= MAX_TURNS - 1
    if action is not None:
        # On the final turn only a genuine submit may pass through; it keeps
        # the model's own hypothesis, equations and confidence.
        if not is_final_turn or getattr(action, "action_type", None) == ActionType.SUBMIT:
            return action

    if is_final_turn:
        # Out of turns and no structured submit: submit the reply text as-is.
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=text[:1000],
            confidence=0.5,
        )

    text_l = text.lower()
    if any(w in text_l for w in ["submit", "hypothesis:", "my hypothesis", "i conclude"]):
        hyp_match = re.search(
            r"(?:hypothesis|conclude|rule)[:\s]+(.{10,500})", text, re.IGNORECASE
        )
        hyp_text = hyp_match.group(1) if hyp_match else text[:500]
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=hyp_text.strip(),
            confidence=0.6,
        )
    return None
def run_episode(
    client: OpenAI,
    model: str,
    task: dict[str, Any],
    use_hints: bool = True,
) -> dict[str, Any]:
    """Play one task episode with the LLM and collect the final score fields.

    Args:
        client: OpenAI client used for the chat completions.
        model: Model name passed to the completions endpoint.
        task: Task description; ``task["reset_kwargs"]`` is forwarded to
            ``env.reset`` with its optional ``"seed"`` entry passed as ``seed``.
        use_hints: When true the baseline prompt (with strategy tips) is used,
            otherwise the plain RL prompt.

    Returns:
        A dict with the score components read from the last observation
        (falsy values become ``0.0``) plus ``"ground_truth"`` (falsy -> ``""``).
    """
    environment = HypothesisLabEnvironment()
    kwargs = dict(task["reset_kwargs"])
    episode_seed = kwargs.pop("seed", None)
    current = environment.reset(seed=episode_seed, **kwargs)

    if use_hints:
        system_prompt = SYSTEM_PROMPT_BASELINE
    else:
        system_prompt = SYSTEM_PROMPT_RL
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": current.system_message},
    ]

    turn = 0
    while turn < MAX_TURNS and not current.done:
        completion = client.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=0.3,
            max_tokens=512,
        )
        reply = completion.choices[0].message.content or ""
        conversation.append({"role": "assistant", "content": reply})

        action = parse_action(reply, current.available_variables, turn)
        turn += 1  # an unparseable reply still uses up a turn
        if action is not None:
            current = environment.step(action)
            feedback = current.system_message
        else:
            feedback = "Invalid action format. Please respond with a valid JSON action."
        conversation.append({"role": "user", "content": feedback})

    # Turn budget spent without the episode ending: close it with a
    # low-confidence placeholder submit so the final scores get populated.
    if not current.done:
        current = environment.step(
            HypLabAction(
                action_type=ActionType.SUBMIT,
                hypothesis_text="Unable to determine -- insufficient experiments.",
                confidence=0.1,
            )
        )

    numeric_fields = (
        "accuracy_score",
        "precision_bonus",
        "calibration_score",
        "efficiency_bonus",
        "contradiction_penalty",
        "total_episode_reward",
    )
    summary: dict[str, Any] = {
        name: getattr(current, name) or 0.0 for name in numeric_fields
    }
    summary["ground_truth"] = current.ground_truth_revealed or ""
    return summary
def run_all_tasks() -> dict[str, Any]:
    """Run the baseline agent on every task in ALL_TASKS and return the scores.

    Callable from both the CLI and the /baseline endpoint.

    Returns:
        ``{task_id: {"score": ..., "episode_result": ...}, ...}`` plus an
        ``"average_score"`` entry holding the mean score rounded to 4 places.

    Raises:
        RuntimeError: If OPENAI_API_KEY is missing or empty in the environment.
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError("OPENAI_API_KEY environment variable not set.")

    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
    llm = OpenAI(api_key=key)

    results: dict[str, Any] = {}
    for task in ALL_TASKS:
        tid = task["id"]
        outcome = run_episode(llm, model_name, task)
        graded = GRADERS[tid](outcome)
        results[tid] = {"score": graded, "episode_result": outcome}

    # Take the mean before the summary key is added; max() guards an empty list.
    total = sum(entry["score"] for entry in results.values())
    task_count = max(len(results), 1)
    results["average_score"] = round(total / task_count, 4)
    return results
def main() -> None:
    """CLI entry point: run every task and print a per-task and summary report.

    Reads OPENAI_API_KEY (required) and OPENAI_MODEL (optional, defaults to
    ``gpt-4o-mini``) from the environment. When the key is missing, an error
    is written to stderr and the process exits with status 1.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # Diagnostics go to stderr so stdout carries only the score report.
        print("ERROR: Set OPENAI_API_KEY environment variable.", file=sys.stderr)
        sys.exit(1)
    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
    client = OpenAI(api_key=api_key)

    print("=" * 60)
    print(" Scientific Hypothesis Lab -- Baseline Inference")
    print(f" Model: {model}")
    print("=" * 60)
    print()

    results = {}
    for task in ALL_TASKS:
        task_id = task["id"]
        print(f"--- Task: {task['name']} ---")
        print(f" {task['description']}")
        episode_result = run_episode(client, model, task)
        grader = GRADERS[task_id]
        score = grader(episode_result)
        results[task_id] = {
            "score": score,
            "episode_result": episode_result,
        }
        print(f" Total episode reward: {episode_result['total_episode_reward']:+.4f}")
        print(f" Graded score: {score:.4f}")
        print()

    print("=" * 60)
    print(" SUMMARY")
    print("=" * 60)
    for task_id, r in results.items():
        print(f" {task_id:8s}: {r['score']:.4f}")
    # Same guard as run_all_tasks(): an empty task list must not divide by zero.
    avg = sum(r["score"] for r in results.values()) / max(len(results), 1)
    print(f" {'average':8s}: {avg:.4f}")
    print()
# Run the CLI report only when this file is executed as a script; importing
# the module (e.g. to call run_all_tasks) has no side effects.
if __name__ == "__main__":
    main()