Spaces:

Sumukh66
/

Labexperiment

Sleeping

App Files Files Community

Labexperiment / baseline_inference.py

Sbhimraj

Add application file

aab0192 3 months ago

Raw

History Blame Contribute Delete

7.91 kB

	#!/usr/bin/env python3
	"""
	baseline_inference.py -- Baseline agent using the OpenAI API.

	Reads OPENAI_API_KEY from environment variables.
	Runs all 3 tasks (easy, medium, hard) and prints reproducible scores.

	Usage:
	# Start the server first:
	uvicorn server.app:app --port 8000

	# Then run the baseline:
	export OPENAI_API_KEY=sk-...
	python baseline_inference.py
	"""

	from __future__ import annotations

	import json
	import os
	import re
	import sys
	from typing import Any, Optional

	from openai import OpenAI

	from server.hypothesis_lab_environment import HypothesisLabEnvironment
	from models import ActionType, ExperimentType, HypLabAction, NoiseLevelTag
	from tasks import ALL_TASKS
	from tasks.task_easy import grade_easy
	from tasks.task_medium import grade_medium
	from tasks.task_hard import grade_hard


	# Base system prompt sent to the model: it describes the two JSON actions
	# the agent may emit ("experiment" and "submit") and the four experiment
	# types. run_episode uses this prompt on its own when use_hints is False.
	SYSTEM_PROMPT_RL = """You are a scientific AI assistant. You must discover hidden causal rules between variables through experimentation.

	You can take these actions (respond with valid JSON):

	EXPERIMENT -- probe the system:
	{"action_type": "experiment", "experiment_type": "<type>", "control_variable": "<var>", "target_variable": "<var>", ...}

	Experiment types:
	"intervention" -- set control_variable to control_value, observe target
	"correlation" -- sweep control_variable over control_range [min, max, n_points], observe target
	"counterfactual" -- ask what happens if control_variable changes by control_value (delta)
	"passive" -- observe target_variable in its resting state

	SUBMIT -- end the episode with your hypothesis:
	{"action_type": "submit", "hypothesis_text": "<your hypothesis>", "hypothesis_equations": ["<equation>"], "confidence": <0.0-1.0>}

	Discover the rules. Submit when ready."""

	# Baseline variant: the base prompt with strategy tips appended.
	# run_episode selects this prompt when use_hints is True (its default).
	SYSTEM_PROMPT_BASELINE = SYSTEM_PROMPT_RL + """

	Strategy tips (for baseline evaluation only -- remove for RL training):
	- Run interventions first to discover which variables are causally connected
	- Vary the control variable widely (e.g. 1, 5, 10) to detect nonlinearity
	- Don't repeat the same experiment -- redundant experiments are penalised
	- Submit early with confidence if you have strong evidence (efficiency bonus)
	- Include numerical values (slopes, thresholds) in your hypothesis for precision bonus
	"""


	# Maps a task id (the task["id"] value) to the grading function that turns
	# an episode-result dict into a score; looked up in run_all_tasks and main.
	GRADERS = {
	"easy": grade_easy,
	"medium": grade_medium,
	"hard": grade_hard,
	}

	# Upper bound on model calls per episode. parse_action forces a submit once
	# turn >= MAX_TURNS - 1, i.e. on the last permitted turn.
	MAX_TURNS = 8


	def parse_action(text: str, obs_vars: list[str], turn: int) -> Optional[HypLabAction]:
	    """Parse a HypLabAction from LLM-generated text.

	    Parsing is attempted in this order:

	    1. On the last permitted turn (``turn >= MAX_TURNS - 1``) a submit
	       action is forced, using the raw reply (truncated to 1000 characters)
	       as the hypothesis.
	    2. A JSON object is extracted -- preferably from a fenced code block,
	       otherwise the first flat ``{...}`` span in the reply -- and passed
	       to ``HypLabAction`` as keyword arguments.
	    3. If the reply mentions submitting or a hypothesis, a submit action is
	       built from the text that follows "hypothesis", "conclude" or "rule".

	    Args:
	        text: Raw assistant reply.
	        obs_vars: Variables available in the current observation. Not used
	            by the parser; kept so existing callers keep working.
	        turn: Zero-based index of the current turn.

	    Returns:
	        The parsed action, or ``None`` when no action could be recovered.
	    """
	    # Out of turns: whatever the model said becomes the final hypothesis.
	    if turn >= MAX_TURNS - 1:
	        return HypLabAction(
	            action_type=ActionType.SUBMIT,
	            hypothesis_text=text[:1000],
	            confidence=0.5,
	        )

	    # Prefer the contents of a fenced block (``` or ```json). The previous
	    # pattern used `\s` and `.?`, which could only ever match an object
	    # holding at most one character, so real fenced JSON was never captured.
	    json_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
	    raw = json_match.group(1) if json_match else text.strip()

	    # Narrow to the first brace-delimited span without nested braces, which
	    # strips any prose surrounding the JSON object.
	    brace_match = re.search(r"\{[^{}]*\}", raw, re.DOTALL)
	    if brace_match:
	        raw = brace_match.group(0)

	    try:
	        data = json.loads(raw)
	        return HypLabAction(**data)
	    except Exception:
	        # Deliberately broad: malformed JSON, a non-mapping payload, or any
	        # validation error raised by the action model all mean "not a
	        # structured action", so fall through to the keyword heuristic.
	        pass

	    text_l = text.lower()
	    if any(w in text_l for w in ["submit", "hypothesis:", "my hypothesis", "i conclude"]):
	        # The alternation must use bare `|`; the previous `\|` matched a
	        # literal pipe character, so this pattern never matched real text.
	        hyp_match = re.search(
	            r"(?:hypothesis|conclude|rule)[:\s]+(.{10,500})", text, re.IGNORECASE
	        )
	        hyp_text = hyp_match.group(1) if hyp_match else text[:500]
	        return HypLabAction(
	            action_type=ActionType.SUBMIT,
	            hypothesis_text=hyp_text.strip(),
	            confidence=0.6,
	        )

	    return None


	def run_episode(
	    client: OpenAI,
	    model: str,
	    task: dict[str, Any],
	    use_hints: bool = True,
	) -> dict[str, Any]:
	    """Play one episode of a task and collect its reward components.

	    The model is queried for at most ``MAX_TURNS`` turns. Each reply is
	    parsed into an action and stepped through a fresh
	    ``HypothesisLabEnvironment``; unparseable replies are answered with a
	    format reminder and still consume a turn. If the environment is not
	    done when the loop ends, a low-confidence fallback hypothesis is
	    submitted.

	    Args:
	        client: OpenAI client used for chat completions.
	        model: Name of the chat model to query.
	        task: Task definition; its ``"reset_kwargs"`` entry is forwarded to
	            ``env.reset`` (an optional ``"seed"`` key is passed as ``seed``).
	        use_hints: When true, use the prompt that includes strategy tips.

	    Returns:
	        Dict with the reward components of the final observation (missing
	        values become ``0.0``) and the revealed ground truth (``""`` when
	        absent).
	    """
	    lab = HypothesisLabEnvironment()
	    # Work on a copy so popping the seed leaves the task definition intact.
	    options = dict(task["reset_kwargs"])
	    episode_seed = options.pop("seed", None)

	    current = lab.reset(seed=episode_seed, **options)

	    system_prompt = SYSTEM_PROMPT_RL
	    if use_hints:
	        system_prompt = SYSTEM_PROMPT_BASELINE
	    conversation = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": current.system_message},
	    ]

	    turn = 0
	    while turn < MAX_TURNS and not current.done:
	        completion = client.chat.completions.create(
	            model=model,
	            messages=conversation,
	            temperature=0.3,
	            max_tokens=512,
	        )

	        reply = completion.choices[0].message.content or ""
	        conversation.append({"role": "assistant", "content": reply})

	        action = parse_action(reply, current.available_variables, turn)
	        if action is not None:
	            current = lab.step(action)
	            feedback = current.system_message
	        else:
	            # Nothing usable in the reply: ask again without stepping the env.
	            feedback = "Invalid action format. Please respond with a valid JSON action."
	        conversation.append({"role": "user", "content": feedback})
	        turn += 1

	    if not current.done:
	        # The episode must end with a submission so the final scores exist.
	        fallback = HypLabAction(
	            action_type=ActionType.SUBMIT,
	            hypothesis_text="Unable to determine -- insufficient experiments.",
	            confidence=0.1,
	        )
	        current = lab.step(fallback)

	    numeric_fields = (
	        "accuracy_score",
	        "precision_bonus",
	        "calibration_score",
	        "efficiency_bonus",
	        "contradiction_penalty",
	        "total_episode_reward",
	    )
	    summary: dict[str, Any] = {
	        field: getattr(current, field) or 0.0 for field in numeric_fields
	    }
	    summary["ground_truth"] = current.ground_truth_revealed or ""
	    return summary


	def run_all_tasks() -> dict[str, Any]:
	    """Run the baseline agent over every task and gather the graded scores.

	    Usable from the CLI as well as from the /baseline endpoint. The model
	    name is read from OPENAI_MODEL and defaults to "gpt-4o-mini".

	    Returns:
	        Dict keyed by task id, each value holding the graded ``"score"``
	        and the raw ``"episode_result"``, plus an ``"average_score"`` entry
	        rounded to four decimals.

	    Raises:
	        RuntimeError: If OPENAI_API_KEY is missing or empty.
	    """
	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        raise RuntimeError("OPENAI_API_KEY environment variable not set.")

	    client = OpenAI(api_key=api_key)
	    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

	    report: dict[str, Any] = {}
	    for task in ALL_TASKS:
	        task_id = task["id"]
	        outcome = run_episode(client, model, task)
	        grade = GRADERS[task_id]
	        report[task_id] = {"score": grade(outcome), "episode_result": outcome}

	    # Average before the summary key is added, so only task entries count.
	    scores = [entry["score"] for entry in report.values()]
	    report["average_score"] = round(sum(scores) / max(len(scores), 1), 4)
	    return report


	def main() -> None:
	    """CLI entry point: run every task and print per-task and summary scores.

	    Reads OPENAI_API_KEY (required) and OPENAI_MODEL (defaults to
	    "gpt-4o-mini") from the environment. Prints an error and exits with
	    status 1 when the API key is missing.
	    """
	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        print("ERROR: Set OPENAI_API_KEY environment variable.")
	        sys.exit(1)

	    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
	    client = OpenAI(api_key=api_key)

	    print("=" * 60)
	    print(" Scientific Hypothesis Lab -- Baseline Inference")
	    print(f" Model: {model}")
	    print("=" * 60)
	    print()

	    results = {}
	    for task in ALL_TASKS:
	        task_id = task["id"]
	        print(f"--- Task: {task['name']} ---")
	        print(f" {task['description']}")

	        episode_result = run_episode(client, model, task)

	        grader = GRADERS[task_id]
	        score = grader(episode_result)

	        results[task_id] = {
	            "score": score,
	            "episode_result": episode_result,
	        }

	        print(f" Total episode reward: {episode_result['total_episode_reward']:+.4f}")
	        print(f" Graded score: {score:.4f}")
	        print()

	    print("=" * 60)
	    print(" SUMMARY")
	    print("=" * 60)
	    for task_id, r in results.items():
	        print(f" {task_id:8s}: {r['score']:.4f}")

	    # Guard the divisor so an empty task list reports 0.0000 instead of
	    # raising ZeroDivisionError, matching the guard in run_all_tasks.
	    avg = sum(r["score"] for r in results.values()) / max(len(results), 1)
	    print(f" {'average':8s}: {avg:.4f}")
	    print()


	# Run the baseline only when this file is executed as a script, so that
	# importing the module (e.g. to call run_all_tasks) has no side effects.
	if __name__ == "__main__":
	main()