Spaces:

Sumukh66
/

Labexperiment

Sleeping

File size: 7,910 Bytes

aab0192

#!/usr/bin/env python3
"""
baseline_inference.py -- Baseline agent using the OpenAI API.

Reads OPENAI_API_KEY from environment variables.
Runs all 3 tasks (easy, medium, hard) and prints reproducible scores.

Usage:
    # Start the server first:
    uvicorn server.app:app --port 8000

    # Then run the baseline:
    export OPENAI_API_KEY=sk-...
    python baseline_inference.py
"""

from __future__ import annotations

import json
import os
import re
import sys
from typing import Any, Optional

from openai import OpenAI

from server.hypothesis_lab_environment import HypothesisLabEnvironment
from models import ActionType, ExperimentType, HypLabAction, NoiseLevelTag
from tasks import ALL_TASKS
from tasks.task_easy import grade_easy
from tasks.task_medium import grade_medium
from tasks.task_hard import grade_hard


# Core system prompt: describes the task and the two JSON action shapes
# (experiment / submit) without any strategy guidance.  run_episode uses this
# prompt when called with use_hints=False.
SYSTEM_PROMPT_RL = """You are a scientific AI assistant. You must discover hidden causal rules between variables through experimentation.

You can take these actions (respond with valid JSON):

EXPERIMENT -- probe the system:
  {"action_type": "experiment", "experiment_type": "<type>", "control_variable": "<var>", "target_variable": "<var>", ...}

  Experiment types:
    "intervention"   -- set control_variable to control_value, observe target
    "correlation"    -- sweep control_variable over control_range [min, max, n_points], observe target
    "counterfactual" -- ask what happens if control_variable changes by control_value (delta)
    "passive"        -- observe target_variable in its resting state

SUBMIT -- end the episode with your hypothesis:
  {"action_type": "submit", "hypothesis_text": "<your hypothesis>", "hypothesis_equations": ["<equation>"], "confidence": <0.0-1.0>}

Discover the rules. Submit when ready."""

# Baseline variant: the core prompt with strategy tips appended.  run_episode
# uses this prompt when use_hints is True, which is its default.
SYSTEM_PROMPT_BASELINE = SYSTEM_PROMPT_RL + """

Strategy tips (for baseline evaluation only -- remove for RL training):
- Run interventions first to discover which variables are causally connected
- Vary the control variable widely (e.g. 1, 5, 10) to detect nonlinearity
- Don't repeat the same experiment -- redundant experiments are penalised
- Submit early with confidence if you have strong evidence (efficiency bonus)
- Include numerical values (slopes, thresholds) in your hypothesis for precision bonus
"""


# Grader lookup keyed by task id.  Each grader is called with the result
# dict returned by run_episode and produces that task's score.
GRADERS = dict(
    easy=grade_easy,
    medium=grade_medium,
    hard=grade_hard,
)

# Maximum number of model turns per episode.  parse_action forces a submit
# on the last of these turns.
MAX_TURNS = 8


def _extract_json_object(text: str) -> Optional[dict[str, Any]]:
    """Return the first complete JSON object embedded in *text*, or None.

    Every ``{`` is tried as a potential start and the JSON decoder finds the
    matching end, so nested objects and braces inside string values are
    handled correctly (a ``\\{[^{}]*\\}`` regex only sees innermost pairs).
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


def parse_action(text: str, obs_vars: list[str], turn: int) -> Optional[HypLabAction]:
    """Parse a HypLabAction from LLM-generated text.

    Args:
        text: Raw assistant reply.
        obs_vars: Variables available in the current observation.  Currently
            unused; kept so existing callers keep working.
        turn: Zero-based turn index within the episode.

    Returns:
        The parsed action, a heuristic SUBMIT action when the reply reads
        like a conclusion, or None when nothing usable was found.
    """
    # Last allowed turn: always end the episode, using the reply verbatim
    # (truncated) as the hypothesis.
    if turn >= MAX_TURNS - 1:
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=text[:1000],
            confidence=0.5,
        )

    # Prefer JSON inside a fenced code block; fall back to the whole reply.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    data = _extract_json_object(fenced.group(1)) if fenced else None
    if data is None:
        data = _extract_json_object(text)

    if data is not None:
        try:
            return HypLabAction(**data)
        except Exception:
            # Deliberate best-effort: a well-formed JSON object that does not
            # satisfy the action model (unknown field, bad value) must not
            # crash the episode; fall through to the text heuristics below.
            pass

    # Heuristic fallback: treat free-text conclusions as a submission.
    text_l = text.lower()
    if any(w in text_l for w in ["submit", "hypothesis:", "my hypothesis", "i conclude"]):
        hyp_match = re.search(
            r"(?:hypothesis|conclude|rule)[:\s]+(.{10,500})", text, re.IGNORECASE
        )
        hyp_text = hyp_match.group(1) if hyp_match else text[:500]
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=hyp_text.strip(),
            confidence=0.6,
        )

    return None


def run_episode(
    client: OpenAI,
    model: str,
    task: dict[str, Any],
    use_hints: bool = True,
) -> dict[str, Any]:
    """Play one episode of *task* with the LLM and collect the final scores.

    A fresh environment is reset with the task's ``reset_kwargs`` (``seed``
    is passed separately), then the model is queried for up to MAX_TURNS
    turns.  Unparseable replies are answered with a format reminder and
    still consume a turn.  If the episode has not finished afterwards, a
    low-confidence fallback hypothesis is submitted.

    Returns:
        A dict of the final observation's score fields (missing values
        become 0.0) plus ``"ground_truth"`` (missing value becomes "").
    """
    env = HypothesisLabEnvironment()

    # Work on a copy so popping the seed does not mutate the task definition.
    reset_args = dict(task["reset_kwargs"])
    seed = reset_args.pop("seed", None)
    current = env.reset(seed=seed, **reset_args)

    system_prompt = SYSTEM_PROMPT_BASELINE if use_hints else SYSTEM_PROMPT_RL
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": current.system_message},
    ]

    for turn in range(MAX_TURNS):
        if current.done:
            break

        completion = client.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=0.3,
            max_tokens=512,
        )
        reply = completion.choices[0].message.content or ""
        conversation.append({"role": "assistant", "content": reply})

        action = parse_action(reply, current.available_variables, turn)
        if action is not None:
            # Valid action: advance the environment and relay its feedback.
            current = env.step(action)
            feedback = current.system_message
        else:
            feedback = "Invalid action format. Please respond with a valid JSON action."
        conversation.append({"role": "user", "content": feedback})

    if not current.done:
        # The loop ended without a submission; close the episode explicitly.
        current = env.step(
            HypLabAction(
                action_type=ActionType.SUBMIT,
                hypothesis_text="Unable to determine -- insufficient experiments.",
                confidence=0.1,
            )
        )

    score_fields = (
        "accuracy_score",
        "precision_bonus",
        "calibration_score",
        "efficiency_bonus",
        "contradiction_penalty",
        "total_episode_reward",
    )
    summary: dict[str, Any] = {
        name: getattr(current, name) or 0.0 for name in score_fields
    }
    summary["ground_truth"] = current.ground_truth_revealed or ""
    return summary


def run_all_tasks() -> dict[str, Any]:
    """Run the baseline agent on every task in ALL_TASKS and return scores.

    Usable from the CLI as well as from the /baseline endpoint.  The model
    name comes from OPENAI_MODEL (default "gpt-4o-mini").

    Returns:
        A dict mapping each task id to ``{"score", "episode_result"}``, plus
        an ``"average_score"`` entry rounded to four decimals.

    Raises:
        RuntimeError: If OPENAI_API_KEY is missing or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable not set.")

    client = OpenAI(api_key=api_key)
    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

    results: dict[str, Any] = {}
    for task in ALL_TASKS:
        task_id = task["id"]
        outcome = run_episode(client, model, task)
        results[task_id] = {
            "score": GRADERS[task_id](outcome),
            "episode_result": outcome,
        }

    # Average over the per-task entries only; guard against an empty task list.
    scores = [entry["score"] for entry in results.values()]
    mean_score = sum(scores) / max(len(scores), 1)
    results["average_score"] = round(mean_score, 4)
    return results


def main() -> None:
    """CLI entry point: run every task, print per-task scores and a summary.

    Reads OPENAI_API_KEY (required) and OPENAI_MODEL (default "gpt-4o-mini")
    from the environment.  Exits with status 1 when the API key is missing.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # Diagnostics belong on stderr so stdout carries only the score report.
        print("ERROR: Set OPENAI_API_KEY environment variable.", file=sys.stderr)
        sys.exit(1)

    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
    client = OpenAI(api_key=api_key)

    print("=" * 60)
    print("  Scientific Hypothesis Lab -- Baseline Inference")
    print(f"  Model: {model}")
    print("=" * 60)
    print()

    results = {}
    for task in ALL_TASKS:
        task_id = task["id"]
        print(f"--- Task: {task['name']} ---")
        print(f"    {task['description']}")

        episode_result = run_episode(client, model, task)

        grader = GRADERS[task_id]
        score = grader(episode_result)

        results[task_id] = {
            "score": score,
            "episode_result": episode_result,
        }

        print(f"    Total episode reward: {episode_result['total_episode_reward']:+.4f}")
        print(f"    Graded score:         {score:.4f}")
        print()

    print("=" * 60)
    print("  SUMMARY")
    print("=" * 60)
    for task_id, r in results.items():
        print(f"  {task_id:8s}: {r['score']:.4f}")

    # Same guard as run_all_tasks: an empty task list must not raise
    # ZeroDivisionError.
    avg = sum(r["score"] for r in results.values()) / max(len(results), 1)
    print(f"  {'average':8s}: {avg:.4f}")
    print()


if __name__ == "__main__":
    main()