File size: 7,910 Bytes
aab0192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#!/usr/bin/env python3
"""
baseline_inference.py -- Baseline agent using the OpenAI API.

Reads OPENAI_API_KEY (required) and OPENAI_MODEL (optional, defaults to
"gpt-4o-mini") from environment variables.
Runs all 3 tasks (easy, medium, hard) and prints reproducible scores.

Usage:
    # Start the server first:
    uvicorn server.app:app --port 8000

    # Then run the baseline:
    export OPENAI_API_KEY=sk-...
    python baseline_inference.py
"""

from __future__ import annotations

import json
import os
import re
import sys
from typing import Any, Optional

from openai import OpenAI

from server.hypothesis_lab_environment import HypothesisLabEnvironment
from models import ActionType, ExperimentType, HypLabAction, NoiseLevelTag
from tasks import ALL_TASKS
from tasks.task_easy import grade_easy
from tasks.task_medium import grade_medium
from tasks.task_hard import grade_hard


# Base system prompt sent to the chat model. It describes the two JSON action
# shapes the model may reply with ("experiment" and "submit"). run_episode
# uses this prompt on its own when called with use_hints=False.
SYSTEM_PROMPT_RL = """You are a scientific AI assistant. You must discover hidden causal rules between variables through experimentation.

You can take these actions (respond with valid JSON):

EXPERIMENT -- probe the system:
  {"action_type": "experiment", "experiment_type": "<type>", "control_variable": "<var>", "target_variable": "<var>", ...}

  Experiment types:
    "intervention"   -- set control_variable to control_value, observe target
    "correlation"    -- sweep control_variable over control_range [min, max, n_points], observe target
    "counterfactual" -- ask what happens if control_variable changes by control_value (delta)
    "passive"        -- observe target_variable in its resting state

SUBMIT -- end the episode with your hypothesis:
  {"action_type": "submit", "hypothesis_text": "<your hypothesis>", "hypothesis_equations": ["<equation>"], "confidence": <0.0-1.0>}

Discover the rules. Submit when ready."""

# Base prompt plus strategy hints. run_episode selects this variant when
# use_hints=True (its default), i.e. for the baseline evaluation runs.
SYSTEM_PROMPT_BASELINE = SYSTEM_PROMPT_RL + """

Strategy tips (for baseline evaluation only -- remove for RL training):
- Run interventions first to discover which variables are causally connected
- Vary the control variable widely (e.g. 1, 5, 10) to detect nonlinearity
- Don't repeat the same experiment -- redundant experiments are penalised
- Submit early with confidence if you have strong evidence (efficiency bonus)
- Include numerical values (slopes, thresholds) in your hypothesis for precision bonus
"""


# Maps a task id (the "id" field of an ALL_TASKS entry) to the grading function
# that turns that task's episode-result dict into a score.
GRADERS = {
    "easy": grade_easy,
    "medium": grade_medium,
    "hard": grade_hard,
}

# Maximum number of chat-completion round trips per episode. parse_action
# forces a submit once the turn index reaches MAX_TURNS - 1.
MAX_TURNS = 8


# Matches a Markdown code fence (optionally tagged ``json``) wrapping a JSON object.
_FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)


def _json_objects(text: str) -> list[dict[str, Any]]:
    """Return every JSON object that decodes from some ``{`` in *text*, in order."""
    decoder = json.JSONDecoder()
    found: list[dict[str, Any]] = []
    start = text.find("{")
    while start != -1:
        try:
            value, end = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            # Not valid JSON from this brace; try the next opening brace.
            start = text.find("{", start + 1)
            continue
        if isinstance(value, dict):
            found.append(value)
        # Resume after the decoded object so its nested objects are not
        # reported a second time as separate candidates.
        start = text.find("{", end)
    return found


def _action_from_json(text: str) -> Optional[HypLabAction]:
    """Build a HypLabAction from the first JSON object in *text* that validates."""
    fenced = _FENCED_JSON_RE.search(text)
    # Prefer an explicitly fenced payload, then fall back to the whole reply.
    sources = [fenced.group(1), text] if fenced else [text]
    for source in sources:
        for payload in _json_objects(source):
            try:
                return HypLabAction(**payload)
            except Exception:
                # Deliberate best effort: the exception raised for a payload
                # that does not fit the action schema depends on how
                # HypLabAction is defined, so any failure here just means
                # "not an action" and the next candidate is tried.
                continue
    return None


def _is_submit(action: HypLabAction) -> bool:
    """Return True when *action* is a submit action (enum member or raw value)."""
    kind = getattr(action, "action_type", None)
    if kind is None:
        return False
    return kind == ActionType.SUBMIT or kind == getattr(ActionType.SUBMIT, "value", None)


def parse_action(text: str, obs_vars: list[str], turn: int) -> Optional[HypLabAction]:
    """Parse a HypLabAction from LLM-generated text.

    Parsing is attempted in this order:

    1. A JSON object inside the reply (a fenced code block is preferred, then
       any JSON object found anywhere in the text) that HypLabAction accepts.
    2. A keyword heuristic: if the reply mentions submitting or a hypothesis,
       the sentence following "hypothesis"/"conclude"/"rule" is submitted with
       confidence 0.6.

    On the final turn (``turn >= MAX_TURNS - 1``) the result is always a
    submit action: a well-formed JSON submit from the model is used as-is so
    its own hypothesis, equations and confidence are kept; anything else is
    replaced by a submit carrying the first 1000 characters of the reply with
    confidence 0.5.

    Args:
        text: Raw assistant reply.
        obs_vars: Variables currently available in the environment. Not used
            by the parser; kept so existing callers keep working.
        turn: Zero-based index of the current turn.

    Returns:
        The parsed action, or None when nothing usable was found (never None
        on the final turn).
    """
    action = _action_from_json(text)

    if turn >= MAX_TURNS - 1:
        if action is not None and _is_submit(action):
            return action
        # Out of turns and the model did not submit properly: force a submit
        # so the episode always ends with a hypothesis.
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=text[:1000],
            confidence=0.5,
        )

    if action is not None:
        return action

    text_l = text.lower()
    if any(w in text_l for w in ["submit", "hypothesis:", "my hypothesis", "i conclude"]):
        # No DOTALL here on purpose: the capture stops at the end of the line.
        hyp_match = re.search(
            r"(?:hypothesis|conclude|rule)[:\s]+(.{10,500})", text, re.IGNORECASE
        )
        hyp_text = hyp_match.group(1) if hyp_match else text[:500]
        return HypLabAction(
            action_type=ActionType.SUBMIT,
            hypothesis_text=hyp_text.strip(),
            confidence=0.6,
        )

    return None


def run_episode(
    client: OpenAI,
    model: str,
    task: dict[str, Any],
    use_hints: bool = True,
) -> dict[str, Any]:
    """Play one episode of *task* with the chat model and collect the final scores.

    A fresh HypothesisLabEnvironment is reset with the task's ``reset_kwargs``
    (its optional ``seed`` entry is passed as the ``seed`` argument). The model
    is then queried for up to MAX_TURNS turns; each reply is parsed with
    parse_action and, when valid, stepped through the environment. If the
    episode is still open afterwards, a low-confidence fallback hypothesis is
    submitted.

    Args:
        client: OpenAI client used for the chat-completion calls.
        model: Name of the chat model to query.
        task: Task definition; must contain a ``reset_kwargs`` mapping.
        use_hints: When true, use SYSTEM_PROMPT_BASELINE (with strategy tips),
            otherwise SYSTEM_PROMPT_RL.

    Returns:
        A dict with the score fields of the final observation (missing or
        falsy values become 0.0) and ``ground_truth`` (empty string when the
        observation has none).
    """
    env = HypothesisLabEnvironment()

    # Work on a copy so popping "seed" never mutates the shared task definition.
    kwargs = dict(task["reset_kwargs"])
    seed = kwargs.pop("seed", None)
    current = env.reset(seed=seed, **kwargs)

    system_prompt = SYSTEM_PROMPT_RL
    if use_hints:
        system_prompt = SYSTEM_PROMPT_BASELINE

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": current.system_message},
    ]

    for turn in range(MAX_TURNS):
        if current.done:
            break

        completion = client.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=0.3,
            max_tokens=512,
        )
        reply = completion.choices[0].message.content or ""
        conversation.append({"role": "assistant", "content": reply})

        action = parse_action(reply, current.available_variables, turn)
        if action is not None:
            current = env.step(action)
            feedback = current.system_message
        else:
            # An unparseable reply still consumes a turn; ask the model to retry.
            feedback = "Invalid action format. Please respond with a valid JSON action."
        conversation.append({"role": "user", "content": feedback})

    if not current.done:
        # The loop ended without a submit: close the episode with a fallback.
        current = env.step(
            HypLabAction(
                action_type=ActionType.SUBMIT,
                hypothesis_text="Unable to determine -- insufficient experiments.",
                confidence=0.1,
            )
        )

    score_fields = (
        "accuracy_score",
        "precision_bonus",
        "calibration_score",
        "efficiency_bonus",
        "contradiction_penalty",
        "total_episode_reward",
    )
    summary: dict[str, Any] = {
        field: getattr(current, field) or 0.0 for field in score_fields
    }
    summary["ground_truth"] = current.ground_truth_revealed or ""
    return summary


def run_all_tasks() -> dict[str, Any]:
    """Run the baseline agent once per task in ALL_TASKS and gather the scores.

    Intended for both the CLI and the /baseline endpoint. The API key comes
    from OPENAI_API_KEY; the model name comes from OPENAI_MODEL and falls back
    to "gpt-4o-mini".

    Returns:
        A dict keyed by task id, each value holding ``score`` and
        ``episode_result``, plus an ``average_score`` entry rounded to four
        decimals (0.0 when there are no tasks).

    Raises:
        RuntimeError: If OPENAI_API_KEY is unset or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable not set.")

    client = OpenAI(api_key=api_key)
    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

    results: dict[str, Any] = {}
    for task in ALL_TASKS:
        key = task["id"]
        outcome = run_episode(client, model, task)
        results[key] = {
            "score": GRADERS[key](outcome),
            "episode_result": outcome,
        }

    # Average is taken before "average_score" is inserted, so only task
    # entries contribute; max(..., 1) avoids dividing by zero with no tasks.
    scores = [entry["score"] for entry in results.values()]
    mean = sum(scores) / max(len(scores), 1)
    results["average_score"] = round(mean, 4)
    return results


def main() -> None:
    """CLI entry point: run every task in ALL_TASKS and print a score report.

    Reads OPENAI_API_KEY (required) and OPENAI_MODEL (optional, defaults to
    "gpt-4o-mini") from the environment. For each task it runs one episode,
    grades it with the matching GRADERS entry and prints the episode reward
    and graded score, followed by a per-task summary and the average score.

    Exits with status 1 (after printing an error) when OPENAI_API_KEY is
    unset or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: Set OPENAI_API_KEY environment variable.")
        sys.exit(1)

    model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
    client = OpenAI(api_key=api_key)

    print("=" * 60)
    print("  Scientific Hypothesis Lab -- Baseline Inference")
    print(f"  Model: {model}")
    print("=" * 60)
    print()

    results: dict[str, Any] = {}
    for task in ALL_TASKS:
        task_id = task["id"]
        print(f"--- Task: {task['name']} ---")
        print(f"    {task['description']}")

        episode_result = run_episode(client, model, task)

        grader = GRADERS[task_id]
        score = grader(episode_result)

        results[task_id] = {
            "score": score,
            "episode_result": episode_result,
        }

        print(f"    Total episode reward: {episode_result['total_episode_reward']:+.4f}")
        print(f"    Graded score:         {score:.4f}")
        print()

    print("=" * 60)
    print("  SUMMARY")
    print("=" * 60)
    for task_id, r in results.items():
        print(f"  {task_id:8s}: {r['score']:.4f}")

    # Guard the divisor the same way run_all_tasks does, so an empty ALL_TASKS
    # prints an average of 0.0000 instead of raising ZeroDivisionError.
    avg = sum(r["score"] for r in results.values()) / max(len(results), 1)
    print(f"  {'average':8s}: {avg:.4f}")
    print()


# Run the CLI report only when executed as a script; importing this module
# (e.g. to call run_all_tasks) must not start any episodes.
if __name__ == "__main__":
    main()