hypothesis-engine / examples /training_loop.py
AbhinavDubey30
Hypothesis Engine: Scientific Discovery RL Environment for LLM Training
bbfb06a
#!/usr/bin/env python3
"""
Example: Using Hypothesis Engine for LLM RL Training
This script demonstrates how the Hypothesis Engine can be used in a standard
reinforcement learning training loop for LLMs.
Three integration patterns are shown:
1. Basic RL loop (any framework)
2. Gymnasium-compatible wrapper
3. Multi-episode training with auto-curriculum
Usage:
python examples/training_loop.py
"""
import sys
import os
# Put the directory two levels above this file at the front of sys.path so the
# `hypothesis_engine` imports inside the examples below resolve when this
# script is run directly (e.g. `python examples/training_loop.py`).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def example_basic_loop():
    """
    Example 1: Basic RL Training Loop

    Drives the core ``HypothesisEngine`` API directly: three episodes on a
    difficulty-1 environment (experiment budget 20, seed 42), each capped
    at 40 steps. Actions come from the module-level ``select_action``
    placeholder. One summary line is printed per episode.
    """
    banner = "=" * 60
    print(banner)
    print(" Example 1: Basic RL Training Loop")
    print(banner)

    # Imported after the banner so the header is shown even if the import fails.
    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(difficulty=1, experiment_budget=20, seed=42)
    step_cap = 40  # hard stop in case the environment never reports done

    for episode_no in range(1, 4):
        obs = env.reset()
        episode_return = 0.0
        steps_taken = 0
        finished = False

        while steps_taken < step_cap and not finished:
            # Policy hook: in real training the LLM policy picks the action.
            action = select_action(obs, steps_taken)
            obs, reward, finished, _info = env.step(action)
            episode_return += reward
            steps_taken += 1
            # Learning hook: store the transition / compute the update here.

        summary = env.get_episode_summary()
        print(f" Episode {episode_no}: steps={steps_taken}, reward={episode_return:.1f}, "
              f"world='{summary['world_name']}'")

    print()
def example_gymnasium_wrapper():
    """
    Example 2: Gymnasium-Compatible Wrapper

    Runs three episodes against the environment returned by
    ``hypothesis_engine.gym_wrapper.make_env`` using the gymnasium-style
    interface: ``reset()`` yields ``(obs_text, info)`` and ``step()`` yields
    ``(obs_text, reward, terminated, truncated, info)``. Actions are JSON
    strings produced by ``generate_action_text``. Each episode is capped at
    40 steps.

    The environment is closed in a ``finally`` clause so it is released even
    when ``reset``/``step`` raises part-way through an episode.
    """
    print("=" * 60)
    print(" Example 2: Gymnasium-Compatible Wrapper")
    print("=" * 60)

    from hypothesis_engine.gym_wrapper import make_env

    env = make_env(difficulty=2, experiment_budget=20, seed=42)
    try:
        for episode in range(3):
            obs_text, info = env.reset()
            terminated = False
            truncated = False
            total_reward = 0.0
            step = 0
            print(f"\n Episode {episode + 1}: {info.get('world_name', '?')}")

            while not terminated and not truncated and step < 40:
                # In a real setup, your LLM generates this action text
                action_text = generate_action_text(obs_text, step)
                obs_text, reward, terminated, truncated, info = env.step(action_text)
                total_reward += reward
                step += 1

            print(f" Steps: {step}, Total Reward: {total_reward:.1f}")
    finally:
        # Always release the environment, including on an exception above.
        env.close()
    print()
def example_curriculum_training():
    """
    Example 3: Multi-Episode Training with Auto-Curriculum

    Runs ten episodes on a ``HypothesisEngine`` created with
    ``auto_curriculum=True`` and ``advance_threshold=60.0``. After each
    episode it prints the level, the episode reward and the mean reward
    over the most recent (up to five) episodes. The level comes from the
    curriculum's progress summary when a curriculum is attached, otherwise
    from the current world's difficulty.
    """
    rule = "=" * 60
    print(rule)
    print(" Example 3: Auto-Curriculum Training")
    print(rule)

    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(
        difficulty=1,
        experiment_budget=25,
        auto_curriculum=True,
        advance_threshold=60.0,
    )

    def recent_mean(rewards):
        # Mean of the last five rewards (or of all of them if fewer exist).
        return sum(rewards[-5:]) / min(5, len(rewards))

    history = []
    for episode_no in range(1, 11):
        obs = env.reset()
        episode_return = 0.0
        steps_taken = 0
        finished = False

        while not finished and steps_taken < 40:
            action = select_action(obs, steps_taken)
            obs, reward, finished, _info = env.step(action)
            episode_return += reward
            steps_taken += 1

        history.append(episode_return)
        world_difficulty = env.world.difficulty if env.world else "?"

        # Prefer the curriculum's own view of the current level when present.
        if env.curriculum:
            level = env.curriculum.get_progress_summary().get("current_difficulty", "?")
        else:
            level = world_difficulty

        print(f" Episode {episode_no:>2d}: Level {level}, "
              f"Reward: {episode_return:>6.1f}, "
              f"Running Avg: {recent_mean(history):>6.1f}")

    print(f"\n Final average reward (last 5): "
          f"{recent_mean(history):.1f}")
    print()
# ── Helper: Simple action selector (replace with your LLM) ───────────
def select_action(obs, step):
    """
    Scripted stand-in for a learned policy (demonstration only).

    Behaviour by step:
      * ``step < 8`` with more than two experiments remaining: return an
        ``experiment`` action that sets every variable to the probe value
        for this step, clamped to that variable's range.
      * ``step == 8``: return a fixed ``hypothesize`` action.
      * otherwise: return a ``predict`` action of zeros, one per test case
        (20 when the observation lists no test cases).

    In real training, replace this with your LLM policy: feed the
    observation to the model, parse its output as a JSON action and return
    the resulting dict.

    Args:
        obs: Observation mapping; read keys are ``"world"`` (with
            ``"variables"`` and ``"variable_ranges"``), ``"test_cases"``
            and ``"experiments_remaining"``. All are optional.
        step: Zero-based step index within the episode.

    Returns:
        An action dict with an ``"action"`` key and its payload.
    """
    world_info = obs.get("world", {})
    var_names = world_info.get("variables", ["x"])
    var_ranges = world_info.get("variable_ranges", {"x": [-10, 10]})
    test_cases = obs.get("test_cases", [])
    budget_left = obs.get("experiments_remaining", 0)

    still_probing = step < 8 and budget_left > 2
    if not still_probing:
        if step == 8:
            # Simple hypothesis
            return {"action": "hypothesize", "expression": "2*x + 1"}
        # Predict (ends episode)
        n_predictions = len(test_cases) if test_cases else 20
        return {"action": "predict", "predictions": [0.0] * n_predictions}

    # The probe value depends only on the step, so look it up once.
    probe_points = (0, 1, -1, 2, -2, 3, -3, 5)
    target = float(probe_points[step % len(probe_points)])

    inputs = {}
    for name in var_names:
        lower, upper = var_ranges.get(name, [-10, 10])
        inputs[name] = max(lower, min(upper, target))
    return {"action": "experiment", "inputs": inputs}
def generate_action_text(obs_text, step):
    """
    Produce a scripted JSON action string for the gymnasium wrapper demo.

    Steps below 5 probe variable ``x`` at a fixed sequence of points,
    step 5 submits a fixed hypothesis, and any other step submits 20 zero
    predictions. ``obs_text`` is accepted for interface parity with a real
    policy but is not consulted. In real training, this is where your LLM
    generates the text response.

    Args:
        obs_text: Current observation text (unused by this placeholder).
        step: Zero-based step index within the episode.

    Returns:
        The action serialised as a JSON string.
    """
    import json

    if step == 5:
        payload = {"action": "hypothesize", "expression": "2*x + 1"}
    elif step < 5:
        probe_values = (0, 1, -1, 2, -2)
        chosen = probe_values[step % len(probe_values)]
        payload = {"action": "experiment", "inputs": {"x": chosen}}
    else:
        payload = {"action": "predict", "predictions": [0.0] * 20}

    return json.dumps(payload)
# ── Integration Guide ─────────────────────────────────────────────────
# Reference text printed at the end of the demo run. The snippets are
# illustrative only: names such as `llm`, `format_prompt`, `parse_action`,
# `update_policy` and `num_episodes` are supplied by the reader.
#
# Snippet 3 keeps the pre-step observation in the trajectory (the one the
# action was generated from) and only then advances `obs`; recording the
# post-step observation would pair each action with the wrong state.
INTEGRATION_GUIDE = """
== INTEGRATION WITH POPULAR RL FRAMEWORKS ==
1. Stable-Baselines3 (via gymnasium wrapper):
    from hypothesis_engine.gym_wrapper import make_env
    from stable_baselines3 import PPO
    env = make_env(difficulty=3, experiment_budget=30)
    model = PPO("MlpPolicy", env) # Use custom text policy
    model.learn(total_timesteps=10000)
2. TRL (Transformer RL for LLM fine-tuning):
    from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv
    from trl import PPOTrainer
    env = HypothesisEngineGymEnv(difficulty=1, auto_curriculum=True)
    # Use env.reset() and env.step() in your TRL training loop
    # The text-based obs/action spaces work naturally with LLMs
3. Custom GRPO/RLHF Training:
    from hypothesis_engine import HypothesisEngine
    env = HypothesisEngine(difficulty=1, auto_curriculum=True)
    for episode in range(num_episodes):
        obs = env.reset()
        trajectory = []
        done = False
        while not done:
            # LLM generates action from observation text
            action = llm.generate(format_prompt(obs))
            next_obs, reward, done, info = env.step(parse_action(action))
            trajectory.append((obs, action, reward))
            obs = next_obs
        # Use trajectory for GRPO/PPO update
        update_policy(trajectory)
4. Ray RLlib:
    from ray.rllib.algorithms.ppo import PPOConfig
    from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv
    config = PPOConfig().environment(
        env=HypothesisEngineGymEnv,
        env_config={"difficulty": 3, "experiment_budget": 30},
    )
"""
if __name__ == "__main__":
    # Script entry point: print a title banner, run the three demos in
    # order, then print the framework integration guide.
    print()
    # The title previously contained a mis-decoded em dash ("β€”").
    print("Hypothesis Engine — RL Training Integration Examples")
    print("=" * 60)
    print()
    example_basic_loop()
    example_gymnasium_wrapper()
    example_curriculum_training()
    print(INTEGRATION_GUIDE)