#!/usr/bin/env python3
"""
Example: Using Hypothesis Engine for LLM RL Training
This script demonstrates how the Hypothesis Engine can be used in a standard
reinforcement learning training loop for LLMs.
Three integration patterns are shown:
1. Basic RL loop (any framework)
2. Gymnasium-compatible wrapper
3. Multi-episode training with auto-curriculum
Usage:
python examples/training_loop.py
"""
import sys
import os
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def example_basic_loop():
    """
    Example 1: Basic RL Training Loop.

    Shows the simplest integration pattern: drive the core
    HypothesisEngine API directly from a plain episode/step loop.
    """
    print("=" * 60)
    print(" Example 1: Basic RL Training Loop")
    print("=" * 60)

    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(difficulty=1, experiment_budget=20, seed=42)

    # Standard RL loop: reset -> act -> step -> learn, per episode.
    for episode in range(3):
        obs = env.reset()
        done = False
        ep_return = 0.0
        steps = 0
        while steps < 40:
            # YOUR AGENT SELECTS AN ACTION HERE
            # In real training, this comes from your LLM policy network
            action = select_action(obs, steps)
            obs, reward, done, info = env.step(action)
            ep_return += reward
            steps += 1
            # YOUR AGENT LEARNS HERE
            # e.g., store transition in replay buffer, compute policy gradient, etc.
            if done:
                break
        summary = env.get_episode_summary()
        print(f" Episode {episode + 1}: steps={steps}, reward={ep_return:.1f}, "
              f"world='{summary['world_name']}'")
    print()
def example_gymnasium_wrapper():
    """
    Example 2: Gymnasium-Compatible Wrapper.

    Uses the standard gymnasium interface for compatibility with
    Stable-Baselines3, RLlib, TRL, and other RL frameworks.
    """
    print("=" * 60)
    print(" Example 2: Gymnasium-Compatible Wrapper")
    print("=" * 60)

    from hypothesis_engine.gym_wrapper import make_env

    env = make_env(difficulty=2, experiment_budget=20, seed=42)

    for episode in range(3):
        obs_text, info = env.reset()
        print(f"\n Episode {episode + 1}: {info.get('world_name', '?')}")

        terminated = truncated = False
        ep_return = 0.0
        steps = 0
        # Gymnasium's 5-tuple step API: (obs, reward, terminated, truncated, info).
        while steps < 40 and not (terminated or truncated):
            # In a real setup, your LLM generates this action text
            action_text = generate_action_text(obs_text, steps)
            obs_text, reward, terminated, truncated, info = env.step(action_text)
            ep_return += reward
            steps += 1
        print(f" Steps: {steps}, Total Reward: {ep_return:.1f}")

    env.close()
    print()
def example_curriculum_training():
    """
    Example 3: Multi-Episode Training with Auto-Curriculum.

    Demonstrates how the auto-curriculum automatically advances
    difficulty as the agent improves, providing a self-improving
    training signal.
    """
    print("=" * 60)
    print(" Example 3: Auto-Curriculum Training")
    print("=" * 60)

    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(
        difficulty=1,
        experiment_budget=25,
        auto_curriculum=True,
        advance_threshold=60.0,
    )

    episode_rewards = []
    for episode in range(10):
        obs = env.reset()
        done = False
        ep_return = 0.0
        steps = 0
        while not done and steps < 40:
            action = select_action(obs, steps)
            obs, reward, done, info = env.step(action)
            ep_return += reward
            steps += 1
        episode_rewards.append(ep_return)

        # Prefer the curriculum's own difficulty report; fall back to
        # the current world's difficulty (or "?" before the first world).
        if env.curriculum:
            progress = env.curriculum.get_progress_summary()
            current_level = progress.get("current_difficulty", "?")
        else:
            current_level = env.world.difficulty if env.world else "?"

        # Running average over (at most) the last five episodes.
        recent = episode_rewards[-5:]
        print(f" Episode {episode + 1:>2d}: Level {current_level}, "
              f"Reward: {ep_return:>6.1f}, "
              f"Running Avg: {sum(recent) / len(recent):>6.1f}")

    last_five = episode_rewards[-5:]
    print(f"\n Final average reward (last 5): "
          f"{sum(last_five) / len(last_five):.1f}")
    print()
# ── Helper: Simple action selector (replace with your LLM) ───────────
def select_action(obs, step):
    """
    Dummy action selector for demonstration.

    In real training, replace this with your LLM policy:
    - Feed obs_text to LLM
    - Parse LLM output as JSON action
    - Return the action dict
    """
    world = obs.get("world", {})
    variables = world.get("variables", ["x"])
    ranges = world.get("variable_ranges", {"x": [-10, 10]})
    test_cases = obs.get("test_cases", [])
    remaining = obs.get("experiments_remaining", 0)

    # Simple strategy: probe early while budget remains, then hypothesize,
    # then predict (which ends the episode).
    if step < 8 and remaining > 2:
        # Cycle through a fixed set of probe points, clamped to each
        # variable's declared range.
        probes = [0, 1, -1, 2, -2, 3, -3, 5]
        probe = float(probes[step % len(probes)])
        inputs = {}
        for var in variables:
            lo, hi = ranges.get(var, [-10, 10])
            inputs[var] = max(lo, min(hi, probe))
        return {"action": "experiment", "inputs": inputs}

    if step == 8:
        # Simple hypothesis
        return {"action": "hypothesize", "expression": "2*x + 1"}

    # Predict (ends episode); default to 20 predictions if no test cases given.
    n_preds = len(test_cases) if test_cases else 20
    return {"action": "predict", "predictions": [0.0] * n_preds}
def generate_action_text(obs_text, step):
    """
    Generate action text for the gymnasium wrapper.

    In real training, this is where your LLM generates text responses.
    """
    import json

    if step < 5:
        probe_values = [0, 1, -1, 2, -2]
        payload = {
            "action": "experiment",
            "inputs": {"x": probe_values[step % len(probe_values)]},
        }
    elif step == 5:
        payload = {"action": "hypothesize", "expression": "2*x + 1"}
    else:
        payload = {"action": "predict", "predictions": [0.0] * 20}
    return json.dumps(payload)
# ── Integration Guide ─────────────────────────────────────────────────

# Printed at the end of the demo run: copy-paste recipes for wiring the
# environment into common RL frameworks.
INTEGRATION_GUIDE = """
== INTEGRATION WITH POPULAR RL FRAMEWORKS ==

1. Stable-Baselines3 (via gymnasium wrapper):

    from hypothesis_engine.gym_wrapper import make_env
    from stable_baselines3 import PPO

    env = make_env(difficulty=3, experiment_budget=30)
    model = PPO("MlpPolicy", env)  # Use custom text policy
    model.learn(total_timesteps=10000)

2. TRL (Transformer RL for LLM fine-tuning):

    from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv
    from trl import PPOTrainer

    env = HypothesisEngineGymEnv(difficulty=1, auto_curriculum=True)
    # Use env.reset() and env.step() in your TRL training loop
    # The text-based obs/action spaces work naturally with LLMs

3. Custom GRPO/RLHF Training:

    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(difficulty=1, auto_curriculum=True)
    for episode in range(num_episodes):
        obs = env.reset()
        trajectory = []
        done = False
        while not done:
            # LLM generates action from observation text
            action = llm.generate(format_prompt(obs))
            obs, reward, done, info = env.step(parse_action(action))
            trajectory.append((obs, action, reward))
        # Use trajectory for GRPO/PPO update
        update_policy(trajectory)

4. Ray RLlib:

    from ray.rllib.algorithms.ppo import PPOConfig
    from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv

    config = PPOConfig().environment(
        env=HypothesisEngineGymEnv,
        env_config={"difficulty": 3, "experiment_budget": 30},
    )
"""
if __name__ == "__main__":
    print()
    # Banner em dash restored (source had a mojibake'd "β" here).
    print("Hypothesis Engine — RL Training Integration Examples")
    print("=" * 60)
    print()
    # Run the three integration examples in order, then dump the
    # framework-integration cheat sheet.
    example_basic_loop()
    example_gymnasium_wrapper()
    example_curriculum_training()
    print(INTEGRATION_GUIDE)
|