Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Example: Using Hypothesis Engine for LLM RL Training | |
| This script demonstrates how the Hypothesis Engine can be used in a standard | |
| reinforcement learning training loop for LLMs. | |
| Three integration patterns are shown: | |
| 1. Basic RL loop (any framework) | |
| 2. Gymnasium-compatible wrapper | |
| 3. Multi-episode training with auto-curriculum | |
| Usage: | |
| python examples/training_loop.py | |
| """ | |
import sys
import os
# Make the project root importable when this file is run directly as a
# script (e.g. `python examples/training_loop.py`): two dirname() calls
# climb from examples/training_loop.py up to the repository root so that
# `import hypothesis_engine` resolves without installing the package.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def example_basic_loop():
    """
    Example 1: Basic RL Training Loop.

    The simplest integration pattern: drive the core HypothesisEngine
    API directly from a plain reset/step loop, framework-free.
    """
    print("=" * 60)
    print(" Example 1: Basic RL Training Loop")
    print("=" * 60)

    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(difficulty=1, experiment_budget=20, seed=42)

    # Canonical RL episode loop
    for ep in range(3):
        obs = env.reset()
        done = False
        ep_return = 0.0
        n_steps = 0
        while not done and n_steps < 40:
            # YOUR AGENT SELECTS AN ACTION HERE
            # In real training, this comes from your LLM policy network
            chosen = select_action(obs, n_steps)
            obs, reward, done, info = env.step(chosen)
            ep_return += reward
            n_steps += 1
            # YOUR AGENT LEARNS HERE
            # e.g., store transition in replay buffer, compute policy gradient, etc.
        summary = env.get_episode_summary()
        print(f" Episode {ep + 1}: steps={n_steps}, reward={ep_return:.1f}, "
              f"world='{summary['world_name']}'")
    print()
def example_gymnasium_wrapper():
    """
    Example 2: Gymnasium-Compatible Wrapper.

    Drives the environment through the standard gymnasium 5-tuple
    interface, for compatibility with Stable-Baselines3, RLlib, TRL,
    and other RL frameworks.
    """
    print("=" * 60)
    print(" Example 2: Gymnasium-Compatible Wrapper")
    print("=" * 60)

    from hypothesis_engine.gym_wrapper import make_env

    env = make_env(difficulty=2, experiment_budget=20, seed=42)

    for ep in range(3):
        obs_text, info = env.reset()
        terminated = truncated = False
        ep_return = 0.0
        n_steps = 0
        print(f"\n Episode {ep + 1}: {info.get('world_name', '?')}")
        while not (terminated or truncated) and n_steps < 40:
            # In a real setup, your LLM generates this action text
            action_text = generate_action_text(obs_text, n_steps)
            obs_text, reward, terminated, truncated, info = env.step(action_text)
            ep_return += reward
            n_steps += 1
        print(f" Steps: {n_steps}, Total Reward: {ep_return:.1f}")

    env.close()
    print()
def example_curriculum_training():
    """
    Example 3: Multi-Episode Training with Auto-Curriculum.

    Shows the auto-curriculum advancing difficulty as the agent's
    reward improves, producing a self-improving training signal.
    """
    print("=" * 60)
    print(" Example 3: Auto-Curriculum Training")
    print("=" * 60)

    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(
        difficulty=1,
        experiment_budget=25,
        auto_curriculum=True,
        advance_threshold=60.0,
    )

    episode_rewards = []
    for ep in range(10):
        obs = env.reset()
        done = False
        ep_return = 0.0
        n_steps = 0
        while not done and n_steps < 40:
            obs, reward, done, info = env.step(select_action(obs, n_steps))
            ep_return += reward
            n_steps += 1
        episode_rewards.append(ep_return)

        # Prefer the curriculum's own difficulty report; fall back to the
        # current world's difficulty when no curriculum is attached.
        fallback_level = env.world.difficulty if env.world else "?"
        if env.curriculum:
            summary = env.curriculum.get_progress_summary()
            level = summary.get("current_difficulty", "?")
        else:
            level = fallback_level

        recent = episode_rewards[-5:]
        print(f" Episode {ep + 1:>2d}: Level {level}, "
              f"Reward: {ep_return:>6.1f}, "
              f"Running Avg: {sum(recent) / len(recent):>6.1f}")

    recent = episode_rewards[-5:]
    print(f"\n Final average reward (last 5): "
          f"{sum(recent) / len(recent):.1f}")
    print()
# ── Helper: Simple action selector (replace with your LLM) ─────────────
def select_action(obs, step):
    """
    Dummy action selector for demonstration.

    Strategy: spend the first 8 steps (while budget remains) running
    experiments at a fixed rotation of probe points, state one fixed
    hypothesis at step 8, then submit predictions (which ends the episode).

    Args:
        obs: Observation dict from the engine. Reads ``world.variables``,
            ``world.variable_ranges``, ``test_cases`` and
            ``experiments_remaining``; every key has a safe default.
        step: Zero-based step index within the current episode.

    Returns:
        An action dict whose ``"action"`` key is one of ``"experiment"``,
        ``"hypothesize"`` or ``"predict"``.

    In real training, replace this with your LLM policy:
      - Feed obs_text to LLM
      - Parse LLM output as JSON action
      - Return the action dict
    """
    world = obs.get("world", {})
    variables = world.get("variables", ["x"])
    ranges = world.get("variable_ranges", {"x": [-10, 10]})
    test_cases = obs.get("test_cases", [])
    remaining = obs.get("experiments_remaining", 0)

    # Simple strategy: probe, then predict
    if step < 8 and remaining > 2:
        # The probe value depends only on `step`, so compute it once here
        # instead of rebuilding the list and index for every variable
        # (the original recomputed both inside the loop).
        probes = [0, 1, -1, 2, -2, 3, -3, 5]
        probe = float(probes[step % len(probes)])
        inputs = {}
        for var in variables:
            lo, hi = ranges.get(var, [-10, 10])
            # Clamp the probe into this variable's legal range.
            inputs[var] = max(lo, min(hi, probe))
        return {"action": "experiment", "inputs": inputs}
    elif step == 8:
        # One fixed, simple hypothesis
        return {"action": "hypothesize", "expression": "2*x + 1"}
    else:
        # Predict (ends episode); default to 20 zeros when test cases unknown.
        predictions = [0.0] * len(test_cases) if test_cases else [0.0] * 20
        return {"action": "predict", "predictions": predictions}
def generate_action_text(obs_text, step):
    """
    Produce an action, serialized as JSON text, for the gymnasium wrapper.

    In real training, this is where your LLM generates text responses.
    """
    import json

    probe_points = (0, 1, -1, 2, -2)
    if step < 5:
        payload = {
            "action": "experiment",
            "inputs": {"x": probe_points[step % len(probe_points)]},
        }
    elif step == 5:
        payload = {
            "action": "hypothesize",
            "expression": "2*x + 1",
        }
    else:
        payload = {
            "action": "predict",
            "predictions": [0.0] * 20,
        }
    return json.dumps(payload)
# ── Integration Guide ─────────────────────────────────────────────────
# Printed verbatim at the end of the demo run (`__main__` below); shows
# how to plug the environment into common RL frameworks.
INTEGRATION_GUIDE = """
== INTEGRATION WITH POPULAR RL FRAMEWORKS ==

1. Stable-Baselines3 (via gymnasium wrapper):

    from hypothesis_engine.gym_wrapper import make_env
    from stable_baselines3 import PPO

    env = make_env(difficulty=3, experiment_budget=30)
    model = PPO("MlpPolicy", env)  # Use custom text policy
    model.learn(total_timesteps=10000)

2. TRL (Transformer RL for LLM fine-tuning):

    from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv
    from trl import PPOTrainer

    env = HypothesisEngineGymEnv(difficulty=1, auto_curriculum=True)
    # Use env.reset() and env.step() in your TRL training loop
    # The text-based obs/action spaces work naturally with LLMs

3. Custom GRPO/RLHF Training:

    from hypothesis_engine import HypothesisEngine

    env = HypothesisEngine(difficulty=1, auto_curriculum=True)
    for episode in range(num_episodes):
        obs = env.reset()
        trajectory = []
        done = False
        while not done:
            # LLM generates action from observation text
            action = llm.generate(format_prompt(obs))
            obs, reward, done, info = env.step(parse_action(action))
            trajectory.append((obs, action, reward))
        # Use trajectory for GRPO/PPO update
        update_policy(trajectory)

4. Ray RLlib:

    from ray.rllib.algorithms.ppo import PPOConfig
    from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv

    config = PPOConfig().environment(
        env=HypothesisEngineGymEnv,
        env_config={"difficulty": 3, "experiment_budget": 30},
    )
"""
if __name__ == "__main__":
    # Run all three integration examples, then print the framework guide.
    print()
    # NOTE(review): the dash in this banner was mojibake ("β") in the
    # pasted source; restored to an em dash.
    print("Hypothesis Engine — RL Training Integration Examples")
    print("=" * 60)
    print()
    example_basic_loop()
    example_gymnasium_wrapper()
    example_curriculum_training()
    print(INTEGRATION_GUIDE)