Spaces:

AbhinavSDubey30
/

hypothesis-engine

Sleeping

hypothesis-engine / examples /training_loop.py

AbhinavDubey30

Hypothesis Engine: Scientific Discovery RL Environment for LLM Training

bbfb06a about 1 month ago

8.5 kB

	#!/usr/bin/env python3
	"""
	Example: Using Hypothesis Engine for LLM RL Training

	This script demonstrates how the Hypothesis Engine can be used in a standard
	reinforcement learning training loop for LLMs.

	Three integration patterns are shown:
	1. Basic RL loop (any framework)
	2. Gymnasium-compatible wrapper
	3. Multi-episode training with auto-curriculum

	Usage:
	python examples/training_loop.py
	"""

	import sys
	import os

	# Add project root to path
	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


	def example_basic_loop():
	    """
	    Example 1: Basic RL Training Loop

	    Drives the core HypothesisEngine API directly: three episodes, each
	    capped at 40 steps, with actions supplied by ``select_action``. After
	    every episode the step count, accumulated reward and the world name
	    from the episode summary are printed.
	    """
	    rule = "=" * 60
	    print(rule)
	    print(" Example 1: Basic RL Training Loop")
	    print(rule)

	    from hypothesis_engine import HypothesisEngine

	    engine = HypothesisEngine(difficulty=1, experiment_budget=20, seed=42)

	    # Standard RL loop
	    for episode_idx in range(3):
	        observation = engine.reset()
	        finished = False
	        episode_return = 0.0
	        steps_taken = 0

	        # Stop on termination or after 40 steps, whichever comes first.
	        while steps_taken < 40 and not finished:
	            # The policy goes here; in real training the action comes
	            # from the LLM policy network instead of the dummy selector.
	            chosen = select_action(observation, steps_taken)

	            observation, step_reward, finished, _info = engine.step(chosen)
	            episode_return += step_reward
	            steps_taken += 1

	            # The learning update goes here (replay buffer, policy
	            # gradient, etc.).

	        recap = engine.get_episode_summary()
	        world_name = recap['world_name']
	        print(
	            f" Episode {episode_idx + 1}: steps={steps_taken}, "
	            f"reward={episode_return:.1f}, world='{world_name}'"
	        )

	    print()


	def example_gymnasium_wrapper():
	    """
	    Example 2: Gymnasium-Compatible Wrapper

	    Uses the gymnasium-style interface (``reset`` returning
	    ``(obs, info)`` and ``step`` returning a 5-tuple) exposed by
	    ``hypothesis_engine.gym_wrapper.make_env``, the interface intended for
	    use with Stable-Baselines3, RLlib, TRL, and other RL frameworks.

	    Runs three episodes of at most 40 steps each, feeding the environment
	    JSON action text from ``generate_action_text``. The environment is
	    always closed, even if an episode raises part-way through.
	    """
	    print("=" * 60)
	    print(" Example 2: Gymnasium-Compatible Wrapper")
	    print("=" * 60)

	    from hypothesis_engine.gym_wrapper import make_env

	    env = make_env(difficulty=2, experiment_budget=20, seed=42)

	    # try/finally guarantees env.close() runs when reset()/step() raises,
	    # so the environment is not leaked on failure.
	    try:
	        for episode in range(3):
	            obs_text, info = env.reset()
	            terminated = False
	            truncated = False
	            total_reward = 0.0
	            step = 0

	            print(f"\n Episode {episode + 1}: {info.get('world_name', '?')}")

	            # An episode ends on termination, truncation, or the local
	            # 40-step safety cap.
	            while not terminated and not truncated and step < 40:
	                # In a real setup, your LLM generates this action text
	                action_text = generate_action_text(obs_text, step)

	                obs_text, reward, terminated, truncated, info = env.step(action_text)
	                total_reward += reward
	                step += 1

	            print(f" Steps: {step}, Total Reward: {total_reward:.1f}")
	    finally:
	        env.close()

	    print()


	def example_curriculum_training():
	    """
	    Example 3: Multi-Episode Training with Auto-Curriculum

	    Runs ten episodes (at most 40 steps each) against an engine created
	    with ``auto_curriculum=True`` and prints, per episode, the current
	    level, the episode reward and the mean reward over the most recent
	    five episodes. A final line reports that mean once more.
	    """
	    rule = "=" * 60
	    print(rule)
	    print(" Example 3: Auto-Curriculum Training")
	    print(rule)

	    from hypothesis_engine import HypothesisEngine

	    engine = HypothesisEngine(
	        difficulty=1,
	        experiment_budget=25,
	        auto_curriculum=True,
	        advance_threshold=60.0,
	    )

	    reward_history = []

	    def _recent_mean():
	        # Mean over the last five recorded episode rewards (fewer early on).
	        return sum(reward_history[-5:]) / min(5, len(reward_history))

	    for episode_idx in range(10):
	        observation = engine.reset()
	        finished = False
	        episode_return = 0.0
	        steps_taken = 0

	        while steps_taken < 40 and not finished:
	            chosen = select_action(observation, steps_taken)
	            observation, step_reward, finished, _info = engine.step(chosen)
	            episode_return += step_reward
	            steps_taken += 1

	        reward_history.append(episode_return)
	        world_level = engine.world.difficulty if engine.world else "?"

	        # Prefer the level reported by the curriculum; fall back to the
	        # difficulty of the current world when no curriculum is attached.
	        if engine.curriculum:
	            summary = engine.curriculum.get_progress_summary()
	            level = summary.get("current_difficulty", "?")
	        else:
	            level = world_level

	        running_avg = _recent_mean()
	        print(
	            f" Episode {episode_idx + 1:>2d}: Level {level}, "
	            f"Reward: {episode_return:>6.1f}, "
	            f"Running Avg: {running_avg:>6.1f}"
	        )

	    final_avg = _recent_mean()
	    print(f"\n Final average reward (last 5): {final_avg:.1f}")
	    print()


	# ── Helper: Simple action selector (replace with your LLM) ───────────

	def select_action(obs, step):
	    """
	    Dummy action selector for demonstration.

	    Strategy: run up to eight probing experiments while more than two
	    experiments remain, submit one fixed hypothesis at step 8, and
	    otherwise submit an all-zero prediction.

	    In real training, replace this with your LLM policy:
	    - Feed the observation to the LLM
	    - Parse the LLM output as a JSON action
	    - Return the action dict

	    Args:
	        obs: Observation dict. The keys read are ``"world"`` (with
	            optional ``"variables"`` and ``"variable_ranges"``),
	            ``"test_cases"`` and ``"experiments_remaining"``; each has a
	            default when missing.
	        step: Zero-based index of the current step within the episode.

	    Returns:
	        An action dict whose ``"action"`` is ``"experiment"``,
	        ``"hypothesize"`` or ``"predict"``.
	    """
	    world = obs.get("world", {})
	    variables = world.get("variables", ["x"])
	    ranges = world.get("variable_ranges", {"x": [-10, 10]})
	    test_cases = obs.get("test_cases", [])
	    remaining = obs.get("experiments_remaining", 0)

	    # Simple strategy: probe, then predict
	    if step < 8 and remaining > 2:
	        # The probe value depends only on the step, so compute it once
	        # instead of rebuilding the list for every variable.
	        probes = (0, 1, -1, 2, -2, 3, -3, 5)
	        probe = float(probes[step % len(probes)])

	        inputs = {}
	        for var in variables:
	            lo, hi = ranges.get(var, [-10, 10])
	            # Coerce the bounds to float so a clamped value has the same
	            # type as an unclamped one (integer bounds used to leak
	            # through as int).
	            inputs[var] = max(float(lo), min(float(hi), probe))
	        return {"action": "experiment", "inputs": inputs}

	    if step == 8:
	        # Simple hypothesis.
	        # NOTE(review): the expression always references `x`, even when the
	        # world's variables are named differently - confirm this is intended.
	        return {"action": "hypothesize", "expression": "2*x + 1"}

	    # Predict (ends episode). Fall back to 20 predictions when the
	    # observation carries no test cases.
	    count = len(test_cases) if test_cases else 20
	    return {"action": "predict", "predictions": [0.0] * count}


	def generate_action_text(obs_text, step):
	    """
	    Produce the JSON action string fed to the gymnasium wrapper.

	    Steps below 5 emit an experiment on ``x``, step 5 emits a fixed
	    hypothesis, and every other step emits twenty zero predictions. In
	    real training this is where the LLM would generate its text response;
	    ``obs_text`` is accepted but not used by this dummy.
	    """
	    import json

	    if step < 5:
	        probe_values = [0, 1, -1, 2, -2]
	        payload = {
	            "action": "experiment",
	            "inputs": {"x": probe_values[step % len(probe_values)]},
	        }
	    elif step == 5:
	        payload = {"action": "hypothesize", "expression": "2*x + 1"}
	    else:
	        payload = {"action": "predict", "predictions": [0.0] * 20}

	    # Single serialisation point for all three action kinds.
	    return json.dumps(payload)


	# ── Integration Guide ─────────────────────────────────────────────────

	# Human-readable cheat sheet printed at the end of the script. It is
	# display text only: none of the snippets inside the string are executed.
	# NOTE(review): snippet 3 appends the observation returned by env.step()
	# (the post-step observation) together with the action that produced it;
	# confirm whether the pre-step observation was intended for the trajectory.
	INTEGRATION_GUIDE = """
	== INTEGRATION WITH POPULAR RL FRAMEWORKS ==

	1. Stable-Baselines3 (via gymnasium wrapper):

	from hypothesis_engine.gym_wrapper import make_env
	from stable_baselines3 import PPO

	env = make_env(difficulty=3, experiment_budget=30)
	model = PPO("MlpPolicy", env) # Use custom text policy
	model.learn(total_timesteps=10000)

	2. TRL (Transformer RL for LLM fine-tuning):

	from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv
	from trl import PPOTrainer

	env = HypothesisEngineGymEnv(difficulty=1, auto_curriculum=True)
	# Use env.reset() and env.step() in your TRL training loop
	# The text-based obs/action spaces work naturally with LLMs

	3. Custom GRPO/RLHF Training:

	from hypothesis_engine import HypothesisEngine

	env = HypothesisEngine(difficulty=1, auto_curriculum=True)

	for episode in range(num_episodes):
	obs = env.reset()
	trajectory = []
	done = False

	while not done:
	# LLM generates action from observation text
	action = llm.generate(format_prompt(obs))
	obs, reward, done, info = env.step(parse_action(action))
	trajectory.append((obs, action, reward))

	# Use trajectory for GRPO/PPO update
	update_policy(trajectory)

	4. Ray RLlib:

	from ray.rllib.algorithms.ppo import PPOConfig
	from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv

	config = PPOConfig().environment(
	env=HypothesisEngineGymEnv,
	env_config={"difficulty": 3, "experiment_budget": 30},
	)
	"""


	if __name__ == "__main__":
	    # Script entry point: print a title banner, run the three examples in
	    # order, then print the framework integration guide.
	    print()
	    print("Hypothesis Engine — RL Training Integration Examples")
	    print("=" * 60)
	    print()

	    demos = (
	        example_basic_loop,
	        example_gymnasium_wrapper,
	        example_curriculum_training,
	    )
	    for run_demo in demos:
	        run_demo()

	    print(INTEGRATION_GUIDE)