Spaces:
Sleeping
Sleeping
| """ | |
| Gymnasium-Compatible Wrapper for the Hypothesis Engine. | |
| Provides a standard gymnasium.Env interface for seamless integration | |
| with RL training frameworks (Stable-Baselines3, RLlib, TRL, etc.). | |
| Usage: | |
| import gymnasium as gym | |
| # Register the environment | |
| from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv | |
| env = HypothesisEngineGymEnv(difficulty=3, experiment_budget=30) | |
| obs, info = env.reset() | |
| obs, reward, terminated, truncated, info = env.step(action_text) | |
| """ | |
| import json | |
| import numpy as np | |
| from typing import Dict, Any, Optional, Tuple | |
| try: | |
| import gymnasium as gym | |
| from gymnasium import spaces | |
| HAS_GYMNASIUM = True | |
| except ImportError: | |
| HAS_GYMNASIUM = False | |
| from .env import HypothesisEngine | |
| from .worlds import WorldGenerator | |
| class HypothesisEngineGymEnv: | |
| """ | |
| Gymnasium-compatible wrapper for the Hypothesis Engine. | |
| This wrapper exposes the Hypothesis Engine as a standard RL environment | |
| with text-based observation and action spaces, designed for training | |
| LLMs via reinforcement learning (e.g., RLHF, PPO, GRPO). | |
| Observation Space: | |
| A text string containing the current state description, experiment | |
| history, and available actions. | |
| Action Space: | |
| A text string containing a JSON action (experiment, hypothesize, | |
| predict, get_hint, get_status). | |
| Reward: | |
| Float reward signal combining prediction accuracy, hypothesis quality, | |
| experiment efficiency, information gain, and progressive improvement. | |
| """ | |
| metadata = {"render_modes": ["human", "ansi"]} | |
| def __init__( | |
| self, | |
| difficulty: int = 1, | |
| experiment_budget: int = 30, | |
| seed: Optional[int] = None, | |
| auto_curriculum: bool = False, | |
| advance_threshold: float = 65.0, | |
| max_steps: int = 50, | |
| render_mode: Optional[str] = None, | |
| ): | |
| """ | |
| Initialize the Gymnasium-compatible Hypothesis Engine. | |
| Args: | |
| difficulty: Starting difficulty level (1-10). | |
| experiment_budget: Max experiments per episode. | |
| seed: Random seed for reproducibility. | |
| auto_curriculum: Enable automatic difficulty progression. | |
| advance_threshold: Score needed to advance to next level. | |
| max_steps: Maximum steps before truncation. | |
| render_mode: "human" for rich display, "ansi" for plain text, None for silent. | |
| """ | |
| self.engine = HypothesisEngine( | |
| difficulty=difficulty, | |
| experiment_budget=experiment_budget, | |
| seed=seed, | |
| auto_curriculum=auto_curriculum, | |
| advance_threshold=advance_threshold, | |
| ) | |
| self.max_steps = max_steps | |
| self.render_mode = render_mode | |
| self._step_count = 0 | |
| self._last_obs_text = "" | |
| # If gymnasium is available, set up proper spaces | |
| if HAS_GYMNASIUM: | |
| # Text-based spaces for LLM training | |
| self.observation_space = spaces.Text( | |
| min_length=0, | |
| max_length=10000, | |
| ) | |
| self.action_space = spaces.Text( | |
| min_length=1, | |
| max_length=2000, | |
| ) | |
| # Display for render | |
| self._display = None | |
| if render_mode == "human": | |
| from .display import Display | |
| self._display = Display(slow_mode=False) | |
| def reset( | |
| self, | |
| seed: Optional[int] = None, | |
| options: Optional[Dict] = None, | |
| ) -> Tuple[str, Dict[str, Any]]: | |
| """ | |
| Reset the environment and start a new episode. | |
| Args: | |
| seed: Optional random seed. | |
| options: Optional configuration overrides. | |
| Returns: | |
| Tuple of (observation_text, info_dict) | |
| """ | |
| self._step_count = 0 | |
| # Handle options | |
| if options and "difficulty" in options: | |
| self.engine.initial_difficulty = options["difficulty"] | |
| obs = self.engine.reset(seed=seed) | |
| obs_text = self._format_observation(obs) | |
| self._last_obs_text = obs_text | |
| info = { | |
| "raw_observation": obs, | |
| "difficulty": self.engine.world.difficulty if self.engine.world else 1, | |
| "world_name": self.engine.world.name if self.engine.world else "Unknown", | |
| "action_space_description": self.engine.get_action_space_description(), | |
| } | |
| if self.render_mode == "human" and self._display: | |
| world_briefing = obs.get("world", {}) | |
| self._display.show_episode_start( | |
| self.engine.episode_count, | |
| self.engine.world.difficulty, | |
| world_briefing, | |
| ) | |
| return obs_text, info | |
| def step(self, action_text: str) -> Tuple[str, float, bool, bool, Dict[str, Any]]: | |
| """ | |
| Take a step in the environment. | |
| Args: | |
| action_text: A JSON string describing the action, or a natural language | |
| string that will be parsed into an action. | |
| Returns: | |
| Tuple of (observation_text, reward, terminated, truncated, info) | |
| """ | |
| self._step_count += 1 | |
| # Parse action | |
| action = self._parse_action(action_text) | |
| # Execute in engine | |
| obs, reward, done, info = self.engine.step(action) | |
| # Format observation as text | |
| obs_text = self._format_observation(obs) | |
| self._last_obs_text = obs_text | |
| # Check truncation | |
| truncated = self._step_count >= self.max_steps and not done | |
| terminated = done | |
| # Add step metadata to info | |
| info["step"] = self._step_count | |
| info["raw_observation"] = obs | |
| # Render if needed | |
| if self.render_mode == "human" and self._display: | |
| action_type = action.get("action", "") | |
| if action_type == "experiment" and "experiment_result" in info: | |
| result = info["experiment_result"] | |
| self._display.show_experiment( | |
| self._step_count, | |
| result.get("inputs", {}), | |
| result.get("output"), | |
| ) | |
| elif action_type == "hypothesize" and "hypothesis_score" in info: | |
| score = info["hypothesis_score"] | |
| hint = "high" if score >= 0.8 else "medium" if score >= 0.5 else "low" | |
| self._display.show_hypothesis( | |
| len(self.engine.hypothesis_history), | |
| action.get("expression", "?"), | |
| hint, | |
| ) | |
| return obs_text, float(reward), terminated, truncated, info | |
| def render(self) -> Optional[str]: | |
| """Render the current state.""" | |
| if self.render_mode == "ansi": | |
| return self._last_obs_text | |
| elif self.render_mode == "human": | |
| print(self._last_obs_text) | |
| return None | |
| def close(self): | |
| """Clean up resources.""" | |
| pass | |
| # ββ Observation Formatting ββββββββββββββββββββββββββββββββββββββββββββ | |
| def _format_observation(self, obs: Dict[str, Any]) -> str: | |
| """ | |
| Format the raw observation dict into a structured text string | |
| suitable for LLM consumption. | |
| """ | |
| parts = [] | |
| # System message | |
| if obs.get("message"): | |
| parts.append(f"[System] {obs['message']}") | |
| # World description | |
| if obs.get("world"): | |
| world = obs["world"] | |
| parts.append(f"\n== World: {world.get('world_name', 'Unknown')} ==") | |
| parts.append(f"Difficulty: {world.get('difficulty', '?')}/10") | |
| parts.append(f"Category: {world.get('category', '?')}") | |
| parts.append(f"Description: {world.get('description', '')}") | |
| parts.append(f"Variables: {world.get('variables', [])}") | |
| ranges = world.get("variable_ranges", {}) | |
| range_strs = [f" {v}: [{r[0]}, {r[1]}]" for v, r in ranges.items()] | |
| parts.append("Ranges:\n" + "\n".join(range_strs)) | |
| if world.get("is_stateful"): | |
| parts.append("WARNING: This system has MEMORY -- order matters!") | |
| if world.get("hints"): | |
| parts.append(f"Hint: {world['hints'][0]}") | |
| # Budget status | |
| remaining = obs.get("experiments_remaining", "?") | |
| used = obs.get("experiments_used", "?") | |
| parts.append(f"\nBudget: {remaining} experiments remaining ({used} used)") | |
| # Experiment history | |
| if obs.get("experiment_history"): | |
| parts.append("\nExperiment History (recent):") | |
| for i, exp in enumerate(obs["experiment_history"]): | |
| inputs = exp.get("inputs", {}) | |
| output = exp.get("output", "ERROR") | |
| input_str = ", ".join(f"{k}={v}" for k, v in inputs.items()) | |
| parts.append(f" #{i+1}: [{input_str}] -> y = {output}") | |
| # Last experiment result (highlighted) | |
| if obs.get("last_experiment_result"): | |
| result = obs["last_experiment_result"] | |
| inputs = result.get("inputs", {}) | |
| output = result.get("output", "ERROR") | |
| input_str = ", ".join(f"{k}={v}" for k, v in inputs.items()) | |
| parts.append(f"\nLast Result: [{input_str}] -> y = {output}") | |
| # Hypothesis feedback | |
| if obs.get("hypothesis_feedback"): | |
| fb = obs["hypothesis_feedback"] | |
| parts.append(f"\nHypothesis Feedback:") | |
| parts.append(f" Expression: y = {fb.get('expression', '?')}") | |
| parts.append(f" Quality: {fb.get('quality', '?')}") | |
| parts.append(f" Score Level: {fb.get('score_hint', '?')}") | |
| # Latest hypothesis | |
| if obs.get("latest_hypothesis"): | |
| parts.append(f"\nCurrent Hypothesis: y = {obs['latest_hypothesis']}") | |
| # Test cases (for prediction) | |
| if obs.get("test_cases"): | |
| cases = obs["test_cases"] | |
| parts.append(f"\nTest Cases ({len(cases)} predictions needed):") | |
| for i, case in enumerate(cases[:8]): | |
| case_str = ", ".join(f"{k}={v}" for k, v in case.items()) | |
| parts.append(f" Case {i+1}: [{case_str}] -> y = ?") | |
| if len(cases) > 8: | |
| parts.append(f" ... and {len(cases) - 8} more cases") | |
| # Final results | |
| if obs.get("final_results"): | |
| results = obs["final_results"] | |
| parts.append("\n== Episode Complete ==") | |
| reward = results.get("reward_breakdown", {}) | |
| parts.append(f"Total Score: {reward.get('total_reward', 0)}/100") | |
| parts.append(f"Passed: {results.get('passed', False)}") | |
| parts.append(f"Ground Truth: y = {results.get('ground_truth', '?')}") | |
| # Available actions reminder | |
| parts.append("\nAvailable Actions (respond with JSON):") | |
| parts.append(' {"action": "experiment", "inputs": {"x": VALUE}}') | |
| parts.append(' {"action": "hypothesize", "expression": "MATH_EXPR"}') | |
| parts.append(' {"action": "predict", "predictions": [v1, v2, ...]}') | |
| parts.append(' {"action": "get_hint"}') | |
| parts.append(' {"action": "get_status"}') | |
| return "\n".join(parts) | |
| # ββ Action Parsing ββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _parse_action(self, action_text: str) -> Dict[str, Any]: | |
| """ | |
| Parse an action string into an action dict. | |
| Handles: | |
| - Pure JSON: {"action": "experiment", "inputs": {"x": 3}} | |
| - JSON with reasoning prefix: "Let me try x=3\n{...}" | |
| - Natural language (fallback to get_status) | |
| """ | |
| action_text = action_text.strip() | |
| # Try direct JSON parse | |
| try: | |
| action = json.loads(action_text) | |
| if isinstance(action, dict) and "action" in action: | |
| return action | |
| except json.JSONDecodeError: | |
| pass | |
| # Try to extract JSON from text | |
| import re | |
| # Nested JSON (e.g., {"action": "experiment", "inputs": {"x": 3}}) | |
| patterns = [ | |
| r'\{[^{}]*\{[^{}]*\}[^{}]*\}', # nested | |
| r'\{[^{}]*\}', # simple | |
| ] | |
| for pattern in patterns: | |
| matches = re.findall(pattern, action_text) | |
| for match in matches: | |
| try: | |
| action = json.loads(match) | |
| if isinstance(action, dict) and "action" in action: | |
| return action | |
| except json.JSONDecodeError: | |
| continue | |
| # Fallback | |
| return {"action": "get_status"} | |
| # ββ Utility βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def get_episode_summary(self) -> Dict[str, Any]: | |
| """Get summary of the current/last episode.""" | |
| return self.engine.get_episode_summary() | |
| def difficulty(self) -> int: | |
| """Current difficulty level.""" | |
| return self.engine.world.difficulty if self.engine.world else self.engine.initial_difficulty | |
| def world_name(self) -> str: | |
| """Current world name.""" | |
| return self.engine.world.name if self.engine.world else "Not started" | |
| def make_env( | |
| difficulty: int = 1, | |
| experiment_budget: int = 30, | |
| seed: Optional[int] = None, | |
| auto_curriculum: bool = False, | |
| render_mode: Optional[str] = None, | |
| ) -> HypothesisEngineGymEnv: | |
| """ | |
| Factory function to create a Hypothesis Engine environment. | |
| This is the recommended way to create the environment: | |
| from hypothesis_engine.gym_wrapper import make_env | |
| env = make_env(difficulty=3, experiment_budget=30) | |
| obs, info = env.reset() | |
| obs, reward, terminated, truncated, info = env.step('{"action": "experiment", "inputs": {"x": 3}}') | |
| Args: | |
| difficulty: Starting difficulty (1-10). | |
| experiment_budget: Max experiments per episode. | |
| seed: Random seed. | |
| auto_curriculum: Auto-advance difficulty. | |
| render_mode: "human", "ansi", or None. | |
| Returns: | |
| HypothesisEngineGymEnv instance. | |
| """ | |
| return HypothesisEngineGymEnv( | |
| difficulty=difficulty, | |
| experiment_budget=experiment_budget, | |
| seed=seed, | |
| auto_curriculum=auto_curriculum, | |
| render_mode=render_mode, | |
| ) | |