#!/usr/bin/env python3
"""
RL Training CLI Runner

Dedicated CLI runner for RL training workflows with:
- Extended timeouts for long-running training
- RL-focused system prompts
- Full toolset including RL training tools
- Special handling for 30-minute check intervals

Usage:
    python rl_cli.py "Train a model on GSM8k for math reasoning"
    python rl_cli.py --interactive
    python rl_cli.py --list-environments

Environment Variables:
    TINKER_API_KEY: API key for Tinker service (required)
    WANDB_API_KEY: API key for WandB metrics (required)
    OPENROUTER_API_KEY: API key for OpenRouter (required for agent unless
        --api-key is passed on the command line)
"""

import asyncio
import json
import os
import sys
import traceback
from pathlib import Path

import fire
import yaml

# Load .env from ~/.hermes/.env first, then project root as dev fallback.
# User-managed env files should override stale shell exports on restart.
_hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
_project_env = Path(__file__).parent / '.env'

from hermes_cli.env_loader import load_hermes_dotenv

_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
for _env_path in _loaded_env_paths:
    print(f"āœ… Loaded environment variables from {_env_path}")

# Set terminal working directory to tinker-atropos submodule
# This ensures terminal commands run in the right context for RL work
tinker_atropos_dir = Path(__file__).parent / 'tinker-atropos'
if tinker_atropos_dir.exists():
    os.environ['TERMINAL_CWD'] = str(tinker_atropos_dir)
    os.environ['HERMES_QUIET'] = '1'  # Disable temp subdirectory creation
    print(f"šŸ“‚ Terminal working directory: {tinker_atropos_dir}")
else:
    # Fall back to hermes-agent directory if submodule not found
    os.environ['TERMINAL_CWD'] = str(Path(__file__).parent)
    os.environ['HERMES_QUIET'] = '1'
    print(f"āš ļø tinker-atropos submodule not found, using: {Path(__file__).parent}")

# Import agent and tools.
# NOTE: these imports intentionally come after the environment setup above so
# the variables set there are already in place when the modules are loaded.
from run_agent import AIAgent
from model_tools import get_tool_definitions, check_toolset_requirements
from tools.rl_training_tool import check_rl_api_keys, get_missing_keys

# ============================================================================
# Config Loading
# ============================================================================

from hermes_constants import OPENROUTER_BASE_URL

DEFAULT_MODEL = "anthropic/claude-opus-4.5"
DEFAULT_BASE_URL = OPENROUTER_BASE_URL


def load_hermes_config() -> dict:
    """
    Load configuration from ``config.yaml`` in the Hermes home directory.

    The Hermes home is ``$HERMES_HOME`` when set, otherwise ``~/.hermes``.
    A missing or unreadable file is not fatal: a warning is printed for read
    or parse failures and the defaults are returned.

    Returns:
        dict: Configuration with the keys ``"model"`` and ``"base_url"``.
    """
    config = {
        "model": DEFAULT_MODEL,
        "base_url": DEFAULT_BASE_URL,
    }

    config_path = _hermes_home / 'config.yaml'
    if not config_path.exists():
        return config

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            file_config = yaml.safe_load(f) or {}

        # A YAML document whose root is a list or scalar carries no settings.
        if not isinstance(file_config, dict):
            file_config = {}

        # "model" may be a plain string or a mapping with a "default" entry.
        model_entry = file_config.get("model")
        if isinstance(model_entry, str):
            config["model"] = model_entry
        elif isinstance(model_entry, dict):
            # `or` also covers an explicit null/empty "default" value.
            config["model"] = model_entry.get("default") or DEFAULT_MODEL

        # Only override the base URL with a non-empty value.
        if file_config.get("base_url"):
            config["base_url"] = file_config["base_url"]
    except Exception as e:
        # Best effort: a broken config file must not prevent the CLI from running.
        print(f"āš ļø Warning: Failed to load config.yaml: {e}")

    return config


# ============================================================================
# RL-Specific Configuration
# ============================================================================

# Extended timeouts for long-running RL operations
RL_MAX_ITERATIONS = 200  # Allow many more iterations for long workflows

# RL-focused system prompt
RL_SYSTEM_PROMPT = """You are an automated post-training engineer specializing in reinforcement learning for language models.

## Your Capabilities

You have access to RL training tools for running reinforcement learning on models through Tinker-Atropos:

1. **DISCOVER**: Use `rl_list_environments` to see available RL environments
2. **INSPECT**: Read environment files to understand how they work (verifiers, data loading, rewards)
3. **INSPECT DATA**: Use terminal to explore HuggingFace datasets and understand their format
4. **CREATE**: Copy existing environments as templates, modify for your needs
5. **CONFIGURE**: Use `rl_select_environment` and `rl_edit_config` to set up training
6. **TEST**: Always use `rl_test_inference` before full training to validate your setup
7. **TRAIN**: Use `rl_start_training` to begin, `rl_check_status` to monitor
8. **EVALUATE**: Use `rl_get_results` and analyze WandB metrics to assess performance

## Environment Files

Environment files are located in: `tinker-atropos/tinker_atropos/environments/`

Study existing environments to learn patterns. Look for:
- `load_dataset()` calls - how data is loaded
- `score_answer()` / `score()` - verification logic
- `get_next_item()` - prompt formatting
- `system_prompt` - instruction format
- `config_init()` - default configuration

## Creating New Environments

To create a new environment:
1. Read an existing environment file (e.g., gsm8k_tinker.py)
2. Use terminal to explore the target dataset format
3. Copy the environment file as a template
4. Modify the dataset loading, prompt formatting, and verifier logic
5. Test with `rl_test_inference` before training

## Important Guidelines

- **Always test before training**: Training runs take hours - verify everything works first
- **Monitor metrics**: Check WandB for reward/mean and percent_correct
- **Status check intervals**: Wait at least 30 minutes between status checks
- **Early stopping**: Stop training early if metrics look bad or stagnant
- **Iterate quickly**: Start with small total_steps to validate, then scale up

## Available Toolsets

You have access to:
- **RL tools**: Environment discovery, config management, training, testing
- **Terminal**: Run commands, inspect files, explore datasets
- **Web**: Search for information, documentation, papers
- **File tools**: Read and modify code files

When asked to train a model, follow this workflow:
1. List available environments
2. Select and configure the appropriate environment
3. Test with sample prompts
4. Start training with conservative settings
5. Monitor progress and adjust as needed
"""

# Toolsets to enable for RL workflows
RL_TOOLSETS = ["terminal", "web", "rl"]


# ============================================================================
# Helper Functions
# ============================================================================

def check_requirements(api_key=None):
    """
    Check that the credentials needed for an agent run are available.

    Args:
        api_key: Optional OpenRouter key supplied explicitly (e.g. via
            ``--api-key``). When given, the ``OPENROUTER_API_KEY`` environment
            variable is not required.

    Returns:
        bool: True when nothing is missing; otherwise False, after printing
        the list of missing requirements.
    """
    errors = []

    # The agent key may come from the caller or from the environment.
    if not (api_key or os.getenv("OPENROUTER_API_KEY")):
        errors.append("OPENROUTER_API_KEY not set - required for agent")

    missing_rl_keys = get_missing_keys()
    if missing_rl_keys:
        errors.append(f"Missing RL API keys: {', '.join(missing_rl_keys)}")

    if errors:
        print("āŒ Missing requirements:")
        for error in errors:
            print(f"  - {error}")
        print("\nPlease set these environment variables in your .env file or shell.")
        return False

    return True


def check_tinker_atropos():
    """
    Check if the tinker-atropos submodule is present next to this file.

    Returns:
        tuple: ``(False, message)`` when the submodule directory or its
        ``tinker_atropos/environments`` directory is missing; otherwise
        ``(True, info)`` where ``info`` holds ``"path"`` and
        ``"environments_count"`` (the number of ``*.py`` files whose name
        does not start with an underscore).
    """
    tinker_path = Path(__file__).parent / "tinker-atropos"
    if not tinker_path.exists():
        return False, "tinker-atropos submodule not found. Run: git submodule update --init"

    envs_path = tinker_path / "tinker_atropos" / "environments"
    if not envs_path.exists():
        return False, f"environments directory not found at {envs_path}"

    # Underscore-prefixed modules (e.g. __init__.py) are not counted.
    env_files = [f for f in envs_path.glob("*.py") if not f.name.startswith("_")]

    return True, {"path": str(tinker_path), "environments_count": len(env_files)}


def list_environments_sync():
    """
    List available environments (synchronous wrapper).

    Runs the async ``rl_list_environments`` tool to completion and decodes
    its JSON result.

    Returns:
        The decoded JSON payload returned by the tool.
    """
    from tools.rl_training_tool import rl_list_environments

    async def _list():
        result = await rl_list_environments()
        return json.loads(result)

    return asyncio.run(_list())


def _print_setup_status():
    """Print the state of the tinker-atropos submodule and the RL API keys."""
    print("\nšŸ” Checking tinker-atropos setup...")
    ok, result = check_tinker_atropos()
    if not ok:
        print(f"āŒ tinker-atropos not set up: {result}")
        print("\nTo set up:")
        print("  git submodule update --init")
        print("  pip install -e ./tinker-atropos")
        return

    print("āœ… tinker-atropos submodule found")
    print(f"   Path: {result.get('path')}")
    print(f"   Environments found: {result.get('environments_count', 0)}")

    # Also check API keys
    missing = get_missing_keys()
    if missing:
        print(f"\nāš ļø Missing API keys: {', '.join(missing)}")
        print("   Add them to ~/.hermes/.env")
    else:
        print("āœ… API keys configured")


def _print_environments():
    """Print every available RL environment, or setup hints when listing fails."""
    print("\nšŸ“‹ Available RL Environments:")
    print("-" * 40)

    try:
        data = list_environments_sync()

        if "error" in data:
            print(f"āŒ Error: {data['error']}")
            return

        envs = data.get("environments", [])
        if not envs:
            print("No environments found.")
            print("\nMake sure tinker-atropos is set up:")
            print("  git submodule update --init")
            return

        for env in envs:
            print(f"\n  šŸ“¦ {env['name']}")
            print(f"     Class: {env['class_name']}")
            print(f"     Path: {env['file_path']}")
            if env.get('description'):
                # Keep the listing compact: cap long descriptions at 100 chars.
                desc = env['description']
                if len(desc) > 100:
                    desc = desc[:100] + "..."
                print(f"     Description: {desc}")

        print(f"\nšŸ“Š Total: {len(envs)} environments")
        print("\nUse `rl_select_environment(name)` to select an environment for training.")
    except Exception as e:
        print(f"āŒ Error listing environments: {e}")
        print("\nMake sure tinker-atropos is set up:")
        print("  git submodule update --init")
        print("  pip install -e ./tinker-atropos")


def _print_active_runs():
    """Print the training runs reported by the ``rl_list_runs`` tool."""
    from tools.rl_training_tool import rl_list_runs

    runs = json.loads(asyncio.run(rl_list_runs()))
    if isinstance(runs, list) and runs:
        print("\nšŸ“Š Active Runs:")
        for run in runs:
            print(f"  - {run['run_id']}: {run['environment']} ({run['status']})")
    else:
        print("\nNo active runs.")


def _run_interactive(agent, verbose):
    """Read tasks from stdin and run each through the agent until the user quits."""
    print("\nšŸ”„ Interactive RL Training Mode")
    print("Type 'quit' or 'exit' to end the session.")
    print("Type 'status' to check active training runs.")
    print("-" * 40)

    while True:
        try:
            user_input = input("\nšŸŽÆ RL Task> ").strip()
            if not user_input:
                continue

            command = user_input.lower()
            if command in ('quit', 'exit', 'q'):
                print("\nšŸ‘‹ Goodbye!")
                break

            if command == 'status':
                # Quick status check
                _print_active_runs()
                continue

            # Run the agent
            print("\n" + "=" * 60)
            agent.run_conversation(user_input)
            print("\n" + "=" * 60)
        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin was closed (Ctrl-D or exhausted piped input).
            # Every later input() call would raise it again, so the session
            # must end here instead of falling into the generic handler below
            # and looping forever.
            print("\n\nšŸ‘‹ Interrupted. Goodbye!")
            break
        except Exception as e:
            # A failed task should not end the interactive session.
            print(f"\nāŒ Error: {e}")
            if verbose:
                traceback.print_exc()


def _run_single_task(agent, task, verbose):
    """Run one task through the agent; exit with status 1 when it fails."""
    print(f"\nšŸ“ Task: {task}")
    print("-" * 40)

    try:
        agent.run_conversation(task)
        print("\n" + "=" * 60)
        print("āœ… Task completed")
    except KeyboardInterrupt:
        print("\n\nāš ļø Interrupted by user")
    except Exception as e:
        print(f"\nāŒ Error: {e}")
        if verbose:
            traceback.print_exc()
        sys.exit(1)


# ============================================================================
# Main CLI
# ============================================================================

def main(
    task: str = None,
    model: str = None,
    api_key: str = None,
    base_url: str = None,
    max_iterations: int = RL_MAX_ITERATIONS,
    interactive: bool = False,
    list_environments: bool = False,
    check_server: bool = False,
    verbose: bool = False,
    save_trajectories: bool = True,
):
    """
    RL Training CLI - Dedicated runner for RL training workflows.

    Args:
        task: The training task/goal (e.g., "Train a model on GSM8k for math")
        model: Model to use for the agent (reads from ~/.hermes/config.yaml if not provided)
        api_key: OpenRouter API key (uses OPENROUTER_API_KEY env var if not provided)
        base_url: API base URL (reads from config or defaults to OpenRouter)
        max_iterations: Maximum agent iterations (default: 200 for long workflows)
        interactive: Run in interactive mode (multiple conversations)
        list_environments: Just list available RL environments and exit
        check_server: Check the tinker-atropos setup and RL API keys, then exit
        verbose: Enable verbose logging
        save_trajectories: Save conversation trajectories (default: True for RL)

    Examples:
        # Train on a specific environment
        python rl_cli.py "Train a model on GSM8k math problems"

        # Interactive mode
        python rl_cli.py --interactive

        # List available environments
        python rl_cli.py --list-environments

        # Check setup status
        python rl_cli.py --check-server
    """
    # Load config from ~/.hermes/config.yaml
    config = load_hermes_config()

    # Use config values if not explicitly provided
    if model is None:
        model = config["model"]
    if base_url is None:
        base_url = config["base_url"]

    print("šŸŽÆ RL Training Agent")
    print("=" * 60)

    # Handle setup check
    if check_server:
        _print_setup_status()
        return

    # Handle environment listing
    if list_environments:
        _print_environments()
        return

    # Check requirements; an explicit --api-key satisfies the agent-key check.
    if not check_requirements(api_key):
        sys.exit(1)

    # A task is mandatory outside interactive mode.
    if not task and not interactive:
        print("\nāš ļø No task provided. Use --interactive for interactive mode or provide a task.")
        print("\nExamples:")
        print('  python rl_cli.py "Train a model on GSM8k math problems"')
        print('  python rl_cli.py "Create an RL environment for code generation"')
        print('  python rl_cli.py --interactive')
        return

    # Get API key
    api_key = api_key or os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        print("āŒ No API key provided. Set OPENROUTER_API_KEY or pass --api-key")
        sys.exit(1)

    print(f"\nšŸ¤– Model: {model}")
    print(f"šŸ”§ Max iterations: {max_iterations}")
    print(f"šŸ“ Toolsets: {', '.join(RL_TOOLSETS)}")
    print("=" * 60)

    # Create agent with RL configuration
    agent = AIAgent(
        base_url=base_url,
        api_key=api_key,
        model=model,
        max_iterations=max_iterations,
        enabled_toolsets=RL_TOOLSETS,
        save_trajectories=save_trajectories,
        verbose_logging=verbose,
        quiet_mode=False,
        ephemeral_system_prompt=RL_SYSTEM_PROMPT,
    )

    if interactive:
        # Interactive mode - multiple conversations
        _run_interactive(agent, verbose)
    else:
        # Single task mode
        _run_single_task(agent, task, verbose)


if __name__ == "__main__":
    fire.Fire(main)