| |
| """ |
| SWE Runner with Hermes Trajectory Format |
| |
| A runner that uses Hermes-Agent's built-in execution environments |
| (local, docker, modal) and outputs trajectories in the Hermes-Agent format |
| compatible with batch_runner.py and trajectory_compressor.py. |
| |
| Features: |
| - Uses Hermes-Agent's Docker, Modal, or Local environments for command execution |
| - Outputs trajectories in Hermes format (from/value pairs with <tool_call>/<tool_response> XML) |
| - Compatible with the trajectory compression pipeline |
| - Supports batch processing from JSONL prompt files |
| |
| Usage: |
| # Run a single task with local environment |
| python mini_swe_runner.py --task "Create a hello world Python script" --env local |
| |
| # Run with Docker |
| python mini_swe_runner.py --task "List files in /tmp" --env docker --image python:3.11-slim |
| |
| # Run with Modal (cloud) |
| python mini_swe_runner.py --task "Install numpy and test it" --env modal --image python:3.11-slim |
| |
| # Batch mode from JSONL file |
| python mini_swe_runner.py --prompts_file prompts.jsonl --output_file trajectories.jsonl --env docker |
| """ |
|
|
| import json |
| import logging |
| import os |
| import sys |
| import time |
| import uuid |
| from datetime import datetime |
| from pathlib import Path |
| from typing import List, Dict, Any, Optional, Literal |
|
|
| import fire |
| from dotenv import load_dotenv |
|
|
| |
| load_dotenv() |
|
|
|
|
|
|
|
|
| |
| |
| |
|
|
# OpenAI-style function-calling schema for the single "terminal" tool exposed
# to the model. The description doubles as usage guidance for the LLM; the
# MINI_SWE_AGENT_FINAL_OUTPUT sentinel it mentions is the marker run_task()
# scans command output for to detect task completion.
TERMINAL_TOOL_DEFINITION = {
    "type": "function",
    "function": {
        "name": "terminal",
        "description": """Execute bash commands in a sandboxed environment.

**Environment:**
- Isolated execution environment (local, Docker, or Modal cloud)
- Filesystem persists between tool calls within the same task
- Internet access available

**Command Execution:**
- Provide the command to execute via the 'command' parameter
- Optional 'timeout' parameter in seconds (default: 60)

**Examples:**
- Run command: `{"command": "ls -la"}`
- With timeout: `{"command": "long_task.sh", "timeout": 300}`

**Best Practices:**
- Use non-interactive commands (avoid vim, nano, interactive python)
- Pipe to cat if output might be large
- Install tools with apt-get or pip as needed

**Completion:**
- When task is complete, output: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by your result
""",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string",
                    "description": "The bash command to execute"
                },
                "timeout": {
                    "type": "integer",
                    "description": "Command timeout in seconds (default: 60)"
                }
            },
            "required": ["command"]
        }
    }
}
|
|
|
|
| |
| |
| |
|
|
def create_environment(
    env_type: str = "local",
    image: str = "python:3.11-slim",
    cwd: str = "/tmp",
    timeout: int = 60,
    **kwargs
):
    """
    Build an execution environment backed by one of Hermes-Agent's runtimes.

    Args:
        env_type: One of "local", "docker", "modal"
        image: Docker/Modal image name (ignored for local)
        cwd: Working directory
        timeout: Default command timeout
        **kwargs: Extra backend-specific options (docker/modal only)

    Returns:
        Environment instance exposing execute() and cleanup() methods

    Raises:
        ValueError: if env_type is not one of the supported backends
    """
    # Imports are deferred so that only the selected backend's dependencies
    # need to be importable.
    if env_type == "docker":
        from tools.environments.docker import DockerEnvironment
        return DockerEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)

    if env_type == "modal":
        from tools.environments.modal import ModalEnvironment
        return ModalEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)

    if env_type == "local":
        # The local backend runs on the host directly; image is irrelevant.
        from tools.environments.local import LocalEnvironment
        return LocalEnvironment(cwd=cwd, timeout=timeout)

    raise ValueError(f"Unknown environment type: {env_type}. Use 'local', 'docker', or 'modal'")
|
|
|
|
| |
| |
| |
|
|
class MiniSWERunner:
    """
    Agent runner that uses Hermes-Agent's built-in execution environments
    (local, Docker, Modal) and outputs trajectories in Hermes-Agent format
    (from/value turns with <tool_call>/<tool_response> XML blocks).
    """

    def __init__(
        self,
        model: str = "anthropic/claude-sonnet-4.6",
        base_url: str = None,
        api_key: str = None,
        env_type: str = "local",
        image: str = "python:3.11-slim",
        cwd: str = "/tmp",
        max_iterations: int = 15,
        command_timeout: int = 60,
        verbose: bool = False,
    ):
        """
        Initialize the Mini-SWE Runner.

        Args:
            model: Model name for OpenAI-compatible API
            base_url: API base URL (optional, uses env vars if not provided)
            api_key: API key (optional, uses env vars if not provided)
            env_type: Environment type - "local", "docker", or "modal"
            image: Docker/Modal image (ignored for local)
            cwd: Working directory for commands
            max_iterations: Maximum tool-calling iterations
            command_timeout: Default timeout for commands
            verbose: Enable verbose logging
        """
        self.model = model
        self.max_iterations = max_iterations
        self.command_timeout = command_timeout
        self.verbose = verbose
        self.env_type = env_type
        self.image = image
        self.cwd = cwd

        logging.basicConfig(
            level=logging.DEBUG if verbose else logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)

        # Client resolution: explicit credentials win; otherwise defer to the
        # project's provider resolver, finally falling back to OpenRouter with
        # whatever key the environment holds.
        if api_key or base_url:
            from openai import OpenAI
            client_kwargs = {
                "base_url": base_url or "https://openrouter.ai/api/v1",
                # Key preference: explicit > OPENROUTER > ANTHROPIC > OPENAI.
                "api_key": api_key or os.getenv(
                    "OPENROUTER_API_KEY",
                    os.getenv("ANTHROPIC_API_KEY",
                              os.getenv("OPENAI_API_KEY", ""))),
            }
            self.client = OpenAI(**client_kwargs)
        else:
            from agent.auxiliary_client import resolve_provider_client
            self.client, _ = resolve_provider_client("openrouter", model=model)
            if self.client is None:
                # "openrouter" provider unavailable - let the resolver pick.
                self.client, _ = resolve_provider_client("auto", model=model)
                if self.client is None:
                    from openai import OpenAI
                    self.client = OpenAI(
                        base_url="https://openrouter.ai/api/v1",
                        api_key=os.getenv("OPENROUTER_API_KEY", ""))

        # Execution environment is created lazily per task by _create_env().
        self.env = None

        # Single "terminal" tool exposed to the model.
        self.tools = [TERMINAL_TOOL_DEFINITION]

        # NOTE(review): the original emoji glyphs were mojibake-corrupted in
        # this file; the ones below are best-effort reconstructions.
        print(f"🤖 Mini-SWE Runner initialized")
        print(f"   Model: {self.model}")
        print(f"   Environment: {self.env_type}")
        if self.env_type != "local":
            print(f"   Image: {self.image}")
        print(f"   Max iterations: {self.max_iterations}")

    def _create_env(self):
        """Create the execution environment for the current task."""
        print(f"🔧 Creating {self.env_type} environment...")
        self.env = create_environment(
            env_type=self.env_type,
            image=self.image,
            cwd=self.cwd,
            timeout=self.command_timeout
        )
        print("✅ Environment ready")

    def _cleanup_env(self):
        """Tear down the execution environment, whichever teardown API it has."""
        if self.env is not None:
            # Backends expose either cleanup() or stop(); prefer cleanup().
            if hasattr(self.env, 'cleanup'):
                self.env.cleanup()
            elif hasattr(self.env, 'stop'):
                self.env.stop()
            self.env = None

    def _execute_command(self, command: str, timeout: int = None) -> Dict[str, Any]:
        """
        Execute a command in the environment, creating it on first use.

        Args:
            command: Bash command to execute
            timeout: Optional timeout override (seconds)

        Returns:
            Dict with 'output', 'exit_code' (backend's 'returncode'),
            and 'error' (exception text, or None on success).
        """
        if self.env is None:
            self._create_env()

        try:
            result = self.env.execute(command, timeout=timeout or self.command_timeout)
            return {
                "output": result.get("output", ""),
                "exit_code": result.get("returncode", 0),
                "error": None
            }
        except Exception as e:
            # Surface execution failures as data rather than raising, so the
            # agent loop can keep going and the model sees the error text.
            return {
                "output": "",
                "exit_code": -1,
                "error": str(e)
            }

    def _format_tools_for_system_message(self) -> str:
        """Serialize the tool schemas into the JSON list embedded in <tools>."""
        formatted_tools = []
        for tool in self.tools:
            func = tool["function"]
            formatted_tools.append({
                "name": func["name"],
                "description": func.get("description", ""),
                "parameters": func.get("parameters", {}),
                # NOTE(review): "required" is deliberately emitted as null to
                # match the downstream Hermes trajectory format - confirm
                # against batch_runner.py before changing.
                "required": None
            })
        return json.dumps(formatted_tools, ensure_ascii=False)

    def _convert_to_hermes_format(
        self,
        messages: List[Dict[str, Any]],
        user_query: str,
        completed: bool
    ) -> List[Dict[str, Any]]:
        """
        Convert internal OpenAI-style messages to the Hermes trajectory format.

        This produces the exact from/value structure used by batch_runner.py:
        system, human, then alternating gpt / tool turns where tool calls and
        responses are wrapped in <tool_call> / <tool_response> XML tags.
        """
        trajectory = []

        system_msg = (
            "You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. "
            "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
            "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
            "into functions. After calling & executing the functions, you will be provided with function results within "
            "<tool_response> </tool_response> XML tags. Here are the available tools:\n"
            f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n"
            "For each function call return a JSON object, with the following pydantic model json schema for each:\n"
            "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
            "'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
            "Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n"
            "Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>"
        )

        trajectory.append({"from": "system", "value": system_msg})
        trajectory.append({"from": "human", "value": user_query})

        # messages[0] is the original user task (already emitted as "human"
        # above), so conversion starts at index 1.
        i = 1
        while i < len(messages):
            msg = messages[i]

            if msg["role"] == "assistant":
                if "tool_calls" in msg and msg["tool_calls"]:
                    content = ""

                    # Optional reasoning trace goes first, in <think> tags.
                    if msg.get("reasoning"):
                        content = f"<think>{msg['reasoning']}</think>"

                    if msg.get("content"):
                        content += msg["content"] + "\n"

                    # Render each tool call as a <tool_call> JSON block.
                    for tool_call in msg["tool_calls"]:
                        if not tool_call or not isinstance(tool_call, dict): continue
                        try:
                            arguments = json.loads(tool_call["function"]["arguments"]) \
                                if isinstance(tool_call["function"]["arguments"], str) \
                                else tool_call["function"]["arguments"]
                        except json.JSONDecodeError:
                            arguments = {}

                        tool_call_json = {
                            "name": tool_call["function"]["name"],
                            "arguments": arguments
                        }
                        content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"

                    trajectory.append({"from": "gpt", "value": content.rstrip()})

                    # Collect the run of tool-result messages that follow and
                    # merge them into a single "tool" turn.
                    tool_responses = []
                    j = i + 1
                    while j < len(messages) and messages[j]["role"] == "tool":
                        tool_msg = messages[j]
                        tool_content = tool_msg["content"]

                        # Inline JSON payloads as structured data when possible.
                        try:
                            if tool_content.strip().startswith(("{", "[")):
                                tool_content = json.loads(tool_content)
                        except (json.JSONDecodeError, AttributeError):
                            pass

                        tool_response = f"<tool_response>\n"
                        tool_response += json.dumps({
                            "tool_call_id": tool_msg.get("tool_call_id", ""),
                            # Pair the k-th response with the k-th call's name;
                            # "unknown" if there are more responses than calls.
                            "name": msg["tool_calls"][len(tool_responses)]["function"]["name"] \
                                if len(tool_responses) < len(msg["tool_calls"]) else "unknown",
                            "content": tool_content
                        }, ensure_ascii=False)
                        tool_response += "\n</tool_response>"
                        tool_responses.append(tool_response)
                        j += 1

                    if tool_responses:
                        trajectory.append({"from": "tool", "value": "\n".join(tool_responses)})
                        i = j - 1  # resume after the consumed tool messages

                else:
                    # Plain assistant turn (no tool calls).
                    content = ""
                    if msg.get("reasoning"):
                        content = f"<think>{msg['reasoning']}</think>"
                    content += msg.get("content") or ""
                    trajectory.append({"from": "gpt", "value": content})

            elif msg["role"] == "user":
                trajectory.append({"from": "human", "value": msg["content"]})

            i += 1

        return trajectory

    def run_task(self, task: str) -> Dict[str, Any]:
        """
        Run a single task through the agent loop and return its trajectory.

        Args:
            task: The task/prompt to execute

        Returns:
            Dict with 'conversations' (Hermes trajectory), 'completed',
            'api_calls', and 'metadata'.
        """
        print(f"\n{'='*60}")
        print(f"📋 Task: {task[:80]}{'...' if len(task) > 80 else ''}")
        print(f"{'='*60}")

        self._create_env()

        messages = [{"role": "user", "content": task}]

        system_prompt = """You are an AI agent that can execute bash commands to complete tasks.

When you need to run commands, use the 'terminal' tool with your bash command.

**Important:**
- When you have completed the task successfully, run: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by a summary
- Be concise and efficient in your approach
- Install any needed tools with apt-get or pip
- Avoid interactive commands (no vim, nano, less, etc.)

Complete the user's task step by step."""

        api_call_count = 0
        completed = False
        final_response = None

        try:
            while api_call_count < self.max_iterations:
                api_call_count += 1
                print(f"\n📞 API call #{api_call_count}/{self.max_iterations}")

                # System prompt is prepended fresh each call; `messages` holds
                # only the evolving conversation.
                api_messages = [{"role": "system", "content": system_prompt}] + messages

                try:
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=api_messages,
                        tools=self.tools,
                        timeout=300.0
                    )
                except Exception as e:
                    # API failure ends the loop; partial trajectory is kept.
                    self.logger.error(f"API call failed: {e}")
                    break

                assistant_message = response.choices[0].message

                if assistant_message.content:
                    print(f"🤖 Assistant: {assistant_message.content[:100]}...")

                if assistant_message.tool_calls:
                    print(f"🔧 Tool calls: {len(assistant_message.tool_calls)}")

                    # Record the assistant turn in plain-dict form.
                    messages.append({
                        "role": "assistant",
                        "content": assistant_message.content,
                        "tool_calls": [
                            {
                                "id": tc.id,
                                "type": tc.type,
                                "function": {
                                    "name": tc.function.name,
                                    "arguments": tc.function.arguments
                                }
                            }
                            for tc in assistant_message.tool_calls
                        ]
                    })

                    # Execute every requested tool call in order.
                    for tc in assistant_message.tool_calls:
                        try:
                            args = json.loads(tc.function.arguments)
                        except json.JSONDecodeError:
                            args = {}

                        command = args.get("command", "echo 'No command provided'")
                        timeout = args.get("timeout", self.command_timeout)

                        print(f"  💻 terminal: {command[:60]}...")

                        result = self._execute_command(command, timeout)

                        result_json = json.dumps({
                            "content": {
                                "output": result["output"],
                                "exit_code": result["exit_code"],
                                "error": result["error"]
                            }
                        }, ensure_ascii=False)

                        # Sentinel in command output marks task completion.
                        if "MINI_SWE_AGENT_FINAL_OUTPUT" in result["output"]:
                            print("  ✅ Task completion signal detected!")
                            completed = True

                        messages.append({
                            "role": "tool",
                            "content": result_json,
                            "tool_call_id": tc.id
                        })

                        print(f"  ✅ exit_code={result['exit_code']}, output={len(result['output'])} chars")

                    if completed:
                        final_response = assistant_message.content
                        break

                else:
                    # No tool calls: the model considers itself done.
                    final_response = assistant_message.content or ""
                    messages.append({
                        "role": "assistant",
                        "content": final_response
                    })
                    completed = True
                    print(f"🏁 Agent finished (no more tool calls)")
                    break

            if api_call_count >= self.max_iterations:
                print(f"⚠️ Reached max iterations ({self.max_iterations})")

        finally:
            # Always tear the environment down, even on exceptions.
            self._cleanup_env()

        trajectory = self._convert_to_hermes_format(messages, task, completed)

        return {
            "conversations": trajectory,
            "completed": completed,
            "api_calls": api_call_count,
            "metadata": {
                "model": self.model,
                "env_type": self.env_type,
                "timestamp": datetime.now().isoformat()
            }
        }

    def run_batch(
        self,
        prompts: List[str],
        output_file: str
    ) -> List[Dict[str, Any]]:
        """
        Run multiple tasks, streaming each trajectory to a JSONL file.

        Args:
            prompts: List of task prompts
            output_file: Output JSONL file path (one trajectory per line)

        Returns:
            List of result dicts (error placeholders for failed tasks)
        """
        results = []

        print(f"\n📦 Running batch of {len(prompts)} tasks")
        print(f"📝 Output: {output_file}")

        with open(output_file, 'w', encoding='utf-8') as f:
            for i, prompt in enumerate(prompts, 1):
                print(f"\n{'='*60}")
                print(f"📋 Task {i}/{len(prompts)}")
                print(f"{'='*60}")

                try:
                    result = self.run_task(prompt)
                    results.append(result)

                    # Flush after every task so partial batches survive crashes.
                    f.write(json.dumps(result, ensure_ascii=False) + "\n")
                    f.flush()

                    print(f"✅ Task {i} completed (api_calls={result['api_calls']})")

                except Exception as e:
                    # A failed task becomes an error placeholder; the batch
                    # keeps going.
                    self.logger.error(f"Error on task {i}: {e}")
                    error_result = {
                        "conversations": [],
                        "completed": False,
                        "api_calls": 0,
                        "error": str(e),
                        "metadata": {"timestamp": datetime.now().isoformat()}
                    }
                    results.append(error_result)
                    f.write(json.dumps(error_result, ensure_ascii=False) + "\n")
                    f.flush()

        print(f"\n✅ Batch complete! {len(results)} trajectories saved to {output_file}")
        return results
|
|
|
|
| |
| |
| |
|
|
def main(
    task: str = None,
    prompts_file: str = None,
    output_file: str = "swe-runner-test1.jsonl",
    model: str = "claude-sonnet-4-20250514",
    base_url: str = None,
    api_key: str = None,
    env: str = "local",
    image: str = "python:3.11-slim",
    cwd: str = "/tmp",
    max_iterations: int = 15,
    timeout: int = 60,
    verbose: bool = False,
):
    """
    Run SWE tasks with Hermes trajectory format output.

    Args:
        task: Single task to run (use this OR prompts_file)
        prompts_file: JSONL file with prompts (each line: {"prompt": "..."})
        output_file: Output JSONL file for trajectories
        model: Model name (default: claude-sonnet-4-20250514)
        base_url: API base URL (optional)
        api_key: API key (optional, uses env vars)
        env: Environment type - "local", "docker", or "modal"
        image: Docker/Modal image (default: python:3.11-slim)
        cwd: Working directory (default: /tmp)
        max_iterations: Maximum tool-calling iterations (default: 15)
        timeout: Command timeout in seconds (default: 60)
        verbose: Enable verbose logging

    Examples:
        # Single task with local environment
        python mini_swe_runner.py --task "Create hello.py that prints Hello World"

        # Single task with Docker
        python mini_swe_runner.py --task "List files" --env docker

        # Batch from file
        python mini_swe_runner.py --prompts_file tasks.jsonl --output_file results.jsonl
    """
    print("🚀 Mini-SWE Runner with Hermes Trajectory Format")
    print("=" * 60)

    # Validate the invocation mode *before* constructing the runner, so a bad
    # invocation doesn't spin up API clients or execution environments.
    if not task and not prompts_file:
        print("❌ Please provide either --task or --prompts_file")
        print("   Example: python mini_swe_runner.py --task 'Create a hello world script'")
        return

    runner = MiniSWERunner(
        model=model,
        base_url=base_url,
        api_key=api_key,
        env_type=env,
        image=image,
        cwd=cwd,
        max_iterations=max_iterations,
        command_timeout=timeout,
        verbose=verbose,
    )

    if task:
        # Single-task mode: run once, write a one-line JSONL file.
        result = runner.run_task(task)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(result, ensure_ascii=False) + "\n")

        print(f"\n💾 Trajectory saved to: {output_file}")
        print(f"✅ Completed: {result['completed']}")
        print(f"📞 API calls: {result['api_calls']}")
        print(f"💬 Turns: {len(result['conversations'])}")

    else:
        # Batch mode: each non-empty line is either a JSON object carrying
        # "prompt" (or "task"), or a raw plain-text prompt.
        prompts = []
        with open(prompts_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    prompts.append(entry.get("prompt", entry.get("task", "")))
                except json.JSONDecodeError:
                    prompts.append(line)

        if not prompts:
            print(f"❌ No prompts found in {prompts_file}")
            return

        runner.run_batch(prompts, output_file)
|
|
|
|
# CLI entry point: python-fire maps main()'s keyword arguments to
# command-line flags (e.g. --task, --env, --prompts_file).
if __name__ == "__main__":
    fire.Fire(main)
|
|