"""Command extraction from LLM output. LLMs often wrap commands in markdown code blocks or add explanatory text. ``extract_command`` strips that away and returns the raw shell command. """ from __future__ import annotations import re def extract_command(text: str) -> str: """Extract a shell command from free-form LLM output. Handles common patterns: 1. Bare command (no wrapping) 2. Markdown code block (```bash ... ``` or ``` ... ```) 3. Single backtick wrapping (`command`) 4. "Command:" prefix 5. Multi-line output -- takes the first non-empty, non-comment line Args: text: Raw LLM output text. Returns: Cleaned shell command string. Returns original text stripped if no pattern matches. """ if not text: return "" stripped = text.strip() # Pattern 1: Markdown fenced code block (```bash ... ``` or ``` ... ```) fenced = re.search( r"```(?:bash|sh|shell|zsh)?\s*\n(.*?)```", stripped, re.DOTALL, ) if fenced: # Take the first non-empty line from the code block lines = [ ln.strip() for ln in fenced.group(1).strip().splitlines() if ln.strip() and not ln.strip().startswith("#") ] if lines: return lines[0] # Pattern 2: Single backtick wrapping backtick = re.search(r"`([^`]+)`", stripped) if backtick: candidate = backtick.group(1).strip() # Only use if it looks like a command (not prose with backticks) if candidate and not candidate[0].isupper(): return candidate # Pattern 3: "Command:" or "Run:" prefix prefix_match = re.search( r"(?:command|run|execute|cmd)\s*:\s*(.+)", stripped, re.IGNORECASE, ) if prefix_match: return prefix_match.group(1).strip().strip("`") # Pattern 4: Multi-line -- take first non-empty, non-comment line lines = [ ln.strip() for ln in stripped.splitlines() if ln.strip() and not ln.strip().startswith("#") ] if lines: # If the first line looks like prose (starts with uppercase and has # many words), try subsequent lines first = lines[0] if len(lines) > 1 and first[0].isupper() and len(first.split()) > 5: # Probably explanation text; try to find the actual command for ln in lines[1:]: if not ln[0].isupper() or ln.startswith(("nmap", "curl", "ssh")): return ln.strip("`").strip() return first return stripped def strip_command_from_response(text: str, command: str) -> str: """Remove the extracted command from an LLM response, preserving reasoning. This is best-effort. It handles the response patterns encouraged by the synthetic-data prompts: - fenced code blocks - ``Command: ...`` lines - a trailing bare command line """ if not text: return "" stripped = text.strip() if not command: return stripped command_pattern = re.escape(command.strip()) # Remove fenced blocks that only contain the command. stripped = re.sub( rf"```(?:bash|sh|shell|zsh)?\s*\n\s*{command_pattern}\s*```", "", stripped, flags=re.IGNORECASE | re.DOTALL, ).strip() # Remove explicit "Command:" lines. stripped = re.sub( rf"(?im)^\s*(?:command|run|execute|cmd)\s*:\s*{command_pattern}\s*$", "", stripped, ).strip() # Remove a trailing bare command line. lines = stripped.splitlines() if lines and lines[-1].strip().strip("`") == command.strip(): lines = lines[:-1] return "\n".join(lines).strip()