open-range / src /open_range /agents /parsing.py
Aaron Brown
Add episode CLI, synthetic data pipeline, NPC generalization, service manifest
f016eb7
"""Command extraction from LLM output.
LLMs often wrap commands in markdown code blocks or add explanatory text.
``extract_command`` strips that away and returns the raw shell command.
"""
from __future__ import annotations
import re
def extract_command(text: str) -> str:
    """Extract a shell command from free-form LLM output.

    Patterns are tried in the following order and the first one that yields
    a command wins:

    1. Markdown fenced code block (```` ```bash ... ``` ````, ```` ``` ... ``` ````
       or any other language tag such as ``console``) -- the first non-empty,
       non-comment line inside the block is returned.
    2. Single-backtick inline code on one line (`command`), unless it starts
       with an uppercase letter (treated as prose).
    3. A ``Command:`` / ``Run:`` / ``Execute:`` / ``Cmd:`` prefix.
    4. Plain text -- the first non-empty line that is neither a comment nor
       a fence marker; if that line looks like prose, the first following
       line that does not start with an uppercase letter is used instead.

    Args:
        text: Raw LLM output text.

    Returns:
        Cleaned shell command string. Returns the original text stripped
        if no pattern matches, and ``""`` for empty input.
    """
    if not text:
        return ""

    stripped = text.strip()

    # Pattern 1: Markdown fenced code block. The language tag is optional and
    # may be any single token; restricting it to a fixed list made blocks such
    # as ```console fall through to the inline-backtick pattern, which then
    # returned the tag together with the block body.
    fence_re = re.compile(r"```[ \t]*[\w.+#-]*[ \t]*\r?\n(.*?)```", re.DOTALL)
    pos = 0
    while True:
        fenced = fence_re.search(stripped, pos)
        if fenced is None:
            break
        for raw_line in fenced.group(1).splitlines():
            candidate = raw_line.strip()
            if candidate and not candidate.startswith("#"):
                return candidate
        # The block held no command (empty or comments only). Resume just
        # after its opening fence so that the closing fence consumed by this
        # match can still serve as the opener of a later block.
        pos = fenced.start() + 3

    # Pattern 2: Single backtick wrapping. Inline code never spans lines, so
    # newlines are excluded; otherwise fence remnants could be captured.
    backtick = re.search(r"`([^`\n]+)`", stripped)
    if backtick:
        candidate = backtick.group(1).strip()
        # Only use it if it looks like a command (not prose with backticks).
        if candidate and not candidate[0].isupper():
            return candidate

    # Pattern 3: "Command:" or "Run:" prefix.
    prefix_match = re.search(
        r"(?:command|run|execute|cmd)\s*:\s*(.+)",
        stripped,
        re.IGNORECASE,
    )
    if prefix_match:
        return prefix_match.group(1).strip().strip("`")

    # Pattern 4: Multi-line -- take the first non-empty line that is neither
    # a comment nor a fence marker (the latter shows up when the output was
    # truncated before the closing fence).
    lines = [
        ln.strip()
        for ln in stripped.splitlines()
        if ln.strip() and not ln.strip().startswith(("#", "```"))
    ]
    if not lines:
        return stripped

    first = lines[0]
    # If the first line looks like prose (starts with uppercase and has many
    # words), prefer the first later line that does not start with uppercase.
    if len(lines) > 1 and first[0].isupper() and len(first.split()) > 5:
        for ln in lines[1:]:
            if not ln[0].isupper():
                return ln.strip("`").strip()
    return first
def strip_command_from_response(text: str, command: str) -> str:
    """Remove the extracted command from an LLM response, preserving reasoning.

    This is best-effort. It handles the response patterns encouraged by the
    synthetic-data prompts:

    - fenced code blocks (with any or no language tag) that contain only the
      command
    - ``Command: ...`` lines, with the command optionally wrapped in backticks
    - a trailing bare command line, optionally wrapped in backticks

    Args:
        text: Raw LLM response.
        command: The command previously extracted from ``text``.

    Returns:
        The response with the command removed and surrounding whitespace
        stripped. If ``command`` is empty or whitespace-only, the stripped
        response is returned unchanged; empty ``text`` yields ``""``.
    """
    if not text:
        return ""

    stripped = text.strip()

    # Normalise once. A whitespace-only command must not degrade into an
    # empty pattern, which would match empty fences, bare "Command:" lines
    # or a lone trailing fence marker.
    command = command.strip() if command else ""
    if not command:
        return stripped

    command_pattern = re.escape(command)

    # Remove fenced blocks that only contain the command. The language tag
    # may be any single token (bash, console, ...), not just a fixed list.
    stripped = re.sub(
        rf"```[ \t]*[\w.+#-]*\s*\n\s*{command_pattern}\s*```",
        "",
        stripped,
        flags=re.IGNORECASE | re.DOTALL,
    ).strip()

    # Remove explicit "Command:" lines. Backticks around the command are
    # tolerated because command extraction accepts that form as well.
    stripped = re.sub(
        rf"(?im)^\s*(?:command|run|execute|cmd)\s*:\s*`*{command_pattern}`*\s*$",
        "",
        stripped,
    ).strip()

    # Remove a trailing bare command line.
    lines = stripped.splitlines()
    if lines and lines[-1].strip().strip("`") == command:
        lines = lines[:-1]
    return "\n".join(lines).strip()