Spaces:

abrown31
/

open-range

Runtime error

open-range / src /open_range /agents /parsing.py

Aaron Brown

Add episode CLI, synthetic data pipeline, NPC generalization, service manifest

f016eb7 3 months ago

3.74 kB

	"""Command extraction from LLM output.

	LLMs often wrap commands in markdown code blocks or add explanatory text.
	``extract_command`` strips that away and returns the raw shell command.
	"""

	from __future__ import annotations

	import re


	def extract_command(text: str) -> str:
	    """Extract a shell command from free-form LLM output.

	    The following patterns are tried in order; the first one that yields
	    a command wins:

	    1. Markdown fenced code block (```bash ... ``` or ``` ... ```): the
	       first non-empty, non-comment line inside the block.
	    2. Single backtick wrapping (`command`), unless the wrapped text
	       starts with an uppercase letter (treated as prose).
	    3. A "Command:", "Run:", "Execute:" or "Cmd:" prefix (case-insensitive):
	       the rest of that line.
	    4. Plain / multi-line text: the first non-empty, non-comment line. If
	       that line looks like prose (starts uppercase and has more than
	       five words), the first later line that looks like a command is
	       used instead.

	    Args:
	        text: Raw LLM output text.

	    Returns:
	        Cleaned shell command string. Returns the original text stripped
	        if no pattern matches, and "" for empty input.
	    """
	    if not text:
	        return ""

	    stripped = text.strip()

	    # Pattern 1: Markdown fenced code block, with an optional shell language
	    # tag. The lazy capture stops at the first closing fence.
	    fenced = re.search(
	        r"```(?:bash|sh|shell|zsh)?\s*\n(.*?)```",
	        stripped,
	        re.DOTALL,
	    )
	    if fenced:
	        # Take the first non-empty, non-comment line from the code block.
	        lines = [
	            ln.strip()
	            for ln in fenced.group(1).strip().splitlines()
	            if ln.strip() and not ln.strip().startswith("#")
	        ]
	        if lines:
	            return lines[0]

	    # Pattern 2: Single backtick wrapping.
	    backtick = re.search(r"`([^`]+)`", stripped)
	    if backtick:
	        candidate = backtick.group(1).strip()
	        # Only use it if it looks like a command (not prose with backticks).
	        if candidate and not candidate[0].isupper():
	            return candidate

	    # Pattern 3: "Command:" or "Run:" style prefix; whitespace around the
	    # colon is optional.
	    prefix_match = re.search(
	        r"(?:command|run|execute|cmd)\s*:\s*(.+)",
	        stripped,
	        re.IGNORECASE,
	    )
	    if prefix_match:
	        return prefix_match.group(1).strip().strip("`")

	    # Pattern 4: Multi-line -- take the first non-empty, non-comment line.
	    lines = [
	        ln.strip()
	        for ln in stripped.splitlines()
	        if ln.strip() and not ln.strip().startswith("#")
	    ]
	    if lines:
	        # If the first line looks like prose (starts with uppercase and has
	        # many words), try subsequent lines.
	        first = lines[0]
	        if len(lines) > 1 and first[0].isupper() and len(first.split()) > 5:
	            # Probably explanation text; try to find the actual command.
	            for ln in lines[1:]:
	                if not ln[0].isupper() or ln.startswith(("nmap", "curl", "ssh")):
	                    return ln.strip("`").strip()

	        # No better candidate was found, so fall back to the first line.
	        return first

	    return stripped


	def strip_command_from_response(text: str, command: str) -> str:
	    """Remove the extracted command from an LLM response, preserving reasoning.

	    This is best-effort. It handles the response patterns encouraged by the
	    synthetic-data prompts:
	    - fenced code blocks
	    - ``Command: ...`` lines
	    - a trailing bare command line

	    Args:
	        text: Raw LLM response text.
	        command: The command previously extracted from ``text``.

	    Returns:
	        The response with the command removed and surrounding whitespace
	        stripped. Returns "" for empty ``text`` and the stripped text
	        unchanged when ``command`` is empty.
	    """
	    if not text:
	        return ""

	    stripped = text.strip()
	    if not command:
	        return stripped

	    # Escape the command so shell metacharacters are matched literally.
	    command_pattern = re.escape(command.strip())

	    # Remove fenced blocks that only contain the command.
	    stripped = re.sub(
	        rf"```(?:bash|sh|shell|zsh)?\s*\n\s*{command_pattern}\s*```",
	        "",
	        stripped,
	        flags=re.IGNORECASE | re.DOTALL,
	    ).strip()

	    # Remove explicit "Command:" lines (case-insensitive, anchored per line).
	    stripped = re.sub(
	        rf"(?im)^\s*(?:command|run|execute|cmd)\s*:\s*{command_pattern}\s*$",
	        "",
	        stripped,
	    ).strip()

	    # Remove a trailing bare command line, with or without backticks.
	    lines = stripped.splitlines()
	    if lines and lines[-1].strip().strip("`") == command.strip():
	        lines = lines[:-1]
	    return "\n".join(lines).strip()