"""Command extraction from LLM output.

LLMs often wrap commands in markdown code blocks or add explanatory text.
``extract_command`` strips that away and returns the raw shell command.
"""

from __future__ import annotations

import re


_SHELL_FENCE_TAGS = frozenset({"", "bash", "sh", "shell", "zsh"})


def _command_lines(text: str) -> list[str]:
    """Return the stripped lines of *text* that are neither blank nor ``#`` comments."""
    candidates = (ln.strip() for ln in text.splitlines())
    return [ln for ln in candidates if ln and not ln.startswith("#")]


def extract_command(text: str) -> str:
    """Extract a shell command from free-form LLM output.

    Patterns are tried in the following order; the first one that yields a
    command wins:

    1. Markdown fenced code block that is untagged or tagged ``bash``,
       ``sh``, ``shell`` or ``zsh`` -- the first non-empty, non-comment line
       of the first such block.
    2. Inline backtick span (`command`) on a single line, unless its content
       starts with an uppercase letter (treated as prose).
    3. "Command:", "Run:", "Execute:" or "Cmd:" prefix (case-insensitive,
       matched as a whole word).
    4. Fenced code block with any other language tag (e.g. ``console``).
    5. Plain lines -- the first non-empty, non-comment line; if that line
       looks like prose, the first later line that does not start with an
       uppercase letter.

    Args:
        text: Raw LLM output text.

    Returns:
        Cleaned shell command string. Returns an empty string for empty
        input, and the original text stripped if no pattern matches.
    """
    if not text:
        return ""

    stripped = text.strip()

    # Patterns 1 and 4: walk fenced blocks in document order. finditer
    # consumes each opening/closing fence pair as one match, so the closing
    # fence of one block is never mistaken for the opening fence of the next.
    fenced_fallback = None
    for fence in re.finditer(
        r"```[ \t]*([\w+-]*)\s*\n(.*?)```",
        stripped,
        re.DOTALL,
    ):
        block_lines = _command_lines(fence.group(2))
        if not block_lines:
            continue
        if fence.group(1).lower() in _SHELL_FENCE_TAGS:
            return block_lines[0]
        # A block in another language may be sample output rather than a
        # command, so it is only remembered as a late fallback.
        if fenced_fallback is None:
            fenced_fallback = block_lines[0]

    # Pattern 2: inline backtick span. Newlines are excluded so the span
    # cannot swallow the body of a multi-line fenced block.
    inline = re.search(r"`([^`\n]+)`", stripped)
    if inline:
        candidate = inline.group(1).strip()
        # Only use if it looks like a command (not prose with backticks)
        if candidate and not candidate[0].isupper():
            return candidate

    # Pattern 3: "Command:" or "Run:" prefix. The word boundary keeps words
    # such as "subcommand:" from being treated as a prefix.
    prefixed = re.search(
        r"\b(?:command|run|execute|cmd)\s*:\s*(.+)",
        stripped,
        re.IGNORECASE,
    )
    if prefixed:
        payload = prefixed.group(1).strip()
        # A payload that starts with a fence marker is the opening line of a
        # code block, not a command; leave it to the later fallbacks.
        if not payload.startswith("```"):
            command = payload.strip("`")
            if command:
                return command

    # Pattern 4: fenced block with a non-shell language tag.
    if fenced_fallback is not None:
        return fenced_fallback

    # Pattern 5: plain lines. Fence-marker lines are never commands.
    lines = [ln for ln in _command_lines(stripped) if not ln.startswith("```")]
    if lines:
        first = lines[0]
        # If the first line looks like prose (starts with uppercase and has
        # many words), try subsequent lines
        if len(lines) > 1 and first[0].isupper() and len(first.split()) > 5:
            for ln in lines[1:]:
                candidate = ln.strip("`").strip()
                # Skip lines that are empty once their backticks are removed.
                if candidate and not ln[0].isupper():
                    return candidate

        return first

    return stripped


def strip_command_from_response(text: str, command: str) -> str:
    """Return an LLM response with the extracted command removed.

    Best-effort only: the reasoning text is kept and the command is dropped
    when it appears in one of the shapes the synthetic-data prompts encourage:
    - a fenced code block holding nothing but the command
    - a ``Command: ...`` style line
    - a bare command on the final line

    Args:
        text: Raw LLM response.
        command: The command previously extracted from ``text``.

    Returns:
        The stripped response without the command. An empty ``text`` gives
        ``""``; an empty ``command`` gives ``text`` stripped.
    """
    if not text:
        return ""

    remainder = text.strip()
    if not command:
        return remainder

    target = command.strip()
    escaped = re.escape(target)

    # Fenced block whose only content is the command.
    fenced_only_command = re.compile(
        rf"```(?:bash|sh|shell|zsh)?\s*\n\s*{escaped}\s*```",
        re.IGNORECASE | re.DOTALL,
    )
    # A whole line of the form "Command: <command>".
    labelled_command_line = re.compile(
        rf"(?im)^\s*(?:command|run|execute|cmd)\s*:\s*{escaped}\s*$"
    )
    for pattern in (fenced_only_command, labelled_command_line):
        remainder = pattern.sub("", remainder).strip()

    # Drop the final line when it is just the command, optionally wrapped in
    # backticks.
    kept = remainder.splitlines()
    if kept:
        last = kept[-1].strip().strip("`")
        if last == target:
            kept.pop()
    return "\n".join(kept).strip()