File size: 3,741 Bytes
8c486a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f016eb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""Command extraction from LLM output.

LLMs often wrap commands in markdown code blocks or add explanatory text.
``extract_command`` strips that away and returns the raw shell command.
"""

from __future__ import annotations

import re


def _candidate_lines(text: str) -> list[str]:
    """Return the stripped lines of *text* that are neither blank nor ``#`` comments."""
    stripped_lines = (ln.strip() for ln in text.splitlines())
    return [ln for ln in stripped_lines if ln and not ln.startswith("#")]


def extract_command(text: str) -> str:
    """Extract a single shell command from free-form LLM output.

    The patterns are tried in this order and the first hit wins:

    1. Markdown fenced code block, with any (or no) language tag after the
       opening fence -- the first non-empty, non-comment line inside it.
    2. Inline single-backtick span (`command`), unless its content starts
       with an uppercase letter (treated as prose rather than a command).
    3. A ``command:`` / ``run:`` / ``execute:`` / ``cmd:`` prefix
       (case-insensitive, whole word) -- the rest of that line, with
       surrounding backticks removed.
    4. Otherwise the first non-empty, non-comment line. If that line looks
       like prose (starts uppercase and has more than five words) and more
       lines follow, the first following line that does not start with an
       uppercase letter is returned instead.

    Args:
        text: Raw LLM output text.

    Returns:
        The cleaned command string. ``""`` for empty input; the stripped
        input when it contains no usable line at all.
    """
    if not text:
        return ""

    stripped = text.strip()

    # Pattern 1: fenced code block. The info string after the opening fence is
    # matched generically so that tags such as ``console`` or ``powershell``
    # do not fall through to the inline-backtick pattern below, which would
    # otherwise return the tag glued to the block body.
    fenced = re.search(r"```[^\n`]*\n(.*?)```", stripped, re.DOTALL)
    if fenced:
        block_lines = _candidate_lines(fenced.group(1))
        if block_lines:
            return block_lines[0]

    # Pattern 2: inline single-backtick span.
    backtick = re.search(r"`([^`]+)`", stripped)
    if backtick:
        candidate = backtick.group(1).strip()
        # Only use it if it looks like a command (not capitalised prose).
        if candidate and not candidate[0].isupper():
            return candidate

    # Pattern 3: "Command:" style prefix. The leading \b keeps words that
    # merely end in a keyword ("subcommand:", "rerun:") from matching.
    prefix_match = re.search(
        r"\b(?:command|run|execute|cmd)\s*:\s*(.+)",
        stripped,
        re.IGNORECASE,
    )
    if prefix_match:
        return prefix_match.group(1).strip().strip("`")

    # Pattern 4: plain text -- first non-empty, non-comment line.
    lines = _candidate_lines(stripped)
    if not lines:
        return stripped

    first = lines[0]
    looks_like_prose = first[0].isupper() and len(first.split()) > 5
    if len(lines) > 1 and looks_like_prose:
        # Probably explanation text; the command is taken to be the first
        # later line that does not start with an uppercase letter.
        for ln in lines[1:]:
            if not ln[0].isupper():
                return ln.strip("`").strip()

    return first


def strip_command_from_response(text: str, command: str) -> str:
    """Remove the extracted command from an LLM response, preserving reasoning.

    This is best-effort. It handles the response patterns encouraged by the
    synthetic-data prompts:
    - fenced code blocks (any or no language tag) containing only the command
    - ``Command: ...`` / ``Run: ...`` / ``Execute: ...`` / ``Cmd: ...`` lines
    - a trailing bare command line, optionally wrapped in backticks

    Args:
        text: Raw LLM response.
        command: The command previously extracted from ``text``. Surrounding
            whitespace is ignored.

    Returns:
        The response with the command occurrences above removed and outer
        whitespace stripped. ``""`` when ``text`` is empty; the stripped text
        unchanged when ``command`` is empty or whitespace-only.
    """
    if not text:
        return ""

    stripped = text.strip()

    # Normalise once. A whitespace-only command must be treated like an empty
    # one: otherwise it turns into an empty regex fragment and the
    # substitutions below would delete bare "Command:" lines and empty fenced
    # blocks that belong to the reasoning.
    command = command.strip() if command else ""
    if not command:
        return stripped

    command_pattern = re.escape(command)

    # Remove fenced blocks that only contain the command. The info string
    # after the opening fence is matched generically so blocks tagged e.g.
    # ``console`` or ``powershell`` are removed as well.
    stripped = re.sub(
        rf"```[^\n`]*\n\s*{command_pattern}\s*```",
        "",
        stripped,
        flags=re.IGNORECASE,
    ).strip()

    # Remove explicit "Command:" lines.
    stripped = re.sub(
        rf"(?im)^\s*(?:command|run|execute|cmd)\s*:\s*{command_pattern}\s*$",
        "",
        stripped,
    ).strip()

    # Remove a trailing bare command line.
    lines = stripped.splitlines()
    if lines and lines[-1].strip().strip("`") == command:
        lines.pop()
    return "\n".join(lines).strip()