File size: 3,810 Bytes

665e529

"""Prompt templates for CaP-Agent — system, task, multi-turn, VDM.

All prompts in one place. Templates use .format() for variable injection.
"""

# System-role instructions for the code-generating agent.
# build_initial_prompt() sends this text as-is as the "system" message; it has
# no {placeholders} and is not passed through .format() there.
# The backslash right after the opening quotes keeps the string from starting
# with a newline.
SYSTEM_PROMPT = """\
You are a robot control agent. You write Python code to control a robot arm.

RULES:
1. Write ONLY executable Python code. No explanations outside comments.
2. Wrap your code in ```python fences.
3. Use the documented API functions. Prefer high-level functions when available.
4. Import numpy as np if needed.
5. Keep code concise — one clear sequence of actions.

Example for picking up an object:
```python
import numpy as np
pos, quat = sample_grasp_pose("object_name")
open_gripper()
goto_pose(pos, quat, z_approach=0.1)
close_gripper()
lift = pos + np.array([0, 0, 0.2])
goto_pose(lift, quat)
```
"""

# User-message template for the first code-generation request.
# Placeholders: {system_prompt}, {api_documentation}, {skill_section},
# {task_description}.
# build_initial_prompt() fills {system_prompt} with an empty string (the
# system text is sent as a separate system-role message), so the rendered
# text starts with blank lines. {skill_section} may also be empty.
TASK_PROMPT = """\
{system_prompt}

Available API functions:
{api_documentation}

{skill_section}

Task: {task_description}
"""

# Optional fragment substituted into TASK_PROMPT's {skill_section}.
# Placeholder: {skill_code}.
# build_initial_prompt() renders it only when skill_library_code is non-empty.
SKILL_SECTION = """\
Available utility functions (you can call these directly, they are already imported):
{skill_code}
"""

# Follow-up template shown after a piece of generated code has been executed.
# Placeholders: {executed_code}, {stdout}, {stderr}, {reward}, {completed},
# {visual_diff_section}.
# build_multiturn_prompt() fills them: empty stdout/stderr become "(empty)",
# the reward is formatted with two decimals, {completed} is "Yes" or "No",
# and {visual_diff_section} is either a rendered VISUAL_DIFF_SECTION or "".
# The backslash at the end of the "REGENERATE followed by" line is a line
# continuation inside the literal: that sentence is a single line in the
# resulting string.
MULTITURN_DECISION_PROMPT = """\
The following code was just executed:

```python
{executed_code}
```

Console output:
stdout: {stdout}
stderr: {stderr}

Task reward: {reward}
Task completed: {completed}

{visual_diff_section}

If the task is NOT completed (reward < 1.0), respond with REGENERATE followed by \
improved Python code in ```python fences that fixes issues or tries a different approach.
If the task IS completed (reward = 1.0), respond with FINISH.

Your response MUST start with either REGENERATE or FINISH.
"""

# Optional fragment substituted into MULTITURN_DECISION_PROMPT's
# {visual_diff_section}.
# Placeholder: {visual_diff}.
# build_multiturn_prompt() renders it only when a non-empty visual_diff is given.
VISUAL_DIFF_SECTION = """\
Visual observation (what changed in the scene):
{visual_diff}
"""

# Template asking for a detailed description of the current scene.
# Placeholder: {task}.
# NOTE(review): not referenced by build_initial_prompt() or
# build_multiturn_prompt(); presumably formatted by the VDM caller elsewhere —
# confirm.
VDM_SCENE_PROMPT = """\
You are observing a robot manipulation scene. The task is: {task}

Describe the current state of the scene in detail:
- What objects are visible and their approximate positions
- The state of the robot gripper (open/closed, holding anything)
- Any relevant spatial relationships between objects
- Colors, sizes, and orientations of key objects
"""

# Template asking for a before/after comparison of two scene observations.
# Placeholder: {task}.
# The text refers to "Image 1" (before) and "Image 2" (after); the images
# themselves are not part of this template.
# NOTE(review): not referenced by build_initial_prompt() or
# build_multiturn_prompt(); presumably formatted by the VDM caller elsewhere —
# confirm.
VDM_DIFF_PROMPT = """\
You are comparing two observations of a robot manipulation scene.
The task is: {task}

Image 1: Before the last action
Image 2: After the last action

Describe what changed:
- Did any objects move? In which direction and approximately how far?
- Did the gripper state change?
- Was the action successful or did something go wrong?
- Is the task closer to completion?
"""


def build_initial_prompt(
    task_description: str,
    api_documentation: str,
    skill_library_code: str = "",
) -> list[dict[str, str]]:
    """Assemble the opening chat messages for a code-generation request.

    Args:
        task_description: Text substituted into TASK_PROMPT's
            ``{task_description}`` slot.
        api_documentation: Text substituted into TASK_PROMPT's
            ``{api_documentation}`` slot.
        skill_library_code: Optional utility-function source. When non-empty
            it is wrapped in SKILL_SECTION; otherwise the skill slot is left
            empty.

    Returns:
        A two-element list: a ``system`` message carrying SYSTEM_PROMPT,
        followed by a ``user`` message carrying the rendered TASK_PROMPT.
    """
    rendered_skills = (
        SKILL_SECTION.format(skill_code=skill_library_code)
        if skill_library_code
        else ""
    )

    # The system text goes in its own message below, so the template's
    # {system_prompt} slot is deliberately filled with an empty string.
    template_fields = {
        "system_prompt": "",
        "api_documentation": api_documentation,
        "skill_section": rendered_skills,
        "task_description": task_description,
    }
    user_message = {"role": "user", "content": TASK_PROMPT.format(**template_fields)}
    system_message = {"role": "system", "content": SYSTEM_PROMPT}

    return [system_message, user_message]


def build_multiturn_prompt(
    executed_code: str,
    stdout: str,
    stderr: str,
    visual_diff: str | None = None,
    reward: float = 0.0,
    task_completed: bool = False,
) -> list[dict[str, str]]:
    """Assemble the follow-up message shown after generated code has run.

    Args:
        executed_code: The code that was executed, echoed back in the prompt.
        stdout: Captured standard output; shown as ``(empty)`` when falsy.
        stderr: Captured standard error; shown as ``(empty)`` when falsy.
        visual_diff: Optional scene-change description. When non-empty it is
            wrapped in VISUAL_DIFF_SECTION; otherwise that slot is left empty.
        reward: Task reward, rendered with two decimal places.
        task_completed: Rendered as ``Yes`` when true, ``No`` otherwise.

    Returns:
        A single-element list holding one ``user`` message with the rendered
        MULTITURN_DECISION_PROMPT.
    """
    if visual_diff:
        diff_block = VISUAL_DIFF_SECTION.format(visual_diff=visual_diff)
    else:
        diff_block = ""

    # Substitute a visible marker for missing console output.
    shown_stdout = stdout or "(empty)"
    shown_stderr = stderr or "(empty)"
    reward_text = f"{reward:.2f}"
    if task_completed:
        completed_text = "Yes"
    else:
        completed_text = "No"

    message_body = MULTITURN_DECISION_PROMPT.format(
        executed_code=executed_code,
        stdout=shown_stdout,
        stderr=shown_stderr,
        reward=reward_text,
        completed=completed_text,
        visual_diff_section=diff_block,
    )

    return [{"role": "user", "content": message_body}]