"""Prompt templates for CaP-Agent — system, task, multi-turn, VDM.

All prompts in one place. Templates use .format() for variable injection.
"""


# System message for code generation. Contains no placeholders and is used
# verbatim (never passed through .format()).
SYSTEM_PROMPT = """\
You are a robot control agent. You write Python code to control a robot arm.

RULES:
1. Write ONLY executable Python code. No explanations outside comments.
2. Wrap your code in ```python fences.
3. Use the documented API functions. Prefer high-level functions when available.
4. Import numpy as np if needed.
5. Keep code concise — one clear sequence of actions.

Example for picking up an object:
```python
import numpy as np
pos, quat = sample_grasp_pose("object_name")
open_gripper()
goto_pose(pos, quat, z_approach=0.1)
close_gripper()
lift = pos + np.array([0, 0, 0.2])
goto_pose(lift, quat)
```
"""


# Placeholders: {system_prompt}, {api_documentation}, {skill_section},
# {task_description}.
TASK_PROMPT = """\
{system_prompt}

Available API functions:
{api_documentation}

{skill_section}

Task: {task_description}
"""


# Placeholder: {skill_code}. Rendered text is meant for TASK_PROMPT's
# {skill_section} slot.
SKILL_SECTION = """\
Available utility functions (you can call these directly, they are already imported):
{skill_code}
"""


# Placeholders: {executed_code}, {stdout}, {stderr}, {reward}, {completed},
# {visual_diff_section}.
# The trailing backslash after "followed by" is a line continuation inside the
# string literal: the two source lines render as a single prompt line.
MULTITURN_DECISION_PROMPT = """\
The following code was just executed:

```python
{executed_code}
```

Console output:
stdout: {stdout}
stderr: {stderr}

Task reward: {reward}
Task completed: {completed}

{visual_diff_section}

If the task is NOT completed (reward < 1.0), respond with REGENERATE followed by \
improved Python code in ```python fences that fixes issues or tries a different approach.
If the task IS completed (reward = 1.0), respond with FINISH.

Your response MUST start with either REGENERATE or FINISH.
"""


# Placeholder: {visual_diff}. Rendered text is meant for
# MULTITURN_DECISION_PROMPT's {visual_diff_section} slot.
VISUAL_DIFF_SECTION = """\
Visual observation (what changed in the scene):
{visual_diff}
"""


# Placeholder: {task}. Asks for a description of a single scene observation.
VDM_SCENE_PROMPT = """\
You are observing a robot manipulation scene. The task is: {task}

Describe the current state of the scene in detail:
- What objects are visible and their approximate positions
- The state of the robot gripper (open/closed, holding anything)
- Any relevant spatial relationships between objects
- Colors, sizes, and orientations of key objects
"""


# Placeholder: {task}. Asks for a before/after comparison of two observations.
VDM_DIFF_PROMPT = """\
You are comparing two observations of a robot manipulation scene.
The task is: {task}

Image 1: Before the last action
Image 2: After the last action

Describe what changed:
- Did any objects move? In which direction and approximately how far?
- Did the gripper state change?
- Was the action successful or did something go wrong?
- Is the task closer to completion?
"""


def build_initial_prompt(
    task_description: str,
    api_documentation: str,
    skill_library_code: str = "",
) -> list[dict[str, str]]:
    """Build the initial prompt messages for code generation.

    Args:
        task_description: Text substituted into the ``Task:`` line of
            ``TASK_PROMPT``.
        api_documentation: Text substituted under ``Available API functions:``.
        skill_library_code: Optional text rendered through ``SKILL_SECTION``.
            When empty (the default) the skill section is left blank.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: a ``system``
        message carrying ``SYSTEM_PROMPT`` followed by a ``user`` message
        carrying the formatted ``TASK_PROMPT``.
    """
    skill_section = ""
    if skill_library_code:
        skill_section = SKILL_SECTION.format(skill_code=skill_library_code)

    # SYSTEM_PROMPT is sent as its own system message below, so the template's
    # {system_prompt} slot is deliberately filled with an empty string.
    user_content = TASK_PROMPT.format(
        system_prompt="",
        api_documentation=api_documentation,
        skill_section=skill_section,
        task_description=task_description,
    )

    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]


def build_multiturn_prompt(
    executed_code: str,
    stdout: str,
    stderr: str,
    visual_diff: str | None = None,
    reward: float = 0.0,
    task_completed: bool = False,
) -> list[dict[str, str]]:
    """Build multi-turn decision prompt after code execution.

    Args:
        executed_code: The code that was run; placed inside the python fence
            of ``MULTITURN_DECISION_PROMPT``.
        stdout: Captured standard output; a falsy value is shown as
            ``(empty)``.
        stderr: Captured standard error; a falsy value is shown as
            ``(empty)``.
        visual_diff: Optional scene-change description. When truthy it is
            rendered through ``VISUAL_DIFF_SECTION``; otherwise the visual
            section is left blank.
        reward: Task reward, rendered with two decimal places.
        task_completed: Rendered as ``Yes`` or ``No``.

    Returns:
        A single-element list holding one ``user`` message dict with the
        formatted prompt as its ``content``.
    """
    vis_section = ""
    if visual_diff:
        vis_section = VISUAL_DIFF_SECTION.format(visual_diff=visual_diff)

    content = MULTITURN_DECISION_PROMPT.format(
        executed_code=executed_code,
        # Show an explicit marker instead of a blank line for empty streams.
        stdout=stdout or "(empty)",
        stderr=stderr or "(empty)",
        reward=f"{reward:.2f}",
        completed="Yes" if task_completed else "No",
        visual_diff_section=vis_section,
    )

    return [{"role": "user", "content": content}]