project_naka / code /prompts.py
ilessio-aiflowlab's picture
Upload folder using huggingface_hub
665e529 verified
"""Prompt templates for CaP-Agent — system, task, multi-turn, VDM.
All prompts in one place. Templates use .format() for variable injection.
"""
# System-role message for the code-generating model. It states the output
# rules (executable Python only, wrapped in ```python fences) and shows one
# worked pick-up example.
# The text contains no {placeholders}; build_initial_prompt() passes it
# through unchanged as the "system" message.
SYSTEM_PROMPT = """\
You are a robot control agent. You write Python code to control a robot arm.
RULES:
1. Write ONLY executable Python code. No explanations outside comments.
2. Wrap your code in ```python fences.
3. Use the documented API functions. Prefer high-level functions when available.
4. Import numpy as np if needed.
5. Keep code concise — one clear sequence of actions.
Example for picking up an object:
```python
import numpy as np
pos, quat = sample_grasp_pose("object_name")
open_gripper()
goto_pose(pos, quat, z_approach=0.1)
close_gripper()
lift = pos + np.array([0, 0, 0.2])
goto_pose(lift, quat)
```
"""
# User-message template for the first turn.
# Placeholders: {system_prompt}, {api_documentation}, {skill_section},
# {task_description}.
# build_initial_prompt() fills {system_prompt} with "" (SYSTEM_PROMPT is sent
# there as a separate system message) and {skill_section} with either "" or a
# rendered SKILL_SECTION.
TASK_PROMPT = """\
{system_prompt}
Available API functions:
{api_documentation}
{skill_section}
Task: {task_description}
"""
# Optional fragment that is slotted into TASK_PROMPT's {skill_section}.
# Placeholder: {skill_code}. build_initial_prompt() renders it only when the
# caller supplies non-empty skill library code.
SKILL_SECTION = """\
Available utility functions (you can call these directly, they are already imported):
{skill_code}
"""
# Follow-up template shown after a piece of generated code has been executed.
# Placeholders: {executed_code}, {stdout}, {stderr}, {reward}, {completed},
# {visual_diff_section}.
# It instructs the model to start its reply with REGENERATE (plus new code)
# or FINISH. The trailing backslash inside the text joins the two
# "If the task is NOT completed ..." lines into one line of the rendered prompt.
MULTITURN_DECISION_PROMPT = """\
The following code was just executed:
```python
{executed_code}
```
Console output:
stdout: {stdout}
stderr: {stderr}
Task reward: {reward}
Task completed: {completed}
{visual_diff_section}
If the task is NOT completed (reward < 1.0), respond with REGENERATE followed by \
improved Python code in ```python fences that fixes issues or tries a different approach.
If the task IS completed (reward = 1.0), respond with FINISH.
Your response MUST start with either REGENERATE or FINISH.
"""
# Optional fragment that is slotted into MULTITURN_DECISION_PROMPT's
# {visual_diff_section}. Placeholder: {visual_diff}.
# build_multiturn_prompt() renders it only when a truthy visual_diff is given.
VISUAL_DIFF_SECTION = """\
Visual observation (what changed in the scene):
{visual_diff}
"""
# Template asking for a detailed description of the current scene state.
# Placeholder: {task}.
# NOTE(review): not referenced by the builder functions in this part of the
# file; presumably rendered by the VDM caller elsewhere — confirm.
VDM_SCENE_PROMPT = """\
You are observing a robot manipulation scene. The task is: {task}
Describe the current state of the scene in detail:
- What objects are visible and their approximate positions
- The state of the robot gripper (open/closed, holding anything)
- Any relevant spatial relationships between objects
- Colors, sizes, and orientations of key objects
"""
# Template asking for a description of what changed between two observations
# (labelled "Image 1" = before the last action, "Image 2" = after it).
# Placeholder: {task}.
# NOTE(review): not referenced by the builder functions in this part of the
# file; presumably its output feeds build_multiturn_prompt(visual_diff=...) —
# confirm against the caller.
VDM_DIFF_PROMPT = """\
You are comparing two observations of a robot manipulation scene.
The task is: {task}
Image 1: Before the last action
Image 2: After the last action
Describe what changed:
- Did any objects move? In which direction and approximately how far?
- Did the gripper state change?
- Was the action successful or did something go wrong?
- Is the task closer to completion?
"""
def build_initial_prompt(
    task_description: str,
    api_documentation: str,
    skill_library_code: str = "",
) -> list[dict[str, str]]:
    """Assemble the opening chat messages for a code-generation request.

    Args:
        task_description: Text substituted into TASK_PROMPT's
            ``{task_description}`` placeholder.
        api_documentation: Text substituted into ``{api_documentation}``.
        skill_library_code: Optional utility-function source. When non-empty
            it is wrapped in SKILL_SECTION; otherwise the skill section is
            rendered as an empty string.

    Returns:
        A two-element list: a ``"system"`` message carrying SYSTEM_PROMPT
        verbatim, followed by a ``"user"`` message carrying the rendered
        TASK_PROMPT.
    """
    # Render the skill block only when there is skill code to show.
    rendered_skills = (
        SKILL_SECTION.format(skill_code=skill_library_code)
        if skill_library_code
        else ""
    )

    system_message = {"role": "system", "content": SYSTEM_PROMPT}

    # SYSTEM_PROMPT travels in its own system message, so the template's
    # {system_prompt} slot is filled with an empty string. The line holding
    # that slot remains, so the user content starts with a blank line.
    user_message = {
        "role": "user",
        "content": TASK_PROMPT.format(
            system_prompt="",
            api_documentation=api_documentation,
            skill_section=rendered_skills,
            task_description=task_description,
        ),
    }

    return [system_message, user_message]
def build_multiturn_prompt(
    executed_code: str,
    stdout: str,
    stderr: str,
    visual_diff: str | None = None,
    reward: float = 0.0,
    task_completed: bool = False,
) -> list[dict[str, str]]:
    """Create the follow-up message shown to the model after code execution.

    Args:
        executed_code: The code that was run; echoed back inside the prompt.
        stdout: Captured standard output. A falsy value is shown as
            ``"(empty)"``.
        stderr: Captured standard error. A falsy value is shown as
            ``"(empty)"``.
        visual_diff: Optional description of the scene change. When truthy it
            is wrapped in VISUAL_DIFF_SECTION; otherwise that section is
            rendered as an empty string.
        reward: Task reward, rendered with two decimal places.
        task_completed: Rendered as ``"Yes"`` or ``"No"``.

    Returns:
        A single-element list containing one ``"user"`` message whose content
        is the rendered MULTITURN_DECISION_PROMPT.
    """
    if visual_diff:
        diff_block = VISUAL_DIFF_SECTION.format(visual_diff=visual_diff)
    else:
        diff_block = ""

    # Collect every template field in one mapping, then expand it in one call.
    placeholders = {
        "executed_code": executed_code,
        "stdout": stdout if stdout else "(empty)",
        "stderr": stderr if stderr else "(empty)",
        "reward": format(reward, ".2f"),
        "completed": "Yes" if task_completed else "No",
        "visual_diff_section": diff_block,
    }
    rendered = MULTITURN_DECISION_PROMPT.format(**placeholders)

    return [{"role": "user", "content": rendered}]