"""Prompt templates for CaP-Agent — system, task, multi-turn, VDM. All prompts in one place. Templates use .format() for variable injection. """ SYSTEM_PROMPT = """\ You are a robot control agent. You write Python code to control a robot arm. RULES: 1. Write ONLY executable Python code. No explanations outside comments. 2. Wrap your code in ```python fences. 3. Use the documented API functions. Prefer high-level functions when available. 4. Import numpy as np if needed. 5. Keep code concise — one clear sequence of actions. Example for picking up an object: ```python import numpy as np pos, quat = sample_grasp_pose("object_name") open_gripper() goto_pose(pos, quat, z_approach=0.1) close_gripper() lift = pos + np.array([0, 0, 0.2]) goto_pose(lift, quat) ``` """ TASK_PROMPT = """\ {system_prompt} Available API functions: {api_documentation} {skill_section} Task: {task_description} """ SKILL_SECTION = """\ Available utility functions (you can call these directly, they are already imported): {skill_code} """ MULTITURN_DECISION_PROMPT = """\ The following code was just executed: ```python {executed_code} ``` Console output: stdout: {stdout} stderr: {stderr} Task reward: {reward} Task completed: {completed} {visual_diff_section} If the task is NOT completed (reward < 1.0), respond with REGENERATE followed by \ improved Python code in ```python fences that fixes issues or tries a different approach. If the task IS completed (reward = 1.0), respond with FINISH. Your response MUST start with either REGENERATE or FINISH. """ VISUAL_DIFF_SECTION = """\ Visual observation (what changed in the scene): {visual_diff} """ VDM_SCENE_PROMPT = """\ You are observing a robot manipulation scene. The task is: {task} Describe the current state of the scene in detail: - What objects are visible and their approximate positions - The state of the robot gripper (open/closed, holding anything) - Any relevant spatial relationships between objects - Colors, sizes, and orientations of key objects """ VDM_DIFF_PROMPT = """\ You are comparing two observations of a robot manipulation scene. The task is: {task} Image 1: Before the last action Image 2: After the last action Describe what changed: - Did any objects move? In which direction and approximately how far? - Did the gripper state change? - Was the action successful or did something go wrong? - Is the task closer to completion? """ def build_initial_prompt( task_description: str, api_documentation: str, skill_library_code: str = "", ) -> list[dict[str, str]]: """Build the initial prompt messages for code generation.""" skill_section = "" if skill_library_code: skill_section = SKILL_SECTION.format(skill_code=skill_library_code) user_content = TASK_PROMPT.format( system_prompt="", api_documentation=api_documentation, skill_section=skill_section, task_description=task_description, ) return [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_content}, ] def build_multiturn_prompt( executed_code: str, stdout: str, stderr: str, visual_diff: str | None = None, reward: float = 0.0, task_completed: bool = False, ) -> list[dict[str, str]]: """Build multi-turn decision prompt after code execution.""" vis_section = "" if visual_diff: vis_section = VISUAL_DIFF_SECTION.format(visual_diff=visual_diff) content = MULTITURN_DECISION_PROMPT.format( executed_code=executed_code, stdout=stdout or "(empty)", stderr=stderr or "(empty)", reward=f"{reward:.2f}", completed="Yes" if task_completed else "No", visual_diff_section=vis_section, ) return [{"role": "user", "content": content}]