Upload folder using huggingface_hub

665e529 verified 12 days ago

3.81 kB

	"""Prompt templates for CaP-Agent — system, task, multi-turn, VDM.

	All prompts in one place. Templates use .format() for variable injection.
	"""

	# System-role instructions for the code-writing model: answer only with
	# fenced, executable Python that uses the documented API, followed by a short
	# pick-up example. build_initial_prompt sends this text as the "system"
	# message. The backslash right after the opening quotes keeps the string from
	# starting with a newline. This template has no format placeholders.
	SYSTEM_PROMPT = """\
	You are a robot control agent. You write Python code to control a robot arm.

	RULES:
	1. Write ONLY executable Python code. No explanations outside comments.
	2. Wrap your code in ```python fences.
	3. Use the documented API functions. Prefer high-level functions when available.
	4. Import numpy as np if needed.
	5. Keep code concise — one clear sequence of actions.

	Example for picking up an object:
	```python
	import numpy as np
	pos, quat = sample_grasp_pose("object_name")
	open_gripper()
	goto_pose(pos, quat, z_approach=0.1)
	close_gripper()
	lift = pos + np.array([0, 0, 0.2])
	goto_pose(lift, quat)
	```
	"""

	# Template for the task message. Placeholders filled via str.format():
	#   {system_prompt}     - leading text; build_initial_prompt passes "" here
	#   {api_documentation} - description of the callable API functions
	#   {skill_section}     - rendered SKILL_SECTION, or "" when there is none
	#   {task_description}  - the task statement
	TASK_PROMPT = """\
	{system_prompt}

	Available API functions:
	{api_documentation}

	{skill_section}

	Task: {task_description}
	"""

	# Optional block substituted into TASK_PROMPT's {skill_section} slot.
	# Placeholder: {skill_code} - source text of the utility functions to list.
	SKILL_SECTION = """\
	Available utility functions (you can call these directly, they are already imported):
	{skill_code}
	"""

	# Follow-up template shown after a piece of code has been executed. It reports
	# the code, its console output, the reward and the completion flag, then asks
	# for a reply that starts with REGENERATE (plus new code) or FINISH.
	# Placeholders: {executed_code}, {stdout}, {stderr}, {reward}, {completed},
	# {visual_diff_section}. The trailing backslash after "followed by" joins that
	# line with the next one inside the resulting string.
	MULTITURN_DECISION_PROMPT = """\
	The following code was just executed:

	```python
	{executed_code}
	```

	Console output:
	stdout: {stdout}
	stderr: {stderr}

	Task reward: {reward}
	Task completed: {completed}

	{visual_diff_section}

	If the task is NOT completed (reward < 1.0), respond with REGENERATE followed by \
	improved Python code in ```python fences that fixes issues or tries a different approach.
	If the task IS completed (reward = 1.0), respond with FINISH.

	Your response MUST start with either REGENERATE or FINISH.
	"""

	# Optional block substituted into MULTITURN_DECISION_PROMPT's
	# {visual_diff_section} slot by build_multiturn_prompt when a visual diff is
	# supplied. Placeholder: {visual_diff} - text describing the scene change.
	VISUAL_DIFF_SECTION = """\
	Visual observation (what changed in the scene):
	{visual_diff}
	"""

	# Template asking for a detailed description of the current scene.
	# Placeholder: {task} - the task statement.
	# Not used by the builder functions visible in this part of the file.
	VDM_SCENE_PROMPT = """\
	You are observing a robot manipulation scene. The task is: {task}

	Describe the current state of the scene in detail:
	- What objects are visible and their approximate positions
	- The state of the robot gripper (open/closed, holding anything)
	- Any relevant spatial relationships between objects
	- Colors, sizes, and orientations of key objects
	"""

	# Template asking what changed between two observations, referred to in the
	# text as "Image 1" (before the last action) and "Image 2" (after it).
	# Placeholder: {task} - the task statement.
	# Not used by the builder functions visible in this part of the file.
	VDM_DIFF_PROMPT = """\
	You are comparing two observations of a robot manipulation scene.
	The task is: {task}

	Image 1: Before the last action
	Image 2: After the last action

	Describe what changed:
	- Did any objects move? In which direction and approximately how far?
	- Did the gripper state change?
	- Was the action successful or did something go wrong?
	- Is the task closer to completion?
	"""


	def build_initial_prompt(
	    task_description: str,
	    api_documentation: str,
	    skill_library_code: str = "",
	) -> list[dict[str, str]]:
	    """Assemble the opening system/user message pair for code generation.

	    Args:
	        task_description: Text placed after "Task:" in the user message.
	        api_documentation: Text listed under "Available API functions:".
	        skill_library_code: Source of extra utility functions; when empty,
	            the skill section is left out of the user message.

	    Returns:
	        Two chat messages: a "system" message carrying SYSTEM_PROMPT and a
	        "user" message carrying the filled-in TASK_PROMPT.
	    """
	    # Render the skill block only when there is library code to show.
	    rendered_skills = (
	        SKILL_SECTION.format(skill_code=skill_library_code)
	        if skill_library_code
	        else ""
	    )

	    # The template's system_prompt slot stays blank: the system text is
	    # delivered as its own message below.
	    task_message = TASK_PROMPT.format(
	        system_prompt="",
	        api_documentation=api_documentation,
	        skill_section=rendered_skills,
	        task_description=task_description,
	    )

	    system_entry = {"role": "system", "content": SYSTEM_PROMPT}
	    user_entry = {"role": "user", "content": task_message}
	    return [system_entry, user_entry]


	def build_multiturn_prompt(
	    executed_code: str,
	    stdout: str,
	    stderr: str,
	    visual_diff: str | None = None,
	    reward: float = 0.0,
	    task_completed: bool = False,
	) -> list[dict[str, str]]:
	    """Build the multi-turn decision prompt shown after code execution.

	    Args:
	        executed_code: The code that was just run.
	        stdout: Captured standard output; shown as "(empty)" when falsy.
	        stderr: Captured standard error; shown as "(empty)" when falsy.
	        visual_diff: Optional description of what changed in the scene.
	            When falsy (None or ""), the visual section is omitted.
	        reward: Task reward, rendered with two decimal places.
	        task_completed: Rendered as "Yes" when true, otherwise "No".

	    Returns:
	        A single-element list holding one "user" chat message whose content
	        is the filled-in MULTITURN_DECISION_PROMPT.
	    """
	    vis_section = ""
	    if visual_diff:
	        vis_section = VISUAL_DIFF_SECTION.format(visual_diff=visual_diff)

	    content = MULTITURN_DECISION_PROMPT.format(
	        executed_code=executed_code,
	        # Substitute a visible marker so blank output is not ambiguous.
	        stdout=stdout or "(empty)",
	        stderr=stderr or "(empty)",
	        reward=f"{reward:.2f}",
	        completed="Yes" if task_completed else "No",
	        visual_diff_section=vis_section,
	    )

	    return [{"role": "user", "content": content}]