Spaces:

codecrypt112
/

openenv-hackathon-ctrlaltwin-tiffenpacker

Running

openenv-hackathon-ctrlaltwin-tiffenpacker / inference.py

vikash-nuvai

fix: clamp inference score between 0.0001 and 0.9999 to pass strictly bounded validation

6a5b308 about 1 month ago

12 kB

	#!/usr/bin/env python3
	# Copyright (c) 2026 CtrlAltWin Team
	"""
	Tiffin Packer — OpenEnv Inference Script.

	Runs an LLM agent against the tiffin packing environment using the
	OpenAI Client API with environment variables:
	API_BASE_URL — The API endpoint for the LLM
	MODEL_NAME — The model identifier for inference
	HF_TOKEN — Hugging Face / API key

	Usage:
	API_BASE_URL=https://api.openai.com/v1 \
	MODEL_NAME=gpt-4o \
	HF_TOKEN=your-key \
	python inference.py
	"""

	import json
	import os
	import sys
	import time
	import traceback

	import requests
	from openai import OpenAI

	# ---------------------------------------------------------------------------
	# Required environment variables
	# ---------------------------------------------------------------------------
	API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
	MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o")
	HF_TOKEN = os.environ.get("HF_TOKEN", "")
	ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")

	if not HF_TOKEN:
	print("WARNING: HF_TOKEN not set. LLM calls will fail.", flush=True)

	client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

	# ---------------------------------------------------------------------------
	# System prompt
	# ---------------------------------------------------------------------------
	SYSTEM_PROMPT = """You are a tiffin packing assistant that controls a robotic arm.
	Your goal: pack Indian meal items into the correct tiffin containers.

	COMMANDS — respond with ONLY a JSON object, no other text:
	{"command": "observe"} — See the full scene
	{"command": "identify", "target_id": N} — Classify food item N using VLM
	{"command": "pick", "target_id": N} — Pick up food item N
	{"command": "place", "target_id": N} — Place held item into container N
	{"command": "pour", "target_id": N} — Pour held liquid into container N

	PACKING RULES:
	1. ALWAYS identify items before packing (you cannot see food properties otherwise)
	2. Liquids (sambar, dal, rasam, curry) → sealed containers only
	3. Solids (rice, chapati, idli) → any container type
	4. Semi-solids (curd, pickle, chutney) → sealed containers preferred
	5. FRAGILE items (papad=0.9, chapati=0.7) → don't crush under heavy items
	6. HOT and COLD food must NOT share a container
	7. Don't overflow containers — check volume math!
	8. Strong-flavor items (pickle, chutney) should be isolated

	STRATEGY:
	1. First: observe the scene
	2. Then: identify ALL food items (one by one)
	3. Then: plan which food goes where based on constraints
	4. Finally: pick and place/pour each item

	Respond with ONLY valid JSON. No explanation, no markdown, no extra text."""




	def parse_action(text: str) -> dict:
	"""Parse LLM output into an action dict."""
	text = text.strip()

	# Try to extract JSON from the text
	if text.startswith("```"):
	# Handle markdown code blocks
	lines = text.split("\n")
	json_lines = [l for l in lines if not l.startswith("```")]
	text = "\n".join(json_lines).strip()

	# Try direct JSON parse
	try:
	action = json.loads(text)
	if "command" in action:
	return action
	except json.JSONDecodeError:
	pass

	# Try to find JSON in the text
	for i in range(len(text)):
	if text[i] == "{":
	for j in range(len(text) - 1, i, -1):
	if text[j] == "}":
	try:
	action = json.loads(text[i : j + 1])
	if "command" in action:
	return action
	except json.JSONDecodeError:
	continue

	# Fallback
	print(f" [WARN] Could not parse action: {text[:100]}", flush=True)
	return {"command": "observe"}


	def run_episode(task_id: str) -> dict:
	"""Run one episode of the tiffin packing task."""
	# Emit [START] structured output for the validator
	print(f"[START] task={task_id}", flush=True)

	step = 0

	try:
	print(f"\n{'='*60}", flush=True)
	print(f" TASK: {task_id.upper()}", flush=True)
	print(f"{'='*60}", flush=True)

	# Reset the environment
	try:
	resp = requests.post(
	f"{ENV_URL}/reset",
	json={"task_id": task_id, "seed": 42},
	timeout=30,
	)
	resp.raise_for_status()
	result = resp.json()
	obs = result.get("observation", result)
	except Exception as e:
	print(f" ERROR: Failed to reset environment: {e}", flush=True)
	print(f"[END] task={task_id} score=0.0001 steps=0", flush=True)
	return {"task_id": task_id, "total_reward": 0.0, "reward": 0.0, "score": 0.0001, "steps": 0, "error": str(e)}

	# Initialize conversation
	init_scene = obs.get("scene_description", "")
	init_feedback = obs.get("step_feedback", "")
	messages = [
	{"role": "system", "content": SYSTEM_PROMPT},
	{
	"role": "user",
	"content": (
	f"Task: {task_id}\n\n"
	f"{init_feedback}\n\n"
	f"Scene:\n{init_scene}\n\n"
	f"Available commands: {obs.get('available_commands', [])}\n\n"
	f"What is your first action? Respond with JSON only."
	),
	},
	]

	total_reward = 0.0
	max_steps = 35 # safety limit

	while not obs.get("done", False) and step < max_steps:
	step += 1

	# Get LLM decision
	try:
	response = client.chat.completions.create(
	model=MODEL_NAME,
	messages=messages,
	temperature=0.0,
	max_tokens=200,
	)
	action_text = response.choices[0].message.content.strip()
	except Exception as e:
	print(f" [Step {step}] LLM error: {e}", flush=True)
	action_text = '{"command": "observe"}'

	action = parse_action(action_text)
	print(f" [Step {step}] Action: {json.dumps(action)}", flush=True)

	# Execute step
	try:
	resp = requests.post(
	f"{ENV_URL}/step",
	json={"action": action},
	timeout=30,
	)
	resp.raise_for_status()
	result = resp.json()
	obs = result.get("observation", result)
	reward = result.get("reward", obs.get("reward", 0.0))
	total_reward += reward or 0
	# Emit [STEP] structured output for the validator
	print(f"[STEP] step={step} reward={reward}", flush=True)
	except Exception as e:
	print(f" [Step {step}] Step error: {e}", flush=True)
	break

	# Print feedback
	feedback = obs.get("step_feedback", "")[:200]
	print(f" Reward: {reward:+.2f} \| Feedback: {feedback}", flush=True)

	# Update conversation with assistant response and new observation
	messages.append({"role": "assistant", "content": action_text})

	# Build concise next observation for LLM
	held = obs.get("held_item")
	held_str = (
	f"Holding: {held.get('name', 'unknown')}" if held else "Arm: idle"
	)
	items_status = [
	f"[{i['id']}] {i.get('name', '?')} ({i['status']})"
	for i in obs.get("food_items", [])
	]
	containers_status = [
	f"[{c['id']}] {c['name']} {c.get('fill_percentage',0):.0f}% full"
	for c in obs.get("containers", [])
	]

	messages.append(
	{
	"role": "user",
	"content": (
	f"Step {step} result (reward={reward:+.2f}):\n"
	f"Feedback: {obs.get('step_feedback', '')}\n\n"
	f"{held_str}\n"
	f"Items: {', '.join(items_status)}\n"
	f"Containers: {', '.join(containers_status)}\n"
	f"Available: {obs.get('available_commands', [])}\n\n"
	f"{'VLM Result: ' + json.dumps(obs.get('vlm_result')) if obs.get('vlm_result') else ''}\n\n"
	f"Next action? JSON only."
	),
	},
	)

	# Extract final score
	final_score = obs.get("metadata", {}).get("final_score", 0.0)
	# Ensure score is strictly between 0 and 1 (exclusive) for the validator
	final_score = max(0.0001, min(0.9999, float(final_score)))
	grade_breakdown = obs.get("metadata", {}).get("grade_breakdown", {})

	print(f"\n {'─'*40}", flush=True)
	print(f" Steps taken: {step}", flush=True)
	print(f" Total reward: {total_reward:+.2f}", flush=True)
	print(f" Final score: {final_score:.4f}", flush=True)
	if grade_breakdown:
	print(f" Breakdown:", flush=True)
	print(f" Validity: {grade_breakdown.get('validity', 0):.4f} (x0.4)", flush=True)
	print(f" Efficiency: {grade_breakdown.get('efficiency', 0):.4f} (x0.3)", flush=True)
	print(f" Constraints: {grade_breakdown.get('constraints', 0):.4f} (x0.2)", flush=True)
	print(f" Neatness: {grade_breakdown.get('neatness', 0):.4f} (x0.1)", flush=True)

	# Emit [END] structured output for the validator
	print(f"[END] task={task_id} score={final_score} steps={step}", flush=True)

	return {
	"task_id": task_id,
	"steps": step,
	"total_reward": round(total_reward, 4),
	"score": final_score,
	"grade_breakdown": grade_breakdown,
	}

	except Exception as e:
	# Catch-all: ensure [END] is ALWAYS emitted even on unexpected errors
	print(f" FATAL ERROR in episode {task_id}: {e}", flush=True)
	traceback.print_exc()
	print(f"[END] task={task_id} score=0.0001 steps={step}", flush=True)
	return {"task_id": task_id, "total_reward": 0.0, "reward": 0.0, "score": 0.0001, "steps": step, "error": str(e)}


	def main():
	"""Run all 3 tasks and report results."""
	print("=" * 60, flush=True)
	print(" TIFFIN PACKER — INFERENCE SCRIPT", flush=True)
	print(f" Model: {MODEL_NAME}", flush=True)
	print(f" API: {API_BASE_URL}", flush=True)
	print(f" Env: {ENV_URL}", flush=True)
	print("=" * 60, flush=True)



	start_time = time.time()
	results = {}

	for task_id in ["easy", "medium", "hard"]:
	result = run_episode(task_id)
	results[task_id] = result

	elapsed = time.time() - start_time

	# Summary
	print("\n" + "=" * 60, flush=True)
	print(" FINAL RESULTS", flush=True)
	print("=" * 60, flush=True)
	for task_id, r in results.items():
	print(f" {task_id:8s}: score={r['score']:.4f} reward={r['total_reward']:+.2f} steps={r.get('steps', '?')}", flush=True)

	avg_score = sum(r["score"] for r in results.values()) / max(len(results), 1)
	print(f"\n Average score: {avg_score:.4f}", flush=True)
	print(f" Total time: {elapsed:.1f}s", flush=True)

	# Save results
	os.makedirs("outputs/evals", exist_ok=True)
	with open("outputs/evals/results.json", "w") as f:
	json.dump(
	{
	"model": MODEL_NAME,
	"api_base_url": API_BASE_URL,
	"results": results,
	"average_score": avg_score,
	"elapsed_seconds": round(elapsed, 1),
	},
	f,
	indent=2,
	)
	print(f"\n Results saved to outputs/evals/results.json", flush=True)


	if __name__ == "__main__":
	main()