| import os |
| import json |
| import base64 |
| from pathlib import Path |
| from tqdm import tqdm |
| from dotenv import load_dotenv |
|
|
| from decord import VideoReader |
| from openai import OpenAI |
| from PIL import Image |
| import io |
|
|
# Load variables from a .env file (if one is present) so OPENAI_API_KEY can be
# read from the environment when the client is built.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Dataset directory that is searched recursively for recordings.
ROOT = Path("/playpen-ssd/dataset/droid_raw/1.0.1/AUTOLab/failure")
# File name matched under every "*/recordings/MP4/" folder below ROOT.
TARGET_NAME = "22008760.mp4"

# Window of the sorted video list to process: skip START_INDEX videos, then
# take at most MAX_VIDEOS (None means "everything from START_INDEX on").
START_INDEX = 20
MAX_VIDEOS = 10

# Default sampling rate, in frames per second, for extract_frames.
FPS_SAMPLE = 2
# Model name passed to the chat-completions request in call_model.
MODEL_NAME = "gpt-5-mini"
# JSONL file the per-step labels are written to (opened in "w" mode).
OUTPUT_FILE = "./output/labels_batch_1111.jsonl"
|
|
| |
# System prompt sent with every request made by call_model. It asks the model
# for a JSON list with one record per time step; the processing loop copies
# each record's keys into the corresponding output JSONL entry.
| PROMPT = """ |
| You are a robot manipulation evaluator analyzing the video step-by-step. |
| |
| The task is not only to complete the movement, but also to ensure correct handling of the object. |
| A task is only considered SUCCESSFUL if: |
| - The object is securely grasped (not slipping), |
| - It is moved without spilling, dropping, or losing control, |
| - And it is placed correctly and stably at the target location. |
| |
| If the object is spilled, dropped, placed incorrectly, tipped, or ends up unstable → the episode is FAILURE even if the robot completed the motions. |
| |
| The task typically progresses through these phases: |
| 1) reach : robot moves toward object |
| 2) grasp : robot attempts to secure object |
| 3) up : robot lifts object |
| 4) move : robot carries object toward goal |
| 5) place : robot releases or places object carefully |
| 6) return : optional return to neutral state |
| |
| For each time step, output: |
| { |
| "t": <index>, |
| "stage": "reach" | "grasp" | "up" | "move" | "place" | "return", |
| "reward": <0 to 1>, |
| "delta": <-1 to 1>, |
| "success_prob": <0 to 1>, |
| "failure": <0 or 1>, |
| "explanation": "<brief reasoning>" |
| } |
| |
| Rules: |
| - Stage should progress logically unless failure occurs. |
| - reward increases as progress improves and decreases when mistakes occur. |
| - If object is dropped, spilled, crushed, knocked over, or final state is unstable → failure = 1. |
| - For the LAST time step: |
| If success_prob is low OR the object is not placed correctly/stably, |
| FORCE failure = 1. |
| |
| Output JSON LIST only. No extra commentary. |
| """ |
|
|
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def extract_frames(video_path, fps=FPS_SAMPLE):
    """Sample frames from a video at roughly ``fps`` frames per second.

    Every ``step``-th frame is taken, where ``step`` is the video's average
    frame rate divided by ``fps`` (truncated, never below 1). The final frame
    is always included so the end state of the episode is part of the sample.

    Args:
        video_path: Path of the video file, handed to ``VideoReader``.
        fps: Target sampling rate in frames per second; must be positive.

    Returns:
        A ``(frames, total_frames, idxs)`` tuple: the sampled frames as
        returned by ``get_batch(idxs).asnumpy()``, the number of frames in
        the video, and the ascending list of sampled frame indices
        (``frames[k]`` corresponds to frame ``idxs[k]``).

    Raises:
        ValueError: If ``fps`` is not positive or the video has no frames.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps!r}")

    vr = VideoReader(video_path)
    total_frames = len(vr)
    if total_frames == 0:
        # Without this guard the "always include the last frame" step below
        # would request frame index -1.
        raise ValueError(f"video has no frames: {video_path}")

    native_fps = vr.get_avg_fps()
    step = max(int(native_fps / fps), 1)

    idxs = list(range(0, total_frames, step))
    # range() is already ascending and duplicate-free, so only the tail has
    # to be checked to guarantee the last frame is present.
    last_index = total_frames - 1
    if idxs[-1] != last_index:
        idxs.append(last_index)

    frames = vr.get_batch(idxs).asnumpy()
    return frames, total_frames, idxs
|
|
|
|
def encode_image(image_array):
    """Return ``image_array`` as a base64 string of its JPEG encoding.

    The array is turned into a PIL image, written to an in-memory buffer in
    JPEG format, and the resulting bytes are base64-encoded and decoded to
    text so they can be embedded in a ``data:image/jpeg;base64,...`` URL.
    """
    with io.BytesIO() as jpeg_buffer:
        Image.fromarray(image_array).save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()
    return base64.b64encode(jpeg_bytes).decode("utf-8")
|
|
|
|
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown ``` fence, if present."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    body = text[3:]
    if body.endswith("```"):
        body = body[:-3]
    # The opening fence line may carry an info string such as "json"; drop
    # that line unless the JSON payload itself starts right after the fence.
    first_line, newline, rest = body.partition("\n")
    if newline and not first_line.strip().startswith(("[", "{")):
        body = rest
    return body.strip()


def call_model(frames):
    """Send the sampled frames to the chat model and return its parsed reply.

    Each frame is attached as a base64 JPEG data URL in a single user
    message; ``PROMPT`` is sent as the system message.

    Args:
        frames: Iterable of image arrays accepted by ``encode_image``.

    Returns:
        The reply text decoded with ``json.loads``. ``PROMPT`` asks for a
        JSON list of per-step objects, but the shape is not validated here.

    Raises:
        ValueError: If the reply carries no text, or the text is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(f)}"},
        }
        for f in frames
    ]

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": image_parts},
        ],
    )

    content = response.choices[0].message.content
    # The content field may be None; fail with a clear message instead of
    # the TypeError json.loads(None) would raise.
    if not content or not content.strip():
        raise ValueError("model reply contained no text content")

    # Replies are sometimes wrapped in a ```json fence despite the prompt.
    return json.loads(_strip_code_fence(content))
|
|
|
|
| |
|
|
| |
| |
| |
|
|
| |
|
|
# Collect every matching recording under ROOT in sorted order, so that
# START_INDEX / MAX_VIDEOS select the same window on every run.
video_files = sorted(ROOT.rglob(f"*/recordings/MP4/{TARGET_NAME}"))
total_videos = len(video_files)

if START_INDEX >= total_videos:
    raise ValueError(f"START_INDEX {START_INDEX} 超出视频总数 {total_videos}")

# MAX_VIDEOS=None means "no upper bound": slice to the end of the list.
stop_index = None if MAX_VIDEOS is None else START_INDEX + MAX_VIDEOS
video_files = video_files[START_INDEX:stop_index]

print(f"Found {len(video_files)} videos to process (from index {START_INDEX}):")
for v in video_files:
    print(" -", v)

Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)

with open(OUTPUT_FILE, "w", encoding="utf-8") as fout:
    for vid_path in tqdm(video_files, desc="Processing videos"):
        # Best effort: a failure on one video is reported and the run
        # continues with the next one.
        try:
            frames, total_frames, idxs = extract_frames(str(vid_path))
            result = call_model(frames)

            if not isinstance(result, list):
                raise ValueError(
                    "expected a JSON list from the model, "
                    f"got {type(result).__name__}"
                )
            if len(result) != len(idxs):
                # zip() below pairs steps with frames up to the shorter of
                # the two, so surplus steps cannot index past idxs.
                print(
                    f"[WARN] {vid_path}: model returned {len(result)} steps "
                    f"for {len(idxs)} frames; unmatched items are ignored"
                )

            # Build every line first so a malformed step cannot leave a
            # half-written video in the output file.
            lines = []
            for i, (frame_index, step_data) in enumerate(zip(idxs, result)):
                meta = {
                    "video_path": str(vid_path),
                    "video_id": vid_path.stem,
                    "t": i,
                    "frame_index": int(frame_index),
                    "total_frames": int(total_frames),
                }
                # First spread fixes the key order; last spread makes the
                # script's bookkeeping win over same-named model keys, so
                # "t" always agrees with "frame_index".
                entry = {**meta, **step_data, **meta}
                lines.append(json.dumps(entry) + "\n")

            fout.writelines(lines)
            # Flush per video so finished labels survive a later crash.
            fout.flush()

        except Exception as e:
            print(f"[ERROR] {vid_path}: {e}")
|
|