| import os |
| import json |
| import base64 |
| from pathlib import Path |
| from tqdm import tqdm |
| from dotenv import load_dotenv |
|
|
| import decord |
| from decord import VideoReader |
|
|
| from openai import OpenAI |
|
|
load_dotenv()

# SECURITY: never hard-code an API key in source — the previous inline key is
# compromised by having been committed and must be rotated. load_dotenv() above
# populates the environment from a local .env file; fail fast (KeyError) here
# if OPENAI_API_KEY is missing rather than erroring on the first API call.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
|
|
|
|
| |
# --- Configuration -------------------------------------------------------
VIDEO_DIR = "./data/droid"             # directory scanned (non-recursively) for *.mp4 inputs
OUTPUT_FILE = "./output/labels.jsonl"  # JSONL output: one JSON object per labeled time step
FPS_SAMPLE = 2                         # target sampling rate (frames per second) per video
MODEL_NAME = "gpt-5-mini"              # chat-completions model used for labeling
|
|
| |
# System prompt sent with every request. It defines the 6-phase subtask
# taxonomy and the per-time-step JSON schema (t, stage, reward, delta,
# success_prob, failure, explanation) that call_model() expects to parse.
# NOTE: this is a runtime string consumed by the model — edit with care,
# downstream parsing in the main loop splats each returned dict verbatim.
PROMPT = """
You are a robot manipulation evaluator. You analyze the video step-by-step and determine progress toward task success.

The task usually follows 6 subtask phases in sequence:
1) reach - the robot moves toward the target object
2) grasp - the robot attempts to grasp or secure the object
3) up - the robot lifts the object from the surface
4) move - the robot moves the object toward the target location
5) place - the robot places/releases the object
6) return - the robot returns to a neutral/resting state (optional)

For each time step, output:
- stage: one of ["reach", "grasp", "up", "move", "place", "return"]
- reward (0 to 1): current progress toward the goal
- delta (-1 or 1): improvement vs previous step
- success_prob (0 to 1): probability the task will end successfully
- failure (0 or 1):
  - 1 if the robot enters an irreversible failure (object dropped, wrong target, motion stopped incorrectly, or no longer recoverable)
  - For the LAST time step: if success_prob is still low and the task is not achieved, force failure = 1
- explanation: short explanation describing what is happening and why the reward changes

Rules:
- The phase must progress forward logically (reach → grasp → up → move → place → return) unless failure occurs.
- reward should generally increase as progress is made and drop when mistakes occur.
- delta reflects positive, negative, or no progress change between steps.

Output JSON list only, no extra commentary:

[
  {
    "t": <int>,
    "stage": "<string>",
    "reward": <float>,
    "delta": <float>,
    "success_prob": <float>,
    "failure": <0 or 1>,
    "explanation": "<string>"
  }
]
"""
|
|
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
def extract_frames(video_path, fps=2):
    """Decode a video and sample frames at roughly ``fps`` frames per second.

    Parameters
    ----------
    video_path : str — path to the video file (decord handles decoding).
    fps : target sampling rate; the actual rate is approximate because the
        stride is rounded down to an integer number of native frames.

    Returns
    -------
    (frames, total_frames, indices) where ``frames`` is a numpy array of the
    sampled frames, ``total_frames`` is the full frame count of the video,
    and ``indices`` lists the native frame index of each sample.
    """
    reader = VideoReader(video_path)
    n_total = len(reader)
    # Stride between consecutive samples; clamp to >= 1 so that videos whose
    # native fps is below the target still yield every frame.
    stride = max(int(reader.get_avg_fps() / fps), 1)
    sample_idxs = list(range(0, n_total, stride))
    sampled = reader.get_batch(sample_idxs).asnumpy()
    return sampled, n_total, sample_idxs
|
|
|
|
def encode_image(image_array):
    """JPEG-compress an RGB numpy array and return it base64-encoded as an ASCII str."""
    import io

    from PIL import Image

    buffer = io.BytesIO()
    # Round-trip through Pillow to get an in-memory JPEG, then base64 it for
    # embedding in a data: URL.
    Image.fromarray(image_array).save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
|
|
|
def _strip_json_fences(text):
    """Remove optional Markdown code fences (```json ... ```) from a model reply.

    Chat models frequently wrap JSON output in fences even when instructed not
    to; feeding the raw content to json.loads would then raise JSONDecodeError
    and (in the main loop's broad except) silently drop the entire video.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (which may carry a language tag).
        text = text.split("\n", 1)[1] if "\n" in text else ""
        stripped = text.rstrip()
        if stripped.endswith("```"):
            text = stripped[: -len("```")]
    return text.strip()


def call_model(frames):
    """Label sampled frames with the chat model and return the parsed JSON list.

    Parameters
    ----------
    frames : iterable of image arrays (one per sampled time step); each is
        JPEG-encoded and sent as a base64 data URL.

    Returns
    -------
    list[dict] — one entry per time step, following the schema in PROMPT.

    Raises
    ------
    json.JSONDecodeError if the reply is not valid JSON even after stripping
    Markdown fences.
    """
    print(f"frames length: {len(frames)}")
    imgs = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encode_image(f)}"
            },
        }
        for f in frames
    ]

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": imgs},
        ],
    )

    text = response.choices[0].message.content
    return json.loads(_strip_json_fences(text))
|
|
|
|
| |
|
|
# Ensure the output directory exists before opening the JSONL file for writing.
os.makedirs(Path(OUTPUT_FILE).parent, exist_ok=True)


# Main labeling loop: one model call per video, one JSONL line per time step.
# OUTPUT_FILE is opened in "w" mode, so any previous run's labels are overwritten.
with open(OUTPUT_FILE, "w") as fout:
    for vid in tqdm(sorted(Path(VIDEO_DIR).glob("*.mp4"))):
        try:
            frames, total_frames, idxs = extract_frames(str(vid), FPS_SAMPLE)
            result = call_model(frames)


            # Write one record per time step returned by the model, augmented
            # with provenance (video id, native frame index, total frames).
            # NOTE(review): assumes the model returns at most len(idxs) entries;
            # if it returns more, idxs[i] raises IndexError and the whole video
            # is skipped by the except below — confirm this is acceptable.
            for i, step_data in enumerate(result):
                entry = {
                    "video_id": vid.stem,
                    "t": i,
                    "frame_index": int(idxs[i]),
                    "total_frames": int(total_frames),
                    **step_data
                }
                fout.write(json.dumps(entry) + "\n")


        # Best-effort batch processing: log the failure and continue with the
        # next video rather than aborting the whole run.
        except Exception as e:
            print(f"[ERROR] {vid}: {e}")
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|