| import os |
| import json |
| import base64 |
| from pathlib import Path |
| from tqdm import tqdm |
| from dotenv import load_dotenv |
|
|
| import decord |
| from decord import VideoReader |
|
|
| from openai import OpenAI |
|
|
load_dotenv()

# SECURITY: never hard-code an API key in source — the previous inline key is
# compromised by having been committed and must be rotated. load_dotenv() above
# populates the environment from a local .env file; fail fast (KeyError) here
# if OPENAI_API_KEY is missing rather than erroring on the first API call.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
|
|
|
|
| |
# --- Configuration -------------------------------------------------------
VIDEO_DIR = "./data/droid"             # directory scanned (non-recursively) for *.mp4 inputs
OUTPUT_FILE = "./output/labels.jsonl"  # JSONL output: one JSON object per labeled time step
FPS_SAMPLE = 2                         # target sampling rate (frames per second) per video
MODEL_NAME = "gpt-5-mini"              # chat-completions model used for labeling
|
|
| |
# System prompt sent with every request. It defines the 6-phase subtask
# taxonomy and the per-time-step JSON schema (t, stage, reward, delta,
# success_prob, failure, explanation) that call_model() expects to parse.
# NOTE: this is a runtime string consumed by the model — edit with care,
# downstream parsing in the main loop splats each returned dict verbatim.
PROMPT = """
You are a robot manipulation evaluator. You analyze the video step-by-step and determine progress toward task success.

The task usually follows 6 subtask phases in sequence:
1) reach - the robot moves toward the target object
2) grasp - the robot attempts to grasp or secure the object
3) up - the robot lifts the object from the surface
4) move - the robot moves the object toward the target location
5) place - the robot places/releases the object
6) return - the robot returns to a neutral/resting state (optional)

For each time step, output:
- stage: one of ["reach", "grasp", "up", "move", "place", "return"]
- reward (0 to 1): current progress toward the goal
- delta (-1 or 1): improvement vs previous step
- success_prob (0 to 1): probability the task will end successfully
- failure (0 or 1):
  - 1 if the robot enters an irreversible failure (object dropped, wrong target, motion stopped incorrectly, or no longer recoverable)
  - For the LAST time step: if success_prob is still low and the task is not achieved, force failure = 1
- explanation: short explanation describing what is happening and why the reward changes

Rules:
- The phase must progress forward logically (reach → grasp → up → move → place → return) unless failure occurs.
- reward should generally increase as progress is made and drop when mistakes occur.
- delta reflects positive, negative, or no progress change between steps.

Output JSON list only, no extra commentary:

[
  {
    "t": <int>,
    "stage": "<string>",
    "reward": <float>,
    "delta": <float>,
    "success_prob": <float>,
    "failure": <0 or 1>,
    "explanation": "<string>"
  }
]
"""
|
|
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
def extract_frames(video_path, fps=2):
    """Decode a video and sample frames at roughly ``fps`` frames per second.

    Parameters
    ----------
    video_path : str — path to the video file (decord handles decoding).
    fps : target sampling rate; the actual rate is approximate because the
        stride is rounded down to an integer number of native frames.

    Returns
    -------
    (frames, total_frames, indices) where ``frames`` is a numpy array of the
    sampled frames, ``total_frames`` is the full frame count of the video,
    and ``indices`` lists the native frame index of each sample.
    """
    reader = VideoReader(video_path)
    n_total = len(reader)
    # Stride between consecutive samples; clamp to >= 1 so that videos whose
    # native fps is below the target still yield every frame.
    stride = max(int(reader.get_avg_fps() / fps), 1)
    sample_idxs = list(range(0, n_total, stride))
    sampled = reader.get_batch(sample_idxs).asnumpy()
    return sampled, n_total, sample_idxs
|
|
|
|
def encode_image(image_array):
    """JPEG-compress an RGB numpy array and return it base64-encoded as an ASCII str."""
    import io

    from PIL import Image

    buffer = io.BytesIO()
    # Round-trip through Pillow to get an in-memory JPEG, then base64 it for
    # embedding in a data: URL.
    Image.fromarray(image_array).save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
|
|
|
def _strip_json_fences(text):
    """Remove optional Markdown code fences (```json ... ```) from a model reply.

    Chat models frequently wrap JSON output in fences even when instructed not
    to; feeding the raw content to json.loads would then raise JSONDecodeError
    and (in the main loop's broad except) silently drop the entire video.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (which may carry a language tag).
        text = text.split("\n", 1)[1] if "\n" in text else ""
        stripped = text.rstrip()
        if stripped.endswith("```"):
            text = stripped[: -len("```")]
    return text.strip()


def call_model(frames):
    """Label sampled frames with the chat model and return the parsed JSON list.

    Parameters
    ----------
    frames : iterable of image arrays (one per sampled time step); each is
        JPEG-encoded and sent as a base64 data URL.

    Returns
    -------
    list[dict] — one entry per time step, following the schema in PROMPT.

    Raises
    ------
    json.JSONDecodeError if the reply is not valid JSON even after stripping
    Markdown fences.
    """
    print(f"frames length: {len(frames)}")
    imgs = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encode_image(f)}"
            },
        }
        for f in frames
    ]

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": imgs},
        ],
    )

    text = response.choices[0].message.content
    return json.loads(_strip_json_fences(text))
|
|
|
|
| |
|
|
# Ensure the output directory exists before opening the JSONL file for writing.
os.makedirs(Path(OUTPUT_FILE).parent, exist_ok=True)


# Main labeling loop: one model call per video, one JSONL line per time step.
# OUTPUT_FILE is opened in "w" mode, so any previous run's labels are overwritten.
with open(OUTPUT_FILE, "w") as fout:
    for vid in tqdm(sorted(Path(VIDEO_DIR).glob("*.mp4"))):
        try:
            frames, total_frames, idxs = extract_frames(str(vid), FPS_SAMPLE)
            result = call_model(frames)


            # Write one record per time step returned by the model, augmented
            # with provenance (video id, native frame index, total frames).
            # NOTE(review): assumes the model returns at most len(idxs) entries;
            # if it returns more, idxs[i] raises IndexError and the whole video
            # is skipped by the except below — confirm this is acceptable.
            for i, step_data in enumerate(result):
                entry = {
                    "video_id": vid.stem,
                    "t": i,
                    "frame_index": int(idxs[i]),
                    "total_frames": int(total_frames),
                    **step_data
                }
                fout.write(json.dumps(entry) + "\n")


        # Best-effort batch processing: log the failure and continue with the
        # next video rather than aborting the whole run.
        except Exception as e:
            print(f"[ERROR] {vid}: {e}")
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|