# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import json
import os
from collections import OrderedDict
from os.path import expanduser
from typing import Iterable, Optional

import cv2
import numpy as np

CURRENT_DIR = os.path.dirname(__file__)
RESOLUTION = (480, 480)
home = expanduser("~")

# Adjust these to wherever your annotations and videos are stored.
ROOT = "/datasets01/ego4d_track2/"
LABEL_ROOT = ROOT + "v2_1/annotations/fho_main.json"
VIDEO_PATH = ROOT + "v2_1/full_scale/"
def parse_video_frame(video_path, frame_id):
    """Decode a single (1-indexed) frame from a video; returns None if the read fails."""
    cap = cv2.VideoCapture(video_path)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id - 1)
    ret, frame = cap.read()  # frame is None when ret is False
    cap.release()
    return frame
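# Usage sketch (the uid below is hypothetical; any full-scale Ego4D mp4 works):
def _demo_parse_video_frame():
    frame = parse_video_frame(VIDEO_PATH + "some_video_uid.mp4", frame_id=100)
    if frame is not None:
        print(frame.shape)  # e.g. (1080, 1440, 3); OpenCV decodes to BGR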
def parse_raw_video(video_path):
    """Decode every frame of a video into a list of BGR arrays."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames
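# A lazier variant, added here only as a sketch (not part of the original
# pipeline): full-scale Ego4D videos can be long, so yielding frames avoids
# holding the entire decoded video in memory the way parse_raw_video does.
def iter_video_frames(video_path, max_frames=None):
    cap = cv2.VideoCapture(video_path)
    try:
        count = 0
        while max_frames is None or count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            yield frame
            count += 1
    finally:
        cap.release()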
def compute_state_and_actions(image, curr_frame, next_frame, frame_idx, save=False):
    """Build a state/action pair from hand boxes in two consecutive frames.

    curr_frame / next_frame are lists of bounding-box annotations; the state is
    the normalized left/right hand centers in the current frame, and the action
    is the normalized hand centers in the next frame.
    """
    img_width, img_height = image.shape[1], image.shape[0]
    for box in curr_frame:
        if box['object_type'] == 'left_hand':
            curr_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
        if box['object_type'] == 'right_hand':
            curr_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
    for box in next_frame:
        if box['object_type'] == 'left_hand':
            next_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
        if box['object_type'] == 'right_hand':
            next_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
    # Normalize the centers to [0, 1] image coordinates. If a hand is missing
    # in either frame, the corresponding name is unbound and this raises a
    # NameError, which the caller catches and skips.
    curr_hand1_center = np.array([curr_hand1_center[0] / img_width, curr_hand1_center[1] / img_height])
    curr_hand2_center = np.array([curr_hand2_center[0] / img_width, curr_hand2_center[1] / img_height])
    next_hand1_center = np.array([next_hand1_center[0] / img_width, next_hand1_center[1] / img_height])
    next_hand2_center = np.array([next_hand2_center[0] / img_width, next_hand2_center[1] / img_height])
    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate((next_hand1_center, next_hand2_center))
    if save:
        # Draw the current (green) and next (red) hand centers for a visual check.
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        # Save the annotated image.
        cv2.imwrite(f"/private/home/xinleic/LR/hpt_video/data/ego4d_video_label_check/img_{frame_idx}.png", image)
    return state, action
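# Minimal sketch of the state/action layout with synthetic boxes in the
# fho_main annotation format (values are illustrative, not dataset data):
def _demo_compute_state_and_actions():
    image = np.zeros((480, 640, 3), dtype=np.uint8)
    boxes = [
        {"object_type": "left_hand",
         "bbox": {"x": 100, "y": 200, "width": 50, "height": 40}},
        {"object_type": "right_hand",
         "bbox": {"x": 300, "y": 210, "width": 60, "height": 50}},
    ]
    s, a = compute_state_and_actions(image, boxes, boxes, 0)
    # Both are 4-vectors: normalized [left_cx, left_cy, right_cx, right_cy].
    print(s)  # approx [0.1953 0.4583 0.5156 0.4896]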
def chunk_actions_and_concatenate(actions):
    """Split actions into chunks of 4 and concatenate each chunk's frames."""
    chunk_size = 4
    # Drop the last chunk since it may be partial. Note this also drops a
    # complete final chunk when len(actions) is a multiple of chunk_size.
    chunked_actions = [actions[i:i + chunk_size] for i in range(0, len(actions), chunk_size)][:-1]
    concatenated_frames = []
    for chunk in chunked_actions:
        frames_to_concat = []
        for action in chunk:
            frames = action['frames']  # list of per-frame annotations, or None
            if frames is not None:
                frames_to_concat.extend(frames)  # collect frames from each action
        concatenated_frames.append(frames_to_concat)  # all frames for this chunk
    return concatenated_frames
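# Worked sketch of the chunking behavior with dummy actions (illustrative data
# only): 9 actions split into [0:4], [4:8], [8:9], and [:-1] drops the last.
def _demo_chunk_actions():
    actions = [{"frames": [i]} for i in range(9)]
    print(chunk_actions_and_concatenate(actions))  # [[0, 1, 2, 3], [4, 5, 6, 7]]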
def ego4d_dataset_size() -> int:
    """Returns the number of trajectories in the dataset. ~1725 for Ego4D."""
    with open(LABEL_ROOT) as f:
        labels = json.load(f)
    return len(labels['videos'])
# Define your own dataset conversion here.
def ego4d_dataset_generator(example_inds: Optional[Iterable[int]] = None):
    """
    Generator yielding data from Ego4D.
    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.
    """
    # Convert to a list of episodes that can be added to a replay buffer.
    with open(LABEL_ROOT) as f:
        labels = json.load(f)
    if example_inds is None:
        example_inds = range(len(labels['videos']))
    for example_ind in example_inds:
        label = labels['videos'][example_ind]
        video_path = VIDEO_PATH + label['video_uid'] + ".mp4"
        if not os.path.exists(video_path):
            print("skip", video_path)
            continue
        print("video_path:", video_path)
        # Actions are extracted from the bounding boxes of both hands.
        for interval in label['annotated_intervals']:
            lang = "use human hands to do some tasks"  # dummy instruction
            print(f"Interval [{interval['start_sec']} - {interval['end_sec']}]")
            # Keep only annotations that are valid, accepted, and fully staged.
            actions = list(filter(lambda x: not (x['is_invalid_annotation'] or x['is_rejected']) and x['stage'] is not None, interval['narrated_actions']))
            print(f"Actions: {len(actions)}")
            # Need enough actions to form a chunk worth concatenating.
            if len(actions) < 3:
                continue
            # Each action usually has ~7 frames, and they do not follow a strict 2 Hz rate.
            chunk_actions = chunk_actions_and_concatenate(actions)
            for frame_idx, frames in enumerate(chunk_actions):
                # lang = frame['narration_text'] would give a per-frame instruction instead.
                steps = []
                # May need dummy actions to expand from ~6 frames to 16 frames.
                for idx, frame in enumerate(frames[:-1]):
                    frame_id = frame['frame_number']
                    next_frame = frames[idx + 1]
                    image = parse_video_frame(video_path, frame_id)
                    if len(frame['boxes']) > 2 and len(next_frame['boxes']) > 2:
                        try:
                            s, a = compute_state_and_actions(image, frame['boxes'], next_frame['boxes'], idx, save=False)
                        except Exception:
                            # Usually a missing hand box leaves a center undefined upstream.
                            print(f'compute action failed idx {idx} frame idx {frame_idx}')
                            continue
                        # Break into a step dict.
                        step = {
                            "observation": {"image": image, "state": s},
                            "action": a,
                            "language_instruction": lang,
                        }
                        steps.append(OrderedDict(step))
                if len(steps) < 16:
                    print("skip this traj because frame window length < 16")
                    continue
                data_dict = {"steps": steps}
                yield data_dict
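# Usage sketch: pull the first two trajectories and inspect one step.
# (Assumes the annotation file and videos exist at the paths configured above.)
def _demo_ego4d_generator():
    for traj in ego4d_dataset_generator(example_inds=range(2)):
        step = traj["steps"][0]
        print(step["observation"]["state"], step["action"], step["language_instruction"])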