#!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Download small SimplerEnv sample datasets from HuggingFace for inference testing. Creates two demo datasets under demo_data/: - simplerenv_fractal_sample (3 episodes from IPEC-COMMUNITY/fractal20220817_data_lerobot) - simplerenv_bridge_sample (3 episodes from IPEC-COMMUNITY/bridge_orig_lerobot) Both source datasets are already in LeRobot v2 format (per-episode parquet + per-episode mp4), so this script simply downloads the first few episodes and rewrites the meta files. Prerequisites: pip install huggingface_hub jsonlines pyarrow Usage: python scripts/download_simplerenv_sample.py python scripts/download_simplerenv_sample.py --num-episodes 3 """ from __future__ import annotations import argparse import json import logging from pathlib import Path import shutil import jsonlines logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") logger = logging.getLogger(__name__) DEFAULT_NUM_EPISODES = 3 DATASETS = { "fractal": { "hf_repo": "IPEC-COMMUNITY/fractal20220817_data_lerobot", "output_dir": "demo_data/simplerenv_fractal_sample", "robot_type": "google_robot", "video_keys": ["observation.images.image"], "modality_source": "examples/SimplerEnv/fractal_modality.json", "embodiment_tag": "SIMPLER_ENV_GOOGLE", }, "bridge": { "hf_repo": "IPEC-COMMUNITY/bridge_orig_lerobot", "output_dir": "demo_data/simplerenv_bridge_sample", "robot_type": "widowx", # Bridge has 4 cameras, but the model only uses image_0 "video_keys": ["observation.images.image_0"], "modality_source": "examples/SimplerEnv/bridge_modality.json", "embodiment_tag": "SIMPLER_ENV_WIDOWX", }, } def download_sample( dataset_key: str, num_episodes: int, repo_root: Path, ) -> None: """Download a small sample from a SimplerEnv dataset.""" from huggingface_hub import hf_hub_download cfg = DATASETS[dataset_key] hf_repo = cfg["hf_repo"] output_dir = repo_root / cfg["output_dir"] if output_dir.exists(): logger.info(f"Output already exists: {output_dir} — delete it to regenerate.") return logger.info(f"Downloading {dataset_key} sample ({num_episodes} episodes) from {hf_repo}") cache_dir = Path(f"/tmp/simplerenv_{dataset_key}_cache") # Download meta files for meta_file in [ "meta/info.json", "meta/stats.json", "meta/tasks.jsonl", "meta/episodes.jsonl", ]: logger.info(f" {meta_file}...") hf_hub_download( repo_id=hf_repo, repo_type="dataset", filename=meta_file, local_dir=str(cache_dir), ) # Download first N episode data parquets for ep_idx in range(num_episodes): fname = f"data/chunk-000/episode_{ep_idx:06d}.parquet" logger.info(f" {fname}...") hf_hub_download( repo_id=hf_repo, repo_type="dataset", filename=fname, local_dir=str(cache_dir), ) # Download first N episode videos for each video key for video_key in cfg["video_keys"]: for ep_idx in range(num_episodes): fname = f"videos/chunk-000/{video_key}/episode_{ep_idx:06d}.mp4" logger.info(f" {fname}...") hf_hub_download( repo_id=hf_repo, repo_type="dataset", filename=fname, local_dir=str(cache_dir), ) # Assemble output dataset _assemble_sample(cache_dir, output_dir, num_episodes, cfg, repo_root) def _assemble_sample( cache_dir: Path, output_dir: Path, num_episodes: int, cfg: dict, repo_root: Path, ) -> None: """Assemble the downloaded files into a proper LeRobot v2 demo dataset.""" output_dir.mkdir(parents=True, exist_ok=True) meta_dir = output_dir / "meta" meta_dir.mkdir(exist_ok=True) # Load source info with open(cache_dir / "meta" / "info.json") as f: source_info = json.load(f) fps = source_info.get("fps", 5) # Copy data parquets data_chunk_dir = output_dir / "data" / "chunk-000" data_chunk_dir.mkdir(parents=True, exist_ok=True) import pyarrow.parquet as pq total_frames = 0 for ep_idx in range(num_episodes): src = cache_dir / "data" / "chunk-000" / f"episode_{ep_idx:06d}.parquet" dst = data_chunk_dir / f"episode_{ep_idx:06d}.parquet" shutil.copy2(src, dst) table = pq.read_table(str(src)) total_frames += len(table) logger.info(f" Copied data episode {ep_idx}: {len(table)} frames") # Copy video files for video_key in cfg["video_keys"]: video_chunk_dir = output_dir / "videos" / "chunk-000" / video_key video_chunk_dir.mkdir(parents=True, exist_ok=True) for ep_idx in range(num_episodes): src = cache_dir / "videos" / "chunk-000" / video_key / f"episode_{ep_idx:06d}.mp4" dst = video_chunk_dir / f"episode_{ep_idx:06d}.mp4" shutil.copy2(src, dst) logger.info(f" Copied video {video_key} episode {ep_idx}") # Filter episodes.jsonl to only include our episodes src_episodes = cache_dir / "meta" / "episodes.jsonl" with jsonlines.open(meta_dir / "episodes.jsonl", mode="w") as writer: with jsonlines.open(src_episodes) as reader: for rec in reader: if rec["episode_index"] < num_episodes: writer.write(rec) # Collect task indices from parquet data task_indices_used = set() for ep_idx in range(num_episodes): ep_path = data_chunk_dir / f"episode_{ep_idx:06d}.parquet" df = pq.read_table(str(ep_path)).to_pandas() if "task_index" in df.columns: task_indices_used.update(df["task_index"].unique().tolist()) # Filter tasks.jsonl to only include tasks referenced by our episodes src_tasks = cache_dir / "meta" / "tasks.jsonl" with jsonlines.open(meta_dir / "tasks.jsonl", mode="w") as writer: with jsonlines.open(src_tasks) as reader: for rec in reader: if not task_indices_used or rec.get("task_index") in task_indices_used: writer.write(rec) # Build video feature entries from source info (only for keys we include) video_features = {} for video_key in cfg["video_keys"]: if video_key in source_info.get("features", {}): video_features[video_key] = source_info["features"][video_key] else: video_features[video_key] = {"dtype": "video", "shape": [256, 256, 3]} # Build info.json features = {**video_features} for key in ["observation.state", "action", "task_index"]: if key in source_info.get("features", {}): features[key] = source_info["features"][key] info = { "codebase_version": "v2.1", "robot_type": cfg["robot_type"], "total_episodes": num_episodes, "total_frames": total_frames, "fps": fps, "data_path": "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet", "video_path": "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4", "chunks_size": 1000, "splits": {"train": f"0:{num_episodes}"}, "features": features, } with open(meta_dir / "info.json", "w") as f: json.dump(info, f, indent=2) # Filter stats.json to only keep keys present in info.json features src_stats = cache_dir / "meta" / "stats.json" if src_stats.exists(): with open(src_stats) as f: full_stats = json.load(f) filtered_stats = {k: v for k, v in full_stats.items() if k in features} with open(meta_dir / "stats.json", "w") as f: json.dump(filtered_stats, f, indent=2) # Copy modality.json from the examples directory modality_src = repo_root / cfg["modality_source"] shutil.copy2(modality_src, meta_dir / "modality.json") logger.info(f"\nDataset created at: {output_dir}") logger.info(f" Episodes: {num_episodes}, Total frames: {total_frames}, FPS: {fps}") def main(): parser = argparse.ArgumentParser( description="Download small SimplerEnv sample datasets for GR00T inference testing.", ) parser.add_argument("--num-episodes", type=int, default=DEFAULT_NUM_EPISODES) parser.add_argument( "--datasets", nargs="+", default=list(DATASETS.keys()), choices=list(DATASETS.keys()), ) args = parser.parse_args() repo_root = Path(__file__).resolve().parents[1] for dataset_key in args.datasets: download_sample(dataset_key, args.num_episodes, repo_root) logger.info("\nTo run inference:") for dataset_key in args.datasets: cfg = DATASETS[dataset_key] logger.info( f"\n uv run python scripts/deployment/standalone_inference_script.py \\\n" f" --model-path nvidia/GR00T-N1.7-3B \\\n" f" --dataset-path {cfg['output_dir']} \\\n" f" --embodiment-tag {cfg['embodiment_tag']} \\\n" f" --traj-ids 0 1 --inference-mode pytorch --action-horizon 8" ) if __name__ == "__main__": main()