Spaces:

cortexairobot
/

delete-episodes-from-dataset

Sleeping

App Files Files Community

SuveenE committed on Oct 11, 2025

Commit

828363b

1 Parent(s): 9e8f68e

Add files

Browse files

Files changed (2) hide show

app.py +3 -3
delete_episodes.py +53 -0

app.py CHANGED Viewed

@@ -103,9 +103,9 @@ def delete_episodes_stream(repo_id: str, episode_indexes_str: str, dest_repo_id:
         yield "Please provide at least one episode index to delete."
         return
-    # If no destination provided, use the same repo name
     if not dest_repo_id or not dest_repo_id.strip():
-        dest_repo_id = repo_id
     # Parse comma-separated episode indexes
     episode_indexes = []
@@ -221,7 +221,7 @@ with gr.Blocks(title="LeRobot Episode Deleter") as demo:
     )
     dest_repo_input = gr.Textbox(
-        label="Destination repo id (leave empty to use same repo)",
         placeholder="org/cleaned_dataset"
     )

         yield "Please provide at least one episode index to delete."
         return
     if not dest_repo_id or not dest_repo_id.strip():
+        yield "Please provide a destination repo ID."
+        return
     # Parse comma-separated episode indexes
     episode_indexes = []
     )
     dest_repo_input = gr.Textbox(
+        label="Destination repo id (required)",
         placeholder="org/cleaned_dataset"
     )

delete_episodes.py CHANGED Viewed

@@ -79,6 +79,56 @@ def check_v2_format(dataset_path: str) -> bool:
             raise ValueError(f"Error: {info_path} is not a valid JSON file")
 def list_episodes(dataset_path: str) -> List[int]:
     """List all episode numbers in the dataset"""
     parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
@@ -288,6 +338,9 @@ def delete_episodes_and_repair(
     # Process and repair remaining parquet files
     process_parquet_files(dataset_path)
     # Run stats computation
     if run_stats:
         run_stats_computation(dataset_path)

             raise ValueError(f"Error: {info_path} is not a valid JSON file")
+def update_info_counts(dataset_path: str):
+    """Update total_episodes and total_videos counts in info.json to reflect actual counts.
+    Args:
+        dataset_path: Path to the dataset
+    """
+    info_path = os.path.join(dataset_path, "meta", "info.json")
+    if not os.path.exists(info_path):
+        raise ValueError(f"Error: {info_path} does not exist")
+    logger.info("Updating info.json counts to reflect actual dataset state...")
+    # Count actual episodes
+    episodes = list_episodes(dataset_path)
+    new_episode_count = len(episodes)
+    # Count actual videos
+    videos_folder = os.path.join(dataset_path, "videos", "chunk-000")
+    video_count = 0
+    if os.path.exists(videos_folder):
+        video_folders = [d for d in os.listdir(videos_folder)
+                        if os.path.isdir(os.path.join(videos_folder, d))]
+        for folder in video_folders:
+            video_files = glob.glob(
+                os.path.join(videos_folder, folder, "episode_*.mp4")
+            )
+            video_count += len(video_files)
+    # Read and update info.json
+    with open(info_path, "r") as f:
+        info = json.load(f)
+    old_episodes = info.get("total_episodes", 0)
+    old_videos = info.get("total_videos", 0)
+    info["total_episodes"] = new_episode_count
+    info["total_videos"] = video_count
+    with open(info_path, "w") as f:
+        json.dump(info, f, indent=4)
+    logger.info(
+        f"Updated total_episodes: {old_episodes} → {new_episode_count}"
+    )
+    logger.info(
+        f"Updated total_videos: {old_videos} → {video_count}"
+    )
 def list_episodes(dataset_path: str) -> List[int]:
     """List all episode numbers in the dataset"""
     parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
     # Process and repair remaining parquet files
     process_parquet_files(dataset_path)
+    # Update info.json with new episode and video counts
+    update_info_counts(dataset_path)
     # Run stats computation
     if run_stats:
         run_stats_computation(dataset_path)