Update episodes.jsonl and episodes_stats.jsonl
delete_episodes.py (CHANGED: +60 -0)
@@ -165,6 +165,63 @@ def delete_ds_store(dataset_path: str):
     logger.info(".DS_Store files deleted")
 
 
+def update_meta_jsonl_files(dataset_path: str, indexes_to_delete: List[int]):
+    """Update episodes.jsonl and episodes_stats.jsonl by removing deleted episodes and re-indexing"""
+    meta_folder = os.path.join(dataset_path, "meta")
+    episodes_file = os.path.join(meta_folder, "episodes.jsonl")
+    episodes_stats_file = os.path.join(meta_folder, "episodes_stats.jsonl")
+
+    # Process episodes.jsonl
+    if os.path.exists(episodes_file):
+        logger.info("Updating episodes.jsonl...")
+        episodes = []
+        with open(episodes_file, "r") as f:
+            for line in f:
+                line = line.strip()
+                if line:  # Skip empty lines
+                    episode = json.loads(line)
+                    if episode["episode_index"] not in indexes_to_delete:
+                        episodes.append(episode)
+
+        # Re-index episodes
+        for new_index, episode in enumerate(episodes):
+            episode["episode_index"] = new_index
+
+        # Write back
+        with open(episodes_file, "w") as f:
+            for episode in episodes:
+                f.write(json.dumps(episode) + "\n")
+
+        logger.info(f"Updated episodes.jsonl: {len(episodes)} episodes remaining")
+    else:
+        logger.warning(f"episodes.jsonl not found at {episodes_file}")
+
+    # Process episodes_stats.jsonl
+    if os.path.exists(episodes_stats_file):
+        logger.info("Updating episodes_stats.jsonl...")
+        stats = []
+        with open(episodes_stats_file, "r") as f:
+            for line in f:
+                line = line.strip()
+                if line:  # Skip empty lines
+                    stat = json.loads(line)
+                    if stat["episode_index"] not in indexes_to_delete:
+                        stats.append(stat)
+
+        # Re-index stats
+        for new_index, stat in enumerate(stats):
+            stat["episode_index"] = new_index
+
+        # Write back
+        with open(episodes_stats_file, "w") as f:
+            for stat in stats:
+                f.write(json.dumps(stat) + "\n")
+
+        logger.info(f"Updated episodes_stats.jsonl: {len(stats)} episode stats remaining")
+    else:
+        logger.warning(f"episodes_stats.jsonl not found at {episodes_stats_file}")
+
+
 def delete_episode_files(dataset_path: str, indexes: List[int]):
     """Delete parquet and video files for specified episode indexes"""
     parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
@@ -335,6 +392,9 @@ def delete_episodes_and_repair(
     # Delete episode files
     delete_episode_files(dataset_path, episode_indexes)
 
+    # Update meta JSONL files (episodes.jsonl and episodes_stats.jsonl)
+    update_meta_jsonl_files(dataset_path, episode_indexes)
+
     # Process and repair remaining parquet files
     process_parquet_files(dataset_path)
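A possible post-repair sanity check; "data/chunk-000" mirrors the path used by delete_episode_files above, while the episode_XXXXXX.parquet naming is only an assumed convention:

    # check_repair.py -- hypothetical consistency check after delete_episodes_and_repair
    import json
    import os

    def check_meta_matches_data(dataset_path: str) -> None:
        """Assert episodes.jsonl is contiguously indexed and matches the parquet count on disk."""
        with open(os.path.join(dataset_path, "meta", "episodes.jsonl")) as f:
            indexes = [json.loads(line)["episode_index"] for line in f if line.strip()]
        assert indexes == list(range(len(indexes))), "episodes.jsonl is not re-indexed 0..N-1"

        parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
        parquet_count = sum(1 for p in os.listdir(parquets_folder) if p.endswith(".parquet"))
        assert parquet_count == len(indexes), "meta entry count does not match parquet files"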