Add files
Browse files- app.py +3 -3
- delete_episodes.py +53 -0
app.py
CHANGED
|
@@ -103,9 +103,9 @@ def delete_episodes_stream(repo_id: str, episode_indexes_str: str, dest_repo_id:
|
|
| 103 |
yield "Please provide at least one episode index to delete."
|
| 104 |
return
|
| 105 |
|
| 106 |
-
# If no destination provided, use the same repo name
|
| 107 |
if not dest_repo_id or not dest_repo_id.strip():
|
| 108 |
-
|
|
|
|
| 109 |
|
| 110 |
# Parse comma-separated episode indexes
|
| 111 |
episode_indexes = []
|
|
@@ -221,7 +221,7 @@ with gr.Blocks(title="LeRobot Episode Deleter") as demo:
|
|
| 221 |
)
|
| 222 |
|
| 223 |
dest_repo_input = gr.Textbox(
|
| 224 |
-
label="Destination repo id (
|
| 225 |
placeholder="org/cleaned_dataset"
|
| 226 |
)
|
| 227 |
|
|
|
|
| 103 |
yield "Please provide at least one episode index to delete."
|
| 104 |
return
|
| 105 |
|
|
|
|
| 106 |
if not dest_repo_id or not dest_repo_id.strip():
|
| 107 |
+
yield "Please provide a destination repo ID."
|
| 108 |
+
return
|
| 109 |
|
| 110 |
# Parse comma-separated episode indexes
|
| 111 |
episode_indexes = []
|
|
|
|
| 221 |
)
|
| 222 |
|
| 223 |
dest_repo_input = gr.Textbox(
|
| 224 |
+
label="Destination repo id (required)",
|
| 225 |
placeholder="org/cleaned_dataset"
|
| 226 |
)
|
| 227 |
|
delete_episodes.py
CHANGED
|
@@ -79,6 +79,56 @@ def check_v2_format(dataset_path: str) -> bool:
|
|
| 79 |
raise ValueError(f"Error: {info_path} is not a valid JSON file")
|
| 80 |
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
def list_episodes(dataset_path: str) -> List[int]:
|
| 83 |
"""List all episode numbers in the dataset"""
|
| 84 |
parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
|
|
@@ -288,6 +338,9 @@ def delete_episodes_and_repair(
|
|
| 288 |
# Process and repair remaining parquet files
|
| 289 |
process_parquet_files(dataset_path)
|
| 290 |
|
|
|
|
|
|
|
|
|
|
| 291 |
# Run stats computation
|
| 292 |
if run_stats:
|
| 293 |
run_stats_computation(dataset_path)
|
|
|
|
| 79 |
raise ValueError(f"Error: {info_path} is not a valid JSON file")
|
| 80 |
|
| 81 |
|
| 82 |
+
def update_info_counts(dataset_path: str) -> None:
    """Update total_episodes and total_videos counts in info.json to reflect actual counts.

    Episodes are counted with ``list_episodes``. Videos are counted as the
    ``episode_*.mp4`` files found one directory level below
    ``<dataset_path>/videos/chunk-000``. Both numbers are written back to
    ``<dataset_path>/meta/info.json``; every other key is kept as loaded.

    Args:
        dataset_path: Path to the dataset

    Raises:
        ValueError: If ``meta/info.json`` does not exist or is not valid JSON.
    """
    info_path = os.path.join(dataset_path, "meta", "info.json")

    if not os.path.exists(info_path):
        raise ValueError(f"Error: {info_path} does not exist")

    logger.info("Updating info.json counts to reflect actual dataset state...")

    # Count actual episodes
    new_episode_count = len(list_episodes(dataset_path))

    # Count actual videos
    videos_folder = os.path.join(dataset_path, "videos", "chunk-000")
    video_count = 0
    if os.path.exists(videos_folder):
        for folder in os.listdir(videos_folder):
            folder_path = os.path.join(videos_folder, folder)
            if not os.path.isdir(folder_path):
                continue
            # Escape the directory part so glob metacharacters ("[", "*", "?")
            # in the dataset path or folder name are matched literally; only
            # the file-name part is meant to be a pattern.
            pattern = os.path.join(glob.escape(folder_path), "episode_*.mp4")
            video_count += len(glob.glob(pattern))

    # Read and update info.json. JSON text is UTF-8, so do not depend on the
    # platform's locale encoding.
    try:
        with open(info_path, "r", encoding="utf-8") as f:
            info = json.load(f)
    except json.JSONDecodeError as err:
        # JSONDecodeError is itself a ValueError, so callers that already
        # catch ValueError are unaffected; this only unifies the message.
        raise ValueError(f"Error: {info_path} is not a valid JSON file") from err

    # Previous values are kept only for the log lines below; a missing key is
    # reported as 0.
    old_episodes = info.get("total_episodes", 0)
    old_videos = info.get("total_videos", 0)

    info["total_episodes"] = new_episode_count
    info["total_videos"] = video_count

    with open(info_path, "w", encoding="utf-8") as f:
        json.dump(info, f, indent=4)

    logger.info(
        f"Updated total_episodes: {old_episodes} → {new_episode_count}"
    )
    logger.info(
        f"Updated total_videos: {old_videos} → {video_count}"
    )
|
| 130 |
+
|
| 131 |
+
|
| 132 |
def list_episodes(dataset_path: str) -> List[int]:
|
| 133 |
"""List all episode numbers in the dataset"""
|
| 134 |
parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
|
|
|
|
| 338 |
# Process and repair remaining parquet files
|
| 339 |
process_parquet_files(dataset_path)
|
| 340 |
|
| 341 |
+
# Update info.json with new episode and video counts
|
| 342 |
+
update_info_counts(dataset_path)
|
| 343 |
+
|
| 344 |
# Run stats computation
|
| 345 |
if run_stats:
|
| 346 |
run_stats_computation(dataset_path)
|