SuveenE committed on
Commit
828363b
·
1 Parent(s): 9e8f68e
Files changed (2) hide show
  1. app.py +3 -3
  2. delete_episodes.py +53 -0
app.py CHANGED
@@ -103,9 +103,9 @@ def delete_episodes_stream(repo_id: str, episode_indexes_str: str, dest_repo_id:
103
  yield "Please provide at least one episode index to delete."
104
  return
105
 
106
- # If no destination provided, use the same repo name
107
  if not dest_repo_id or not dest_repo_id.strip():
108
- dest_repo_id = repo_id
 
109
 
110
  # Parse comma-separated episode indexes
111
  episode_indexes = []
@@ -221,7 +221,7 @@ with gr.Blocks(title="LeRobot Episode Deleter") as demo:
221
  )
222
 
223
  dest_repo_input = gr.Textbox(
224
- label="Destination repo id (leave empty to use same repo)",
225
  placeholder="org/cleaned_dataset"
226
  )
227
 
 
103
  yield "Please provide at least one episode index to delete."
104
  return
105
 
 
106
  if not dest_repo_id or not dest_repo_id.strip():
107
+ yield "Please provide a destination repo ID."
108
+ return
109
 
110
  # Parse comma-separated episode indexes
111
  episode_indexes = []
 
221
  )
222
 
223
  dest_repo_input = gr.Textbox(
224
+ label="Destination repo id (required)",
225
  placeholder="org/cleaned_dataset"
226
  )
227
 
delete_episodes.py CHANGED
@@ -79,6 +79,56 @@ def check_v2_format(dataset_path: str) -> bool:
79
  raise ValueError(f"Error: {info_path} is not a valid JSON file")
80
 
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def list_episodes(dataset_path: str) -> List[int]:
83
  """List all episode numbers in the dataset"""
84
  parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
@@ -288,6 +338,9 @@ def delete_episodes_and_repair(
288
  # Process and repair remaining parquet files
289
  process_parquet_files(dataset_path)
290
 
 
 
 
291
  # Run stats computation
292
  if run_stats:
293
  run_stats_computation(dataset_path)
 
79
  raise ValueError(f"Error: {info_path} is not a valid JSON file")
80
 
81
 
82
+ def update_info_counts(dataset_path: str):
83
+ """Update total_episodes and total_videos counts in info.json to reflect actual counts.
84
+
85
+ Args:
86
+ dataset_path: Path to the dataset
87
+ """
88
+ info_path = os.path.join(dataset_path, "meta", "info.json")
89
+
90
+ if not os.path.exists(info_path):
91
+ raise ValueError(f"Error: {info_path} does not exist")
92
+
93
+ logger.info("Updating info.json counts to reflect actual dataset state...")
94
+
95
+ # Count actual episodes
96
+ episodes = list_episodes(dataset_path)
97
+ new_episode_count = len(episodes)
98
+
99
+ # Count actual videos
100
+ videos_folder = os.path.join(dataset_path, "videos", "chunk-000")
101
+ video_count = 0
102
+ if os.path.exists(videos_folder):
103
+ video_folders = [d for d in os.listdir(videos_folder)
104
+ if os.path.isdir(os.path.join(videos_folder, d))]
105
+ for folder in video_folders:
106
+ video_files = glob.glob(
107
+ os.path.join(videos_folder, folder, "episode_*.mp4")
108
+ )
109
+ video_count += len(video_files)
110
+
111
+ # Read and update info.json
112
+ with open(info_path, "r") as f:
113
+ info = json.load(f)
114
+
115
+ old_episodes = info.get("total_episodes", 0)
116
+ old_videos = info.get("total_videos", 0)
117
+
118
+ info["total_episodes"] = new_episode_count
119
+ info["total_videos"] = video_count
120
+
121
+ with open(info_path, "w") as f:
122
+ json.dump(info, f, indent=4)
123
+
124
+ logger.info(
125
+ f"Updated total_episodes: {old_episodes} → {new_episode_count}"
126
+ )
127
+ logger.info(
128
+ f"Updated total_videos: {old_videos} → {video_count}"
129
+ )
130
+
131
+
132
  def list_episodes(dataset_path: str) -> List[int]:
133
  """List all episode numbers in the dataset"""
134
  parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
 
338
  # Process and repair remaining parquet files
339
  process_parquet_files(dataset_path)
340
 
341
+ # Update info.json with new episode and video counts
342
+ update_info_counts(dataset_path)
343
+
344
  # Run stats computation
345
  if run_stats:
346
  run_stats_computation(dataset_path)