SuveenE committed

Commit 9e3db2b · Parent: 8184d73

Files changed (3):
  1. app.py +288 -4
  2. delete_episodes.py +332 -0
  3. requirements.txt +73 -0
app.py CHANGED
@@ -1,7 +1,291 @@
  import gradio as gr
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import os
+ import sys
+ import threading
+ import queue
+ import time
+ import tempfile
+ import shutil
+ from contextlib import redirect_stdout, redirect_stderr
+ from typing import List, Optional
+
  import gradio as gr
+ from huggingface_hub import HfApi
+
+ from delete_episodes import (
+     download_dataset,
+     list_episodes,
+     delete_episodes_and_repair,
+     upload_dataset,
+ )
+
+
+ class _StreamToQueue:
+     """File-like writer that forwards complete lines to a queue."""
+
+     def __init__(self, q: "queue.Queue[str]"):
+         self.q = q
+         self._buffer = ""
+
+     def write(self, s: str):
+         if not isinstance(s, str):
+             s = str(s)
+         self._buffer += s
+         while "\n" in self._buffer:
+             line, self._buffer = self._buffer.split("\n", 1)
+             self.q.put(line + "\n")
+
+     def flush(self):
+         if self._buffer:
+             self.q.put(self._buffer)
+             self._buffer = ""
+
+
+ def search_datasets_fn(query: str) -> List[str]:
+     """Search for datasets on HuggingFace"""
+     api = HfApi()
+     try:
+         items = api.list_datasets(search=(query or "").strip() or None)
+         repo_ids = [getattr(d, "id", None) or getattr(d, "repo_id", None) for d in items]
+         repo_ids = [r for r in repo_ids if r]
+         # Remove duplicates while preserving order
+         seen = set()
+         unique = []
+         for r in repo_ids:
+             if r not in seen:
+                 unique.append(r)
+                 seen.add(r)
+         return unique[:500]
+     except Exception as e:
+         print(f"Error searching datasets: {e}")
+         return []
+
+
+ def download_and_list_episodes(repo_id: str, progress=gr.Progress()):
+     """Download dataset and list available episodes"""
+     if not repo_id:
+         return "Please provide a dataset repo ID.", gr.update(choices=[], value=[])
+
+     token = os.environ.get("HF_TOKEN")
+     temp_dir = tempfile.mkdtemp(prefix="episode_delete_")
+
+     try:
+         progress(0.3, desc="Downloading dataset...")
+         download_dataset(repo_id, temp_dir, hf_token=token)
+
+         progress(0.7, desc="Listing episodes...")
+         episodes = list_episodes(temp_dir)
+
+         if not episodes:
+             return f"No episodes found in {repo_id}", gr.update(choices=[], value=[])
+
+         # Format episodes as choices
+         episode_choices = [f"Episode {ep}" for ep in episodes]
+
+         return (
+             f"Downloaded {repo_id}. Found {len(episodes)} episodes.",
+             gr.update(choices=episode_choices, value=[]),
+         )
+     except Exception as e:
+         return f"Error: {str(e)}", gr.update(choices=[], value=[])
+     finally:
+         # The listing pass only needs the download transiently; free the disk space.
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+ def delete_episodes_stream(repo_id: str, selected_episodes: List[str], dest_repo_id: str):
+     """Delete selected episodes and upload to destination repo"""
+     if not repo_id:
+         yield "Please provide a source dataset repo ID."
+         return
+
+     if not selected_episodes:
+         yield "Please select at least one episode to delete."
+         return
+
+     if not dest_repo_id:
+         yield "Please provide a destination repo ID."
+         return
+
+     # Parse episode numbers from selection (format: "Episode 0", "Episode 1", etc.)
+     episode_indexes = []
+     for ep_str in selected_episodes:
+         try:
+             ep_num = int(ep_str.replace("Episode ", ""))
+             episode_indexes.append(ep_num)
+         except ValueError:
+             yield f"Invalid episode format: {ep_str}"
+             return
+
+     token = os.environ.get("HF_TOKEN")
+     q: "queue.Queue[str]" = queue.Queue()
+     done = {"ok": False, "msg": ""}
+
+     def _worker():
+         stream = _StreamToQueue(q)
+         temp_dir = tempfile.mkdtemp(prefix="episode_delete_")
+
+         try:
+             with redirect_stdout(stream), redirect_stderr(stream):
+                 print("Downloading dataset...", flush=True)
+                 download_dataset(repo_id, temp_dir, hf_token=token)
+
+                 print(f"\nDeleting episodes: {episode_indexes}", flush=True)
+                 delete_episodes_and_repair(
+                     dataset_path=temp_dir,
+                     episode_indexes=episode_indexes,
+                     run_stats=False,  # Skip stats for now as script may not be available
+                 )
+
+                 print(f"\nUploading to {dest_repo_id}...", flush=True)
+                 upload_dataset(
+                     local_dir=temp_dir,
+                     dest_repo_id=dest_repo_id,
+                     hf_token=token,
+                     commit_message=f"Deleted episodes: {episode_indexes}",
+                     private=False,
+                 )
+
+                 print("\nUpload complete!", flush=True)
+                 done["ok"] = True
+                 done["msg"] = f"Successfully deleted {len(episode_indexes)} episodes and uploaded to {dest_repo_id}"
+         except Exception as e:
+             print(f"\nError: {e}", flush=True)
+             done["ok"] = False
+             done["msg"] = f"Error: {e}"
+         finally:
+             # Cleanup
+             try:
+                 if os.path.isdir(temp_dir):
+                     shutil.rmtree(temp_dir, ignore_errors=True)
+                     print(f"\nCleaned up temp directory: {temp_dir}", flush=True)
+             except Exception:
+                 pass
+             try:
+                 stream.flush()
+             except Exception:
+                 pass
+
+     t = threading.Thread(target=_worker, daemon=True)
+     t.start()
+
+     buffer = ""
+     yield "Starting process...\n"
+
+     while t.is_alive() or not q.empty():
+         try:
+             line = q.get(timeout=0.1)
+             buffer += line
+             if len(buffer) > 0:
+                 yield buffer
+         except queue.Empty:
+             pass
+         time.sleep(0.05)
+
+     # Final status
+     if done["msg"]:
+         buffer += ("\n" if not buffer.endswith("\n") else "") + "=" * 50 + "\n" + done["msg"]
+         yield buffer
+
+
+ # Build the Gradio interface
+ with gr.Blocks(title="LeRobot Episode Deleter", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🗑️ LeRobot Dataset Episode Deleter")
+     gr.Markdown(
+         "Delete specific episodes from a HuggingFace LeRobot dataset and upload the cleaned version."
+     )
+
+     with gr.Tabs():
+         with gr.Tab("Step 1: Select Dataset"):
+             with gr.Row():
+                 with gr.Column(scale=3):
+                     search_box = gr.Textbox(
+                         label="Search Datasets",
+                         placeholder="Enter keyword or organization name (e.g., 'lerobot', 'griffinlabs-cortex')",
+                         value="griffinlabs-cortex",
+                     )
+                 with gr.Column(scale=1):
+                     search_btn = gr.Button("🔍 Search", variant="primary")
+
+             dataset_dropdown = gr.Dropdown(
+                 label="Available Datasets",
+                 choices=search_datasets_fn("griffinlabs-cortex"),
+                 interactive=True,
+                 allow_custom_value=True,
+             )
+
+             with gr.Row():
+                 download_btn = gr.Button("📥 Download & List Episodes", variant="secondary", size="lg")
+
+             download_status = gr.Textbox(
+                 label="Download Status",
+                 lines=2,
+                 interactive=False,
+             )
+
+         with gr.Tab("Step 2: Select Episodes to Delete"):
+             gr.Markdown("Select the episodes you want to **remove** from the dataset.")
+
+             episodes_selector = gr.CheckboxGroup(
+                 label="Episodes (select to delete)",
+                 choices=[],
+                 interactive=True,
+             )
+
+             selected_count = gr.Markdown("*No episodes selected*")
+
+         with gr.Tab("Step 3: Delete & Upload"):
+             gr.Markdown("Configure the destination and start the deletion process.")
+
+             dest_repo_input = gr.Textbox(
+                 label="Destination Repository ID",
+                 placeholder="your-org/cleaned-dataset",
+                 info="The HuggingFace repo where the cleaned dataset will be uploaded",
+             )
+
+             with gr.Row():
+                 execute_btn = gr.Button(
+                     "🚀 Delete Episodes & Upload",
+                     variant="primary",
+                     size="lg",
+                 )
+
+             progress_log = gr.Textbox(
+                 label="Progress Log",
+                 lines=25,
+                 interactive=False,
+                 max_lines=30,
+             )
+
+     # Event handlers
+     def update_search_results(query):
+         results = search_datasets_fn(query)
+         return gr.update(choices=results, value=None)
+
+     search_btn.click(
+         update_search_results,
+         inputs=search_box,
+         outputs=dataset_dropdown,
+     )
+
+     download_btn.click(
+         download_and_list_episodes,
+         inputs=dataset_dropdown,
+         outputs=[download_status, episodes_selector],
+     )
+
+     def update_selected_count(selected):
+         if not selected:
+             return "*No episodes selected*"
+         return f"**{len(selected)} episode(s) selected for deletion**"
+
+     episodes_selector.change(
+         update_selected_count,
+         inputs=episodes_selector,
+         outputs=selected_count,
+     )
+
+     execute_btn.click(
+         delete_episodes_stream,
+         inputs=[dataset_dropdown, episodes_selector, dest_repo_input],
+         outputs=progress_log,
+     )
+

+ if __name__ == "__main__":
+     demo.launch()

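Note: the progress log in app.py works by redirecting the worker thread's stdout/stderr into a queue and yielding the accumulated text from a generator, which Gradio renders incrementally. Below is a minimal standalone sketch of that pattern; `stream_worker_output`, `_Writer`, and `demo_work` are illustrative names, not part of the app, and redirect_stdout swaps sys.stdout process-wide here just as app.py does.

# Sketch of the log-streaming pattern: worker output goes into a queue,
# and a generator yields the growing log for incremental display.
import queue
import threading
import time
from contextlib import redirect_stdout

def stream_worker_output(work_fn):
    q = queue.Queue()

    class _Writer:
        def write(self, s):
            if s:
                q.put(str(s))
        def flush(self):
            pass

    def _run():
        # Process-wide stdout redirection, mirroring app.py's approach.
        with redirect_stdout(_Writer()):
            work_fn()

    t = threading.Thread(target=_run, daemon=True)
    t.start()

    buffer = ""
    while t.is_alive() or not q.empty():
        try:
            buffer += q.get(timeout=0.1)
            yield buffer  # cumulative log, like the progress_log textbox
        except queue.Empty:
            time.sleep(0.05)

def demo_work():
    for i in range(3):
        print(f"step {i}", flush=True)

for snapshot in stream_worker_output(demo_work):
    last = snapshot
print(last)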
 
 
delete_episodes.py ADDED
@@ -0,0 +1,332 @@
+ import os
+ import re
+ import sys
+ import glob
+ import json
+ import logging
+ import shutil
+ import subprocess
+ from pathlib import Path
+ from typing import List, Optional, Tuple
+
+ from huggingface_hub import snapshot_download, upload_folder, create_repo
+ import pandas as pd
+
+
+ logger = logging.getLogger(__name__)
+ if not logger.handlers:
+     logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
+
+
+ def _enable_hf_transfer():
+     """Enable hf_transfer acceleration by setting HF_HUB_ENABLE_HF_TRANSFER=1."""
+     if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER") != "1":
+         os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+         logger.info("Enabled hf_transfer acceleration (HF_HUB_ENABLE_HF_TRANSFER=1)")
+
+
+ def download_dataset(
+     repo_id: str,
+     local_dir: str,
+     hf_token: Optional[str] = None,
+ ) -> str:
+     """Download a Hugging Face dataset by repo_id.
+
+     Returns the local directory path.
+     """
+     _enable_hf_transfer()
+
+     local_path = Path(local_dir)
+     local_path.mkdir(parents=True, exist_ok=True)
+
+     logger.info(f"Downloading dataset '{repo_id}' to '{local_dir}' ...")
+
+     path = snapshot_download(
+         repo_id=repo_id,
+         repo_type="dataset",
+         token=hf_token,
+         local_dir=str(local_dir),
+         local_dir_use_symlinks=False,
+     )
+
+     logger.info(f"Downloaded: {repo_id} -> {path}")
+     return str(local_path)
+
+
+ def check_v2_format(dataset_path: str) -> bool:
+     """Check if dataset is in v2.0 format"""
+     info_path = os.path.join(dataset_path, "meta", "info.json")
+
+     if not os.path.exists(info_path):
+         raise ValueError(f"Error: {info_path} does not exist")
+
+     with open(info_path, "r") as f:
+         try:
+             info = json.load(f)
+             if "codebase_version" not in info:
+                 raise ValueError(f"Error: {info_path} is not a valid v2.0 dataset")
+             elif info["codebase_version"] != "v2.0":
+                 raise ValueError(
+                     f"Error: {info_path} is not a v2.0 dataset, found {info['codebase_version']}"
+                 )
+             return True
+         except json.JSONDecodeError:
+             raise ValueError(f"Error: {info_path} is not a valid JSON file")
+
+
+ def list_episodes(dataset_path: str) -> List[int]:
+     """List all episode numbers in the dataset"""
+     parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
+
+     if not os.path.exists(parquets_folder):
+         return []
+
+     parquet_files = glob.glob(os.path.join(parquets_folder, "episode_*.parquet"))
+
+     episode_numbers = []
+     for file in parquet_files:
+         match = re.search(r"episode_(\d+)\.parquet", file)
+         if match:
+             episode_numbers.append(int(match.group(1)))
+
+     return sorted(episode_numbers)
+
+
+ def delete_ds_store(dataset_path: str):
+     """Delete all .DS_Store files in the given dataset path and its subdirectories"""
+     logger.info("Deleting .DS_Store files...")
+     ds_store_files = glob.glob(
+         os.path.join(dataset_path, "**", ".DS_Store"), recursive=True
+     )
+
+     if not ds_store_files:
+         logger.info("No .DS_Store files found")
+         return
+
+     for file in ds_store_files:
+         os.remove(file)
+         logger.info(f"Deleted {file}")
+
+     logger.info(".DS_Store files deleted")
+
+
+ def delete_episode_files(dataset_path: str, indexes: List[int]):
+     """Delete parquet and video files for specified episode indexes"""
+     parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
+     videos_folder = os.path.join(dataset_path, "videos", "chunk-000")
+
+     # Delete parquet files
+     logger.info("Deleting parquet files...")
+     parquet_files = glob.glob(os.path.join(parquets_folder, "*.parquet"))
+     for index in indexes:
+         for file in parquet_files:
+             if f"episode_{index:06d}.parquet" in file:
+                 os.remove(file)
+                 logger.info(f"Deleted file {file}")
+
+     # Delete video files
+     logger.info("Deleting video files...")
+     if os.path.exists(videos_folder):
+         video_folders = os.listdir(videos_folder)
+         for index in indexes:
+             for folder in video_folders:
+                 video_files = glob.glob(
+                     os.path.join(videos_folder, folder, f"episode_{index:06d}.mp4")
+                 )
+                 for video_file in video_files:
+                     os.remove(video_file)
+                     logger.info(f"Deleted file {video_file}")
+
+
+ def process_parquet_files(dataset_path: str):
+     """Process all parquet files by correcting the episode_index column"""
+     parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
+     videos_folder = os.path.join(dataset_path, "videos", "chunk-000")
+
+     logger.info("Processing parquet files...")
+     parquet_files = glob.glob(os.path.join(parquets_folder, "episode_*.parquet"))
+
+     if not parquet_files:
+         logger.info(f"No parquet files found in {parquets_folder}")
+         return
+
+     logger.info(f"Found {len(parquet_files)} parquet files to process")
+
+     # Order files by episode number
+     parquet_files.sort(
+         key=lambda x: int(re.search(r"episode_(\d+)\.parquet", x).group(1))
+     )
+
+     # Check if episode numbers are continuous
+     episode_numbers = [
+         int(re.search(r"episode_(\d+)\.parquet", file).group(1))
+         for file in parquet_files
+     ]
+     episode_numbers.sort()
+
+     # Get video folders if they exist
+     video_folders = []
+     if os.path.exists(videos_folder):
+         video_folders = os.listdir(videos_folder)
+
+     if episode_numbers != list(range(len(episode_numbers))):
+         logger.info(
+             "Episode numbers are not continuous or starting from 0. Renaming files and videos..."
+         )
+         for i, file in enumerate(parquet_files):
+             new_episode_number = i
+             new_file = os.path.join(
+                 parquets_folder, f"episode_{new_episode_number:06d}.parquet"
+             )
+             os.rename(file, new_file)
+             logger.info(f"Renamed {file} to {new_file}")
+
+             # Rename corresponding video files
+             for folder in video_folders:
+                 video_file = os.path.join(
+                     videos_folder, folder, f"episode_{episode_numbers[i]:06d}.mp4"
+                 )
+                 new_video_file = os.path.join(
+                     videos_folder, folder, f"episode_{new_episode_number:06d}.mp4"
+                 )
+                 if os.path.exists(video_file):
+                     os.rename(video_file, new_video_file)
+                     logger.info(f"Renamed {video_file} to {new_video_file}")
+
+         # Update list after renaming
+         parquet_files = glob.glob(os.path.join(parquets_folder, "episode_*.parquet"))
+         parquet_files.sort(
+             key=lambda x: int(re.search(r"episode_(\d+)\.parquet", x).group(1))
+         )
+         logger.info("Updated parquet files list after renaming")
+
+     # Process each parquet file
+     total_index = 0
+     for file_path in parquet_files:
+         filename = os.path.basename(file_path)
+         match = re.search(r"episode_(\d+)\.parquet", filename)
+
+         if match:
+             episode_number = int(match.group(1))
+             logger.info(f"Processing {filename} - Episode {episode_number}")
+
+             try:
+                 df = pd.read_parquet(file_path, engine="pyarrow")
+
+                 df["episode_index"] = episode_number
+                 df["frame_index"] = range(len(df))
+                 df["index"] = range(total_index, total_index + len(df))
+                 total_index += len(df)
+
+                 df.to_parquet(file_path, index=False)
+                 logger.info(f"Successfully updated {filename}")
+
+             except Exception as e:
+                 raise RuntimeError(f"Error processing {filename}: {str(e)}")
+         else:
+             logger.info(f"Skipping {filename} - doesn't match expected pattern")
+
+     logger.info("Parquet processing complete")
+
+
+ def run_stats_computation(dataset_path: str):
+     """Run the lerobot stats computation script"""
+     script_path = "lerobot_stats_compute.py"
+
+     if not os.path.exists(script_path):
+         logger.warning(f"Stats script '{script_path}' not found, skipping stats computation")
+         return
+
+     logger.info("Running lerobot_stats_compute.py...")
+
+     try:
+         subprocess.run(
+             ["uv", "run", script_path, "--dataset-path", dataset_path],
+             check=True,
+         )
+         logger.info(f"Successfully executed {script_path}")
+     except subprocess.CalledProcessError as e:
+         logger.warning(f"Error executing stats script: {str(e)}")
+     except FileNotFoundError:
+         logger.warning("uv not found, skipping stats computation")
+
+
+ def delete_episodes_and_repair(
+     dataset_path: str,
+     episode_indexes: List[int],
+     run_stats: bool = True,
+ ) -> str:
+     """Delete specified episodes and repair the dataset.
+
+     Args:
+         dataset_path: Path to the dataset
+         episode_indexes: List of episode indexes to delete
+         run_stats: Whether to run stats computation after repair
+
+     Returns:
+         Path to the repaired dataset
+     """
+     if not episode_indexes:
+         raise ValueError("No episode indexes provided for deletion")
+
+     # Check v2.0 format
+     check_v2_format(dataset_path)
+
+     logger.info(f"Deleting episodes: {episode_indexes}")
+
+     # Delete .DS_Store files
+     delete_ds_store(dataset_path)
+
+     # Delete episode files
+     delete_episode_files(dataset_path, episode_indexes)
+
+     # Process and repair remaining parquet files
+     process_parquet_files(dataset_path)
+
+     # Run stats computation
+     if run_stats:
+         run_stats_computation(dataset_path)
+
+     logger.info("Episode deletion and repair complete")
+     return dataset_path
+
+
+ def upload_dataset(
+     local_dir: str,
+     dest_repo_id: str,
+     hf_token: Optional[str] = None,
+     commit_message: Optional[str] = None,
+     private: bool = False,
+ ) -> str:
+     """Upload a local dataset folder to a destination HF dataset repo.
+
+     Returns the repo URL/identifier.
+     """
+     if not dest_repo_id:
+         raise ValueError("dest_repo_id must be provided")
+
+     token = hf_token or os.environ.get("HF_TOKEN")
+     create_repo(
+         repo_id=dest_repo_id,
+         repo_type="dataset",
+         private=private,
+         exist_ok=True,
+         token=token,
+     )
+
+     _enable_hf_transfer()
+     msg = commit_message or "Updated dataset after episode deletion"
+     logger.info(f"Uploading '{local_dir}' to '{dest_repo_id}' (private={private}) ...")
+
+     upload_folder(
+         repo_id=dest_repo_id,
+         repo_type="dataset",
+         folder_path=local_dir,
+         path_in_repo=".",
+         commit_message=msg,
+         token=token,
+     )
+
+     logger.info(f"Uploaded to: {dest_repo_id}")
+     return dest_repo_id
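For reference, the functions above compose into a simple script-level workflow. This is a hypothetical sketch using only the signatures defined in delete_episodes.py; the repo IDs are placeholders, and HF_TOKEN is read from the environment as in app.py.

# Hypothetical end-to-end usage of delete_episodes.py outside the Gradio UI.
import os
import tempfile

from delete_episodes import (
    download_dataset,
    list_episodes,
    delete_episodes_and_repair,
    upload_dataset,
)

token = os.environ.get("HF_TOKEN")
work_dir = tempfile.mkdtemp(prefix="episode_delete_")

# Download the source dataset and inspect it.
download_dataset("your-org/source-dataset", work_dir, hf_token=token)
print("Episodes:", list_episodes(work_dir))

# Drop episodes 0 and 2, then renumber and reindex the remaining files.
delete_episodes_and_repair(work_dir, episode_indexes=[0, 2], run_stats=False)

# Push the repaired dataset to a destination repo.
upload_dataset(
    local_dir=work_dir,
    dest_repo_id="your-org/cleaned-dataset",
    hf_token=token,
    commit_message="Deleted episodes: [0, 2]",
)
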
requirements.txt ADDED
@@ -0,0 +1,73 @@
+ aiofiles==24.1.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.15
+ aiosignal==1.4.0
+ annotated-types==0.7.0
+ anyio==4.11.0
+ attrs==25.3.0
+ Brotli==1.1.0
+ certifi==2025.8.3
+ charset-normalizer==3.4.3
+ click==8.3.0
+ datasets==4.1.1
+ dill==0.4.0
+ fastapi==0.117.1
+ ffmpy==0.6.1
+ filelock==3.19.1
+ frozenlist==1.7.0
+ fsspec==2025.9.0
+ gradio==5.47.0
+ gradio_client==1.13.2
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.1.10
+ hf_transfer==0.1.9
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.35.1
+ idna==3.10
+ inquirerpy==0.3.4
+ Jinja2==3.1.6
+ markdown-it-py==4.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ multidict==6.6.4
+ multiprocess==0.70.16
+ numpy==2.2.6
+ orjson==3.11.3
+ packaging==25.0
+ pandas==2.3.2
+ pfzy==0.3.4
+ pillow==11.3.0
+ prompt_toolkit==3.0.52
+ propcache==0.3.2
+ pyarrow==21.0.0
+ pydantic==2.11.9
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ requests==2.32.5
+ rich==14.1.0
+ ruff==0.13.1
+ safehttpx==0.1.6
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ starlette==0.48.0
+ tomlkit==0.13.3
+ tqdm==4.67.1
+ typer==0.19.2
+ typing-inspection==0.4.1
+ typing_extensions==4.15.0
+ tzdata==2025.2
+ urllib3==2.5.0
+ uvicorn==0.37.0
+ wcwidth==0.2.14
+ websockets==15.0.1
+ xxhash==3.5.0
+ yarl==1.20.1