SuveenE committed on
Commit
abb873c
·
1 Parent(s): 828363b

Update episodes.jsonl and episodes_stats.jsonl

Browse files
Files changed (1) hide show
  1. delete_episodes.py +60 -0
delete_episodes.py CHANGED
@@ -165,6 +165,63 @@ def delete_ds_store(dataset_path: str):
165
  logger.info(".DS_Store files deleted")
166
 
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  def delete_episode_files(dataset_path: str, indexes: List[int]):
169
  """Delete parquet and video files for specified episode indexes"""
170
  parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
@@ -335,6 +392,9 @@ def delete_episodes_and_repair(
335
  # Delete episode files
336
  delete_episode_files(dataset_path, episode_indexes)
337
 
 
 
 
338
  # Process and repair remaining parquet files
339
  process_parquet_files(dataset_path)
340
 
 
165
  logger.info(".DS_Store files deleted")
166
 
167
 
168
def update_meta_jsonl_files(dataset_path: str, indexes_to_delete: List[int]):
    """Update episodes.jsonl and episodes_stats.jsonl after episode deletion.

    Removes the records whose ``episode_index`` is in *indexes_to_delete*
    from both metadata JSONL files under ``<dataset_path>/meta``, then
    re-numbers the surviving records contiguously from 0 so the indexes
    stay consistent with the renamed data/video files.

    Args:
        dataset_path: Root folder of the dataset (metadata lives in meta/).
        indexes_to_delete: Episode indexes to drop from both files.
    """
    meta_folder = os.path.join(dataset_path, "meta")
    # Build the set once: O(1) membership tests instead of an O(k) list
    # scan for every JSONL record.
    to_delete = set(indexes_to_delete)

    def _rewrite_jsonl(path: str, label: str, noun: str) -> None:
        # Shared logic for both metadata files: filter out deleted
        # episodes, re-index the survivors, and rewrite the file in place.
        if not os.path.exists(path):
            logger.warning(f"{label} not found at {path}")
            return
        logger.info(f"Updating {label}...")
        records = []
        with open(path, "r") as f:
            for line in f:
                line = line.strip()
                if line:  # Skip empty lines
                    record = json.loads(line)
                    if record["episode_index"] not in to_delete:
                        records.append(record)
        # Re-index so episode_index values remain contiguous from 0
        # after the deletions.
        for new_index, record in enumerate(records):
            record["episode_index"] = new_index
        with open(path, "w") as f:
            for record in records:
                f.write(json.dumps(record) + "\n")
        logger.info(f"Updated {label}: {len(records)} {noun} remaining")

    _rewrite_jsonl(
        os.path.join(meta_folder, "episodes.jsonl"),
        "episodes.jsonl",
        "episodes",
    )
    _rewrite_jsonl(
        os.path.join(meta_folder, "episodes_stats.jsonl"),
        "episodes_stats.jsonl",
        "episode stats",
    )
225
  def delete_episode_files(dataset_path: str, indexes: List[int]):
226
  """Delete parquet and video files for specified episode indexes"""
227
  parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
 
392
  # Delete episode files
393
  delete_episode_files(dataset_path, episode_indexes)
394
 
395
+ # Update meta JSONL files (episodes.jsonl and episodes_stats.jsonl)
396
+ update_meta_jsonl_files(dataset_path, episode_indexes)
397
+
398
  # Process and repair remaining parquet files
399
  process_parquet_files(dataset_path)
400