Update episodes.jsonl and episodes_stats.jsonl
delete_episodes.py (CHANGED: +60 -0)
@@ -165,6 +165,63 @@ def delete_ds_store(dataset_path: str):
     logger.info(".DS_Store files deleted")
 
 
+def update_meta_jsonl_files(dataset_path: str, indexes_to_delete: List[int]):
+    """Update episodes.jsonl and episodes_stats.jsonl by removing deleted episodes and re-indexing"""
+    meta_folder = os.path.join(dataset_path, "meta")
+    episodes_file = os.path.join(meta_folder, "episodes.jsonl")
+    episodes_stats_file = os.path.join(meta_folder, "episodes_stats.jsonl")
+
+    # Process episodes.jsonl
+    if os.path.exists(episodes_file):
+        logger.info("Updating episodes.jsonl...")
+        episodes = []
+        with open(episodes_file, "r") as f:
+            for line in f:
+                line = line.strip()
+                if line:  # Skip empty lines
+                    episode = json.loads(line)
+                    if episode["episode_index"] not in indexes_to_delete:
+                        episodes.append(episode)
+
+        # Re-index episodes
+        for new_index, episode in enumerate(episodes):
+            episode["episode_index"] = new_index
+
+        # Write back
+        with open(episodes_file, "w") as f:
+            for episode in episodes:
+                f.write(json.dumps(episode) + "\n")
+
+        logger.info(f"Updated episodes.jsonl: {len(episodes)} episodes remaining")
+    else:
+        logger.warning(f"episodes.jsonl not found at {episodes_file}")
+
+    # Process episodes_stats.jsonl
+    if os.path.exists(episodes_stats_file):
+        logger.info("Updating episodes_stats.jsonl...")
+        stats = []
+        with open(episodes_stats_file, "r") as f:
+            for line in f:
+                line = line.strip()
+                if line:  # Skip empty lines
+                    stat = json.loads(line)
+                    if stat["episode_index"] not in indexes_to_delete:
+                        stats.append(stat)
+
+        # Re-index stats
+        for new_index, stat in enumerate(stats):
+            stat["episode_index"] = new_index
+
+        # Write back
+        with open(episodes_stats_file, "w") as f:
+            for stat in stats:
+                f.write(json.dumps(stat) + "\n")
+
+        logger.info(f"Updated episodes_stats.jsonl: {len(stats)} episode stats remaining")
+    else:
+        logger.warning(f"episodes_stats.jsonl not found at {episodes_stats_file}")
+
+
 def delete_episode_files(dataset_path: str, indexes: List[int]):
     """Delete parquet and video files for specified episode indexes"""
     parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
@@ -335,6 +392,9 @@ def delete_episodes_and_repair(
     # Delete episode files
     delete_episode_files(dataset_path, episode_indexes)
 
+    # Update meta JSONL files (episodes.jsonl and episodes_stats.jsonl)
+    update_meta_jsonl_files(dataset_path, episode_indexes)
+
     # Process and repair remaining parquet files
     process_parquet_files(dataset_path)
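A possible post-repair sanity check; "data/chunk-000" mirrors the path used by delete_episode_files above, while the episode_XXXXXX.parquet naming is only an assumed convention:

    # check_repair.py -- hypothetical consistency check after delete_episodes_and_repair
    import json
    import os

    def check_meta_matches_data(dataset_path: str) -> None:
        """Assert episodes.jsonl is contiguously indexed and matches the parquet count on disk."""
        with open(os.path.join(dataset_path, "meta", "episodes.jsonl")) as f:
            indexes = [json.loads(line)["episode_index"] for line in f if line.strip()]
        assert indexes == list(range(len(indexes))), "episodes.jsonl is not re-indexed 0..N-1"

        parquets_folder = os.path.join(dataset_path, "data", "chunk-000")
        parquet_count = sum(1 for p in os.listdir(parquets_folder) if p.endswith(".parquet"))
        assert parquet_count == len(indexes), "meta entry count does not match parquet files"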