"""Create a cleaned dataset by removing samples with abrupt audio cutoffs.

This script uses the results written by detect_audio_cutoffs.py
(``audio_cutoff_analysis.json``). Every entry of the original train and
validation JSONL splits whose audio path is listed under ``good_samples`` is
kept: its audio file is copied into ``elise_cleaned/wavs`` and the entry is
rewritten to point at the absolute path of that copy.
"""

import json
import os
import shutil

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the script usable without it.
    def tqdm(iterable, **_kwargs):
        """Fallback used when tqdm is not installed: return *iterable* as-is."""
        return iterable


ANALYSIS_PATH = 'audio_cutoff_analysis.json'
TRAIN_SPLIT_PATH = "jinsaryko_elise_formatted/elise_train_split.jsonl"
VAL_SPLIT_PATH = "jinsaryko_elise_formatted/elise_val.jsonl"
WAVS_DIR = "elise_cleaned/wavs"
TRAIN_OUTPUT_PATH = "elise_cleaned/train_split.jsonl"
VAL_OUTPUT_PATH = "elise_cleaned/val.jsonl"


def _filter_split(split_path, good_paths, desc, *, overwrite):
    """Return the cleaned entries of one JSONL split, copying their audio.

    Args:
        split_path: JSONL file whose lines are objects with ``text`` and
            ``audio`` keys.
        good_paths: Set of audio paths to keep.
        desc: Label shown on the progress bar.
        overwrite: When true, always copy the audio file; when false, skip the
            copy if the destination file already exists.

    Returns:
        A list of ``{"text": ..., "audio": <absolute path of the copy>}`` dicts
        in the order the kept entries appear in the split.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks the ``audio`` key, or a kept entry lacks
            the ``text`` key.
        OSError: If a kept audio file cannot be copied (e.g. it is missing).
    """
    kept = []
    with open(split_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=desc):
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue
            entry = json.loads(line)
            audio_path = entry['audio']
            if audio_path not in good_paths:
                continue

            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"{WAVS_DIR}/{basename}")
            if overwrite or not os.path.exists(new_audio_path):
                shutil.copy2(audio_path, new_audio_path)

            kept.append({
                "text": entry['text'],
                "audio": new_audio_path,
            })
    return kept


def _write_jsonl(path, entries):
    """Write *entries* to *path*, one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)


def main():
    """Build the ``elise_cleaned`` dataset from the cutoff analysis."""
    print("Creating cleaned dataset from cutoff analysis...")

    with open(ANALYSIS_PATH, 'r', encoding='utf-8') as f:
        analysis = json.load(f)

    good_samples = analysis['good_samples']
    print(f"Found {len(good_samples)} good samples out of {analysis['total_samples']} total")

    # Build the lookup once: a set gives O(1) membership tests, instead of
    # rebuilding and scanning a list of every good path for each split line.
    good_paths = {s['audio_path'] for s in good_samples}

    # Also creates the parent "elise_cleaned" directory.
    os.makedirs(WAVS_DIR, exist_ok=True)

    # The train split always (re)copies its audio; the validation split leaves
    # an already-present destination file untouched.
    train_good = _filter_split(
        TRAIN_SPLIT_PATH, good_paths, "Processing train split", overwrite=True
    )
    val_good = _filter_split(
        VAL_SPLIT_PATH, good_paths, "Processing validation split", overwrite=False
    )

    _write_jsonl(TRAIN_OUTPUT_PATH, train_good)
    _write_jsonl(VAL_OUTPUT_PATH, val_good)

    print("\nCleaned dataset created!")
    print(f"Training samples: {len(train_good)}")
    print(f"Validation samples: {len(val_good)}")
    print("Files saved in elise_cleaned/")


if __name__ == "__main__":
    main()