File size: 2,675 Bytes
import json
import os
import shutil
from tqdm import tqdm
# This script creates a cleaned dataset by removing samples with abrupt cutoffs.
# It uses the results from detect_audio_cutoffs.py, read from
# 'audio_cutoff_analysis.json'. That file must provide:
#   - 'good_samples': a list of dicts, each with an 'audio_path' key
#   - 'total_samples': the total number of analysed samples
#
# Outputs (relative to the current working directory):
#   - elise_cleaned/wavs/             copies of the retained audio files
#   - elise_cleaned/train_split.jsonl retained training entries
#   - elise_cleaned/val.jsonl         retained validation entries

OUTPUT_DIR = "elise_cleaned"
WAV_DIR = f"{OUTPUT_DIR}/wavs"


def _collect_good_entries(split_path, good_paths, desc):
    """Copy the good samples of one JSONL split and return their new entries.

    Each non-blank line of ``split_path`` is parsed as a JSON object with
    'audio' and 'text' keys. Entries whose 'audio' value is in ``good_paths``
    have their audio file copied into ``WAV_DIR`` (keeping the base name) and
    are returned with 'audio' rewritten to the absolute path of the copy.

    Args:
        split_path: Path of the JSONL split file to read.
        good_paths: Set of audio paths that passed the cutoff analysis.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A list of ``{"text": ..., "audio": ...}`` dicts, in input order.
    """
    kept = []
    with open(split_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=desc):
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue

            entry = json.loads(line)
            audio_path = entry['audio']
            if audio_path not in good_paths:
                continue

            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"{WAV_DIR}/{basename}")
            # Always copy, for every split, so that a rerun refreshes files
            # left over from an earlier run instead of silently reusing them.
            shutil.copy2(audio_path, new_audio_path)

            kept.append({
                "text": entry['text'],
                "audio": new_audio_path,
            })
    return kept


def _write_jsonl(path, entries):
    """Write ``entries`` to ``path``, one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)


print("Creating cleaned dataset from cutoff analysis...")

# Read the cutoff analysis
with open('audio_cutoff_analysis.json', 'r', encoding='utf-8') as f:
    analysis = json.load(f)

# Get good samples
good_samples = analysis['good_samples']
print(f"Found {len(good_samples)} good samples out of {analysis['total_samples']} total")

# Build the lookup set once; testing membership against a freshly built list
# for every input line would be quadratic in the dataset size.
good_audio_paths = {s['audio_path'] for s in good_samples}

# Create output directory (also creates the parent OUTPUT_DIR)
os.makedirs(WAV_DIR, exist_ok=True)

# Process both splits with the same filtering and copying rules
train_good = _collect_good_entries(
    "jinsaryko_elise_formatted/elise_train_split.jsonl",
    good_audio_paths,
    "Processing train split",
)
val_good = _collect_good_entries(
    "jinsaryko_elise_formatted/elise_val.jsonl",
    good_audio_paths,
    "Processing validation split",
)

# Save cleaned datasets
_write_jsonl(f"{OUTPUT_DIR}/train_split.jsonl", train_good)
_write_jsonl(f"{OUTPUT_DIR}/val.jsonl", val_good)

print("\nCleaned dataset created!")
print(f"Training samples: {len(train_good)}")
print(f"Validation samples: {len(val_good)}")
print(f"Files saved in {OUTPUT_DIR}/")