# Vibevoice_1_5_lora / create_cleaned_dataset.py
# Hosting-page header text captured with this file:
#   "DevParker's picture" / "Upload 8 files" / "86e8346 verified"
import json
import os
import shutil
from tqdm import tqdm
# This script creates a cleaned dataset by removing samples with abrupt cutoffs.
# It uses the results from detect_audio_cutoffs.py (audio_cutoff_analysis.json).

ANALYSIS_PATH = 'audio_cutoff_analysis.json'
TRAIN_SPLIT_PATH = "jinsaryko_elise_formatted/elise_train_split.jsonl"
VAL_SPLIT_PATH = "jinsaryko_elise_formatted/elise_val.jsonl"
OUTPUT_DIR = "elise_cleaned"
WAVS_DIR = "elise_cleaned/wavs"


def filter_split(lines, good_paths, wavs_dir, overwrite=True):
    """Keep the entries of one JSONL split whose audio is a known-good sample.

    Each kept entry's audio file is copied into ``wavs_dir`` (under its
    original basename) and the entry is rewritten to point at the copy.

    Args:
        lines: Iterable of JSONL lines; each non-blank line must decode to an
            object with ``'audio'`` and ``'text'`` keys.
        good_paths: Container of audio paths considered good. Pass a ``set``
            so each membership test is O(1).
        wavs_dir: Existing directory that receives the copied audio files.
        overwrite: When true, always copy (replacing an existing file).
            When false, skip the copy if the destination already exists.

    Returns:
        A list of ``{"text": ..., "audio": <absolute path of the copy>}``
        dicts, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``'audio'``, or a kept entry lacks
            ``'text'``.
        OSError: If copying an audio file fails.
    """
    kept = []
    for line in lines:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        entry = json.loads(line)
        audio_path = entry['audio']
        if audio_path not in good_paths:
            continue
        basename = os.path.basename(audio_path)
        new_audio_path = os.path.abspath(os.path.join(wavs_dir, basename))
        if overwrite or not os.path.exists(new_audio_path):
            shutil.copy2(audio_path, new_audio_path)
        # Only text and the relocated audio path are carried over.
        kept.append({"text": entry['text'], "audio": new_audio_path})
    return kept


def write_jsonl(path, entries):
    """Write ``entries`` to ``path`` as one JSON object per line (UTF-8)."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)


def main():
    """Build ``elise_cleaned/`` from the cutoff analysis and original splits."""
    print("Creating cleaned dataset from cutoff analysis...")

    # Read the cutoff analysis
    with open(ANALYSIS_PATH, 'r', encoding='utf-8') as f:
        analysis = json.load(f)

    # Get good samples
    good_samples = analysis['good_samples']
    print(f"Found {len(good_samples)} good samples out of {analysis['total_samples']} total")

    # Build the lookup once instead of rebuilding a list for every input line.
    good_paths = {s['audio_path'] for s in good_samples}

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(WAVS_DIR, exist_ok=True)

    # Train split: audio files are always (re)copied.
    with open(TRAIN_SPLIT_PATH, 'r', encoding='utf-8') as f:
        train_good = filter_split(
            tqdm(f, desc="Processing train split"), good_paths, WAVS_DIR
        )

    # Validation split: a file already present in the output is left alone.
    with open(VAL_SPLIT_PATH, 'r', encoding='utf-8') as f:
        val_good = filter_split(
            tqdm(f, desc="Processing validation split"),
            good_paths,
            WAVS_DIR,
            overwrite=False,
        )

    # Save cleaned datasets
    write_jsonl("elise_cleaned/train_split.jsonl", train_good)
    write_jsonl("elise_cleaned/val.jsonl", val_good)

    print("\nCleaned dataset created!")
    print(f"Training samples: {len(train_good)}")
    print(f"Validation samples: {len(val_good)}")
    print("Files saved in elise_cleaned/")


if __name__ == "__main__":
    main()