File size: 2,675 Bytes

86e8346

import json
import os
import shutil
from tqdm import tqdm

# This script creates a cleaned dataset by removing samples with abrupt cutoffs
# It uses the results from detect_audio_cutoffs.py

print("Creating cleaned dataset from cutoff analysis...")

# Load the cutoff analysis report (JSON) from the working directory
with open('audio_cutoff_analysis.json', 'r') as report_file:
    analysis = json.load(report_file)

# Pull out the samples the analysis marked as good and report the counts
good_samples = analysis['good_samples']
total_sample_count = analysis['total_samples']
print(f"Found {len(good_samples)} good samples out of {total_sample_count} total")

# Make sure the output tree exists: the dataset root, then its wavs/ folder
for output_dir in ("elise_cleaned", "elise_cleaned/wavs"):
    os.makedirs(output_dir, exist_ok=True)

# Process train split
train_good = []
val_good = []

# Build the lookup once: a set gives O(1) membership tests, instead of
# rebuilding and scanning a list of every good path for each JSONL line.
good_audio_paths = {sample['audio_path'] for sample in good_samples}

# Read original train data and keep only the entries marked as good
with open(
    "jinsaryko_elise_formatted/elise_train_split.jsonl",
    'r',
    encoding="utf-8",
) as f:
    for line in tqdm(f, desc="Processing train split"):
        # Tolerate blank lines (e.g. a stray empty line at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue

        entry = json.loads(line)
        audio_path = entry['audio']

        # Skip samples that the cutoff analysis did not list as good
        if audio_path not in good_audio_paths:
            continue

        # Copy the audio file into the cleaned dataset, keeping its file name
        basename = os.path.basename(audio_path)
        new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
        shutil.copy2(audio_path, new_audio_path)

        # Re-point the entry at the copied file (stored as an absolute path)
        train_good.append({
            "text": entry['text'],
            "audio": new_audio_path,
        })

# Build the lookup once: a set gives O(1) membership tests, instead of
# rebuilding and scanning a list of every good path for each JSONL line.
good_audio_paths = {sample['audio_path'] for sample in good_samples}

# Read original validation data and keep only the entries marked as good
with open(
    "jinsaryko_elise_formatted/elise_val.jsonl",
    'r',
    encoding="utf-8",
) as f:
    for line in tqdm(f, desc="Processing validation split"):
        # Tolerate blank lines (e.g. a stray empty line at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue

        entry = json.loads(line)
        audio_path = entry['audio']

        # Skip samples that the cutoff analysis did not list as good
        if audio_path not in good_audio_paths:
            continue

        # Copy the audio file only when the destination is not already present
        basename = os.path.basename(audio_path)
        new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
        if not os.path.exists(new_audio_path):
            shutil.copy2(audio_path, new_audio_path)

        # Re-point the entry at the copied file (stored as an absolute path)
        val_good.append({
            "text": entry['text'],
            "audio": new_audio_path,
        })

# Write each cleaned split as JSON Lines: one JSON object per line
for jsonl_path, kept_entries in (
    ("elise_cleaned/train_split.jsonl", train_good),
    ("elise_cleaned/val.jsonl", val_good),
):
    with open(jsonl_path, 'w') as out_file:
        out_file.writelines(json.dumps(item) + '\n' for item in kept_entries)

# Final summary of what was kept and where it was written
summary_lines = [
    "\nCleaned dataset created!",
    f"Training samples: {len(train_good)}",
    f"Validation samples: {len(val_good)}",
    "Files saved in elise_cleaned/",
]
for summary_line in summary_lines:
    print(summary_line)