|
|
import json |
|
|
import os |
|
|
import shutil |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the cutoff-analysis results and report how many samples survived.
print("Creating cleaned dataset from cutoff analysis...")

# JSON is specified as UTF-8; be explicit so the read does not depend on the
# platform's default locale encoding.
with open('audio_cutoff_analysis.json', 'r', encoding='utf-8') as f:
    analysis = json.load(f)

# Entries the analysis marked as free of audio cutoff; each is expected to
# carry an 'audio_path' key (used by the split-filtering loops below).
good_samples = analysis['good_samples']
print(f"Found {len(good_samples)} good samples out of {analysis['total_samples']} total")
|
|
|
|
|
|
|
|
# os.makedirs creates intermediate directories, so a single call covers both
# "elise_cleaned" and "elise_cleaned/wavs"; exist_ok makes reruns safe.
os.makedirs("elise_cleaned/wavs", exist_ok=True)

# Filtered {"text", "audio"} entries for each split, accumulated below.
train_good = []
val_good = []
|
|
|
|
|
|
|
|
# Hoist the good-sample paths into a set once: the original rebuilt a list
# comprehension (O(m)) for every JSONL line, making the loop O(n*m);
# set membership is O(1) per test.
good_audio_paths = {s['audio_path'] for s in good_samples}

# Filter the training split: keep only entries whose audio passed the cutoff
# analysis, copying each kept wav into elise_cleaned/wavs.
with open("jinsaryko_elise_formatted/elise_train_split.jsonl", 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Processing train split"):
        entry = json.loads(line)
        audio_path = entry['audio']

        if audio_path in good_audio_paths:
            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
            # Skip the copy when the file is already in place (e.g. a rerun);
            # mirrors the existing guard in the validation-split loop.
            if not os.path.exists(new_audio_path):
                shutil.copy2(audio_path, new_audio_path)

            # Record the entry with its relocated (absolute) audio path.
            train_good.append({
                "text": entry['text'],
                "audio": new_audio_path,
            })
|
|
|
|
|
|
|
|
# Same O(1)-membership fix as the training loop: precompute the set of good
# audio paths instead of rebuilding a list for every line (was O(n*m)).
good_audio_paths = {s['audio_path'] for s in good_samples}

# Filter the validation split the same way, copying kept wavs into
# elise_cleaned/wavs alongside the training files.
with open("jinsaryko_elise_formatted/elise_val.jsonl", 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Processing validation split"):
        entry = json.loads(line)
        audio_path = entry['audio']

        if audio_path in good_audio_paths:
            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
            # The train split (or a previous run) may already have copied this
            # file; don't redo the copy.
            if not os.path.exists(new_audio_path):
                shutil.copy2(audio_path, new_audio_path)

            # Record the entry with its relocated (absolute) audio path.
            val_good.append({
                "text": entry['text'],
                "audio": new_audio_path,
            })
|
|
|
|
|
|
|
|
# Write the filtered splits as JSON Lines (one JSON object per line).
# writelines batches the output instead of one write call per entry.
with open("elise_cleaned/train_split.jsonl", 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(entry) + '\n' for entry in train_good)

with open("elise_cleaned/val.jsonl", 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(entry) + '\n' for entry in val_good)

# Summary; plain strings where no interpolation happens (the originals were
# f-strings with no placeholders).
print("\nCleaned dataset created!")
print(f"Training samples: {len(train_good)}")
print(f"Validation samples: {len(val_good)}")
print("Files saved in elise_cleaned/")