Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Validate that all required files exist for training""" | |
| import argparse | |
| import glob | |
| import os | |
| from tqdm import tqdm | |
def _sidecar_path(wav_path, suffix):
    """Return the companion-file path for ``wav_path`` with ``suffix`` appended to its stem."""
    # splitext only strips the final extension, so a '.wav' substring that
    # appears in a directory name is left untouched (str.replace would
    # rewrite every occurrence in the path).
    stem, _ext = os.path.splitext(wav_path)
    return stem + suffix


def _print_missing_sample(label, items, limit=5):
    """Print the first ``limit`` entries of ``items`` under a heading; do nothing when empty."""
    if not items:
        return
    print(f"\nFirst {limit} missing {label}:")
    for item in items[:limit]:
        print(f" {item}")


def validate_data(src_dir: str) -> bool:
    """Check that every WAV file under ``src_dir`` has its companion files.

    For each ``<stem>.wav`` the following siblings are expected:
    ``<stem>.normalized.txt``, ``<stem>_embedding.pt`` and
    ``<stem>_tokens.pt``. The speaker id is taken as the part of the file
    name before the first underscore, and a ``<spk>_embedding.pt`` file is
    looked up in ``<src_dir>/spk_embeddings`` for each speaker.

    A summary and up to five examples per category are printed to stdout.

    Args:
        src_dir: Directory containing audio files.

    Returns:
        True when no text, embedding or token file is missing. Missing
        speaker embeddings are reported but do not affect the result, and
        a directory without any WAV file also yields True.
    """
    # Prefer the two-level layout; fall back to a recursive search.
    wav_files = glob.glob(os.path.join(src_dir, '*/*/*.wav'))
    if not wav_files:
        wav_files = glob.glob(os.path.join(src_dir, '**/*.wav'), recursive=True)
    print(f"Found {len(wav_files)} WAV files")

    missing_txt = []
    missing_embedding = []
    missing_token = []
    speakers = set()

    for wav_path in tqdm(wav_files, desc="Validating files"):
        if not os.path.exists(_sidecar_path(wav_path, '.normalized.txt')):
            missing_txt.append(wav_path)
        if not os.path.exists(_sidecar_path(wav_path, '_embedding.pt')):
            missing_embedding.append(wav_path)
        if not os.path.exists(_sidecar_path(wav_path, '_tokens.pt')):
            missing_token.append(wav_path)

        # Speaker id is the prefix of the utterance name up to the first '_'.
        utt = os.path.splitext(os.path.basename(wav_path))[0]
        speakers.add(utt.split('_')[0])

    # Sorted so the report is stable from run to run (set order is not).
    spk_embed_dir = os.path.join(src_dir, 'spk_embeddings')
    if os.path.isdir(spk_embed_dir):
        missing_spk_embedding = [
            spk
            for spk in sorted(speakers)
            if not os.path.exists(
                os.path.join(spk_embed_dir, f'{spk}_embedding.pt')
            )
        ]
    else:
        print(f"Speaker embedding directory not found: {spk_embed_dir}")
        missing_spk_embedding = sorted(speakers)

    print("\n=== Validation Results ===")
    print(f"Total WAV files: {len(wav_files)}")
    print(f"Total speakers: {len(speakers)}")
    print(f"Missing text files: {len(missing_txt)}")
    print(f"Missing embedding files: {len(missing_embedding)}")
    print(f"Missing token files: {len(missing_token)}")
    print(f"Missing speaker embeddings: {len(missing_spk_embedding)}")

    _print_missing_sample("text files", missing_txt)
    _print_missing_sample("embedding files", missing_embedding)
    _print_missing_sample("token files", missing_token)
    _print_missing_sample("speaker embeddings", missing_spk_embedding)

    # NOTE(review): speaker embeddings are deliberately left out of the
    # result to keep the original contract -- confirm whether a missing
    # speaker embedding should fail validation.
    return not (missing_txt or missing_embedding or missing_token)
if __name__ == "__main__":
    # Command-line entry point: validate the directory given by --src_dir
    # and exit with status 0 when validate_data() returns True, 1 otherwise.
    parser = argparse.ArgumentParser()
    parser.add_argument('--src_dir', type=str, required=True,
                        help='Source directory to validate')
    args = parser.parse_args()
    success = validate_data(args.src_dir)
    # The bare `exit` name is injected by the `site` module for interactive
    # use and is absent under `python -S` or in frozen builds; raising
    # SystemExit sets the same exit status without relying on it.
    raise SystemExit(0 if success else 1)