"""Build a pipe-separated training manifest from per-speaker JSON metadata."""

import json
import os
from pathlib import Path

# Default locations, used when the caller does not supply its own paths.
DEFAULT_JSON_DIR = Path("/root/piper_msa/Json_dic")
DEFAULT_AUDIO_BASE_DIR = Path("/root/piper_msa/raw_audio")
DEFAULT_OUTPUT_FILE = Path("/root/piper_msa/training_data.csv")


def _collect_speaker_lines(samples, audio_dir):
    """Return the ``audio_path|text`` lines for one speaker's samples.

    Samples lacking an ``audio_file`` or ``text`` value are skipped, as are
    samples whose audio file does not exist under *audio_dir* (a warning is
    printed for the latter).
    """
    lines = []
    for sample in samples:
        audio_file = sample.get('audio_file')
        text = sample.get('text')
        if not audio_file or not text:
            continue

        # The output holds one record per line, so a line break inside the
        # text would split a sample in two; replace line breaks with spaces.
        text = " ".join(str(text).splitlines())
        if not text:
            continue

        # Add .wav extension if not present
        if not audio_file.endswith('.wav'):
            audio_file = f"{audio_file}.wav"

        audio_path = audio_dir / audio_file
        if not audio_path.exists():
            print(f"Warning: Audio file not found: {audio_path}")
            continue

        # Format: /full/path/to/audio.wav|Text content
        lines.append(f"{audio_path}|{text}")
    return lines


def create_training_file(json_dir=None, audio_base_dir=None, output_file=None,
                         speaker_ids=range(1, 6)):
    """Write a ``audio_path|text`` manifest covering every speaker.

    For each speaker number ``N`` in *speaker_ids*, the file
    ``<json_dir>/SPK<N>_phoneme_data.json`` is read and each entry of its
    ``train_samples`` list is paired with ``<audio_base_dir>/SPK<N>/<audio_file>``.
    Speakers whose JSON file or audio directory is missing are skipped with a
    warning.

    Args:
        json_dir: Directory holding the per-speaker JSON files. Defaults to
            ``/root/piper_msa/Json_dic``.
        audio_base_dir: Directory holding one ``SPK<N>`` sub-directory per
            speaker. Defaults to ``/root/piper_msa/raw_audio``.
        output_file: Path of the manifest to write. Defaults to
            ``/root/piper_msa/training_data.csv``.
        speaker_ids: Iterable of speaker numbers to process. Defaults to
            1 through 5.

    Returns:
        A ``(output_file, sample_count)`` tuple: the manifest path as a
        ``Path`` and the number of lines written to it.
    """
    json_dir = DEFAULT_JSON_DIR if json_dir is None else Path(json_dir)
    audio_base_dir = (
        DEFAULT_AUDIO_BASE_DIR if audio_base_dir is None else Path(audio_base_dir)
    )
    output_file = DEFAULT_OUTPUT_FILE if output_file is None else Path(output_file)

    training_lines = []

    for spk_num in speaker_ids:
        json_file = json_dir / f"SPK{spk_num}_phoneme_data.json"
        audio_dir = audio_base_dir / f"SPK{spk_num}"

        if not json_file.exists():
            print(f"Warning: {json_file} not found, skipping...")
            continue

        if not audio_dir.exists():
            print(f"Warning: {audio_dir} not found, skipping...")
            continue

        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Treat an explicit null the same as a missing key.
        samples = data.get('train_samples') or []
        training_lines.extend(_collect_speaker_lines(samples, audio_dir))

        # Reports the number of samples listed in the JSON, not the number kept.
        print(f"Processed SPK{spk_num}: {len(samples)} samples")

    # Write to output file in CSV format (pipe-separated)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(training_lines))

    print(f"\nTraining file created: {output_file}")
    print(f"Total samples: {len(training_lines)}")

    return output_file, len(training_lines)


if __name__ == "__main__":
    output_file, total_samples = create_training_file()
    print(f"\nDone! Created {output_file} with {total_samples} training samples.")