| | import json |
| | import os |
| | from pathlib import Path |
| |
|
def create_training_file(
    json_dir="/root/piper_msa/Json_dic",
    audio_base_dir="/root/piper_msa/raw_audio",
    output_file="/root/piper_msa/training_data.csv",
    speaker_ids=range(1, 6),
):
    """Build a pipe-delimited ``audio_path|text`` training list.

    For every speaker id ``N`` in *speaker_ids* this reads
    ``<json_dir>/SPK{N}_phoneme_data.json`` and walks the entries under its
    ``train_samples`` key. An entry contributes one output line when it has
    non-empty ``audio_file`` and ``text`` values and the referenced WAV file
    exists in ``<audio_base_dir>/SPK{N}``. A ``.wav`` suffix is appended to
    ``audio_file`` when it is missing.

    Speakers whose JSON file or audio directory is absent are skipped with a
    warning, as are samples whose audio file cannot be found.

    Args:
        json_dir: Directory holding the per-speaker JSON files.
        audio_base_dir: Directory holding one ``SPK{N}`` sub-directory per
            speaker.
        output_file: Path of the file to write; overwritten if it exists.
        speaker_ids: Iterable of speaker numbers to process. Defaults to
            1 through 5.

    Returns:
        A ``(output_path, line_count)`` tuple, where ``output_path`` is a
        ``pathlib.Path`` and ``line_count`` is the number of lines written.

    Raises:
        json.JSONDecodeError: If a speaker JSON file is not valid JSON.
        OSError: If a JSON file cannot be read or the output file cannot be
            written.
    """
    # Accept plain strings as well as Path objects.
    json_dir = Path(json_dir)
    audio_base_dir = Path(audio_base_dir)
    output_file = Path(output_file)

    training_lines = []

    for spk_num in speaker_ids:
        json_file = json_dir / f"SPK{spk_num}_phoneme_data.json"
        audio_dir = audio_base_dir / f"SPK{spk_num}"

        if not json_file.exists():
            print(f"Warning: {json_file} not found, skipping...")
            continue

        if not audio_dir.exists():
            print(f"Warning: {audio_dir} not found, skipping...")
            continue

        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        samples = data.get("train_samples", [])
        added = 0

        for sample in samples:
            audio_file = sample.get("audio_file")
            text = sample.get("text")

            # Entries missing either field are silently ignored.
            if not (audio_file and text):
                continue

            if not audio_file.endswith(".wav"):
                audio_file = f"{audio_file}.wav"

            audio_path = audio_dir / audio_file
            if not audio_path.exists():
                print(f"Warning: Audio file not found: {audio_path}")
                continue

            # The output is one record per line, so a line break inside the
            # transcript would split the record; flatten it to a space.
            text = (
                str(text)
                .replace("\r\n", " ")
                .replace("\n", " ")
                .replace("\r", " ")
            )

            training_lines.append(f"{audio_path}|{text}")
            added += 1

        # Report how many entries were actually written, not just how many
        # were listed in the JSON file.
        print(f"Processed SPK{spk_num}: {added} of {len(samples)} samples")

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(training_lines))

    print(f"\nTraining file created: {output_file}")
    print(f"Total samples: {len(training_lines)}")

    return output_file, len(training_lines)
| |
|
if __name__ == "__main__":
    # Script entry point: build the training list with the default
    # locations, then echo where it was written and how many lines it holds.
    summary = create_training_file()
    output_file, total_samples = summary
    print(f"\nDone! Created {output_file} with {total_samples} training samples.")
| |
|