| """ |
| Metadata Generator Module |
| Generates SoulX-Singer metadata from score and voice samples |
| """ |
|
|
| import os |
| import numpy as np |
| import soundfile as sf |
| from typing import Dict, List, Optional |
| from .score_parser import SOLFEGE_SYLLABLES |
|
|
|
|
def prepare_voice_samples(
    voice_mode: str,
    user_samples: Optional[Dict[str, str]],
    enable_denoise: bool
) -> Dict[str, np.ndarray]:
    """
    Load one mono audio array per solfege syllable.

    Args:
        voice_mode: UI label of the selected voice. The built-in child voice
            is selected by exactly one of "Child Voice (Built-in)",
            "童声音色 (内置)" or "子供の声 (内蔵)"; any other value falls back
            to ``user_samples``.
        user_samples: Dict mapping syllable to audio file path. Only used
            when the built-in voice is not selected; entries that are
            missing, empty, or point to a non-existent file are skipped
            without a warning.
        enable_denoise: Whether to run ``apply_denoise`` on each user
            recording. Built-in samples are never denoised.

    Returns:
        Dict mapping syllable to audio array. Syllables whose file could
        not be found are absent from the result. The sample rate reported
        by ``sf.read`` is not returned.
    """
    from .config import get_default_voice_path

    builtin_labels = (
        "Child Voice (Built-in)",
        "童声音色 (内置)",
        "子供の声 (内蔵)",
    )
    loaded = {}

    if voice_mode in builtin_labels:
        voice_dir = get_default_voice_path()

        for name in SOLFEGE_SYLLABLES:
            # Built-in files are named after the capitalized syllable.
            wav_path = os.path.join(voice_dir, name.capitalize() + '.wav')

            if not os.path.exists(wav_path):
                print(f"Warning: Default voice file not found: {wav_path}")
                continue

            waveform, _rate = sf.read(wav_path)

            # Collapse multi-channel audio by averaging over axis 1
            # (the channel axis in soundfile's default layout).
            if waveform.ndim > 1:
                waveform = waveform.mean(axis=1)
            loaded[name] = waveform

    elif user_samples:
        for name in SOLFEGE_SYLLABLES:
            wav_path = user_samples.get(name)

            if not wav_path or not os.path.exists(wav_path):
                continue

            waveform, rate = sf.read(wav_path)

            if waveform.ndim > 1:
                waveform = waveform.mean(axis=1)

            if enable_denoise:
                waveform = apply_denoise(waveform, rate)

            loaded[name] = waveform

    return loaded
|
|
|
|
def apply_denoise(audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """
    Denoise audio with the optional ``noisereduce`` package.

    ``noisereduce.reduce_noise`` is called with ``prop_decrease=0.5``. If an
    ImportError is raised (typically because the package is not installed),
    a warning is printed and the input is returned unchanged.

    Args:
        audio: Audio array
        sample_rate: Sample rate

    Returns:
        Denoised audio, or the original ``audio`` object when denoising
        could not be performed
    """
    try:
        import noisereduce
        cleaned = noisereduce.reduce_noise(
            y=audio,
            sr=sample_rate,
            prop_decrease=0.5,
        )
    except ImportError:
        print("Warning: noisereduce not installed, skipping denoising")
        cleaned = audio
    return cleaned
|
|
|
|
def generate_metadata_for_voices(
    voices: List[Dict],
    voice_samples: Dict[str, np.ndarray]
) -> List[Dict]:
    """
    Build one SoulX-Singer metadata entry per voice.

    Args:
        voices: List of voice data from score parser. Each entry is read
            for its 'notes', 'id' and 'instrument' keys.
        voice_samples: Dict of syllable -> audio array

    Returns:
        List of dicts, in the same order as ``voices``, each with the keys
        'voice_id', 'instrument', 'prompt_audio' and 'target'.
    """
    def build_entry(voice: Dict) -> Dict:
        # Prompt audio and target metadata are both derived from the
        # voice's note list.
        voice_notes = voice['notes']
        prompt = create_prompt_audio(voice_notes, voice_samples)
        target = create_target_metadata(voice_notes)
        return {
            'voice_id': voice['id'],
            'instrument': voice['instrument'],
            'prompt_audio': prompt,
            'target': target,
        }

    return [build_entry(voice) for voice in voices]
|
|
|
|
def create_prompt_audio(
    notes: List[Dict],
    voice_samples: Dict[str, np.ndarray],
    sample_rate: int = 44100,
) -> np.ndarray:
    """
    Create prompt audio by concatenating voice samples.

    Strategy:
    - Collect the distinct solfege syllables of the first 10 notes (in order
      of first appearance) that have a recorded sample.
    - If fewer than 3 were found, top up from SOLFEGE_SYLLABLES until 3 are
      available (or the recorded samples run out).
    - Concatenate the samples of at most 5 syllables, each followed by
      50 ms of silence (including after the last one).

    Args:
        notes: List of notes for this voice; each note is read for its
            'solfege' key
        voice_samples: Dict of syllable -> audio array
        sample_rate: Sample rate in Hz used to size the silent gap and the
            silence fallback. Samples are NOT resampled; this should match
            the rate of the arrays in ``voice_samples``. Defaults to 44100,
            the value that was previously hard-coded.

    Returns:
        Concatenated prompt audio. If no syllable could be selected, the
        first array in ``voice_samples`` is returned as-is; if
        ``voice_samples`` is empty, one second of silence is returned.
    """
    # Distinct, recorded syllables from the opening notes.
    chosen = []
    for note in notes[:10]:
        syllable = note['solfege']
        if syllable in voice_samples and syllable not in chosen:
            chosen.append(syllable)

    # Top up to three syllables so the prompt is not built from one sound.
    if len(chosen) < 3:
        for syllable in SOLFEGE_SYLLABLES:
            if syllable in voice_samples and syllable not in chosen:
                chosen.append(syllable)
                if len(chosen) >= 3:
                    break

    if chosen:
        # One shared 50 ms block of silence, appended after every sample.
        gap = np.zeros(int(sample_rate * 0.05))
        segments = []
        for syllable in chosen[:5]:
            segments.append(voice_samples[syllable])
            segments.append(gap)
        return np.concatenate(segments)

    # Nothing matched: fall back to any available sample, else silence.
    fallback = next(iter(voice_samples.values()), None)
    if fallback is not None:
        return fallback

    return np.zeros(int(sample_rate))
|
|
|
|
def create_target_metadata(notes: List[Dict]) -> Dict:
    """
    Assemble the per-note target description for SoulX-Singer.

    Each note is read for its 'solfege', 'midi' and 'duration' keys. The
    solfege is converted with ``solfege_to_phoneme``; pitch and duration are
    copied through unchanged, and every note is given note type 1.

    Args:
        notes: List of notes

    Returns:
        Dict with parallel lists 'phoneme', 'note_pitch', 'note_duration'
        and 'note_type' (one element per note), plus 'duration', the sum of
        all note durations. Intended as input for DataProcessor.preprocess().
    """
    rows = []
    for entry in notes:
        syllable = entry['solfege']
        pitch = entry['midi']
        length = entry['duration']
        rows.append((solfege_to_phoneme(syllable), pitch, length))

    lengths = [length for _, _, length in rows]

    return {
        'phoneme': [phoneme for phoneme, _, _ in rows],
        'note_pitch': [pitch for _, pitch, _ in rows],
        'note_duration': lengths,
        'note_type': [1] * len(rows),
        'duration': sum(lengths),
    }
|
|
|
|
# Solfege syllable -> "en_"-prefixed, dash-separated English phoneme string.
_SOLFEGE_PHONEMES = {
    'do': 'en_D-OW1',
    're': 'en_R-EY1',
    'mi': 'en_M-IY1',
    'fa': 'en_F-AA1',
    'sol': 'en_S-OW1-L',
    'la': 'en_L-AA1',
    'ti': 'en_T-IY1',
}

# Phoneme string returned for any syllable missing from the table ("do").
_FALLBACK_PHONEME = 'en_D-OW1'


def solfege_to_phoneme(solfege: str) -> str:
    """
    Map a solfege syllable to its English phoneme string.

    The result has the form "en_P1-P2-P3": a single leading "en_" prefix
    followed by dash-separated phonemes, e.g. "sol" -> "en_S-OW1-L".
    data_processor.preprocess() is expected to strip the leading "en_",
    split on "-", and re-prefix each phoneme itself (so "en_D-OW1" becomes
    ["en_D", "en_OW1"]); the phonemes after the first therefore carry no
    "en_" prefix here.

    Args:
        solfege: Solfege syllable (do, re, mi, fa, sol, la, ti). The lookup
            is exact, so the syllable must be lowercase.

    Returns:
        Phoneme string compatible with data_processor.preprocess(). Any
        syllable not in the table yields the phonemes for "do"
        ("en_D-OW1").
    """
    try:
        return _SOLFEGE_PHONEMES[solfege]
    except KeyError:
        return _FALLBACK_PHONEME
|
|