# SolfegeScore-Singer-01 / backend/metadata_generator.py
# Last update: commit c052e4f (verified) by JeffreyZhou798 — "Update backend/metadata_generator.py"
"""
Metadata Generator Module
Generates SoulX-Singer metadata from score and voice samples
"""
import os
import numpy as np
import soundfile as sf
from typing import Dict, List, Optional
from .score_parser import SOLFEGE_SYLLABLES
# UI labels that select the built-in child voice (English / Chinese / Japanese).
_BUILTIN_VOICE_LABELS = ("Child Voice (Built-in)", "童声音色 (内置)", "子供の声 (内蔵)")


def _read_mono(file_path: str):
    """Read an audio file; return (mono_audio, sample_rate).

    Multi-channel input is down-mixed to mono by averaging channels.
    """
    audio, sr = sf.read(file_path)
    if len(audio.shape) > 1:
        audio = audio.mean(axis=1)
    return audio, sr


def prepare_voice_samples(
    voice_mode: str,
    user_samples: Optional[Dict[str, str]],
    enable_denoise: bool
) -> Dict[str, np.ndarray]:
    """
    Prepare solfege voice samples for synthesis.

    Args:
        voice_mode: UI label selecting the voice source. One of the built-in
            child-voice labels triggers loading of bundled samples; any other
            value selects the user-recording path.
        user_samples: Dict mapping solfege syllable to a recorded audio file path.
        enable_denoise: Whether to denoise user recordings (built-in samples
            are never denoised).

    Returns:
        Dict mapping syllable to a mono audio array. Syllables whose files are
        missing are skipped, so the result may be empty.
    """
    samples: Dict[str, np.ndarray] = {}
    if voice_mode in _BUILTIN_VOICE_LABELS:
        # Imported lazily so the user-recording path works even if the config
        # package (and the bundled voice files it points at) is unavailable.
        from .config import get_default_voice_path
        default_path = get_default_voice_path()
        for syllable in SOLFEGE_SYLLABLES:
            # Built-in files are named with a capitalized syllable, e.g. "Do.wav".
            file_path = os.path.join(default_path, syllable.capitalize() + '.wav')
            if os.path.exists(file_path):
                samples[syllable], _ = _read_mono(file_path)
            else:
                print(f"Warning: Default voice file not found: {file_path}")
    elif user_samples:
        # Load user-recorded samples; entries with missing paths are skipped.
        for syllable in SOLFEGE_SYLLABLES:
            file_path = user_samples.get(syllable)
            if file_path and os.path.exists(file_path):
                audio, sr = _read_mono(file_path)
                if enable_denoise:
                    audio = apply_denoise(audio, sr)
                samples[syllable] = audio
    return samples
def apply_denoise(audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """
    Conservatively denoise *audio* using the optional ``noisereduce`` package.

    Args:
        audio: Input audio samples.
        sample_rate: Sample rate of *audio* in Hz.

    Returns:
        The denoised signal, or the input unchanged when ``noisereduce``
        is not installed.
    """
    try:
        import noisereduce as nr
    except ImportError:
        print("Warning: noisereduce not installed, skipping denoising")
        return audio
    # prop_decrease=0.5 keeps the reduction conservative (half strength).
    return nr.reduce_noise(y=audio, sr=sample_rate, prop_decrease=0.5)
def generate_metadata_for_voices(
    voices: List[Dict],
    voice_samples: Dict[str, np.ndarray]
) -> List[Dict]:
    """
    Build one SoulX-Singer metadata entry per voice.

    Args:
        voices: Voice dicts from the score parser; each needs 'id',
            'instrument' and 'notes'.
        voice_samples: Dict mapping solfege syllable to audio array.

    Returns:
        List of metadata dicts, each with 'voice_id', 'instrument',
        'prompt_audio' and 'target'.
    """
    return [
        {
            'voice_id': voice['id'],
            'instrument': voice['instrument'],
            # Prompt audio is stitched together from the solfege samples.
            'prompt_audio': create_prompt_audio(voice['notes'], voice_samples),
            # Target describes what SoulX-Singer should actually sing.
            'target': create_target_metadata(voice['notes']),
        }
        for voice in voices
    ]
def create_prompt_audio(
    notes: List[Dict],
    voice_samples: Dict[str, np.ndarray],
    sample_rate: int = 44100,
    gap_seconds: float = 0.05
) -> np.ndarray:
    """
    Create prompt audio by concatenating voice samples.

    Strategy:
    - Use the first few notes' solfege to build a representative prompt
      (aiming for roughly 3-5 seconds of audio).

    Args:
        notes: List of notes for this voice (each with a 'solfege' key).
        voice_samples: Dict mapping solfege syllable to audio array.
        sample_rate: Sample rate in Hz used to size silence segments
            (default 44100, the rate previously hard-coded here).
        gap_seconds: Length of the silent gap inserted after each sample.

    Returns:
        Concatenated prompt audio; falls back to any available sample, or
        one second of silence if no samples exist.
    """
    # Collect distinct solfeges from the first few notes, preserving order.
    solfeges: List[str] = []
    for note in notes[:10]:
        syl = note['solfege']
        if syl not in solfeges and syl in voice_samples:
            solfeges.append(syl)
    # Pad out to at least 3 distinct syllables using any remaining samples.
    if len(solfeges) < 3:
        for syllable in SOLFEGE_SYLLABLES:
            if syllable not in solfeges and syllable in voice_samples:
                solfeges.append(syllable)
                if len(solfeges) >= 3:
                    break
    # Gap is loop-invariant, so allocate the silence buffer once.
    gap = np.zeros(int(sample_rate * gap_seconds))
    segments: List[np.ndarray] = []
    for syllable in solfeges[:5]:
        # Every entry in solfeges was membership-checked on insertion above.
        segments.append(voice_samples[syllable])
        segments.append(gap)
    if segments:
        return np.concatenate(segments)
    # Fallback: first available sample, else 1 second of silence.
    fallback = next(iter(voice_samples.values()), None)
    if fallback is not None:
        return fallback
    return np.zeros(sample_rate)
def create_target_metadata(notes: List[Dict]) -> Dict:
    """
    Build the target metadata dict consumed by DataProcessor.preprocess().

    Args:
        notes: Note dicts with 'solfege', 'midi' and 'duration' (seconds).

    Returns:
        Dict with parallel 'phoneme', 'note_pitch', 'note_duration' and
        'note_type' lists, plus the total 'duration' in seconds.
    """
    durations = [note['duration'] for note in notes]
    return {
        # Simplified phonemes derived from each note's solfege syllable.
        'phoneme': [solfege_to_phoneme(note['solfege']) for note in notes],
        'note_pitch': [note['midi'] for note in notes],
        # Durations stay in SECONDS — DataProcessor.preprocess expects seconds.
        'note_duration': durations,
        # 1 marks a regular (sung) note.
        'note_type': [1] * len(notes),
        'duration': sum(durations)
    }
# ARPAbet phonemes for each solfege syllable, hoisted to module level so the
# mapping is built once instead of being recreated on every call.
# Stress marker 1 (primary stress) suits sustained singing vowels.
# Format: en_P1-P2-P3 (data_processor splits on "-" and re-prefixes each part).
_SOLFEGE_TO_PHONEME = {
    'do': 'en_D-OW1',     # d + ow      -> en_D, en_OW1
    're': 'en_R-EY1',     # r + ey      -> en_R, en_EY1
    'mi': 'en_M-IY1',     # m + iy      -> en_M, en_IY1
    'fa': 'en_F-AA1',     # f + aa      -> en_F, en_AA1
    'sol': 'en_S-OW1-L',  # s + ow + l  -> en_S, en_OW1, en_L
    'la': 'en_L-AA1',     # l + aa      -> en_L, en_AA1
    'ti': 'en_T-IY1',     # t + iy      -> en_T, en_IY1
}


def solfege_to_phoneme(solfege: str) -> str:
    """
    Convert a solfege syllable to an English ARPAbet phoneme string.

    Format: en_Phoneme for a single phoneme, en_P1-P2-P3 for multiple
    (dash-separated, internal phonemes NOT individually prefixed).
    data_processor.preprocess() will:
    1. Detect the "en_" prefix and remove it,
    2. Split on "-" to get individual phonemes,
    3. Re-add the "en_" prefix to each phoneme.
    Example: "en_D-OW1" -> "D-OW1" -> ["D", "OW1"] -> ["en_D", "en_OW1"].

    Args:
        solfege: Solfege syllable (do, re, mi, fa, sol, la, ti).

    Returns:
        Phoneme string compatible with data_processor.preprocess();
        unknown syllables fall back to 'do' ("en_D-OW1").
    """
    return _SOLFEGE_TO_PHONEME.get(solfege, 'en_D-OW1')