# SolfegeScore-Singer-01 / backend/metadata_generator.py
# Last update: commit c052e4f (verified) by JeffreyZhou798 — "Update backend/metadata_generator.py"
"""
Metadata Generator Module
Generates SoulX-Singer metadata from score and voice samples
"""
import os
import numpy as np
import soundfile as sf
from typing import Dict, List, Optional
from .score_parser import SOLFEGE_SYLLABLES
# UI labels that select the built-in child voice (English / Chinese / Japanese).
_BUILTIN_VOICE_LABELS = ("Child Voice (Built-in)", "童声音色 (内置)", "子供の声 (内蔵)")


def _read_mono(file_path: str):
    """Read an audio file; return (mono_audio, sample_rate).

    Multi-channel input is down-mixed to mono by averaging channels.
    """
    audio, sr = sf.read(file_path)
    if len(audio.shape) > 1:
        audio = audio.mean(axis=1)
    return audio, sr


def prepare_voice_samples(
    voice_mode: str,
    user_samples: Optional[Dict[str, str]],
    enable_denoise: bool
) -> Dict[str, np.ndarray]:
    """
    Prepare solfege voice samples for synthesis.

    Args:
        voice_mode: UI label selecting the voice source. One of the built-in
            child-voice labels triggers loading of bundled samples; any other
            value selects the user-recording path.
        user_samples: Dict mapping solfege syllable to a recorded audio file path.
        enable_denoise: Whether to denoise user recordings (built-in samples
            are never denoised).

    Returns:
        Dict mapping syllable to a mono audio array. Syllables whose files are
        missing are skipped, so the result may be empty.
    """
    samples: Dict[str, np.ndarray] = {}
    if voice_mode in _BUILTIN_VOICE_LABELS:
        # Imported lazily so the user-recording path works even if the config
        # package (and the bundled voice files it points at) is unavailable.
        from .config import get_default_voice_path
        default_path = get_default_voice_path()
        for syllable in SOLFEGE_SYLLABLES:
            # Built-in files are named with a capitalized syllable, e.g. "Do.wav".
            file_path = os.path.join(default_path, syllable.capitalize() + '.wav')
            if os.path.exists(file_path):
                samples[syllable], _ = _read_mono(file_path)
            else:
                print(f"Warning: Default voice file not found: {file_path}")
    elif user_samples:
        # Load user-recorded samples; entries with missing paths are skipped.
        for syllable in SOLFEGE_SYLLABLES:
            file_path = user_samples.get(syllable)
            if file_path and os.path.exists(file_path):
                audio, sr = _read_mono(file_path)
                if enable_denoise:
                    audio = apply_denoise(audio, sr)
                samples[syllable] = audio
    return samples
def apply_denoise(audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """
    Conservatively denoise *audio* using the optional ``noisereduce`` package.

    Args:
        audio: Input audio samples.
        sample_rate: Sample rate of *audio* in Hz.

    Returns:
        The denoised signal, or the input unchanged when ``noisereduce``
        is not installed.
    """
    try:
        import noisereduce as nr
    except ImportError:
        print("Warning: noisereduce not installed, skipping denoising")
        return audio
    # prop_decrease=0.5 keeps the reduction conservative (half strength).
    return nr.reduce_noise(y=audio, sr=sample_rate, prop_decrease=0.5)
def generate_metadata_for_voices(
    voices: List[Dict],
    voice_samples: Dict[str, np.ndarray]
) -> List[Dict]:
    """
    Build one SoulX-Singer metadata entry per voice.

    Args:
        voices: Voice dicts from the score parser; each needs 'id',
            'instrument' and 'notes'.
        voice_samples: Dict mapping solfege syllable to audio array.

    Returns:
        List of metadata dicts, each with 'voice_id', 'instrument',
        'prompt_audio' and 'target'.
    """
    return [
        {
            'voice_id': voice['id'],
            'instrument': voice['instrument'],
            # Prompt audio is stitched together from the solfege samples.
            'prompt_audio': create_prompt_audio(voice['notes'], voice_samples),
            # Target describes what SoulX-Singer should actually sing.
            'target': create_target_metadata(voice['notes']),
        }
        for voice in voices
    ]
def create_prompt_audio(
    notes: List[Dict],
    voice_samples: Dict[str, np.ndarray],
    sample_rate: int = 44100,
    gap_seconds: float = 0.05
) -> np.ndarray:
    """
    Create prompt audio by concatenating voice samples.

    Strategy:
    - Use the first few notes' solfege to build a representative prompt
      (aiming for roughly 3-5 seconds of audio).

    Args:
        notes: List of notes for this voice (each with a 'solfege' key).
        voice_samples: Dict mapping solfege syllable to audio array.
        sample_rate: Sample rate in Hz used to size silence segments
            (default 44100, the rate previously hard-coded here).
        gap_seconds: Length of the silent gap inserted after each sample.

    Returns:
        Concatenated prompt audio; falls back to any available sample, or
        one second of silence if no samples exist.
    """
    # Collect distinct solfeges from the first few notes, preserving order.
    solfeges: List[str] = []
    for note in notes[:10]:
        syl = note['solfege']
        if syl not in solfeges and syl in voice_samples:
            solfeges.append(syl)
    # Pad out to at least 3 distinct syllables using any remaining samples.
    if len(solfeges) < 3:
        for syllable in SOLFEGE_SYLLABLES:
            if syllable not in solfeges and syllable in voice_samples:
                solfeges.append(syllable)
                if len(solfeges) >= 3:
                    break
    # Gap is loop-invariant, so allocate the silence buffer once.
    gap = np.zeros(int(sample_rate * gap_seconds))
    segments: List[np.ndarray] = []
    for syllable in solfeges[:5]:
        # Every entry in solfeges was membership-checked on insertion above.
        segments.append(voice_samples[syllable])
        segments.append(gap)
    if segments:
        return np.concatenate(segments)
    # Fallback: first available sample, else 1 second of silence.
    fallback = next(iter(voice_samples.values()), None)
    if fallback is not None:
        return fallback
    return np.zeros(sample_rate)
def create_target_metadata(notes: List[Dict]) -> Dict:
    """
    Build the target metadata dict consumed by DataProcessor.preprocess().

    Args:
        notes: Note dicts with 'solfege', 'midi' and 'duration' (seconds).

    Returns:
        Dict with parallel 'phoneme', 'note_pitch', 'note_duration' and
        'note_type' lists, plus the total 'duration' in seconds.
    """
    durations = [note['duration'] for note in notes]
    return {
        # Simplified phonemes derived from each note's solfege syllable.
        'phoneme': [solfege_to_phoneme(note['solfege']) for note in notes],
        'note_pitch': [note['midi'] for note in notes],
        # Durations stay in SECONDS — DataProcessor.preprocess expects seconds.
        'note_duration': durations,
        # 1 marks a regular (sung) note.
        'note_type': [1] * len(notes),
        'duration': sum(durations)
    }
# ARPAbet phonemes for each solfege syllable, hoisted to module level so the
# mapping is built once instead of being recreated on every call.
# Stress marker 1 (primary stress) suits sustained singing vowels.
# Format: en_P1-P2-P3 (data_processor splits on "-" and re-prefixes each part).
_SOLFEGE_TO_PHONEME = {
    'do': 'en_D-OW1',     # d + ow      -> en_D, en_OW1
    're': 'en_R-EY1',     # r + ey      -> en_R, en_EY1
    'mi': 'en_M-IY1',     # m + iy      -> en_M, en_IY1
    'fa': 'en_F-AA1',     # f + aa      -> en_F, en_AA1
    'sol': 'en_S-OW1-L',  # s + ow + l  -> en_S, en_OW1, en_L
    'la': 'en_L-AA1',     # l + aa      -> en_L, en_AA1
    'ti': 'en_T-IY1',     # t + iy      -> en_T, en_IY1
}


def solfege_to_phoneme(solfege: str) -> str:
    """
    Convert a solfege syllable to an English ARPAbet phoneme string.

    Format: en_Phoneme for a single phoneme, en_P1-P2-P3 for multiple
    (dash-separated, internal phonemes NOT individually prefixed).
    data_processor.preprocess() will:
    1. Detect the "en_" prefix and remove it,
    2. Split on "-" to get individual phonemes,
    3. Re-add the "en_" prefix to each phoneme.
    Example: "en_D-OW1" -> "D-OW1" -> ["D", "OW1"] -> ["en_D", "en_OW1"].

    Args:
        solfege: Solfege syllable (do, re, mi, fa, sol, la, ti).

    Returns:
        Phoneme string compatible with data_processor.preprocess();
        unknown syllables fall back to 'do' ("en_D-OW1").
    """
    return _SOLFEGE_TO_PHONEME.get(solfege, 'en_D-OW1')