import numpy as np
import torch
import librosa
import soundfile as sf
import parselmouth
from parselmouth.praat import call
import os
from scipy.signal import resample


class SingingConverter:
    """Convert recorded speech into a singing rendition.

    Uses a DiffSinger model when available; otherwise falls back to a
    pure signal-processing pipeline (pitch modulation, time stretching,
    vibrato, formant enhancement).
    """

    def __init__(self):
        """Initialize the singing converter with DiffSinger model"""
        # In production, you would load the DiffSinger model here
        self.model_loaded = False
        try:
            # Load DiffSinger model (placeholder for actual implementation)
            self.model_loaded = True
            print("Singing converter initialized successfully")
        except Exception as e:
            print(f"Error loading DiffSinger model: {e}")
            print("Using fallback singing conversion method")

    def convert(self, speech_path, output_path, emotion, phonemes, durations,
                stress_markers, pitch_shift=0, tempo=100):
        """
        Convert speech to singing using the DiffSinger approach

        Args:
            speech_path (str): Path to input speech audio
            output_path (str): Path to save the singing audio
            emotion (str): Dominant emotion for singing style
            phonemes (str): Phonetic transcription of the text
            durations (list): Duration for each phoneme
            stress_markers (numpy.ndarray): Stress markers for emphasis
            pitch_shift (int): Pitch adjustment in semitones
            tempo (int): Tempo in BPM

        Returns:
            str: The output path that was written.
        """
        # sr=None preserves the file's native sample rate
        speech, sr = librosa.load(speech_path, sr=None)

        if self.model_loaded:
            # DiffSinger approach (placeholder for actual implementation)
            singing = self._apply_diffsinger(
                speech, sr, emotion, phonemes, durations,
                stress_markers, pitch_shift, tempo)
        else:
            # Fallback method using signal processing
            singing = self._apply_signal_processing(
                speech, sr, emotion, phonemes, durations,
                stress_markers, pitch_shift, tempo)

        # Save output at the input's sample rate
        sf.write(output_path, singing, sr)
        print(f"Singing audio saved to {output_path}")
        return output_path

    def _apply_diffsinger(self, speech, sr, emotion, phonemes, durations,
                          stress_markers, pitch_shift, tempo):
        """Apply DiffSinger model for speech-to-singing conversion"""
        # This is a placeholder for the actual DiffSinger implementation.
        # In a real implementation, you would pass the inputs to the model.
        # For now, we'll use our signal processing method.
        return self._apply_signal_processing(
            speech, sr, emotion, phonemes, durations,
            stress_markers, pitch_shift, tempo)

    def _apply_signal_processing(self, speech, sr, emotion, phonemes,
                                 durations, stress_markers, pitch_shift,
                                 tempo):
        """Apply signal processing techniques to convert speech to singing"""
        # 1. Extract the pitch contour with Praat (75-600 Hz search range)
        sound = parselmouth.Sound(speech, sr)
        pitch = call(sound, "To Pitch", 0.0, 75, 600)

        # Convert pitch to numpy array; unvoiced frames come back as NaN
        pitch_values = call(pitch, "To Matrix").as_array()
        pitch_values = np.nan_to_num(pitch_values)

        # 2. Apply pitch modulation based on emotion and stress
        modulated_pitch = self._modulate_pitch(
            pitch_values, emotion, stress_markers, pitch_shift)

        # 3. Apply duration changes for singing-like timing
        stretched_speech = self._adjust_timing(speech, sr, durations, tempo)

        # 4. Apply pitch changes to the audio. Pass the unmodulated contour
        # as a reference so the shift is the *ratio* between the two means.
        modified_speech = self._apply_pitch_changes(
            stretched_speech, sr, modulated_pitch, pitch_values)

        # 5. Add vibrato effect
        singing = self._add_vibrato(modified_speech, sr, emotion)

        # 6. Enhance formants for singing quality
        singing = self._enhance_formants(singing, sr)

        return singing

    def _modulate_pitch(self, pitch_values, emotion, stress_markers,
                        pitch_shift):
        """Modulate pitch contour based on emotion and stress"""
        # Base multiplier for pitch shifting (semitones to ratio)
        base_shift = 2 ** (pitch_shift / 12)

        # Emotion-based pitch modulation
        emotion_factors = {
            "Happy": 1.05,
            "Angry": 1.02,
            "Sad": 0.97,
            "Fear": 0.98,
            "Surprise": 1.1,
        }
        # Unknown emotions leave the pitch untouched
        emotion_factor = emotion_factors.get(emotion, 1.0)

        # Apply emotion factor and pitch shift
        modulated_pitch = pitch_values * emotion_factor * base_shift

        # Apply additional emphasis on stressed syllables.
        # In a real implementation, you would align stress_markers with
        # the pitch contour frames before boosting them.
        return modulated_pitch

    def _adjust_timing(self, speech, sr, durations, tempo):
        """Adjust timing of speech to make it more singing-like

        Raises:
            ValueError: If ``tempo`` is not a positive number.
        """
        if tempo <= 0:
            raise ValueError("tempo must be positive")

        # Convert tempo (BPM) to a playback-rate factor. Standard speech is
        # roughly equivalent to 100 BPM for rhythmical purposes.
        # NOTE: librosa.effects.time_stretch speeds audio UP when rate > 1,
        # so a tempo above the baseline must map to rate > 1 (the previous
        # 100 / tempo formula inverted the effect).
        stretch_factor = tempo / 100

        # Global time stretching
        y_stretched = librosa.effects.time_stretch(speech, rate=stretch_factor)

        # In a full implementation, you would use durations to create
        # more precise timing adjustments for each syllable.
        return y_stretched

    def _apply_pitch_changes(self, audio, sr, pitch_contour,
                             reference_contour=None):
        """Apply pitch changes to audio using a global pitch shift.

        This is a simplified stand-in for a true vocoder (WORLD/STRAIGHT).
        The shift applied is the semitone distance between the mean of the
        modulated contour and the mean of ``reference_contour`` — i.e.
        12 * log2 of the frequency ratio. (The previous version shifted by
        12 * log2(mean frequency) *absolutely*, ~90 semitones for typical
        voice — clearly unintended.)

        Args:
            audio (numpy.ndarray): Audio samples to shift.
            sr (int): Sample rate.
            pitch_contour (numpy.ndarray): Target (modulated) pitch values.
            reference_contour (numpy.ndarray | None): Original pitch values;
                if None, the audio is returned unshifted.
        """
        mean_pitch = np.mean(pitch_contour)
        mean_ref = np.mean(reference_contour) if reference_contour is not None else 0.0

        if mean_pitch > 0 and mean_ref > 0:
            semitones = 12 * np.log2(mean_pitch / mean_ref)
            return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitones)

        # No usable pitch information — leave the audio unchanged
        return audio

    def _add_vibrato(self, audio, sr, emotion):
        """Add vibrato effect to make sound more singing-like

        NOTE(review): this modulates amplitude, which is technically
        tremolo rather than pitch vibrato — kept as-is since it is the
        established fallback behavior.
        """
        # Vibrato parameters based on emotion
        vibrato_rate = 5.0  # Hz
        if emotion == "Sad":
            vibrato_depth = 0.3
            vibrato_rate = 3.0
        elif emotion == "Happy":
            vibrato_depth = 0.5
            vibrato_rate = 6.0
        else:
            vibrato_depth = 0.4

        # Generate vibrato signal (low-frequency sinusoidal modulator)
        t = np.arange(0, len(audio)) / sr
        vibrato = 1.0 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)

        # Apply vibrato
        audio_with_vibrato = audio * vibrato
        return audio_with_vibrato

    def _enhance_formants(self, audio, sr):
        """Enhance formants to make voice more singing-like"""
        # Simple EQ-based formant enhancement.
        # In a full implementation, you would use a more sophisticated
        # technique (e.g. LPC-based formant shifting).

        # Use librosa's HPSS to separate harmonic and percussive components
        harmonic, percussive = librosa.effects.hpss(audio)

        # Boost harmonic content for singing-like quality
        enhanced = harmonic * 1.5 + percussive * 0.7

        # Normalize to peak 1.0; guard against all-zero (silent) input,
        # which would otherwise divide by zero
        peak = np.max(np.abs(enhanced))
        if peak > 0:
            enhanced = enhanced / peak
        return enhanced