| import numpy as np |
| import torch |
| import librosa |
| import soundfile as sf |
| import parselmouth |
| from parselmouth.praat import call |
| import os |
| from scipy.signal import resample |
|
|
class SingingConverter:
    """Convert spoken audio into a singing rendition.

    Intended to use a DiffSinger model when one is available; currently both
    paths fall through to a pure signal-processing pipeline (pitch
    modulation, time stretching, tremolo, formant enhancement).
    """

    def __init__(self):
        """Initialize the singing converter with DiffSinger model."""
        # NOTE(review): model loading is not implemented yet — the try block
        # contains no loading code, so the flag is set optimistically and
        # the except branch is unreachable in practice.
        self.model_loaded = False
        try:
            self.model_loaded = True
            print("Singing converter initialized successfully")
        except Exception as e:
            print(f"Error loading DiffSinger model: {e}")
            print("Using fallback singing conversion method")

    def convert(self, speech_path, output_path, emotion, phonemes, durations, stress_markers, pitch_shift=0, tempo=100):
        """
        Convert speech to singing using the DiffSinger approach

        Args:
            speech_path (str): Path to input speech audio
            output_path (str): Path to save the singing audio
            emotion (str): Dominant emotion for singing style
            phonemes (str): Phonetic transcription of the text
            durations (list): Duration for each phoneme
            stress_markers (numpy.ndarray): Stress markers for emphasis
            pitch_shift (int): Pitch adjustment in semitones
            tempo (int): Tempo in BPM (100 == original speed)

        Returns:
            str: `output_path`, after the singing audio has been written.
        """
        # sr=None preserves the file's native sample rate.
        speech, sr = librosa.load(speech_path, sr=None)

        if self.model_loaded:
            singing = self._apply_diffsinger(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)
        else:
            singing = self._apply_signal_processing(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)

        sf.write(output_path, singing, sr)
        print(f"Singing audio saved to {output_path}")

        return output_path

    def _apply_diffsinger(self, speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo):
        """Apply DiffSinger model for speech-to-singing conversion.

        Placeholder: no model is wired up yet, so this delegates to the
        signal-processing fallback.
        """
        return self._apply_signal_processing(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)

    def _apply_signal_processing(self, speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo):
        """Apply signal processing techniques to convert speech to singing."""
        # Extract the F0 contour with Praat (75-600 Hz covers typical voice).
        sound = parselmouth.Sound(speech, sr)
        pitch = call(sound, "To Pitch", 0.0, 75, 600)

        pitch_values = call(pitch, "To Matrix").as_array()
        # Unvoiced frames come back as NaN; treat them as 0 Hz.
        pitch_values = np.nan_to_num(pitch_values)

        # Emotion / pitch-shift scaling of the contour.
        modulated_pitch = self._modulate_pitch(pitch_values, emotion, stress_markers, pitch_shift)

        # Tempo adjustment.
        stretched_speech = self._adjust_timing(speech, sr, durations, tempo)

        # Bug fix: pass the original contour as a reference so the shift is
        # computed as a *relative* change. Previously the shift was
        # log2(mean Hz) * 12 semitones relative to 1 Hz (~7-8 octaves up).
        modified_speech = self._apply_pitch_changes(stretched_speech, sr, modulated_pitch, pitch_values)

        singing = self._add_vibrato(modified_speech, sr, emotion)

        singing = self._enhance_formants(singing, sr)

        return singing

    def _modulate_pitch(self, pitch_values, emotion, stress_markers, pitch_shift):
        """Modulate pitch contour based on emotion and stress.

        Args:
            pitch_values: F0 contour (Hz); unvoiced frames are 0.
            emotion: emotion label; unknown labels get a neutral factor 1.0.
            stress_markers: currently unused — TODO(review): apply per-frame
                emphasis scaling here.
            pitch_shift: global shift in semitones.

        Returns:
            Scaled copy of `pitch_values`.
        """
        # Equal-temperament: one semitone is a ratio of 2**(1/12).
        base_shift = 2 ** (pitch_shift / 12)

        # Mild global raising/lowering per emotion.
        emotion_factors = {
            "Happy": 1.05,
            "Angry": 1.02,
            "Sad": 0.97,
            "Fear": 0.98,
            "Surprise": 1.1
        }

        emotion_factor = emotion_factors.get(emotion, 1.0)

        modulated_pitch = pitch_values * emotion_factor * base_shift

        return modulated_pitch

    def _adjust_timing(self, speech, sr, durations, tempo):
        """Adjust timing of speech to make it more singing-like.

        Args:
            speech: audio samples.
            sr: sample rate (unused here, kept for interface symmetry).
            durations: per-phoneme durations — currently unused;
                TODO(review): align stretching to the phoneme grid.
            tempo: target tempo in BPM, where 100 means "unchanged".

        Raises:
            ValueError: if `tempo` is not positive.
        """
        if tempo <= 0:
            raise ValueError("tempo must be a positive BPM value")

        # librosa's rate > 1 speeds audio up, so a tempo above the 100 BPM
        # baseline must map to rate > 1. Bug fix: the ratio was inverted
        # (100 / tempo), which made faster tempos *slow down* the audio.
        stretch_factor = tempo / 100.0

        y_stretched = librosa.effects.time_stretch(speech, rate=stretch_factor)

        return y_stretched

    def _apply_pitch_changes(self, audio, sr, pitch_contour, reference_contour=None):
        """Apply a global pitch shift derived from the target contour.

        Args:
            audio: audio samples to shift.
            sr: sample rate.
            pitch_contour: target (modulated) F0 contour in Hz.
            reference_contour: original F0 contour; when given, the shift is
                the semitone distance between the two contour means.

        Returns:
            Pitch-shifted audio, or the input unchanged when no valid
            voiced reference/target mean is available.
        """
        target_mean = float(np.mean(pitch_contour))
        ref_mean = float(np.mean(reference_contour)) if reference_contour is not None else 0.0

        if target_mean > 0 and ref_mean > 0:
            # Relative shift in semitones: 12 * log2(target / reference).
            semitones = 12.0 * float(np.log2(target_mean / ref_mean))
            return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitones)

        # Without a usable reference we cannot compute a meaningful shift.
        return audio

    def _add_vibrato(self, audio, sr, emotion):
        """Add a periodic modulation to make the sound more singing-like.

        NOTE(review): this modulates *amplitude*, not frequency, so it is
        technically tremolo rather than vibrato. Kept as-is; a true vibrato
        would require a time-varying pitch shift.
        """
        vibrato_rate = 5.0
        if emotion == "Sad":
            vibrato_depth = 0.3
            vibrato_rate = 3.0
        elif emotion == "Happy":
            vibrato_depth = 0.5
            vibrato_rate = 6.0
        else:
            vibrato_depth = 0.4

        t = np.arange(0, len(audio)) / sr
        vibrato = 1.0 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)

        # Peaks may exceed 1.0 here; _enhance_formants re-normalizes later.
        audio_with_vibrato = audio * vibrato

        return audio_with_vibrato

    def _enhance_formants(self, audio, sr):
        """Enhance formants to make voice more singing-like.

        Uses harmonic/percussive separation and boosts the harmonic part,
        then peak-normalizes the result.
        """
        harmonic, percussive = librosa.effects.hpss(audio)

        enhanced = harmonic * 1.5 + percussive * 0.7

        # Bug fix: guard against silent input, which previously caused a
        # division by zero (NaN output).
        peak = np.max(np.abs(enhanced))
        if peak > 0:
            enhanced = enhanced / peak

        return enhanced