import numpy as np
import torch
import librosa
import soundfile as sf
import parselmouth
from parselmouth.praat import call
import os
from scipy.signal import resample
class SingingConverter:
    """Convert spoken audio into a singing-style rendition.

    When a DiffSinger model is available it is used for the conversion;
    otherwise a signal-processing fallback is applied (pitch modulation,
    time stretching, vibrato-like modulation, formant enhancement).
    """

    def __init__(self):
        """Initialize the converter, attempting to load the DiffSinger model."""
        # In production, the DiffSinger model would be loaded here.
        self.model_loaded = False
        try:
            # Load DiffSinger model (placeholder for actual implementation).
            self.model_loaded = True
            print("Singing converter initialized successfully")
        except Exception as e:
            print(f"Error loading DiffSinger model: {e}")
            print("Using fallback singing conversion method")

    def convert(self, speech_path, output_path, emotion, phonemes, durations, stress_markers, pitch_shift=0, tempo=100):
        """
        Convert speech to singing using the DiffSinger approach.

        Args:
            speech_path (str): Path to input speech audio.
            output_path (str): Path to save the singing audio.
            emotion (str): Dominant emotion for singing style.
            phonemes (str): Phonetic transcription of the text.
            durations (list): Duration for each phoneme.
            stress_markers (numpy.ndarray): Stress markers for emphasis.
            pitch_shift (int): Pitch adjustment in semitones.
            tempo (int): Tempo in BPM (100 is treated as the speech baseline).

        Returns:
            str: ``output_path``, for caller convenience.
        """
        # Load at the file's native sampling rate (sr=None disables resampling).
        speech, sr = librosa.load(speech_path, sr=None)
        if self.model_loaded:
            # DiffSinger path (currently delegates to the fallback below).
            singing = self._apply_diffsinger(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)
        else:
            # Fallback method using signal processing only.
            singing = self._apply_signal_processing(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)
        sf.write(output_path, singing, sr)
        print(f"Singing audio saved to {output_path}")
        return output_path

    def _apply_diffsinger(self, speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo):
        """Apply the DiffSinger model for speech-to-singing conversion.

        Placeholder: until a real model is wired in, this delegates to the
        signal-processing fallback so both paths behave identically.
        """
        return self._apply_signal_processing(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)

    def _apply_signal_processing(self, speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo):
        """Convert speech to singing using classic signal-processing steps."""
        # 1. Extract the pitch contour with Praat (75-600 Hz search range).
        sound = parselmouth.Sound(speech, sr)
        pitch = call(sound, "To Pitch", 0.0, 75, 600)
        pitch_values = call(pitch, "To Matrix").as_array()
        pitch_values = np.nan_to_num(pitch_values)  # unvoiced frames -> 0
        # 2. Modulate the contour by emotion, stress and user pitch shift.
        modulated_pitch = self._modulate_pitch(pitch_values, emotion, stress_markers, pitch_shift)
        # 3. Stretch timing toward the requested tempo.
        stretched_speech = self._adjust_timing(speech, sr, durations, tempo)
        # 4. Shift pitch RELATIVE to the original contour (fix: the shift
        #    used to be computed from the absolute Hz value, producing
        #    extreme, unusable shifts of tens of semitones).
        modified_speech = self._apply_pitch_changes(stretched_speech, sr, modulated_pitch, reference_contour=pitch_values)
        # 5. Add a vibrato-like modulation.
        singing = self._add_vibrato(modified_speech, sr, emotion)
        # 6. Enhance formants for a singing-like timbre.
        singing = self._enhance_formants(singing, sr)
        return singing

    def _modulate_pitch(self, pitch_values, emotion, stress_markers, pitch_shift):
        """Scale the pitch contour by emotion and the requested semitone shift.

        Args:
            pitch_values: Pitch contour in Hz; zeros mark unvoiced frames.
            emotion: Emotion label selecting a small multiplicative factor.
            stress_markers: Currently unused; reserved for per-syllable emphasis.
            pitch_shift: Global shift in semitones.

        Returns:
            Scaled contour, same shape as ``pitch_values``.
        """
        # Semitones -> frequency ratio.
        base_shift = 2 ** (pitch_shift / 12)
        # Small per-emotion multipliers (identity for unknown emotions).
        emotion_factors = {
            "Happy": 1.05,
            "Angry": 1.02,
            "Sad": 0.97,
            "Fear": 0.98,
            "Surprise": 1.1,
        }
        emotion_factor = emotion_factors.get(emotion, 1.0)
        modulated_pitch = pitch_values * emotion_factor * base_shift
        # TODO: align stress_markers with the contour to emphasise stressed
        # syllables (requires phoneme/frame alignment not available here).
        return modulated_pitch

    def _adjust_timing(self, speech, sr, durations, tempo):
        """Time-stretch the audio toward the requested tempo.

        Speech is treated as a 100 BPM baseline.  librosa's ``rate`` speeds
        the signal up when > 1, so ``tempo / 100`` makes higher tempos
        faster (fix: the ratio was previously inverted, so raising the
        tempo slowed the audio down).
        """
        stretch_rate = tempo / 100.0
        y_stretched = librosa.effects.time_stretch(speech, rate=stretch_rate)
        # A full implementation would use ``durations`` for per-syllable timing.
        return y_stretched

    def _apply_pitch_changes(self, audio, sr, pitch_contour, reference_contour=None):
        """Pitch-shift ``audio`` toward the mean of ``pitch_contour``.

        Simplified stand-in for a vocoder (WORLD/STRAIGHT): one global
        shift.  When ``reference_contour`` is given, the shift is the
        semitone distance between the two contours' means; otherwise the
        legacy absolute log-frequency value is used.

        Args:
            audio: Input samples.
            sr: Sampling rate in Hz.
            pitch_contour: Target pitch contour (Hz).
            reference_contour: Original contour to shift relative to.

        Returns:
            Pitch-shifted audio; the input is returned unchanged when the
            target contour carries no voiced energy (mean <= 0).
        """
        target_mean = np.mean(pitch_contour)
        if target_mean <= 0:
            # Nothing voiced to follow -- leave the audio untouched.
            return audio
        if reference_contour is not None and np.mean(reference_contour) > 0:
            # Relative shift in semitones between target and original means.
            semitones = 12.0 * np.log2(target_mean / np.mean(reference_contour))
        else:
            # Legacy behaviour kept for callers that pass no reference.
            semitones = 12.0 * np.log2(target_mean)
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitones)

    def _add_vibrato(self, audio, sr, emotion):
        """Apply a periodic amplitude modulation as a vibrato-like effect.

        NOTE(review): this is technically tremolo (amplitude modulation),
        not true pitch vibrato; kept as-is so output is unchanged.

        Rate/depth vary with emotion: slower and shallower for "Sad",
        faster and deeper for "Happy", a middle default otherwise.
        """
        vibrato_rate = 5.0  # Hz
        if emotion == "Sad":
            vibrato_depth = 0.3
            vibrato_rate = 3.0
        elif emotion == "Happy":
            vibrato_depth = 0.5
            vibrato_rate = 6.0
        else:
            vibrato_depth = 0.4
        # Sample-aligned modulation signal.
        t = np.arange(0, len(audio)) / sr
        vibrato = 1.0 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
        return audio * vibrato

    def _enhance_formants(self, audio, sr):
        """Boost harmonic content to approximate a sung timbre.

        Separates harmonic and percussive components (HPSS), emphasises the
        harmonic part, and peak-normalises the result.  Silent input is
        returned without normalisation (fix: previously divided by a zero
        peak, producing NaNs).
        """
        harmonic, percussive = librosa.effects.hpss(audio)
        # Boost harmonic content for singing-like quality.
        enhanced = harmonic * 1.5 + percussive * 0.7
        peak = np.max(np.abs(enhanced))
        if peak > 0:
            enhanced = enhanced / peak
        return enhanced