# Text2Sing-DiffSinger / singing_converter.py
# Uploaded by Vaishnavi0404 — "Create singing_converter.py" (commit a51cb53, verified)
import numpy as np
import torch
import librosa
import soundfile as sf
import parselmouth
from parselmouth.praat import call
import os
from scipy.signal import resample
class SingingConverter:
    """Convert spoken audio into a singing-style rendition.

    Attempts to use a DiffSinger model when available; otherwise (and
    currently, since the model load is a placeholder) falls back to a pure
    signal-processing pipeline: pitch-contour modulation, tempo-based time
    stretching, amplitude "vibrato" (tremolo), and harmonic enhancement.
    """

    # Standard speech is treated as roughly equivalent to 100 BPM for
    # rhythmical purposes; tempo scaling is relative to this reference.
    _REFERENCE_TEMPO = 100.0

    def __init__(self):
        """Initialize the singing converter with the DiffSinger model.

        ``self.model_loaded`` records whether the (placeholder) model load
        succeeded; ``convert`` dispatches on it.
        """
        self.model_loaded = False
        try:
            # Placeholder: a real implementation would load DiffSinger
            # weights/checkpoints here and may raise on failure.
            self.model_loaded = True
            print("Singing converter initialized successfully")
        except Exception as e:
            print(f"Error loading DiffSinger model: {e}")
            print("Using fallback singing conversion method")

    def convert(self, speech_path, output_path, emotion, phonemes, durations,
                stress_markers, pitch_shift=0, tempo=100):
        """Convert speech to singing and write the result to ``output_path``.

        Args:
            speech_path (str): Path to input speech audio.
            output_path (str): Path to save the singing audio.
            emotion (str): Dominant emotion for singing style.
            phonemes (str): Phonetic transcription of the text (currently
                unused by the fallback pipeline).
            durations (list): Duration for each phoneme (currently unused).
            stress_markers (numpy.ndarray): Stress markers for emphasis
                (currently unused by ``_modulate_pitch``).
            pitch_shift (int): Pitch adjustment in semitones.
            tempo (int): Tempo in BPM; 100 leaves timing unchanged.

        Returns:
            str: ``output_path``, for caller convenience.
        """
        # sr=None preserves the file's native sampling rate.
        speech, sr = librosa.load(speech_path, sr=None)
        if self.model_loaded:
            # DiffSinger path (currently delegates to signal processing).
            singing = self._apply_diffsinger(
                speech, sr, emotion, phonemes, durations,
                stress_markers, pitch_shift, tempo)
        else:
            # Fallback method using classical signal processing.
            singing = self._apply_signal_processing(
                speech, sr, emotion, phonemes, durations,
                stress_markers, pitch_shift, tempo)
        sf.write(output_path, singing, sr)
        print(f"Singing audio saved to {output_path}")
        return output_path

    def _apply_diffsinger(self, speech, sr, emotion, phonemes, durations,
                          stress_markers, pitch_shift, tempo):
        """Apply the DiffSinger model for speech-to-singing conversion.

        Placeholder: a real implementation would run model inference here.
        For now it delegates to the signal-processing fallback.
        """
        return self._apply_signal_processing(
            speech, sr, emotion, phonemes, durations,
            stress_markers, pitch_shift, tempo)

    def _apply_signal_processing(self, speech, sr, emotion, phonemes,
                                 durations, stress_markers, pitch_shift, tempo):
        """Convert speech to singing with classical signal processing.

        Pipeline: extract F0 with Praat, modulate it by emotion/shift,
        time-stretch to tempo, pitch-shift toward the modulated contour,
        add tremolo, then boost harmonic content.
        """
        # 1. Extract the pitch contour via Praat (75-600 Hz search range).
        sound = parselmouth.Sound(speech, sr)
        pitch = call(sound, "To Pitch", 0.0, 75, 600)
        # Unvoiced frames come back as 0/NaN; normalize NaNs to 0.
        pitch_values = np.nan_to_num(call(pitch, "To Matrix").as_array())
        # 2. Modulate the contour for emotion, stress, and user pitch shift.
        modulated_pitch = self._modulate_pitch(
            pitch_values, emotion, stress_markers, pitch_shift)
        # 3. Stretch timing toward the requested tempo.
        stretched_speech = self._adjust_timing(speech, sr, durations, tempo)
        # 4. Shift pitch toward the modulated contour, using the original
        #    contour as the reference so the shift is *relative*.
        modified_speech = self._apply_pitch_changes(
            stretched_speech, sr, modulated_pitch,
            reference_contour=pitch_values)
        # 5. Add a vibrato-like (amplitude) effect.
        singing = self._add_vibrato(modified_speech, sr, emotion)
        # 6. Enhance harmonic content for a more sung quality.
        return self._enhance_formants(singing, sr)

    def _modulate_pitch(self, pitch_values, emotion, stress_markers, pitch_shift):
        """Scale the pitch contour by emotion factor and semitone shift.

        Args:
            pitch_values (numpy.ndarray): F0 contour (Hz; 0 = unvoiced).
            emotion (str): Emotion label; unknown labels use factor 1.0.
            stress_markers: Currently unused — a full implementation would
                align these with the contour for per-syllable emphasis.
            pitch_shift (int): Shift in semitones (12 = one octave).

        Returns:
            numpy.ndarray: The scaled contour (zeros stay zero).
        """
        # Semitones -> multiplicative frequency ratio.
        base_shift = 2 ** (pitch_shift / 12)
        emotion_factors = {
            "Happy": 1.05,
            "Angry": 1.02,
            "Sad": 0.97,
            "Fear": 0.98,
            "Surprise": 1.1,
        }
        emotion_factor = emotion_factors.get(emotion, 1.0)
        return pitch_values * emotion_factor * base_shift

    def _adjust_timing(self, speech, sr, durations, tempo):
        """Globally time-stretch speech toward the requested tempo.

        BUGFIX: the original used ``rate = 100 / tempo``, but in
        ``librosa.effects.time_stretch`` a rate > 1 *speeds up* the audio,
        so a faster tempo made the output slower. The correct factor is
        ``tempo / 100``. A full implementation would additionally use
        ``durations`` for per-syllable timing.
        """
        if tempo is None or tempo <= 0:
            # Invalid tempo: leave timing untouched rather than crash.
            return speech
        rate = tempo / self._REFERENCE_TEMPO
        return librosa.effects.time_stretch(speech, rate=rate)

    def _apply_pitch_changes(self, audio, sr, pitch_contour, reference_contour=None):
        """Pitch-shift audio toward the target contour.

        BUGFIX: the original computed ``np.log2(np.mean(contour)) * 12`` —
        semitones above 1 Hz, i.e. ~+80 semitones for typical speech F0 —
        and fed that to ``pitch_shift``. The shift must be *relative*: when
        ``reference_contour`` (the unmodified F0) is given, shift by the
        semitone distance between the two contours' voiced means; otherwise
        fall back to a shift clamped to ±1 octave. This is a simplified
        global shift; a full implementation would use a vocoder such as
        WORLD or STRAIGHT for frame-level resynthesis.
        """
        target = self._voiced_mean(pitch_contour)
        if target <= 0:
            # No voiced frames detected; nothing meaningful to shift.
            return audio
        if reference_contour is not None:
            reference = self._voiced_mean(reference_contour)
            if reference <= 0:
                return audio
            semitones = 12.0 * np.log2(target / reference)
        else:
            # No reference available: shift relative to a nominal 200 Hz
            # voice, clamped to a musically sane ±12 semitones.
            semitones = float(np.clip(12.0 * np.log2(target / 200.0), -12.0, 12.0))
        if abs(semitones) < 1e-6:
            return audio
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitones)

    @staticmethod
    def _voiced_mean(contour):
        """Mean F0 over voiced frames only (unvoiced frames are stored as 0).

        Averaging over the whole matrix would bias the mean toward 0.
        Returns 0.0 when no voiced frames exist.
        """
        values = np.asarray(contour, dtype=float)
        voiced = values[values > 0]
        return float(voiced.mean()) if voiced.size else 0.0

    def _add_vibrato(self, audio, sr, emotion):
        """Add a vibrato-like effect with emotion-dependent rate/depth.

        NOTE(review): this modulates *amplitude* (tremolo), not pitch —
        a true vibrato would modulate F0. Kept as amplitude modulation to
        preserve the established output character.
        """
        vibrato_rate = 5.0  # Hz
        if emotion == "Sad":
            vibrato_depth = 0.3
            vibrato_rate = 3.0
        elif emotion == "Happy":
            vibrato_depth = 0.5
            vibrato_rate = 6.0
        else:
            vibrato_depth = 0.4
        t = np.arange(len(audio)) / sr
        modulation = 1.0 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
        return audio * modulation

    def _enhance_formants(self, audio, sr):
        """Boost harmonic content for a more singing-like timbre.

        Simple HPSS-based enhancement: emphasize the harmonic component and
        attenuate the percussive one, then peak-normalize. A full
        implementation would use true formant shifting.
        """
        harmonic, percussive = librosa.effects.hpss(audio)
        enhanced = harmonic * 1.5 + percussive * 0.7
        # Guard against division by zero on silent input (original crashed
        # with a divide-by-zero / produced NaNs here).
        peak = np.max(np.abs(enhanced)) if enhanced.size else 0.0
        if peak > 0:
            enhanced = enhanced / peak
        return enhanced