import numpy as np
import torch
import librosa
import soundfile as sf
import parselmouth
from parselmouth.praat import call
import os
from scipy.signal import resample
class SingingConverter:
    """Convert spoken audio into a singing-style rendition.

    When a DiffSinger model is available it is used for the conversion;
    otherwise a signal-processing fallback is applied (pitch modulation,
    time stretching, vibrato-like modulation, formant enhancement).
    """

    def __init__(self):
        """Initialize the converter, attempting to load the DiffSinger model."""
        # In production, the DiffSinger model would be loaded here.
        self.model_loaded = False
        try:
            # Load DiffSinger model (placeholder for actual implementation).
            self.model_loaded = True
            print("Singing converter initialized successfully")
        except Exception as e:
            print(f"Error loading DiffSinger model: {e}")
            print("Using fallback singing conversion method")

    def convert(self, speech_path, output_path, emotion, phonemes, durations, stress_markers, pitch_shift=0, tempo=100):
        """
        Convert speech to singing using the DiffSinger approach.

        Args:
            speech_path (str): Path to input speech audio.
            output_path (str): Path to save the singing audio.
            emotion (str): Dominant emotion for singing style.
            phonemes (str): Phonetic transcription of the text.
            durations (list): Duration for each phoneme.
            stress_markers (numpy.ndarray): Stress markers for emphasis.
            pitch_shift (int): Pitch adjustment in semitones.
            tempo (int): Tempo in BPM (100 is treated as the speech baseline).

        Returns:
            str: ``output_path``, for caller convenience.
        """
        # Load at the file's native sampling rate (sr=None disables resampling).
        speech, sr = librosa.load(speech_path, sr=None)
        if self.model_loaded:
            # DiffSinger path (currently delegates to the fallback below).
            singing = self._apply_diffsinger(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)
        else:
            # Fallback method using signal processing only.
            singing = self._apply_signal_processing(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)
        sf.write(output_path, singing, sr)
        print(f"Singing audio saved to {output_path}")
        return output_path

    def _apply_diffsinger(self, speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo):
        """Apply the DiffSinger model for speech-to-singing conversion.

        Placeholder: until a real model is wired in, this delegates to the
        signal-processing fallback so both paths behave identically.
        """
        return self._apply_signal_processing(speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo)

    def _apply_signal_processing(self, speech, sr, emotion, phonemes, durations, stress_markers, pitch_shift, tempo):
        """Convert speech to singing using classic signal-processing steps."""
        # 1. Extract the pitch contour with Praat (75-600 Hz search range).
        sound = parselmouth.Sound(speech, sr)
        pitch = call(sound, "To Pitch", 0.0, 75, 600)
        pitch_values = call(pitch, "To Matrix").as_array()
        pitch_values = np.nan_to_num(pitch_values)  # unvoiced frames -> 0
        # 2. Modulate the contour by emotion, stress and user pitch shift.
        modulated_pitch = self._modulate_pitch(pitch_values, emotion, stress_markers, pitch_shift)
        # 3. Stretch timing toward the requested tempo.
        stretched_speech = self._adjust_timing(speech, sr, durations, tempo)
        # 4. Shift pitch RELATIVE to the original contour (fix: the shift
        #    used to be computed from the absolute Hz value, producing
        #    extreme, unusable shifts of tens of semitones).
        modified_speech = self._apply_pitch_changes(stretched_speech, sr, modulated_pitch, reference_contour=pitch_values)
        # 5. Add a vibrato-like modulation.
        singing = self._add_vibrato(modified_speech, sr, emotion)
        # 6. Enhance formants for a singing-like timbre.
        singing = self._enhance_formants(singing, sr)
        return singing

    def _modulate_pitch(self, pitch_values, emotion, stress_markers, pitch_shift):
        """Scale the pitch contour by emotion and the requested semitone shift.

        Args:
            pitch_values: Pitch contour in Hz; zeros mark unvoiced frames.
            emotion: Emotion label selecting a small multiplicative factor.
            stress_markers: Currently unused; reserved for per-syllable emphasis.
            pitch_shift: Global shift in semitones.

        Returns:
            Scaled contour, same shape as ``pitch_values``.
        """
        # Semitones -> frequency ratio.
        base_shift = 2 ** (pitch_shift / 12)
        # Small per-emotion multipliers (identity for unknown emotions).
        emotion_factors = {
            "Happy": 1.05,
            "Angry": 1.02,
            "Sad": 0.97,
            "Fear": 0.98,
            "Surprise": 1.1,
        }
        emotion_factor = emotion_factors.get(emotion, 1.0)
        modulated_pitch = pitch_values * emotion_factor * base_shift
        # TODO: align stress_markers with the contour to emphasise stressed
        # syllables (requires phoneme/frame alignment not available here).
        return modulated_pitch

    def _adjust_timing(self, speech, sr, durations, tempo):
        """Time-stretch the audio toward the requested tempo.

        Speech is treated as a 100 BPM baseline.  librosa's ``rate`` speeds
        the signal up when > 1, so ``tempo / 100`` makes higher tempos
        faster (fix: the ratio was previously inverted, so raising the
        tempo slowed the audio down).
        """
        stretch_rate = tempo / 100.0
        y_stretched = librosa.effects.time_stretch(speech, rate=stretch_rate)
        # A full implementation would use ``durations`` for per-syllable timing.
        return y_stretched

    def _apply_pitch_changes(self, audio, sr, pitch_contour, reference_contour=None):
        """Pitch-shift ``audio`` toward the mean of ``pitch_contour``.

        Simplified stand-in for a vocoder (WORLD/STRAIGHT): one global
        shift.  When ``reference_contour`` is given, the shift is the
        semitone distance between the two contours' means; otherwise the
        legacy absolute log-frequency value is used.

        Args:
            audio: Input samples.
            sr: Sampling rate in Hz.
            pitch_contour: Target pitch contour (Hz).
            reference_contour: Original contour to shift relative to.

        Returns:
            Pitch-shifted audio; the input is returned unchanged when the
            target contour carries no voiced energy (mean <= 0).
        """
        target_mean = np.mean(pitch_contour)
        if target_mean <= 0:
            # Nothing voiced to follow -- leave the audio untouched.
            return audio
        if reference_contour is not None and np.mean(reference_contour) > 0:
            # Relative shift in semitones between target and original means.
            semitones = 12.0 * np.log2(target_mean / np.mean(reference_contour))
        else:
            # Legacy behaviour kept for callers that pass no reference.
            semitones = 12.0 * np.log2(target_mean)
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitones)

    def _add_vibrato(self, audio, sr, emotion):
        """Apply a periodic amplitude modulation as a vibrato-like effect.

        NOTE(review): this is technically tremolo (amplitude modulation),
        not true pitch vibrato; kept as-is so output is unchanged.

        Rate/depth vary with emotion: slower and shallower for "Sad",
        faster and deeper for "Happy", a middle default otherwise.
        """
        vibrato_rate = 5.0  # Hz
        if emotion == "Sad":
            vibrato_depth = 0.3
            vibrato_rate = 3.0
        elif emotion == "Happy":
            vibrato_depth = 0.5
            vibrato_rate = 6.0
        else:
            vibrato_depth = 0.4
        # Sample-aligned modulation signal.
        t = np.arange(0, len(audio)) / sr
        vibrato = 1.0 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
        return audio * vibrato

    def _enhance_formants(self, audio, sr):
        """Boost harmonic content to approximate a sung timbre.

        Separates harmonic and percussive components (HPSS), emphasises the
        harmonic part, and peak-normalises the result.  Silent input is
        returned without normalisation (fix: previously divided by a zero
        peak, producing NaNs).
        """
        harmonic, percussive = librosa.effects.hpss(audio)
        # Boost harmonic content for singing-like quality.
        enhanced = harmonic * 1.5 + percussive * 0.7
        peak = np.max(np.abs(enhanced))
        if peak > 0:
            enhanced = enhanced / peak
        return enhanced