# Text2Sing-DiffSinger / singing_converter.py
# Uploaded by Vaishnavi0404 — "Create singing_converter.py" (commit a51cb53, verified)
import numpy as np
import torch
import librosa
import soundfile as sf
import parselmouth
from parselmouth.praat import call
import os
from scipy.signal import resample
class SingingConverter:
    """Convert spoken audio into a singing-style rendition.

    Attempts to use a DiffSinger model when available; otherwise (and
    currently, since the model load is a placeholder) falls back to a pure
    signal-processing pipeline: pitch-contour modulation, tempo-based time
    stretching, amplitude "vibrato" (tremolo), and harmonic enhancement.
    """

    # Standard speech is treated as roughly equivalent to 100 BPM for
    # rhythmical purposes; tempo scaling is relative to this reference.
    _REFERENCE_TEMPO = 100.0

    def __init__(self):
        """Initialize the singing converter with the DiffSinger model.

        ``self.model_loaded`` records whether the (placeholder) model load
        succeeded; ``convert`` dispatches on it.
        """
        self.model_loaded = False
        try:
            # Placeholder: a real implementation would load DiffSinger
            # weights/checkpoints here and may raise on failure.
            self.model_loaded = True
            print("Singing converter initialized successfully")
        except Exception as e:
            print(f"Error loading DiffSinger model: {e}")
            print("Using fallback singing conversion method")

    def convert(self, speech_path, output_path, emotion, phonemes, durations,
                stress_markers, pitch_shift=0, tempo=100):
        """Convert speech to singing and write the result to ``output_path``.

        Args:
            speech_path (str): Path to input speech audio.
            output_path (str): Path to save the singing audio.
            emotion (str): Dominant emotion for singing style.
            phonemes (str): Phonetic transcription of the text (currently
                unused by the fallback pipeline).
            durations (list): Duration for each phoneme (currently unused).
            stress_markers (numpy.ndarray): Stress markers for emphasis
                (currently unused by ``_modulate_pitch``).
            pitch_shift (int): Pitch adjustment in semitones.
            tempo (int): Tempo in BPM; 100 leaves timing unchanged.

        Returns:
            str: ``output_path``, for caller convenience.
        """
        # sr=None preserves the file's native sampling rate.
        speech, sr = librosa.load(speech_path, sr=None)
        if self.model_loaded:
            # DiffSinger path (currently delegates to signal processing).
            singing = self._apply_diffsinger(
                speech, sr, emotion, phonemes, durations,
                stress_markers, pitch_shift, tempo)
        else:
            # Fallback method using classical signal processing.
            singing = self._apply_signal_processing(
                speech, sr, emotion, phonemes, durations,
                stress_markers, pitch_shift, tempo)
        sf.write(output_path, singing, sr)
        print(f"Singing audio saved to {output_path}")
        return output_path

    def _apply_diffsinger(self, speech, sr, emotion, phonemes, durations,
                          stress_markers, pitch_shift, tempo):
        """Apply the DiffSinger model for speech-to-singing conversion.

        Placeholder: a real implementation would run model inference here.
        For now it delegates to the signal-processing fallback.
        """
        return self._apply_signal_processing(
            speech, sr, emotion, phonemes, durations,
            stress_markers, pitch_shift, tempo)

    def _apply_signal_processing(self, speech, sr, emotion, phonemes,
                                 durations, stress_markers, pitch_shift, tempo):
        """Convert speech to singing with classical signal processing.

        Pipeline: extract F0 with Praat, modulate it by emotion/shift,
        time-stretch to tempo, pitch-shift toward the modulated contour,
        add tremolo, then boost harmonic content.
        """
        # 1. Extract the pitch contour via Praat (75-600 Hz search range).
        sound = parselmouth.Sound(speech, sr)
        pitch = call(sound, "To Pitch", 0.0, 75, 600)
        # Unvoiced frames come back as 0/NaN; normalize NaNs to 0.
        pitch_values = np.nan_to_num(call(pitch, "To Matrix").as_array())
        # 2. Modulate the contour for emotion, stress, and user pitch shift.
        modulated_pitch = self._modulate_pitch(
            pitch_values, emotion, stress_markers, pitch_shift)
        # 3. Stretch timing toward the requested tempo.
        stretched_speech = self._adjust_timing(speech, sr, durations, tempo)
        # 4. Shift pitch toward the modulated contour, using the original
        #    contour as the reference so the shift is *relative*.
        modified_speech = self._apply_pitch_changes(
            stretched_speech, sr, modulated_pitch,
            reference_contour=pitch_values)
        # 5. Add a vibrato-like (amplitude) effect.
        singing = self._add_vibrato(modified_speech, sr, emotion)
        # 6. Enhance harmonic content for a more sung quality.
        return self._enhance_formants(singing, sr)

    def _modulate_pitch(self, pitch_values, emotion, stress_markers, pitch_shift):
        """Scale the pitch contour by emotion factor and semitone shift.

        Args:
            pitch_values (numpy.ndarray): F0 contour (Hz; 0 = unvoiced).
            emotion (str): Emotion label; unknown labels use factor 1.0.
            stress_markers: Currently unused — a full implementation would
                align these with the contour for per-syllable emphasis.
            pitch_shift (int): Shift in semitones (12 = one octave).

        Returns:
            numpy.ndarray: The scaled contour (zeros stay zero).
        """
        # Semitones -> multiplicative frequency ratio.
        base_shift = 2 ** (pitch_shift / 12)
        emotion_factors = {
            "Happy": 1.05,
            "Angry": 1.02,
            "Sad": 0.97,
            "Fear": 0.98,
            "Surprise": 1.1,
        }
        emotion_factor = emotion_factors.get(emotion, 1.0)
        return pitch_values * emotion_factor * base_shift

    def _adjust_timing(self, speech, sr, durations, tempo):
        """Globally time-stretch speech toward the requested tempo.

        BUGFIX: the original used ``rate = 100 / tempo``, but in
        ``librosa.effects.time_stretch`` a rate > 1 *speeds up* the audio,
        so a faster tempo made the output slower. The correct factor is
        ``tempo / 100``. A full implementation would additionally use
        ``durations`` for per-syllable timing.
        """
        if tempo is None or tempo <= 0:
            # Invalid tempo: leave timing untouched rather than crash.
            return speech
        rate = tempo / self._REFERENCE_TEMPO
        return librosa.effects.time_stretch(speech, rate=rate)

    def _apply_pitch_changes(self, audio, sr, pitch_contour, reference_contour=None):
        """Pitch-shift audio toward the target contour.

        BUGFIX: the original computed ``np.log2(np.mean(contour)) * 12`` —
        semitones above 1 Hz, i.e. ~+80 semitones for typical speech F0 —
        and fed that to ``pitch_shift``. The shift must be *relative*: when
        ``reference_contour`` (the unmodified F0) is given, shift by the
        semitone distance between the two contours' voiced means; otherwise
        fall back to a shift clamped to ±1 octave. This is a simplified
        global shift; a full implementation would use a vocoder such as
        WORLD or STRAIGHT for frame-level resynthesis.
        """
        target = self._voiced_mean(pitch_contour)
        if target <= 0:
            # No voiced frames detected; nothing meaningful to shift.
            return audio
        if reference_contour is not None:
            reference = self._voiced_mean(reference_contour)
            if reference <= 0:
                return audio
            semitones = 12.0 * np.log2(target / reference)
        else:
            # No reference available: shift relative to a nominal 200 Hz
            # voice, clamped to a musically sane ±12 semitones.
            semitones = float(np.clip(12.0 * np.log2(target / 200.0), -12.0, 12.0))
        if abs(semitones) < 1e-6:
            return audio
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitones)

    @staticmethod
    def _voiced_mean(contour):
        """Mean F0 over voiced frames only (unvoiced frames are stored as 0).

        Averaging over the whole matrix would bias the mean toward 0.
        Returns 0.0 when no voiced frames exist.
        """
        values = np.asarray(contour, dtype=float)
        voiced = values[values > 0]
        return float(voiced.mean()) if voiced.size else 0.0

    def _add_vibrato(self, audio, sr, emotion):
        """Add a vibrato-like effect with emotion-dependent rate/depth.

        NOTE(review): this modulates *amplitude* (tremolo), not pitch —
        a true vibrato would modulate F0. Kept as amplitude modulation to
        preserve the established output character.
        """
        vibrato_rate = 5.0  # Hz
        if emotion == "Sad":
            vibrato_depth = 0.3
            vibrato_rate = 3.0
        elif emotion == "Happy":
            vibrato_depth = 0.5
            vibrato_rate = 6.0
        else:
            vibrato_depth = 0.4
        t = np.arange(len(audio)) / sr
        modulation = 1.0 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
        return audio * modulation

    def _enhance_formants(self, audio, sr):
        """Boost harmonic content for a more singing-like timbre.

        Simple HPSS-based enhancement: emphasize the harmonic component and
        attenuate the percussive one, then peak-normalize. A full
        implementation would use true formant shifting.
        """
        harmonic, percussive = librosa.effects.hpss(audio)
        enhanced = harmonic * 1.5 + percussive * 0.7
        # Guard against division by zero on silent input (original crashed
        # with a divide-by-zero / produced NaNs here).
        peak = np.max(np.abs(enhanced)) if enhanced.size else 0.0
        if peak > 0:
            enhanced = enhanced / peak
        return enhanced