| import os |
| import gradio as gr |
| import torch |
| import numpy as np |
| import librosa |
| import text2emotion as te |
| import nltk |
| import soundfile as sf |
| from pydub import AudioSegment |
| from transformers import pipeline |
| from music_generator import generate_accompaniment |
| from text_processor import TextProcessor |
| from voice_synthesizer import VoiceSynthesizer |
| from singing_converter import SingingConverter |
| import setup |
| import sys |
| import subprocess |
|
|
# NLTK corpora/models needed by text processing and emotion analysis.
# The original script downloaded 'punkt' and 'omw-1.4' twice; the list is
# deduplicated here. nltk.download() is a no-op for already-present
# resources, so running this on every startup is safe.
_NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
]
for _resource in _NLTK_RESOURCES:
    nltk.download(_resource)


# Make the vendored g2pM and DiffSinger checkouts importable.
sys.path.append('./g2pM')
sys.path.append('./DiffSinger')


# Download/prepare speaker-embedding assets used by the voice synthesizer.
setup.setup_speaker_embeddings()


# Module-level pipeline components shared across requests.
text_processor = TextProcessor()
voice_synthesizer = VoiceSynthesizer()
singing_converter = SingingConverter()


# HuggingFace sentiment pipeline (default checkpoint) used to score the
# input text's polarity alongside text2emotion's categorical emotions.
sentiment_analyzer = pipeline("sentiment-analysis")
|
|
def create_placeholder_audio(output_path, duration=5, sample_rate=22050):
    """Write `duration` seconds of silence to `output_path` and return the path."""
    n_samples = int(duration * sample_rate)
    sf.write(output_path, np.zeros(n_samples), sample_rate)
    return output_path
|
|
def convert_midi_to_wav(midi_path, wav_path, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2'):
    """Render a MIDI file to WAV using fluidsynth.

    Falls back to a silent placeholder WAV whenever rendering is impossible
    (missing MIDI file, missing soundfont, fluidsynth not installed, or a
    render error), so the downstream mixing step always has a file to load.

    Args:
        midi_path (str): Path to the input MIDI file.
        wav_path (str): Destination path for the rendered WAV.
        soundfont_path (str): SoundFont (.sf2) used for synthesis.

    Returns:
        str: Path of the WAV file that was written (render or placeholder).
    """
    if not os.path.exists(midi_path):
        print(f"MIDI file not found: {midi_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)

    # Fail fast with a clear message when the soundfont is absent; otherwise
    # fluidsynth would exit non-zero with a far less obvious error.
    if not os.path.exists(soundfont_path):
        print(f"SoundFont not found: {soundfont_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)

    try:
        # '-a file' selects fluidsynth's file audio driver; '-F' sets the
        # output path for the rendered audio.
        subprocess.run([
            'fluidsynth',
            '-a', 'file',
            '-F', wav_path,
            soundfont_path,
            midi_path
        ], check=True)
        return wav_path
    except subprocess.CalledProcessError as e:
        print(f"Error converting MIDI to WAV: {e}")
        return create_placeholder_audio(wav_path)
    except FileNotFoundError:
        # The fluidsynth binary itself is not installed.
        print("fluidsynth not found. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)
|
|
def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
    """
    Convert text to singing voice with accompaniment based on mood.

    Pipeline: emotion/sentiment analysis -> phoneme extraction -> TTS speech
    -> speech-to-singing conversion -> MIDI accompaniment generation ->
    MIDI-to-WAV rendering -> mixdown of vocals over accompaniment.

    Args:
        text (str): Input text to be converted to singing
        voice_type (str): Type of voice (neutral, feminine, masculine)
        tempo (int): Speed of the singing (60-180 BPM)
        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)

    Returns:
        tuple: (input_audio_path, output_audio_path)
    """
    # --- Mood analysis ----------------------------------------------------
    # text2emotion returns per-emotion scores; default to "Happy" when it
    # yields nothing (e.g. empty or fully neutral text).
    emotions = te.get_emotion(text)
    dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0] if emotions else "Happy"

    # Signed sentiment in [-1, 1]: POSITIVE keeps the score, anything else
    # negates it. Currently only logged for diagnostics.
    sentiment_result = sentiment_analyzer(text)[0]
    sentiment_score = sentiment_result['score'] * (1 if sentiment_result['label'] == 'POSITIVE' else -1)

    print(f"Detected emotion: {dominant_emotion}")
    print(f"Sentiment score: {sentiment_score}")

    # --- Linguistic features and vocals -----------------------------------
    phonemes, durations, stress_markers = text_processor.process(text)

    # Plain TTS rendering (also returned so the user can compare it with
    # the final sung output).
    speech_audio_path = "temp_speech.wav"
    voice_synthesizer.synthesize(
        text=text,
        output_path=speech_audio_path,
        voice_type=voice_type
    )

    # Convert the spoken audio into singing, shaped by the detected emotion.
    singing_audio_path = "temp_singing.wav"
    singing_converter.convert(
        speech_path=speech_audio_path,
        output_path=singing_audio_path,
        emotion=dominant_emotion,
        phonemes=phonemes,
        durations=durations,
        stress_markers=stress_markers,
        pitch_shift=pitch_shift,
        tempo=tempo
    )

    # --- Accompaniment ----------------------------------------------------
    accompaniment_midi_path = "temp_accompaniment.mid"

    # Map the dominant emotion to a musical key; default to C major.
    emotion_key_map = {
        "Happy": "C",
        "Sad": "Am",
        "Angry": "Em",
        "Fear": "Dm",
        "Surprise": "G"
    }
    key = emotion_key_map.get(dominant_emotion, "C")
    style = "pop"

    # Best-effort: if accompaniment generation fails we still produce a
    # song (convert_midi_to_wav substitutes silence for a missing MIDI).
    try:
        generate_accompaniment(
            lyrics=text,
            melody_path=singing_audio_path,
            output_path=accompaniment_midi_path,
            tempo_value=tempo,
            key=key,
            time_signature="4/4",
            style=style
        )
    except Exception as e:
        print(f"Error generating accompaniment: {e}")

    accompaniment_path = "temp_accompaniment.wav"
    convert_midi_to_wav(accompaniment_midi_path, accompaniment_path)

    # --- Mixdown ----------------------------------------------------------
    final_output_path = "output_song.wav"

    singing = AudioSegment.from_file(singing_audio_path)

    try:
        accompaniment = AudioSegment.from_file(accompaniment_path)
    except Exception as e:
        # Unreadable/corrupt accompaniment: replace with silence so the
        # mix can still be produced.
        print(f"Error loading accompaniment: {e}")
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)

    # Level balancing (dB): vocals slightly down, accompaniment well below.
    singing = singing - 3
    accompaniment = accompaniment - 10

    # Loop the accompaniment to cover the vocals, then trim to exact length.
    # The zero-length guard fixes a ZeroDivisionError the original code hit
    # when the accompaniment file was empty; slicing/overlaying an empty
    # segment is harmless, so it simply contributes nothing to the mix.
    if 0 < len(accompaniment) < len(singing):
        times_to_repeat = len(singing) // len(accompaniment) + 1
        accompaniment = accompaniment * times_to_repeat
    accompaniment = accompaniment[:len(singing)]

    mixed = singing.overlay(accompaniment)
    mixed.export(final_output_path, format="wav")

    return speech_audio_path, final_output_path
|
|
| |
# --- Gradio UI --------------------------------------------------------------
# Left column: lyrics input plus voice/tempo/pitch controls and the trigger
# button. Right column: playback widgets for the raw TTS speech and the
# final mixed song.
with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
    gr.Markdown("# Text2Sing-DiffSinger")
    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")

    with gr.Row():
        with gr.Column():
            # Lyrics / text to be sung.
            text_input = gr.Textbox(
                label="Enter text to convert to singing",
                placeholder="Type your lyrics here...",
                lines=5
            )

            with gr.Row():
                # Voice timbre, passed through to the voice synthesizer.
                voice_type = gr.Dropdown(
                    label="Voice Type",
                    choices=["neutral", "feminine", "masculine"],
                    value="neutral"
                )
                # Singing speed in BPM (matches process_text_to_singing's
                # documented 60-180 range).
                tempo = gr.Slider(
                    label="Tempo (BPM)",
                    minimum=60,
                    maximum=180,
                    value=100,
                    step=5
                )
                # Pitch shift in semitones, -12..+12.
                pitch_shift = gr.Slider(
                    label="Pitch Adjustment",
                    minimum=-12,
                    maximum=12,
                    value=0,
                    step=1
                )

            convert_btn = gr.Button("Convert to Singing")

        with gr.Column():
            # Outputs: original TTS speech and the final mixed song.
            input_audio = gr.Audio(label="Original Speech")
            output_audio = gr.Audio(label="Singing Output")

    # Wire the button to the end-to-end pipeline; outputs map positionally
    # to process_text_to_singing's (speech_path, song_path) return tuple.
    convert_btn.click(
        fn=process_text_to_singing,
        inputs=[text_input, voice_type, tempo, pitch_shift],
        outputs=[input_audio, output_audio]
    )
|
|
| |
if __name__ == "__main__":
    # Start the Gradio server with default host/port when run as a script.
    demo.launch()