File size: 7,969 Bytes
86e14e9
b6e4f5d
86e14e9
 
 
 
 
 
 
 
 
 
 
 
c63d652
494f115
1973e0f
 
1dc5139
d60d010
 
 
 
848cf46
ab3ff38
494f115
cb441ff
 
2328878
c63d652
b6e4f5d
86e14e9
 
 
 
b6e4f5d
86e14e9
 
 
 
b6e4f5d
86e14e9
 
 
1973e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86e14e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1973e0f
86e14e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1973e0f
594632a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1973e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
929facd
 
86e14e9
 
 
 
1973e0f
86e14e9
1973e0f
 
 
 
 
 
 
 
86e14e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import os
import gradio as gr
import torch
import numpy as np
import librosa
import text2emotion as te
import nltk
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline
from music_generator import generate_accompaniment
from text_processor import TextProcessor
from voice_synthesizer import VoiceSynthesizer
from singing_converter import SingingConverter
import setup
import sys
import subprocess

# --- One-time runtime setup (module import side effects) ---

# Download required NLTK data. nltk.download() is idempotent, so each
# resource is listed exactly once (the original listed 'punkt' and
# 'omw-1.4' twice, triggering redundant downloader checks).
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
):
    nltk.download(_nltk_resource)

# Make the vendored g2pM and DiffSinger checkouts importable.
sys.path.append('./g2pM')
sys.path.append('./DiffSinger')

# Prepare speaker embeddings required by the voice synthesizer.
setup.setup_speaker_embeddings()

# Initialize pipeline components (heavy: model loading happens here).
text_processor = TextProcessor()
voice_synthesizer = VoiceSynthesizer()
singing_converter = SingingConverter()

# HuggingFace sentiment model, used alongside text2emotion's emotion scores.
sentiment_analyzer = pipeline("sentiment-analysis")

def create_placeholder_audio(output_path, duration=5, sample_rate=22050):
    """Write *duration* seconds of silence to *output_path* and return the path.

    Serves as a fallback audio file whenever real generation fails.
    """
    n_samples = int(duration * sample_rate)
    silence = np.zeros(n_samples)
    sf.write(output_path, silence, sample_rate)
    return output_path

def convert_midi_to_wav(midi_path, wav_path, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2'):
    """Render a MIDI file to WAV using the external `fluidsynth` command.

    Falls back to writing a silent placeholder WAV at *wav_path* when the
    MIDI file is missing, fluidsynth is not installed, or rendering fails.

    Returns the path of the audio file that was written.
    """
    # Guard: nothing to render if the MIDI never got generated.
    if not os.path.exists(midi_path):
        print(f"MIDI file not found: {midi_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)

    # '-a file' selects the file audio driver; '-F' names the output WAV.
    command = [
        'fluidsynth',
        '-a', 'file',
        '-F', wav_path,
        soundfont_path,
        midi_path,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error converting MIDI to WAV: {e}")
        return create_placeholder_audio(wav_path)
    except FileNotFoundError:
        print("fluidsynth not found. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)
    return wav_path

def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
    """
    Convert text to singing voice with accompaniment based on mood

    Pipeline: emotion/sentiment analysis -> phoneme/timing extraction ->
    speech synthesis -> speech-to-singing conversion -> MIDI accompaniment
    generation -> MIDI-to-WAV render -> final mix.

    Args:
        text (str): Input text to be converted to singing
        voice_type (str): Type of voice (neutral, feminine, masculine)
        tempo (int): Speed of the singing (60-180 BPM)
        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)

    Returns:
        tuple: (input_audio_path, output_audio_path)

    Note:
        All intermediate and final files use fixed names in the working
        directory ("temp_speech.wav", "output_song.wav", ...), so
        concurrent calls would overwrite each other's files.
    """
    # Step 1: Analyze text for emotion/mood.
    # text2emotion returns a mapping of emotion name -> score; default to
    # "Happy" when the result is empty/falsy.
    emotions = te.get_emotion(text)
    dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0] if emotions else "Happy"
    
    # Additional sentiment analysis: signed score, negative for 'NEGATIVE'.
    sentiment_result = sentiment_analyzer(text)[0]
    sentiment_score = sentiment_result['score'] * (1 if sentiment_result['label'] == 'POSITIVE' else -1)
    
    # NOTE(review): sentiment_score is printed but otherwise unused below —
    # confirm whether it was meant to influence key/tempo selection.
    print(f"Detected emotion: {dominant_emotion}")
    print(f"Sentiment score: {sentiment_score}")
    
    # Step 2: Process text for pronunciation and timing
    phonemes, durations, stress_markers = text_processor.process(text)
    
    # Step 3: Generate speech audio first
    speech_audio_path = "temp_speech.wav"
    voice_synthesizer.synthesize(
        text=text, 
        output_path=speech_audio_path, 
        voice_type=voice_type
    )
    
    # Step 4: Convert speech to singing
    singing_audio_path = "temp_singing.wav"
    singing_converter.convert(
        speech_path=speech_audio_path,
        output_path=singing_audio_path,
        emotion=dominant_emotion,
        phonemes=phonemes,
        durations=durations,
        stress_markers=stress_markers,
        pitch_shift=pitch_shift,
        tempo=tempo
    )
    
    # Step 5: Generate musical accompaniment based on mood
    accompaniment_midi_path = "temp_accompaniment.mid"
    
    # Map emotion to musical key: minor keys for negative emotions.
    # Emotions not listed (e.g. text2emotion's "Angry" vs other labels)
    # fall back to C major via .get() below.
    emotion_key_map = {
        "Happy": "C",
        "Sad": "Am",
        "Angry": "Em",
        "Fear": "Dm",
        "Surprise": "G"
    }
    
    key = emotion_key_map.get(dominant_emotion, "C")
    style = "pop"  # Default style
    
    # NOTE(review): earlier comment promised "adjust tempo based on emotion
    # if not explicitly set", but no adjustment is implemented — tempo_value
    # is always exactly the caller-supplied tempo.
    tempo_value = tempo
    
    try:
        # Try to generate the accompaniment MIDI
        generate_accompaniment(
            lyrics=text,
            melody_path=singing_audio_path,
            output_path=accompaniment_midi_path,
            tempo_value=tempo_value,
            key=key,
            time_signature="4/4",
            style=style
        )
    except Exception as e:
        print(f"Error generating accompaniment: {e}")
        # We'll handle this with the convert_midi_to_wav function that creates a placeholder
    
    # Convert MIDI to WAV (silently falls back to a placeholder on failure).
    accompaniment_path = "temp_accompaniment.wav"
    convert_midi_to_wav(accompaniment_midi_path, accompaniment_path)
    
    # Step 6: Mix singing voice with accompaniment
    final_output_path = "output_song.wav"
    
    # Load singing audio
    singing = AudioSegment.from_file(singing_audio_path)
    
    # Load accompaniment or create placeholder if loading fails
    try:
        accompaniment = AudioSegment.from_file(accompaniment_path)
    except Exception as e:
        print(f"Error loading accompaniment: {e}")
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)
    
    # Adjust volumes (pydub: subtraction is attenuation in dB).
    singing = singing - 3  # Reduce singing volume slightly
    accompaniment = accompaniment - 10  # Reduce accompaniment volume more
    
    # Make sure accompaniment is the same length as singing
    # NOTE(review): a zero-length accompaniment would raise ZeroDivisionError
    # here — confirm placeholder audio can never be empty.
    if len(accompaniment) < len(singing):
        # Loop accompaniment to match singing length
        times_to_repeat = (len(singing) / len(accompaniment)) + 1
        accompaniment = accompaniment * int(times_to_repeat)
    
    # Trim (or keep) the accompaniment to exactly the singing's duration.
    accompaniment = accompaniment[:len(singing)]
    
    # Mix tracks: overlay plays both simultaneously from t=0.
    mixed = singing.overlay(accompaniment)
    mixed.export(final_output_path, format="wav")
    
    return speech_audio_path, final_output_path

# Create Gradio interface.
# Layout: left column collects lyrics + voice/tempo/pitch controls;
# right column shows both the raw speech and the final mixed song.
with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
    gr.Markdown("# Text2Sing-DiffSinger")
    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")
    
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text to convert to singing",
                placeholder="Type your lyrics here...",
                lines=5
            )
            
            # Control row: values mirror process_text_to_singing's defaults
            # (neutral voice, 100 BPM, no pitch shift).
            with gr.Row():
                voice_type = gr.Dropdown(
                    label="Voice Type",
                    choices=["neutral", "feminine", "masculine"],
                    value="neutral"
                )
                tempo = gr.Slider(
                    label="Tempo (BPM)",
                    minimum=60,
                    maximum=180,
                    value=100,
                    step=5
                )
                pitch_shift = gr.Slider(
                    label="Pitch Adjustment",
                    minimum=-12,
                    maximum=12,
                    value=0,
                    step=1
                )
            
            convert_btn = gr.Button("Convert to Singing")
        
        with gr.Column():
            # Audio players: original synthesized speech and final song,
            # matching the (speech_path, final_path) tuple returned below.
            input_audio = gr.Audio(label="Original Speech")
            output_audio = gr.Audio(label="Singing Output")
    
    # Wire the button to the full pipeline; outputs populate both players.
    convert_btn.click(
        fn=process_text_to_singing,
        inputs=[text_input, voice_type, tempo, pitch_shift],
        outputs=[input_audio, output_audio]
    )

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()