"""Text2Sing-DiffSinger: Gradio app that converts text into a singing voice
with emotion-driven musical accompaniment.

Pipeline: emotion analysis -> phoneme/timing processing -> TTS -> speech-to-
singing conversion -> MIDI accompaniment generation -> MIDI->WAV rendering ->
final mix.
"""

import os
import sys
import subprocess

import gradio as gr
import torch
import numpy as np
import librosa
import text2emotion as te
import nltk
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline

from music_generator import generate_accompaniment
from text_processor import TextProcessor
from voice_synthesizer import VoiceSynthesizer
from singing_converter import SingingConverter
import setup

# Download the NLTK data needed for tokenization / emotion analysis.
# NOTE: deduplicated — the original script downloaded 'punkt' and 'omw-1.4'
# twice in separate call sites.
for _corpus in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
):
    nltk.download(_corpus)

# Make the vendored g2pM and DiffSinger checkouts importable.
sys.path.append('./g2pM')
sys.path.append('./DiffSinger')

setup.setup_speaker_embeddings()

# Initialize components
text_processor = TextProcessor()
voice_synthesizer = VoiceSynthesizer()
singing_converter = SingingConverter()

# Setup sentiment analysis (HuggingFace default sentiment pipeline).
sentiment_analyzer = pipeline("sentiment-analysis")


def create_placeholder_audio(output_path, duration=5, sample_rate=22050):
    """Create a placeholder silence audio file.

    Args:
        output_path (str): Path the WAV file is written to.
        duration (float): Length of the silence in seconds.
        sample_rate (int): Sample rate of the generated file.

    Returns:
        str: ``output_path``, for convenient chaining by callers.
    """
    silence = np.zeros(int(duration * sample_rate))
    sf.write(output_path, silence, sample_rate)
    return output_path


def convert_midi_to_wav(midi_path, wav_path,
                        soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2'):
    """Convert a MIDI file to WAV using the ``fluidsynth`` CLI.

    Falls back to a silent placeholder WAV when the MIDI file is missing,
    fluidsynth fails, or fluidsynth is not installed — the app degrades
    gracefully instead of crashing.

    Args:
        midi_path (str): Input MIDI file.
        wav_path (str): Output WAV path.
        soundfont_path (str): SoundFont used for rendering.

    Returns:
        str: Path to the rendered (or placeholder) WAV file.
    """
    # Check if the MIDI file exists
    if not os.path.exists(midi_path):
        print(f"MIDI file not found: {midi_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)
    try:
        # Use fluidsynth to convert MIDI to WAV ('-a file' = file audio driver).
        subprocess.run([
            'fluidsynth',
            '-a', 'file',
            '-F', wav_path,
            soundfont_path,
            midi_path
        ], check=True)
        return wav_path
    except subprocess.CalledProcessError as e:
        print(f"Error converting MIDI to WAV: {e}")
        return create_placeholder_audio(wav_path)
    except FileNotFoundError:
        # fluidsynth binary itself is absent.
        print("fluidsynth not found. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)


def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
    """
    Convert text to singing voice with accompaniment based on mood

    Args:
        text (str): Input text to be converted to singing
        voice_type (str): Type of voice (neutral, feminine, masculine)
        tempo (int): Speed of the singing (60-180 BPM)
        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)

    Returns:
        tuple: (input_audio_path, output_audio_path)
    """
    # Step 1: Analyze text for emotion/mood
    emotions = te.get_emotion(text)
    dominant_emotion = (
        max(emotions.items(), key=lambda x: x[1])[0] if emotions else "Happy"
    )

    # Additional sentiment analysis: signed score in [-1, 1].
    sentiment_result = sentiment_analyzer(text)[0]
    sentiment_score = sentiment_result['score'] * (
        1 if sentiment_result['label'] == 'POSITIVE' else -1
    )
    print(f"Detected emotion: {dominant_emotion}")
    print(f"Sentiment score: {sentiment_score}")

    # Step 2: Process text for pronunciation and timing
    phonemes, durations, stress_markers = text_processor.process(text)

    # Step 3: Generate speech audio first
    speech_audio_path = "temp_speech.wav"
    voice_synthesizer.synthesize(
        text=text,
        output_path=speech_audio_path,
        voice_type=voice_type
    )

    # Step 4: Convert speech to singing
    singing_audio_path = "temp_singing.wav"
    singing_converter.convert(
        speech_path=speech_audio_path,
        output_path=singing_audio_path,
        emotion=dominant_emotion,
        phonemes=phonemes,
        durations=durations,
        stress_markers=stress_markers,
        pitch_shift=pitch_shift,
        tempo=tempo
    )

    # Step 5: Generate musical accompaniment based on mood
    accompaniment_midi_path = "temp_accompaniment.mid"

    # Map emotion to musical key (major for positive, minor for darker moods).
    emotion_key_map = {
        "Happy": "C",
        "Sad": "Am",
        "Angry": "Em",
        "Fear": "Dm",
        "Surprise": "G"
    }
    key = emotion_key_map.get(dominant_emotion, "C")
    style = "pop"  # Default style
    tempo_value = tempo

    try:
        # Try to generate the accompaniment MIDI
        generate_accompaniment(
            lyrics=text,
            melody_path=singing_audio_path,
            output_path=accompaniment_midi_path,
            tempo_value=tempo_value,
            key=key,
            time_signature="4/4",
            style=style
        )
    except Exception as e:
        print(f"Error generating accompaniment: {e}")
        # convert_midi_to_wav below creates a placeholder when the MIDI
        # file is missing, so no further handling is needed here.

    # Convert MIDI to WAV
    accompaniment_path = "temp_accompaniment.wav"
    convert_midi_to_wav(accompaniment_midi_path, accompaniment_path)

    # Step 6: Mix singing voice with accompaniment
    final_output_path = "output_song.wav"

    # Load singing audio
    singing = AudioSegment.from_file(singing_audio_path)

    # Load accompaniment or create placeholder if loading fails
    try:
        accompaniment = AudioSegment.from_file(accompaniment_path)
    except Exception as e:
        print(f"Error loading accompaniment: {e}")
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)

    # Adjust volumes
    singing = singing - 3            # Reduce singing volume slightly
    accompaniment = accompaniment - 10  # Reduce accompaniment volume more

    # Loop the accompaniment so it is at least as long as the vocal, then
    # trim to the exact vocal length. max(..., 1) guards against a
    # zero-length accompaniment segment (would raise ZeroDivisionError).
    if len(accompaniment) < len(singing):
        times_to_repeat = len(singing) // max(len(accompaniment), 1) + 1
        accompaniment = accompaniment * times_to_repeat
    accompaniment = accompaniment[:len(singing)]

    # Mix tracks
    mixed = singing.overlay(accompaniment)
    mixed.export(final_output_path, format="wav")

    return speech_audio_path, final_output_path


# Create Gradio interface
with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
    gr.Markdown("# Text2Sing-DiffSinger")
    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text to convert to singing",
                placeholder="Type your lyrics here...",
                lines=5
            )
            with gr.Row():
                voice_type = gr.Dropdown(
                    label="Voice Type",
                    choices=["neutral", "feminine", "masculine"],
                    value="neutral"
                )
                tempo = gr.Slider(
                    label="Tempo (BPM)",
                    minimum=60,
                    maximum=180,
                    value=100,
                    step=5
                )
                pitch_shift = gr.Slider(
                    label="Pitch Adjustment",
                    minimum=-12,
                    maximum=12,
                    value=0,
                    step=1
                )
            convert_btn = gr.Button("Convert to Singing")

        with gr.Column():
            input_audio = gr.Audio(label="Original Speech")
            output_audio = gr.Audio(label="Singing Output")

    convert_btn.click(
        fn=process_text_to_singing,
        inputs=[text_input, voice_type, tempo, pitch_shift],
        outputs=[input_audio, output_audio]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()