File size: 7,969 Bytes
86e14e9
b6e4f5d
86e14e9
 
 
 
 
 
 
 
 
 
 
 
c63d652
494f115
1973e0f
 
1dc5139
d60d010
 
 
 
848cf46
ab3ff38
494f115
cb441ff
 
2328878
c63d652
b6e4f5d
86e14e9
 
 
 
b6e4f5d
86e14e9
 
 
 
b6e4f5d
86e14e9
 
 
1973e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86e14e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1973e0f
86e14e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1973e0f
594632a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1973e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
929facd
 
86e14e9
 
 
 
1973e0f
86e14e9
1973e0f
 
 
 
 
 
 
 
86e14e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import os
import gradio as gr
import torch
import numpy as np
import librosa
import text2emotion as te
import nltk
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline
from music_generator import generate_accompaniment
from text_processor import TextProcessor
from voice_synthesizer import VoiceSynthesizer
from singing_converter import SingingConverter
import setup
import sys
import subprocess

# --- One-time runtime setup (module import side effects) ---

# Download required NLTK data. nltk.download() is idempotent, so each
# resource is listed exactly once (the original listed 'punkt' and
# 'omw-1.4' twice, triggering redundant downloader checks).
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
):
    nltk.download(_nltk_resource)

# Make the vendored g2pM and DiffSinger checkouts importable.
sys.path.append('./g2pM')
sys.path.append('./DiffSinger')

# Prepare speaker embeddings required by the voice synthesizer.
setup.setup_speaker_embeddings()

# Initialize pipeline components (heavy: model loading happens here).
text_processor = TextProcessor()
voice_synthesizer = VoiceSynthesizer()
singing_converter = SingingConverter()

# HuggingFace sentiment model, used alongside text2emotion's emotion scores.
sentiment_analyzer = pipeline("sentiment-analysis")

def create_placeholder_audio(output_path, duration=5, sample_rate=22050):
    """Write *duration* seconds of silence to *output_path* and return the path.

    Serves as a fallback audio file whenever real generation fails.
    """
    n_samples = int(duration * sample_rate)
    silence = np.zeros(n_samples)
    sf.write(output_path, silence, sample_rate)
    return output_path

def convert_midi_to_wav(midi_path, wav_path, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2'):
    """Render a MIDI file to WAV using the external `fluidsynth` command.

    Falls back to writing a silent placeholder WAV at *wav_path* when the
    MIDI file is missing, fluidsynth is not installed, or rendering fails.

    Returns the path of the audio file that was written.
    """
    # Guard: nothing to render if the MIDI never got generated.
    if not os.path.exists(midi_path):
        print(f"MIDI file not found: {midi_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)

    # '-a file' selects the file audio driver; '-F' names the output WAV.
    command = [
        'fluidsynth',
        '-a', 'file',
        '-F', wav_path,
        soundfont_path,
        midi_path,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error converting MIDI to WAV: {e}")
        return create_placeholder_audio(wav_path)
    except FileNotFoundError:
        print("fluidsynth not found. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)
    return wav_path

def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
    """
    Convert text to singing voice with accompaniment based on mood

    Pipeline: emotion/sentiment analysis -> phoneme/timing extraction ->
    speech synthesis -> speech-to-singing conversion -> MIDI accompaniment
    generation -> MIDI-to-WAV render -> final mix.

    Args:
        text (str): Input text to be converted to singing
        voice_type (str): Type of voice (neutral, feminine, masculine)
        tempo (int): Speed of the singing (60-180 BPM)
        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)

    Returns:
        tuple: (input_audio_path, output_audio_path)

    Note:
        All intermediate and final files use fixed names in the working
        directory ("temp_speech.wav", "output_song.wav", ...), so
        concurrent calls would overwrite each other's files.
    """
    # Step 1: Analyze text for emotion/mood.
    # text2emotion returns a mapping of emotion name -> score; default to
    # "Happy" when the result is empty/falsy.
    emotions = te.get_emotion(text)
    dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0] if emotions else "Happy"
    
    # Additional sentiment analysis: signed score, negative for 'NEGATIVE'.
    sentiment_result = sentiment_analyzer(text)[0]
    sentiment_score = sentiment_result['score'] * (1 if sentiment_result['label'] == 'POSITIVE' else -1)
    
    # NOTE(review): sentiment_score is printed but otherwise unused below —
    # confirm whether it was meant to influence key/tempo selection.
    print(f"Detected emotion: {dominant_emotion}")
    print(f"Sentiment score: {sentiment_score}")
    
    # Step 2: Process text for pronunciation and timing
    phonemes, durations, stress_markers = text_processor.process(text)
    
    # Step 3: Generate speech audio first
    speech_audio_path = "temp_speech.wav"
    voice_synthesizer.synthesize(
        text=text, 
        output_path=speech_audio_path, 
        voice_type=voice_type
    )
    
    # Step 4: Convert speech to singing
    singing_audio_path = "temp_singing.wav"
    singing_converter.convert(
        speech_path=speech_audio_path,
        output_path=singing_audio_path,
        emotion=dominant_emotion,
        phonemes=phonemes,
        durations=durations,
        stress_markers=stress_markers,
        pitch_shift=pitch_shift,
        tempo=tempo
    )
    
    # Step 5: Generate musical accompaniment based on mood
    accompaniment_midi_path = "temp_accompaniment.mid"
    
    # Map emotion to musical key: minor keys for negative emotions.
    # Emotions not listed (e.g. text2emotion's "Angry" vs other labels)
    # fall back to C major via .get() below.
    emotion_key_map = {
        "Happy": "C",
        "Sad": "Am",
        "Angry": "Em",
        "Fear": "Dm",
        "Surprise": "G"
    }
    
    key = emotion_key_map.get(dominant_emotion, "C")
    style = "pop"  # Default style
    
    # NOTE(review): earlier comment promised "adjust tempo based on emotion
    # if not explicitly set", but no adjustment is implemented — tempo_value
    # is always exactly the caller-supplied tempo.
    tempo_value = tempo
    
    try:
        # Try to generate the accompaniment MIDI
        generate_accompaniment(
            lyrics=text,
            melody_path=singing_audio_path,
            output_path=accompaniment_midi_path,
            tempo_value=tempo_value,
            key=key,
            time_signature="4/4",
            style=style
        )
    except Exception as e:
        print(f"Error generating accompaniment: {e}")
        # We'll handle this with the convert_midi_to_wav function that creates a placeholder
    
    # Convert MIDI to WAV (silently falls back to a placeholder on failure).
    accompaniment_path = "temp_accompaniment.wav"
    convert_midi_to_wav(accompaniment_midi_path, accompaniment_path)
    
    # Step 6: Mix singing voice with accompaniment
    final_output_path = "output_song.wav"
    
    # Load singing audio
    singing = AudioSegment.from_file(singing_audio_path)
    
    # Load accompaniment or create placeholder if loading fails
    try:
        accompaniment = AudioSegment.from_file(accompaniment_path)
    except Exception as e:
        print(f"Error loading accompaniment: {e}")
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)
    
    # Adjust volumes (pydub: subtraction is attenuation in dB).
    singing = singing - 3  # Reduce singing volume slightly
    accompaniment = accompaniment - 10  # Reduce accompaniment volume more
    
    # Make sure accompaniment is the same length as singing
    # NOTE(review): a zero-length accompaniment would raise ZeroDivisionError
    # here — confirm placeholder audio can never be empty.
    if len(accompaniment) < len(singing):
        # Loop accompaniment to match singing length
        times_to_repeat = (len(singing) / len(accompaniment)) + 1
        accompaniment = accompaniment * int(times_to_repeat)
    
    # Trim (or keep) the accompaniment to exactly the singing's duration.
    accompaniment = accompaniment[:len(singing)]
    
    # Mix tracks: overlay plays both simultaneously from t=0.
    mixed = singing.overlay(accompaniment)
    mixed.export(final_output_path, format="wav")
    
    return speech_audio_path, final_output_path

# Create Gradio interface.
# Layout: left column collects lyrics + voice/tempo/pitch controls;
# right column shows both the raw speech and the final mixed song.
with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
    gr.Markdown("# Text2Sing-DiffSinger")
    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")
    
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text to convert to singing",
                placeholder="Type your lyrics here...",
                lines=5
            )
            
            # Control row: values mirror process_text_to_singing's defaults
            # (neutral voice, 100 BPM, no pitch shift).
            with gr.Row():
                voice_type = gr.Dropdown(
                    label="Voice Type",
                    choices=["neutral", "feminine", "masculine"],
                    value="neutral"
                )
                tempo = gr.Slider(
                    label="Tempo (BPM)",
                    minimum=60,
                    maximum=180,
                    value=100,
                    step=5
                )
                pitch_shift = gr.Slider(
                    label="Pitch Adjustment",
                    minimum=-12,
                    maximum=12,
                    value=0,
                    step=1
                )
            
            convert_btn = gr.Button("Convert to Singing")
        
        with gr.Column():
            # Audio players: original synthesized speech and final song,
            # matching the (speech_path, final_path) tuple returned below.
            input_audio = gr.Audio(label="Original Speech")
            output_audio = gr.Audio(label="Singing Output")
    
    # Wire the button to the full pipeline; outputs populate both players.
    convert_btn.click(
        fn=process_text_to_singing,
        inputs=[text_input, voice_type, tempo, pitch_shift],
        outputs=[input_audio, output_audio]
    )

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()