import os
import gradio as gr
import torch
import numpy as np
import librosa
import text2emotion as te
import nltk
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline
from music_generator import generate_accompaniment
from text_processor import TextProcessor
from voice_synthesizer import VoiceSynthesizer
from singing_converter import SingingConverter
import setup
import sys
import subprocess
# Download the NLTK corpora/models needed by text processing and sentiment
# analysis. nltk.download() is idempotent, so the duplicated calls in the
# original list ('punkt', 'omw-1.4') have been collapsed into one pass.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
):
    nltk.download(_resource)

# Make the vendored g2pM and DiffSinger checkouts importable.
sys.path.append('./g2pM')
sys.path.append('./DiffSinger')

# One-time asset preparation (speaker embeddings) before components start.
setup.setup_speaker_embeddings()

# Initialize pipeline components as module-level singletons shared by all
# requests handled by the Gradio app.
text_processor = TextProcessor()
voice_synthesizer = VoiceSynthesizer()
singing_converter = SingingConverter()

# Hugging Face sentiment pipeline (default English sentiment model).
sentiment_analyzer = pipeline("sentiment-analysis")
def create_placeholder_audio(output_path, duration=5, sample_rate=22050):
    """Write *duration* seconds of silence to *output_path* and return the path.

    Used as a fallback wherever real audio generation fails, so downstream
    mixing code always has a readable WAV file to work with.
    """
    n_samples = int(duration * sample_rate)
    sf.write(output_path, np.zeros(n_samples), sample_rate)
    return output_path
def convert_midi_to_wav(midi_path, wav_path, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2'):
    """Render a MIDI file to WAV using the fluidsynth command-line tool.

    Falls back to a silent placeholder WAV whenever rendering is not possible
    (missing MIDI file, missing soundfont, fluidsynth not installed, or
    fluidsynth exiting with an error), so callers always receive a usable file.

    Args:
        midi_path (str): Path to the input MIDI file.
        wav_path (str): Path where the rendered WAV is written.
        soundfont_path (str): SoundFont (.sf2) used for synthesis.

    Returns:
        str: ``wav_path`` on success, or the placeholder path on fallback.
    """
    if not os.path.exists(midi_path):
        print(f"MIDI file not found: {midi_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)
    # fluidsynth fails late (and noisily) on a missing soundfont; detect it
    # up front and fall back immediately.
    if not os.path.exists(soundfont_path):
        print(f"Soundfont not found: {soundfont_path}. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)
    try:
        # -n: no MIDI driver, -i: non-interactive (don't open fluidsynth's
        # shell); -a file / -F render the output to wav_path.
        subprocess.run([
            'fluidsynth',
            '-ni',
            '-a', 'file',
            '-F', wav_path,
            soundfont_path,
            midi_path
        ], check=True)
        return wav_path
    except subprocess.CalledProcessError as e:
        print(f"Error converting MIDI to WAV: {e}")
        return create_placeholder_audio(wav_path)
    except FileNotFoundError:
        # The fluidsynth binary itself is absent.
        print("fluidsynth not found. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)
def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
    """
    Convert text to singing voice with accompaniment based on mood.

    Pipeline: emotion + sentiment analysis -> phoneme/timing extraction ->
    speech synthesis -> speech-to-singing conversion -> MIDI accompaniment
    generation -> MIDI-to-WAV rendering -> mixdown.

    Args:
        text (str): Input text to be converted to singing
        voice_type (str): Type of voice (neutral, feminine, masculine)
        tempo (int): Speed of the singing (60-180 BPM)
        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)

    Returns:
        tuple: (input_audio_path, output_audio_path)
    """
    # Step 1: Analyze text for emotion/mood; default to "Happy" when
    # text2emotion returns nothing.
    emotions = te.get_emotion(text)
    dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0] if emotions else "Happy"

    # Additional sentiment analysis; score is signed (negative for NEGATIVE).
    sentiment_result = sentiment_analyzer(text)[0]
    sentiment_score = sentiment_result['score'] * (1 if sentiment_result['label'] == 'POSITIVE' else -1)
    print(f"Detected emotion: {dominant_emotion}")
    print(f"Sentiment score: {sentiment_score}")

    # Step 2: Process text for pronunciation and timing
    phonemes, durations, stress_markers = text_processor.process(text)

    # Step 3: Generate plain speech audio first
    speech_audio_path = "temp_speech.wav"
    voice_synthesizer.synthesize(
        text=text,
        output_path=speech_audio_path,
        voice_type=voice_type
    )

    # Step 4: Convert speech to singing
    singing_audio_path = "temp_singing.wav"
    singing_converter.convert(
        speech_path=speech_audio_path,
        output_path=singing_audio_path,
        emotion=dominant_emotion,
        phonemes=phonemes,
        durations=durations,
        stress_markers=stress_markers,
        pitch_shift=pitch_shift,
        tempo=tempo
    )

    # Step 5: Generate musical accompaniment based on mood
    accompaniment_midi_path = "temp_accompaniment.mid"

    # Map emotion to musical key (minor keys for darker moods).
    emotion_key_map = {
        "Happy": "C",
        "Sad": "Am",
        "Angry": "Em",
        "Fear": "Dm",
        "Surprise": "G"
    }
    key = emotion_key_map.get(dominant_emotion, "C")
    style = "pop"  # Default style
    tempo_value = tempo

    try:
        # Try to generate the accompaniment MIDI
        generate_accompaniment(
            lyrics=text,
            melody_path=singing_audio_path,
            output_path=accompaniment_midi_path,
            tempo_value=tempo_value,
            key=key,
            time_signature="4/4",
            style=style
        )
    except Exception as e:
        print(f"Error generating accompaniment: {e}")
        # Fall through: convert_midi_to_wav substitutes a placeholder when
        # the MIDI file is missing.

    # Convert MIDI to WAV (silent placeholder on any failure)
    accompaniment_path = "temp_accompaniment.wav"
    convert_midi_to_wav(accompaniment_midi_path, accompaniment_path)

    # Step 6: Mix singing voice with accompaniment
    final_output_path = "output_song.wav"
    singing = AudioSegment.from_file(singing_audio_path)
    try:
        accompaniment = AudioSegment.from_file(accompaniment_path)
    except Exception as e:
        print(f"Error loading accompaniment: {e}")
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)

    # A zero-length accompaniment would divide by zero in the loop math
    # below; replace it with placeholder silence first.
    if len(accompaniment) == 0:
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)

    # Adjust relative volumes (dB): keep the voice in front of the backing.
    singing = singing - 3
    accompaniment = accompaniment - 10

    # Loop the accompaniment to cover the singing, then trim to its length.
    if len(accompaniment) < len(singing):
        times_to_repeat = len(singing) // len(accompaniment) + 1  # integer, always enough repeats
        accompaniment = accompaniment * times_to_repeat
    accompaniment = accompaniment[:len(singing)]

    # Mix tracks and export the final song.
    mixed = singing.overlay(accompaniment)
    mixed.export(final_output_path, format="wav")
    return speech_audio_path, final_output_path
# Create Gradio interface
# Layout: two columns — inputs (text, voice controls, button) on the left,
# the original speech and the final mixed song on the right.
# NOTE(review): original indentation was lost in extraction; nesting below is
# reconstructed from the widget order — confirm against the running app.
with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
    gr.Markdown("# Text2Sing-DiffSinger")
    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")
    with gr.Row():
        with gr.Column():
            # Lyrics input
            text_input = gr.Textbox(
                label="Enter text to convert to singing",
                placeholder="Type your lyrics here...",
                lines=5
            )
            with gr.Row():
                # Voice/performance controls; ranges mirror the
                # process_text_to_singing parameter documentation.
                voice_type = gr.Dropdown(
                    label="Voice Type",
                    choices=["neutral", "feminine", "masculine"],
                    value="neutral"
                )
                tempo = gr.Slider(
                    label="Tempo (BPM)",
                    minimum=60,
                    maximum=180,
                    value=100,
                    step=5
                )
                pitch_shift = gr.Slider(
                    label="Pitch Adjustment",
                    minimum=-12,
                    maximum=12,
                    value=0,
                    step=1
                )
            convert_btn = gr.Button("Convert to Singing")
        with gr.Column():
            # Outputs: intermediate speech and the final mixed song.
            input_audio = gr.Audio(label="Original Speech")
            output_audio = gr.Audio(label="Singing Output")
    # Wire the button to the pipeline; outputs map to the two audio widgets.
    convert_btn.click(
        fn=process_text_to_singing,
        inputs=[text_input, voice_type, tempo, pitch_shift],
        outputs=[input_audio, output_audio]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()