| import os |
| import gradio as gr |
| import torch |
| import numpy as np |
| import librosa |
| import text2emotion as te |
| import nltk |
| import soundfile as sf |
| from pydub import AudioSegment |
| from transformers import pipeline |
| from music_generator import generate_accompaniment |
| from text_processor import TextProcessor |
| from voice_synthesizer import VoiceSynthesizer |
| from singing_converter import SingingConverter |
| import setup |
| import sys |
| import subprocess |
|
|
# NLTK corpora/models needed by text processing and emotion analysis.
# The original script downloaded 'punkt' and 'omw-1.4' twice; the list is
# deduplicated here. nltk.download() is a no-op for already-present
# resources, so running this on every startup is safe.
_NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
]
for _resource in _NLTK_RESOURCES:
    nltk.download(_resource)


# Make the vendored g2pM and DiffSinger checkouts importable.
sys.path.append('./g2pM')
sys.path.append('./DiffSinger')


# Download/prepare speaker-embedding assets used by the voice synthesizer.
setup.setup_speaker_embeddings()


# Module-level pipeline components shared across requests.
text_processor = TextProcessor()
voice_synthesizer = VoiceSynthesizer()
singing_converter = SingingConverter()


# HuggingFace sentiment pipeline (default checkpoint) used to score the
# input text's polarity alongside text2emotion's categorical emotions.
sentiment_analyzer = pipeline("sentiment-analysis")
|
|
def create_placeholder_audio(output_path, duration=5, sample_rate=22050):
    """Write `duration` seconds of silence to `output_path` and return the path."""
    n_samples = int(duration * sample_rate)
    sf.write(output_path, np.zeros(n_samples), sample_rate)
    return output_path
|
|
def convert_midi_to_wav(midi_path, wav_path, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2'):
    """Render a MIDI file to WAV using fluidsynth.

    Falls back to a silent placeholder WAV whenever rendering is impossible
    (missing MIDI file, missing soundfont, fluidsynth not installed, or a
    render error), so the downstream mixing step always has a file to load.

    Args:
        midi_path (str): Path to the input MIDI file.
        wav_path (str): Destination path for the rendered WAV.
        soundfont_path (str): SoundFont (.sf2) used for synthesis.

    Returns:
        str: Path of the WAV file that was written (render or placeholder).
    """
    if not os.path.exists(midi_path):
        print(f"MIDI file not found: {midi_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)

    # Fail fast with a clear message when the soundfont is absent; otherwise
    # fluidsynth would exit non-zero with a far less obvious error.
    if not os.path.exists(soundfont_path):
        print(f"SoundFont not found: {soundfont_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)

    try:
        # '-a file' selects fluidsynth's file audio driver; '-F' sets the
        # output path for the rendered audio.
        subprocess.run([
            'fluidsynth',
            '-a', 'file',
            '-F', wav_path,
            soundfont_path,
            midi_path
        ], check=True)
        return wav_path
    except subprocess.CalledProcessError as e:
        print(f"Error converting MIDI to WAV: {e}")
        return create_placeholder_audio(wav_path)
    except FileNotFoundError:
        # The fluidsynth binary itself is not installed.
        print("fluidsynth not found. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)
|
|
def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
    """
    Convert text to singing voice with accompaniment based on mood.

    Pipeline: emotion/sentiment analysis -> phoneme extraction -> TTS speech
    -> speech-to-singing conversion -> MIDI accompaniment generation ->
    MIDI-to-WAV rendering -> mixdown of vocals over accompaniment.

    Args:
        text (str): Input text to be converted to singing
        voice_type (str): Type of voice (neutral, feminine, masculine)
        tempo (int): Speed of the singing (60-180 BPM)
        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)

    Returns:
        tuple: (input_audio_path, output_audio_path)
    """
    # --- Mood analysis ----------------------------------------------------
    # text2emotion returns per-emotion scores; default to "Happy" when it
    # yields nothing (e.g. empty or fully neutral text).
    emotions = te.get_emotion(text)
    dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0] if emotions else "Happy"

    # Signed sentiment in [-1, 1]: POSITIVE keeps the score, anything else
    # negates it. Currently only logged for diagnostics.
    sentiment_result = sentiment_analyzer(text)[0]
    sentiment_score = sentiment_result['score'] * (1 if sentiment_result['label'] == 'POSITIVE' else -1)

    print(f"Detected emotion: {dominant_emotion}")
    print(f"Sentiment score: {sentiment_score}")

    # --- Linguistic features and vocals -----------------------------------
    phonemes, durations, stress_markers = text_processor.process(text)

    # Plain TTS rendering (also returned so the user can compare it with
    # the final sung output).
    speech_audio_path = "temp_speech.wav"
    voice_synthesizer.synthesize(
        text=text,
        output_path=speech_audio_path,
        voice_type=voice_type
    )

    # Convert the spoken audio into singing, shaped by the detected emotion.
    singing_audio_path = "temp_singing.wav"
    singing_converter.convert(
        speech_path=speech_audio_path,
        output_path=singing_audio_path,
        emotion=dominant_emotion,
        phonemes=phonemes,
        durations=durations,
        stress_markers=stress_markers,
        pitch_shift=pitch_shift,
        tempo=tempo
    )

    # --- Accompaniment ----------------------------------------------------
    accompaniment_midi_path = "temp_accompaniment.mid"

    # Map the dominant emotion to a musical key; default to C major.
    emotion_key_map = {
        "Happy": "C",
        "Sad": "Am",
        "Angry": "Em",
        "Fear": "Dm",
        "Surprise": "G"
    }
    key = emotion_key_map.get(dominant_emotion, "C")
    style = "pop"

    # Best-effort: if accompaniment generation fails we still produce a
    # song (convert_midi_to_wav substitutes silence for a missing MIDI).
    try:
        generate_accompaniment(
            lyrics=text,
            melody_path=singing_audio_path,
            output_path=accompaniment_midi_path,
            tempo_value=tempo,
            key=key,
            time_signature="4/4",
            style=style
        )
    except Exception as e:
        print(f"Error generating accompaniment: {e}")

    accompaniment_path = "temp_accompaniment.wav"
    convert_midi_to_wav(accompaniment_midi_path, accompaniment_path)

    # --- Mixdown ----------------------------------------------------------
    final_output_path = "output_song.wav"

    singing = AudioSegment.from_file(singing_audio_path)

    try:
        accompaniment = AudioSegment.from_file(accompaniment_path)
    except Exception as e:
        # Unreadable/corrupt accompaniment: replace with silence so the
        # mix can still be produced.
        print(f"Error loading accompaniment: {e}")
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)

    # Level balancing (dB): vocals slightly down, accompaniment well below.
    singing = singing - 3
    accompaniment = accompaniment - 10

    # Loop the accompaniment to cover the vocals, then trim to exact length.
    # The zero-length guard fixes a ZeroDivisionError the original code hit
    # when the accompaniment file was empty; slicing/overlaying an empty
    # segment is harmless, so it simply contributes nothing to the mix.
    if 0 < len(accompaniment) < len(singing):
        times_to_repeat = len(singing) // len(accompaniment) + 1
        accompaniment = accompaniment * times_to_repeat
    accompaniment = accompaniment[:len(singing)]

    mixed = singing.overlay(accompaniment)
    mixed.export(final_output_path, format="wav")

    return speech_audio_path, final_output_path
|
|
| |
# --- Gradio UI --------------------------------------------------------------
# Left column: lyrics input plus voice/tempo/pitch controls and the trigger
# button. Right column: playback widgets for the raw TTS speech and the
# final mixed song.
with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
    gr.Markdown("# Text2Sing-DiffSinger")
    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")

    with gr.Row():
        with gr.Column():
            # Lyrics / text to be sung.
            text_input = gr.Textbox(
                label="Enter text to convert to singing",
                placeholder="Type your lyrics here...",
                lines=5
            )

            with gr.Row():
                # Voice timbre, passed through to the voice synthesizer.
                voice_type = gr.Dropdown(
                    label="Voice Type",
                    choices=["neutral", "feminine", "masculine"],
                    value="neutral"
                )
                # Singing speed in BPM (matches process_text_to_singing's
                # documented 60-180 range).
                tempo = gr.Slider(
                    label="Tempo (BPM)",
                    minimum=60,
                    maximum=180,
                    value=100,
                    step=5
                )
                # Pitch shift in semitones, -12..+12.
                pitch_shift = gr.Slider(
                    label="Pitch Adjustment",
                    minimum=-12,
                    maximum=12,
                    value=0,
                    step=1
                )

            convert_btn = gr.Button("Convert to Singing")

        with gr.Column():
            # Outputs: original TTS speech and the final mixed song.
            input_audio = gr.Audio(label="Original Speech")
            output_audio = gr.Audio(label="Singing Output")

    # Wire the button to the end-to-end pipeline; outputs map positionally
    # to process_text_to_singing's (speech_path, song_path) return tuple.
    convert_btn.click(
        fn=process_text_to_singing,
        inputs=[text_input, voice_type, tempo, pitch_shift],
        outputs=[input_audio, output_audio]
    )
|
|
| |
if __name__ == "__main__":
    # Start the Gradio server with default host/port when run as a script.
    demo.launch()