Spaces:

Vaishnavi0404
/

Text2Sing-DiffSinger

Running

App Files Files Community

Vaishnavi0404 committed on Apr 11, 2025

Commit

86e14e9

verified ·

1 Parent(s): c8e3569

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -11

app.py CHANGED Viewed

@@ -1,15 +1,159 @@
 import gradio as gr
-from diff_singer_infer import run_diffsinger_inference
-def convert_to_singing_style(input_audio):
-    return run_diffsinger_inference(input_audio)
-demo = gr.Interface(
-    fn=convert_to_singing_style,
-    inputs=gr.Audio(type="filepath", label="Input TTS + Music Audio"),
-    outputs=gr.Audio(type="filepath", label="Singing Style Output"),
-    title="🎤 Text2Sing - DiffSinger Inference",
-    description="Upload merged TTS + Music audio and convert it to expressive singing voice using pitch/vibrato modification."
-)
-demo.launch()

+import os
 import gradio as gr
+import torch
+import numpy as np
+import librosa
+import text2emotion as te
+import nltk
+import soundfile as sf
+from pydub import AudioSegment
+from transformers import pipeline
+from music_generator import generate_accompaniment
+from text_processor import TextProcessor
+from voice_synthesizer import VoiceSynthesizer
+from singing_converter import SingingConverter
+# Download necessary NLTK data
+nltk.download('omw-1.4')
+nltk.download('vader_lexicon')
+nltk.download('punkt')
+# Initialize components
+text_processor = TextProcessor()
+voice_synthesizer = VoiceSynthesizer()
+singing_converter = SingingConverter()
+# Setup sentiment analysis
+sentiment_analyzer = pipeline("sentiment-analysis")
+def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
+    """
+    Convert text to singing voice with accompaniment based on mood
+    Args:
+        text (str): Input text to be converted to singing
+        voice_type (str): Type of voice (neutral, feminine, masculine)
+        tempo (int): Speed of the singing (60-180 BPM)
+        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)
+    Returns:
+        tuple: (input_audio_path, output_audio_path)
+    """
+    # Step 1: Analyze text for emotion/mood
+    emotions = te.get_emotion(text)
+    dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0]
+    # Additional sentiment analysis
+    sentiment_result = sentiment_analyzer(text)[0]
+    sentiment_score = sentiment_result['score'] * (1 if sentiment_result['label'] == 'POSITIVE' else -1)
+    print(f"Detected emotion: {dominant_emotion}")
+    print(f"Sentiment score: {sentiment_score}")
+    # Step 2: Process text for pronunciation and timing
+    phonemes, durations, stress_markers = text_processor.process(text)
+    # Step 3: Generate speech audio first
+    speech_audio_path = "temp_speech.wav"
+    voice_synthesizer.synthesize(
+        text=text,
+        output_path=speech_audio_path,
+        voice_type=voice_type
+    )
+    # Step 4: Convert speech to singing
+    singing_audio_path = "temp_singing.wav"
+    singing_converter.convert(
+        speech_path=speech_audio_path,
+        output_path=singing_audio_path,
+        emotion=dominant_emotion,
+        phonemes=phonemes,
+        durations=durations,
+        stress_markers=stress_markers,
+        pitch_shift=pitch_shift,
+        tempo=tempo
+    )
+    # Step 5: Generate musical accompaniment based on mood
+    accompaniment_path = "temp_accompaniment.wav"
+    generate_accompaniment(
+        emotion=dominant_emotion,
+        sentiment_score=sentiment_score,
+        tempo=tempo,
+        output_path=accompaniment_path
+    )
+    # Step 6: Mix singing voice with accompaniment
+    final_output_path = "output_song.wav"
+    # Load audio files
+    singing = AudioSegment.from_file(singing_audio_path)
+    accompaniment = AudioSegment.from_file(accompaniment_path)
+    # Adjust volumes
+    singing = singing - 3  # Reduce singing volume slightly
+    accompaniment = accompaniment - 10  # Reduce accompaniment volume more
+    # Make sure accompaniment is the same length as singing
+    if len(accompaniment) < len(singing):
+        # Loop accompaniment to match singing length
+        times_to_repeat = (len(singing) / len(accompaniment)) + 1
+        accompaniment = accompaniment * int(times_to_repeat)
+    accompaniment = accompaniment[:len(singing)]
+    # Mix tracks
+    mixed = singing.overlay(accompaniment)
+    mixed.export(final_output_path, format="wav")
+    return speech_audio_path, final_output_path
+# Create Gradio interface
+# Layout: one row with two columns - inputs and the convert button on the left,
+# the two audio players on the right.
+with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
+    gr.Markdown("# Text2Sing-DiffSinger")
+    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")
+    with gr.Row():
+        with gr.Column():
+            # Lyrics / text to be sung
+            text_input = gr.Textbox(
+                label="Enter text to convert to singing",
+                placeholder="Type your lyrics here...",
+                lines=5
+            )
+            # Synthesis controls; defaults match process_text_to_singing's defaults
+            with gr.Row():
+                voice_type = gr.Dropdown(
+                    label="Voice Type",
+                    choices=["neutral", "feminine", "masculine"],
+                    value="neutral"
+                )
+                tempo = gr.Slider(
+                    label="Tempo (BPM)",
+                    minimum=60,
+                    maximum=180,
+                    value=100,
+                    step=5
+                )
+                pitch_shift = gr.Slider(
+                    label="Pitch Adjustment",
+                    minimum=-12,
+                    maximum=12,
+                    value=0,
+                    step=1
+                )
+            convert_btn = gr.Button("Convert to Singing")
+        with gr.Column():
+            # Players for the two values returned by process_text_to_singing
+            input_audio = gr.Audio(label="Original Speech")
+            output_audio = gr.Audio(label="Singing Output")
+    # Wire the button: inputs are passed positionally in this order, and the
+    # returned (speech path, final mix path) pair fills the two players in order
+    convert_btn.click(
+        fn=process_text_to_singing,
+        inputs=[text_input, voice_type, tempo, pitch_shift],
+        outputs=[input_audio, output_audio]
+    )
+# Launch the app
+# Guarded so the server starts only when this file is run as a script
+if __name__ == "__main__":
+    demo.launch()