Soprano-TTS

Sleeping

App Files Community

ekwek committed on Jan 13

Commit

c2c4056

verified ·

1 Parent(s): 7a89eff

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -29

app.py CHANGED Viewed

@@ -18,8 +18,8 @@ def load_model():
         model = SopranoTTS(
             backend="auto",
             device=DEVICE,
-            cache_size_mb=100,
-            decoder_batch_size=1,
         )
     return model
@@ -31,18 +31,22 @@ def generate_speech(
     temperature: float = 0.3,
     top_p: float = 0.95,
     repetition_penalty: float = 1.2,
-) -> tuple:
     """
     Runs Soprano text-to-speech model with the given input text and sampling parameters.
     Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
     """
     if not text.strip():
-        return None, "Please enter some text to generate speech."
-    print(text)
     try:
         model = load_model()
         start_time = time.perf_counter()
         audio = model.infer(
@@ -66,31 +70,29 @@ def generate_speech(
             f"({rtf:.2f}x realtime)"
         )
-        return (SAMPLE_RATE, audio_int16), status
     except Exception as e:
-        return None, f"✗ Error: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Soprano TTS") as demo:
     gr.Markdown(
         f"""
 # 🗣️ Soprano TTS
 **Running on: {DEVICE.upper()}**
-Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time,
-high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency**
-and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.
 **GitHub:** https://github.com/ekwek1/soprano
 **Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
-**Model Weights:** https://huggingface.co/ekwek/Soprano-80M
 """
     )
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
@@ -100,7 +102,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
                 lines=5,
                 max_lines=10,
             )
             with gr.Accordion("Advanced Settings", open=False):
                 temperature = gr.Slider(
                     minimum=0.1,
@@ -109,7 +110,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
                     step=0.05,
                     label="Temperature",
                 )
                 top_p = gr.Slider(
                     minimum=0.5,
                     maximum=1.0,
@@ -117,7 +117,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
                     step=0.05,
                     label="Top P",
                 )
                 repetition_penalty = gr.Slider(
                     minimum=1.0,
                     maximum=2.0,
@@ -125,34 +124,29 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
                     step=0.1,
                     label="Repetition Penalty",
                 )
             generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 label="Generated Speech",
                 type="numpy",
                 autoplay=True,
             )
             status_output = gr.Textbox(
                 label="Status",
                 interactive=False,
                 lines=3,
                 max_lines=10
             )
     gr.Examples(
         examples=[
             ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
-            ["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2],
-            ["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2],
             ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
         ],
         inputs=[text_input, temperature, top_p, repetition_penalty],
         label="Example Prompts",
     )
     generate_btn.click(
         fn=generate_speech,
         inputs=[text_input, temperature, top_p, repetition_penalty],
@@ -161,18 +155,19 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
     gr.Markdown(
         f"""
 ### Usage tips:
 - Soprano works best when each sentence is between 2 and 15 seconds long.
 - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
-  Best results can be achieved by converting these into their phonetic form.
-  (1+1 -> one plus one, etc)
 - If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
-  You may also change the sampling settings for more varied results.
 - Avoid improper grammar such as not using contractions, multiple spaces, etc.
 """
     )
 def main():
     demo.launch(
         mcp_server=True,
         theme=gr.themes.Soft(primary_hue="green"),

         model = SopranoTTS(
             backend="auto",
             device=DEVICE,
+            cache_size_mb=10000,
+            decoder_batch_size=8,
         )
     return model
     temperature: float = 0.3,
     top_p: float = 0.95,
     repetition_penalty: float = 1.2,
+):
     """
     Runs Soprano text-to-speech model with the given input text and sampling parameters.
     Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
     """
     if not text.strip():
+        yield None, "Please enter some text to generate speech."
+        return
+    try: print(text.split('\n')[0])
+    except: pass
     try:
+        yield None, "⏳ Loading model..."
         model = load_model()
+        yield None, "⏳ Generating audio..."
         start_time = time.perf_counter()
         audio = model.infer(
             f"({rtf:.2f}x realtime)"
         )
+        yield (SAMPLE_RATE, audio_int16), status
+        return
     except Exception as e:
+        yield None, f"✗ Error: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Soprano TTS") as demo:
     gr.Markdown(
         f"""
 # 🗣️ Soprano TTS
+<div align="center">
+<img width="300" height="300" alt="soprano-github" src="https://github.com/user-attachments/assets/4d612eac-23b8-44e6-8c59-d7ac14ebafd1" />
+</div>
 **Running on: {DEVICE.upper()}**
 **GitHub:** https://github.com/ekwek1/soprano
+**Model Weights:** https://huggingface.co/ekwek/Soprano-80M
 **Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
 """
     )
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
                 lines=5,
                 max_lines=10,
             )
             with gr.Accordion("Advanced Settings", open=False):
                 temperature = gr.Slider(
                     minimum=0.1,
                     step=0.05,
                     label="Temperature",
                 )
                 top_p = gr.Slider(
                     minimum=0.5,
                     maximum=1.0,
                     step=0.05,
                     label="Top P",
                 )
                 repetition_penalty = gr.Slider(
                     minimum=1.0,
                     maximum=2.0,
                     step=0.1,
                     label="Repetition Penalty",
                 )
             generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 label="Generated Speech",
                 type="numpy",
                 autoplay=True,
             )
             status_output = gr.Textbox(
                 label="Status",
                 interactive=False,
                 lines=3,
                 max_lines=10
             )
     gr.Examples(
         examples=[
             ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
             ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
+            ["I'm so excited, I can't even wait!", 0.3, 0.95, 1.2],
+            ["Why don't you go ahead and try it?", 0.3, 0.95, 1.2],
         ],
         inputs=[text_input, temperature, top_p, repetition_penalty],
         label="Example Prompts",
     )
     generate_btn.click(
         fn=generate_speech,
         inputs=[text_input, temperature, top_p, repetition_penalty],
     gr.Markdown(
         f"""
 ### Usage tips:
+- Note: Soprano is currently **English-only**. Other languages are not guaranteed to work.
 - Soprano works best when each sentence is between 2 and 15 seconds long.
 - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
+Best results can be achieved by converting these into their phonetic form.
+(1+1 -> one plus one, etc)
 - If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
+You may also change the sampling settings for more varied results.
 - Avoid improper grammar such as not using contractions, multiple spaces, etc.
 """
     )
 def main():
+    # Start Gradio interface
     demo.launch(
         mcp_server=True,
         theme=gr.themes.Soft(primary_hue="green"),