Spaces:

ak6868674
/

TTS-ASMR

Sleeping

App Files Community

ak6868674 committed on Jul 23, 2025

Commit

ba530c7

verified ·

1 Parent(s): 48ea4e0

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -13

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import torch
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from datasets import load_dataset
 import soundfile as sf
 from pydub import AudioSegment
 import os
@@ -12,23 +11,29 @@ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-# Speaker embeddings
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-# Rain sound
 DEFAULT_RAIN = "rain.mp3"
 RAIN_URL = "https://cdn.pixabay.com/download/audio/2022/03/15/audio_7e9f0b47b6.mp3?filename=gentle-rain-ambient-11022.mp3"
 if not os.path.exists(DEFAULT_RAIN):
-    r = requests.get(RAIN_URL)
-    with open(DEFAULT_RAIN, "wb") as f:
-        f.write(r.content)
 def generate_audio(prompt, emotion, speed, background_audio):
     if not prompt:
         raise gr.Error("Text cannot be empty.")
     inputs = processor(text=prompt, return_tensors="pt")
     with torch.no_grad():
         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
@@ -36,13 +41,16 @@ def generate_audio(prompt, emotion, speed, background_audio):
     temp_wav = "voice.wav"
     sf.write(temp_wav, speech.numpy(), samplerate=16000)
-    # Overlay rain
     final_audio = AudioSegment.from_file(temp_wav)
     if speed != 1.0:
         final_audio = final_audio._spawn(final_audio.raw_data, overrides={
             "frame_rate": int(final_audio.frame_rate * speed)
         }).set_frame_rate(final_audio.frame_rate)
     try:
         if background_audio:
             bg = AudioSegment.from_file(background_audio).apply_gain(-20)
@@ -51,17 +59,20 @@ def generate_audio(prompt, emotion, speed, background_audio):
         bg = bg[:len(final_audio)]
         final_audio = final_audio.overlay(bg)
     except Exception as e:
-        print(f"Background failed: {e}")
     output_path = "final_output.mp3"
     final_audio.export(output_path, format="mp3")
-    return output_path, f"Generated with SpeechT5 + ASMR rain"
 with gr.Blocks() as app:
-    gr.Markdown("# 🎧 Midnight History ASMR TTS (SpeechT5)")
     with gr.Row():
         with gr.Column():
-            text_input = gr.Textbox(label="Enter Text", lines=8)
             emotion_choice = gr.Dropdown(["calm", "neutral"], value="calm", label="Emotion")
             speed_slider = gr.Slider(0.7, 1.3, value=0.9, step=0.05, label="Speed")
             bg_audio = gr.Audio(label="Upload Background (Optional)", type="filepath")

 import gradio as gr
 import torch
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
 from pydub import AudioSegment
 import os
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# Generate a random but fixed speaker embedding
+speaker_embeddings = torch.rand(1, 512)
+# Rain background sound
 DEFAULT_RAIN = "rain.mp3"
 RAIN_URL = "https://cdn.pixabay.com/download/audio/2022/03/15/audio_7e9f0b47b6.mp3?filename=gentle-rain-ambient-11022.mp3"
 if not os.path.exists(DEFAULT_RAIN):
+    try:
+        r = requests.get(RAIN_URL)
+        with open(DEFAULT_RAIN, "wb") as f:
+            f.write(r.content)
+    except Exception as e:
+        print(f"Error downloading rain: {e}")
 def generate_audio(prompt, emotion, speed, background_audio):
     if not prompt:
         raise gr.Error("Text cannot be empty.")
+    # Add ASMR effect for calm emotion
+    if emotion == "calm":
+        prompt = "... " + prompt.replace(".", "... ")
     inputs = processor(text=prompt, return_tensors="pt")
     with torch.no_grad():
         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     temp_wav = "voice.wav"
     sf.write(temp_wav, speech.numpy(), samplerate=16000)
+    # Load audio and apply adjustments
     final_audio = AudioSegment.from_file(temp_wav)
+    # Adjust speed for ASMR
     if speed != 1.0:
         final_audio = final_audio._spawn(final_audio.raw_data, overrides={
             "frame_rate": int(final_audio.frame_rate * speed)
         }).set_frame_rate(final_audio.frame_rate)
+    # Add background rain or user-uploaded audio
     try:
         if background_audio:
             bg = AudioSegment.from_file(background_audio).apply_gain(-20)
         bg = bg[:len(final_audio)]
         final_audio = final_audio.overlay(bg)
     except Exception as e:
+        print(f"Background merge failed: {e}")
     output_path = "final_output.mp3"
     final_audio.export(output_path, format="mp3")
+    return output_path, "✅ Audio generated successfully!"
+# Gradio UI
 with gr.Blocks() as app:
+    gr.Markdown("# 🎧 Midnight History ASMR TTS")
+    gr.Markdown("Convert your text into soothing ASMR audio with background rain.")
     with gr.Row():
         with gr.Column():
+            text_input = gr.Textbox(label="Enter Text", placeholder="Paste your script...", lines=8)
             emotion_choice = gr.Dropdown(["calm", "neutral"], value="calm", label="Emotion")
             speed_slider = gr.Slider(0.7, 1.3, value=0.9, step=0.05, label="Speed")
             bg_audio = gr.Audio(label="Upload Background (Optional)", type="filepath")