ak6868674 committed on
Commit
ba530c7
·
verified ·
1 Parent(s): 48ea4e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -13
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import torch
3
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
4
- from datasets import load_dataset
5
  import soundfile as sf
6
  from pydub import AudioSegment
7
  import os
@@ -12,23 +11,29 @@ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
12
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
13
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
14
 
15
- # Speaker embeddings
16
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
17
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
18
 
19
- # Rain sound
20
  DEFAULT_RAIN = "rain.mp3"
21
  RAIN_URL = "https://cdn.pixabay.com/download/audio/2022/03/15/audio_7e9f0b47b6.mp3?filename=gentle-rain-ambient-11022.mp3"
22
 
23
  if not os.path.exists(DEFAULT_RAIN):
24
- r = requests.get(RAIN_URL)
25
- with open(DEFAULT_RAIN, "wb") as f:
26
- f.write(r.content)
 
 
 
27
 
28
  def generate_audio(prompt, emotion, speed, background_audio):
29
  if not prompt:
30
  raise gr.Error("Text cannot be empty.")
31
 
 
 
 
 
32
  inputs = processor(text=prompt, return_tensors="pt")
33
  with torch.no_grad():
34
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
@@ -36,13 +41,16 @@ def generate_audio(prompt, emotion, speed, background_audio):
36
  temp_wav = "voice.wav"
37
  sf.write(temp_wav, speech.numpy(), samplerate=16000)
38
 
39
- # Overlay rain
40
  final_audio = AudioSegment.from_file(temp_wav)
 
 
41
  if speed != 1.0:
42
  final_audio = final_audio._spawn(final_audio.raw_data, overrides={
43
  "frame_rate": int(final_audio.frame_rate * speed)
44
  }).set_frame_rate(final_audio.frame_rate)
45
 
 
46
  try:
47
  if background_audio:
48
  bg = AudioSegment.from_file(background_audio).apply_gain(-20)
@@ -51,17 +59,20 @@ def generate_audio(prompt, emotion, speed, background_audio):
51
  bg = bg[:len(final_audio)]
52
  final_audio = final_audio.overlay(bg)
53
  except Exception as e:
54
- print(f"Background failed: {e}")
55
 
56
  output_path = "final_output.mp3"
57
  final_audio.export(output_path, format="mp3")
58
- return output_path, f"Generated with SpeechT5 + ASMR rain"
59
 
 
60
  with gr.Blocks() as app:
61
- gr.Markdown("# 🎧 Midnight History ASMR TTS (SpeechT5)")
 
 
62
  with gr.Row():
63
  with gr.Column():
64
- text_input = gr.Textbox(label="Enter Text", lines=8)
65
  emotion_choice = gr.Dropdown(["calm", "neutral"], value="calm", label="Emotion")
66
  speed_slider = gr.Slider(0.7, 1.3, value=0.9, step=0.05, label="Speed")
67
  bg_audio = gr.Audio(label="Upload Background (Optional)", type="filepath")
 
1
  import gradio as gr
2
  import torch
3
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
4
  import soundfile as sf
5
  from pydub import AudioSegment
6
  import os
 
11
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
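+ # Generate a random speaker embedding once at startup; it is reused for every request in this process (no seed is set in the visible code, so it presumably differs between restarts)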
15
+ speaker_embeddings = torch.rand(1, 512)
 
16
 
17
+ # Rain background sound
18
  DEFAULT_RAIN = "rain.mp3"
19
  RAIN_URL = "https://cdn.pixabay.com/download/audio/2022/03/15/audio_7e9f0b47b6.mp3?filename=gentle-rain-ambient-11022.mp3"
20
 
21
  if not os.path.exists(DEFAULT_RAIN):
22
+ try:
23
+ r = requests.get(RAIN_URL)
24
+ with open(DEFAULT_RAIN, "wb") as f:
25
+ f.write(r.content)
26
+ except Exception as e:
27
+ print(f"Error downloading rain: {e}")
28
 
29
  def generate_audio(prompt, emotion, speed, background_audio):
30
  if not prompt:
31
  raise gr.Error("Text cannot be empty.")
32
 
33
+ # Add ASMR effect for calm emotion
34
+ if emotion == "calm":
35
+ prompt = "... " + prompt.replace(".", "... ")
36
+
37
  inputs = processor(text=prompt, return_tensors="pt")
38
  with torch.no_grad():
39
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
 
41
  temp_wav = "voice.wav"
42
  sf.write(temp_wav, speech.numpy(), samplerate=16000)
43
 
44
+ # Load audio and apply adjustments
45
  final_audio = AudioSegment.from_file(temp_wav)
46
+
47
+ # Adjust speed for ASMR
48
  if speed != 1.0:
49
  final_audio = final_audio._spawn(final_audio.raw_data, overrides={
50
  "frame_rate": int(final_audio.frame_rate * speed)
51
  }).set_frame_rate(final_audio.frame_rate)
52
 
53
+ # Add background rain or user-uploaded audio
54
  try:
55
  if background_audio:
56
  bg = AudioSegment.from_file(background_audio).apply_gain(-20)
 
59
  bg = bg[:len(final_audio)]
60
  final_audio = final_audio.overlay(bg)
61
  except Exception as e:
62
+ print(f"Background merge failed: {e}")
63
 
64
  output_path = "final_output.mp3"
65
  final_audio.export(output_path, format="mp3")
66
+ return output_path, " Audio generated successfully!"
67
 
68
+ # Gradio UI
69
  with gr.Blocks() as app:
70
+ gr.Markdown("# 🎧 Midnight History ASMR TTS")
71
+ gr.Markdown("Convert your text into soothing ASMR audio with background rain.")
72
+
73
  with gr.Row():
74
  with gr.Column():
75
+ text_input = gr.Textbox(label="Enter Text", placeholder="Paste your script...", lines=8)
76
  emotion_choice = gr.Dropdown(["calm", "neutral"], value="calm", label="Emotion")
77
  speed_slider = gr.Slider(0.7, 1.3, value=0.9, step=0.05, label="Speed")
78
  bg_audio = gr.Audio(label="Upload Background (Optional)", type="filepath")