nambn0321 committed on
Commit
c577d87
·
verified ·
1 Parent(s): f492d3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -13
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import torch
2
  import gradio as gr
3
  import torchaudio
4
- # from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
6
  from transformers.models.speecht5 import SpeechT5HifiGan
7
 
@@ -15,7 +14,6 @@ vocoder = vocoder.to(device)
15
 
16
  speaker_embedding = torch.zeros(1, 512).to(device)
17
 
18
- # Load model and processor
19
  # processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5_4")
20
  # model = SpeechT5ForTextToSpeech.from_pretrained(
21
  # "nambn0321/TTS_with_T5_4",
@@ -24,12 +22,10 @@ speaker_embedding = torch.zeros(1, 512).to(device)
24
  # )
25
  # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
26
 
27
- # # Move to CUDA if available
28
  # device = "cuda" if torch.cuda.is_available() else "cpu"
29
  # model = model.to(device)
30
  # vocoder = vocoder.to(device)
31
 
32
- # # # Dummy speaker embedding (or load your real one here)
33
  # speaker_embedding = torch.tensor([[-0.06632216, -0.02325863, 0.04376163, 0.01112046, -0.02864115,
34
  # -0.03048201, -0.04865832, 0.00598873, 0.03105048, 0.01635859,
35
  # -0.07552029, -0.09258246, 0.04839027, 0.04307159, 0.05019059,
@@ -135,37 +131,30 @@ speaker_embedding = torch.zeros(1, 512).to(device)
135
  # 0.02549847, -0.06043207]]).to(device)
136
 
137
  def tts_generate(text):
138
- print(f"📝 Input text: {text}")
139
  try:
140
  # Preprocess input
141
- print("🔄 Processing input...")
142
  inputs = processor(text=text, return_tensors="pt").to(device)
143
- print("✅ Text processed.")
144
 
145
  # Generate waveform directly (with vocoder)
146
- print("🎤 Generating speech waveform...")
147
  with torch.no_grad():
148
  waveform = model.generate_speech(
149
  inputs["input_ids"],
150
  speaker_embedding,
151
  vocoder=vocoder
152
  )
153
- print("✅ Waveform generated.")
154
 
155
  # Save waveform
156
  output_path = "output.wav"
157
  if waveform.dim() == 1:
158
  waveform = waveform.unsqueeze(0)
159
  torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
160
- print(f"💾 Audio saved to {output_path}")
161
 
162
  return output_path
163
 
164
  except Exception as e:
165
- print("❌ Error during TTS generation:", e)
166
  return "Error during speech synthesis."
167
 
168
- # Gradio interface
169
  demo = gr.Interface(
170
  fn=tts_generate,
171
  inputs=gr.Textbox(label="Enter text"),
@@ -175,6 +164,6 @@ demo = gr.Interface(
175
  )
176
 
177
  if __name__ == "__main__":
178
- print("🚀 Launching Gradio demo...")
179
  demo.launch()
180
 
 
1
  import torch
2
  import gradio as gr
3
  import torchaudio
 
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
5
  from transformers.models.speecht5 import SpeechT5HifiGan
6
 
 
14
 
15
  speaker_embedding = torch.zeros(1, 512).to(device)
16
 
 
17
  # processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5_4")
18
  # model = SpeechT5ForTextToSpeech.from_pretrained(
19
  # "nambn0321/TTS_with_T5_4",
 
22
  # )
23
  # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
24
 
 
25
  # device = "cuda" if torch.cuda.is_available() else "cpu"
26
  # model = model.to(device)
27
  # vocoder = vocoder.to(device)
28
 
 
29
  # speaker_embedding = torch.tensor([[-0.06632216, -0.02325863, 0.04376163, 0.01112046, -0.02864115,
30
  # -0.03048201, -0.04865832, 0.00598873, 0.03105048, 0.01635859,
31
  # -0.07552029, -0.09258246, 0.04839027, 0.04307159, 0.05019059,
 
131
  # 0.02549847, -0.06043207]]).to(device)
132
 
133
  def tts_generate(text):
 
134
  try:
135
  # Preprocess input
 
136
  inputs = processor(text=text, return_tensors="pt").to(device)
 
137
 
138
  # Generate waveform directly (with vocoder)
 
139
  with torch.no_grad():
140
  waveform = model.generate_speech(
141
  inputs["input_ids"],
142
  speaker_embedding,
143
  vocoder=vocoder
144
  )
 
145
 
146
  # Save waveform
147
  output_path = "output.wav"
148
  if waveform.dim() == 1:
149
  waveform = waveform.unsqueeze(0)
150
  torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
 
151
 
152
  return output_path
153
 
154
  except Exception as e:
155
+ print("Error during TTS generation:", e)
156
  return "Error during speech synthesis."
157
 
 
158
  demo = gr.Interface(
159
  fn=tts_generate,
160
  inputs=gr.Textbox(label="Enter text"),
 
164
  )
165
 
166
  if __name__ == "__main__":
167
+ print("Launching Gradio demo")
168
  demo.launch()
169