Hyprlyf committed on
Commit
7f38d2b
·
verified ·
1 Parent(s): f4b2387

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -8
app.py CHANGED
@@ -1,17 +1,39 @@
1
- from transformers import SpeechT5HifiGan
 
 
2
  import torch
 
 
 
 
3
 
4
- # Load TTS model
5
  model_name = "microsoft/speecht5_tts"
6
  processor = SpeechT5Processor.from_pretrained(model_name)
7
  model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)
8
-
9
- # Load default HiFi-GAN vocoder
10
  vocoder = SpeechT5HifiGan.from_pretrained(model_name)
11
 
12
- # Example speaker embedding
13
- # Hugging Face dataset example: cmu-arctic-xvectors
14
- # Here we can use 'matthijs/cmu-arctic-xvectors' speaker
15
- from datasets import load_dataset
16
  dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="train[:1]")
17
  speaker_embedding = torch.tensor(dataset[0]["xvector"]).unsqueeze(0).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
+ from datasets import load_dataset
4
  import torch
5
+ import soundfile as sf
6
+ import numpy as np
7
+
8
# Run on the GPU when one is available; every model and tensor below is moved
# to this device so they can be combined in a single forward pass.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model
# The acoustic model maps tokenized text to a mel spectrogram.
model_name = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)

# The HiFi-GAN vocoder is published as its own checkpoint. Loading it from the
# TTS checkpoint (as before) leaves its weights uninitialized, so it cannot
# produce intelligible audio. It is also moved to `device` to match `model`.
vocoder_name = "microsoft/speecht5_hifigan"
vocoder = SpeechT5HifiGan.from_pretrained(vocoder_name).to(device)

# Load a default speaker embedding
# The x-vector dataset is read from its "validation" split, which is the split
# the official SpeechT5 examples use; only the first row is fetched.
# NOTE(review): row 0 is an arbitrary speaker - pick another index for a
# different voice.
dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation[:1]")
speaker_embedding = torch.tensor(dataset[0]["xvector"]).unsqueeze(0).to(device)
19
+
20
+ # TTS function
21
def text_to_speech(text):
    """Synthesize speech for ``text`` using the default speaker embedding.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input is
            treated as "nothing to say".

    Returns:
        ``None`` when there is no text to synthesize; otherwise a
        ``(sample_rate, waveform)`` tuple, which is the order Gradio's
        ``Audio(type="numpy")`` output expects. ``waveform`` is a NumPy array
        on the CPU.

    Side effects:
        Writes the generated audio to ``output.wav`` in the working directory.
        NOTE(review): the path is fixed, so concurrent requests overwrite each
        other's file - confirm whether this file is needed at all.
    """
    # Nothing to synthesize: returning None leaves the audio output empty
    # instead of running the model on an empty token sequence.
    if not text or not text.strip():
        return None

    inputs = processor(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        # Without `vocoder`, generate_speech returns a mel spectrogram rather
        # than a waveform. `.to(device)` is a no-op when the vocoder is
        # already on the right device and avoids a CPU/GPU mismatch otherwise.
        # NOTE(review): the result is only intelligible if `vocoder` holds
        # trained HiFi-GAN weights - verify the module-level load.
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embedding,
            vocoder=vocoder.to(device),
        )

    audio_np = speech.squeeze().cpu().numpy()
    samplerate = processor.feature_extractor.sampling_rate
    sf.write("output.wav", audio_np, samplerate)
    # Gradio's numpy audio format is (sample_rate, data), not (data, rate).
    return (samplerate, audio_np)
30
+
31
# Input widget: a two-line text box for the sentence to be spoken.
text_input = gr.Textbox(lines=2, placeholder="Type your text here...")

# Output widget: plays the synthesized audio as soon as it is returned.
audio_output = gr.Audio(type="numpy", autoplay=True)

# Wire the TTS function to the widgets as a single-page demo.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=text_input,
    outputs=audio_output,
    title="SpeechT5 TTS Demo",
    description="Type text and get speech output with default speaker voice.",
)

# Start the web server (blocks until the app is stopped).
iface.launch()