Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import torch
|
| 2 |
import gradio as gr
|
| 3 |
import torchaudio
|
| 4 |
-
# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
| 5 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
|
| 6 |
from transformers.models.speecht5 import SpeechT5HifiGan
|
| 7 |
|
|
@@ -15,7 +14,6 @@ vocoder = vocoder.to(device)
|
|
| 15 |
|
| 16 |
speaker_embedding = torch.zeros(1, 512).to(device)
|
| 17 |
|
| 18 |
-
# Load model and processor
|
| 19 |
# processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5_4")
|
| 20 |
# model = SpeechT5ForTextToSpeech.from_pretrained(
|
| 21 |
# "nambn0321/TTS_with_T5_4",
|
|
@@ -24,12 +22,10 @@ speaker_embedding = torch.zeros(1, 512).to(device)
|
|
| 24 |
# )
|
| 25 |
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 26 |
|
| 27 |
-
# # Move to CUDA if available
|
| 28 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
# model = model.to(device)
|
| 30 |
# vocoder = vocoder.to(device)
|
| 31 |
|
| 32 |
-
# # # Dummy speaker embedding (or load your real one here)
|
| 33 |
# speaker_embedding = torch.tensor([[-0.06632216, -0.02325863, 0.04376163, 0.01112046, -0.02864115,
|
| 34 |
# -0.03048201, -0.04865832, 0.00598873, 0.03105048, 0.01635859,
|
| 35 |
# -0.07552029, -0.09258246, 0.04839027, 0.04307159, 0.05019059,
|
|
@@ -135,37 +131,30 @@ speaker_embedding = torch.zeros(1, 512).to(device)
|
|
| 135 |
# 0.02549847, -0.06043207]]).to(device)
|
| 136 |
|
| 137 |
def tts_generate(text):
|
| 138 |
-
print(f"π Input text: {text}")
|
| 139 |
try:
|
| 140 |
# Preprocess input
|
| 141 |
-
print("π Processing input...")
|
| 142 |
inputs = processor(text=text, return_tensors="pt").to(device)
|
| 143 |
-
print("β
Text processed.")
|
| 144 |
|
| 145 |
# Generate waveform directly (with vocoder)
|
| 146 |
-
print("π€ Generating speech waveform...")
|
| 147 |
with torch.no_grad():
|
| 148 |
waveform = model.generate_speech(
|
| 149 |
inputs["input_ids"],
|
| 150 |
speaker_embedding,
|
| 151 |
vocoder=vocoder
|
| 152 |
)
|
| 153 |
-
print("β
Waveform generated.")
|
| 154 |
|
| 155 |
# Save waveform
|
| 156 |
output_path = "output.wav"
|
| 157 |
if waveform.dim() == 1:
|
| 158 |
waveform = waveform.unsqueeze(0)
|
| 159 |
torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
|
| 160 |
-
print(f"πΎ Audio saved to {output_path}")
|
| 161 |
|
| 162 |
return output_path
|
| 163 |
|
| 164 |
except Exception as e:
|
| 165 |
-
print("
|
| 166 |
return "Error during speech synthesis."
|
| 167 |
|
| 168 |
-
# Gradio interface
|
| 169 |
demo = gr.Interface(
|
| 170 |
fn=tts_generate,
|
| 171 |
inputs=gr.Textbox(label="Enter text"),
|
|
@@ -175,6 +164,6 @@ demo = gr.Interface(
|
|
| 175 |
)
|
| 176 |
|
| 177 |
if __name__ == "__main__":
|
| 178 |
-
print("
|
| 179 |
demo.launch()
|
| 180 |
|
|
|
|
| 1 |
import torch
|
| 2 |
import gradio as gr
|
| 3 |
import torchaudio
|
|
|
|
| 4 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
|
| 5 |
from transformers.models.speecht5 import SpeechT5HifiGan
|
| 6 |
|
|
|
|
| 14 |
|
| 15 |
speaker_embedding = torch.zeros(1, 512).to(device)
|
| 16 |
|
|
|
|
| 17 |
# processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5_4")
|
| 18 |
# model = SpeechT5ForTextToSpeech.from_pretrained(
|
| 19 |
# "nambn0321/TTS_with_T5_4",
|
|
|
|
| 22 |
# )
|
| 23 |
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 24 |
|
|
|
|
| 25 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 26 |
# model = model.to(device)
|
| 27 |
# vocoder = vocoder.to(device)
|
| 28 |
|
|
|
|
| 29 |
# speaker_embedding = torch.tensor([[-0.06632216, -0.02325863, 0.04376163, 0.01112046, -0.02864115,
|
| 30 |
# -0.03048201, -0.04865832, 0.00598873, 0.03105048, 0.01635859,
|
| 31 |
# -0.07552029, -0.09258246, 0.04839027, 0.04307159, 0.05019059,
|
|
|
|
| 131 |
# 0.02549847, -0.06043207]]).to(device)
|
| 132 |
|
| 133 |
def tts_generate(text):
    """Synthesize speech from *text* with SpeechT5 and return a WAV path.

    Parameters
    ----------
    text : str
        Input sentence to synthesize.

    Returns
    -------
    str
        Path to the generated 16 kHz WAV file, or the error-message
        string ``"Error during speech synthesis."`` when synthesis fails
        (the Gradio UI displays whatever this returns).
    """
    # Robustness: an empty/whitespace prompt has nothing to synthesize;
    # bail out with the same message the error path already uses.
    if not text or not text.strip():
        return "Error during speech synthesis."

    try:
        # Tokenize the input and move the tensors to the model's device.
        inputs = processor(text=text, return_tensors="pt").to(device)

        # Generate the waveform directly: passing vocoder= makes
        # generate_speech return audio samples instead of a spectrogram.
        with torch.no_grad():
            waveform = model.generate_speech(
                inputs["input_ids"],
                speaker_embedding,
                vocoder=vocoder,
            )

        # torchaudio.save expects a (channels, samples) tensor, so lift a
        # mono 1-D waveform to shape (1, samples).
        output_path = "output.wav"
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        # 16 kHz is the rate the SpeechT5 / HiFi-GAN pipeline produces
        # — presumably; TODO confirm against the trained checkpoint.
        torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)

        return output_path
    except Exception as e:
        # Deliberate UI-boundary handler: log the cause and hand the UI a
        # message instead of crashing the Gradio worker.
        print("Error during TTS generation:", e)
        return "Error during speech synthesis."
|
| 157 |
|
|
|
|
| 158 |
demo = gr.Interface(
|
| 159 |
fn=tts_generate,
|
| 160 |
inputs=gr.Textbox(label="Enter text"),
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
# Script entry point: launch the Gradio web UI only when run directly
# (not when this module is imported, e.g. by a Space runner).
if __name__ == "__main__":
    print("Launching Gradio demo")
    demo.launch()
|
| 169 |
|