# Hugging Face Space: Tacotron 2 + HiFi-GAN text-to-speech demo
import os
import tempfile

import gradio as gr
import torch
import torchaudio
from fastapi import FastAPI
from speechbrain.pretrained import Tacotron2, HIFIGAN
# Run inference on the GPU when one is visible, otherwise fall back to CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Output sample rate in Hz (the LJSpeech rate both pretrained models use).
SR = 22050
def _load_pretrained(model_cls, source, savedir):
    """Fetch a pretrained SpeechBrain model from the Hub onto DEVICE."""
    return model_cls.from_hparams(
        source=source,
        savedir=savedir,
        run_opts={"device": DEVICE},
    )


# Acoustic model: text -> mel spectrogram.
tacotron2 = _load_pretrained(
    Tacotron2, "speechbrain/tts-tacotron2-ljspeech", "models/tacotron2"
)
# Vocoder: mel spectrogram -> waveform.
hifigan = _load_pretrained(
    HIFIGAN, "speechbrain/tts-hifigan-ljspeech", "models/hifigan"
)
def synth(text):
    """Synthesize speech from *text* with Tacotron 2 + HiFi-GAN.

    Args:
        text: Input string (may be None or blank).

    Returns:
        Path to a mono WAV file at SR Hz, or None when the input is empty.
    """
    text = (text or "").strip()
    if not text:
        return None
    # Tokenize, then pad very short token sequences: the Tacotron2 encoder's
    # convolutions need a minimum number of timesteps.
    seq, seq_len = tacotron2.text_to_seq(text)
    seq = [int(tok) for tok in seq]
    min_tokens = 5
    pad_id = 0
    if len(seq) < min_tokens:
        seq = seq + [pad_id] * (min_tokens - len(seq))
    seq_len = len(seq)
    seq = torch.tensor(seq, dtype=torch.long, device=DEVICE).unsqueeze(0)
    seq_len = torch.tensor([seq_len], device=DEVICE)
    mel, _, _ = tacotron2.infer(seq, seq_len)
    # Keep the mel at least 5 frames wide for vocoder safety.
    # BUG FIX: the original called F.pad, but `F` (torch.nn.functional) was
    # never imported, so this branch raised NameError whenever it was hit.
    if mel.shape[-1] < 5:
        mel = torch.nn.functional.pad(mel, (0, 5 - mel.shape[-1]), mode="replicate")
    wav = hifigan.decode_batch(mel)
    if wav.dim() == 3:
        wav = wav.squeeze(1)  # (batch, 1, time) -> (batch, time)
    wav = wav[0].cpu()
    # BUG FIX: a fixed "out.wav" is clobbered by concurrent Gradio requests;
    # write each synthesis to its own temp file instead.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # torchaudio.save reopens by path
    torchaudio.save(out_path, wav.unsqueeze(0), SR)
    return out_path
# Minimal web UI: one multi-line text box in, one audio player out.
_text_input = gr.Textbox(label="Text", lines=3)
_audio_output = gr.Audio(type="filepath", label="Output")
io = gr.Interface(
    fn=synth,
    inputs=_text_input,
    outputs=_audio_output,
    title="Tacotron 2 + HiFi-GAN",
    allow_flagging="never",
    api_name=False,
)
# Expose the Gradio UI at the root path of a FastAPI application.
app = gr.mount_gradio_app(FastAPI(), io, path="/")