Spaces:

nambn0321
/

TTS_run

Runtime error

App Files Community

nambn0321 committed on Jul 30, 2025

Commit

8d7e20b

verified ·

1 Parent(s): 0515d75

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -56

app.py CHANGED Viewed

@@ -1,69 +1,42 @@
-import os
-import sys
 import torch
-import json
-import numpy as np
 import gradio as gr
-import soundfile as sf
-from huggingface_hub import snapshot_download
-from safetensors.torch import load_file as safe_load_file
-from TTS.utils.synthesizer import Synthesizer
-# Download model repo from Hugging Face
-model_dir = snapshot_download(repo_id="nambn0321/TTS_model")
-# Add model directory to path so we can import models.py
-sys.path.append(model_dir)
-from models import Generator  # Now valid!
-# Load Glow-TTS synthesizer
-synthesizer = Synthesizer(
-    tts_checkpoint=os.path.join(model_dir, "best_model.pth"),
-    tts_config_path=os.path.join(model_dir, "config.json"),
-    use_cuda=torch.cuda.is_available()
-)
-# Load HiFi-GAN generator with safetensors
-hifigan_config_path = os.path.join(model_dir, "config (2).json")
-hifigan_checkpoint_path = os.path.join(model_dir, "model.safetensors")
-with open(hifigan_config_path, "r") as f:
-    hifigan_config = json.load(f)
-hifigan = Generator(hifigan_config)
-# Load safetensors weights safely
-state_dict = safe_load_file(hifigan_checkpoint_path, device="cpu")
-hifigan.load_state_dict(state_dict)
-hifigan.eval()
-if torch.cuda.is_available():
-    hifigan.cuda()
-# Inference pipeline: text ➝ mel ➝ waveform
-def tts(text):
-    # Generate mel spectrogram from text using Glow-TTS
-    mel = synthesizer.tts(text, None, None, return_wav=False).squeeze().cpu().numpy()
-    # Convert mel to tensor and add batch dim
-    mel_tensor = torch.from_numpy(mel).unsqueeze(0)
-    if torch.cuda.is_available():
-        mel_tensor = mel_tensor.cuda()
-    # Generate waveform audio from mel using HiFi-GAN vocoder
-    with torch.no_grad():
-        audio = hifigan(mel_tensor).cpu().squeeze().numpy()
-    # Save to output file
-    sf.write("output.wav", audio, 22050)
     return "output.wav"
 # Gradio interface
-gr.Interface(
-    fn=tts,
-    inputs=gr.Textbox(label="Enter Text"),
-    outputs=gr.Audio(label="Generated Speech"),
-    title="Glow-TTS + HiFi-GAN TTS",
-    description="Type text to synthesize speech using Glow-TTS and HiFi-GAN."
-).launch()

 import torch
 import gradio as gr
+import torchaudio
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+# Load model and processor
+processor = SpeechT5Processor.from_pretrained("your-username/your-model-name")
+model = SpeechT5ForTextToSpeech.from_pretrained("your-username/your-model-name")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# Move to CUDA if available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = model.to(device)
+vocoder = vocoder.to(device)
+# Dummy speaker embedding (or load your real one here)
+speaker_embedding = torch.zeros(1, 512).to(device)
+def tts_generate(text):
+    # Preprocess input
+    inputs = processor(text=text, return_tensors="pt").to(device)
+    # Generate mel spectrogram
+    with torch.no_grad():
+        mel = model.generate_speech(inputs["input_ids"], speaker_embedding)
+    # Convert mel spectrogram to waveform
+    waveform = vocoder(mel)
+    waveform = waveform.cpu()
+    # Save waveform to file
+    torchaudio.save("output.wav", waveform, sample_rate=16000)
     return "output.wav"
 # Gradio interface
+demo = gr.Interface(
+    fn=tts_generate,
+    inputs=gr.Textbox(label="Enter text"),
+    outputs=gr.Audio(label="Generated Speech", type="filepath"),
+    title="SpeechT5 Text-to-Speech",
+    description="Enter t