Spaces:

nambn0321
/

TTS_run

Runtime error

App Files Files Community

nambn0321 committed on Jul 28, 2025

Commit

d2b5676

verified ·

1 Parent(s): 2ef63bd

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -47

app.py CHANGED Viewed

@@ -1,68 +1,60 @@
 import os
 import torch
-import gradio as gr
 import numpy as np
 import soundfile as sf
-import json
 from huggingface_hub import snapshot_download
 from TTS.utils.synthesizer import Synthesizer
-# Step 1: Download and load Glow-TTS from Hugging Face
 model_dir = snapshot_download(repo_id="nambn0321/TTS_model")
 synthesizer = Synthesizer(
     tts_checkpoint=os.path.join(model_dir, "best_model.pth"),
     tts_config_path=os.path.join(model_dir, "config.json"),
     use_cuda=torch.cuda.is_available()
 )
-# Step 2: Load HiFi-GAN
 hifigan_checkpoint_path = os.path.join(model_dir, "g_02500000.pth")
-hifigan_config_path = os.path.join(model_dir, "config (1).json")  # or config.json if shared
-with open(hifigan_config_path) as f:
     hifigan_config = json.load(f)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-vocoder = Generator(hifigan_config).to(device)
-vocoder.load_state_dict(torch.load(hifigan_checkpoint_path, map_location=device)["generator"])
-vocoder.eval()
-# Step 3: Text → Mel → Waveform
-def tts_fn(text):
-    try:
-        # Generate mel spectrogram
-        mel = synthesizer.tts(text, use_glow=False, speaker_name=None, return_spec=True)
-        # De-normalize mel if Glow-TTS used symmetric normalization
-        mel = (mel + 1) * (4.0 / 2)  # from symmetric [-1, 1] → [0, 4]
-        # Convert to tensor
-        mel_tensor = torch.tensor(mel).unsqueeze(0).to(torch.float32).to(device)
-        # Generate waveform
-        with torch.no_grad():
-            audio = vocoder(mel_tensor).squeeze().cpu().numpy()
-        # Save to file
-        out_path = "output.wav"
-        sf.write(out_path, audio, samplerate=22050)
-        return out_path
-    except Exception as e:
-        error_msg = f"Error during TTS processing: {str(e)}"
-        print(error_msg)
-        return error_msg
-# Step 4: Launch Gradio app
 gr.Interface(
-    fn=tts_fn,
-    inputs=gr.Textbox(label="Enter Text"),
-    outputs=gr.Audio(label="Generated Audio", type="filepath"),
-    title="Glow-TTS + HiFi-GAN Vocoder",
-    description="Text-to-speech using a pretrained Glow-TTS model and your custom HiFi-GAN vocoder."
 ).launch()

 import os
+import json
 import torch
 import numpy as np
 import soundfile as sf
+import gradio as gr
 from huggingface_hub import snapshot_download
 from TTS.utils.synthesizer import Synthesizer
+from models import Generator  # Your HiFi-GAN Generator class
+# Download and load models
 model_dir = snapshot_download(repo_id="nambn0321/TTS_model")
+# Glow-TTS
 synthesizer = Synthesizer(
     tts_checkpoint=os.path.join(model_dir, "best_model.pth"),
     tts_config_path=os.path.join(model_dir, "config.json"),
     use_cuda=torch.cuda.is_available()
 )
+# HiFi-GAN
 hifigan_checkpoint_path = os.path.join(model_dir, "g_02500000.pth")
+hifigan_config_path = os.path.join(model_dir, "config (1).json")
+with open(hifigan_config_path, "r") as f:
     hifigan_config = json.load(f)
+hifigan = Generator(hifigan_config)
+hifigan.load_state_dict(torch.load(hifigan_checkpoint_path, map_location="cpu")["generator"])
+hifigan.eval()
+if torch.cuda.is_available():
+    hifigan.cuda()
+# Inference function
+def tts(text):
+    # Glow-TTS: text -> mel
+    wav_tensor = synthesizer.tts(text, None, None, return_wav=False)  # returns mel
+    mel = wav_tensor.squeeze().cpu().numpy()
+    # HiFi-GAN: mel -> waveform
+    mel_tensor = torch.from_numpy(mel).unsqueeze(0)  # [1, num_mels, T]
+    if torch.cuda.is_available():
+        mel_tensor = mel_tensor.cuda()
+    with torch.no_grad():
+        audio_tensor = hifigan(mel_tensor).cpu().squeeze()
+    audio_np = audio_tensor.numpy()
+    sf.write("output.wav", audio_np, samplerate=22050)
+    return "output.wav"
+# Gradio UI
 gr.Interface(
+    fn=tts,
+    inputs=gr.Textbox(label="Enter text", placeholder="Type something..."),
+    outputs=gr.Audio(label="Generated Speech"),
+    title="Glow-TTS + HiFi-GAN TTS",
+    description="Enter text and listen to the generated speech using Glow-TTS and HiFi-GAN"
 ).launch()