nambn0321 committed on
Commit
8d7e20b
·
verified ·
1 Parent(s): 0515d75

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -56
app.py CHANGED
@@ -1,69 +1,42 @@
1
- import os
2
- import sys
3
  import torch
4
- import json
5
- import numpy as np
6
  import gradio as gr
7
- import soundfile as sf
8
- from huggingface_hub import snapshot_download
9
- from safetensors.torch import load_file as safe_load_file
10
 
11
- from TTS.utils.synthesizer import Synthesizer
 
 
 
12
 
13
- # Download model repo from Hugging Face
14
- model_dir = snapshot_download(repo_id="nambn0321/TTS_model")
 
 
15
 
16
- # Add model directory to path so we can import models.py
17
- sys.path.append(model_dir)
18
- from models import Generator # Now valid!
19
 
20
- # Load Glow-TTS synthesizer
21
- synthesizer = Synthesizer(
22
- tts_checkpoint=os.path.join(model_dir, "best_model.pth"),
23
- tts_config_path=os.path.join(model_dir, "config.json"),
24
- use_cuda=torch.cuda.is_available()
25
- )
26
 
27
- # Load HiFi-GAN generator with safetensors
28
- hifigan_config_path = os.path.join(model_dir, "config (2).json")
29
- hifigan_checkpoint_path = os.path.join(model_dir, "model.safetensors")
30
-
31
- with open(hifigan_config_path, "r") as f:
32
- hifigan_config = json.load(f)
33
-
34
- hifigan = Generator(hifigan_config)
35
-
36
- # Load safetensors weights safely
37
- state_dict = safe_load_file(hifigan_checkpoint_path, device="cpu")
38
-
39
- hifigan.load_state_dict(state_dict)
40
- hifigan.eval()
41
- if torch.cuda.is_available():
42
- hifigan.cuda()
43
-
44
- # Inference pipeline: text ➝ mel ➝ waveform
45
- def tts(text):
46
- # Generate mel spectrogram from text using Glow-TTS
47
- mel = synthesizer.tts(text, None, None, return_wav=False).squeeze().cpu().numpy()
48
 
49
- # Convert mel to tensor and add batch dim
50
- mel_tensor = torch.from_numpy(mel).unsqueeze(0)
51
- if torch.cuda.is_available():
52
- mel_tensor = mel_tensor.cuda()
53
 
54
- # Generate waveform audio from mel using HiFi-GAN vocoder
55
- with torch.no_grad():
56
- audio = hifigan(mel_tensor).cpu().squeeze().numpy()
57
 
58
- # Save to output file
59
- sf.write("output.wav", audio, 22050)
60
  return "output.wav"
61
 
62
  # Gradio interface
63
- gr.Interface(
64
- fn=tts,
65
- inputs=gr.Textbox(label="Enter Text"),
66
- outputs=gr.Audio(label="Generated Speech"),
67
- title="Glow-TTS + HiFi-GAN TTS",
68
- description="Type text to synthesize speech using Glow-TTS and HiFi-GAN."
69
- ).launch()
 
 
 
1
import torch
import gradio as gr
import torchaudio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load model and processor
# NOTE(review): "your-username/your-model-name" is a placeholder repo id —
# replace with the real fine-tuned SpeechT5 checkpoint before deploying.
processor = SpeechT5Processor.from_pretrained("your-username/your-model-name")
model = SpeechT5ForTextToSpeech.from_pretrained("your-username/your-model-name")
# HiFi-GAN vocoder turns the model's spectrogram output into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Move to CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
vocoder = vocoder.to(device)

# Dummy speaker embedding (or load your real one here)
# All-zeros (1, 512) vector yields a generic, flat voice; SpeechT5 expects a
# 512-dim x-vector style speaker embedding — load a real one for quality.
speaker_embedding = torch.zeros(1, 512).to(device)
 
18
 
19
def tts_generate(text):
    """Synthesize speech for *text* and write it to ``output.wav``.

    Args:
        text: Input text to synthesize.

    Returns:
        The path to the generated WAV file (``"output.wav"``), suitable for
        a Gradio ``Audio`` output with ``type="filepath"``.
    """
    # Preprocess input: tokenize and move tensors to the model's device.
    inputs = processor(text=text, return_tensors="pt").to(device)

    # Run both the spectrogram generation and the vocoder under no_grad:
    # this is pure inference, so building an autograd graph would only
    # waste memory (the original ran the vocoder with grad enabled).
    with torch.no_grad():
        # Generate mel spectrogram from text.
        mel = model.generate_speech(inputs["input_ids"], speaker_embedding)
        # Convert mel spectrogram to a waveform (1-D tensor of samples).
        waveform = vocoder(mel)

    # torchaudio.save expects a (channels, frames) tensor; the vocoder
    # output is 1-D, so add an explicit mono channel dimension.
    waveform = waveform.cpu().unsqueeze(0)

    # SpeechT5 operates at 16 kHz.
    torchaudio.save("output.wav", waveform, sample_rate=16000)

    return "output.wav"
35
 
36
  # Gradio interface
37
+ demo = gr.Interface(
38
+ fn=tts_generate,
39
+ inputs=gr.Textbox(label="Enter text"),
40
+ outputs=gr.Audio(label="Generated Speech", type="filepath"),
41
+ title="SpeechT5 Text-to-Speech",
42
+ description="Enter t