Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import subprocess
|
|
| 4 |
import traceback
|
| 5 |
import gradio as gr
|
| 6 |
import numpy as np
|
|
|
|
| 7 |
import spaces
|
| 8 |
import torch
|
| 9 |
from huggingface_hub import snapshot_download
|
|
@@ -22,9 +23,7 @@ sys.path.insert(0, os.getcwd())
|
|
| 22 |
from fish_speech.models.text2semantic.inference import (
|
| 23 |
init_model,
|
| 24 |
generate_long,
|
| 25 |
-
load_codec_model
|
| 26 |
-
decode_to_audio,
|
| 27 |
-
encode_audio
|
| 28 |
)
|
| 29 |
|
| 30 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
@@ -54,6 +53,24 @@ codec_model = load_codec_model(codec_checkpoint, device=device, precision=precis
|
|
| 54 |
|
| 55 |
print("✅ Todos os modelos carregados com sucesso!")
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
@spaces.GPU(duration=120)
|
| 58 |
def tts_inference(
|
| 59 |
text,
|
|
@@ -69,7 +86,7 @@ def tts_inference(
|
|
| 69 |
prompt_tokens_list = None
|
| 70 |
|
| 71 |
if ref_audio is not None and ref_text:
|
| 72 |
-
prompt_tokens_list = [
|
| 73 |
|
| 74 |
generator = generate_long(
|
| 75 |
model=llama_model,
|
|
@@ -99,8 +116,8 @@ def tts_inference(
|
|
| 99 |
if not codes:
|
| 100 |
raise gr.Error("Nenhum áudio foi gerado. Verifique o seu texto de entrada.")
|
| 101 |
|
| 102 |
-
merged_codes = torch.cat(codes, dim=1)
|
| 103 |
-
audio_waveform =
|
| 104 |
audio_np = audio_waveform.cpu().float().numpy()
|
| 105 |
|
| 106 |
return (codec_model.sample_rate, audio_np)
|
|
|
|
| 4 |
import traceback
|
| 5 |
import gradio as gr
|
| 6 |
import numpy as np
|
| 7 |
+
import librosa
|
| 8 |
import spaces
|
| 9 |
import torch
|
| 10 |
from huggingface_hub import snapshot_download
|
|
|
|
| 23 |
from fish_speech.models.text2semantic.inference import (
|
| 24 |
init_model,
|
| 25 |
generate_long,
|
| 26 |
+
load_codec_model
|
|
|
|
|
|
|
| 27 |
)
|
| 28 |
|
| 29 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 53 |
|
| 54 |
print("✅ Todos os modelos carregados com sucesso!")
|
| 55 |
|
| 56 |
+
|
| 57 |
@torch.no_grad()
def custom_encode_audio(audio_path, codec, device):
    """Encode a reference audio file into codec token indices.

    Loads the file as mono at the codec's native sample rate, runs it
    through ``codec.encode``, and trims the output to the valid number
    of feature frames.

    Args:
        audio_path: Path to the reference audio file on disk.
        codec: Codec model exposing ``sample_rate``, ``parameters()`` and
            ``encode(audios, audio_lengths)``.
        device: Torch device the waveform is moved to before encoding.

    Returns:
        A 2-D tensor of indices of shape (codebooks, frames) with the
        batch dimension and any padded frames removed.
    """
    # librosa resamples to the codec's rate and returns float32 numpy.
    samples, _sr = librosa.load(audio_path, sr=codec.sample_rate, mono=True)
    waveform = torch.from_numpy(samples).to(device)

    # Cast to the model's parameter dtype and add (batch, channel) dims
    # so the shape is (1, 1, num_samples) as the codec expects.
    target_dtype = next(codec.parameters()).dtype
    batched = waveform[None, None, :].to(dtype=target_dtype)
    lengths = torch.tensor([waveform.shape[0]], device=device, dtype=torch.long)

    indices, feature_lengths = codec.encode(batched, lengths)
    # Drop the batch dim and slice off frames beyond the valid length.
    return indices[0, :, : feature_lengths[0]]
| 69 |
@torch.no_grad()
def custom_decode_audio(codes, codec):
    """Decode codec token indices back into a 1-D audio waveform.

    Args:
        codes: 2-D tensor of indices (codebooks, frames), as produced by
            the encode path.
        codec: Codec model exposing ``from_indices``.

    Returns:
        A 1-D waveform tensor (batch and channel dims stripped).
    """
    # The codec expects a batch dimension; add it, decode, then take the
    # first batch item's first channel.
    batched = codes[None]
    decoded = codec.from_indices(batched)
    return decoded[0, 0]
| 74 |
@spaces.GPU(duration=120)
|
| 75 |
def tts_inference(
|
| 76 |
text,
|
|
|
|
| 86 |
prompt_tokens_list = None
|
| 87 |
|
| 88 |
if ref_audio is not None and ref_text:
|
| 89 |
+
prompt_tokens_list = [custom_encode_audio(ref_audio, codec_model, device).cpu()]
|
| 90 |
|
| 91 |
generator = generate_long(
|
| 92 |
model=llama_model,
|
|
|
|
| 116 |
if not codes:
|
| 117 |
raise gr.Error("Nenhum áudio foi gerado. Verifique o seu texto de entrada.")
|
| 118 |
|
| 119 |
+
merged_codes = torch.cat(codes, dim=1).to(device)
|
| 120 |
+
audio_waveform = custom_decode_audio(merged_codes, codec_model)
|
| 121 |
audio_np = audio_waveform.cpu().float().numpy()
|
| 122 |
|
| 123 |
return (codec_model.sample_rate, audio_np)
|