artificialguybr committed on
Commit
d422fc0
·
verified ·
1 Parent(s): d733ada

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -6
app.py CHANGED
@@ -4,6 +4,7 @@ import subprocess
4
  import traceback
5
  import gradio as gr
6
  import numpy as np
 
7
  import spaces
8
  import torch
9
  from huggingface_hub import snapshot_download
@@ -22,9 +23,7 @@ sys.path.insert(0, os.getcwd())
22
  from fish_speech.models.text2semantic.inference import (
23
  init_model,
24
  generate_long,
25
- load_codec_model,
26
- decode_to_audio,
27
- encode_audio
28
  )
29
 
30
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -54,6 +53,24 @@ codec_model = load_codec_model(codec_checkpoint, device=device, precision=precis
54
 
55
  print("✅ Todos os modelos carregados com sucesso!")
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  @spaces.GPU(duration=120)
58
  def tts_inference(
59
  text,
@@ -69,7 +86,7 @@ def tts_inference(
69
  prompt_tokens_list = None
70
 
71
  if ref_audio is not None and ref_text:
72
- prompt_tokens_list = [encode_audio(ref_audio, codec_model, device).cpu()]
73
 
74
  generator = generate_long(
75
  model=llama_model,
@@ -99,8 +116,8 @@ def tts_inference(
99
  if not codes:
100
  raise gr.Error("Nenhum áudio foi gerado. Verifique o seu texto de entrada.")
101
 
102
- merged_codes = torch.cat(codes, dim=1)
103
- audio_waveform = decode_to_audio(merged_codes.to(device), codec_model)
104
  audio_np = audio_waveform.cpu().float().numpy()
105
 
106
  return (codec_model.sample_rate, audio_np)
 
4
  import traceback
5
  import gradio as gr
6
  import numpy as np
7
+ import librosa
8
  import spaces
9
  import torch
10
  from huggingface_hub import snapshot_download
 
23
  from fish_speech.models.text2semantic.inference import (
24
  init_model,
25
  generate_long,
26
+ load_codec_model
 
 
27
  )
28
 
29
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
53
 
54
  print("✅ Todos os modelos carregados com sucesso!")
55
 
56
+
57
@torch.no_grad()
def custom_encode_audio(audio_path, codec, device):
    """Encode a reference audio file into codec token indices.

    Loads the file mono at the codec's native sample rate, runs the
    codec encoder, and trims padding via the reported feature length.
    Returns a [codebooks, frames] tensor of token indices.
    """
    samples, _ = librosa.load(audio_path, sr=codec.sample_rate, mono=True)
    waveform = torch.from_numpy(samples).to(device)

    # Match the codec's parameter dtype (e.g. half precision) before encoding.
    target_dtype = next(codec.parameters()).dtype
    batch = waveform.view(1, 1, -1).to(dtype=target_dtype)  # [1, 1, samples]
    lengths = torch.tensor([waveform.shape[0]], device=device, dtype=torch.long)

    token_indices, token_lengths = codec.encode(batch, lengths)
    # Drop the batch dim; keep only the valid (unpadded) frames.
    return token_indices[0, :, : token_lengths[0]]
68
+
69
@torch.no_grad()
def custom_decode_audio(codes, codec):
    """Decode a [codebooks, frames] token tensor back into a waveform.

    Adds a batch dimension for the codec, then strips the batch and
    channel dimensions from the result, returning a 1-D audio tensor.
    """
    batched = codes[None]  # -> [1, codebooks, frames]
    waveform = codec.from_indices(batched)
    return waveform[0, 0]
73
+
74
  @spaces.GPU(duration=120)
75
  def tts_inference(
76
  text,
 
86
  prompt_tokens_list = None
87
 
88
  if ref_audio is not None and ref_text:
89
+ prompt_tokens_list = [custom_encode_audio(ref_audio, codec_model, device).cpu()]
90
 
91
  generator = generate_long(
92
  model=llama_model,
 
116
  if not codes:
117
  raise gr.Error("Nenhum áudio foi gerado. Verifique o seu texto de entrada.")
118
 
119
+ merged_codes = torch.cat(codes, dim=1).to(device)
120
+ audio_waveform = custom_decode_audio(merged_codes, codec_model)
121
  audio_np = audio_waveform.cpu().float().numpy()
122
 
123
  return (codec_model.sample_rate, audio_np)