Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,39 +18,77 @@ model_path = hf_hub_download(
|
|
| 18 |
repo_id="Bmo411/WGAN",
|
| 19 |
filename="generator_epoch_3500.keras"
|
| 20 |
)
|
| 21 |
-
|
| 22 |
generator = keras.models.load_model(model_path, compile=False)
|
| 23 |
|
| 24 |
# Función para convertir texto a audio
|
| 25 |
def text_to_audio(text):
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
# Crear interfaz en Gradio
|
| 46 |
interface = gr.Interface(
|
| 47 |
fn=text_to_audio,
|
| 48 |
-
inputs=gr.Textbox(lines=
|
| 49 |
-
outputs=gr.Audio(
|
| 50 |
title="Demo de TTS con Tacotron2 + Generador",
|
| 51 |
-
description="Convierte texto en audio usando Tacotron2 +
|
|
|
|
| 52 |
)
|
| 53 |
|
| 54 |
# Lanzar aplicación
|
| 55 |
if __name__ == "__main__":
|
| 56 |
-
interface.launch()
|
|
|
|
| 18 |
repo_id="Bmo411/WGAN",
|
| 19 |
filename="generator_epoch_3500.keras"
|
| 20 |
)
|
|
|
|
| 21 |
# Load the trained generator from the downloaded checkpoint file.
# compile=False skips compiling the model on load; text_to_audio only calls
# it with training=False, so no optimizer/loss setup is needed here.
generator = keras.models.load_model(model_path, compile=False)
|
| 22 |
|
| 23 |
# Función para convertir texto a audio
|
| 24 |
def _prepare_mel_input(mel):
    """Reshape a mel-spectrogram array to the (1, 80, frames, 1) generator layout."""
    if mel.ndim == 2:
        # Accept (frames, 80) as well as (80, frames), mirroring the 3-D branch
        # below, so a time-major mel is not fed to the generator sideways.
        if mel.shape[0] != 80 and mel.shape[1] == 80:
            mel = mel.T
        return np.expand_dims(np.expand_dims(mel, axis=0), axis=-1)

    if mel.ndim == 3 and mel.shape[0] == 1:
        # Either (1, 80, frames) or (1, frames, 80); move the 80 bins to axis 1.
        if mel.shape[1] != 80:
            mel = np.transpose(mel, (0, 2, 1))
        return np.expand_dims(mel, axis=-1)

    # Unrecognised layout: add batch and channel axes as a last resort and let
    # the generator reject it if the shape is still wrong.
    return np.expand_dims(np.expand_dims(mel, axis=0), axis=-1)


def _peak_normalize(audio):
    """Scale *audio* so its largest absolute sample is 1 and return it as float32."""
    peak = np.max(np.abs(audio))
    # An all-zero signal is left untouched to avoid dividing by zero.
    if peak > 0:
        audio = audio / peak
    return audio.astype(np.float32)


def text_to_audio(text):
    """Convert text to audio for the Gradio interface.

    The text is encoded to a mel-spectrogram with the module-level
    ``tacotron2`` object, reshaped to (1, 80, frames, 1), passed through the
    module-level ``generator`` model, and peak-normalised.

    Args:
        text: Text to synthesise. ``None``, empty, or whitespace-only input
            yields the silent fallback clip.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a float32
        NumPy array. If the input is blank or any step raises, ``samples``
        is 8000 zeros, so the UI always receives playable audio.
    """
    sample_rate = 22050  # Adjust to match the model's configuration
    # Silent clip returned for blank input or on failure.
    default_audio = np.zeros(8000, dtype=np.float32)

    if not text or not text.strip():
        return (sample_rate, default_audio)

    try:
        # Text -> mel-spectrogram with Tacotron2
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)
        print(f"Forma original del mel: {mel.shape}")

        mel_input = _prepare_mel_input(mel)
        print(f"Forma del mel preparado: {mel_input.shape}")

        # Mel-spectrogram -> waveform with the trained generator
        generated_audio = generator(mel_input, training=False)
        generated_audio = _peak_normalize(tf.squeeze(generated_audio).numpy())
        print(f"Forma del audio generado: {generated_audio.shape}")

        return (sample_rate, generated_audio)

    except Exception as e:
        # Boundary handler for the UI callback: log the failure with a full
        # traceback and fall back to silence instead of raising.
        print(f"Error en la generación de audio: {e}")
        import traceback
        traceback.print_exc()
        return (sample_rate, default_audio)
|
| 81 |
|
| 82 |
# Crear interfaz en Gradio
|
| 83 |
# Build the input and output widgets first, then wire them to text_to_audio.
_text_input = gr.Textbox(lines=2, placeholder="Escribe algo (ej. 'Hello world')")
_audio_output = gr.Audio(label="Audio generado")

interface = gr.Interface(
    fn=text_to_audio,
    inputs=_text_input,
    outputs=_audio_output,
    title="Demo de TTS con Tacotron2 + Generador",
    description="Convierte texto en audio usando Tacotron2 + modelo Generator entrenado.",
    examples=[["Hello"], ["Hi there"]],
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch(debug=True)
|