Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,10 +22,9 @@ generator = keras.models.load_model(model_path, compile=False)
|
|
| 22 |
|
| 23 |
# Función para convertir texto a audio
|
| 24 |
def text_to_audio(text):
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
default_audio = np.zeros(target_length, dtype=np.float32)
|
| 29 |
|
| 30 |
if not text or not text.strip():
|
| 31 |
return (sample_rate, default_audio)
|
|
@@ -35,11 +34,24 @@ def text_to_audio(text):
|
|
| 35 |
mel_output, _, _ = tacotron2.encode_text(text)
|
| 36 |
mel = mel_output.detach().cpu().numpy().astype(np.float32)
|
| 37 |
|
|
|
|
| 38 |
print(f"Forma original del mel: {mel.shape}")
|
| 39 |
|
| 40 |
# Reorganizar el mel para que coincida con la forma esperada (batch, 80, frames, 1)
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
print(f"Forma del mel preparado: {mel_input.shape}")
|
| 45 |
|
|
@@ -49,34 +61,20 @@ def text_to_audio(text):
|
|
| 49 |
# Procesar el audio generado
|
| 50 |
generated_audio = tf.squeeze(generated_audio).numpy()
|
| 51 |
|
| 52 |
-
print(f"Forma del audio generado: {generated_audio.shape}")
|
| 53 |
-
|
| 54 |
# Asegurarse de que hay valores no cero antes de normalizar
|
| 55 |
if np.max(np.abs(generated_audio)) > 0:
|
| 56 |
generated_audio = generated_audio / np.max(np.abs(generated_audio))
|
| 57 |
|
| 58 |
-
# RECORTAR O RELLENAR EL AUDIO A 2 SEGUNDOS (16000 muestras)
|
| 59 |
-
current_length = len(generated_audio)
|
| 60 |
-
|
| 61 |
-
if current_length > target_length:
|
| 62 |
-
# Recortar si es más largo de 2 segundos
|
| 63 |
-
print(f"Recortando audio de {current_length} a {target_length} muestras")
|
| 64 |
-
final_audio = generated_audio[:target_length]
|
| 65 |
-
else:
|
| 66 |
-
# Rellenar con ceros si es más corto de 2 segundos
|
| 67 |
-
print(f"Rellenando audio de {current_length} a {target_length} muestras")
|
| 68 |
-
final_audio = np.zeros(target_length, dtype=np.float32)
|
| 69 |
-
final_audio[:current_length] = generated_audio
|
| 70 |
-
|
| 71 |
# Convertir a float32 para gradio
|
| 72 |
-
|
| 73 |
|
| 74 |
-
print(f"Forma
|
| 75 |
|
| 76 |
-
return (sample_rate,
|
| 77 |
|
| 78 |
except Exception as e:
|
| 79 |
print(f"Error en la generación de audio: {e}")
|
|
|
|
| 80 |
import traceback
|
| 81 |
traceback.print_exc()
|
| 82 |
return (sample_rate, default_audio)
|
|
@@ -85,12 +83,12 @@ def text_to_audio(text):
|
|
| 85 |
interface = gr.Interface(
|
| 86 |
fn=text_to_audio,
|
| 87 |
inputs=gr.Textbox(lines=2, placeholder="Escribe algo (ej. 'Hello world')"),
|
| 88 |
-
outputs=gr.Audio(label="Audio generado
|
| 89 |
title="Demo de TTS con Tacotron2 + Generador",
|
| 90 |
-
description="Convierte texto en audio
|
| 91 |
examples=[["Hello"], ["Hi there"]]
|
| 92 |
)
|
| 93 |
|
| 94 |
# Lanzar aplicación
|
| 95 |
-
if
|
| 96 |
interface.launch(debug=True)
|
|
|
|
| 22 |
|
| 23 |
# Función para convertir texto a audio
|
| 24 |
def text_to_audio(text):
|
| 25 |
+
# Crear un array vacío por defecto en caso de error
|
| 26 |
+
default_audio = np.zeros(8000, dtype=np.float32)
|
| 27 |
+
sample_rate = 8000 # Ajusta según la configuración de tu modelo
|
|
|
|
| 28 |
|
| 29 |
if not text or not text.strip():
|
| 30 |
return (sample_rate, default_audio)
|
|
|
|
| 34 |
mel_output, _, _ = tacotron2.encode_text(text)
|
| 35 |
mel = mel_output.detach().cpu().numpy().astype(np.float32)
|
| 36 |
|
| 37 |
+
# Imprimir forma original del mel para debugging
|
| 38 |
print(f"Forma original del mel: {mel.shape}")
|
| 39 |
|
| 40 |
# Reorganizar el mel para que coincida con la forma esperada (batch, 80, frames, 1)
|
| 41 |
+
# Si mel tiene forma (80, frames) - lo más probable
|
| 42 |
+
if len(mel.shape) == 2:
|
| 43 |
+
mel_input = np.expand_dims(mel, axis=0) # (1, 80, frames)
|
| 44 |
+
mel_input = np.expand_dims(mel_input, axis=-1) # (1, 80, frames, 1)
|
| 45 |
+
# Si viene con otra forma, intentamos adaptarla
|
| 46 |
+
elif len(mel.shape) == 3 and mel.shape[0] == 1:
|
| 47 |
+
# Si es (1, 80, frames) o (1, frames, 80)
|
| 48 |
+
if mel.shape[1] == 80:
|
| 49 |
+
mel_input = np.expand_dims(mel, axis=-1) # (1, 80, frames, 1)
|
| 50 |
+
else:
|
| 51 |
+
mel_input = np.expand_dims(np.transpose(mel, (0, 2, 1)), axis=-1) # (1, 80, frames, 1)
|
| 52 |
+
else:
|
| 53 |
+
# Intento final de reorganización
|
| 54 |
+
mel_input = np.expand_dims(np.expand_dims(mel, axis=0), axis=-1)
|
| 55 |
|
| 56 |
print(f"Forma del mel preparado: {mel_input.shape}")
|
| 57 |
|
|
|
|
| 61 |
# Procesar el audio generado
|
| 62 |
generated_audio = tf.squeeze(generated_audio).numpy()
|
| 63 |
|
|
|
|
|
|
|
| 64 |
# Asegurarse de que hay valores no cero antes de normalizar
|
| 65 |
if np.max(np.abs(generated_audio)) > 0:
|
| 66 |
generated_audio = generated_audio / np.max(np.abs(generated_audio))
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# Convertir a float32 para gradio
|
| 69 |
+
generated_audio = generated_audio.astype(np.float32)
|
| 70 |
|
| 71 |
+
print(f"Forma del audio generado: {generated_audio.shape}")
|
| 72 |
|
| 73 |
+
return (sample_rate, generated_audio)
|
| 74 |
|
| 75 |
except Exception as e:
|
| 76 |
print(f"Error en la generación de audio: {e}")
|
| 77 |
+
# Si hay error, imprimir un traceback completo para mejor diagnóstico
|
| 78 |
import traceback
|
| 79 |
traceback.print_exc()
|
| 80 |
return (sample_rate, default_audio)
|
|
|
|
| 83 |
interface = gr.Interface(
|
| 84 |
fn=text_to_audio,
|
| 85 |
inputs=gr.Textbox(lines=2, placeholder="Escribe algo (ej. 'Hello world')"),
|
| 86 |
+
outputs=gr.Audio(label="Audio generado"),
|
| 87 |
title="Demo de TTS con Tacotron2 + Generador",
|
| 88 |
+
description="Convierte texto en audio usando Tacotron2 + modelo Generator entrenado.",
|
| 89 |
examples=[["Hello"], ["Hi there"]]
|
| 90 |
)
|
| 91 |
|
| 92 |
# Lanzar aplicación
|
| 93 |
+
if __name__ == "__main__":
|
| 94 |
interface.launch(debug=True)
|