Hugging Face Space — commit "Update app.py" (diff view of app.py, CHANGED)
|
@@ -5,6 +5,7 @@ import gradio as gr
|
|
| 5 |
from sambanova import SambaNova
|
| 6 |
from hume import HumeClient
|
| 7 |
from PIL import Image
|
|
|
|
| 8 |
|
| 9 |
# --- Configuración ---
|
| 10 |
SAMBA_API_KEY = os.getenv("SAMBA_API_KEY")
|
|
@@ -17,13 +18,16 @@ if not HUME_API_KEY:
|
|
| 17 |
|
| 18 |
SAMBA_BASE_URL = "https://api.sambanova.ai/v1"
|
| 19 |
VOICE_ID_DEFAULT = "085fdec7-b201-4a58-b65b-4d321f7abd85"
|
| 20 |
-
MODEL_NAME = "Llama-4-Maverick-17B-128E-Instruct"
|
| 21 |
|
| 22 |
# --- Inicializar clientes ---
|
| 23 |
samba = SambaNova(api_key=SAMBA_API_KEY, base_url=SAMBA_BASE_URL)
|
| 24 |
hume = HumeClient(api_key=HUME_API_KEY)
|
| 25 |
|
| 26 |
-
#
|
|
|
|
|
|
|
|
|
|
| 27 |
def imagen_pil_a_base64(pil_img):
|
| 28 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpeg") as tmp:
|
| 29 |
pil_img.save(tmp, format="JPEG")
|
|
@@ -32,9 +36,17 @@ def imagen_pil_a_base64(pil_img):
|
|
| 32 |
b64 = base64.b64encode(img_bytes).decode("utf-8")
|
| 33 |
return b64
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def generar_respuesta(texto, imagen_pil):
|
| 37 |
-
# Asegurar texto UTF-8 limpio
|
| 38 |
if not isinstance(texto, str):
|
| 39 |
texto = str(texto)
|
| 40 |
try:
|
|
@@ -42,15 +54,12 @@ def generar_respuesta(texto, imagen_pil):
|
|
| 42 |
except UnicodeError:
|
| 43 |
texto = texto.encode("ascii", errors="replace").decode("ascii")
|
| 44 |
|
| 45 |
-
# Construir contenido en formato multimodal
|
| 46 |
contenido = [{"type": "text", "text": texto}]
|
| 47 |
if imagen_pil is not None:
|
| 48 |
img_b64 = imagen_pil_a_base64(imagen_pil)
|
| 49 |
contenido.append({
|
| 50 |
"type": "image_url",
|
| 51 |
-
"image_url": {
|
| 52 |
-
"url": f"image/jpeg;base64,{img_b64}"
|
| 53 |
-
}
|
| 54 |
})
|
| 55 |
|
| 56 |
try:
|
|
@@ -62,11 +71,12 @@ def generar_respuesta(texto, imagen_pil):
|
|
| 62 |
)
|
| 63 |
return respuesta.choices[0].message.content
|
| 64 |
except Exception as e:
|
| 65 |
-
print("Error en SambaNova:",
|
| 66 |
-
return "Lo siento, no pude procesar la
|
| 67 |
|
| 68 |
-
# --- Texto a audio (Hume) ---
|
| 69 |
def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
|
|
|
|
|
|
|
| 70 |
try:
|
| 71 |
respuesta = hume.tts.speak(text=texto, voice=voz_id)
|
| 72 |
audio_bytes = respuesta.audio
|
|
@@ -80,59 +90,56 @@ def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
|
|
| 80 |
print("Error en Hume TTS:", e)
|
| 81 |
return None
|
| 82 |
|
| 83 |
-
|
| 84 |
-
def manejar_chat(texto, imagen_pil, historial, voz_id):
|
| 85 |
if historial is None:
|
| 86 |
historial = []
|
| 87 |
-
if not texto or not texto.strip():
|
| 88 |
-
return historial, None, "Por favor, escribe un mensaje."
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
historial.append({"role": "assistant", "content": respuesta})
|
| 93 |
|
| 94 |
-
|
| 95 |
-
return historial,
|
| 96 |
|
| 97 |
def limpiar_chat():
|
| 98 |
return [], None, ""
|
| 99 |
|
| 100 |
# --- Interfaz Gradio ---
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
salida_texto = gr.Textbox(label="Respuesta", show_copy_button=True)
|
| 121 |
-
salida_audio = gr.Audio(label="Audio", autoplay=True)
|
| 122 |
-
|
| 123 |
-
enviar_btn.click(
|
| 124 |
-
manejar_chat,
|
| 125 |
-
inputs=[texto_in, imagen_in, chat, voz_sel],
|
| 126 |
-
outputs=[chat, salida_audio, salida_texto]
|
| 127 |
)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from sambanova import SambaNova
|
| 6 |
from hume import HumeClient
|
| 7 |
from PIL import Image
|
| 8 |
+
import whisper
|
| 9 |
|
| 10 |
# --- Configuración ---
|
| 11 |
SAMBA_API_KEY = os.getenv("SAMBA_API_KEY")
|
|
|
|
| 18 |
|
| 19 |
# SambaNova OpenAI-compatible endpoint.
SAMBA_BASE_URL = "https://api.sambanova.ai/v1"
# Default Hume voice id — presumably a voice available to this account; TODO confirm.
VOICE_ID_DEFAULT = "085fdec7-b201-4a58-b65b-4d321f7abd85"
# Multimodal model used for all chat completions.
MODEL_NAME = "Llama-4-Maverick-17B-128E-Instruct"

# --- Initialize API clients (module-level singletons reused by all requests) ---
samba = SambaNova(api_key=SAMBA_API_KEY, base_url=SAMBA_BASE_URL)
hume = HumeClient(api_key=HUME_API_KEY)

# Load Whisper once at startup ("small" is fast and accurate enough on Spaces).
whisper_model = whisper.load_model("small")
|
| 29 |
+
|
| 30 |
+
# --- Funciones auxiliares ---
|
| 31 |
def imagen_pil_a_base64(pil_img):
|
| 32 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpeg") as tmp:
|
| 33 |
pil_img.save(tmp, format="JPEG")
|
|
|
|
| 36 |
b64 = base64.b64encode(img_bytes).decode("utf-8")
|
| 37 |
return b64
|
| 38 |
|
| 39 |
+
def audio_a_texto(audio_path):
    """Transcribe an audio file to Spanish text with the preloaded Whisper model.

    Args:
        audio_path: filesystem path to the recording, or a falsy value when
            nothing was recorded.

    Returns:
        The stripped transcription, or "" when there is no input or the
        transcription fails (the error is logged, never raised).
    """
    if not audio_path:
        return ""
    try:
        transcripcion = whisper_model.transcribe(audio_path, language="es")  # or "en" for English
        return transcripcion["text"].strip()
    except Exception as err:
        print("Error en Whisper STT:", err)
        return ""
|
| 48 |
+
|
| 49 |
def generar_respuesta(texto, imagen_pil):
|
|
|
|
| 50 |
if not isinstance(texto, str):
|
| 51 |
texto = str(texto)
|
| 52 |
try:
|
|
|
|
| 54 |
except UnicodeError:
|
| 55 |
texto = texto.encode("ascii", errors="replace").decode("ascii")
|
| 56 |
|
|
|
|
| 57 |
contenido = [{"type": "text", "text": texto}]
|
| 58 |
if imagen_pil is not None:
|
| 59 |
img_b64 = imagen_pil_a_base64(imagen_pil)
|
| 60 |
contenido.append({
|
| 61 |
"type": "image_url",
|
| 62 |
+
"image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
|
|
|
|
|
|
|
| 63 |
})
|
| 64 |
|
| 65 |
try:
|
|
|
|
| 71 |
)
|
| 72 |
return respuesta.choices[0].message.content
|
| 73 |
except Exception as e:
|
| 74 |
+
print("Error en SambaNova:", e)
|
| 75 |
+
return "Lo siento, no pude procesar la consulta. ¿Puedes repetirla?"
|
| 76 |
|
|
|
|
| 77 |
def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
|
| 78 |
+
if not texto or not texto.strip():
|
| 79 |
+
return None
|
| 80 |
try:
|
| 81 |
respuesta = hume.tts.speak(text=texto, voice=voz_id)
|
| 82 |
audio_bytes = respuesta.audio
|
|
|
|
| 90 |
print("Error en Hume TTS:", e)
|
| 91 |
return None
|
| 92 |
|
| 93 |
+
def manejar_chat(audio_path, imagen_pil, historial, voz_id):
    """Run one full voice-chat turn: speech-to-text, LLM reply, text-to-speech.

    Args:
        audio_path: path to the recorded user audio (may be None/empty).
        imagen_pil: optional PIL image to attach to the prompt.
        historial: chat history in messages format, or None on first turn.
        voz_id: Hume voice id for the synthesized reply.

    Returns:
        (updated history, reply audio or None, reply/notice text).
    """
    historial = [] if historial is None else historial

    dicho = audio_a_texto(audio_path)
    if not dicho:
        # Nothing intelligible was transcribed — ask the user to repeat.
        return historial, None, "No entendí lo que dijiste. ¿Puedes repetirlo?"

    historial.append({"role": "user", "content": dicho})
    contestacion = generar_respuesta(dicho, imagen_pil)
    historial.append({"role": "assistant", "content": contestacion})

    return historial, generar_audio(contestacion, voz_id), contestacion
|
| 107 |
|
| 108 |
def limpiar_chat():
    """Reset the UI state: empty chat history, cleared audio, blank textbox."""
    chat_vacio = []
    audio_vacio = None
    texto_vacio = ""
    return chat_vacio, audio_vacio, texto_vacio
|
| 110 |
|
| 111 |
# --- Interfaz Gradio ---
|
| 112 |
+
# --- Gradio UI: microphone + optional image in, chat/audio/text out ---
with gr.Blocks(title="Batuto AI Voz-a-Voz", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🗣️ Batuto AI: Habla conmigo (voz + imagen)")
    gr.Markdown("Presiona el micrófono, habla, y opcionalmente sube una imagen.")

    # Conversation display; type="messages" matches the dicts built in manejar_chat.
    chat = gr.Chatbot(label="Conversación", height=400, type="messages")

    with gr.Row():
        # filepath type feeds audio_a_texto, which expects a path on disk.
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Habla aquí")
        # PIL image is passed straight through to generar_respuesta.
        imagen_in = gr.Image(label="📸 Imagen opcional", type="pil")

    with gr.Row():
        # (label, value) pairs: label shown to the user, value is the Hume voice id.
        voz_sel = gr.Dropdown(
            label="Voz de respuesta",
            choices=[
                ("Femenina cálida", "085fdec7-b201-4a58-b65b-4d321f7abd85"),
                ("Masculina serena", "5c7d2e6a-5d3f-4b3a-8a3d-2e6a5d3f4b3a"),
                ("Neutra", "9a8b7c6d-5e4f-3a2b-1c0d-9e8f7a6b5c4d")
            ],
            value=VOICE_ID_DEFAULT
        )
        # NOTE(review): exact nesting of the buttons (inside this Row vs. Blocks
        # level) is not recoverable from the mangled source — confirm layout.
        enviar_btn = gr.Button("Enviar")
        limpiar_btn = gr.Button("Limpiar")

    salida_texto = gr.Textbox(label="Respuesta", interactive=False)
    salida_audio = gr.Audio(label="Audio", autoplay=True, interactive=False)

    # Wire the send button to the full STT -> LLM -> TTS turn.
    enviar_btn.click(
        manejar_chat,
        inputs=[audio_in, imagen_in, chat, voz_sel],
        outputs=[chat, salida_audio, salida_texto]
    )
    # Clear button resets chat, audio and textbox (limpiar_chat takes no inputs).
    limpiar_btn.click(limpiar_chat, outputs=[chat, salida_audio, salida_texto])

demo.launch()
|