Hugging Face Space — commit "Update app.py" (diff view of app.py, CHANGED)
|
@@ -5,6 +5,7 @@ import gradio as gr
|
|
| 5 |
from sambanova import SambaNova
|
| 6 |
from hume import HumeClient
|
| 7 |
from PIL import Image
|
|
|
|
| 8 |
|
| 9 |
# --- Configuración ---
|
| 10 |
SAMBA_API_KEY = os.getenv("SAMBA_API_KEY")
|
|
@@ -17,13 +18,16 @@ if not HUME_API_KEY:
|
|
| 17 |
|
| 18 |
SAMBA_BASE_URL = "https://api.sambanova.ai/v1"
|
| 19 |
VOICE_ID_DEFAULT = "085fdec7-b201-4a58-b65b-4d321f7abd85"
|
| 20 |
-
MODEL_NAME = "Llama-4-Maverick-17B-128E-Instruct"
|
| 21 |
|
| 22 |
# --- Inicializar clientes ---
|
| 23 |
samba = SambaNova(api_key=SAMBA_API_KEY, base_url=SAMBA_BASE_URL)
|
| 24 |
hume = HumeClient(api_key=HUME_API_KEY)
|
| 25 |
|
| 26 |
-
#
|
|
|
|
|
|
|
|
|
|
| 27 |
def imagen_pil_a_base64(pil_img):
|
| 28 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpeg") as tmp:
|
| 29 |
pil_img.save(tmp, format="JPEG")
|
|
@@ -32,9 +36,17 @@ def imagen_pil_a_base64(pil_img):
|
|
| 32 |
b64 = base64.b64encode(img_bytes).decode("utf-8")
|
| 33 |
return b64
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def generar_respuesta(texto, imagen_pil):
|
| 37 |
-
# Asegurar texto UTF-8 limpio
|
| 38 |
if not isinstance(texto, str):
|
| 39 |
texto = str(texto)
|
| 40 |
try:
|
|
@@ -42,15 +54,12 @@ def generar_respuesta(texto, imagen_pil):
|
|
| 42 |
except UnicodeError:
|
| 43 |
texto = texto.encode("ascii", errors="replace").decode("ascii")
|
| 44 |
|
| 45 |
-
# Construir contenido en formato multimodal
|
| 46 |
contenido = [{"type": "text", "text": texto}]
|
| 47 |
if imagen_pil is not None:
|
| 48 |
img_b64 = imagen_pil_a_base64(imagen_pil)
|
| 49 |
contenido.append({
|
| 50 |
"type": "image_url",
|
| 51 |
-
"image_url": {
|
| 52 |
-
"url": f"image/jpeg;base64,{img_b64}"
|
| 53 |
-
}
|
| 54 |
})
|
| 55 |
|
| 56 |
try:
|
|
@@ -62,11 +71,12 @@ def generar_respuesta(texto, imagen_pil):
|
|
| 62 |
)
|
| 63 |
return respuesta.choices[0].message.content
|
| 64 |
except Exception as e:
|
| 65 |
-
print("Error en SambaNova:",
|
| 66 |
-
return "Lo siento, no pude procesar la
|
| 67 |
|
| 68 |
-
# --- Texto a audio (Hume) ---
|
| 69 |
def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
|
|
|
|
|
|
|
| 70 |
try:
|
| 71 |
respuesta = hume.tts.speak(text=texto, voice=voz_id)
|
| 72 |
audio_bytes = respuesta.audio
|
|
@@ -80,59 +90,56 @@ def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
|
|
| 80 |
print("Error en Hume TTS:", e)
|
| 81 |
return None
|
| 82 |
|
| 83 |
-
|
| 84 |
-
def manejar_chat(texto, imagen_pil, historial, voz_id):
|
| 85 |
if historial is None:
|
| 86 |
historial = []
|
| 87 |
-
if not texto or not texto.strip():
|
| 88 |
-
return historial, None, "Por favor, escribe un mensaje."
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
historial.append({"role": "assistant", "content": respuesta})
|
| 93 |
|
| 94 |
-
|
| 95 |
-
return historial,
|
| 96 |
|
| 97 |
def limpiar_chat():
|
| 98 |
return [], None, ""
|
| 99 |
|
| 100 |
# --- Interfaz Gradio ---
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
salida_texto = gr.Textbox(label="Respuesta", show_copy_button=True)
|
| 121 |
-
salida_audio = gr.Audio(label="Audio", autoplay=True)
|
| 122 |
-
|
| 123 |
-
enviar_btn.click(
|
| 124 |
-
manejar_chat,
|
| 125 |
-
inputs=[texto_in, imagen_in, chat, voz_sel],
|
| 126 |
-
outputs=[chat, salida_audio, salida_texto]
|
| 127 |
)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from sambanova import SambaNova
|
| 6 |
from hume import HumeClient
|
| 7 |
from PIL import Image
|
| 8 |
+
import whisper
|
| 9 |
|
| 10 |
# --- Configuración ---
|
| 11 |
SAMBA_API_KEY = os.getenv("SAMBA_API_KEY")
|
|
|
|
| 18 |
|
| 19 |
# SambaNova OpenAI-compatible endpoint.
SAMBA_BASE_URL = "https://api.sambanova.ai/v1"
# Default Hume voice id — presumably a voice available to this account; TODO confirm.
VOICE_ID_DEFAULT = "085fdec7-b201-4a58-b65b-4d321f7abd85"
# Multimodal model used for all chat completions.
MODEL_NAME = "Llama-4-Maverick-17B-128E-Instruct"

# --- Initialize API clients (module-level singletons reused by all requests) ---
samba = SambaNova(api_key=SAMBA_API_KEY, base_url=SAMBA_BASE_URL)
hume = HumeClient(api_key=HUME_API_KEY)

# Load Whisper once at startup ("small" is fast and accurate enough on Spaces).
whisper_model = whisper.load_model("small")
|
| 29 |
+
|
| 30 |
+
# --- Funciones auxiliares ---
|
| 31 |
def imagen_pil_a_base64(pil_img):
|
| 32 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpeg") as tmp:
|
| 33 |
pil_img.save(tmp, format="JPEG")
|
|
|
|
| 36 |
b64 = base64.b64encode(img_bytes).decode("utf-8")
|
| 37 |
return b64
|
| 38 |
|
| 39 |
+
def audio_a_texto(audio_path):
    """Transcribe an audio file to Spanish text with the preloaded Whisper model.

    Args:
        audio_path: filesystem path to the recording, or a falsy value when
            nothing was recorded.

    Returns:
        The stripped transcription, or "" when there is no input or the
        transcription fails (the error is logged, never raised).
    """
    if not audio_path:
        return ""
    try:
        transcripcion = whisper_model.transcribe(audio_path, language="es")  # or "en" for English
        return transcripcion["text"].strip()
    except Exception as err:
        print("Error en Whisper STT:", err)
        return ""
|
| 48 |
+
|
| 49 |
def generar_respuesta(texto, imagen_pil):
|
|
|
|
| 50 |
if not isinstance(texto, str):
|
| 51 |
texto = str(texto)
|
| 52 |
try:
|
|
|
|
| 54 |
except UnicodeError:
|
| 55 |
texto = texto.encode("ascii", errors="replace").decode("ascii")
|
| 56 |
|
|
|
|
| 57 |
contenido = [{"type": "text", "text": texto}]
|
| 58 |
if imagen_pil is not None:
|
| 59 |
img_b64 = imagen_pil_a_base64(imagen_pil)
|
| 60 |
contenido.append({
|
| 61 |
"type": "image_url",
|
| 62 |
+
"image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
|
|
|
|
|
|
|
| 63 |
})
|
| 64 |
|
| 65 |
try:
|
|
|
|
| 71 |
)
|
| 72 |
return respuesta.choices[0].message.content
|
| 73 |
except Exception as e:
|
| 74 |
+
print("Error en SambaNova:", e)
|
| 75 |
+
return "Lo siento, no pude procesar la consulta. ¿Puedes repetirla?"
|
| 76 |
|
|
|
|
| 77 |
def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
|
| 78 |
+
if not texto or not texto.strip():
|
| 79 |
+
return None
|
| 80 |
try:
|
| 81 |
respuesta = hume.tts.speak(text=texto, voice=voz_id)
|
| 82 |
audio_bytes = respuesta.audio
|
|
|
|
| 90 |
print("Error en Hume TTS:", e)
|
| 91 |
return None
|
| 92 |
|
| 93 |
+
def manejar_chat(audio_path, imagen_pil, historial, voz_id):
    """Run one full voice-chat turn: speech-to-text, LLM reply, text-to-speech.

    Args:
        audio_path: path to the recorded user audio (may be None/empty).
        imagen_pil: optional PIL image to attach to the prompt.
        historial: chat history in messages format, or None on first turn.
        voz_id: Hume voice id for the synthesized reply.

    Returns:
        (updated history, reply audio or None, reply/notice text).
    """
    historial = [] if historial is None else historial

    dicho = audio_a_texto(audio_path)
    if not dicho:
        # Nothing intelligible was transcribed — ask the user to repeat.
        return historial, None, "No entendí lo que dijiste. ¿Puedes repetirlo?"

    historial.append({"role": "user", "content": dicho})
    contestacion = generar_respuesta(dicho, imagen_pil)
    historial.append({"role": "assistant", "content": contestacion})

    return historial, generar_audio(contestacion, voz_id), contestacion
|
| 107 |
|
| 108 |
def limpiar_chat():
    """Reset the UI state: empty chat history, cleared audio, blank textbox."""
    chat_vacio = []
    audio_vacio = None
    texto_vacio = ""
    return chat_vacio, audio_vacio, texto_vacio
|
| 110 |
|
| 111 |
# --- Interfaz Gradio ---
|
| 112 |
+
# --- Gradio UI: microphone + optional image in, chat/audio/text out ---
with gr.Blocks(title="Batuto AI Voz-a-Voz", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🗣️ Batuto AI: Habla conmigo (voz + imagen)")
    gr.Markdown("Presiona el micrófono, habla, y opcionalmente sube una imagen.")

    # Conversation display; type="messages" matches the dicts built in manejar_chat.
    chat = gr.Chatbot(label="Conversación", height=400, type="messages")

    with gr.Row():
        # filepath type feeds audio_a_texto, which expects a path on disk.
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Habla aquí")
        # PIL image is passed straight through to generar_respuesta.
        imagen_in = gr.Image(label="📸 Imagen opcional", type="pil")

    with gr.Row():
        # (label, value) pairs: label shown to the user, value is the Hume voice id.
        voz_sel = gr.Dropdown(
            label="Voz de respuesta",
            choices=[
                ("Femenina cálida", "085fdec7-b201-4a58-b65b-4d321f7abd85"),
                ("Masculina serena", "5c7d2e6a-5d3f-4b3a-8a3d-2e6a5d3f4b3a"),
                ("Neutra", "9a8b7c6d-5e4f-3a2b-1c0d-9e8f7a6b5c4d")
            ],
            value=VOICE_ID_DEFAULT
        )
        # NOTE(review): exact nesting of the buttons (inside this Row vs. Blocks
        # level) is not recoverable from the mangled source — confirm layout.
        enviar_btn = gr.Button("Enviar")
        limpiar_btn = gr.Button("Limpiar")

    salida_texto = gr.Textbox(label="Respuesta", interactive=False)
    salida_audio = gr.Audio(label="Audio", autoplay=True, interactive=False)

    # Wire the send button to the full STT -> LLM -> TTS turn.
    enviar_btn.click(
        manejar_chat,
        inputs=[audio_in, imagen_in, chat, voz_sel],
        outputs=[chat, salida_audio, salida_texto]
    )
    # Clear button resets chat, audio and textbox (limpiar_chat takes no inputs).
    limpiar_btn.click(limpiar_chat, outputs=[chat, salida_audio, salida_texto])

demo.launch()
|