# Spaces:
#
# BATUTO-ART
# /
#
# Maverick_batuto_pro
#
# Sleeping
#
# File size: 6,727 Bytes

import base64
import io
import os
import tempfile

import gradio as gr
import requests
import whisper
from hume import HumeClient
from PIL import Image

# --- Configuration ---
# API credentials are read from the environment; both are required.
SAMBA_API_KEY = os.getenv("SAMBA_API_KEY")
HUME_API_KEY = os.getenv("HUME_API_KEY")

# Fail at import time when a credential is missing or empty.
if not SAMBA_API_KEY:
    raise ValueError("Falta la variable de entorno: SAMBA_API_KEY")
if not HUME_API_KEY:
    raise ValueError("Falta la variable de entorno: HUME_API_KEY")

# Voice id used by generar_audio when the caller does not pass one; it is also
# the initial value of the voice dropdown.
VOICE_ID_DEFAULT = "085fdec7-b201-4a58-b65b-4d321f7abd85"
# Model name and endpoint used by generar_respuesta for its chat request.
MODEL_NAME = "Llama-4-Maverick-17B-128E-Instruct"
SAMBA_API_URL = "https://api.sambanova.ai/v1/chat/completions"

# --- Initialize clients ---
hume = HumeClient(api_key=HUME_API_KEY)
# Loaded once at import time and reused by audio_a_texto for every request.
whisper_model = whisper.load_model("base")

# --- Helper functions ---
def imagen_pil_a_base64(pil_img):
    """Encode a PIL image as a base64 JPEG string.

    The image is serialized into an in-memory buffer, so no temporary file is
    left behind on disk. Images whose mode is neither ``RGB`` nor ``L``
    (for example ``RGBA`` or palette images, typical of PNG uploads) are
    converted to ``RGB`` first, because JPEG has no alpha channel and the
    encoder would otherwise reject them.

    Args:
        pil_img: Image object exposing ``mode``, ``convert`` and ``save``.

    Returns:
        The base64 text of the JPEG-encoded image.
    """
    if pil_img.mode not in ("RGB", "L"):
        pil_img = pil_img.convert("RGB")

    buffer = io.BytesIO()
    pil_img.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

def audio_a_texto(audio_path):
    """Transcribe an audio file to text with the module-level Whisper model.

    The transcription is requested in Spanish (``language="es"``).

    Args:
        audio_path: Path of the audio file; a falsy value means "no audio".

    Returns:
        The stripped transcript, or ``""`` when no path was given or the
        transcription raised (the error is printed, not propagated).
    """
    if audio_path:
        try:
            transcripcion = whisper_model.transcribe(audio_path, language="es")
            texto = transcripcion["text"]
            return texto.strip()
        except Exception as error:
            # Best-effort: the caller treats an empty transcript as "not understood".
            print("Error en Whisper STT:", error)
    return ""

def generar_respuesta(texto, imagen_pil):
    """Ask the SambaNova chat-completions endpoint for a reply.

    Args:
        texto: User message. When it is empty or blank and an image is given,
            a fixed "describe this image" prompt is sent instead.
        imagen_pil: Optional PIL image, attached as a base64 JPEG data URL.

    Returns:
        The stripped reply text; a fixed hint when neither text nor image was
        provided; or a fixed apology when the request fails for any reason
        (the error is printed).
    """
    hay_imagen = imagen_pil is not None
    sin_texto = not (texto and texto.strip())

    # Guard clause: nothing to send at all.
    if sin_texto and not hay_imagen:
        return "Por favor, escribe un mensaje o envía una imagen."

    if sin_texto:
        mensaje = "Describe this image in English in one short paragraph. Only output the description, no other text."
    else:
        mensaje = texto
    contenido = [{"type": "text", "text": mensaje}]

    if hay_imagen:
        imagen_codificada = imagen_pil_a_base64(imagen_pil)
        parte_imagen = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{imagen_codificada}"},
        }
        contenido.append(parte_imagen)

    cabeceras = {
        "Authorization": f"Bearer {SAMBA_API_KEY}",
        "Content-Type": "application/json",
    }

    # Lower temperature for image descriptions (image present and the text is
    # empty or mentions "describe"); 0.3 for everything else.
    if hay_imagen and (not texto or "describe" in texto.lower()):
        temperatura = 0.1
    else:
        temperatura = 0.3

    cuerpo = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": contenido}],
        "temperature": temperatura,
        "top_p": 0.9,
    }

    try:
        respuesta_http = requests.post(
            SAMBA_API_URL, headers=cabeceras, json=cuerpo, timeout=120
        )
        respuesta_http.raise_for_status()
        datos = respuesta_http.json()
        return datos["choices"][0]["message"]["content"].strip()
    except Exception as error:
        # Any network, HTTP or response-shape problem becomes a friendly message.
        print("Error en SambaNova API:", error)
        return "Lo siento, no pude procesar la solicitud."

def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
    """Synthesize speech for ``texto`` and store it in a temporary WAV file.

    Args:
        texto: Text to speak; empty or blank text produces no audio.
        voz_id: Voice identifier passed to the Hume client.

    Returns:
        Path of the written ``.wav`` file, or ``None`` when there is nothing
        to speak, the service returned no audio, or any error occurred (the
        error is printed, not propagated).
    """
    if not texto or not texto.strip():
        return None
    try:
        # NOTE(review): confirm that `tts.speak(...)` and the `.audio`
        # attribute match the installed Hume SDK version.
        respuesta = hume.tts.speak(text=texto, voice=voz_id)
        audio_bytes = respuesta.audio
        if not audio_bytes:
            return None
        # delete=False keeps the file on disk after closing so the returned
        # path stays valid; the context manager guarantees the handle is
        # closed even if the write fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(audio_bytes)
        return tmp.name
    except Exception as e:
        print("Error en Hume TTS:", e)
        return None

def manejar_entrada(texto_input, audio_input, imagen_pil, historial, voz_id):
    """Handle one user turn: resolve the input, get a reply and voice it.

    Typed text takes precedence over recorded audio; audio is transcribed
    only when no text was typed.

    Args:
        texto_input: Text typed by the user (may be empty or ``None``).
        audio_input: Path of the recorded audio (may be empty or ``None``).
        imagen_pil: Optional PIL image attached to the turn.
        historial: Previous chat messages as ``{"role", "content"}`` dicts,
            or ``None``.
        voz_id: Voice identifier used for the spoken reply.

    Returns:
        A ``(historial, audio_path, respuesta)`` tuple. ``historial`` is a new
        list; the list passed in is not modified.
    """
    # Work on a copy so the caller's list is never mutated in place.
    historial = list(historial) if historial else []

    if texto_input and texto_input.strip():
        texto_usuario = texto_input.strip()
    elif audio_input:
        texto_usuario = audio_a_texto(audio_input)
        if not texto_usuario:
            return historial, None, "No entendí lo que dijiste. ¿Puedes repetirlo?"
    else:
        texto_usuario = ""

    respuesta = generar_respuesta(texto_usuario, imagen_pil)

    if not texto_usuario and imagen_pil is None:
        # Nothing was sent at all: show the hint without recording a turn that
        # would be wrongly labelled as an image upload.
        return historial, None, respuesta

    historial.append({"role": "user", "content": texto_usuario or "(envió una imagen)"})
    historial.append({"role": "assistant", "content": respuesta})

    audio_respuesta = generar_audio(respuesta, voz_id)
    return historial, audio_respuesta, respuesta

def limpiar_chat():
    """Return reset values for the chat history, audio output and reply text."""
    historial_vacio = []
    sin_audio = None
    sin_texto = ""
    return historial_vacio, sin_audio, sin_texto

# --- Gradio interface ---
# JavaScript that adds a "copy" button to each bot message in the chat.
# It is passed to gr.Blocks through `js=` (run once on page load) because,
# per the Gradio docs, gr.HTML renders static HTML only and a <script> placed
# there is not executed.
# NOTE(review): the '.message.bot' selector depends on Gradio's internal DOM
# classes — confirm it against the installed Gradio version.
COPY_BUTTON_JS = """
() => {
  const observer = new MutationObserver(() => {
    document.querySelectorAll('.message.bot').forEach(msg => {
      const parrafo = msg.querySelector('p');
      // Skip messages without a paragraph or that already have the button.
      if (!parrafo || msg.querySelector('.copy-btn')) {
        return;
      }
      const button = document.createElement('button');
      button.textContent = '📋 Copiar';
      button.className = 'copy-btn';
      button.style.cssText = 'float:right; margin-left:10px; cursor:pointer; background:#eee; border:none; border-radius:4px; padding:2px 6px;';
      button.onclick = () => {
        // Copy the message text only, without the button's own label.
        const copia = parrafo.cloneNode(true);
        copia.querySelectorAll('.copy-btn').forEach(b => b.remove());
        navigator.clipboard.writeText(copia.textContent);
        button.textContent = '✅ Copiado';
        setTimeout(() => { button.textContent = '📋 Copiar'; }, 1500);
      };
      parrafo.appendChild(button);
    });
  });
  observer.observe(document.body, { childList: true, subtree: true });
}
"""

with gr.Blocks(
    title="Batuto AI: Texto/Voz + Imagen → Voz",
    theme=gr.themes.Soft(),
    js=COPY_BUTTON_JS,
) as demo:
    gr.Markdown("# 🧠 Batuto AI: Texto o Voz + Imagen → Respuesta en Voz")
    gr.Markdown("Escribe o habla. Sube una imagen. Siempre respondo con voz empática.")

    chat = gr.Chatbot(label="Conversación", height=400, type="messages", render_markdown=True)

    # Inputs: typed text or microphone audio, plus an optional image.
    with gr.Row():
        texto_in = gr.Textbox(label="📝 Escribe tu mensaje", lines=1, placeholder="Ej: Describe this image in English")
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="🎤 O habla aquí")

    imagen_in = gr.Image(label="📸 Imagen opcional", type="pil")

    with gr.Row():
        # Each choice is a (label, voice id) pair; the id is passed to
        # manejar_entrada as voz_id.
        voz_sel = gr.Dropdown(
            label="Voz de respuesta",
            choices=[
                ("Femenina cálida", "085fdec7-b201-4a58-b65b-4d321f7abd85"),
                ("Masculina serena", "5c7d2e6a-5d3f-4b3a-8a3d-2e6a5d3f4b3a"),
                ("Neutra", "9a8b7c6d-5e4f-3a2b-1c0d-9e8f7a6b5c4d")
            ],
            value=VOICE_ID_DEFAULT
        )
        enviar_btn = gr.Button("Enviar")
        limpiar_btn = gr.Button("Limpiar")

    # Outputs: the latest reply as text and as auto-played audio.
    salida_texto = gr.Textbox(label="Respuesta", interactive=False, show_copy_button=True)
    salida_audio = gr.Audio(label="Audio", autoplay=True, interactive=False)

    # The send button and pressing Enter in the textbox trigger the same handler.
    enviar_btn.click(
        manejar_entrada,
        inputs=[texto_in, audio_in, imagen_in, chat, voz_sel],
        outputs=[chat, salida_audio, salida_texto]
    )
    texto_in.submit(
        manejar_entrada,
        inputs=[texto_in, audio_in, imagen_in, chat, voz_sel],
        outputs=[chat, salida_audio, salida_texto]
    )
    limpiar_btn.click(limpiar_chat, outputs=[chat, salida_audio, salida_texto])

demo.launch()