# NOTE(review): "Spaces: Running" is Hugging Face Spaces page-header residue
# from scraping, not part of the program; kept only as a comment.
import base64
import io
import os
import tempfile

import gradio as gr
import requests
import whisper
from hume import HumeClient
from PIL import Image
# --- Configuration ---
# Both API keys come from the environment; the app refuses to start without them.
SAMBA_API_KEY = os.getenv("SAMBA_API_KEY")
HUME_API_KEY = os.getenv("HUME_API_KEY")
if not SAMBA_API_KEY:
    raise ValueError("Falta la variable de entorno: SAMBA_API_KEY")
if not HUME_API_KEY:
    raise ValueError("Falta la variable de entorno: HUME_API_KEY")
# Default Hume voice id used when the UI dropdown is untouched.
VOICE_ID_DEFAULT = "085fdec7-b201-4a58-b65b-4d321f7abd85"
# SambaNova chat model and endpoint used by generar_respuesta().
MODEL_NAME = "Llama-4-Maverick-17B-128E-Instruct"
SAMBA_API_URL = "https://api.sambanova.ai/v1/chat/completions"
# --- Initialize clients ---
hume = HumeClient(api_key=HUME_API_KEY)
# Whisper "base" model, loaded once at import time (startup cost, then reused).
whisper_model = whisper.load_model("base")
# --- Helper functions ---
def imagen_pil_a_base64(pil_img):
    """Encode a PIL image as a base64 string of its JPEG bytes.

    The original wrote the image to a ``NamedTemporaryFile(delete=False)``
    and never removed it, leaking one temp file per call. Encoding through
    an in-memory buffer avoids both the leak and the disk round-trip.

    Args:
        pil_img: A PIL ``Image`` (anything with ``save(fp, format=...)``).

    Returns:
        str: base64-encoded JPEG data (no ``data:`` URL prefix).
    """
    buffer = io.BytesIO()
    pil_img.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
def audio_a_texto(audio_path):
    """Transcribe a Spanish audio file to text with Whisper.

    Best-effort: returns "" for a missing path or on any transcription
    failure (the error is printed, never propagated to the UI).

    Args:
        audio_path: Filesystem path to the recorded audio, or falsy.

    Returns:
        str: The stripped transcription, or "" when nothing usable.
    """
    if not audio_path:
        return ""
    try:
        transcripcion = whisper_model.transcribe(audio_path, language="es")
        return transcripcion["text"].strip()
    except Exception as err:
        print("Error en Whisper STT:", err)
        return ""
def generar_respuesta(texto, imagen_pil):
    """Query the SambaNova chat API with user text and/or an image.

    Args:
        texto: User message; may be None/empty.
        imagen_pil: Optional PIL image to attach as a data URL.

    Returns:
        str: The model reply; a prompt asking for input when both
        arguments are empty; or a canned apology on any API failure.
    """
    tiene_texto = bool(texto and texto.strip())
    if not tiene_texto and imagen_pil is None:
        return "Por favor, escribe un mensaje o envía una imagen."

    if tiene_texto:
        contenido = [{"type": "text", "text": texto}]
    else:
        # Image only: fall back to a fixed English description prompt.
        contenido = [{
            "type": "text",
            "text": "Describe this image in English in one short paragraph. Only output the description, no other text.",
        }]

    if imagen_pil is not None:
        img_b64 = imagen_pil_a_base64(imagen_pil)
        contenido.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
        })

    # Use a lower temperature for (near-)deterministic image descriptions.
    es_descripcion = imagen_pil is not None and (not texto or "describe" in texto.lower())
    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": contenido}],
        "temperature": 0.1 if es_descripcion else 0.3,
        "top_p": 0.9,
    }
    cabeceras = {
        "Authorization": f"Bearer {SAMBA_API_KEY}",
        "Content-Type": "application/json",
    }
    try:
        respuesta_http = requests.post(SAMBA_API_URL, headers=cabeceras, json=payload, timeout=120)
        respuesta_http.raise_for_status()
        return respuesta_http.json()["choices"][0]["message"]["content"].strip()
    except Exception as err:
        print("Error en SambaNova API:", err)
        return "Lo siento, no pude procesar la solicitud."
def generar_audio(texto, voz_id=VOICE_ID_DEFAULT):
    """Synthesize *texto* with Hume TTS and return a playable .wav path.

    Args:
        texto: Text to speak; empty/whitespace yields None.
        voz_id: Hume voice id; defaults to the module-wide voice.

    Returns:
        str | None: Path to a temp .wav file, or None when there is no
        text, the TTS returned no audio, or any error occurred
        (best-effort: errors are printed, not raised).
    """
    if not texto or not texto.strip():
        return None
    try:
        sintesis = hume.tts.speak(text=texto, voice=voz_id)
        datos_audio = sintesis.audio
        if not datos_audio:
            return None
        # delete=False keeps the file on disk so Gradio can serve it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as salida:
            salida.write(datos_audio)
        return salida.name
    except Exception as err:
        print("Error en Hume TTS:", err)
        return None
def manejar_entrada(texto_input, audio_input, imagen_pil, historial, voz_id):
    """Handle one chat turn: resolve user text, ask the model, speak it.

    Text input wins over audio; audio is transcribed with Whisper; with
    neither, an image-only turn is forwarded to generar_respuesta().

    Args:
        texto_input: Typed message (may be empty).
        audio_input: Microphone recording path (may be None).
        imagen_pil: Optional PIL image.
        historial: Chat history in Gradio "messages" format, or None.
        voz_id: TTS voice id for the spoken reply.

    Returns:
        tuple: (updated history, audio file path or None, reply text).
    """
    if historial is None:
        historial = []

    texto_usuario = ""
    if texto_input and texto_input.strip():
        texto_usuario = texto_input.strip()
    elif audio_input:
        texto_usuario = audio_a_texto(audio_input)
        if not texto_usuario:
            # Transcription failed: ask the user to repeat, no model call.
            return historial, None, "No entendí lo que dijiste. ¿Puedes repetirlo?"

    respuesta = generar_respuesta(texto_usuario, imagen_pil)
    historial.append({"role": "user", "content": texto_usuario or "(envió una imagen)"})
    historial.append({"role": "assistant", "content": respuesta})
    return historial, generar_audio(respuesta, voz_id), respuesta
def limpiar_chat():
    """Reset the chat history, the audio player, and the reply textbox."""
    return ([], None, "")
# --- Gradio interface ---
with gr.Blocks(title="Batuto AI: Texto/Voz + Imagen → Voz", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Batuto AI: Texto o Voz + Imagen → Respuesta en Voz")
    gr.Markdown("Escribe o habla. Sube una imagen. Siempre respondo con voz empática.")
    # Inject a JS MutationObserver that appends a "copy" button to every
    # bot message bubble as messages are added to the DOM.
    gr.HTML("""
    <script>
    const observer = new MutationObserver(() => {
        document.querySelectorAll('.message.bot').forEach(msg => {
            if (!msg.querySelector('.copy-btn')) {
                const button = document.createElement('button');
                button.textContent = '📋 Copiar';
                button.className = 'copy-btn';
                button.style.cssText = 'float:right; margin-left:10px; cursor:pointer; background:#eee; border:none; border-radius:4px; padding:2px 6px;';
                button.onclick = () => {
                    navigator.clipboard.writeText(msg.querySelector('p').innerText);
                    button.textContent = '✅ Copiado';
                    setTimeout(()=>button.textContent='📋 Copiar',1500);
                };
                msg.querySelector('p').appendChild(button);
            }
        });
    });
    observer.observe(document.body, { childList: true, subtree: true });
    </script>
    """)
    # "messages" type matches the dict format appended in manejar_entrada().
    chat = gr.Chatbot(label="Conversación", height=400, type="messages", render_markdown=True)
    with gr.Row():
        texto_in = gr.Textbox(label="📝 Escribe tu mensaje", lines=1, placeholder="Ej: Describe this image in English")
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="🎤 O habla aquí")
    imagen_in = gr.Image(label="📸 Imagen opcional", type="pil")
    with gr.Row():
        # Dropdown values are Hume voice ids; default matches VOICE_ID_DEFAULT.
        voz_sel = gr.Dropdown(
            label="Voz de respuesta",
            choices=[
                ("Femenina cálida", "085fdec7-b201-4a58-b65b-4d321f7abd85"),
                ("Masculina serena", "5c7d2e6a-5d3f-4b3a-8a3d-2e6a5d3f4b3a"),
                ("Neutra", "9a8b7c6d-5e4f-3a2b-1c0d-9e8f7a6b5c4d")
            ],
            value=VOICE_ID_DEFAULT
        )
        enviar_btn = gr.Button("Enviar")
        limpiar_btn = gr.Button("Limpiar")
    salida_texto = gr.Textbox(label="Respuesta", interactive=False, show_copy_button=True)
    salida_audio = gr.Audio(label="Audio", autoplay=True, interactive=False)
    # Both the button and pressing Enter in the textbox trigger the same handler.
    enviar_btn.click(
        manejar_entrada,
        inputs=[texto_in, audio_in, imagen_in, chat, voz_sel],
        outputs=[chat, salida_audio, salida_texto]
    )
    texto_in.submit(
        manejar_entrada,
        inputs=[texto_in, audio_in, imagen_in, chat, voz_sel],
        outputs=[chat, salida_audio, salida_texto]
    )
    limpiar_btn.click(limpiar_chat, outputs=[chat, salida_audio, salida_texto])

demo.launch()