Adrian8as's picture
Update app.py
a39e017 verified
"""
Gradio UI para Keyword Spotting API v2.0
Usa el endpoint HTTP /predict para analizar audio y detectar keywords.
"""
import os
import httpx
import gradio as gr
from dotenv import load_dotenv
load_dotenv()
# ============================================================================
# CONFIGURACIÓN
# ============================================================================
# Backend location and credential, read from the process environment.
API_URL = os.environ.get("API_URL", "http://localhost:8000")
API_KEY = os.environ.get("API_KEY", "")
# Keyword list used when the user leaves the keywords field empty.
DEFAULT_KEYWORDS = "sí, no, quizás, imposible, hola, adiós, gracias, por favor"
# Optional login for the Gradio UI; both values are None when unset.
GRADIO_USERNAME = os.environ.get("GRADIO_USERNAME")
GRADIO_PASSWORD = os.environ.get("GRADIO_PASSWORD")
# ============================================================================
# ESTILOS CSS
# ============================================================================
# Stylesheet for the whole page. It is injected into the UI inside a <style>
# tag via gr.HTML, and its class names (result-box, transcription-box,
# alternative-*, primary-btn, header-container) are the ones emitted by the
# HTML-building functions and the layout in this file.
CSS = """
/* Contenedor principal */
.gradio-container {
max-width: 900px !important;
margin: 0 auto !important;
font-family: 'Segoe UI', system-ui, sans-serif !important;
}
/* Header */
.header-container {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 12px;
margin-bottom: 20px;
color: white;
}
.header-container h1 {
margin: 0;
font-size: 2em;
}
.header-container p {
margin: 10px 0 0 0;
opacity: 0.9;
}
/* Resultado principal */
.result-box {
padding: 24px;
border-radius: 12px;
background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
color: white;
text-align: center;
margin: 16px 0;
}
.result-box.error {
background: linear-gradient(135deg, #cb2d3e 0%, #ef473a 100%);
}
.result-word {
font-size: 2.5em;
font-weight: bold;
margin: 0;
text-transform: uppercase;
letter-spacing: 2px;
}
.result-confidence {
font-size: 1.2em;
margin-top: 8px;
opacity: 0.9;
}
/* Transcripción */
.transcription-box {
padding: 16px;
background: rgba(102, 126, 234, 0.15);
border-left: 4px solid #667eea;
border-radius: 0 8px 8px 0;
margin: 16px 0;
}
.transcription-label {
font-size: 0.85em;
color: #a0a0a0;
margin-bottom: 4px;
}
.transcription-text {
font-size: 1.2em;
color: #ffffff;
font-style: italic;
}
/* Alternativas */
.alternatives-container {
margin-top: 16px;
}
.alternatives-container > p {
color: #ffffff !important;
}
.alternative-item {
display: flex;
align-items: center;
padding: 12px 16px;
background: rgba(255, 255, 255, 0.1);
border-radius: 8px;
margin-bottom: 8px;
border: 1px solid rgba(255, 255, 255, 0.2);
}
.alternative-keyword {
font-weight: 600;
min-width: 120px;
color: #ffffff;
}
.alternative-bar {
flex: 1;
height: 24px;
background: #e9ecef;
border-radius: 12px;
overflow: hidden;
margin: 0 12px;
}
.alternative-fill {
height: 100%;
background: linear-gradient(90deg, #667eea, #764ba2);
border-radius: 12px;
transition: width 0.3s ease;
}
.alternative-score {
font-weight: 600;
min-width: 60px;
text-align: right;
color: #a78bfa;
}
/* Botón */
.primary-btn {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
border: none !important;
font-size: 1.1em !important;
padding: 12px 32px !important;
}
.primary-btn:hover {
opacity: 0.9;
transform: translateY(-1px);
}
"""
# ============================================================================
# FUNCIONES
# ============================================================================
def format_result_html(result: dict) -> str:
    """Render a ``/predict`` response as the HTML shown in the results panel.

    Keys read from ``result``: ``success``, ``message``, ``word_detected``,
    ``confidence``, ``transcription`` and ``alternatives`` (a list of dicts
    with ``keyword`` and ``score``).

    Every piece of API-supplied text is HTML-escaped before interpolation,
    because the transcription and keywords ultimately derive from user input
    (the uploaded audio and the keywords textbox). Missing or ``null`` values
    fall back to neutral defaults instead of raising.

    Args:
        result: Decoded JSON body of the API response.

    Returns:
        An HTML fragment: an error box when ``success`` is falsy, otherwise
        the main result box followed by the optional transcription and
        alternatives sections.
    """
    # Local import: this function also used to own a local called ``html``.
    from html import escape

    if not result.get("success", False):
        message = escape(str(result.get("message") or "Error desconocido"))
        return f"""
<div class="result-box error">
<p class="result-word">Error</p>
<p class="result-confidence">{message}</p>
</div>
"""

    word = str(result.get("word_detected") or "—")
    confidence = _to_float(result.get("confidence"))
    transcription = result.get("transcription") or ""
    alternatives = result.get("alternatives") or []

    # Alternatives whose score ties with the main confidence (tolerance 0.01)
    # are promoted to the headline; the rest are listed below with bars.
    top_words = [word]
    remaining = []
    for alt in alternatives:
        keyword = str(alt.get("keyword") or "")
        score = _to_float(alt.get("score"))
        if abs(score - confidence) < 0.01:
            # Skip blanks and repeats so the headline never shows "x / x".
            if keyword and keyword not in top_words:
                top_words.append(keyword)
        else:
            remaining.append((keyword, score))

    words_display = escape(" / ".join(top_words))

    # Collect fragments and join once at the end.
    parts = [f"""
<div class="result-box">
<p class="result-word">{words_display}</p>
<p class="result-confidence">Confianza: {confidence * 100:.1f}%</p>
</div>
"""]

    if transcription:
        parts.append(f"""
<div class="transcription-box">
<div class="transcription-label">📝 Transcripción de Whisper:</div>
<div class="transcription-text">"{escape(str(transcription))}"</div>
</div>
""")

    if remaining:
        parts.append('<div class="alternatives-container"><p style="font-weight: 600; margin-bottom: 12px;">🔄 Otras palabras detectadas:</p>')
        for keyword, score in remaining:
            percent = score * 100
            # The bar is a CSS width, so keep it inside 0-100 %; the label
            # still shows the raw percentage.
            bar_width = min(max(percent, 0.0), 100.0)
            parts.append(f"""
<div class="alternative-item">
<span class="alternative-keyword">{escape(keyword)}</span>
<div class="alternative-bar">
<div class="alternative-fill" style="width: {bar_width}%"></div>
</div>
<span class="alternative-score">{percent:.1f}%</span>
</div>
""")
        parts.append('</div>')

    return "".join(parts)


def _to_float(value: object) -> float:
    """Coerce an API number to a finite float; null or garbage becomes 0.0."""
    import math

    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    return number if math.isfinite(number) else 0.0
def predict_keywords(audio, keywords_text: str) -> str:
    """Send the audio file to the ``/predict`` endpoint and return result HTML.

    Args:
        audio: Path of the recorded/uploaded file (the audio component is
            configured with ``type="filepath"``), or ``None`` when empty.
        keywords_text: Comma-separated keywords typed by the user. Empty or
            whitespace-only input falls back to ``DEFAULT_KEYWORDS``.

    Returns:
        An HTML fragment: the formatted prediction on HTTP 200, otherwise an
        error box. This function never raises; every failure is rendered as
        an error box so the UI always has something to show.
    """
    if audio is None:
        return _error_box("⚠️", "Por favor, graba o sube un audio")
    if not API_KEY:
        return _error_box("⚠️", "API_KEY no configurada. Configura la variable de entorno.")

    # Strip first, then fall back, so "   " also selects the defaults.
    keywords = (keywords_text or "").strip() or DEFAULT_KEYWORDS

    try:
        audio_path = audio
        # Tolerate a trailing slash in API_URL.
        url = f"{API_URL.rstrip('/')}/predict"
        headers = {"x-api-key": API_KEY}
        with open(audio_path, "rb") as f:
            # NOTE(review): the part is always labelled audio/wav even for
            # MP3/OGG uploads; kept as-is because the server-side content-type
            # handling is not visible from this file.
            files = {"audio_file": (os.path.basename(audio_path), f, "audio/wav")}
            data = {"keywords": keywords}
            # The request must be sent while the file handle is still open.
            response = httpx.post(
                url,
                headers=headers,
                files=files,
                data=data,
                timeout=60.0,
            )

        if response.status_code == 200:
            return format_result_html(response.json())
        if response.status_code == 401:
            return _error_box("🔐", "API Key inválida")
        return _error_box(f"Error {response.status_code}", _error_detail(response))
    except httpx.ConnectError:
        return _error_box("🔌", "No se pudo conectar al servidor. ¿Está corriendo la API?")
    except httpx.TimeoutException:
        # Timeout exceptions often carry an empty message; say what happened.
        return _error_box("⏱️", "El servidor tardó demasiado en responder. Inténtalo de nuevo.")
    except Exception as e:
        # UI boundary: surface anything unexpected instead of crashing.
        return _error_box("❌", str(e))


def _error_box(title: str, message: str) -> str:
    """Build the red error box, HTML-escaping both pieces of text."""
    from html import escape

    return (
        f'<div class="result-box error"><p class="result-word">{escape(str(title))}</p>'
        f'<p class="result-confidence">{escape(str(message))}</p></div>'
    )


def _error_detail(response) -> str:
    """Return the ``detail`` field of a JSON error body, else the raw text."""
    try:
        payload = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page from a proxy).
        return response.text
    if isinstance(payload, dict):
        return str(payload.get("detail", response.text))
    return response.text
# ============================================================================
# INTERFAZ GRADIO
# ============================================================================
with gr.Blocks(title="🎯 Keyword Spotting") as demo:
    # The stylesheet is shipped inside an HTML component so it does not
    # depend on any version-specific CSS option of Gradio.
    gr.HTML(f"<style>{CSS}</style>")

    # Page banner.
    gr.HTML("""
<div class="header-container">
<h1>🎯 Keyword Spotting</h1>
<p>Detecta palabras clave en audio usando Whisper AI</p>
</div>
""")

    with gr.Row():
        # Left column: audio source, keyword list and the submit button.
        with gr.Column(scale=1):
            gr.Markdown("### 🎤 Audio")
            audio_input = gr.Audio(
                label="Graba o sube un audio",
                type="filepath",
                sources=["microphone", "upload"],
            )
            gr.Markdown("### 🏷️ Keywords")
            keywords_input = gr.Textbox(
                value=DEFAULT_KEYWORDS,
                label="Palabras a detectar (separadas por coma)",
                placeholder="sí, no, quizás, imposible...",
                lines=2,
            )
            submit_btn = gr.Button(
                "🔍 Analizar Audio",
                elem_classes=["primary-btn"],
                variant="primary",
            )

        # Right column: rendered prediction, with a placeholder until the
        # first analysis runs.
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Resultados")
            result_output = gr.HTML(
                value='<div style="padding: 40px; text-align: center; color: #999;">Los resultados aparecerán aquí</div>'
            )

    # Usage tips.
    gr.Markdown("---")
    gr.Markdown("### 💡 Tips")
    gr.Markdown("""
- **Habla claro**: Pronuncia la palabra de forma clara y pausada
- **Sin ruido**: Evita ruido de fondo para mejores resultados
- **Keywords**: Puedes personalizar las palabras a detectar
- **Formatos**: Soporta WAV, MP3, OGG y otros formatos de audio
""")

    # Footer.
    gr.Markdown("""
---
<center style="color: #999; font-size: 0.9em;">
Powered by <b>Whisper AI</b> • Transcription + Text Matching
</center>
""")

    # Two triggers run the same analysis: the button click, and any change of
    # the audio value (so a fresh recording/upload is processed right away).
    for register_trigger in (submit_btn.click, audio_input.change):
        register_trigger(
            fn=predict_keywords,
            inputs=[audio_input, keywords_input],
            outputs=result_output,
        )
# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    # Enable login only when a password is configured.
    auth = None
    if GRADIO_PASSWORD:
        # A password without a username used to produce auth=(None, password),
        # which no typed username can ever match, locking everyone out.
        # Fall back to a fixed username and print it so the operator knows it.
        username = GRADIO_USERNAME or "admin"
        auth = (username, GRADIO_PASSWORD)
        print(f"🔐 Autenticación habilitada. Usuario: {username}")
    else:
        print("⚠️ Sin autenticación. Configura GRADIO_PASSWORD para proteger la app.")

    # NOTE(review): share=True requests a public Gradio share link in addition
    # to listening on all interfaces; confirm this is intended, especially
    # when no password is set.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        auth=auth,
    )