# Simple-KWS / app.py
# (Hugging Face Hub page residue: uploaded by IvanLayer7, commit e25c059, verified)
"""
Hugging Face Spaces version of the Keyword Spotting App.
Simplified for deployment without local authentication.
"""
import gradio as gr
import numpy as np
import torch
import os
from typing import Dict, Any, Tuple, Optional
import warnings
# Import our custom modules
from audio_processor import AudioProcessor
from whisper_classifier import WhisperKeywordSpotter
warnings.filterwarnings("ignore")
def get_auth_token():
    """Return the access token, preferring the ACCESS_TOKEN environment variable.

    Falls back to the built-in default when the variable is unset.
    """
    # Fallback used when no ACCESS_TOKEN is configured in the Space settings.
    fallback = "layer7"
    return os.getenv("ACCESS_TOKEN", fallback)
def authenticate_user(token: str) -> bool:
    """
    Check a user-supplied token against the configured access token.

    Args:
        token: Token string provided by the user.

    Returns:
        True when the token matches the configured one, False otherwise.
    """
    expected = get_auth_token()
    return expected == token
class KeywordSpottingApp:
    """Main application class for the keyword spotting interface."""

    def __init__(self, model_size: str = "base"):
        """Set up the audio pipeline and the Whisper-based keyword classifier."""
        print("Initializing Keyword Spotting App for Hugging Face...")
        # Audio is resampled to 48 kHz and capped at 30 seconds by the processor.
        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
        self.classifier = WhisperKeywordSpotter(model_size=model_size)
        print("App initialized successfully!")

    def change_model(self, new_model_size: str) -> str:
        """Switch the underlying Whisper model and report the outcome as text."""
        try:
            if self.classifier.change_model(new_model_size):
                return f"✅ Successfully changed to {new_model_size} model"
            return f"❌ Failed to change to {new_model_size} model"
        except Exception as e:
            return f"❌ Error changing model: {str(e)}"

    def process_audio_and_classify(
        self,
        audio_input: Optional[Tuple[int, np.ndarray]],
        audio_file: Optional[str],
        keywords: str
    ) -> Tuple[Dict[str, float], str]:
        """
        Run the full pipeline: load the audio, then score every keyword.

        Args:
            audio_input: (sample_rate, samples) pair from the microphone widget.
            audio_file: Path of an uploaded audio file.
            keywords: Comma-separated keyword list.

        Returns:
            Pair of (per-keyword probabilities, human-readable status line).
        """
        try:
            # Reject empty keyword input before touching any audio.
            if not keywords or not keywords.strip():
                return {}, "❌ Por favor, ingrese al menos una palabra clave."

            # An uploaded file takes priority over a microphone recording.
            if audio_file is not None:
                try:
                    audio_tensor = self.audio_processor.process_audio_file(audio_file)
                    source_info = f"📁 Archivo: {os.path.basename(audio_file)}"
                except Exception as e:
                    return {}, f"❌ Error procesando archivo: {str(e)}"
            elif audio_input is not None:
                try:
                    sample_rate, audio_array = audio_input
                    # Normalize integer PCM samples to float32 in [-1, 1).
                    if audio_array.dtype == np.int16:
                        audio_array = audio_array.astype(np.float32) / 32768.0
                    elif audio_array.dtype == np.int32:
                        audio_array = audio_array.astype(np.float32) / 2147483648.0
                    audio_tensor = self.audio_processor.process_audio_array(audio_array, sample_rate)
                    source_info = "🎤 Micrófono"
                except Exception as e:
                    return {}, f"❌ Error procesando audio del micrófono: {str(e)}"
            else:
                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."

            # Score keywords; the classifier signals failure via an "error" key.
            results = self.classifier.classify_keywords(audio_tensor, keywords)
            if "error" in results:
                return {}, f"❌ Error en clasificación: {results['error']}"

            keyword_count = sum(1 for k in keywords.split(",") if k.strip())
            status_msg = f"✅ Clasificación completada | {source_info} | {keyword_count} palabra(s) clave"
            return results, status_msg
        except Exception as e:
            # Last-resort guard so the UI always gets a readable message.
            error_msg = f"❌ Error inesperado: {str(e)}"
            print(error_msg)
            return {}, error_msg

    def format_results_for_display(self, results: Dict[str, float]) -> str:
        """
        Render classification results as Markdown with probability bars.

        Args:
            results: Mapping of keyword -> probability (or an "error" entry).

        Returns:
            Markdown string, one color-coded line per keyword.
        """
        if not results:
            return "No hay resultados para mostrar."
        if "error" in results:
            return f"Error: {results['error']}"

        lines = ["📊 **Resultados de Clasificación:**\n"]
        # Highest-probability keywords first.
        for keyword, probability in sorted(results.items(), key=lambda item: item[1], reverse=True):
            bar_width = 20
            filled = int(bar_width * probability)
            bar = "█" * filled + "░" * (bar_width - filled)
            # Traffic-light emoji by confidence band.
            if probability >= 0.7:
                emoji = "🟢"
            elif probability >= 0.4:
                emoji = "🟡"
            else:
                emoji = "🔴"
            lines.append(f"{emoji} **{keyword.upper()}**: {probability * 100:.1f}% [{bar}]")
        return "\n".join(lines)
def create_gradio_interface():
    """Create and configure the Gradio interface for Hugging Face.

    Returns:
        A ``gr.Blocks`` app wiring authentication, model selection, keyword
        entry and audio input to the three result widgets.
    """
    # Initialize the app with default model ("tiny" keeps Spaces startup fast)
    app = KeywordSpottingApp(model_size="tiny")

    def classify_audio(audio_input, audio_file, keywords, model_size, access_token):
        """Wrapper function for Gradio interface.

        Returns a 3-tuple matching the output widgets:
        (results markdown, status line, model-status line).
        """
        # Check authentication first
        if not authenticate_user(access_token):
            return "❌ **Access Denied**: Invalid token. Please enter the correct access token.", "❌ Authentication failed", "❌ Access denied"
        # Change model if needed
        # NOTE(review): invoked on every click — presumably a no-op when the
        # requested model is already loaded; confirm in WhisperKeywordSpotter.
        model_change_msg = app.change_model(model_size)
        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
        formatted_results = app.format_results_for_display(results)
        # Add model info to status
        status_with_model = f"{status} | Model: {model_size}"
        return formatted_results, status_with_model, model_change_msg

    # Create the interface (custom CSS narrows the page and pads status boxes)
    with gr.Blocks(
        title="🎯 Zero-Shot Audio Keyword Spotting",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 900px !important;
            margin: auto !important;
        }
        .status-box {
            padding: 10px;
            border-radius: 5px;
            margin: 10px 0;
        }
        """
    ) as interface:
        # Header / usage instructions shown above the controls.
        gr.Markdown("""
        # 🎯 Zero-Shot Audio Keyword Spotting
        Detect keywords in Spanish audio using **Whisper AI** without prior training.
        Transcribes audio and matches keywords with high accuracy.
        ## 📋 Instructions:
        1. **Enter access token** to authenticate
        2. **Select Whisper model** (tiny=fastest, medium=most accurate)
        3. **Enter keywords** you want to detect (comma-separated)
        4. **Record audio** using microphone OR **upload audio file**
        5. **Click "Analyze Audio"** to get results
        ### 💡 Example Keywords:
        `Sí, Claro, No, Nunca, Quizás, Tal vez, Por supuesto, En absoluto`
        """)
        with gr.Row():
            # Left column: inputs (auth, model, keywords, audio, button).
            with gr.Column(scale=1):
                gr.Markdown("### 🔐 Authentication")
                access_token_input = gr.Textbox(
                    label="Access Token",
                    placeholder="Enter access token",
                    type="password",
                    info="Required to use the application"
                )
                gr.Markdown("### 🤖 Model Selection")
                model_selector = gr.Dropdown(
                    choices=["tiny", "base", "small", "medium"],
                    value="tiny",
                    label="Whisper Model",
                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
                )
                gr.Markdown("### 🔤 Keywords")
                gr.Markdown("*Example: Sí, No, Quizás, Claro, Nunca*")
                # Pre-filled with a large Spanish yes/no/maybe keyword set.
                keywords_input = gr.Textbox(
                    label="Keywords (comma-separated)",
                    placeholder="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    value="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    lines=3
                )
                gr.Markdown("### 🎵 Audio Input")
                # Two tabs share the same logical input; classify_audio picks
                # the uploaded file over the recording when both are present.
                with gr.Tab("🎤 Record Audio"):
                    gr.Markdown("*Click to record (max 30 seconds)*")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your audio here"
                    )
                with gr.Tab("📁 Upload File"):
                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload audio file"
                    )
                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg"
                )
            # Right column: outputs (results markdown plus two status boxes).
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")
                results_output = gr.Markdown(
                    value="Results will appear here after analysis...",
                    label="Classification Results"
                )
                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to analyze",
                    interactive=False,
                    elem_classes=["status-box"]
                )
                model_status_output = gr.Textbox(
                    label="Model Status",
                    value="Current model: tiny",
                    interactive=False,
                    elem_classes=["status-box"]
                )
        # Event handlers: the single button drives the whole pipeline.
        analyze_btn.click(
            fn=classify_audio,
            inputs=[audio_input, audio_file, keywords_input, model_selector, access_token_input],
            outputs=[results_output, status_output, model_status_output]
        )
        # Examples section
        gr.Markdown("""
        ## 💡 Usage Examples:
        **Tips:**
        - Use clear audio without background noise
        - Speak at normal speed
        - Keywords can appear anywhere in the audio
        - Works best with common Spanish words
        """)
    return interface
# Main execution for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
    # Report token configuration WITHOUT echoing the secret: the previous
    # version printed the token itself, leaking it into public Space logs.
    if os.getenv("ACCESS_TOKEN"):
        print("🔐 Access token loaded from ACCESS_TOKEN environment variable")
    else:
        print("⚠️ ACCESS_TOKEN not set — using the built-in default token")
    print("💡 Set ACCESS_TOKEN environment variable to change the token")
    # Create and launch the interface
    interface = create_gradio_interface()
    # Bind to all interfaces on the standard Spaces port; token checking is
    # performed inside the UI rather than via Gradio's launch-level auth.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )