"""
Hugging Face Spaces version of the Keyword Spotting App.

Simplified for deployment without local authentication.
"""

import hmac
import os
import warnings
from typing import Any, Dict, Optional, Tuple

import gradio as gr
import numpy as np
import torch

# Import our custom modules
from audio_processor import AudioProcessor
from whisper_classifier import WhisperKeywordSpotter

warnings.filterwarnings("ignore")

# Environment variable that overrides the built-in access token.
_ACCESS_TOKEN_ENV = "ACCESS_TOKEN"

# Default keyword list, shown both as placeholder and as initial value of the
# keywords textbox.
DEFAULT_KEYWORDS = (
    "Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,"
    "No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,"
    "Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,"
    "Probablemente,No estoy seguro"
)


def get_auth_token() -> str:
    """Return the access token that users must supply.

    The value is read from the ``ACCESS_TOKEN`` environment variable; when the
    variable is unset, a built-in default is returned.
    """
    # NOTE(review): hard-coded fallback secret, kept for backward
    # compatibility. Set ACCESS_TOKEN for any real deployment.
    default_token = "layer7"
    return os.getenv(_ACCESS_TOKEN_ENV, default_token)


def authenticate_user(token: str) -> bool:
    """
    Simple token-based authentication.

    Args:
        token: User provided token

    Returns:
        True if token is valid, False otherwise (including when ``token`` is
        not a string).
    """
    if not isinstance(token, str):
        return False
    # Compare as bytes: hmac.compare_digest rejects non-ASCII str arguments,
    # and a constant-time comparison avoids leaking the token via timing.
    supplied = token.encode("utf-8")
    expected = get_auth_token().encode("utf-8")
    return hmac.compare_digest(supplied, expected)


class KeywordSpottingApp:
    """Main application class for the keyword spotting interface."""

    def __init__(self, model_size: str = "base"):
        """Initialize the application components.

        Args:
            model_size: Whisper model size passed to ``WhisperKeywordSpotter``.
        """
        print("Initializing Keyword Spotting App for Hugging Face...")

        # Initialize components
        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
        self.classifier = WhisperKeywordSpotter(model_size=model_size)

        print("App initialized successfully!")

    def change_model(self, new_model_size: str) -> str:
        """Change the Whisper model size.

        Args:
            new_model_size: Model size forwarded to the classifier.

        Returns:
            A human-readable status message; errors are reported in the
            message rather than raised.
        """
        try:
            success = self.classifier.change_model(new_model_size)
        except Exception as e:
            return f"❌ Error changing model: {str(e)}"
        if success:
            return f"✅ Successfully changed to {new_model_size} model"
        return f"❌ Failed to change to {new_model_size} model"

    @staticmethod
    def _to_float32(audio_array: np.ndarray) -> np.ndarray:
        """Scale signed-integer PCM samples to float32; pass other dtypes through."""
        if np.issubdtype(audio_array.dtype, np.signedinteger):
            # Full scale is 2**(bits-1): 32768 for int16, 2147483648 for int32.
            full_scale = float(np.iinfo(audio_array.dtype).max) + 1.0
            return audio_array.astype(np.float32) / full_scale
        return audio_array

    def process_audio_and_classify(
        self,
        audio_input: Optional[Tuple[int, np.ndarray]],
        audio_file: Optional[str],
        keywords: str
    ) -> Tuple[Dict[str, float], str]:
        """
        Process audio input and perform keyword classification.

        An uploaded file takes precedence over a microphone recording when
        both are supplied.

        Args:
            audio_input: Tuple of (sample_rate, audio_array) from microphone
            audio_file: Path to uploaded audio file
            keywords: Comma-separated keywords string

        Returns:
            Tuple of (classification_results, status_message). On any failure
            the results dict is empty and the message describes the problem.
        """
        try:
            # Validate keywords: reject empty input as well as input that
            # contains only separators/whitespace (e.g. ",, ,").
            keyword_list = [k for k in (keywords or "").split(",") if k.strip()]
            if not keyword_list:
                return {}, "❌ Por favor, ingrese al menos una palabra clave."

            # Determine audio source and process
            audio_tensor = None
            source_info = ""

            if audio_file is not None:
                # Process uploaded file
                try:
                    audio_tensor = self.audio_processor.process_audio_file(audio_file)
                    source_info = f"📁 Archivo: {os.path.basename(audio_file)}"
                except Exception as e:
                    return {}, f"❌ Error procesando archivo: {str(e)}"

            elif audio_input is not None:
                # Process microphone input
                try:
                    sample_rate, audio_array = audio_input
                    # Convert integer PCM to float32 if needed
                    audio_array = self._to_float32(audio_array)
                    audio_tensor = self.audio_processor.process_audio_array(
                        audio_array, sample_rate
                    )
                    source_info = "🎤 Micrófono"
                except Exception as e:
                    return {}, f"❌ Error procesando audio del micrófono: {str(e)}"

            else:
                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."

            # Perform classification
            results = self.classifier.classify_keywords(audio_tensor, keywords)

            if "error" in results:
                return {}, f"❌ Error en clasificación: {results['error']}"

            # Create status message
            num_keywords = len(keyword_list)
            status_msg = (
                f"✅ Clasificación completada | {source_info} | "
                f"{num_keywords} palabra(s) clave"
            )

            return results, status_msg

        except Exception as e:
            # Top-level boundary for the UI callback: report instead of crashing.
            error_msg = f"❌ Error inesperado: {str(e)}"
            print(error_msg)
            return {}, error_msg

    def format_results_for_display(self, results: Dict[str, float]) -> str:
        """
        Format classification results for display.

        Args:
            results: Classification results dictionary

        Returns:
            Formatted string for display, one line per keyword sorted by
            descending probability.
        """
        if not results:
            return "No hay resultados para mostrar."

        if "error" in results:
            return f"Error: {results['error']}"

        # Sort results by probability (descending)
        sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)

        bar_length = 20
        output_lines = ["📊 **Resultados de Clasificación:**\n"]

        for keyword, probability in sorted_results:
            # Create visual probability bar. Clamp to [0, 1] so out-of-range
            # scores cannot produce a bar of the wrong length.
            clamped = min(max(probability, 0.0), 1.0)
            filled_length = int(bar_length * clamped)
            bar = "█" * filled_length + "░" * (bar_length - filled_length)

            # Color coding based on probability
            if probability >= 0.7:
                emoji = "🟢"  # High confidence
            elif probability >= 0.4:
                emoji = "🟡"  # Medium confidence
            else:
                emoji = "🔴"  # Low confidence

            percentage = probability * 100
            output_lines.append(
                f"{emoji} **{keyword.upper()}**: {percentage:.1f}% [{bar}]"
            )

        return "\n".join(output_lines)


def create_gradio_interface():
    """Create and configure the Gradio interface for Hugging Face.

    Returns:
        The constructed ``gr.Blocks`` interface (not yet launched).
    """
    # Initialize the app with default model
    app = KeywordSpottingApp(model_size="tiny")

    def classify_audio(audio_input, audio_file, keywords, model_size, access_token):
        """Wrapper function for Gradio interface.

        Returns a 3-tuple matching the outputs wired to the button:
        (results markdown, status text, model status text).
        """
        # Check authentication first
        if not authenticate_user(access_token):
            return (
                "❌ **Access Denied**: Invalid token. Please enter the correct access token.",
                "❌ Authentication failed",
                "❌ Access denied",
            )

        # NOTE(review): change_model is invoked on every request, even when the
        # selected size is unchanged; whether that reloads the model depends on
        # WhisperKeywordSpotter.change_model - confirm.
        model_change_msg = app.change_model(model_size)

        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
        formatted_results = app.format_results_for_display(results)

        # Add model info to status
        status_with_model = f"{status} | Model: {model_size}"

        return formatted_results, status_with_model, model_change_msg

    # Create the interface
    with gr.Blocks(
        title="🎯 Zero-Shot Audio Keyword Spotting",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 900px !important;
            margin: auto !important;
        }
        .status-box {
            padding: 10px;
            border-radius: 5px;
            margin: 10px 0;
        }
        """
    ) as interface:

        gr.Markdown("""
        # 🎯 Zero-Shot Audio Keyword Spotting

        Detect keywords in Spanish audio using **Whisper AI** without prior training.
        Transcribes audio and matches keywords with high accuracy.

        ## 📋 Instructions:
        1. **Enter access token** to authenticate
        2. **Select Whisper model** (tiny=fastest, medium=most accurate)
        3. **Enter keywords** you want to detect (comma-separated)
        4. **Record audio** using microphone OR **upload audio file**
        5. **Click "Analyze Audio"** to get results

        ### 💡 Example Keywords:
        `Sí, Claro, No, Nunca, Quizás, Tal vez, Por supuesto, En absoluto`
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔐 Authentication")
                access_token_input = gr.Textbox(
                    label="Access Token",
                    placeholder="Enter access token",
                    type="password",
                    info="Required to use the application"
                )

                gr.Markdown("### 🤖 Model Selection")
                model_selector = gr.Dropdown(
                    choices=["tiny", "base", "small", "medium"],
                    value="tiny",
                    label="Whisper Model",
                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
                )

                gr.Markdown("### 🔤 Keywords")
                gr.Markdown("*Example: Sí, No, Quizás, Claro, Nunca*")
                keywords_input = gr.Textbox(
                    label="Keywords (comma-separated)",
                    placeholder=DEFAULT_KEYWORDS,
                    value=DEFAULT_KEYWORDS,
                    lines=3
                )

                gr.Markdown("### 🎵 Audio Input")

                with gr.Tab("🎤 Record Audio"):
                    gr.Markdown("*Click to record (max 30 seconds)*")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your audio here"
                    )

                with gr.Tab("📁 Upload File"):
                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload audio file"
                    )

                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")

                results_output = gr.Markdown(
                    value="Results will appear here after analysis...",
                    label="Classification Results"
                )

                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to analyze",
                    interactive=False,
                    elem_classes=["status-box"]
                )

                model_status_output = gr.Textbox(
                    label="Model Status",
                    value="Current model: tiny",
                    interactive=False,
                    elem_classes=["status-box"]
                )

        # Event handlers
        analyze_btn.click(
            fn=classify_audio,
            inputs=[audio_input, audio_file, keywords_input, model_selector, access_token_input],
            outputs=[results_output, status_output, model_status_output]
        )

        # Examples section
        gr.Markdown("""
        ## 💡 Usage Examples:

        **Tips:**
        - Use clear audio without background noise
        - Speak at normal speed
        - Keywords can appear anywhere in the audio
        - Works best with common Spanish words
        """)

    return interface


# Main execution for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")

    # Show authentication info without echoing the secret itself: anything
    # printed here ends up in the process logs.
    if os.getenv(_ACCESS_TOKEN_ENV) is None:
        print("🔐 Access token required (ACCESS_TOKEN not set; using built-in default)")
    else:
        print("🔐 Access token required (loaded from ACCESS_TOKEN)")
    print("💡 Set ACCESS_TOKEN environment variable to change the token")

    # Create and launch the interface
    interface = create_gradio_interface()

    # No auth is configured on launch(); the token is checked per request
    # inside the classify_audio callback.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )