# NOTE: "Spaces: Sleeping" is Hugging Face Spaces status-page residue captured
# with this source; it is not part of the program.
"""
Hugging Face Spaces version of the Keyword Spotting App.
Simplified for deployment without local authentication.
"""
import hmac
import os
import warnings
from typing import Any, Dict, Optional, Tuple

import gradio as gr
import numpy as np
import torch

# Import our custom modules
from audio_processor import AudioProcessor
from whisper_classifier import WhisperKeywordSpotter
| warnings.filterwarnings("ignore") | |
def get_auth_token():
    """Return the access token, preferring the ACCESS_TOKEN env var.

    Falls back to a built-in default when the variable is unset.
    """
    # NOTE(review): the fallback token is hard-coded and weak — set the
    # ACCESS_TOKEN secret on the Space for any real deployment.
    fallback = "layer7"
    return os.getenv("ACCESS_TOKEN", fallback)
def authenticate_user(token: str) -> bool:
    """
    Simple token-based authentication.

    Args:
        token: User provided token

    Returns:
        True if token is valid, False otherwise
    """
    # Gradio may hand us None (empty password field); reject non-strings
    # instead of raising inside the comparison.
    if not isinstance(token, str):
        return False
    valid_token = get_auth_token()
    # Constant-time comparison: plain == short-circuits on the first
    # mismatching character and leaks token prefix length via timing.
    return hmac.compare_digest(token.encode("utf-8"), valid_token.encode("utf-8"))
class KeywordSpottingApp:
    """Main application class for the keyword spotting interface."""

    def __init__(self, model_size: str = "base"):
        """Initialize the application components."""
        print("Initializing Keyword Spotting App for Hugging Face...")
        # Audio front-end: 48 kHz target rate, clips capped at 30 seconds.
        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
        self.classifier = WhisperKeywordSpotter(model_size=model_size)
        print("App initialized successfully!")

    def change_model(self, new_model_size: str) -> str:
        """Change the Whisper model size and report the outcome as a message."""
        try:
            if self.classifier.change_model(new_model_size):
                return f"✅ Successfully changed to {new_model_size} model"
            return f"❌ Failed to change to {new_model_size} model"
        except Exception as exc:
            return f"❌ Error changing model: {str(exc)}"

    def process_audio_and_classify(
        self,
        audio_input: Optional[Tuple[int, np.ndarray]],
        audio_file: Optional[str],
        keywords: str,
    ) -> Tuple[Dict[str, float], str]:
        """
        Process audio input and perform keyword classification.

        Args:
            audio_input: Tuple of (sample_rate, audio_array) from microphone
            audio_file: Path to uploaded audio file
            keywords: Comma-separated keywords string

        Returns:
            Tuple of (classification_results, status_message)
        """
        try:
            # Guard: at least one non-blank keyword is required.
            if not keywords or not keywords.strip():
                return {}, "❌ Por favor, ingrese al menos una palabra clave."

            # Uploaded file takes precedence over the microphone recording.
            if audio_file is not None:
                try:
                    waveform = self.audio_processor.process_audio_file(audio_file)
                    source_info = f"📁 Archivo: {os.path.basename(audio_file)}"
                except Exception as exc:
                    return {}, f"❌ Error procesando archivo: {str(exc)}"
            elif audio_input is not None:
                try:
                    sample_rate, samples = audio_input
                    # Normalize integer PCM to float32 in [-1, 1).
                    if samples.dtype == np.int16:
                        samples = samples.astype(np.float32) / 32768.0
                    elif samples.dtype == np.int32:
                        samples = samples.astype(np.float32) / 2147483648.0
                    waveform = self.audio_processor.process_audio_array(samples, sample_rate)
                    source_info = "🎤 Micrófono"
                except Exception as exc:
                    return {}, f"❌ Error procesando audio del micrófono: {str(exc)}"
            else:
                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."

            results = self.classifier.classify_keywords(waveform, keywords)
            if "error" in results:
                return {}, f"❌ Error en clasificación: {results['error']}"

            # Count only non-blank entries from the comma-separated list.
            num_keywords = sum(1 for part in keywords.split(",") if part.strip())
            status_msg = f"✅ Clasificación completada | {source_info} | {num_keywords} palabra(s) clave"
            return results, status_msg

        except Exception as exc:
            error_msg = f"❌ Error inesperado: {str(exc)}"
            print(error_msg)
            return {}, error_msg

    def format_results_for_display(self, results: Dict[str, float]) -> str:
        """
        Format classification results for display.

        Args:
            results: Classification results dictionary

        Returns:
            Formatted string for display
        """
        if not results:
            return "No hay resultados para mostrar."
        if "error" in results:
            return f"Error: {results['error']}"

        lines = ["📊 **Resultados de Clasificación:**\n"]
        # Highest-probability keywords first.
        for keyword, prob in sorted(results.items(), key=lambda kv: kv[1], reverse=True):
            # 20-slot visual probability bar.
            filled = int(20 * prob)
            bar = "█" * filled + "░" * (20 - filled)
            # Traffic-light emoji: >=0.7 high, >=0.4 medium, else low confidence.
            emoji = "🟢" if prob >= 0.7 else ("🟡" if prob >= 0.4 else "🔴")
            lines.append(f"{emoji} **{keyword.upper()}**: {prob * 100:.1f}% [{bar}]")
        return "\n".join(lines)
def create_gradio_interface():
    """Create and configure the Gradio interface for Hugging Face."""
    # Start with the smallest model so the Space boots quickly.
    app = KeywordSpottingApp(model_size="tiny")

    # One source of truth for the default keyword list (placeholder == value).
    default_keywords = (
        "Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,"
        "Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,"
        "Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,"
        "Dudo mucho,Quién sabe,Probablemente,No estoy seguro"
    )

    def classify_audio(audio_input, audio_file, keywords, model_size, access_token):
        """Wrapper function for Gradio interface."""
        # Everything is gated behind the access token.
        if not authenticate_user(access_token):
            return (
                "❌ **Access Denied**: Invalid token. Please enter the correct access token.",
                "❌ Authentication failed",
                "❌ Access denied",
            )
        model_change_msg = app.change_model(model_size)
        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
        formatted = app.format_results_for_display(results)
        return formatted, f"{status} | Model: {model_size}", model_change_msg

    custom_css = """
    .gradio-container {
        max-width: 900px !important;
        margin: auto !important;
    }
    .status-box {
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0;
    }
    """

    with gr.Blocks(
        title="🎯 Zero-Shot Audio Keyword Spotting",
        theme=gr.themes.Soft(),
        css=custom_css,
    ) as interface:
        gr.Markdown("""
        # 🎯 Zero-Shot Audio Keyword Spotting

        Detect keywords in Spanish audio using **Whisper AI** without prior training.
        Transcribes audio and matches keywords with high accuracy.

        ## 📋 Instructions:
        1. **Enter access token** to authenticate
        2. **Select Whisper model** (tiny=fastest, medium=most accurate)
        3. **Enter keywords** you want to detect (comma-separated)
        4. **Record audio** using microphone OR **upload audio file**
        5. **Click "Analyze Audio"** to get results

        ### 💡 Example Keywords:
        `Sí, Claro, No, Nunca, Quizás, Tal vez, Por supuesto, En absoluto`
        """)

        with gr.Row():
            # Left column: auth, model choice, keywords, audio input.
            with gr.Column(scale=1):
                gr.Markdown("### 🔐 Authentication")
                access_token_input = gr.Textbox(
                    label="Access Token",
                    placeholder="Enter access token",
                    type="password",
                    info="Required to use the application",
                )

                gr.Markdown("### 🤖 Model Selection")
                model_selector = gr.Dropdown(
                    choices=["tiny", "base", "small", "medium"],
                    value="tiny",
                    label="Whisper Model",
                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy",
                )

                gr.Markdown("### 🔤 Keywords")
                gr.Markdown("*Example: Sí, No, Quizás, Claro, Nunca*")
                keywords_input = gr.Textbox(
                    label="Keywords (comma-separated)",
                    placeholder=default_keywords,
                    value=default_keywords,
                    lines=3,
                )

                gr.Markdown("### 🎵 Audio Input")
                with gr.Tab("🎤 Record Audio"):
                    gr.Markdown("*Click to record (max 30 seconds)*")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your audio here",
                    )
                with gr.Tab("📁 Upload File"):
                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload audio file",
                    )

                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg",
                )

            # Right column: results and status read-outs.
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")
                results_output = gr.Markdown(
                    value="Results will appear here after analysis...",
                    label="Classification Results",
                )
                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to analyze",
                    interactive=False,
                    elem_classes=["status-box"],
                )
                model_status_output = gr.Textbox(
                    label="Model Status",
                    value="Current model: tiny",
                    interactive=False,
                    elem_classes=["status-box"],
                )

        # Event handlers
        analyze_btn.click(
            fn=classify_audio,
            inputs=[audio_input, audio_file, keywords_input, model_selector, access_token_input],
            outputs=[results_output, status_output, model_status_output],
        )

        # Examples section
        gr.Markdown("""
        ## 💡 Usage Examples:

        **Tips:**
        - Use clear audio without background noise
        - Speak at normal speed
        - Keywords can appear anywhere in the audio
        - Works best with common Spanish words
        """)

    return interface
# Main execution for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")

    # Surface the active token so the Space operator knows what to hand out.
    # NOTE(review): this prints the secret into the container logs.
    print(f"🔐 Access token required: {get_auth_token()}")
    print("💡 Set ACCESS_TOKEN environment variable to change the token")

    # Build the UI and serve it on the port Spaces expects (7860).
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )