Spaces:

Layer7
/

Simple-KWS

Sleeping

App Files Files Community

IvanLayer7 committed on Oct 18, 2025

Commit

664bdbb

verified ·

1 Parent(s): 0d542e4

Upload app.py

Browse files

Files changed (1) hide show

app.py +367 -325

app.py CHANGED Viewed

@@ -1,325 +1,367 @@
-"""
-Hugging Face Spaces version of the Keyword Spotting App.
-Simplified for deployment without local authentication.
-"""
-import gradio as gr
-import numpy as np
-import torch
-import os
-from typing import Dict, Any, Tuple, Optional
-import warnings
-# Import our custom modules
-from audio_processor import AudioProcessor
-from whisper_classifier import WhisperKeywordSpotter
-warnings.filterwarnings("ignore")
-class KeywordSpottingApp:
-    """Main application class for the keyword spotting interface."""
-    def __init__(self, model_size: str = "base"):
-        """Initialize the application components."""
-        print("Initializing Keyword Spotting App for Hugging Face...")
-        # Initialize components
-        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
-        self.classifier = WhisperKeywordSpotter(model_size=model_size)
-        print("App initialized successfully!")
-    def change_model(self, new_model_size: str) -> str:
-        """Change the Whisper model size."""
-        try:
-            success = self.classifier.change_model(new_model_size)
-            if success:
-                return f"✅ Successfully changed to {new_model_size} model"
-            else:
-                return f"❌ Failed to change to {new_model_size} model"
-        except Exception as e:
-            return f"❌ Error changing model: {str(e)}"
-    def process_audio_and_classify(
-        self,
-        audio_input: Optional[Tuple[int, np.ndarray]],
-        audio_file: Optional[str],
-        keywords: str
-    ) -> Tuple[Dict[str, float], str]:
-        """
-        Process audio input and perform keyword classification.
-        Args:
-            audio_input: Tuple of (sample_rate, audio_array) from microphone
-            audio_file: Path to uploaded audio file
-            keywords: Comma-separated keywords string
-        Returns:
-            Tuple of (classification_results, status_message)
-        """
-        try:
-            # Validate keywords
-            if not keywords or not keywords.strip():
-                return {}, "❌ Por favor, ingrese al menos una palabra clave."
-            # Determine audio source and process
-            audio_tensor = None
-            source_info = ""
-            if audio_file is not None:
-                # Process uploaded file
-                try:
-                    audio_tensor = self.audio_processor.process_audio_file(audio_file)
-                    source_info = f"📁 Archivo: {os.path.basename(audio_file)}"
-                except Exception as e:
-                    return {}, f"❌ Error procesando archivo: {str(e)}"
-            elif audio_input is not None:
-                # Process microphone input
-                try:
-                    sample_rate, audio_array = audio_input
-                    # Convert to float32 if needed
-                    if audio_array.dtype == np.int16:
-                        audio_array = audio_array.astype(np.float32) / 32768.0
-                    elif audio_array.dtype == np.int32:
-                        audio_array = audio_array.astype(np.float32) / 2147483648.0
-                    audio_tensor = self.audio_processor.process_audio_array(audio_array, sample_rate)
-                    source_info = "🎤 Micrófono"
-                except Exception as e:
-                    return {}, f"❌ Error procesando audio del micrófono: {str(e)}"
-            else:
-                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."
-            # Perform classification
-            results = self.classifier.classify_keywords(audio_tensor, keywords)
-            if "error" in results:
-                return {}, f"❌ Error en clasificación: {results['error']}"
-            # Create status message
-            num_keywords = len([k for k in keywords.split(",") if k.strip()])
-            status_msg = f"✅ Clasificación completada | {source_info} | {num_keywords} palabra(s) clave"
-            return results, status_msg
-        except Exception as e:
-            error_msg = f"❌ Error inesperado: {str(e)}"
-            print(error_msg)
-            return {}, error_msg
-    def format_results_for_display(self, results: Dict[str, float]) -> str:
-        """
-        Format classification results for display.
-        Args:
-            results: Classification results dictionary
-        Returns:
-            Formatted string for display
-        """
-        if not results:
-            return "No hay resultados para mostrar."
-        if "error" in results:
-            return f"Error: {results['error']}"
-        # Sort results by probability (descending)
-        sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
-        output_lines = ["📊 **Resultados de Clasificación:**\n"]
-        for keyword, probability in sorted_results:
-            # Create visual probability bar
-            bar_length = 20
-            filled_length = int(bar_length * probability)
-            bar = "█" * filled_length + "░" * (bar_length - filled_length)
-            # Color coding based on probability
-            if probability >= 0.7:
-                emoji = "🟢"  # High confidence
-            elif probability >= 0.4:
-                emoji = "🟡"  # Medium confidence
-            else:
-                emoji = "🔴"  # Low confidence
-            percentage = probability * 100
-            output_lines.append(
-                f"{emoji} **{keyword.upper()}**: {percentage:.1f}% [{bar}]"
-            )
-        return "\n".join(output_lines)
-def create_gradio_interface():
-    """Create and configure the Gradio interface for Hugging Face."""
-    # Initialize the app with default model
-    app = KeywordSpottingApp(model_size="base")
-    def classify_audio(audio_input, audio_file, keywords, model_size):
-        """Wrapper function for Gradio interface."""
-        # Change model if needed
-        model_change_msg = app.change_model(model_size)
-        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
-        formatted_results = app.format_results_for_display(results)
-        # Add model info to status
-        status_with_model = f"{status} | Model: {model_size}"
-        return formatted_results, status_with_model, model_change_msg
-    # Create the interface
-    with gr.Blocks(
-        title="🎯 Zero-Shot Audio Keyword Spotting",
-        theme=gr.themes.Soft(),
-        css="""
-        .gradio-container {
-            max-width: 900px !important;
-            margin: auto !important;
-        }
-        .status-box {
-            padding: 10px;
-            border-radius: 5px;
-            margin: 10px 0;
-        }
-        """
-    ) as interface:
-        gr.Markdown("""
-        # 🎯 Zero-Shot Audio Keyword Spotting
-        Detect keywords in Spanish audio using **Whisper AI** without prior training.
-        Transcribes audio and matches keywords with high accuracy.
-        ## 📋 Instructions:
-        1. **Select Whisper model** (tiny=fastest, medium=most accurate)
-        2. **Enter keywords** you want to detect (comma-separated)
-        3. **Record audio** using microphone OR **upload audio file**
-        4. **Click "Analyze Audio"** to get results
-        ### 💡 Example Keywords:
-        `hola, gracias, adiós, sí, no, por favor`
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                gr.Markdown("### 🤖 Model Selection")
-                model_selector = gr.Dropdown(
-                    choices=["tiny", "base", "small", "medium"],
-                    value="tiny",
-                    label="Whisper Model",
-                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
-                )
-                gr.Markdown("### 🔤 Keywords")
-                gr.Markdown("*Example: hola, gracias, adiós*")
-                keywords_input = gr.Textbox(
-                    label="Keywords (comma-separated)",
-                    placeholder="hola, gracias, adiós, sí, no",
-                    lines=2,
-                    value="hola, gracias, adiós, sí, no"
-                )
-                gr.Markdown("### 🎵 Audio Input")
-                with gr.Tab("🎤 Record Audio"):
-                    gr.Markdown("*Click to record (max 30 seconds)*")
-                    audio_input = gr.Audio(
-                        sources=["microphone"],
-                        type="numpy",
-                        label="Record your audio here"
-                    )
-                with gr.Tab("📁 Upload File"):
-                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
-                    audio_file = gr.Audio(
-                        sources=["upload"],
-                        type="filepath",
-                        label="Upload audio file"
-                    )
-                analyze_btn = gr.Button(
-                    "🔍 Analyze Audio",
-                    variant="primary",
-                    size="lg"
-                )
-            with gr.Column(scale=1):
-                gr.Markdown("### 📊 Results")
-                results_output = gr.Markdown(
-                    value="Results will appear here after analysis...",
-                    label="Classification Results"
-                )
-                status_output = gr.Textbox(
-                    label="Status",
-                    value="Ready to analyze",
-                    interactive=False,
-                    elem_classes=["status-box"]
-                )
-                model_status_output = gr.Textbox(
-                    label="Model Status",
-                    value="Current model: base",
-                    interactive=False,
-                    elem_classes=["status-box"]
-                )
-        # Event handlers
-        analyze_btn.click(
-            fn=classify_audio,
-            inputs=[audio_input, audio_file, keywords_input, model_selector],
-            outputs=[results_output, status_output, model_status_output]
-        )
-        # Examples section
-        gr.Markdown("""
-        ## 💡 Usage Examples:
-        **Suggested Spanish keywords:**
-        - Greetings: `hola, buenos días, buenas tardes, adiós`
-        - Courtesy: `gracias, por favor, disculpe, perdón`
-        - Responses: `sí, no, tal vez, claro`
-        - Numbers: `uno, dos, tres, cuatro, cinco`
-        - Colors: `rojo, azul, verde, amarillo`
-        **Tips:**
-        - Use clear audio without background noise
-        - Speak at normal speed
-        - Keywords can appear anywhere in the audio
-        - Works best with common Spanish words
-        ## 🔧 Technical Details:
-        - **Model**: OpenAI Whisper (speech transcription)
-        - **Languages**: Optimized for Spanish, works with others
-        - **Processing**: Up to 30 seconds, 48kHz sampling rate
-        - **Approach**: Transcription + text matching
-        ## 🤖 Model Comparison:
-        - **tiny**: Fastest, basic accuracy (72MB)
-        - **base**: Balanced speed/accuracy (139MB)
-        - **small**: Better accuracy, slower (461MB)
-        - **medium**: Best accuracy, slowest (1.46GB)
-        """)
-    return interface
-# Main execution for Hugging Face Spaces
-if __name__ == "__main__":
-    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
-    # Create and launch the interface
-    interface = create_gradio_interface()
-    # Launch without authentication (HF Spaces handles this)
-    interface.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        show_error=True
-    )

+"""
+Hugging Face Spaces version of the Keyword Spotting App.
+Simplified for deployment; access is gated by a shared token (ACCESS_TOKEN).
+"""
+import gradio as gr
+import numpy as np
+import torch
+import os
+from typing import Dict, Any, Tuple, Optional
+import warnings
+# Import our custom modules
+from audio_processor import AudioProcessor
+from whisper_classifier import WhisperKeywordSpotter
+warnings.filterwarnings("ignore")
+def get_auth_token():
+    """Return the access token that callers must present.
+
+    The value is read from the ``ACCESS_TOKEN`` environment variable. When the
+    variable is unset *or empty*, the built-in default is used instead, so an
+    empty setting can never turn the empty string into a valid token.
+
+    Returns:
+        The expected access token as a non-empty string.
+    """
+    # NOTE(review): a well-known built-in default is weak protection; set
+    # ACCESS_TOKEN (e.g. as a Space secret) for any real deployment.
+    default_token = "layer7"
+    # `or` (rather than a getenv default) also covers ACCESS_TOKEN="".
+    return os.getenv("ACCESS_TOKEN") or default_token
+def authenticate_user(token: str) -> bool:
+    """
+    Simple token-based authentication.
+
+    Args:
+        token: User provided token
+
+    Returns:
+        True if token is valid, False otherwise. A missing or non-string
+        token is always rejected.
+    """
+    # Function-scope import so this block does not depend on the file header.
+    import hmac
+    if not isinstance(token, str):
+        return False
+    valid_token = get_auth_token()
+    # Constant-time comparison: response timing must not reveal how much of
+    # the token matched. Encode first because compare_digest accepts
+    # arbitrary bytes but only ASCII-only str.
+    return hmac.compare_digest(token.encode("utf-8"), valid_token.encode("utf-8"))
+class KeywordSpottingApp:
+    """Glue object tying audio preprocessing to the Whisper keyword spotter."""
+    def __init__(self, model_size: str = "base"):
+        """Build the audio processor and the classifier for ``model_size``."""
+        print("Initializing Keyword Spotting App for Hugging Face...")
+        # Settings handed to AudioProcessor: 48 kHz target rate, 30 s max duration
+        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
+        self.classifier = WhisperKeywordSpotter(model_size=model_size)
+        print("App initialized successfully!")
+    def change_model(self, new_model_size: str) -> str:
+        """Ask the classifier to switch models and report the outcome as text."""
+        try:
+            changed = self.classifier.change_model(new_model_size)
+            verdict = "✅ Successfully changed" if changed else "❌ Failed to change"
+            return f"{verdict} to {new_model_size} model"
+        except Exception as exc:
+            return f"❌ Error changing model: {exc!s}"
+    def process_audio_and_classify(
+        self,
+        audio_input: Optional[Tuple[int, np.ndarray]],
+        audio_file: Optional[str],
+        keywords: str
+    ) -> Tuple[Dict[str, float], str]:
+        """
+        Turn one audio source into keyword scores.
+
+        An uploaded file takes precedence over a microphone recording when
+        both are supplied.
+
+        Args:
+            audio_input: Tuple of (sample_rate, audio_array) from microphone
+            audio_file: Path to uploaded audio file
+            keywords: Comma-separated keywords string
+        Returns:
+            Tuple of (classification_results, status_message); the results
+            dict is empty whenever the status message reports a problem.
+        """
+        try:
+            # Guard: at least one non-blank keyword is required
+            if not (keywords and keywords.strip()):
+                return {}, "❌ Por favor, ingrese al menos una palabra clave."
+            # Guard: some audio must have been provided
+            if audio_file is None and audio_input is None:
+                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."
+            if audio_file is not None:
+                # Uploaded file path
+                try:
+                    waveform = self.audio_processor.process_audio_file(audio_file)
+                    origin = f"📁 Archivo: {os.path.basename(audio_file)}"
+                except Exception as exc:
+                    return {}, f"❌ Error procesando archivo: {exc!s}"
+            else:
+                # Microphone recording
+                try:
+                    rate, samples = audio_input
+                    # Integer PCM is rescaled to float32; other dtypes pass through
+                    if samples.dtype == np.int16:
+                        samples = samples.astype(np.float32) / 32768.0
+                    elif samples.dtype == np.int32:
+                        samples = samples.astype(np.float32) / 2147483648.0
+                    waveform = self.audio_processor.process_audio_array(samples, rate)
+                    origin = "🎤 Micrófono"
+                except Exception as exc:
+                    return {}, f"❌ Error procesando audio del micrófono: {exc!s}"
+            # Run the classifier on the prepared audio
+            scores = self.classifier.classify_keywords(waveform, keywords)
+            if "error" in scores:
+                return {}, f"❌ Error en clasificación: {scores['error']}"
+            # Count only non-blank comma-separated entries for the status line
+            keyword_count = sum(1 for part in keywords.split(",") if part.strip())
+            return scores, f"✅ Clasificación completada | {origin} | {keyword_count} palabra(s) clave"
+        except Exception as exc:
+            failure = f"❌ Error inesperado: {exc!s}"
+            print(failure)
+            return {}, failure
+    def format_results_for_display(self, results: Dict[str, float]) -> str:
+        """
+        Render classification results as Markdown text.
+
+        Args:
+            results: Classification results dictionary
+        Returns:
+            One line per keyword, highest probability first, each with a
+            colored marker, a percentage and a 20-cell bar.
+        """
+        if not results:
+            return "No hay resultados para mostrar."
+        if "error" in results:
+            return f"Error: {results['error']}"
+        bar_width = 20
+        def render_row(label, score):
+            """Format a single keyword/probability pair."""
+            filled = int(bar_width * score)
+            gauge = "█" * filled + "░" * (bar_width - filled)
+            # Marker thresholds: >= 0.7 high, >= 0.4 medium, otherwise low
+            if score >= 0.7:
+                marker = "🟢"
+            elif score >= 0.4:
+                marker = "🟡"
+            else:
+                marker = "🔴"
+            return f"{marker} **{label.upper()}**: {score * 100:.1f}% [{gauge}]"
+        # Highest probability first
+        ranked = sorted(results.items(), key=lambda pair: pair[1], reverse=True)
+        rows = [render_row(label, score) for label, score in ranked]
+        return "\n".join(["📊 **Resultados de Clasificación:**\n", *rows])
+def create_gradio_interface():
+    """Create and configure the Gradio interface for Hugging Face.
+
+    Builds a KeywordSpottingApp with the "base" model, lays out a two-column
+    Blocks UI (inputs on the left, results on the right) and wires the
+    analyze button to the token-checked ``classify_audio`` handler.
+
+    Returns:
+        The constructed ``gr.Blocks`` interface; it is not launched here.
+    """
+    # Initialize the app with default model
+    app = KeywordSpottingApp(model_size="base")
+    def classify_audio(audio_input, audio_file, keywords, model_size, access_token):
+        """Wrapper function for Gradio interface.
+
+        Args:
+            audio_input: Microphone recording from the "numpy" Audio component.
+            audio_file: Path from the "filepath" Audio component.
+            keywords: Comma-separated keywords typed by the user.
+            model_size: Whisper model name chosen in the dropdown.
+            access_token: Token typed by the user; checked before any work.
+
+        Returns:
+            Three strings, in the order of the click handler's outputs:
+            results Markdown, status text, model status text.
+        """
+        # Check authentication first
+        if not authenticate_user(access_token):
+            return "❌ **Access Denied**: Invalid token. Please enter the correct access token.", "❌ Authentication failed", "❌ Access denied"
+        # Change model if needed
+        # NOTE(review): change_model is called on every click and its result is
+        # only displayed, never checked — whether it is a cheap no-op when the
+        # model is unchanged depends on WhisperKeywordSpotter; confirm there.
+        model_change_msg = app.change_model(model_size)
+        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
+        formatted_results = app.format_results_for_display(results)
+        # Add model info to status
+        status_with_model = f"{status} | Model: {model_size}"
+        return formatted_results, status_with_model, model_change_msg
+    # Create the interface
+    with gr.Blocks(
+        title="🎯 Zero-Shot Audio Keyword Spotting",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {
+            max-width: 900px !important;
+            margin: auto !important;
+        }
+        .status-box {
+            padding: 10px;
+            border-radius: 5px;
+            margin: 10px 0;
+        }
+        """
+    ) as interface:
+        gr.Markdown("""
+        # 🎯 Zero-Shot Audio Keyword Spotting
+        Detect keywords in Spanish audio using **Whisper AI** without prior training.
+        Transcribes audio and matches keywords with high accuracy.
+        ## 📋 Instructions:
+        1. **Enter access token** to authenticate
+        2. **Select Whisper model** (tiny=fastest, medium=most accurate)
+        3. **Enter keywords** you want to detect (comma-separated)
+        4. **Record audio** using microphone OR **upload audio file**
+        5. **Click "Analyze Audio"** to get results
+        ### 💡 Example Keywords:
+        `hola, gracias, adiós, sí, no, por favor`
+        """)
+        with gr.Row():
+            # Left column: token, model choice, keywords, audio inputs, button
+            with gr.Column(scale=1):
+                gr.Markdown("### 🔐 Authentication")
+                # The token is sent with every analyze click and validated
+                # inside classify_audio; nothing is stored in session state here.
+                access_token_input = gr.Textbox(
+                    label="Access Token",
+                    placeholder="Enter access token",
+                    type="password",
+                    info="Required to use the application"
+                )
+                gr.Markdown("### 🤖 Model Selection")
+                # Default matches the model the app was initialized with ("base")
+                model_selector = gr.Dropdown(
+                    choices=["tiny", "base", "small", "medium"],
+                    value="base",
+                    label="Whisper Model",
+                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
+                )
+                gr.Markdown("### 🔤 Keywords")
+                gr.Markdown("*Example: hola, gracias, adiós*")
+                keywords_input = gr.Textbox(
+                    label="Keywords (comma-separated)",
+                    placeholder="hola, gracias, adiós, sí, no",
+                    lines=2
+                )
+                gr.Markdown("### 🎵 Audio Input")
+                with gr.Tab("🎤 Record Audio"):
+                    gr.Markdown("*Click to record (max 30 seconds)*")
+                    # type="numpy": feeds the (sample_rate, audio_array) tuple
+                    # that process_audio_and_classify expects as audio_input
+                    audio_input = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        label="Record your audio here"
+                    )
+                with gr.Tab("📁 Upload File"):
+                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
+                    # type="filepath": feeds the path string expected as audio_file
+                    audio_file = gr.Audio(
+                        sources=["upload"],
+                        type="filepath",
+                        label="Upload audio file"
+                    )
+                analyze_btn = gr.Button(
+                    "🔍 Analyze Audio",
+                    variant="primary",
+                    size="lg"
+                )
+            # Right column: read-only outputs filled by classify_audio
+            with gr.Column(scale=1):
+                gr.Markdown("### 📊 Results")
+                results_output = gr.Markdown(
+                    value="Results will appear here after analysis...",
+                    label="Classification Results"
+                )
+                status_output = gr.Textbox(
+                    label="Status",
+                    value="Ready to analyze",
+                    interactive=False,
+                    elem_classes=["status-box"]
+                )
+                model_status_output = gr.Textbox(
+                    label="Model Status",
+                    value="Current model: base",
+                    interactive=False,
+                    elem_classes=["status-box"]
+                )
+        # Event handlers
+        # Input order must match classify_audio's positional parameters and
+        # output order must match its three-element return value.
+        analyze_btn.click(
+            fn=classify_audio,
+            inputs=[audio_input, audio_file, keywords_input, model_selector, access_token_input],
+            outputs=[results_output, status_output, model_status_output]
+        )
+        # Examples section
+        gr.Markdown("""
+        ## 💡 Usage Examples:
+        **Suggested Spanish keywords:**
+        - Greetings: `hola, buenos días, buenas tardes, adiós`
+        - Courtesy: `gracias, por favor, disculpe, perdón`
+        - Responses: `sí, no, tal vez, claro`
+        - Numbers: `uno, dos, tres, cuatro, cinco`
+        - Colors: `rojo, azul, verde, amarillo`
+        **Tips:**
+        - Use clear audio without background noise
+        - Speak at normal speed
+        - Keywords can appear anywhere in the audio
+        - Works best with common Spanish words
+        ## 🔧 Technical Details:
+        - **Model**: OpenAI Whisper (speech transcription)
+        - **Languages**: Optimized for Spanish, works with others
+        - **Processing**: Up to 30 seconds, 48kHz sampling rate
+        - **Approach**: Transcription + text matching
+        ## 🤖 Model Comparison:
+        - **tiny**: Fastest, basic accuracy (72MB)
+        - **base**: Balanced speed/accuracy (139MB)
+        - **small**: Better accuracy, slower (461MB)
+        - **medium**: Best accuracy, slowest (1.46GB)
+        """)
+    return interface
+# Main execution for Hugging Face Spaces
+if __name__ == "__main__":
+    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
+    # Never echo the token itself: anything printed here lands in the
+    # container logs, which would hand the secret to every log reader.
+    print("🔐 Access token required (value not logged)")
+    print("💡 Set ACCESS_TOKEN environment variable to change the token")
+    # Create and launch the interface
+    interface = create_gradio_interface()
+    # launch() is called without Gradio's own `auth` option; the token is
+    # checked per request inside the analyze-button handler instead.
+    # Binds to all interfaces on port 7860.
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )