File size: 13,539 Bytes
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
e25c059
664bdbb
 
e25c059
 
 
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
"""

Hugging Face Spaces version of the Keyword Spotting App.

Simplified for deployment without local authentication.

"""

import gradio as gr
import numpy as np
import torch
import os
from typing import Dict, Any, Tuple, Optional
import warnings

# Import our custom modules
from audio_processor import AudioProcessor
from whisper_classifier import WhisperKeywordSpotter

warnings.filterwarnings("ignore")


def get_auth_token():
    """Return the valid access token.

    Reads the ``ACCESS_TOKEN`` environment variable, falling back to the
    built-in default token when the variable is unset.

    Returns:
        The token string users must supply to authenticate.
    """
    return os.getenv("ACCESS_TOKEN", "layer7")


def authenticate_user(token: str) -> bool:
    """Check a user-supplied token against the configured access token.

    Args:
        token: Token string provided by the user.

    Returns:
        True when the token matches the configured one, False otherwise.
    """
    expected = get_auth_token()
    return expected == token


class KeywordSpottingApp:
    """Main application class for the keyword spotting interface."""

    def __init__(self, model_size: str = "base"):
        """Set up the audio pipeline and the Whisper-based classifier."""
        print("Initializing Keyword Spotting App for Hugging Face...")
        # Audio is resampled to 48 kHz and capped at 30 seconds by the processor.
        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
        self.classifier = WhisperKeywordSpotter(model_size=model_size)
        print("App initialized successfully!")

    def change_model(self, new_model_size: str) -> str:
        """Swap the underlying Whisper model; return a human-readable status."""
        try:
            if self.classifier.change_model(new_model_size):
                return f"✅ Successfully changed to {new_model_size} model"
            return f"❌ Failed to change to {new_model_size} model"
        except Exception as e:
            return f"❌ Error changing model: {str(e)}"

    def process_audio_and_classify(
        self,
        audio_input: Optional[Tuple[int, np.ndarray]],
        audio_file: Optional[str],
        keywords: str
    ) -> Tuple[Dict[str, float], str]:
        """Run keyword classification on uploaded-file or microphone audio.

        Args:
            audio_input: (sample_rate, samples) pair from the microphone.
            audio_file: Path to an uploaded audio file.
            keywords: Comma-separated keyword list.

        Returns:
            Tuple of (classification results, status message). On any error
            the results dict is empty and the message describes the failure.
        """
        try:
            # An empty keyword list makes classification meaningless.
            if not keywords or not keywords.strip():
                return {}, "❌ Por favor, ingrese al menos una palabra clave."

            # Uploaded files take precedence over microphone input.
            if audio_file is not None:
                try:
                    tensor = self.audio_processor.process_audio_file(audio_file)
                    origin = f"📁 Archivo: {os.path.basename(audio_file)}"
                except Exception as e:
                    return {}, f"❌ Error procesando archivo: {str(e)}"
            elif audio_input is not None:
                try:
                    rate, samples = audio_input
                    # Integer PCM is rescaled into the float32 [-1, 1) range.
                    if samples.dtype == np.int16:
                        samples = samples.astype(np.float32) / 32768.0
                    elif samples.dtype == np.int32:
                        samples = samples.astype(np.float32) / 2147483648.0
                    tensor = self.audio_processor.process_audio_array(samples, rate)
                    origin = "🎤 Micrófono"
                except Exception as e:
                    return {}, f"❌ Error procesando audio del micrófono: {str(e)}"
            else:
                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."

            results = self.classifier.classify_keywords(tensor, keywords)
            if "error" in results:
                return {}, f"❌ Error en clasificación: {results['error']}"

            # Count only non-blank keyword entries for the status line.
            num_keywords = sum(1 for k in keywords.split(",") if k.strip())
            status_msg = f"✅ Clasificación completada | {origin} | {num_keywords} palabra(s) clave"
            return results, status_msg

        except Exception as e:
            error_msg = f"❌ Error inesperado: {str(e)}"
            print(error_msg)
            return {}, error_msg

    def format_results_for_display(self, results: Dict[str, float]) -> str:
        """Render classification results as markdown text.

        Args:
            results: Mapping of keyword -> probability in [0, 1].

        Returns:
            Markdown string with one bar-annotated line per keyword,
            sorted by descending probability.
        """
        if not results:
            return "No hay resultados para mostrar."
        if "error" in results:
            return f"Error: {results['error']}"

        lines = ["📊 **Resultados de Clasificación:**\n"]
        # Highest-probability keywords first.
        for keyword, probability in sorted(results.items(), key=lambda kv: kv[1], reverse=True):
            # 20-character bar, filled proportionally to the probability.
            filled = int(20 * probability)
            bar = "█" * filled + "░" * (20 - filled)
            # Traffic-light marker: green >= 0.7, yellow >= 0.4, red below.
            if probability >= 0.7:
                emoji = "🟢"
            elif probability >= 0.4:
                emoji = "🟡"
            else:
                emoji = "🔴"
            percentage = probability * 100
            lines.append(f"{emoji} **{keyword.upper()}**: {percentage:.1f}% [{bar}]")
        return "\n".join(lines)


def create_gradio_interface():
    """Build and wire up the Gradio Blocks UI for Hugging Face Spaces.

    Returns:
        The configured ``gr.Blocks`` interface (not yet launched).
    """
    
    # Initialize the app with default model
    app = KeywordSpottingApp(model_size="tiny")
    
    def classify_audio(audio_input, audio_file, keywords, model_size, access_token):
        """Click handler: authenticate, sync the model, then classify.

        Returns a 3-tuple matching the interface outputs:
        (formatted results markdown, status text, model status text).
        """
        # Check authentication first
        if not authenticate_user(access_token):
            return "❌ **Access Denied**: Invalid token. Please enter the correct access token.", "❌ Authentication failed", "❌ Access denied"
        
        # Change model if needed
        model_change_msg = app.change_model(model_size)
        
        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
        formatted_results = app.format_results_for_display(results)
        
        # Add model info to status
        status_with_model = f"{status} | Model: {model_size}"
        
        return formatted_results, status_with_model, model_change_msg
    
    # Create the interface
    with gr.Blocks(
        title="🎯 Zero-Shot Audio Keyword Spotting",
        theme=gr.themes.Soft(),
        css="""

        .gradio-container {

            max-width: 900px !important;

            margin: auto !important;

        }

        .status-box {

            padding: 10px;

            border-radius: 5px;

            margin: 10px 0;

        }

        """
    ) as interface:
        
        gr.Markdown("""

        # 🎯 Zero-Shot Audio Keyword Spotting

        

        Detect keywords in Spanish audio using **Whisper AI** without prior training. 

        Transcribes audio and matches keywords with high accuracy.

        

        ## 📋 Instructions:

        1. **Enter access token** to authenticate

        2. **Select Whisper model** (tiny=fastest, medium=most accurate)

        3. **Enter keywords** you want to detect (comma-separated)

        4. **Record audio** using microphone OR **upload audio file**

        5. **Click "Analyze Audio"** to get results

        

        ### 💡 Example Keywords:

        `Sí, Claro, No, Nunca, Quizás, Tal vez, Por supuesto, En absoluto`

        """)
        
        # Left column: authentication, model choice, keywords, and audio input.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔐 Authentication")
                access_token_input = gr.Textbox(
                    label="Access Token",
                    placeholder="Enter access token",
                    type="password",
                    info="Required to use the application"
                )
                
                gr.Markdown("### 🤖 Model Selection")
                model_selector = gr.Dropdown(
                    choices=["tiny", "base", "small", "medium"],
                    value="tiny",
                    label="Whisper Model",
                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
                )
                
                gr.Markdown("### 🔤 Keywords")
                gr.Markdown("*Example: Sí, No, Quizás, Claro, Nunca*")
                keywords_input = gr.Textbox(
                    label="Keywords (comma-separated)",
                    placeholder="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    value="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    lines=3
                )
                
                gr.Markdown("### 🎵 Audio Input")
                
                # Two mutually-visible tabs; classify_audio prefers the file input.
                with gr.Tab("🎤 Record Audio"):
                    gr.Markdown("*Click to record (max 30 seconds)*")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your audio here"
                    )
                
                with gr.Tab("📁 Upload File"):
                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload audio file"
                    )
                
                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg"
                )
            
            # Right column: classification output plus status readouts.
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")
                
                results_output = gr.Markdown(
                    value="Results will appear here after analysis...",
                    label="Classification Results"
                )
                
                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to analyze",
                    interactive=False,
                    elem_classes=["status-box"]
                )
                
                model_status_output = gr.Textbox(
                    label="Model Status",
                    value="Current model: tiny",
                    interactive=False,
                    elem_classes=["status-box"]
                )
        
        # Event handlers
        analyze_btn.click(
            fn=classify_audio,
            inputs=[audio_input, audio_file, keywords_input, model_selector, access_token_input],
            outputs=[results_output, status_output, model_status_output]
        )
        
        # Examples section
        gr.Markdown("""

        ## 💡 Usage Examples:

        

        **Tips:**

        - Use clear audio without background noise

        - Speak at normal speed

        - Keywords can appear anywhere in the audio

        - Works best with common Spanish words



        """)
    
    return interface


# Main execution for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
    
    # Security fix: never echo the secret token itself into stdout/Space logs —
    # only report where it comes from.
    if os.getenv("ACCESS_TOKEN"):
        print("🔐 Access token loaded from ACCESS_TOKEN environment variable")
    else:
        print("⚠️ ACCESS_TOKEN not set — using the built-in default token")
    print("💡 Set ACCESS_TOKEN environment variable to change the token")
    
    # Create and launch the interface
    interface = create_gradio_interface()
    
    # Bind on all interfaces at the port Hugging Face Spaces expects (7860).
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )