IvanLayer7 committed on
Commit
e685c03
·
verified ·
1 Parent(s): 4414822

Upload 5 files

Browse files
Files changed (5) hide show
  1. app_hf.py +284 -0
  2. audio_processor.py +137 -0
  3. config.py +98 -0
  4. requirements_hf.txt +9 -0
  5. whisper_classifier.py +230 -0
app_hf.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Spaces version of the Keyword Spotting App.
3
+ Simplified for deployment without local authentication.
4
+ """
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import torch
9
+ import os
10
+ from typing import Dict, Any, Tuple, Optional
11
+ import warnings
12
+
13
+ # Import our custom modules
14
+ from audio_processor import AudioProcessor
15
+ from whisper_classifier import HybridKeywordSpotter
16
+
17
+ warnings.filterwarnings("ignore")
18
+
19
+
20
class KeywordSpottingApp:
    """Main application class for the keyword spotting interface."""

    def __init__(self):
        """Build the pipeline: audio preprocessing front-end plus hybrid classifier."""
        print("Initializing Keyword Spotting App for Hugging Face...")

        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
        self.classifier = HybridKeywordSpotter()

        print("App initialized successfully!")

    @staticmethod
    def _to_float32(samples: np.ndarray) -> np.ndarray:
        """Convert integer PCM samples (as delivered by Gradio) to float32 in [-1, 1)."""
        if samples.dtype == np.int16:
            return samples.astype(np.float32) / 32768.0
        if samples.dtype == np.int32:
            return samples.astype(np.float32) / 2147483648.0
        return samples

    def process_audio_and_classify(
        self,
        audio_input: Optional[Tuple[int, np.ndarray]],
        audio_file: Optional[str],
        keywords: str
    ) -> Tuple[Dict[str, float], str]:
        """
        Pick an audio source, preprocess it, and classify the keywords.

        Args:
            audio_input: (sample_rate, samples) pair from the microphone widget.
            audio_file: Filesystem path of an uploaded audio file.
            keywords: Comma-separated keywords string.

        Returns:
            Tuple of (classification results dict, status message).
        """
        try:
            # Nothing to classify without at least one keyword.
            if not keywords or not keywords.strip():
                return {}, "❌ Por favor, ingrese al menos una palabra clave."

            # An uploaded file takes precedence over microphone input.
            if audio_file is not None:
                try:
                    tensor = self.audio_processor.process_audio_file(audio_file)
                    origin = f"📁 Archivo: {os.path.basename(audio_file)}"
                except Exception as exc:
                    return {}, f"❌ Error procesando archivo: {exc}"
            elif audio_input is not None:
                try:
                    rate, samples = audio_input
                    samples = self._to_float32(samples)
                    tensor = self.audio_processor.process_audio_array(samples, rate)
                    origin = "🎤 Micrófono"
                except Exception as exc:
                    return {}, f"❌ Error procesando audio del micrófono: {exc}"
            else:
                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."

            results = self.classifier.classify_keywords(tensor, keywords)
            if "error" in results:
                return {}, f"❌ Error en clasificación: {results['error']}"

            keyword_count = sum(1 for part in keywords.split(",") if part.strip())
            status = f"✅ Clasificación completada | {origin} | {keyword_count} palabra(s) clave"
            return results, status

        except Exception as exc:
            message = f"❌ Error inesperado: {exc}"
            print(message)
            return {}, message

    def format_results_for_display(self, results: Dict[str, float]) -> str:
        """
        Render classification results as a Markdown scoreboard.

        Args:
            results: Mapping of keyword -> probability (or an "error" entry).

        Returns:
            Markdown string with one probability-bar line per keyword.
        """
        if not results:
            return "No hay resultados para mostrar."
        if "error" in results:
            return f"Error: {results['error']}"

        lines = ["📊 **Resultados de Clasificación:**\n"]
        width = 20
        # Highest-probability keywords first.
        for keyword, prob in sorted(results.items(), key=lambda item: item[1], reverse=True):
            filled = int(width * prob)
            bar = "█" * filled + "░" * (width - filled)
            # Traffic-light coding: green >= 0.7, yellow >= 0.4, red below.
            emoji = "🟢" if prob >= 0.7 else ("🟡" if prob >= 0.4 else "🔴")
            lines.append(f"{emoji} **{keyword.upper()}**: {prob * 100:.1f}% [{bar}]")

        return "\n".join(lines)
142
+
143
+
144
def create_gradio_interface():
    """Create and configure the Gradio interface for Hugging Face.

    Builds a two-column Blocks layout: keyword entry plus microphone/upload
    tabs on the left, Markdown results and a status box on the right.

    Returns:
        The assembled gr.Blocks interface (not yet launched).
    """

    # Initialize the app (constructs the audio processor and classifier once,
    # up front, so models are loaded before the UI is served).
    app = KeywordSpottingApp()

    def classify_audio(audio_input, audio_file, keywords):
        """Wrapper function for Gradio interface.

        Adapts the app's (results dict, status) return value into the
        (markdown string, status string) pair the output widgets expect.
        """
        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
        formatted_results = app.format_results_for_display(results)
        return formatted_results, status

    # Create the interface
    with gr.Blocks(
        title="🎯 Zero-Shot Audio Keyword Spotting",
        theme=gr.themes.Soft(),
        # Custom CSS: center and cap the page width; .status-box styles the
        # status Textbox below the results.
        css="""
        .gradio-container {
            max-width: 900px !important;
            margin: auto !important;
        }
        .status-box {
            padding: 10px;
            border-radius: 5px;
            margin: 10px 0;
        }
        """
    ) as interface:

        gr.Markdown("""
        # 🎯 Zero-Shot Audio Keyword Spotting

        Detect keywords in Spanish audio using AI **without prior training**.
        Uses Whisper + CLAP models for accurate keyword detection.

        ## 📋 Instructions:
        1. **Enter keywords** you want to detect (comma-separated)
        2. **Record audio** using microphone OR **upload audio file**
        3. **Click "Analyze Audio"** to get probability results

        ### 💡 Example Keywords:
        `hola, gracias, adiós, sí, no, por favor`
        """)

        with gr.Row():
            # Left column: inputs (keywords + audio source tabs + button).
            with gr.Column(scale=1):
                gr.Markdown("### 🔤 Keywords")
                gr.Markdown("*Example: hola, gracias, adiós*")
                keywords_input = gr.Textbox(
                    label="Keywords (comma-separated)",
                    placeholder="hola, gracias, adiós, sí, no",
                    lines=2
                )

                gr.Markdown("### 🎵 Audio Input")

                # Microphone tab: delivers (sample_rate, ndarray) via type="numpy".
                with gr.Tab("🎤 Record Audio"):
                    gr.Markdown("*Click to record (max 30 seconds)*")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your audio here"
                    )

                # Upload tab: delivers a filesystem path via type="filepath".
                with gr.Tab("📁 Upload File"):
                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload audio file"
                    )

                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg"
                )

            # Right column: outputs (formatted results + status line).
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")

                results_output = gr.Markdown(
                    value="Results will appear here after analysis...",
                    label="Classification Results"
                )

                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to analyze",
                    interactive=False,
                    elem_classes=["status-box"]
                )

        # Event handlers
        analyze_btn.click(
            fn=classify_audio,
            inputs=[audio_input, audio_file, keywords_input],
            outputs=[results_output, status_output]
        )

        # Examples section
        gr.Markdown("""
        ## 💡 Usage Examples:

        **Suggested Spanish keywords:**
        - Greetings: `hola, buenos días, buenas tardes, adiós`
        - Courtesy: `gracias, por favor, disculpe, perdón`
        - Responses: `sí, no, tal vez, claro`
        - Numbers: `uno, dos, tres, cuatro, cinco`
        - Colors: `rojo, azul, verde, amarillo`

        **Tips:**
        - Use clear audio without background noise
        - Speak at normal speed
        - Keywords can appear anywhere in the audio
        - Works best with common Spanish words

        ## 🔧 Technical Details:
        - **Models**: Whisper (transcription) + CLAP (audio-text similarity)
        - **Languages**: Optimized for Spanish, works with others
        - **Processing**: Up to 30 seconds, 48kHz sampling rate
        - **Approach**: Hybrid zero-shot classification
        """)

    return interface
269
+
270
+
271
# Main execution for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")

    # Build the Blocks UI and serve it. HF Spaces expects port 7860 and
    # provides its own access control, so no auth is configured here.
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
audio_processor.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio processing module for zero-shot keyword spotting.
3
+ Handles audio loading, preprocessing, and feature extraction.
4
+ """
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import torch
9
+ import torchaudio
10
+ from typing import Union, Tuple
11
+ import warnings
12
+
13
+ warnings.filterwarnings("ignore")
14
+
15
+
16
class AudioProcessor:
    """Handles audio preprocessing for the keyword spotting model."""

    def __init__(self, target_sample_rate: int = 48000, max_duration: float = 30.0):
        """
        Initialize the audio processor.

        Args:
            target_sample_rate: Target sampling rate for audio processing
            max_duration: Maximum audio duration in seconds
        """
        self.target_sample_rate = target_sample_rate
        self.max_duration = max_duration
        # Fixed output length: every processed clip is trimmed/padded to this.
        self.max_samples = int(target_sample_rate * max_duration)

    def load_audio(self, audio_path: str) -> Tuple[np.ndarray, int]:
        """
        Load audio file and return waveform and sample rate.

        Args:
            audio_path: Path to the audio file

        Returns:
            Tuple of (waveform, sample_rate)

        Raises:
            ValueError: If the file cannot be decoded.
        """
        try:
            # Use librosa for robust audio loading (sr=None keeps native rate).
            waveform, sr = librosa.load(audio_path, sr=None)
            return waveform, sr
        except Exception as e:
            raise ValueError(f"Error loading audio file: {str(e)}")

    def preprocess_audio(self, waveform: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Preprocess audio waveform for model input.

        Args:
            waveform: Audio waveform as numpy array; 1-D, or 2-D in either
                (samples, channels) layout (Gradio microphone) or
                (channels, samples) layout (librosa-style)
            sample_rate: Original sample rate

        Returns:
            Mono float32 tensor of exactly `max_samples` samples, normalized.
        """
        # Convert to float32 if needed
        if waveform.dtype != np.float32:
            waveform = waveform.astype(np.float32)

        # BUG FIX: collapse to mono BEFORE resampling. The original converted
        # after resampling using librosa.to_mono, which expects channel-first
        # (channels, n); Gradio microphone arrays are (samples, channels), so
        # stereo input was resampled along the wrong axis and mishandled.
        if waveform.ndim > 1:
            # The channel axis is the shorter one for any real recording
            # (2 channels vs thousands of samples).
            channel_axis = int(np.argmin(waveform.shape))
            waveform = waveform.mean(axis=channel_axis).astype(np.float32)

        # Resample if necessary
        if sample_rate != self.target_sample_rate:
            waveform = librosa.resample(
                waveform,
                orig_sr=sample_rate,
                target_sr=self.target_sample_rate
            )

        # Trim or zero-pad to exactly max_duration.
        if len(waveform) > self.max_samples:
            waveform = waveform[:self.max_samples]
        elif len(waveform) < self.max_samples:
            padding = self.max_samples - len(waveform)
            waveform = np.pad(waveform, (0, padding), mode='constant', constant_values=0)

        # Normalize audio (after padding, matching the original pipeline order)
        waveform = self._normalize_audio(waveform)

        # Convert to tensor
        return torch.from_numpy(waveform).float()

    def _normalize_audio(self, waveform: np.ndarray) -> np.ndarray:
        """
        Normalize audio waveform to a target RMS of 0.1, then clip.

        Args:
            waveform: Input waveform

        Returns:
            Normalized waveform in [-1, 1]
        """
        # RMS normalization; the factor 10 scales the signal down to
        # RMS ~= 0.1 to leave headroom and prevent clipping.
        rms = np.sqrt(np.mean(waveform**2))
        if rms > 0:
            waveform = waveform / (rms * 10)

        # Clip to [-1, 1] range
        return np.clip(waveform, -1.0, 1.0)

    def process_audio_file(self, audio_path: str) -> torch.Tensor:
        """
        Complete audio processing pipeline from file to tensor.

        Args:
            audio_path: Path to audio file

        Returns:
            Preprocessed audio tensor ready for model input
        """
        waveform, sample_rate = self.load_audio(audio_path)
        return self.preprocess_audio(waveform, sample_rate)

    def process_audio_array(self, audio_array: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Process audio from numpy array (e.g., from Gradio microphone input).

        Args:
            audio_array: Audio data as numpy array
            sample_rate: Sample rate of the audio

        Returns:
            Preprocessed audio tensor
        """
        return self.preprocess_audio(audio_array, sample_rate)
config.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration file for the Keyword Spotting App.
3
+ Contains authentication and app settings.
4
+ """
5
+
6
+ import os
7
+ from typing import Tuple, Optional
8
+
9
+
10
class AppConfig:
    """Configuration class for the app."""

    # Fallback credentials used when no env vars override them.
    DEFAULT_USERNAME = "admin"
    DEFAULT_PASSWORD = "kws2024"

    # Server defaults.
    DEFAULT_PORT = 7860
    DEFAULT_HOST = "0.0.0.0"

    # Env var values (lowercased) treated as "enabled" for boolean flags.
    _TRUTHY = ("true", "1", "yes")

    @staticmethod
    def get_auth_credentials() -> Optional[Tuple[str, str]]:
        """
        Return (username, password), or None when authentication is disabled.

        KWS_USERNAME / KWS_PASSWORD override the defaults (both must be set);
        KWS_NO_AUTH=true disables authentication entirely.
        """
        username = os.getenv("KWS_USERNAME")
        password = os.getenv("KWS_PASSWORD")

        # Opt-out flag wins over any configured credentials.
        if os.getenv("KWS_NO_AUTH", "").lower() in AppConfig._TRUTHY:
            return None

        if username and password:
            return (username, password)
        return (AppConfig.DEFAULT_USERNAME, AppConfig.DEFAULT_PASSWORD)

    @staticmethod
    def get_server_config() -> dict:
        """
        Return server launch settings assembled from environment variables.

        Returns:
            Dict with server_name, server_port, share and debug entries.
        """
        truthy = AppConfig._TRUTHY
        return {
            "server_name": os.getenv("KWS_HOST", AppConfig.DEFAULT_HOST),
            "server_port": int(os.getenv("KWS_PORT", AppConfig.DEFAULT_PORT)),
            "share": os.getenv("KWS_SHARE", "false").lower() in truthy,
            "debug": os.getenv("KWS_DEBUG", "false").lower() in truthy,
        }

    @staticmethod
    def print_config_info():
        """Print the effective server and auth configuration to stdout."""
        auth = AppConfig.get_auth_credentials()
        server = AppConfig.get_server_config()

        print("🔧 Configuración de la aplicación:")
        print(f" Host: {server['server_name']}")
        print(f" Puerto: {server['server_port']}")
        print(f" Compartir públicamente: {server['share']}")
        print(f" Modo debug: {server['debug']}")

        # NOTE(review): this echoes the password in plaintext to stdout —
        # convenient for local debugging, risky in shared logs.
        if auth:
            print("🔐 Autenticación habilitada:")
            print(f" Usuario: {auth[0]}")
            print(f" Contraseña: {auth[1]}")
        else:
            print("🔓 Autenticación deshabilitada")

        print("\n💡 Para cambiar la configuración, use variables de entorno:")
        print(" KWS_USERNAME=tu_usuario")
        print(" KWS_PASSWORD=tu_contraseña")
        print(" KWS_NO_AUTH=true (para deshabilitar autenticación)")
        print(" KWS_HOST=127.0.0.1 (para acceso local únicamente)")
        print(" KWS_PORT=8080 (para cambiar puerto)")
        print(" KWS_SHARE=true (para crear enlace público)")
        print(" KWS_DEBUG=true (para modo debug)")
85
+
86
+
87
# Quick access functions
def get_auth() -> Optional[Tuple[str, str]]:
    """Module-level shortcut for AppConfig.get_auth_credentials()."""
    credentials = AppConfig.get_auth_credentials()
    return credentials
91
+
92
def get_server_config() -> dict:
    """Module-level shortcut for AppConfig.get_server_config()."""
    config = AppConfig.get_server_config()
    return config
95
+
96
def print_config() -> None:
    """Module-level shortcut for AppConfig.print_config_info()."""
    AppConfig.print_config_info()
requirements_hf.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Optimized requirements for Hugging Face Spaces
2
+ gradio==4.44.0
3
+ torch>=2.0.0
4
+ transformers>=4.30.0
5
+ librosa>=0.10.0
6
+ numpy>=1.21.0
7
+ soundfile>=0.12.0
8
+ openai-whisper>=20231117
9
+ scipy>=1.7.0
whisper_classifier.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Alternative keyword spotter using Whisper for transcription + text matching.
3
+ This approach transcribes the audio first, then matches keywords in the text.
4
+ """
5
+
6
+ import torch
7
+ import numpy as np
8
+ from typing import List, Dict
9
+ import warnings
10
+ import re
11
+ from difflib import SequenceMatcher
12
+
13
+ warnings.filterwarnings("ignore")
14
+
15
# Optional dependency guard: openai-whisper may be absent. The flag lets
# the classes below (and HybridKeywordSpotter) degrade gracefully instead
# of failing at import time.
try:
    import whisper
    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False
    print("⚠️ Whisper not available. Install with: pip install openai-whisper")
21
+
22
+
23
class WhisperKeywordSpotter:
    """Keyword spotter using Whisper transcription + text matching."""

    # Whisper models consume 16 kHz mono audio.
    WHISPER_SAMPLE_RATE = 16000

    def __init__(self, model_size: str = "base"):
        """
        Initialize the Whisper-based keyword spotter.

        Args:
            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')

        Raises:
            ImportError: If openai-whisper is not installed.
        """
        if not WHISPER_AVAILABLE:
            raise ImportError("Whisper is not available. Install with: pip install openai-whisper")

        self.model_size = model_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        print(f"Loading Whisper model: {model_size}")
        print(f"Using device: {self.device}")

        try:
            self.model = whisper.load_model(model_size, device=self.device)
            print("Whisper model loaded successfully!")
        except Exception as e:
            print(f"Error loading Whisper model: {e}")
            raise

    def prepare_keywords(self, keywords: str) -> List[str]:
        """Split a comma-separated string into lowercase, non-empty keywords."""
        if not keywords.strip():
            return []

        keyword_list = [kw.strip().lower() for kw in keywords.split(",")]
        return [kw for kw in keyword_list if kw]

    def transcribe_audio(self, audio_tensor: torch.Tensor, input_sample_rate: int = 48000) -> str:
        """
        Transcribe audio using Whisper.

        Args:
            audio_tensor: 1-D audio tensor.
            input_sample_rate: Sample rate of audio_tensor. Defaults to 48000
                because the upstream AudioProcessor produces 48 kHz audio.

        Returns:
            Lowercased transcription, or "" on failure.
        """
        try:
            # Convert to numpy and ensure it's float32
            audio_np = audio_tensor.numpy().astype(np.float32)

            # BUG FIX: the original code only downsampled clips longer than
            # 30 s worth of 16 kHz samples, so short 48 kHz audio reached
            # Whisper interpreted as 16 kHz (3x slowed). Always decimate to
            # Whisper's 16 kHz when the input rate is a higher multiple.
            factor = input_sample_rate // self.WHISPER_SAMPLE_RATE
            if factor > 1:
                # Simple decimation without an anti-alias filter; adequate
                # for speech, but a proper resampler would be better.
                audio_np = audio_np[::factor]

            # Whisper expects samples in [-1, 1].
            if audio_np.max() > 1.0 or audio_np.min() < -1.0:
                audio_np = np.clip(audio_np, -1.0, 1.0)

            # Transcribe
            result = self.model.transcribe(
                audio_np,
                language="es",  # Spanish
                task="transcribe",
                fp16=False,
                verbose=False
            )

            transcription = result["text"].strip().lower()
            print(f"📝 Transcription: '{transcription}'")

            return transcription

        except Exception as e:
            print(f"Error transcribing audio: {e}")
            return ""

    def calculate_keyword_similarity(self, transcription: str, keyword: str) -> float:
        """
        Calculate similarity between transcription and keyword.

        Exact or word-boundary matches score 1.0; otherwise the best fuzzy
        per-word ratio is combined with a down-weighted whole-string ratio.

        Args:
            transcription: Transcribed text
            keyword: Target keyword

        Returns:
            Similarity score (0-1)
        """
        if not transcription or not keyword:
            return 0.0

        # Method 1: Exact substring match
        if keyword in transcription:
            return 1.0

        # Method 2: Word boundary match
        word_pattern = r'\b' + re.escape(keyword) + r'\b'
        if re.search(word_pattern, transcription):
            return 1.0

        # Method 3: Fuzzy matching against each word in the transcription
        max_similarity = 0.0
        for word in transcription.split():
            clean_word = re.sub(r'[^\w]', '', word)  # strip punctuation
            if clean_word:
                similarity = SequenceMatcher(None, clean_word, keyword).ratio()
                max_similarity = max(max_similarity, similarity)

        # Method 4: Overall sequence similarity as fallback (weighted less)
        overall_similarity = SequenceMatcher(None, transcription, keyword).ratio()
        return max(max_similarity, overall_similarity * 0.7)

    def classify_keywords(self, audio_tensor: torch.Tensor, keywords: str) -> Dict[str, float]:
        """
        Perform keyword classification using transcription.

        Args:
            audio_tensor: Preprocessed audio tensor
            keywords: Comma-separated keywords string

        Returns:
            Mapping keyword -> probability score, or {"error": message}.
        """
        try:
            keyword_list = self.prepare_keywords(keywords)
            if not keyword_list:
                return {"error": "No valid keywords provided"}

            transcription = self.transcribe_audio(audio_tensor)
            if not transcription:
                # No speech recognized: return uniformly low scores.
                return {keyword: 0.1 for keyword in keyword_list}

            return {
                keyword: round(self.calculate_keyword_similarity(transcription, keyword), 4)
                for keyword in keyword_list
            }

        except Exception as e:
            error_msg = f"Classification error: {str(e)}"
            print(error_msg)
            return {"error": error_msg}
177
+
178
+
179
class HybridKeywordSpotter:
    """Hybrid approach combining multiple methods."""

    def __init__(self):
        """Bring up the available back-ends; any that fail to load stay None."""
        self.whisper_spotter = None
        self.clap_spotter = None

        # Preferred back-end: Whisper transcription matching.
        try:
            if WHISPER_AVAILABLE:
                self.whisper_spotter = WhisperKeywordSpotter("base")
        except Exception as exc:
            print(f"⚠️ Could not initialize Whisper: {exc}")

        # Secondary back-end: CLAP audio-text similarity.
        try:
            from improved_classifier import ImprovedZeroShotKeywordSpotter
            self.clap_spotter = ImprovedZeroShotKeywordSpotter()
        except Exception as exc:
            print(f"⚠️ Could not initialize CLAP: {exc}")

    def classify_keywords(self, audio_tensor: torch.Tensor, keywords: str) -> Dict[str, float]:
        """
        Classify with the best available back-end.

        Order: Whisper first (usually stronger on speech), CLAP as fallback,
        and finally an all-zero result so callers always get a dict.

        Args:
            audio_tensor: Preprocessed audio tensor
            keywords: Comma-separated keywords string

        Returns:
            Mapping keyword -> probability score.
        """
        if self.whisper_spotter:
            try:
                scores = self.whisper_spotter.classify_keywords(audio_tensor, keywords)
                if "error" not in scores:
                    return scores
            except Exception as exc:
                print(f"Whisper failed: {exc}")

        if self.clap_spotter:
            try:
                return self.clap_spotter.classify_keywords_simple(audio_tensor, keywords)
            except Exception as exc:
                print(f"CLAP failed: {exc}")

        # Last resort: zero score for every requested keyword.
        return {
            token.strip(): 0.0
            for token in keywords.split(",")
            if token.strip()
        }