""" Whisper-only keyword spotter for zero-shot audio keyword detection. Uses Whisper transcription + text matching without CLAP dependencies. """ import torch import numpy as np from typing import List, Dict import warnings import re from difflib import SequenceMatcher warnings.filterwarnings("ignore") try: import whisper WHISPER_AVAILABLE = True except ImportError: WHISPER_AVAILABLE = False print("⚠️ Whisper not available. Install with: pip install openai-whisper") class WhisperKeywordSpotter: """Keyword spotter using Whisper transcription + text matching.""" def __init__(self, model_size: str = "base"): """ Initialize the Whisper-based keyword spotter. Args: model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large') """ if not WHISPER_AVAILABLE: raise ImportError("Whisper is not available. Install with: pip install openai-whisper") self.model_size = model_size self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Loading Whisper model: {model_size}") print(f"Using device: {self.device}") try: self.model = whisper.load_model(model_size, device=self.device) print("Whisper model loaded successfully!") except Exception as e: print(f"Error loading Whisper model: {e}") raise def prepare_keywords(self, keywords: str) -> List[str]: """Prepare and clean keyword list.""" if not keywords.strip(): return [] keyword_list = [kw.strip().lower() for kw in keywords.split(",")] keyword_list = [kw for kw in keyword_list if kw] return keyword_list def transcribe_audio(self, audio_tensor: torch.Tensor) -> str: """ Transcribe audio using Whisper. Args: audio_tensor: Audio tensor (will be resampled for Whisper) Returns: Transcribed text """ try: # Convert to numpy and ensure it's float32 audio_np = audio_tensor.numpy().astype(np.float32) # Whisper expects 16kHz, but our audio is 48kHz, so we need to resample # Simple downsampling (not ideal but works for testing) if len(audio_np) > 16000 * 30: # If longer than 30 seconds at 16kHz # Downsample from 48kHz to 16kHz audio_np = audio_np[::3] # Simple decimation # Ensure audio is in the right range [-1, 1] if audio_np.max() > 1.0 or audio_np.min() < -1.0: audio_np = np.clip(audio_np, -1.0, 1.0) # Transcribe result = self.model.transcribe( audio_np, language="es", # Spanish task="transcribe", fp16=False, verbose=False ) transcription = result["text"].strip().lower() print(f"📝 Transcription: '{transcription}'") return transcription except Exception as e: print(f"Error transcribing audio: {e}") return "" def calculate_keyword_similarity(self, transcription: str, keyword: str) -> float: """ Calculate similarity between transcription and keyword. Args: transcription: Transcribed text keyword: Target keyword Returns: Similarity score (0-1) """ if not transcription or not keyword: return 0.0 # Method 1: Exact match if keyword in transcription: return 1.0 # Method 2: Word boundary match word_pattern = r'\b' + re.escape(keyword) + r'\b' if re.search(word_pattern, transcription): return 1.0 # Method 3: Fuzzy matching for each word in transcription words = transcription.split() max_similarity = 0.0 for word in words: # Clean word (remove punctuation) clean_word = re.sub(r'[^\w]', '', word) if clean_word: similarity = SequenceMatcher(None, clean_word, keyword).ratio() max_similarity = max(max_similarity, similarity) # Method 4: Overall sequence similarity as fallback overall_similarity = SequenceMatcher(None, transcription, keyword).ratio() return max(max_similarity, overall_similarity * 0.7) # Weight overall similarity less def classify_keywords(self, audio_tensor: torch.Tensor, keywords: str) -> Dict[str, float]: """ Perform keyword classification using transcription. Args: audio_tensor: Preprocessed audio tensor keywords: Comma-separated keywords string Returns: Dictionary mapping keywords to probability scores """ try: # Prepare keywords keyword_list = self.prepare_keywords(keywords) if not keyword_list: return {"error": "No valid keywords provided"} # Transcribe audio transcription = self.transcribe_audio(audio_tensor) if not transcription: # If no transcription, return low scores return {keyword: 0.1 for keyword in keyword_list} # Calculate similarities results = {} for keyword in keyword_list: similarity = self.calculate_keyword_similarity(transcription, keyword) results[keyword] = round(similarity, 4) return results except Exception as e: error_msg = f"Classification error: {str(e)}" print(error_msg) return {"error": error_msg} def change_model(self, new_model_size: str): """ Change the Whisper model size. Args: new_model_size: New model size to load """ if new_model_size != self.model_size: print(f"Changing model from {self.model_size} to {new_model_size}") self.model_size = new_model_size try: self.model = whisper.load_model(new_model_size, device=self.device) print(f"Successfully loaded {new_model_size} model!") return True except Exception as e: print(f"Error loading {new_model_size} model: {e}") return False return True