Pranav Mishra committed
Commit df60ee3 · 1 Parent(s): 1772a46

Streamline backend: Remove Whisper/Wav2Vec2 models and dependencies


- Removed whisper_digit_processor.py, wav2vec2_processor.py, and faster_whisper_processor.py
- Removed local_whisper.py and the related VAD utilities
- Simplified requirements_hf.txt: removed transformers, webrtcvad, and the CPU-specific PyTorch pins
- Kept only the 3 core ML models (MFCC, Mel CNN, Raw CNN) plus the external API (see the initialization sketch below)
- Reduced build size and complexity for a reliable HF Spaces deployment
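For orientation, here is a minimal sketch of what processor startup reduces to after this commit. It assumes only what the app.py diff below shows: the four import paths and an is_configured() check on each processor; the registry keys and the logging format are illustrative, not taken from the repo.

# Sketch of the streamlined startup after this commit (assumptions noted above).
from audio_processors.external_api import ExternalAPIProcessor
from audio_processors.ml_mfcc_processor import MLMFCCProcessor
from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor
from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor

def initialize_processors() -> dict:
    """Build the processor registry: 3 local ML models + the external API."""
    candidates = {
        'ml_mfcc': MLMFCCProcessor,        # key names are illustrative
        'ml_mel_cnn': MLMelCNNProcessor,
        'ml_raw_cnn': MLRawCNNProcessor,
        'external_api': ExternalAPIProcessor,
    }
    procs = {}
    for name, cls in candidates.items():
        try:
            proc = cls()
            if proc.is_configured():
                procs[name] = proc  # only register processors that came up cleanly
        except Exception as exc:
            print(f"[FAIL] Failed to initialize {name}: {exc}")
    return procs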

app.py CHANGED
@@ -12,9 +12,8 @@ from typing import Dict, Any, Optional
 from dotenv import load_dotenv
 import numpy as np
 
-# Import audio processors (only essential ones for deployment)
+# Import audio processors (only the 3 ML models + external API)
 from audio_processors.external_api import ExternalAPIProcessor
-from audio_processors.whisper_digit_processor import WhisperDigitProcessor
 from audio_processors.ml_mfcc_processor import MLMFCCProcessor
 from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor
 from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor
@@ -80,14 +79,7 @@ def initialize_processors():
     except Exception as e:
         app.logger.error(f"[FAIL] Failed to initialize External API: {str(e)}")
 
-    # Whisper digit processor as another fallback
-    try:
-        whisper_processor = WhisperDigitProcessor()
-        if whisper_processor.is_configured():
-            procs['whisper_digit'] = whisper_processor
-            app.logger.info("[OK] Whisper digit processor initialized")
-    except Exception as e:
-        app.logger.error(f"[FAIL] Failed to initialize Whisper: {str(e)}")
+    # Removed whisper processors to reduce dependencies and build size
 
     app.logger.info(f"Processor initialization complete:")
     app.logger.info(f" ML Models loaded: {ml_working_count}/3")
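With the Whisper fallback removed, a request can only be served by the three local models or the external API. A hedged sketch of the resulting fallback chain (the procs registry and the process_audio() interface come from this diff and the deleted files below; the ML key names are assumptions):

def predict_digit(procs: dict, audio_bytes: bytes) -> str:
    """Try the local ML processors first, then the external API.

    Sketch only: key names other than 'external_api' are hypothetical.
    """
    for name in ('ml_mfcc', 'ml_mel_cnn', 'ml_raw_cnn', 'external_api'):
        proc = procs.get(name)
        if proc is None:
            continue  # processor failed to initialize or was not configured
        result = proc.process_audio(audio_bytes)
        if len(result) == 1 and result.isdigit():
            return result  # first clean single-digit answer wins
    return "?"  # mirrors the deleted processors' unknown-digit convention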
audio_processors/faster_whisper_processor.py DELETED
@@ -1,219 +0,0 @@
-"""
-Faster-Whisper processor with built-in VAD (2025 approach)
-More reliable than manual WebRTC VAD + Whisper coordination
-"""
-
-import numpy as np
-import io
-import time
-import logging
-from typing import Dict, Any, Optional
-
-try:
-    from faster_whisper import WhisperModel
-    FASTER_WHISPER_AVAILABLE = True
-except ImportError:
-    FASTER_WHISPER_AVAILABLE = False
-    WhisperModel = None
-
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class FasterWhisperDigitProcessor(AudioProcessor):
-    """
-    Modern 2025 approach using faster-whisper with built-in VAD.
-    Much more reliable than manual WebRTC VAD coordination.
-    """
-
-    def __init__(self):
-        """Initialize faster-whisper processor with built-in VAD."""
-        super().__init__("Faster-Whisper with VAD")
-
-        if not FASTER_WHISPER_AVAILABLE:
-            logger.error("faster-whisper not available. Install with: pip install faster-whisper")
-            self.model = None
-            return
-
-        self.model = None
-        self.device = "cuda" if self._cuda_available() else "cpu"
-
-        # Digit mapping
-        self.digit_map = {
-            "zero": "0", "one": "1", "two": "2", "three": "3",
-            "four": "4", "five": "5", "six": "6", "seven": "7",
-            "eight": "8", "nine": "9",
-            "oh": "0", "o": "0", "for": "4", "fore": "4",
-            "to": "2", "too": "2", "tu": "2", "tree": "3",
-            "free": "3", "ate": "8", "ait": "8"
-        }
-
-        # Statistics
-        self.total_predictions = 0
-        self.successful_predictions = 0
-        self.failed_predictions = 0
-
-        self._initialize_model()
-
-    def _cuda_available(self) -> bool:
-        """Check if CUDA is available."""
-        try:
-            import torch
-            return torch.cuda.is_available()
-        except ImportError:
-            return False
-
-    def _initialize_model(self):
-        """Initialize faster-whisper model with VAD."""
-        if not FASTER_WHISPER_AVAILABLE:
-            return
-
-        try:
-            logger.info("Initializing faster-whisper model with built-in VAD...")
-
-            # Initialize faster-whisper model
-            self.model = WhisperModel(
-                "tiny",  # Use tiny model for speed
-                device=self.device,
-                compute_type="float16" if self.device == "cuda" else "int8"
-            )
-
-            logger.info(f"Faster-Whisper model initialized on {self.device}")
-
-        except Exception as e:
-            logger.error(f"Failed to initialize faster-whisper: {e}")
-            self.model = None
-
-    def is_configured(self) -> bool:
-        """Check if processor is configured."""
-        return self.model is not None and FASTER_WHISPER_AVAILABLE
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Process audio with built-in VAD and return predicted digit.
-
-        Args:
-            audio_data: Raw audio bytes
-
-        Returns:
-            str: Predicted digit (0-9) or error message
-        """
-        if not self.is_configured():
-            return "error: Model not configured"
-
-        try:
-            # Convert audio to numpy array
-            audio_array = self._convert_audio_bytes(audio_data)
-            if audio_array is None:
-                return "error: Audio conversion failed"
-
-            # Use faster-whisper with built-in VAD
-            segments, info = self.model.transcribe(
-                audio_array,
-                language="en",
-                # Built-in VAD parameters - much better than manual VAD
-                vad_filter=True,
-                vad_parameters=dict(
-                    min_silence_duration_ms=100,  # 100ms minimum silence
-                    speech_pad_ms=30  # 30ms padding around speech
-                )
-            )
-
-            # Process transcription results
-            transcriptions = []
-            for segment in segments:
-                text = segment.text.strip().lower()
-                if text:
-                    transcriptions.append(text)
-
-            if not transcriptions:
-                return "error: No speech detected"
-
-            # Combine all segments and extract digit
-            full_text = " ".join(transcriptions)
-            digit = self._text_to_digit(full_text)
-
-            logger.debug(f"Faster-Whisper: '{full_text}' -> '{digit}'")
-
-            if digit in "0123456789":
-                self.successful_predictions += 1
-                return digit
-            else:
-                self.failed_predictions += 1
-                return f"unclear: {full_text}"
-
-        except Exception as e:
-            logger.error(f"Faster-Whisper processing failed: {e}")
-            self.failed_predictions += 1
-            return f"error: {str(e)}"
-        finally:
-            self.total_predictions += 1
-
-    def _convert_audio_bytes(self, audio_data: bytes) -> Optional[np.ndarray]:
-        """Convert audio bytes to numpy array for faster-whisper."""
-        try:
-            # Check if it's a WAV file
-            if audio_data.startswith(b'RIFF'):
-                import soundfile as sf
-                audio_buffer = io.BytesIO(audio_data)
-                audio_array, sample_rate = sf.read(audio_buffer, dtype='float32')
-
-                # Convert stereo to mono if needed
-                if len(audio_array.shape) > 1:
-                    audio_array = np.mean(audio_array, axis=1)
-
-                return audio_array
-            else:
-                # Raw PCM data
-                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
-                return audio_array / 32768.0
-
-        except Exception as e:
-            logger.error(f"Audio conversion failed: {e}")
-            return None
-
-    def _text_to_digit(self, text: str) -> str:
-        """Convert transcribed text to digit."""
-        text = text.strip().lower()
-
-        # Remove common words
-        text = text.replace("the", "").replace("number", "").replace("digit", "")
-        text = text.strip()
-
-        # Direct mapping
-        if text in self.digit_map:
-            return self.digit_map[text]
-
-        # Word-by-word check
-        for word in text.split():
-            if word in self.digit_map:
-                return self.digit_map[word]
-
-        # Check for digits in text
-        digits = [char for char in text if char.isdigit()]
-        if digits:
-            return digits[0]
-
-        return text
-
-    def get_model_info(self) -> Dict[str, Any]:
-        """Get model information."""
-        return {
-            'model_name': 'faster-whisper-tiny',
-            'model_type': 'Speech-to-Text with VAD',
-            'has_builtin_vad': True,
-            'device': self.device,
-            'available': FASTER_WHISPER_AVAILABLE
-        }
-
-    def get_stats(self) -> Dict[str, Any]:
-        """Get processing statistics."""
-        success_rate = self.successful_predictions / max(1, self.total_predictions)
-
-        return {
-            'total_predictions': self.total_predictions,
-            'successful_predictions': self.successful_predictions,
-            'failed_predictions': self.failed_predictions,
-            'success_rate': round(success_rate, 3),
-            'model_available': self.is_configured()
-        }
audio_processors/local_whisper.py DELETED
@@ -1,158 +0,0 @@
-import logging
-import numpy as np
-from typing import Optional
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class LocalWhisperProcessor(AudioProcessor):
-    """
-    Local Whisper model using transformers pipeline.
-    Fallback when API is unavailable.
-    """
-
-    def __init__(self):
-        super().__init__("Local Whisper (Tiny)")
-        self.pipeline = None
-        self.model_name = "openai/whisper-tiny"
-        self.is_initialized = False
-
-    def _initialize_model(self):
-        """Lazy initialization of the model"""
-        if self.is_initialized:
-            return
-
-        try:
-            logger.info(f"Loading local Whisper model: {self.model_name}")
-
-            from transformers import pipeline
-            import torch
-
-            # Use CPU for compatibility, GPU if available
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-
-            self.pipeline = pipeline(
-                "automatic-speech-recognition",
-                model=self.model_name,
-                device=device,
-                torch_dtype=torch.float32,  # Use float32 to avoid dtype issues
-                return_timestamps=False  # We only need text
-            )
-
-            logger.info(f"Local Whisper model loaded on {device}")
-            self.is_initialized = True
-
-        except ImportError as e:
-            logger.error("transformers library not installed. Run: pip install transformers torch")
-            raise Exception("transformers library required for local processing")
-        except Exception as e:
-            logger.error(f"Failed to load local Whisper model: {str(e)}")
-            raise Exception(f"Local model initialization failed: {str(e)}")
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Process audio using local Whisper model.
-
-        Args:
-            audio_data: Raw audio bytes (WAV format preferred)
-
-        Returns:
-            Predicted digit as string ('0'-'9')
-
-        Raises:
-            Exception: If processing fails
-        """
-        try:
-            # Initialize model on first use
-            self._initialize_model()
-
-            # Convert audio bytes to numpy array
-            from utils.audio_utils import audio_to_numpy
-            audio_array, sample_rate = audio_to_numpy(audio_data)
-
-            # Resample to 16kHz if needed (Whisper expects 16kHz)
-            if sample_rate != 16000:
-                logger.debug(f"Resampling from {sample_rate}Hz to 16kHz")
-                import librosa
-                audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
-
-            # Process with pipeline
-            logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz")
-            result = self.pipeline(audio_array)
-
-            if not result or 'text' not in result:
-                logger.error(f"Unexpected pipeline result: {result}")
-                raise Exception("Invalid pipeline output")
-
-            transcribed_text = result['text'].strip().lower()
-            logger.debug(f"Local Whisper transcription: '{transcribed_text}'")
-
-            # Extract digit from transcription
-            predicted_digit = self._extract_digit(transcribed_text)
-
-            if predicted_digit is None:
-                logger.warning(f"No digit found in transcription: '{transcribed_text}'")
-                return "?"
-
-            return predicted_digit
-
-        except Exception as e:
-            logger.error(f"Local Whisper processing failed: {str(e)}")
-            raise Exception(f"Local processing error: {str(e)}")
-
-    def _extract_digit(self, text: str) -> Optional[str]:
-        """
-        Extract digit from transcribed text.
-        Handles both numerical ('1', '2') and word forms ('one', 'two').
-        """
-        import re
-
-        # Word to digit mapping
-        word_to_digit = {
-            'zero': '0', 'oh': '0',
-            'one': '1', 'won': '1',
-            'two': '2', 'to': '2', 'too': '2',
-            'three': '3', 'tree': '3',
-            'four': '4', 'for': '4', 'fore': '4',
-            'five': '5',
-            'six': '6', 'sick': '6',
-            'seven': '7',
-            'eight': '8', 'ate': '8',
-            'nine': '9', 'niner': '9'
-        }
-
-        # First, try to find a direct digit
-        digit_match = re.search(r'\b([0-9])\b', text)
-        if digit_match:
-            return digit_match.group(1)
-
-        # Then try word forms
-        words = text.split()
-        for word in words:
-            clean_word = re.sub(r'[^\w]', '', word.lower())
-            if clean_word in word_to_digit:
-                return word_to_digit[clean_word]
-
-        # Try partial matches for robustness
-        for word, digit in word_to_digit.items():
-            if word in text:
-                return digit
-
-        return None
-
-    def is_configured(self) -> bool:
-        """Check if local model can be initialized."""
-        try:
-            import transformers
-            import torch
-            return True
-        except ImportError:
-            return False
-
-    def test_connection(self) -> bool:
-        """Test local model functionality."""
-        try:
-            self._initialize_model()
-            return True
-        except:
-            return False
audio_processors/wav2vec2_processor.py DELETED
@@ -1,170 +0,0 @@
-import logging
-import numpy as np
-from typing import Optional
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class Wav2Vec2Processor(AudioProcessor):
-    """
-    Wav2Vec2 model processor for speech recognition.
-    Lightweight alternative to Whisper.
-    """
-
-    def __init__(self):
-        super().__init__("Wav2Vec2 (Facebook)")
-        self.processor = None
-        self.model = None
-        self.model_name = "facebook/wav2vec2-base-960h"
-        self.is_initialized = False
-
-    def _initialize_model(self):
-        """Lazy initialization of the model"""
-        if self.is_initialized:
-            return
-
-        try:
-            logger.info(f"Loading Wav2Vec2 model: {self.model_name}")
-
-            from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-            import torch
-
-            # Load processor and model
-            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
-            self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
-
-            # Move to GPU if available
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            self.model = self.model.to(device)
-            self.device = device
-
-            logger.info(f"Wav2Vec2 model loaded on {device}")
-            self.is_initialized = True
-
-        except ImportError as e:
-            logger.error("transformers library not installed. Run: pip install transformers torch")
-            raise Exception("transformers library required for Wav2Vec2 processing")
-        except Exception as e:
-            logger.error(f"Failed to load Wav2Vec2 model: {str(e)}")
-            raise Exception(f"Wav2Vec2 model initialization failed: {str(e)}")
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Process audio using Wav2Vec2 model.
-
-        Args:
-            audio_data: Raw audio bytes (WAV format preferred)
-
-        Returns:
-            Predicted digit as string ('0'-'9')
-
-        Raises:
-            Exception: If processing fails
-        """
-        try:
-            # Initialize model on first use
-            self._initialize_model()
-
-            # Convert audio bytes to numpy array
-            from utils.audio_utils import audio_to_numpy
-            audio_array, sample_rate = audio_to_numpy(audio_data)
-
-            # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz)
-            if sample_rate != 16000:
-                logger.debug(f"Resampling from {sample_rate}Hz to 16kHz")
-                import librosa
-                audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
-
-            logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz")
-
-            # Process with Wav2Vec2
-            import torch
-
-            # Tokenize audio
-            input_values = self.processor(
-                audio_array,
-                return_tensors="pt",
-                padding="longest",
-                sampling_rate=16000
-            ).input_values.to(self.device)
-
-            # Get logits
-            with torch.no_grad():
-                logits = self.model(input_values).logits
-
-            # Get predicted tokens
-            predicted_ids = torch.argmax(logits, dim=-1)
-
-            # Decode transcription
-            transcription = self.processor.batch_decode(predicted_ids)[0].lower().strip()
-            logger.debug(f"Wav2Vec2 transcription: '{transcription}'")
-
-            # Extract digit from transcription
-            predicted_digit = self._extract_digit(transcription)
-
-            if predicted_digit is None:
-                logger.warning(f"No digit found in transcription: '{transcription}'")
-                return "?"
-
-            return predicted_digit
-
-        except Exception as e:
-            logger.error(f"Wav2Vec2 processing failed: {str(e)}")
-            raise Exception(f"Wav2Vec2 processing error: {str(e)}")
-
-    def _extract_digit(self, text: str) -> Optional[str]:
-        """
-        Extract digit from transcribed text.
-        Handles both numerical ('1', '2') and word forms ('one', 'two').
-        """
-        import re
-
-        # Word to digit mapping
-        word_to_digit = {
-            'zero': '0', 'oh': '0',
-            'one': '1', 'won': '1',
-            'two': '2', 'to': '2', 'too': '2',
-            'three': '3', 'tree': '3',
-            'four': '4', 'for': '4', 'fore': '4', 'full': '4',  # "full" often misheard as "four"
-            'five': '5',
-            'six': '6', 'sick': '6',
-            'seven': '7',
-            'eight': '8', 'ate': '8',
-            'nine': '9', 'niner': '9'
-        }
-
-        # First, try to find a direct digit
-        digit_match = re.search(r'\b([0-9])\b', text)
-        if digit_match:
-            return digit_match.group(1)
-
-        # Then try word forms
-        words = text.split()
-        for word in words:
-            clean_word = re.sub(r'[^\w]', '', word.lower())
-            if clean_word in word_to_digit:
-                return word_to_digit[clean_word]
-
-        # Try partial matches for robustness
-        for word, digit in word_to_digit.items():
-            if word in text:
-                return digit
-
-        return None
-
-    def is_configured(self) -> bool:
-        """Check if Wav2Vec2 model can be initialized."""
-        try:
-            import transformers
-            import torch
-            return True
-        except ImportError:
-            return False
-
-    def test_connection(self) -> bool:
-        """Test Wav2Vec2 model functionality."""
-        try:
-            self._initialize_model()
-            return True
-        except:
-            return False
audio_processors/whisper_digit_processor.py DELETED
@@ -1,429 +0,0 @@
-"""
-Whisper-based digit recognition processor
-Specialized implementation for spoken digit recognition (0-9)
-"""
-
-import numpy as np
-import io
-import time
-import logging
-from typing import Dict, Any, Optional
-import torch
-from transformers import pipeline
-import soundfile as sf
-
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class WhisperDigitProcessor(AudioProcessor):
-    """
-    Whisper-based digit recognition processor using Hugging Face transformers.
-    Optimized for single digit recognition with mapping from text to numbers.
-    """
-
-    def __init__(self):
-        """Initialize Whisper digit processor with optimized settings."""
-        super().__init__("Whisper Digit Recognition")
-        self.model = None
-        self.device = 0 if torch.cuda.is_available() else -1
-
-        # Digit mapping for text-to-number conversion
-        self.digit_map = {
-            "zero": "0", "one": "1", "two": "2", "three": "3",
-            "four": "4", "five": "5", "six": "6", "seven": "7",
-            "eight": "8", "nine": "9",
-            # Common variations and alternatives
-            "oh": "0", "o": "0",
-            "for": "4", "fore": "4", "to": "2", "too": "2", "tu": "2",
-            "tree": "3", "free": "3", "ate": "8", "ait": "8"
-        }
-
-        # Reverse mapping for validation
-        self.number_words = set(self.digit_map.keys())
-
-        # Statistics tracking
-        self.total_predictions = 0
-        self.successful_predictions = 0
-        self.failed_predictions = 0
-        self.average_inference_time = 0.0
-
-        self._initialize_model()
-
-    def _initialize_model(self):
-        """Initialize the Whisper model with optimal settings for digit recognition."""
-        try:
-            logger.info("Initializing Whisper model for digit recognition...")
-
-            # Use Whisper tiny model for fast inference
-            self.model = pipeline(
-                "automatic-speech-recognition",
-                model="openai/whisper-tiny",
-                device=self.device,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                return_timestamps=False  # We don't need timestamps for single digits
-            )
-
-            logger.info(f"Whisper model initialized successfully on device: {self.device}")
-
-            # Test model with dummy input
-            test_audio = np.random.randn(16000).astype(np.float32)  # 1 second of noise
-            try:
-                test_result = self.model(test_audio)
-                logger.info("Model test successful")
-            except Exception as e:
-                logger.warning(f"Model test failed but model loaded: {e}")
-
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to initialize Whisper model: {e}")
-            return False
-
-    def is_configured(self) -> bool:
-        """Check if the processor is properly configured."""
-        return self.model is not None
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Predict digit from audio data.
-
-        Args:
-            audio_data: Raw audio bytes (WAV format preferred)
-
-        Returns:
-            str: Predicted digit (0-9) or error message
-        """
-        if not self.is_configured():
-            return "error: Model not configured"
-
-        try:
-            # Convert audio bytes to numpy array
-            audio_array = self._convert_audio_to_array(audio_data)
-
-            if audio_array is None:
-                return "error: Invalid audio format"
-
-            # Ensure proper sample rate and format
-            audio_array = self._preprocess_audio(audio_array)
-
-            # Run Whisper inference
-            result = self.model(audio_array)
-            text = result["text"].strip().lower()
-
-            # Convert text to digit
-            digit = self._text_to_digit(text)
-
-            # Enhanced logging to debug transcription issues
-            logger.info(f"🎤 Whisper transcription: '{text}' -> digit: '{digit}'")
-            logger.info(f"📊 Audio stats: duration={len(audio_array)/16000:.2f}s, samples={len(audio_array)}, max_val={np.max(np.abs(audio_array)):.3f}")
-
-            if digit in "0123456789":
-                self.successful_predictions += 1
-                return digit
-            else:
-                self.failed_predictions += 1
-                return f"unclear: {text}"
-
-        except Exception as e:
-            logger.error(f"Whisper prediction failed: {e}")
-            self.failed_predictions += 1
-            return f"error: {str(e)}"
-        finally:
-            self.total_predictions += 1
-
-    def _convert_audio_to_array(self, audio_data: bytes) -> Optional[np.ndarray]:
-        """
-        Convert audio bytes to numpy array.
-
-        Args:
-            audio_data: Raw audio bytes (could be WAV file or raw PCM from VAD)
-
-        Returns:
-            np.ndarray: Audio samples or None if conversion failed
-        """
-        # First check if this looks like raw PCM data from VAD (no file headers)
-        if len(audio_data) < 100 or not audio_data.startswith(b'RIFF'):
-            # This is likely raw PCM data from WebRTC VAD
-            try:
-                logger.debug("Processing raw PCM data from VAD segment")
-                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
-                audio_array = audio_array / 32768.0  # Normalize to [-1, 1]
-                self._original_sample_rate = 16000  # WebRTC VAD uses 16kHz
-                return audio_array
-            except Exception as e:
-                logger.error(f"Failed to process raw PCM data: {e}")
-                return None
-
-        # This looks like a complete audio file (WAV, etc.)
-        try:
-            # Try to read as audio file using soundfile
-            audio_buffer = io.BytesIO(audio_data)
-            audio_array, sample_rate = sf.read(audio_buffer, dtype='float32')
-
-            # Handle stereo to mono conversion
-            if len(audio_array.shape) > 1:
-                audio_array = np.mean(audio_array, axis=1)
-
-            # Store original sample rate for resampling
-            self._original_sample_rate = sample_rate
-
-            logger.debug(f"Successfully loaded audio file: {len(audio_array)} samples at {sample_rate}Hz")
-            return audio_array
-
-        except Exception as e:
-            logger.warning(f"Audio file conversion failed with soundfile: {e}")
-
-            # Final fallback: treat as raw PCM
-            try:
-                logger.debug("Fallback: treating as raw PCM data")
-                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
-                audio_array = audio_array / 32768.0  # Normalize to [-1, 1]
-                self._original_sample_rate = 16000  # Assume 16kHz
-                return audio_array
-            except Exception as e2:
-                logger.error(f"All audio conversion methods failed: {e2}")
-                return None
-
-    def _preprocess_audio(self, audio_array: np.ndarray) -> np.ndarray:
-        """
-        Preprocess audio for optimal Whisper performance.
-
-        Args:
-            audio_array: Raw audio samples
-
-        Returns:
-            np.ndarray: Preprocessed audio
-        """
-        # Resample to 16kHz if needed (Whisper's expected input)
-        target_sample_rate = 16000
-
-        if hasattr(self, '_original_sample_rate') and self._original_sample_rate != target_sample_rate:
-            try:
-                import librosa
-                audio_array = librosa.resample(
-                    audio_array,
-                    orig_sr=self._original_sample_rate,
-                    target_sr=target_sample_rate
-                )
-                logger.debug(f"Resampled audio from {self._original_sample_rate}Hz to {target_sample_rate}Hz")
-            except ImportError:
-                logger.warning("librosa not available for resampling, using original audio")
-            except Exception as e:
-                logger.warning(f"Resampling failed: {e}, using original audio")
-
-        # Trim silence from edges
-        audio_array = self._trim_silence(audio_array)
-
-        # Ensure minimum length (Whisper works better with at least 0.1s)
-        min_samples = int(0.1 * target_sample_rate)
-        if len(audio_array) < min_samples:
-            # Pad with silence
-            padding = min_samples - len(audio_array)
-            audio_array = np.pad(audio_array, (0, padding), mode='constant', constant_values=0)
-
-        # Normalize audio
-        max_val = np.max(np.abs(audio_array))
-        if max_val > 0:
-            audio_array = audio_array / max_val * 0.9  # Prevent clipping
-
-        return audio_array
-
-    def _trim_silence(self, audio_array: np.ndarray, silence_threshold: float = 0.01) -> np.ndarray:
-        """
-        Trim silence from beginning and end of audio.
-
-        Args:
-            audio_array: Audio samples
-            silence_threshold: Threshold for silence detection
-
-        Returns:
-            np.ndarray: Trimmed audio
-        """
-        if len(audio_array) == 0:
-            return audio_array
-
-        # Find non-silent regions
-        energy = audio_array ** 2
-        non_silent = energy > silence_threshold
-
-        if not np.any(non_silent):
-            return audio_array  # All silence, return as is
-
-        # Find first and last non-silent samples
-        first_sound = np.argmax(non_silent)
-        last_sound = len(non_silent) - np.argmax(non_silent[::-1]) - 1
-
-        # Add small padding
-        padding_samples = int(0.05 * 16000)  # 50ms padding
-        first_sound = max(0, first_sound - padding_samples)
-        last_sound = min(len(audio_array) - 1, last_sound + padding_samples)
-
-        return audio_array[first_sound:last_sound + 1]
-
-    def _text_to_digit(self, text: str) -> str:
-        """
-        Convert transcribed text to digit.
-
-        Args:
-            text: Transcribed text from Whisper
-
-        Returns:
-            str: Digit (0-9) or original text if no match
-        """
-        # Clean the text
-        text = text.strip().lower()
-
-        # Remove common punctuation and extra words
-        text = text.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
-        text = text.replace("the", "").replace("number", "").replace("digit", "")
-        text = text.strip()
-
-        # Try direct mapping
-        if text in self.digit_map:
-            return self.digit_map[text]
-
-        # Try word-by-word mapping for multi-word responses
-        words = text.split()
-        for word in words:
-            if word in self.digit_map:
-                return self.digit_map[word]
-
-        # Check if it's already a digit
-        if len(text) == 1 and text.isdigit():
-            return text
-
-        # Look for digits in the text
-        digits_found = [char for char in text if char.isdigit()]
-        if digits_found:
-            return digits_found[0]  # Return first digit found
-
-        # No clear digit found
-        return text
-
-    def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]:
-        """
-        Predict digit with detailed timing and confidence metrics.
-
-        Args:
-            audio_data: Raw audio bytes
-
-        Returns:
-            dict: Prediction results with timing and metadata
-        """
-        start_time = time.time()
-
-        predicted_digit = self.process_audio(audio_data)
-
-        inference_time = time.time() - start_time
-
-        # Update average inference time
-        if self.total_predictions > 0:
-            self.average_inference_time = (
-                (self.average_inference_time * (self.total_predictions - 1) + inference_time)
-                / self.total_predictions
-            )
-
-        # Determine success status
-        is_successful = predicted_digit in "0123456789"
-        confidence_score = 1.0 if is_successful else 0.0
-
-        # Extract any error information
-        error_info = None
-        if predicted_digit.startswith("error:"):
-            error_info = predicted_digit[6:].strip()
-            predicted_digit = "unknown"
-        elif predicted_digit.startswith("unclear:"):
-            error_info = f"Transcription unclear: {predicted_digit[8:].strip()}"
-            predicted_digit = "unknown"
-
-        result = {
-            'predicted_digit': predicted_digit,
-            'confidence_score': confidence_score,
-            'inference_time': round(inference_time, 4),
-            'success': is_successful,
-            'timestamp': time.time(),
-            'model': 'openai/whisper-tiny',
-            'method': 'whisper_digit'
-        }
-
-        if error_info:
-            result['error'] = error_info
-
-        return result
-
-    def get_model_info(self) -> Dict[str, Any]:
-        """
-        Get information about the loaded model.
-
-        Returns:
-            dict: Model information
-        """
-        return {
-            'model_name': 'openai/whisper-tiny',
-            'model_type': 'Speech-to-Text (ASR)',
-            'specialized_for': 'Digit Recognition (0-9)',
-            'device': 'GPU' if self.device >= 0 else 'CPU',
-            'torch_device': self.device,
-            'supports_streaming': False,
-            'supported_languages': ['en'],
-            'digit_mappings': len(self.digit_map)
-        }
-
-    def get_stats(self) -> Dict[str, Any]:
-        """
-        Get processor statistics.
-
-        Returns:
-            dict: Performance statistics
-        """
-        success_rate = (
-            self.successful_predictions / max(1, self.total_predictions)
-        )
-
-        return {
-            'total_predictions': self.total_predictions,
-            'successful_predictions': self.successful_predictions,
-            'failed_predictions': self.failed_predictions,
-            'success_rate': round(success_rate, 3),
-            'average_inference_time': round(self.average_inference_time, 4),
-            'model_loaded': self.is_configured()
-        }
-
-    def test_with_sample_audio(self) -> Dict[str, Any]:
-        """
-        Test the processor with generated sample audio.
-
-        Returns:
-            dict: Test results
-        """
-        if not self.is_configured():
-            return {'error': 'Model not configured'}
-
-        try:
-            # Generate simple test audio (1 second of tone)
-            sample_rate = 16000
-            duration = 1.0
-            frequency = 440  # A note
-
-            t = np.linspace(0, duration, int(sample_rate * duration))
-            test_audio = 0.3 * np.sin(2 * np.pi * frequency * t).astype(np.float32)
-
-            # Run prediction
-            start_time = time.time()
-            result = self.model(test_audio)
-            test_time = time.time() - start_time
-
-            return {
-                'test_successful': True,
-                'test_time': round(test_time, 4),
-                'transcription': result.get('text', 'No text'),
-                'model_responsive': True
-            }
-
-        except Exception as e:
-            return {
-                'test_successful': False,
-                'error': str(e),
-                'model_responsive': False
-            }
requirements_hf.txt CHANGED
@@ -1,4 +1,4 @@
-# HF Spaces Requirements - Essential packages only
+# HF Spaces Requirements - Essential packages only (3 ML models only)
 # Core Flask API
 Flask==2.3.3
 Flask-CORS==4.0.0
@@ -12,15 +12,11 @@ scipy==1.11.4
 soundfile==0.12.1
 
 # ML Models - PyTorch (CPU optimized for HF Spaces)
-torch==2.0.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu
-torchaudio==2.0.2+cpu --extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.0.1
+torchaudio==2.0.2
 
 # Essential ML utilities
 scikit-learn==1.3.2
-transformers==4.35.2
-
-# Audio format handling
-webrtcvad==2.0.10
 
 # Logging and utilities
 tqdm==4.66.1
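A quick way to sanity-check the slimmed dependency set after a Space rebuild is an import smoke test. A sketch, assuming it runs inside the Space's environment; the module list is derived from the pins above (note that Flask-CORS imports as flask_cors and scikit-learn as sklearn), not from anything in the repo:

# Import smoke test for the trimmed requirements_hf.txt (sketch).
import importlib

for module in ("flask", "flask_cors", "numpy", "scipy", "soundfile",
               "torch", "torchaudio", "sklearn", "tqdm"):
    importlib.import_module(module)  # raises ImportError if a pin failed to install
print("core dependencies import OK (transformers/webrtcvad no longer required)")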
utils/enhanced_vad.py DELETED
@@ -1,571 +0,0 @@
-"""
-Enhanced VAD Implementation with ffmpeg support and comprehensive debugging
-"""
-
-import numpy as np
-import logging
-import subprocess
-import tempfile
-import os
-import time
-import wave
-import io
-from pathlib import Path
-from typing import Dict, List, Tuple, Optional, Any
-from threading import Thread, Lock
-import asyncio
-import concurrent.futures
-
-# Try to import WebRTC VAD
-try:
-    import webrtcvad
-    WEBRTC_AVAILABLE = True
-except ImportError:
-    WEBRTC_AVAILABLE = False
-    logging.warning("webrtcvad not available - using fallback VAD implementation")
-
-logger = logging.getLogger(__name__)
-
-class EnhancedVAD:
-    """
-    Enhanced Voice Activity Detection with ffmpeg integration and comprehensive debugging.
-
-    Features:
-    - ffmpeg-based audio preprocessing
-    - Multiple VAD implementations (WebRTC, simple energy-based)
-    - Comprehensive audio validation and debugging
-    - Async audio chunk saving
-    - Real-time performance monitoring
-    """
-
-    def __init__(self,
-                 sample_rate: int = 16000,
-                 frame_duration_ms: int = 30,
-                 aggressiveness: int = 1,
-                 min_speech_duration: float = 0.4,
-                 max_speech_duration: float = 3.0,
-                 silence_threshold: float = 0.01):
-        """
-        Initialize Enhanced VAD.
-
-        Args:
-            sample_rate: Target sample rate (Hz)
-            frame_duration_ms: Frame duration in milliseconds
-            aggressiveness: VAD aggressiveness (0-3)
-            min_speech_duration: Minimum speech segment duration (seconds)
-            max_speech_duration: Maximum speech segment duration (seconds)
-            silence_threshold: Energy threshold for silence detection
-        """
-        self.sample_rate = sample_rate
-        self.frame_duration_ms = frame_duration_ms
-        self.frame_size = int(sample_rate * frame_duration_ms / 1000)
-        self.aggressiveness = aggressiveness
-        self.min_speech_duration = min_speech_duration
-        self.max_speech_duration = max_speech_duration
-        self.silence_threshold = silence_threshold
-
-        # Initialize WebRTC VAD if available
-        self.webrtc_vad = None
-        if WEBRTC_AVAILABLE:
-            try:
-                self.webrtc_vad = webrtcvad.Vad(aggressiveness)
-                logger.info(f"WebRTC VAD initialized (aggressiveness: {aggressiveness})")
-            except Exception as e:
-                logger.error(f"Failed to initialize WebRTC VAD: {e}")
-                self.webrtc_vad = None
-
-        # Check ffmpeg availability
-        self.ffmpeg_available = self._check_ffmpeg_available()
-
-        # Performance tracking
-        self.stats = {
-            'total_chunks_processed': 0,
-            'speech_segments_detected': 0,
-            'processing_time_total': 0.0,
-            'last_processing_time': 0.0,
-            'ffmpeg_conversions': 0,
-            'audio_validation_failures': 0,
-            'webrtc_available': WEBRTC_AVAILABLE and self.webrtc_vad is not None,
-            'ffmpeg_available': self.ffmpeg_available
-        }
-
-        # Async processing
-        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
-        self.save_lock = Lock()
-
-        logger.info(f"Enhanced VAD initialized:")
-        logger.info(f" Sample rate: {sample_rate} Hz")
-        logger.info(f" Frame duration: {frame_duration_ms} ms")
-        logger.info(f" WebRTC VAD: {'Available' if self.webrtc_vad else 'Not available'}")
-        logger.info(f" ffmpeg: {'Available' if self.ffmpeg_available else 'Not available'}")
-
-    def _check_ffmpeg_available(self) -> bool:
-        """Check if ffmpeg is available."""
-        try:
-            result = subprocess.run(['ffmpeg', '-version'],
-                                    capture_output=True, text=True, timeout=5)
-            return result.returncode == 0
-        except Exception:
-            return False
-
-    def preprocess_audio_with_ffmpeg(self, audio_data: bytes) -> Optional[bytes]:
-        """
-        Preprocess audio using ffmpeg for optimal VAD performance.
-
-        Args:
-            audio_data: Raw audio bytes
-
-        Returns:
-            Preprocessed audio bytes or None if processing fails
-        """
-        if not self.ffmpeg_available:
-            logger.debug("ffmpeg not available for audio preprocessing")
-            return None
-
-        temp_input = None
-        temp_output = None
-
-        try:
-            # Create temporary files
-            with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as temp_input:
-                temp_input.write(audio_data)
-                temp_input.flush()
-
-            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_output:
-                pass
-
-            # ffmpeg command for VAD-optimized preprocessing
-            ffmpeg_cmd = [
-                'ffmpeg',
-                '-i', temp_input.name,
-                '-ar', str(self.sample_rate),  # Resample to target rate
-                '-ac', '1',  # Convert to mono
-                '-acodec', 'pcm_s16le',  # 16-bit PCM
-                '-af', 'highpass=f=80,lowpass=f=8000,dynaudnorm=f=10:g=3',  # Audio filters for speech
-                '-f', 'wav',
-                '-loglevel', 'error',
-                '-y',
-                temp_output.name
-            ]
-
-            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, timeout=10)
-
-            if result.returncode == 0:
-                with open(temp_output.name, 'rb') as f:
-                    preprocessed_audio = f.read()
-
-                self.stats['ffmpeg_conversions'] += 1
-                logger.debug(f"ffmpeg preprocessing: {len(audio_data)} -> {len(preprocessed_audio)} bytes")
-                return preprocessed_audio
-            else:
-                logger.error(f"ffmpeg preprocessing failed: {result.stderr}")
-                return None
-
-        except Exception as e:
-            logger.error(f"ffmpeg preprocessing error: {e}")
-            return None
-
-        finally:
-            # Cleanup
-            try:
-                if temp_input and os.path.exists(temp_input.name):
-                    os.unlink(temp_input.name)
-                if temp_output and os.path.exists(temp_output.name):
-                    os.unlink(temp_output.name)
-            except Exception:
-                pass
-
-    def validate_and_debug_audio(self, audio_data: bytes) -> Dict[str, Any]:
-        """
-        Comprehensive audio validation and debugging.
-
-        Args:
-            audio_data: Audio data to validate
-
-        Returns:
-            Validation results and debugging information
-        """
-        debug_info = {
-            'size_bytes': len(audio_data),
-            'valid_wav': False,
-            'sample_rate': None,
-            'channels': None,
-            'duration': 0.0,
-            'energy_level': 0.0,
-            'is_silent': True,
-            'format_detected': 'unknown',
-            'issues': []
-        }
-
-        try:
-            # Check minimum size
-            if len(audio_data) < 44:
-                debug_info['issues'].append(f"Too small: {len(audio_data)} bytes (need ≥44 for WAV)")
-                return debug_info
-
-            # Detect format by header
-            if audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:20]:
-                debug_info['format_detected'] = 'wav'
-            elif audio_data.startswith(b'OggS'):
-                debug_info['format_detected'] = 'ogg'
-            elif audio_data.startswith(b'\x1a\x45\xdf\xa3'):
-                debug_info['format_detected'] = 'webm'
-
-            # Try to parse as WAV
-            try:
-                with wave.open(io.BytesIO(audio_data), 'rb') as wav:
-                    debug_info['valid_wav'] = True
-                    debug_info['sample_rate'] = wav.getframerate()
-                    debug_info['channels'] = wav.getnchannels()
-                    debug_info['duration'] = wav.getnframes() / wav.getframerate()
-
-                    # Read audio samples for analysis
-                    wav.rewind()
-                    frames = wav.readframes(wav.getnframes())
-
-                    if len(frames) > 0:
-                        # Convert to numpy for analysis
-                        audio_array = np.frombuffer(frames, dtype=np.int16)
-
-                        # Calculate energy level
-                        energy = np.sqrt(np.mean(audio_array.astype(np.float32) ** 2))
-                        debug_info['energy_level'] = float(energy)
-                        debug_info['is_silent'] = energy < (self.silence_threshold * 32768)
-
-                        # Check for constant beep (common issue)
-                        if len(audio_array) > 100:
-                            # Check if audio is a constant tone (beep)
-                            diff = np.diff(audio_array)
-                            if np.std(diff) < 100:  # Very low variation
-                                debug_info['issues'].append("Constant tone/beep detected")
-
-                            # Check dynamic range
-                            if np.max(audio_array) - np.min(audio_array) < 1000:
-                                debug_info['issues'].append("Very low dynamic range")
-
-            except Exception as wav_error:
-                debug_info['issues'].append(f"WAV parsing failed: {wav_error}")
-
-            # Additional format-specific checks
-            if debug_info['format_detected'] in ['ogg', 'webm'] and not debug_info['valid_wav']:
-                debug_info['issues'].append("Non-WAV format detected - requires conversion")
-
-            logger.debug(f"Audio validation: {debug_info}")
-
-            if debug_info['issues']:
-                self.stats['audio_validation_failures'] += 1
-                logger.warning(f"Audio validation issues: {debug_info['issues']}")
-
-            return debug_info
-
-        except Exception as e:
-            debug_info['issues'].append(f"Validation error: {str(e)}")
-            logger.error(f"Audio validation failed: {e}")
-            return debug_info
-
-    def detect_speech_segments(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
-        """
-        Detect speech segments using multiple methods.
-
-        Args:
-            audio_data: Input audio data
-
-        Returns:
-            List of (segment_audio, segment_info) tuples
-        """
-        start_time = time.time()
-
-        # Validate and debug audio
-        debug_info = self.validate_and_debug_audio(audio_data)
-
-        segments = []
-
-        try:
-            # Preprocess with ffmpeg if available
-            processed_audio = self.preprocess_audio_with_ffmpeg(audio_data)
-            if processed_audio:
-                working_audio = processed_audio
-                logger.debug("Using ffmpeg-preprocessed audio for VAD")
-            else:
-                working_audio = audio_data
-                logger.debug("Using original audio for VAD")
-
-            # Re-validate processed audio
-            if processed_audio:
-                processed_debug = self.validate_and_debug_audio(processed_audio)
-                logger.debug(f"Processed audio validation: {processed_debug}")
-
-            # Method 1: WebRTC VAD (if available)
-            if self.webrtc_vad and debug_info['valid_wav']:
-                webrtc_segments = self._webrtc_vad_detection(working_audio)
-                segments.extend(webrtc_segments)
-                logger.debug(f"WebRTC VAD found {len(webrtc_segments)} segments")
-
-            # Method 2: Energy-based VAD (fallback)
-            if not segments or debug_info['issues']:
-                energy_segments = self._energy_based_vad(working_audio)
-                segments.extend(energy_segments)
-                logger.debug(f"Energy VAD found {len(energy_segments)} segments")
-
-            # Method 3: Simple duration-based segmentation (last resort)
-            if not segments and len(audio_data) > 8000:  # > 8KB
-                fallback_segment = self._create_fallback_segment(working_audio)
-                if fallback_segment:
-                    segments.append(fallback_segment)
-                    logger.debug("Used fallback segmentation")
-
-            processing_time = time.time() - start_time
-            self.stats['total_chunks_processed'] += 1
-            self.stats['speech_segments_detected'] += len(segments)
-            self.stats['processing_time_total'] += processing_time
-            self.stats['last_processing_time'] = processing_time
-
-            logger.debug(f"VAD processing complete: {len(segments)} segments in {processing_time:.3f}s")
-
-            return segments
-
-        except Exception as e:
-            logger.error(f"Speech segment detection failed: {e}")
-            return []
-
-    def _webrtc_vad_detection(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
-        """WebRTC-based speech detection."""
-        segments = []
-
-        try:
-            frame_size_bytes = self.frame_size * 2  # 16-bit = 2 bytes per sample
-            frames = []
-
-            # Extract frames
-            for i in range(0, len(audio_data) - frame_size_bytes + 1, frame_size_bytes):
-                frame = audio_data[i:i + frame_size_bytes]
-                if len(frame) == frame_size_bytes:
-                    frames.append(frame)
-
-            if len(frames) < 5:  # Need minimum frames
-                return segments
-
-            # VAD processing
-            speech_frames = []
-            for frame in frames:
-                try:
-                    is_speech = self.webrtc_vad.is_speech(frame, self.sample_rate)
-                    speech_frames.append((frame, is_speech))
-                except Exception as e:
-                    logger.debug(f"WebRTC VAD frame processing failed: {e}")
-                    speech_frames.append((frame, False))
-
-            # Group consecutive speech frames
-            current_segment = []
-            for frame, is_speech in speech_frames:
-                if is_speech:
-                    current_segment.append(frame)
-                else:
-                    if len(current_segment) > 0:
-                        # End of speech segment
-                        segment_audio = b''.join(current_segment)
-                        segment_duration = len(current_segment) * self.frame_duration_ms / 1000
-
-                        if segment_duration >= self.min_speech_duration:
-                            segments.append((segment_audio, {
-                                'duration': segment_duration,
-                                'method': 'webrtc_vad',
-                                'frames': len(current_segment)
-                            }))
-
-                        current_segment = []
-
-            # Handle final segment
-            if current_segment:
-                segment_audio = b''.join(current_segment)
-                segment_duration = len(current_segment) * self.frame_duration_ms / 1000
-
-                if segment_duration >= self.min_speech_duration:
-                    segments.append((segment_audio, {
-                        'duration': segment_duration,
-                        'method': 'webrtc_vad',
-                        'frames': len(current_segment)
-                    }))
-
-            return segments
-
-        except Exception as e:
-            logger.error(f"WebRTC VAD detection failed: {e}")
-            return []
-
-    def _energy_based_vad(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
-        """Energy-based speech detection."""
-        segments = []
-
-        try:
-            # Try to parse as WAV or raw PCM
-            try:
-                with wave.open(io.BytesIO(audio_data), 'rb') as wav:
-                    frames = wav.readframes(wav.getnframes())
-                    sample_rate = wav.getframerate()
-            except:
-                # Assume raw 16-bit PCM
-                frames = audio_data
-                sample_rate = self.sample_rate
-
-            if len(frames) < 1000:  # Too short
-                return segments
-
-            # Convert to numpy array
-            audio_samples = np.frombuffer(frames, dtype=np.int16)
-            audio_float = audio_samples.astype(np.float32) / 32768.0
-
-            # Calculate energy in overlapping windows
-            window_size = int(sample_rate * 0.1)  # 100ms windows
-            hop_size = window_size // 2
-
-            energies = []
-            for i in range(0, len(audio_float) - window_size, hop_size):
-                window = audio_float[i:i + window_size]
-                energy = np.sqrt(np.mean(window ** 2))
-                energies.append(energy)
-
-            if len(energies) < 3:
-                return segments
-
-            # Adaptive threshold
-            mean_energy = np.mean(energies)
-            threshold = max(self.silence_threshold, mean_energy * 0.3)
-
-            # Find speech segments
-            if isinstance(energies, (list, np.ndarray)):
-                energies = np.array(energies)  # Ensure it's a numpy array
-            speech_windows = energies > threshold
-
-            # Group consecutive speech windows
-            speech_start = None
-            for i, is_speech in enumerate(speech_windows):
-                if is_speech and speech_start is None:
-                    speech_start = i
-                elif not is_speech and speech_start is not None:
-                    # End of speech
-                    start_sample = speech_start * hop_size
-                    end_sample = min(i * hop_size + window_size, len(audio_samples))
-
-                    segment_samples = audio_samples[start_sample:end_sample]
-                    segment_duration = len(segment_samples) / sample_rate
-
-                    if segment_duration >= self.min_speech_duration:
-                        # Convert back to bytes
-                        segment_audio = segment_samples.tobytes()
-
-                        segments.append((segment_audio, {
-                            'duration': segment_duration,
-                            'method': 'energy_based',
-                            'start_time': start_sample / sample_rate,
-                            'energy_threshold': threshold,
-                            'mean_energy': mean_energy
-                        }))
-
-                    speech_start = None
-
-            return segments
-
-        except Exception as e:
-            logger.error(f"Energy-based VAD failed: {e}")
-            return []
-
-    def _create_fallback_segment(self, audio_data: bytes) -> Optional[Tuple[bytes, Dict[str, Any]]]:
-        """Create a fallback segment when VAD methods fail."""
-        try:
-            # Use the entire audio as a segment if it's reasonable length
-            debug_info = self.validate_and_debug_audio(audio_data)
-
-            if debug_info['duration'] > 0:
-                duration = debug_info['duration']
-            else:
-                # Estimate duration based on size (assume 16-bit, mono, 16kHz)
-                estimated_samples = len(audio_data) // 2
-                duration = estimated_samples / self.sample_rate
-
-            if self.min_speech_duration <= duration <= self.max_speech_duration:
-                return (audio_data, {
-                    'duration': duration,
-                    'method': 'fallback',
-                    'estimated': True,
-                    'issues': debug_info['issues']
-                })
-
-            return None
-
-        except Exception as e:
-            logger.error(f"Fallback segment creation failed: {e}")
-            return None
-
-    async def save_audio_chunk_async(self, audio_data: bytes, session_id: str,
-                                     chunk_type: str = "vad_chunk") -> Optional[str]:
-        """
-        Asynchronously save audio chunk to file.
-
-        Args:
-            audio_data: Audio data to save
-            session_id: Session identifier
-            chunk_type: Type of chunk (for filename)
-
-        Returns:
-            Path to saved file or None if failed
-        """
-        def _save_chunk():
-            try:
-                with self.save_lock:
-                    timestamp = int(time.time() * 1000)
-                    filename = f"{chunk_type}_{session_id}_{timestamp}.wav"
-                    filepath = Path("output") / filename
-
-                    # Ensure output directory exists
-                    filepath.parent.mkdir(exist_ok=True)
-
-                    # Save as WAV file
-                    with open(filepath, 'wb') as f:
-                        f.write(audio_data)
-
-                    logger.debug(f"Saved audio chunk: {filepath}")
-                    return str(filepath)
-
-            except Exception as e:
-                logger.error(f"Failed to save audio chunk: {e}")
-                return None
-
-        # Run in executor to avoid blocking
-        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(self.executor, _save_chunk)
-        return result
-
-    def get_stats(self) -> Dict[str, Any]:
-        """Get comprehensive VAD statistics."""
-        stats = self.stats.copy()
-
-        if stats['total_chunks_processed'] > 0:
-            stats['average_processing_time'] = stats['processing_time_total'] / stats['total_chunks_processed']
-            stats['segments_per_chunk'] = stats['speech_segments_detected'] / stats['total_chunks_processed']
-        else:
-            stats['average_processing_time'] = 0.0
-            stats['segments_per_chunk'] = 0.0
-
-        return stats
-
-    def cleanup(self):
-        """Clean up resources."""
-        if hasattr(self, 'executor'):
-            self.executor.shutdown(wait=True)
-        logger.info("Enhanced VAD cleaned up")
-
-# Convenience function for creating enhanced VAD
-def create_enhanced_vad(config: Optional[Dict[str, Any]] = None) -> EnhancedVAD:
-    """Create enhanced VAD with optional configuration."""
-    if config is None:
-        config = {}
-
-    return EnhancedVAD(
-        sample_rate=config.get('sample_rate', 16000),
-        frame_duration_ms=config.get('frame_duration_ms', 30),
-        aggressiveness=config.get('aggressiveness', 1),
-        min_speech_duration=config.get('min_speech_duration', 0.4),
-        max_speech_duration=config.get('max_speech_duration', 3.0),
-        silence_threshold=config.get('silence_threshold', 0.01)
-    )
utils/session_manager.py DELETED
@@ -1,340 +0,0 @@
1
- """
2
- Session Management for Audio Chunk Storage
3
- Handles session creation, audio chunk saving, and folder organization
4
- """
5
-
6
- import os
7
- import time
8
- import uuid
9
- import logging
10
- import wave
11
- import numpy as np
12
- from typing import Dict, Optional, List
13
- from pathlib import Path
14
- import json
15
- import threading
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
- class SessionManager:
20
- """
21
- Manages audio recording sessions with systematic file storage.
22
- Each session gets a unique ID and folder for organized chunk storage.
23
- """
24
-
25
- def __init__(self, base_output_dir: str = "output"):
26
- """
27
- Initialize session manager.
28
-
29
- Args:
30
- base_output_dir: Base directory for all session outputs
31
- """
32
- self.base_output_dir = Path(base_output_dir)
33
- self.base_output_dir.mkdir(exist_ok=True)
34
-
35
- # Active sessions tracking
36
- self.active_sessions: Dict[str, 'AudioSession'] = {}
37
- self.lock = threading.Lock()
38
-
39
- logger.info(f"Session manager initialized with output directory: {self.base_output_dir}")
40
-
41
- def create_session(self, session_id: Optional[str] = None) -> str:
42
- """
43
- Create a new audio recording session.
44
-
45
- Args:
46
- session_id: Optional custom session ID, otherwise auto-generated
47
-
48
- Returns:
49
- str: Session ID
50
- """
51
- if not session_id:
52
- # Generate session ID with timestamp and short UUID
53
- timestamp = int(time.time())
54
- short_uuid = str(uuid.uuid4())[:8]
55
- session_id = f"session{timestamp}_{short_uuid}"
56
-
57
- with self.lock:
58
- if session_id in self.active_sessions:
59
- logger.warning(f"Session {session_id} already exists, returning existing session")
60
- return session_id
61
-
62
- # Create session object
63
- session = AudioSession(session_id, self.base_output_dir)
64
- self.active_sessions[session_id] = session
65
-
66
- logger.info(f"Created new session: {session_id}")
67
- return session_id
68
-
69
- def get_session(self, session_id: str) -> Optional['AudioSession']:
70
- """Get an existing session by ID."""
71
- with self.lock:
72
- return self.active_sessions.get(session_id)
73
-
74
- def close_session(self, session_id: str) -> bool:
75
- """
76
- Close and finalize a session.
77
-
78
- Args:
79
- session_id: Session to close
80
-
81
- Returns:
82
- bool: True if session was closed successfully
83
- """
84
- with self.lock:
85
- if session_id not in self.active_sessions:
86
- logger.warning(f"Session {session_id} not found")
87
- return False
88
-
89
- session = self.active_sessions[session_id]
90
- session.finalize()
91
- del self.active_sessions[session_id]
92
-
93
- logger.info(f"Closed session: {session_id} ({session.chunk_count} chunks saved)")
94
- return True
95
-
96
- def cleanup_old_sessions(self, max_age_hours: int = 24) -> int:
97
- """
98
- Clean up sessions older than specified hours.
99
-
100
- Args:
101
- max_age_hours: Maximum age in hours before cleanup
102
-
103
- Returns:
104
- int: Number of sessions cleaned up
105
- """
106
- cutoff_time = time.time() - (max_age_hours * 3600)
107
- cleaned_count = 0
108
-
109
- # Find old session folders
110
- for session_dir in self.base_output_dir.iterdir():
111
- if not session_dir.is_dir() or not session_dir.name.startswith('session'):
112
- continue
113
-
114
- try:
115
- # Check if session has a metadata file with creation time
116
- metadata_file = session_dir / "session_info.json"
117
- if metadata_file.exists():
118
- with open(metadata_file, 'r') as f:
119
- metadata = json.load(f)
120
- if metadata.get('created_at', 0) < cutoff_time:
121
- import shutil
122
- shutil.rmtree(session_dir)
123
- cleaned_count += 1
124
- logger.info(f"Cleaned up old session: {session_dir.name}")
125
- else:
126
- # Fallback to directory modification time
127
- if session_dir.stat().st_mtime < cutoff_time:
128
- import shutil
129
- shutil.rmtree(session_dir)
130
- cleaned_count += 1
131
- logger.info(f"Cleaned up old session: {session_dir.name}")
132
-
133
- except Exception as e:
134
- logger.error(f"Error cleaning up session {session_dir.name}: {e}")
135
-
136
- if cleaned_count > 0:
137
- logger.info(f"Cleaned up {cleaned_count} old sessions")
138
-
139
- return cleaned_count
140
-
141
- def get_session_stats(self) -> Dict:
142
- """Get statistics about all sessions."""
143
- with self.lock:
144
- stats = {
145
- 'active_sessions': len(self.active_sessions),
146
- 'total_chunks_active': sum(s.chunk_count for s in self.active_sessions.values()),
147
- 'session_details': {
148
- sid: {
149
- 'chunk_count': session.chunk_count,
150
- 'created_at': session.created_at,
151
- 'folder_path': str(session.session_dir)
152
- }
153
- for sid, session in self.active_sessions.items()
154
- }
155
- }
156
-
157
- # Count total session folders
158
- total_session_dirs = len([
159
- d for d in self.base_output_dir.iterdir()
160
- if d.is_dir() and d.name.startswith('session')
161
- ])
162
- stats['total_session_folders'] = total_session_dirs
163
-
164
- return stats
165
-
166
-
167
- class AudioSession:
168
- """
169
- Represents a single audio recording session with systematic chunk storage.
170
- """
171
-
172
- def __init__(self, session_id: str, base_output_dir: Path):
173
- """
174
- Initialize audio session.
175
-
176
- Args:
177
- session_id: Unique session identifier
178
- base_output_dir: Base directory for output
179
- """
180
- self.session_id = session_id
181
- self.created_at = time.time()
182
- self.chunk_count = 0
183
-
184
- # Create session directory
185
- self.session_dir = base_output_dir / session_id
186
- self.session_dir.mkdir(exist_ok=True)
187
-
188
- # Create subdirectories
189
- self.chunks_dir = self.session_dir / "chunks"
190
- self.chunks_dir.mkdir(exist_ok=True)
191
-
192
- # Session metadata
193
- self.metadata = {
194
- 'session_id': session_id,
195
- 'created_at': self.created_at,
196
- 'created_at_human': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.created_at)),
197
- 'chunk_count': 0,
198
- 'chunks': []
199
- }
200
-
201
- self._save_metadata()
202
- logger.info(f"Session folder created: {self.session_dir}")
203
-
204
- def save_audio_chunk(self, audio_data: bytes, prediction_result: Optional[Dict] = None,
205
- chunk_type: str = "speech") -> str:
206
- """
207
- Save an audio chunk to the session folder.
208
-
209
- Args:
210
- audio_data: Raw audio bytes (WAV format preferred)
211
- prediction_result: Optional prediction results to save alongside
212
- chunk_type: Type of chunk ("speech", "vad_segment", "raw", etc.)
213
-
214
- Returns:
215
- str: Path to saved chunk file
216
- """
217
- self.chunk_count += 1
218
-
219
- # Generate chunk filename
220
- chunk_filename = f"{self.chunk_count:03d}.wav"
221
- chunk_path = self.chunks_dir / chunk_filename
222
-
223
- try:
224
- # Save audio data
225
- if self._is_wav_format(audio_data):
226
- # Already WAV format, save directly
227
- with open(chunk_path, 'wb') as f:
228
- f.write(audio_data)
229
- logger.debug(f"Saved WAV chunk: {chunk_path}")
230
- else:
231
- # Convert raw PCM to WAV
232
- self._save_pcm_as_wav(audio_data, chunk_path)
233
- logger.debug(f"Converted and saved PCM chunk: {chunk_path}")
234
-
235
- # Update metadata
236
- chunk_info = {
237
- 'chunk_id': self.chunk_count,
238
- 'filename': chunk_filename,
239
- 'chunk_type': chunk_type,
240
- 'size_bytes': len(audio_data),
241
- 'saved_at': time.time(),
242
- 'saved_at_human': time.strftime('%Y-%m-%d %H:%M:%S'),
243
- 'audio_format': 'wav' if self._is_wav_format(audio_data) else 'pcm_converted'
244
- }
245
-
246
- # Add prediction results if provided
247
- if prediction_result:
248
- chunk_info['prediction'] = prediction_result
249
-
250
- self.metadata['chunks'].append(chunk_info)
251
- self.metadata['chunk_count'] = self.chunk_count
252
- self._save_metadata()
253
-
254
- logger.info(f"Saved audio chunk {self.chunk_count}: {chunk_path}")
255
- return str(chunk_path)
256
-
257
- except Exception as e:
258
- logger.error(f"Failed to save audio chunk {self.chunk_count}: {e}")
259
- # Rollback chunk count on failure
260
- self.chunk_count -= 1
261
- raise
262
-
263
- def _is_wav_format(self, audio_data: bytes) -> bool:
264
- """Check if audio data is in WAV format."""
265
- return audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:12]
266
-
267
- def _save_pcm_as_wav(self, pcm_data: bytes, output_path: Path,
268
- sample_rate: int = 16000, channels: int = 1, sample_width: int = 2):
269
- """
270
- Convert raw PCM data to WAV format and save.
271
-
272
- Args:
273
- pcm_data: Raw PCM bytes
274
- output_path: Output WAV file path
275
- sample_rate: Sample rate (default 16kHz for speech)
276
- channels: Number of channels (default mono)
277
- sample_width: Sample width in bytes (default 16-bit)
278
- """
279
- try:
280
- with wave.open(str(output_path), 'wb') as wav_file:
281
- wav_file.setnchannels(channels)
282
- wav_file.setsampwidth(sample_width)
283
- wav_file.setframerate(sample_rate)
284
- wav_file.writeframes(pcm_data)
285
-
286
- except Exception as e:
287
- logger.error(f"PCM to WAV conversion failed: {e}")
288
- # Fallback: save as raw PCM with .pcm extension
289
- raw_path = output_path.with_suffix('.pcm')
290
- with open(raw_path, 'wb') as f:
291
- f.write(pcm_data)
292
- logger.warning(f"Saved as raw PCM instead: {raw_path}")
293
-
294
- def _save_metadata(self):
295
- """Save session metadata to JSON file."""
296
- try:
297
- metadata_path = self.session_dir / "session_info.json"
298
- with open(metadata_path, 'w') as f:
299
- json.dump(self.metadata, f, indent=2, default=str)
300
- except Exception as e:
301
- logger.error(f"Failed to save session metadata: {e}")
302
-
303
- def finalize(self):
304
- """Finalize the session and save final metadata."""
305
- self.metadata['finalized_at'] = time.time()
306
- self.metadata['finalized_at_human'] = time.strftime('%Y-%m-%d %H:%M:%S')
307
- self.metadata['final_chunk_count'] = self.chunk_count
308
- self._save_metadata()
309
-
310
- logger.info(f"📋 Finalized session {self.session_id}: {self.chunk_count} chunks saved")
311
-
312
- def get_chunk_list(self) -> List[str]:
313
- """Get list of all chunk files in order."""
314
- chunk_files = []
315
- for i in range(1, self.chunk_count + 1):
316
- chunk_file = self.chunks_dir / f"{i:03d}.wav"
317
- if chunk_file.exists():
318
- chunk_files.append(str(chunk_file))
319
- else:
320
- # Check for .pcm fallback
321
- pcm_file = self.chunks_dir / f"{i:03d}.pcm"
322
- if pcm_file.exists():
323
- chunk_files.append(str(pcm_file))
324
- return chunk_files
325
-
326
- def get_session_summary(self) -> Dict:
327
- """Get comprehensive session summary."""
328
- return {
329
- 'session_id': self.session_id,
330
- 'created_at': self.created_at,
331
- 'chunk_count': self.chunk_count,
332
- 'session_dir': str(self.session_dir),
333
- 'chunks_dir': str(self.chunks_dir),
334
- 'chunk_files': self.get_chunk_list(),
335
- 'metadata': self.metadata
336
- }
337
-
338
-
339
- # Global session manager instance
340
- session_manager = SessionManager()
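
For reference, a minimal sketch of the removed session API, based on the deleted code above; wav_bytes and the prediction payload are illustrative placeholders:

from utils.session_manager import SessionManager  # removed in this commit

manager = SessionManager(base_output_dir='output')
sid = manager.create_session()                 # e.g. 'session1712345678_ab12cd34'
session = manager.get_session(sid)
wav_bytes = b'\x00' * 3200                     # placeholder; raw PCM is auto-converted to WAV
session.save_audio_chunk(wav_bytes,
                         prediction_result={'digit': 7, 'confidence': 0.93},
                         chunk_type='speech')  # -> output/<sid>/chunks/001.wav
manager.close_session(sid)
manager.cleanup_old_sessions(max_age_hours=24)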
utils/vad.py DELETED
@@ -1,149 +0,0 @@
1
- """
2
- Voice Activity Detection (VAD) for streaming audio processing
3
- Detects speech segments and trims silence
4
- """
5
-
6
- import numpy as np
7
- import logging
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- class VoiceActivityDetector:
12
- """Simple voice activity detector based on energy and zero-crossing rate."""
13
-
14
- def __init__(self):
15
- self.sample_rate = 16000
16
- self.frame_size = 512 # ~32ms frames at 16kHz
17
- self.hop_size = 256 # 50% overlap
18
-
19
- # VAD thresholds
20
- self.energy_threshold = 0.01 # Minimum energy for speech
21
- self.zcr_threshold = 0.3 # Zero crossing rate threshold
22
- self.min_speech_frames = 5 # Minimum frames for speech detection
23
- self.min_silence_frames = 8 # Minimum silence frames to end speech
24
-
25
- # State tracking
26
- self.is_speech_active = False
27
- self.speech_frames = 0
28
- self.silence_frames = 0
29
- self.speech_buffer = []
- self.speech_start_idx = 0  # guard: read by get_current_speech_segment() before any speech is detected
30
-
31
- logger.info("Voice Activity Detector initialized")
32
-
33
- def reset(self):
34
- """Reset VAD state."""
35
- self.is_speech_active = False
36
- self.speech_frames = 0
37
- self.silence_frames = 0
38
- self.speech_buffer = []
- self.speech_start_idx = 0
39
-
40
- def compute_energy(self, frame):
41
- """Compute energy of audio frame."""
42
- return np.mean(frame ** 2)
43
-
44
- def compute_zcr(self, frame):
45
- """Compute zero crossing rate of audio frame."""
46
- zcr = np.sum(np.abs(np.diff(np.sign(frame)))) / (2 * len(frame))
47
- return zcr
48
-
49
- def is_speech_frame(self, frame):
50
- """Determine if frame contains speech."""
51
- energy = self.compute_energy(frame)
52
- zcr = self.compute_zcr(frame)
53
-
54
- # Simple rule: speech has moderate energy and ZCR
55
- has_energy = energy > self.energy_threshold
56
- has_reasonable_zcr = zcr < self.zcr_threshold
57
-
58
- return has_energy and has_reasonable_zcr
59
-
60
- def process_chunk(self, audio_data):
61
- """
62
- Process audio chunk and return speech segments.
63
-
64
- Args:
65
- audio_data: numpy array of audio samples
66
-
67
- Returns:
68
- List of (start_sample, end_sample) tuples for speech segments
69
- """
70
- if len(audio_data) == 0:
71
- return []
72
-
73
- speech_segments = []
74
- num_frames = (len(audio_data) - self.frame_size) // self.hop_size + 1
75
-
76
- for i in range(num_frames):
77
- start_idx = i * self.hop_size
78
- end_idx = start_idx + self.frame_size
79
-
80
- if end_idx > len(audio_data):
81
- break
82
-
83
- frame = audio_data[start_idx:end_idx]
84
- is_speech = self.is_speech_frame(frame)
85
-
86
- if is_speech:
87
- self.speech_frames += 1
88
- self.silence_frames = 0
89
-
90
- if not self.is_speech_active and self.speech_frames >= self.min_speech_frames:
91
- # Speech started
92
- self.is_speech_active = True
93
- self.speech_start_idx = max(0, start_idx - self.min_speech_frames * self.hop_size)
94
- logger.debug(f"Speech started at sample {self.speech_start_idx}")
95
-
96
- else:
97
- self.silence_frames += 1
98
-
99
- if self.is_speech_active and self.silence_frames >= self.min_silence_frames:
100
- # Speech ended
101
- speech_end_idx = start_idx
102
- speech_segments.append((self.speech_start_idx, speech_end_idx))
103
- logger.debug(f"Speech ended at sample {speech_end_idx}")
104
-
105
- # Reset for next speech segment
106
- self.is_speech_active = False
107
- self.speech_frames = 0
108
- self.silence_frames = 0
109
-
110
- return speech_segments
111
-
112
- def extract_speech_segments(self, audio_data, segments):
113
- """Extract speech segments from audio data."""
114
- speech_chunks = []
115
-
116
- for start_idx, end_idx in segments:
117
- if end_idx > start_idx:
118
- segment = audio_data[start_idx:end_idx]
119
- # Trim silence from edges
120
- segment = self.trim_silence(segment)
121
- if len(segment) > self.sample_rate * 0.3: # At least 300ms
122
- speech_chunks.append(segment)
123
-
124
- return speech_chunks
125
-
126
- def trim_silence(self, audio_data, silence_threshold=0.01):
127
- """Trim silence from beginning and end of audio."""
128
- if len(audio_data) == 0:
129
- return audio_data
130
-
131
- # Find first and last non-silent samples
132
- energy = audio_data ** 2
133
- non_silent = energy > silence_threshold
134
-
135
- if not np.any(non_silent):
136
- return audio_data # All silence, return as is
137
-
138
- first_sound = np.argmax(non_silent)
139
- last_sound = len(non_silent) - np.argmax(non_silent[::-1]) - 1
140
-
141
- return audio_data[first_sound:last_sound + 1]
142
-
143
- def get_current_speech_segment(self, audio_data):
144
- """Get current ongoing speech segment if any."""
145
- if self.is_speech_active and len(audio_data) > 0:
146
- current_segment = audio_data[self.speech_start_idx:]
147
- if len(current_segment) > self.sample_rate * 0.5: # At least 500ms
148
- return self.trim_silence(current_segment)
149
- return None
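
A minimal sketch of the removed energy/ZCR detector in use, based on the deleted code above; the zeros array is a placeholder for real 16 kHz float32 audio:

import numpy as np
from utils.vad import VoiceActivityDetector  # removed in this commit

audio = np.zeros(16000, dtype=np.float32)    # placeholder: 1 s of silence
vad = VoiceActivityDetector()
segments = vad.process_chunk(audio)                    # [(start_sample, end_sample), ...]
chunks = vad.extract_speech_segments(audio, segments)  # trimmed segments of at least ~300 ms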
utils/vad_feature_integration.py DELETED
@@ -1,483 +0,0 @@
1
- """
2
- Integration module for WebRTC VAD with MFCC and Spectrogram processors
3
- Combines voice activity detection with real-time feature extraction
4
- """
5
-
6
- import numpy as np
7
- import librosa
8
- import logging
9
- from typing import Dict, List, Optional, Tuple
10
- import time
11
- from collections import deque
12
- import threading
13
- import queue
14
-
15
- from utils.webrtc_vad import WebRTCVADProcessor
16
- from audio_processors.mfcc_processor import MFCCProcessor
17
- from audio_processors.mel_spectrogram import MelSpectrogramProcessor
18
- from audio_processors.raw_spectrogram import RawSpectrogramProcessor
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
- class StreamingFeatureExtractor:
23
- """
24
- Real-time feature extraction with VAD integration.
25
- Combines WebRTC VAD with MFCC, Mel Spectrogram, and Raw Spectrogram processing.
26
- """
27
-
28
- def __init__(self, sample_rate=16000, n_mfcc=13, n_fft=2048, hop_length=512):
29
- """
30
- Initialize streaming feature extractor.
31
-
32
- Args:
33
- sample_rate: Audio sample rate
34
- n_mfcc: Number of MFCC coefficients
35
- n_fft: FFT window size
36
- hop_length: Hop length for STFT
37
- """
38
- self.sample_rate = sample_rate
39
- self.n_mfcc = n_mfcc
40
- self.n_fft = n_fft
41
- self.hop_length = hop_length
42
-
43
- # Initialize VAD processor
44
- self.vad_processor = WebRTCVADProcessor(
45
- aggressiveness=2,
46
- sample_rate=sample_rate,
47
- frame_duration=30
48
- )
49
-
50
- # Initialize feature processors
51
- self.mfcc_processor = MFCCProcessor()
52
- self.mel_processor = MelSpectrogramProcessor()
53
- self.raw_spec_processor = RawSpectrogramProcessor()
54
-
55
- # Buffers for overlapped processing
56
- self.audio_buffer = deque(maxlen=sample_rate * 2) # 2 second buffer
57
- self.feature_buffer = deque(maxlen=100) # Store recent feature vectors
58
-
59
- # Threading for real-time processing
60
- self.processing_queue = queue.Queue(maxsize=100)  # bounded so put_nowait() can raise queue.Full below; cap of 100 is arbitrary
61
- self.feature_queue = queue.Queue(maxsize=100)
62
- self.is_processing = False
63
- self.processing_thread = None
64
-
65
- # Statistics
66
- self.total_chunks_processed = 0
67
- self.features_extracted = 0
68
- self.speech_segments_processed = 0
69
-
70
- logger.info("Streaming Feature Extractor initialized")
71
-
72
- def extract_features_realtime(self, audio_chunk: bytes) -> List[Dict[str, np.ndarray]]:
73
- """
74
- Extract features from streaming audio chunk with VAD.
75
-
76
- Args:
77
- audio_chunk: Raw audio bytes
78
-
79
- Returns:
80
- List[dict]: Extracted feature dictionaries, one per detected speech segment
81
- """
82
- # Process with VAD first
83
- speech_segments = self.vad_processor.process_audio_chunk(audio_chunk)
84
-
85
- features_list = []
86
-
87
- for segment in speech_segments:
88
- # Convert bytes to numpy array
89
- audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0
90
-
91
- # Extract comprehensive features
92
- features = self._compute_streaming_features(audio_array)
93
-
94
- if features:
95
- features_list.append(features)
96
- self.features_extracted += 1
97
-
98
- self.total_chunks_processed += 1
99
-
100
- if speech_segments:
101
- self.speech_segments_processed += len(speech_segments)
102
- logger.debug(f"Extracted features from {len(speech_segments)} speech segments")
103
-
104
- return features_list
105
-
106
- def _compute_streaming_features(self, audio_data: np.ndarray) -> Optional[Dict[str, np.ndarray]]:
107
- """
108
- Compute comprehensive feature set optimized for streaming.
109
-
110
- Args:
111
- audio_data: Audio samples as numpy array
112
-
113
- Returns:
114
- dict: Feature dictionary or None if extraction fails
115
- """
116
- try:
117
- if len(audio_data) < self.n_fft:
118
- logger.debug("Audio segment too short for feature extraction")
119
- return None
120
-
121
- features = {}
122
-
123
- # Core MFCC features
124
- mfccs = librosa.feature.mfcc(
125
- y=audio_data,
126
- sr=self.sample_rate,
127
- n_mfcc=self.n_mfcc,
128
- n_fft=self.n_fft,
129
- hop_length=self.hop_length
130
- )
131
-
132
- # Statistical summaries for streaming
133
- features['mfcc_mean'] = np.mean(mfccs, axis=1)
134
- features['mfcc_std'] = np.std(mfccs, axis=1)
135
- features['mfcc_delta'] = np.mean(librosa.feature.delta(mfccs), axis=1)
136
- features['mfcc_delta2'] = np.mean(librosa.feature.delta(mfccs, order=2), axis=1)
137
-
138
- # Spectral features
139
- features['spectral_centroid'] = np.mean(
140
- librosa.feature.spectral_centroid(y=audio_data, sr=self.sample_rate)
141
- )
142
- features['spectral_bandwidth'] = np.mean(
143
- librosa.feature.spectral_bandwidth(y=audio_data, sr=self.sample_rate)
144
- )
145
- features['spectral_rolloff'] = np.mean(
146
- librosa.feature.spectral_rolloff(y=audio_data, sr=self.sample_rate)
147
- )
148
- features['zero_crossing_rate'] = np.mean(
149
- librosa.feature.zero_crossing_rate(audio_data)
150
- )
151
-
152
- # Energy features
153
- features['rms_energy'] = np.mean(librosa.feature.rms(y=audio_data))
154
-
155
- # Mel spectrogram features
156
- mel_spec = librosa.feature.melspectrogram(
157
- y=audio_data,
158
- sr=self.sample_rate,
159
- n_mels=40, # Reduced for streaming
160
- n_fft=self.n_fft,
161
- hop_length=self.hop_length
162
- )
163
- features['mel_spec_mean'] = np.mean(mel_spec, axis=1)
164
- features['mel_spec_std'] = np.std(mel_spec, axis=1)
165
-
166
- # Raw spectrogram features
167
- stft = librosa.stft(audio_data, n_fft=self.n_fft, hop_length=self.hop_length)
168
- magnitude_spec = np.abs(stft)
169
- features['raw_spec_mean'] = np.mean(magnitude_spec, axis=1)
170
- features['raw_spec_std'] = np.std(magnitude_spec, axis=1)
171
-
172
- # Harmonic and percussive components
173
- harmonic, percussive = librosa.effects.hpss(audio_data)
174
- features['harmonic_ratio'] = np.mean(harmonic ** 2) / (np.mean(audio_data ** 2) + 1e-8)
175
- features['percussive_ratio'] = np.mean(percussive ** 2) / (np.mean(audio_data ** 2) + 1e-8)
176
-
177
- # Tempo and rhythm features (simplified for streaming)
178
- tempo, _ = librosa.beat.beat_track(y=audio_data, sr=self.sample_rate)
179
- features['tempo'] = float(np.atleast_1d(tempo)[0])  # beat_track may return a 1-element array
180
-
181
- # Add metadata
182
- features['_metadata'] = {
183
- 'duration': len(audio_data) / self.sample_rate,
184
- 'sample_rate': self.sample_rate,
185
- 'n_samples': len(audio_data),
186
- 'extraction_timestamp': time.time()
187
- }
188
-
189
- return features
190
-
191
- except Exception as e:
192
- logger.error(f"Feature extraction error: {e}")
193
- return None
194
-
195
- def extract_mfcc_features(self, audio_data: np.ndarray) -> Optional[np.ndarray]:
196
- """
197
- Extract only MFCC features for lightweight processing.
198
-
199
- Args:
200
- audio_data: Audio samples
201
-
202
- Returns:
203
- np.ndarray: MFCC feature vector
204
- """
205
- try:
206
- mfccs = librosa.feature.mfcc(
207
- y=audio_data,
208
- sr=self.sample_rate,
209
- n_mfcc=self.n_mfcc,
210
- n_fft=self.n_fft,
211
- hop_length=self.hop_length
212
- )
213
- return np.mean(mfccs, axis=1)
214
- except Exception as e:
215
- logger.error(f"MFCC extraction error: {e}")
216
- return None
217
-
218
- def extract_spectrogram_features(self, audio_data: np.ndarray) -> Optional[Dict[str, np.ndarray]]:
219
- """
220
- Extract spectrogram-based features.
221
-
222
- Args:
223
- audio_data: Audio samples
224
-
225
- Returns:
226
- dict: Spectrogram features
227
- """
228
- try:
229
- # Mel spectrogram
230
- mel_spec = librosa.feature.melspectrogram(
231
- y=audio_data,
232
- sr=self.sample_rate,
233
- n_mels=80,
234
- n_fft=self.n_fft,
235
- hop_length=self.hop_length
236
- )
237
-
238
- # Raw spectrogram
239
- stft = librosa.stft(audio_data, n_fft=self.n_fft, hop_length=self.hop_length)
240
- magnitude_spec = np.abs(stft)
241
-
242
- return {
243
- 'mel_spectrogram': mel_spec,
244
- 'mel_spec_db': librosa.power_to_db(mel_spec),
245
- 'raw_spectrogram': magnitude_spec,
246
- 'raw_spec_db': librosa.amplitude_to_db(magnitude_spec)
247
- }
248
- except Exception as e:
249
- logger.error(f"Spectrogram extraction error: {e}")
250
- return None
251
-
252
- def process_with_vad_and_features(self, audio_chunk: bytes, feature_type: str = 'all') -> List[Dict]:
253
- """
254
- Process audio chunk with VAD and extract specified features.
255
-
256
- Args:
257
- audio_chunk: Raw audio bytes
258
- feature_type: Type of features to extract ('mfcc', 'spectrogram', 'all')
259
-
260
- Returns:
261
- List[dict]: Feature results for each speech segment
262
- """
263
- # Get speech segments from VAD
264
- speech_segments = self.vad_processor.process_audio_chunk(audio_chunk)
265
-
266
- results = []
267
-
268
- for i, segment in enumerate(speech_segments):
269
- # Convert to numpy array
270
- audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0
271
-
272
- segment_result = {
273
- 'segment_index': i,
274
- 'segment_duration': len(audio_array) / self.sample_rate,
275
- 'segment_samples': len(audio_array)
276
- }
277
-
278
- # Extract requested features
279
- if feature_type == 'mfcc':
280
- mfcc_features = self.extract_mfcc_features(audio_array)
281
- if mfcc_features is not None:
282
- segment_result['mfcc'] = mfcc_features
283
-
284
- elif feature_type == 'spectrogram':
285
- spec_features = self.extract_spectrogram_features(audio_array)
286
- if spec_features is not None:
287
- segment_result.update(spec_features)
288
-
289
- elif feature_type == 'all':
290
- comprehensive_features = self._compute_streaming_features(audio_array)
291
- if comprehensive_features is not None:
292
- segment_result.update(comprehensive_features)
293
-
294
- results.append(segment_result)
295
-
296
- return results
297
-
298
- def start_streaming_processing(self):
299
- """Start background thread for streaming processing."""
300
- if self.is_processing:
301
- return
302
-
303
- self.is_processing = True
304
- self.processing_thread = threading.Thread(target=self._streaming_worker, daemon=True)
305
- self.processing_thread.start()
306
- logger.info("Started streaming feature processing")
307
-
308
- def stop_streaming_processing(self):
309
- """Stop background streaming processing."""
310
- self.is_processing = False
311
- if self.processing_thread:
312
- self.processing_thread.join(timeout=1.0)
313
- logger.info("Stopped streaming feature processing")
314
-
315
- def add_audio_chunk(self, audio_chunk: bytes, feature_type: str = 'all'):
316
- """
317
- Add audio chunk to processing queue.
318
-
319
- Args:
320
- audio_chunk: Raw audio bytes
321
- feature_type: Type of features to extract
322
- """
323
- if self.is_processing:
324
- try:
325
- self.processing_queue.put_nowait((audio_chunk, feature_type))
326
- except queue.Full:
327
- logger.warning("Processing queue full, dropping chunk")
328
-
329
- def get_feature_results(self) -> List[Dict]:
330
- """
331
- Get all available feature extraction results.
332
-
333
- Returns:
334
- List[dict]: Available feature results
335
- """
336
- results = []
337
- try:
338
- while True:
339
- result = self.feature_queue.get_nowait()
340
- results.append(result)
341
- except queue.Empty:
342
- pass
343
- return results
344
-
345
- def _streaming_worker(self):
346
- """Background worker for streaming feature processing."""
347
- while self.is_processing:
348
- try:
349
- # Get audio chunk with timeout
350
- audio_chunk, feature_type = self.processing_queue.get(timeout=0.1)
351
-
352
- # Process chunk
353
- start_time = time.time()
354
- results = self.process_with_vad_and_features(audio_chunk, feature_type)
355
- processing_time = time.time() - start_time
356
-
357
- # Add processing metadata
358
- for result in results:
359
- result['processing_time'] = processing_time
360
- result['timestamp'] = time.time()
361
-
362
- # Add results to output queue
363
- for result in results:
364
- try:
365
- self.feature_queue.put_nowait(result)
366
- except queue.Full:
367
- logger.warning("Feature queue full, dropping result")
368
-
369
- except queue.Empty:
370
- continue
371
- except Exception as e:
372
- logger.error(f"Streaming feature processing error: {e}")
373
-
374
- def get_stats(self) -> Dict:
375
- """
376
- Get feature extraction statistics.
377
-
378
- Returns:
379
- dict: Processing statistics
380
- """
381
- vad_stats = self.vad_processor.get_stats()
382
-
383
- return {
384
- 'total_chunks_processed': self.total_chunks_processed,
385
- 'features_extracted': self.features_extracted,
386
- 'speech_segments_processed': self.speech_segments_processed,
387
- 'vad_stats': vad_stats,
388
- 'is_processing': self.is_processing,
389
- 'queue_sizes': {
390
- 'processing_queue': self.processing_queue.qsize(),
391
- 'feature_queue': self.feature_queue.qsize()
392
- }
393
- }
394
-
395
- def reset_state(self):
396
- """Reset all processing state."""
397
- self.vad_processor.reset_state()
398
- self.audio_buffer.clear()
399
- self.feature_buffer.clear()
400
-
401
- # Clear queues
402
- while not self.processing_queue.empty():
403
- try:
404
- self.processing_queue.get_nowait()
405
- except queue.Empty:
406
- break
407
-
408
- while not self.feature_queue.empty():
409
- try:
410
- self.feature_queue.get_nowait()
411
- except queue.Empty:
412
- break
413
-
414
- logger.info("Feature extractor state reset")
415
-
416
- class VADMFCCProcessor:
417
- """
418
- Simplified VAD + MFCC processor for digit recognition.
419
- Optimized for low-latency real-time processing.
420
- """
421
-
422
- def __init__(self, sample_rate=16000, n_mfcc=13):
423
- """Initialize VAD + MFCC processor."""
424
- self.sample_rate = sample_rate
425
- self.n_mfcc = n_mfcc
426
-
427
- self.vad_processor = WebRTCVADProcessor(
428
- aggressiveness=1, # Less aggressive for better digit detection
429
- sample_rate=sample_rate,
430
- frame_duration=30
431
- )
432
-
433
- self.features_extracted = 0
434
-
435
- logger.info("VAD-MFCC processor initialized")
436
-
437
- def process_audio_for_digit_recognition(self, audio_chunk: bytes) -> List[np.ndarray]:
438
- """
439
- Process audio chunk and extract MFCC features from speech segments.
440
-
441
- Args:
442
- audio_chunk: Raw audio bytes
443
-
444
- Returns:
445
- List[np.ndarray]: MFCC feature vectors for each speech segment
446
- """
447
- # Get speech segments
448
- speech_segments = self.vad_processor.process_audio_chunk(audio_chunk)
449
-
450
- mfcc_features = []
451
-
452
- for segment in speech_segments:
453
- # Convert to numpy array
454
- audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0
455
-
456
- # Extract MFCC features
457
- try:
458
- mfccs = librosa.feature.mfcc(
459
- y=audio_array,
460
- sr=self.sample_rate,
461
- n_mfcc=self.n_mfcc,
462
- n_fft=1024, # Smaller FFT for faster processing
463
- hop_length=256
464
- )
465
-
466
- # Use mean across time for simplicity
467
- mfcc_mean = np.mean(mfccs, axis=1)
468
- mfcc_features.append(mfcc_mean)
469
- self.features_extracted += 1
470
-
471
- except Exception as e:
472
- logger.error(f"MFCC extraction failed: {e}")
473
-
474
- return mfcc_features
475
-
476
- def get_stats(self) -> Dict:
477
- """Get processing statistics."""
478
- vad_stats = self.vad_processor.get_stats()
479
-
480
- return {
481
- 'features_extracted': self.features_extracted,
482
- 'vad_stats': vad_stats
483
- }
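
A minimal sketch of the removed streaming pipeline, based on the deleted code above; pcm_chunk is an illustrative placeholder for 16-bit mono PCM bytes from the client:

from utils.vad_feature_integration import StreamingFeatureExtractor  # removed in this commit

pcm_chunk = b'\x00' * 3200                   # placeholder: 100 ms of silence at 16 kHz
extractor = StreamingFeatureExtractor(sample_rate=16000, n_mfcc=13)
extractor.start_streaming_processing()
extractor.add_audio_chunk(pcm_chunk, feature_type='mfcc')  # queued for the worker thread
results = extractor.get_feature_results()                  # one dict per detected speech segment
extractor.stop_streaming_processing()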
utils/webrtc_vad.py DELETED
@@ -1,442 +0,0 @@
1
- """
2
- WebRTC VAD implementation for streaming audio processing
3
- Provides high-performance voice activity detection with proper audio chunking
4
- """
5
-
6
- import webrtcvad
7
- import collections
8
- import numpy as np
9
- import logging
10
- from typing import List, Tuple, Optional, Generator
11
- import struct
12
- import threading
13
- import queue
14
- import time
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
- class WebRTCVADProcessor:
19
- """
20
- WebRTC-based Voice Activity Detection processor for streaming audio.
21
-
22
- Features:
23
- - Real-time VAD processing with WebRTC library
24
- - Proper audio chunking and buffering
25
- - Speech segment detection and extraction
26
- - Thread-safe operation for streaming applications
27
- """
28
-
29
- def __init__(self, aggressiveness=2, sample_rate=16000, frame_duration=30):
30
- """
31
- Initialize WebRTC VAD processor.
32
-
33
- Args:
34
- aggressiveness: VAD aggressiveness mode (0-3, higher = more aggressive)
35
- sample_rate: Audio sample rate (8000, 16000, 32000, or 48000 Hz)
36
- frame_duration: Frame duration in milliseconds (10, 20, or 30 ms)
37
- """
38
- self.vad = webrtcvad.Vad(aggressiveness)
39
- self.sample_rate = sample_rate
40
- self.frame_duration = frame_duration
41
- self.frame_size = int(sample_rate * frame_duration / 1000)
42
-
43
- # Circular buffer for frame management
44
- self.ring_buffer_size = max(10, int(500 / frame_duration)) # ~500ms buffer
45
- self.ring_buffer = collections.deque(maxlen=self.ring_buffer_size)
46
-
47
- # State tracking
48
- self.triggered = False
49
- self.speech_buffer = collections.deque()
50
- self.is_recording = False
51
- self.current_utterance_start = None
52
-
53
- # Configuration parameters
54
- self.silence_threshold = 0.8 # Ratio of silence frames to trigger end
55
- self.speech_threshold = 0.5 # Ratio of speech frames to trigger start
56
- self.min_speech_duration = 0.5 # Minimum speech duration in seconds
57
- self.max_speech_duration = 10.0 # Maximum speech duration in seconds
58
- self.max_silence_duration = 2.0 # Maximum silence before reset
59
-
60
- # Performance tracking
61
- self.total_frames_processed = 0
62
- self.speech_frames_detected = 0
63
- self.segments_extracted = 0
64
-
65
- # Thread-safe queue for streaming chunks
66
- self.audio_queue = queue.Queue(maxsize=100)   # bounded so put_nowait() can raise queue.Full below; cap of 100 is arbitrary
67
- self.output_queue = queue.Queue(maxsize=100)
68
- self.processing = False
69
-
70
- logger.info(f"WebRTC VAD initialized: aggressiveness={aggressiveness}, "
71
- f"sample_rate={sample_rate}Hz, frame_duration={frame_duration}ms")
72
-
73
- def reset_state(self):
74
- """Reset VAD state for new processing session."""
75
- self.triggered = False
76
- self.is_recording = False
77
- self.ring_buffer.clear()
78
- self.speech_buffer.clear()
79
- self.current_utterance_start = None
80
- logger.debug("VAD state reset")
81
-
82
- def convert_audio_to_frames(self, audio_data: bytes) -> Generator[bytes, None, None]:
83
- """
84
- Convert audio data to properly sized frames for WebRTC VAD.
85
-
86
- Args:
87
- audio_data: Raw audio bytes (16-bit PCM)
88
-
89
- Yields:
90
- bytes: Frame data suitable for VAD processing
91
- """
92
- frame_size_bytes = self.frame_size * 2 # 16-bit = 2 bytes per sample
93
-
94
- for i in range(0, len(audio_data) - frame_size_bytes + 1, frame_size_bytes):
95
- frame = audio_data[i:i + frame_size_bytes]
96
- if len(frame) == frame_size_bytes:
97
- yield frame
98
-
99
- def is_speech_frame(self, frame: bytes) -> bool:
100
- """
101
- Determine if a frame contains speech using WebRTC VAD.
102
-
103
- Args:
104
- frame: Audio frame bytes
105
-
106
- Returns:
107
- bool: True if frame contains speech
108
- """
109
- try:
110
- if len(frame) != self.frame_size * 2:
111
- return False
112
- return self.vad.is_speech(frame, self.sample_rate)
113
- except Exception as e:
114
- logger.warning(f"VAD frame analysis failed: {e}")
115
- return False
116
-
117
- def process_audio_chunk(self, audio_data: bytes) -> List[bytes]:
118
- """
119
- Process audio chunk and return complete speech segments.
120
-
121
- Args:
122
- audio_data: Raw audio bytes (16-bit PCM)
123
-
124
- Returns:
125
- List[bytes]: List of detected speech segments
126
- """
127
- speech_segments = []
128
-
129
- for frame in self.convert_audio_to_frames(audio_data):
130
- self.total_frames_processed += 1
131
- is_speech = self.is_speech_frame(frame)
132
-
133
- if is_speech:
134
- self.speech_frames_detected += 1
135
-
136
- # Process frame through VAD collector
137
- collected_audio = self._vad_collector_step(frame, is_speech)
138
-
139
- if collected_audio is not None:
140
- # Complete speech segment detected
141
- speech_segments.append(collected_audio)
142
- self.segments_extracted += 1
143
- logger.debug(f"Speech segment extracted: {len(collected_audio)} bytes")
144
-
145
- return speech_segments
146
-
147
- def _vad_collector_step(self, frame: bytes, is_speech: bool) -> Optional[bytes]:
148
- """
149
- Single step of VAD collection algorithm.
150
-
151
- Args:
152
- frame: Audio frame
153
- is_speech: Whether frame contains speech
154
-
155
- Returns:
156
- bytes: Complete speech segment if detected, None otherwise
157
- """
158
- if not self.triggered:
159
- # Not currently in speech mode
160
- self.ring_buffer.append((frame, is_speech))
161
- num_voiced = sum(1 for f, speech in self.ring_buffer if speech)
162
-
163
- # Check if we should trigger speech detection
164
- if len(self.ring_buffer) == self.ring_buffer.maxlen:
165
- if num_voiced >= self.speech_threshold * self.ring_buffer.maxlen:
166
- self.triggered = True
167
- self.is_recording = True
168
- self.current_utterance_start = time.time()
169
-
170
- # Output buffered frames to start speech segment
171
- self.speech_buffer.clear()
172
- for f, s in self.ring_buffer:
173
- self.speech_buffer.append(f)
174
-
175
- self.ring_buffer.clear()
176
- logger.debug("Speech triggered - starting collection")
177
-
178
- else:
179
- # Currently in speech mode
180
- self.speech_buffer.append(frame)
181
- self.ring_buffer.append((frame, is_speech))
182
-
183
- # Check duration limits
184
- if self.current_utterance_start:
185
- utterance_duration = time.time() - self.current_utterance_start
186
-
187
- if utterance_duration > self.max_speech_duration:
188
- # Force end due to maximum duration
189
- logger.debug("Speech segment ended due to max duration")
190
- return self._finalize_speech_segment()
191
-
192
- # Check for end of speech
193
- if len(self.ring_buffer) == self.ring_buffer.maxlen:
194
- num_unvoiced = sum(1 for f, speech in self.ring_buffer if not speech)
195
-
196
- if num_unvoiced >= self.silence_threshold * self.ring_buffer.maxlen:
197
- # End of speech detected
198
- logger.debug("Speech segment ended due to silence")
199
- return self._finalize_speech_segment()
200
-
201
- return None
202
-
203
- def _finalize_speech_segment(self) -> Optional[bytes]:
204
- """
205
- Finalize and return current speech segment.
206
-
207
- Returns:
208
- bytes: Complete speech segment or None if too short
209
- """
210
- if not self.speech_buffer:
211
- self.triggered = False
212
- self.is_recording = False
213
- return None
214
-
215
- # Calculate duration
216
- total_frames = len(self.speech_buffer)
217
- duration = total_frames * self.frame_duration / 1000.0
218
-
219
- # Enforce an absolute floor of 100 ms on the minimum speech duration
220
- min_duration = max(self.min_speech_duration, 0.1) # At least 100ms
221
-
222
- # Check minimum duration
223
- if duration < min_duration:
224
- logger.debug(f"Speech segment too short: {duration:.2f}s < {min_duration}s")
225
- self.triggered = False
226
- self.is_recording = False
227
- self.speech_buffer.clear()
228
- self.ring_buffer.clear()
229
- return None
230
-
231
- # Create complete audio segment
232
- speech_data = b''.join(self.speech_buffer)
233
-
234
- # Reset state
235
- self.triggered = False
236
- self.is_recording = False
237
- self.speech_buffer.clear()
238
- self.ring_buffer.clear()
239
- self.current_utterance_start = None
240
-
241
- logger.info(f"Speech segment finalized: {duration:.2f}s, {len(speech_data)} bytes")
242
- return speech_data
243
-
244
- def process_numpy_audio(self, audio_array: np.ndarray) -> List[bytes]:
245
- """
246
- Process numpy audio array.
247
-
248
- Args:
249
- audio_array: Audio data as numpy array (float32, -1 to 1 range)
250
-
251
- Returns:
252
- List[bytes]: List of detected speech segments
253
- """
254
- # Convert to 16-bit PCM bytes
255
- if audio_array.dtype != np.int16:
256
- # Normalize and convert to int16
257
- audio_normalized = np.clip(audio_array, -1.0, 1.0)
258
- audio_int16 = (audio_normalized * 32767).astype(np.int16)
259
- else:
260
- audio_int16 = audio_array
261
-
262
- # Convert to bytes
263
- audio_bytes = audio_int16.tobytes()
264
-
265
- return self.process_audio_chunk(audio_bytes)
266
-
267
- def get_current_segment(self) -> Optional[bytes]:
268
- """
269
- Get current ongoing speech segment if any.
270
-
271
- Returns:
272
- bytes: Current speech segment or None
273
- """
274
- if self.is_recording and self.speech_buffer:
275
- current_duration = len(self.speech_buffer) * self.frame_duration / 1000.0
276
- if current_duration >= self.min_speech_duration:
277
- return b''.join(self.speech_buffer)
278
- return None
279
-
280
- def start_streaming_processing(self):
281
- """Start background thread for streaming audio processing."""
282
- if self.processing:
283
- return
284
-
285
- self.processing = True
286
- self.processing_thread = threading.Thread(target=self._streaming_worker, daemon=True)
287
- self.processing_thread.start()
288
- logger.info("Started streaming VAD processing")
289
-
290
- def stop_streaming_processing(self):
291
- """Stop background streaming processing."""
292
- self.processing = False
293
- if hasattr(self, 'processing_thread'):
294
- self.processing_thread.join(timeout=1.0)
295
- logger.info("Stopped streaming VAD processing")
296
-
297
- def add_audio_chunk(self, audio_data: bytes):
298
- """
299
- Add audio chunk to processing queue (thread-safe).
300
-
301
- Args:
302
- audio_data: Raw audio bytes
303
- """
304
- if self.processing:
305
- try:
306
- self.audio_queue.put_nowait(audio_data)
307
- except queue.Full:
308
- logger.warning("Audio queue full, dropping chunk")
309
-
310
- def get_speech_segments(self) -> List[bytes]:
311
- """
312
- Get all available speech segments from processing queue.
313
-
314
- Returns:
315
- List[bytes]: Available speech segments
316
- """
317
- segments = []
318
- try:
319
- while True:
320
- segment = self.output_queue.get_nowait()
321
- segments.append(segment)
322
- except queue.Empty:
323
- pass
324
- return segments
325
-
326
- def _streaming_worker(self):
327
- """Background worker for streaming audio processing."""
328
- while self.processing:
329
- try:
330
- # Get audio chunk with timeout
331
- audio_chunk = self.audio_queue.get(timeout=0.1)
332
-
333
- # Process chunk
334
- segments = self.process_audio_chunk(audio_chunk)
335
-
336
- # Add segments to output queue
337
- for segment in segments:
338
- try:
339
- self.output_queue.put_nowait(segment)
340
- except queue.Full:
341
- logger.warning("Output queue full, dropping segment")
342
-
343
- except queue.Empty:
344
- continue
345
- except Exception as e:
346
- logger.error(f"Streaming processing error: {e}")
347
-
348
- def get_stats(self) -> dict:
349
- """
350
- Get VAD processing statistics.
351
-
352
- Returns:
353
- dict: Processing statistics
354
- """
355
- return {
356
- 'total_frames_processed': self.total_frames_processed,
357
- 'speech_frames_detected': self.speech_frames_detected,
358
- 'segments_extracted': self.segments_extracted,
359
- 'speech_ratio': (
360
- self.speech_frames_detected / max(1, self.total_frames_processed)
361
- ),
362
- 'is_recording': self.is_recording,
363
- 'triggered': self.triggered,
364
- 'buffer_size': len(self.speech_buffer),
365
- 'ring_buffer_size': len(self.ring_buffer),
366
- 'configuration': {
367
- 'sample_rate': self.sample_rate,
368
- 'frame_duration': self.frame_duration,
369
- 'min_speech_duration': self.min_speech_duration,
370
- 'max_speech_duration': self.max_speech_duration
371
- }
372
- }
373
-
374
- class StreamingAudioBuffer:
375
- """
376
- Optimized audio buffer for streaming VAD processing.
377
- Thread-safe with memory pool for high performance.
378
- """
379
-
380
- def __init__(self, sample_rate=16000, max_duration=30):
381
- self.sample_rate = sample_rate
382
- self.max_samples = sample_rate * max_duration
383
-
384
- # Thread-safe circular buffer
385
- self.buffer = collections.deque(maxlen=self.max_samples)
386
- self.buffer_lock = threading.RLock()
387
-
388
- # Performance tracking
389
- self.total_samples_added = 0
390
- self.buffer_overruns = 0
391
-
392
- def add_audio(self, audio_data: np.ndarray):
393
- """
394
- Add audio data to buffer (thread-safe).
395
-
396
- Args:
397
- audio_data: Audio samples as numpy array
398
- """
399
- with self.buffer_lock:
400
- if len(self.buffer) + len(audio_data) > self.max_samples:
401
- self.buffer_overruns += 1
402
- # Remove old samples to make room
403
- samples_to_remove = len(audio_data)
404
- for _ in range(min(samples_to_remove, len(self.buffer))):
405
- self.buffer.popleft()
406
-
407
- self.buffer.extend(audio_data)
408
- self.total_samples_added += len(audio_data)
409
-
410
- def get_recent_audio(self, duration_ms: int = 1000) -> np.ndarray:
411
- """
412
- Get recent audio with specified duration.
413
-
414
- Args:
415
- duration_ms: Duration in milliseconds
416
-
417
- Returns:
418
- np.ndarray: Recent audio samples
419
- """
420
- samples_needed = int(self.sample_rate * duration_ms / 1000)
421
-
422
- with self.buffer_lock:
423
- if len(self.buffer) >= samples_needed:
424
- return np.array(list(self.buffer)[-samples_needed:], dtype=np.float32)
425
- else:
426
- return np.array(list(self.buffer), dtype=np.float32)
427
-
428
- def clear(self):
429
- """Clear buffer contents."""
430
- with self.buffer_lock:
431
- self.buffer.clear()
432
-
433
- def get_stats(self) -> dict:
434
- """Get buffer statistics."""
435
- with self.buffer_lock:
436
- return {
437
- 'buffer_size': len(self.buffer),
438
- 'max_size': self.max_samples,
439
- 'utilization': len(self.buffer) / self.max_samples,
440
- 'total_added': self.total_samples_added,
441
- 'overruns': self.buffer_overruns
442
- }
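
Finally, a minimal sketch of the removed WebRTC VAD wrapper, based on the deleted code above; audio_float32 is a placeholder for real audio in the -1..1 range:

import numpy as np
from utils.webrtc_vad import WebRTCVADProcessor  # removed in this commit

audio_float32 = np.zeros(16000, dtype=np.float32)  # placeholder: 1 s of silence
vad = WebRTCVADProcessor(aggressiveness=2, sample_rate=16000, frame_duration=30)
for segment in vad.process_numpy_audio(audio_float32):
    print(f"utterance: {len(segment)} bytes of 16-bit PCM")
print(vad.get_stats()['speech_ratio'])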