Pranav Mishra committed · 1772a46 · Parent(s): 494577d

Initial backend deployment - Flask API with ML models

This view is limited to 50 files because it contains too many changes.
- .env.example +21 -0
- Dockerfile +43 -0
- README.md +25 -7
- app.py +361 -0
- audio_processors/__init__.py +0 -0
- audio_processors/__pycache__/__init__.cpython-312.pyc +0 -0
- audio_processors/__pycache__/base_processor.cpython-312.pyc +0 -0
- audio_processors/__pycache__/external_api.cpython-312.pyc +0 -0
- audio_processors/__pycache__/faster_whisper_processor.cpython-312.pyc +0 -0
- audio_processors/__pycache__/local_whisper.cpython-312.pyc +0 -0
- audio_processors/__pycache__/mel_spectrogram.cpython-312.pyc +0 -0
- audio_processors/__pycache__/mfcc_processor.cpython-312.pyc +0 -0
- audio_processors/__pycache__/ml_mel_cnn_processor.cpython-312.pyc +0 -0
- audio_processors/__pycache__/ml_mfcc_processor.cpython-312.pyc +0 -0
- audio_processors/__pycache__/ml_raw_cnn_processor.cpython-312.pyc +0 -0
- audio_processors/__pycache__/raw_spectrogram.cpython-312.pyc +0 -0
- audio_processors/__pycache__/wav2vec2_processor.cpython-312.pyc +0 -0
- audio_processors/__pycache__/whisper_digit_processor.cpython-312.pyc +0 -0
- audio_processors/base_processor.py +85 -0
- audio_processors/external_api.py +153 -0
- audio_processors/faster_whisper_processor.py +219 -0
- audio_processors/local_whisper.py +158 -0
- audio_processors/mel_spectrogram.py +74 -0
- audio_processors/mfcc_processor.py +79 -0
- audio_processors/ml_mel_cnn_processor.py +307 -0
- audio_processors/ml_mfcc_processor.py +370 -0
- audio_processors/ml_raw_cnn_processor.py +307 -0
- audio_processors/raw_spectrogram.py +69 -0
- audio_processors/wav2vec2_processor.py +170 -0
- audio_processors/whisper_digit_processor.py +429 -0
- models/mel_cnn_classifier/best_model.pt +3 -0
- models/mfcc_classifier/best_model.pt +3 -0
- models/mfcc_classifier/scaler.pkl +3 -0
- models/raw_cnn_classifier/best_model.pt +3 -0
- requirements_hf.txt +26 -0
- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-312.pyc +0 -0
- utils/__pycache__/audio_utils.cpython-312.pyc +0 -0
- utils/__pycache__/enhanced_vad.cpython-312.pyc +0 -0
- utils/__pycache__/logging_utils.cpython-312.pyc +0 -0
- utils/__pycache__/noise_utils.cpython-312.pyc +0 -0
- utils/__pycache__/session_manager.cpython-312.pyc +0 -0
- utils/__pycache__/vad_feature_integration.cpython-312.pyc +0 -0
- utils/__pycache__/webm_converter.cpython-312.pyc +0 -0
- utils/__pycache__/webrtc_vad.cpython-312.pyc +0 -0
- utils/audio_utils.py +427 -0
- utils/enhanced_vad.py +571 -0
- utils/logging_utils.py +201 -0
- utils/noise_utils.py +292 -0
- utils/session_manager.py +340 -0
.env.example
ADDED
@@ -0,0 +1,21 @@
+# Environment variables for HF Spaces deployment
+# Copy this to .env and set your values
+
+# Flask Configuration
+SECRET_KEY=your_secret_key_here
+FLASK_ENV=production
+
+# External API Keys (optional - for external processors)
+HF_TOKEN=your_huggingface_token_here
+OPENAI_API_KEY=your_openai_key_here
+
+# Model Configuration
+DEFAULT_ML_MODEL=ml_mfcc
+ENABLE_EXTERNAL_API=false
+
+# Performance Settings
+MAX_AUDIO_DURATION=10
+MAX_FILE_SIZE=10485760
+
+# Logging
+LOG_LEVEL=INFO
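
For reference, a minimal sketch of how these values could be loaded at startup with python-dotenv (which app.py below already uses); note that app.py as committed hardcodes the performance limits rather than reading them from the environment:

```python
# Minimal sketch, assuming python-dotenv is installed and a .env copied
# from the template above; the variable names mirror the template.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

max_duration = int(os.getenv("MAX_AUDIO_DURATION", "10"))    # seconds
max_file_size = int(os.getenv("MAX_FILE_SIZE", "10485760"))  # bytes
hf_token = os.getenv("HF_TOKEN")  # optional; external processors need it
```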
Dockerfile
ADDED
@@ -0,0 +1,43 @@
+# Use Python 3.9 as recommended by HF Spaces
+FROM python:3.9-slim
+
+# Create user for HF Spaces (required)
+RUN useradd -m -u 1000 user
+USER user
+
+# Set environment variables
+ENV PATH="/home/user/.local/bin:$PATH"
+ENV PYTHONPATH="/app:$PYTHONPATH"
+ENV PYTHONUNBUFFERED=1
+
+# Set work directory
+WORKDIR /app
+
+# Install system dependencies (as user, limited packages)
+# Note: HF Spaces has restrictions on system packages
+COPY --chown=user ./requirements_hf.txt requirements.txt
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Copy application files (essential files only)
+COPY --chown=user ./app.py ./app.py
+COPY --chown=user ./audio_processors ./audio_processors
+COPY --chown=user ./utils ./utils
+COPY --chown=user ./models ./models
+
+# Copy environment template (users can set their own HF_TOKEN)
+COPY --chown=user ./.env.example ./.env
+
+# Create log directory
+RUN mkdir -p /app/logs
+
+# Expose port (HF Spaces requires 7860)
+EXPOSE 7860
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD python -c "import requests; requests.get('http://localhost:7860/api/health').raise_for_status()" || exit 1
+
+# Run the application
+CMD ["python", "app.py"]
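
The HEALTHCHECK above runs an inline Python probe; the same check can be run by hand against a local container. A standalone sketch, assuming the API is reachable on the exposed port 7860:

```python
# Standalone version of the container's health probe; assumes the requests
# package is installed and the server is listening on localhost:7860.
import sys
import requests

try:
    resp = requests.get("http://localhost:7860/api/health", timeout=10)
    resp.raise_for_status()  # non-2xx counts as unhealthy, like the HEALTHCHECK
    print(resp.json().get("status", "unknown"))
except requests.RequestException as exc:
    print(f"unhealthy: {exc}")
    sys.exit(1)
```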
README.md
CHANGED
@@ -1,12 +1,30 @@
 ---
-title: Streaming Digit Classifier
-emoji:
-colorFrom:
-colorTo:
+title: Streaming Digit Classifier API
+emoji: 🎤
+colorFrom: green
+colorTo: blue
 sdk: docker
 pinned: false
-
-short_description: Real-time spoken digit recognition API with 4 ML approaches
+app_port: 7860
 ---
 
-
+# Streaming Digit Classifier API
+
+Backend API for real-time spoken digit recognition (0-9).
+
+## Features
+
+- ML Models: MFCC + Dense NN, Mel CNN, Raw CNN
+- External API integration (Whisper)
+- Real-time audio processing
+- RESTful API endpoints
+
+## API Endpoints
+
+- `GET /` - API status
+- `POST /api/process_audio` - Process audio file
+- `POST /api/process_audio_chunk` - Process streaming chunk
+- `GET /api/health` - Health check
+- `GET /api/processors` - Available processors
+
+Frontend: [Deployed on Vercel](https://your-frontend-url.vercel.app)
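
A sketch of a client for the upload endpoint listed above; the Space URL is a placeholder, and `ml_mfcc` is one of the method keys app.py (below) registers:

```python
# Hypothetical client for POST /api/process_audio; BASE_URL is a placeholder.
import requests

BASE_URL = "https://your-space.hf.space"  # placeholder Space URL

with open("seven.wav", "rb") as f:  # any short WAV/MP3/OGG/M4A/WebM clip
    resp = requests.post(
        f"{BASE_URL}/api/process_audio",
        files={"audio": ("seven.wav", f, "audio/wav")},
        data={"method": "ml_mfcc"},
        timeout=30,
    )
resp.raise_for_status()
print(resp.json()["predicted_digit"])
```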
app.py
ADDED
@@ -0,0 +1,361 @@
+"""
+Audio Digit Classification API for Hugging Face Spaces
+Backend API for spoken digit recognition (0-9) - HF Spaces deployment
+"""
+
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+import os
+import time
+import logging
+from typing import Dict, Any, Optional
+from dotenv import load_dotenv
+import numpy as np
+
+# Import audio processors (only essential ones for deployment)
+from audio_processors.external_api import ExternalAPIProcessor
+from audio_processors.whisper_digit_processor import WhisperDigitProcessor
+from audio_processors.ml_mfcc_processor import MLMFCCProcessor
+from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor
+from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor
+
+# Import utilities
+from utils.audio_utils import validate_audio_format, convert_audio_format, get_audio_duration, convert_for_ml_models
+from utils.logging_utils import performance_logger, setup_flask_logging
+
+# Load environment variables
+load_dotenv()
+
+# Initialize Flask app
+app = Flask(__name__)
+app.secret_key = os.getenv('SECRET_KEY', 'hf_spaces_deployment_key')
+
+# Enable CORS for frontend requests from Vercel
+CORS(app, origins=['*'])  # In production, specify your Vercel domain
+
+# Setup logging
+setup_flask_logging(app)
+
+# Configuration for HF Spaces
+MAX_AUDIO_DURATION = 10  # seconds
+MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+ALLOWED_EXTENSIONS = {'wav', 'mp3', 'ogg', 'm4a', 'webm'}
+
+def allowed_file(filename: str) -> bool:
+    """Check if file extension is allowed."""
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+def initialize_processors():
+    """Initialize audio processors optimized for HF Spaces deployment."""
+    procs = {}
+
+    # ML-trained processors (high priority - use best models only)
+    ml_processors = [
+        ('ml_mfcc', MLMFCCProcessor, 'ML MFCC + Dense NN (Best - 98.52%)'),
+        ('ml_mel_cnn', MLMelCNNProcessor, 'ML Mel CNN (Good - 97.22%)'),
+        ('ml_raw_cnn', MLRawCNNProcessor, 'ML Raw CNN (Fair - 91.30%)')
+    ]
+
+    ml_working_count = 0
+    for proc_key, proc_class, proc_name in ml_processors:
+        try:
+            processor = proc_class()
+            if processor.is_configured():
+                procs[proc_key] = processor
+                ml_working_count += 1
+                app.logger.info(f"[OK] {proc_name} loaded successfully")
+            else:
+                app.logger.warning(f"[WARN] {proc_name} not configured (model files missing)")
+        except Exception as e:
+            app.logger.error(f"[FAIL] Failed to initialize {proc_name}: {str(e)}")
+
+    # External API processor as fallback
+    try:
+        external_processor = ExternalAPIProcessor()
+        if external_processor.is_configured():
+            procs['external_api'] = external_processor
+            app.logger.info("[OK] External API processor initialized")
+        else:
+            app.logger.warning("[WARN] External API not configured")
+    except Exception as e:
+        app.logger.error(f"[FAIL] Failed to initialize External API: {str(e)}")
+
+    # Whisper digit processor as another fallback
+    try:
+        whisper_processor = WhisperDigitProcessor()
+        if whisper_processor.is_configured():
+            procs['whisper_digit'] = whisper_processor
+            app.logger.info("[OK] Whisper digit processor initialized")
+    except Exception as e:
+        app.logger.error(f"[FAIL] Failed to initialize Whisper: {str(e)}")
+
+    app.logger.info(f"Processor initialization complete:")
+    app.logger.info(f"  ML Models loaded: {ml_working_count}/3")
+    app.logger.info(f"  Total processors: {len(procs)}")
+
+    return procs
+
+processors = initialize_processors()
+
+@app.route('/')
+def index():
+    """API status endpoint."""
+    return jsonify({
+        'message': 'Streaming Digit Classifier API',
+        'status': 'running',
+        'version': '1.0.0',
+        'available_processors': list(processors.keys()),
+        'documentation': 'Frontend at Vercel, Backend API at HF Spaces'
+    })
+
+@app.route('/api/process_audio', methods=['POST'])
+def process_audio():
+    """
+    Process audio file with selected method and return digit prediction.
+    Expects multipart form data with 'audio' file and 'method' selection.
+    """
+    try:
+        # Validate request
+        if 'audio' not in request.files:
+            return jsonify({'error': 'No audio file provided'}), 400
+
+        if 'method' not in request.form:
+            return jsonify({'error': 'No processing method specified'}), 400
+
+        audio_file = request.files['audio']
+        method = request.form['method']
+
+        # Validate audio file
+        if audio_file.filename == '':
+            return jsonify({'error': 'No file selected'}), 400
+
+        if not allowed_file(audio_file.filename):
+            return jsonify({'error': 'Unsupported file format'}), 400
+
+        # Validate method
+        if method not in processors:
+            return jsonify({'error': f'Unknown processing method: {method}'}), 400
+
+        # Read audio data
+        audio_data = audio_file.read()
+
+        # Check file size
+        if len(audio_data) > MAX_FILE_SIZE:
+            return jsonify({'error': 'Audio file too large'}), 400
+
+        # Convert to standard format
+        try:
+            app.logger.debug(f"Converting audio format. Original size: {len(audio_data)} bytes")
+            standardized_audio = convert_audio_format(audio_data)
+            app.logger.debug(f"Converted audio size: {len(standardized_audio)} bytes")
+        except Exception as e:
+            app.logger.error(f"Audio conversion failed: {str(e)}")
+            return jsonify({'error': 'Failed to process audio format - unsupported format or corrupted file'}), 400
+
+        # Check audio duration
+        duration = get_audio_duration(standardized_audio)
+        if duration > MAX_AUDIO_DURATION:
+            return jsonify({
+                'error': f'Audio too long: {duration:.1f}s (max: {MAX_AUDIO_DURATION}s)'
+            }), 400
+
+        if duration < 0.1:
+            return jsonify({'error': 'Audio too short (minimum: 0.1s)'}), 400
+
+        # Log audio input info
+        performance_logger.log_audio_info(duration, {
+            'filename': audio_file.filename,
+            'size_bytes': len(audio_data),
+            'converted_size': len(standardized_audio),
+            'method': method
+        })
+
+        # Process with selected method
+        processor = processors[method]
+        result = processor.predict_with_timing(standardized_audio)
+
+        # Log performance
+        performance_logger.log_prediction(method, result)
+
+        # Add additional metadata
+        result.update({
+            'audio_duration': round(duration, 3),
+            'file_size': len(audio_data),
+            'api_version': '1.0.0'
+        })
+
+        app.logger.info(f"Processed audio with {method}: '{result['predicted_digit']}' in {result['inference_time']}s")
+
+        return jsonify(result)
+
+    except Exception as e:
+        app.logger.error(f"Audio processing error: {str(e)}")
+        return jsonify({
+            'error': 'Internal processing error',
+            'success': False,
+            'timestamp': time.time()
+        }), 500
+
+@app.route('/api/process_audio_chunk', methods=['POST'])
+def process_audio_chunk():
+    """
+    Process streaming audio chunk for real-time digit recognition.
+    """
+    try:
+        # Validate request
+        if 'audio' not in request.files:
+            return jsonify({'error': 'No audio chunk provided'}), 400
+
+        audio_file = request.files['audio']
+        method = request.form.get('method', 'ml_mfcc')  # Default to best ML model
+
+        # Validate method
+        if method not in processors:
+            return jsonify({'error': f'Unknown processing method: {method}'}), 400
+
+        # Read audio data
+        audio_data = audio_file.read()
+
+        # Check chunk size
+        if len(audio_data) > MAX_FILE_SIZE:
+            return jsonify({'error': 'Audio chunk too large'}), 400
+
+        if len(audio_data) < 100:
+            return jsonify({'error': 'Audio chunk too small'}), 400
+
+        # Convert to standardized format
+        try:
+            standardized_audio = convert_for_ml_models(audio_data, 'streaming')
+        except Exception as e:
+            app.logger.error(f"Audio conversion failed for chunk: {str(e)}")
+            return jsonify({'error': 'Failed to process audio chunk format'}), 400
+
+        # Process audio chunk
+        processor = processors[method]
+        result = processor.predict_with_timing(standardized_audio)
+
+        # Add streaming metadata
+        result.update({
+            'segment_index': 0,
+            'segment_size': len(standardized_audio),
+            'is_streaming': True,
+            'api_version': '1.0.0'
+        })
+
+        app.logger.info(f"Streaming prediction: '{result['predicted_digit']}' "
+                        f"(Inference: {result['inference_time']}s)")
+
+        return jsonify({
+            'success': True,
+            'segments_detected': 1,
+            'total_results': 1,
+            'results': [result],
+            'timestamp': time.time(),
+            'has_fallback': False
+        })
+
+    except Exception as e:
+        app.logger.error(f"Streaming audio processing error: {str(e)}")
+        return jsonify({
+            'error': 'Internal streaming processing error',
+            'success': False,
+            'timestamp': time.time()
+        }), 500
+
+@app.route('/api/processors')
+def get_processors():
+    """Get information about available processors."""
+    try:
+        processor_info = {}
+        for name, processor in processors.items():
+            info = {
+                'name': processor.name,
+                'method': name,
+                'configured': getattr(processor, 'is_configured', lambda: True)()
+            }
+
+            # Add model-specific info if available
+            if hasattr(processor, 'get_model_info'):
+                info.update(processor.get_model_info())
+
+            processor_info[name] = info
+
+        return jsonify(processor_info)
+
+    except Exception as e:
+        app.logger.error(f"Error getting processors: {str(e)}")
+        return jsonify({'error': 'Failed to retrieve processor information'}), 500
+
+@app.route('/api/health')
+def health_check():
+    """Health check endpoint."""
+    try:
+        # Check processor availability
+        processor_health = {}
+        for name, processor in processors.items():
+            processor_health[name] = {
+                'available': True,
+                'configured': getattr(processor, 'is_configured', lambda: True)()
+            }
+
+        return jsonify({
+            'status': 'healthy',
+            'timestamp': time.time(),
+            'processors': processor_health,
+            'version': '1.0.0',
+            'deployment': 'huggingface-spaces'
+        })
+
+    except Exception as e:
+        app.logger.error(f"Health check failed: {str(e)}")
+        return jsonify({
+            'status': 'unhealthy',
+            'error': str(e),
+            'timestamp': time.time()
+        }), 500
+
+@app.errorhandler(404)
+def not_found_error(error):
+    """Handle 404 errors."""
+    return jsonify({'error': 'Endpoint not found', 'status': 404}), 404
+
+@app.errorhandler(500)
+def internal_error(error):
+    """Handle 500 errors."""
+    app.logger.error(f"Internal error: {str(error)}")
+    return jsonify({'error': 'Internal server error', 'status': 500}), 500
+
+@app.errorhandler(413)
+def too_large_error(error):
+    """Handle file too large errors."""
+    return jsonify({'error': 'File too large', 'status': 413}), 413
+
+if __name__ == '__main__':
+    # Log startup information
+    try:
+        import importlib.metadata
+        flask_version = importlib.metadata.version('flask')
+    except:
+        flask_version = 'unknown'
+
+    performance_logger.log_system_info({
+        'python_version': os.sys.version,
+        'flask_version': flask_version,
+        'processors_loaded': list(processors.keys()),
+        'max_audio_duration': MAX_AUDIO_DURATION,
+        'max_file_size': MAX_FILE_SIZE,
+        'deployment': 'huggingface-spaces'
+    })
+
+    # Run server (HF Spaces requires port 7860)
+    port = int(os.getenv('PORT', 7860))
+
+    app.logger.info(f"Starting Audio Digit Classifier API on port {port}")
+    app.logger.info("Deployment: Hugging Face Spaces")
+
+    app.run(
+        host='0.0.0.0',
+        port=port,
+        debug=False,  # Disable debug in production
+        threaded=True
+    )
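
The streaming endpoint wraps each prediction in a `results` list, unlike the single-object response of `/api/process_audio`. A sketch of a chunk client (placeholder URL and file; omitting `method` uses the `ml_mfcc` default server-side):

```python
# Hypothetical client for POST /api/process_audio_chunk; BASE_URL and the
# chunk file are placeholders.
import requests

BASE_URL = "https://your-space.hf.space"  # placeholder Space URL

with open("chunk.wav", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/api/process_audio_chunk",
        files={"audio": ("chunk.wav", f, "audio/wav")},
        timeout=30,
    )
payload = resp.json()
if payload.get("success"):
    for result in payload["results"]:  # one entry per detected segment
        print(result["predicted_digit"], result["inference_time"])
```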
audio_processors/__init__.py
ADDED
File without changes
audio_processors/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (153 Bytes)

audio_processors/__pycache__/base_processor.cpython-312.pyc
ADDED
Binary file (3.79 kB)

audio_processors/__pycache__/external_api.cpython-312.pyc
ADDED
Binary file (7.1 kB)

audio_processors/__pycache__/faster_whisper_processor.cpython-312.pyc
ADDED
Binary file (9.58 kB)

audio_processors/__pycache__/local_whisper.cpython-312.pyc
ADDED
Binary file (6.67 kB)

audio_processors/__pycache__/mel_spectrogram.cpython-312.pyc
ADDED
Binary file (2.8 kB)

audio_processors/__pycache__/mfcc_processor.cpython-312.pyc
ADDED
Binary file (2.85 kB)

audio_processors/__pycache__/ml_mel_cnn_processor.cpython-312.pyc
ADDED
Binary file (13.6 kB)

audio_processors/__pycache__/ml_mfcc_processor.cpython-312.pyc
ADDED
Binary file (16.7 kB)

audio_processors/__pycache__/ml_raw_cnn_processor.cpython-312.pyc
ADDED
Binary file (13.4 kB)

audio_processors/__pycache__/raw_spectrogram.cpython-312.pyc
ADDED
Binary file (2.81 kB)

audio_processors/__pycache__/wav2vec2_processor.cpython-312.pyc
ADDED
Binary file (7.25 kB)

audio_processors/__pycache__/whisper_digit_processor.cpython-312.pyc
ADDED
Binary file (18.1 kB)
audio_processors/base_processor.py
ADDED
@@ -0,0 +1,85 @@
+from abc import ABC, abstractmethod
+from typing import Union, Dict, Any
+import time
+import logging
+
+logger = logging.getLogger(__name__)
+
+class AudioProcessor(ABC):
+    """
+    Abstract base class for all audio digit classification processors.
+    Provides common interface and logging functionality.
+    """
+
+    def __init__(self, name: str):
+        self.name = name
+        self.total_predictions = 0
+        self.total_inference_time = 0.0
+
+    @abstractmethod
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio data and return predicted digit as string.
+
+        Args:
+            audio_data: Raw audio bytes
+
+        Returns:
+            Predicted digit as string ('0'-'9')
+        """
+        pass
+
+    def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]:
+        """
+        Process audio and return prediction with timing information.
+
+        Args:
+            audio_data: Raw audio bytes
+
+        Returns:
+            Dictionary with prediction, timing, and method info
+        """
+        start_time = time.time()
+
+        try:
+            predicted_digit = self.process_audio(audio_data)
+            inference_time = time.time() - start_time
+
+            self.total_predictions += 1
+            self.total_inference_time += inference_time
+
+            result = {
+                'predicted_digit': predicted_digit,
+                'inference_time': round(inference_time, 3),
+                'method': self.name,
+                'timestamp': time.time(),
+                'average_time': round(self.total_inference_time / self.total_predictions, 3),
+                'success': True
+            }
+
+            logger.info(f"{self.name}: Predicted '{predicted_digit}' in {inference_time:.3f}s")
+            return result
+
+        except Exception as e:
+            inference_time = time.time() - start_time
+            logger.error(f"{self.name}: Error processing audio: {str(e)}")
+
+            return {
+                'predicted_digit': 'ERROR',
+                'inference_time': round(inference_time, 3),
+                'method': self.name,
+                'timestamp': time.time(),
+                'success': False,
+                'error': str(e)
+            }
+
+    def get_stats(self) -> Dict[str, float]:
+        """Get processor statistics."""
+        if self.total_predictions == 0:
+            return {'total_predictions': 0, 'average_time': 0.0}
+
+        return {
+            'total_predictions': self.total_predictions,
+            'total_time': round(self.total_inference_time, 3),
+            'average_time': round(self.total_inference_time / self.total_predictions, 3)
+        }
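
Every processor in this commit subclasses AudioProcessor and implements only process_audio; predict_with_timing supplies the timing, stats, and error envelope. A minimal illustrative subclass:

```python
# Demo subclass showing the contract; the fixed '7' is obviously a stand-in
# for real decoding and inference.
from audio_processors.base_processor import AudioProcessor

class ConstantDigitProcessor(AudioProcessor):
    def __init__(self):
        super().__init__("Constant Digit (demo)")

    def process_audio(self, audio_data: bytes) -> str:
        # A real processor would decode audio_data and run a model here.
        return "7"

proc = ConstantDigitProcessor()
result = proc.predict_with_timing(b"\x00" * 1600)  # dummy audio bytes
print(result["predicted_digit"], result["inference_time"], result["success"])
```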
audio_processors/external_api.py
ADDED
@@ -0,0 +1,153 @@
+import requests
+import os
+import re
+import logging
+from typing import Optional
+from .base_processor import AudioProcessor
+
+logger = logging.getLogger(__name__)
+
+class ExternalAPIProcessor(AudioProcessor):
+    """
+    Hugging Face Whisper API integration for digit classification.
+    Uses openai/whisper-base model for speech-to-text conversion.
+    """
+
+    def __init__(self):
+        super().__init__("External API (Whisper)")
+        # Try alternative Whisper model that should be available
+        self.api_url = "https://api-inference.huggingface.co/models/openai/whisper-small"
+        self.token = os.getenv('HUGGING_FACE_TOKEN')
+        self.headers = {"Authorization": f"Bearer {self.token}"} if self.token else {}
+
+        if not self.token:
+            logger.warning("HUGGING_FACE_TOKEN not found in environment variables")
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio using Hugging Face Whisper API.
+
+        Args:
+            audio_data: Raw audio bytes (WAV format preferred)
+
+        Returns:
+            Predicted digit as string ('0'-'9')
+
+        Raises:
+            Exception: If API call fails or no digit found in response
+        """
+        if not self.token:
+            raise Exception("Hugging Face API token not configured")
+
+        try:
+            # Make API request
+            response = requests.post(
+                self.api_url,
+                headers=self.headers,
+                data=audio_data,
+                timeout=15  # Increased timeout
+            )
+
+            if response.status_code == 401:
+                logger.error("Hugging Face API token is invalid or expired")
+                raise Exception("Invalid or expired API token - please update HUGGING_FACE_TOKEN")
+            elif response.status_code == 404:
+                logger.error(f"Model not found or unavailable: {self.api_url}")
+                raise Exception("API model unavailable - may be loading or deprecated")
+            elif response.status_code == 503:
+                logger.warning("Model is loading, this may take a few moments")
+                raise Exception("API model is loading - please try again in a moment")
+            elif response.status_code != 200:
+                logger.error(f"API request failed: {response.status_code} - {response.text}")
+                raise Exception(f"API error {response.status_code}: {response.text[:100]}")
+
+            # Parse response
+            result = response.json()
+
+            if 'text' not in result:
+                logger.error(f"Unexpected API response format: {result}")
+                raise Exception("Invalid API response format")
+
+            transcribed_text = result['text'].strip().lower()
+            logger.debug(f"Whisper transcription: '{transcribed_text}'")
+
+            # Extract digit from transcription
+            predicted_digit = self._extract_digit(transcribed_text)
+
+            if predicted_digit is None:
+                logger.warning(f"No digit found in transcription: '{transcribed_text}'")
+                return "?"
+
+            return predicted_digit
+
+        except requests.exceptions.Timeout:
+            raise Exception("API request timeout (15s) - service may be slow")
+        except requests.exceptions.RequestException as e:
+            raise Exception(f"API request failed: {str(e)}")
+        except Exception as e:
+            logger.error(f"Unexpected error in external API processing: {str(e)}")
+            raise
+
+    def _extract_digit(self, text: str) -> Optional[str]:
+        """
+        Extract digit from transcribed text.
+        Handles both numerical ('1', '2') and word forms ('one', 'two').
+
+        Args:
+            text: Transcribed text from Whisper
+
+        Returns:
+            Digit as string ('0'-'9') or None if not found
+        """
+        # Word to digit mapping
+        word_to_digit = {
+            'zero': '0', 'oh': '0',
+            'one': '1', 'won': '1',
+            'two': '2', 'to': '2', 'too': '2',
+            'three': '3', 'tree': '3',
+            'four': '4', 'for': '4', 'fore': '4',
+            'five': '5',
+            'six': '6', 'sick': '6',
+            'seven': '7',
+            'eight': '8', 'ate': '8',
+            'nine': '9', 'niner': '9'
+        }
+
+        # First, try to find a direct digit
+        digit_match = re.search(r'\b([0-9])\b', text)
+        if digit_match:
+            return digit_match.group(1)
+
+        # Then try word forms
+        words = text.split()
+        for word in words:
+            clean_word = re.sub(r'[^\w]', '', word.lower())
+            if clean_word in word_to_digit:
+                return word_to_digit[clean_word]
+
+        # Try partial matches for robustness
+        for word, digit in word_to_digit.items():
+            if word in text:
+                return digit
+
+        return None
+
+    def is_configured(self) -> bool:
+        """Check if API is properly configured."""
+        return bool(self.token)
+
+    def test_connection(self) -> bool:
+        """Test API connection with a simple request."""
+        if not self.is_configured():
+            return False
+
+        try:
+            # Test with minimal audio data
+            test_response = requests.get(
+                self.api_url,
+                headers=self.headers,
+                timeout=5
+            )
+            return test_response.status_code == 200
+        except:
+            return False
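
_extract_digit resolves in a fixed order: a literal digit wins, then exact word forms (including homophones like 'for' and 'ate'), then a substring fallback. A quick check of that ordering; no API token is needed just to exercise this helper, though it does poke a private method:

```python
# Exercises only the local _extract_digit helper; constructing the processor
# without HUGGING_FACE_TOKEN set merely logs a warning.
from audio_processors.external_api import ExternalAPIProcessor

proc = ExternalAPIProcessor()
for text in ["the number 3", "seven.", "i said for", "hello world"]:
    print(repr(text), "->", proc._extract_digit(text))
# expected: '3', '7', '4', None
```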
audio_processors/faster_whisper_processor.py
ADDED
@@ -0,0 +1,219 @@
+"""
+Faster-Whisper processor with built-in VAD (2025 approach)
+More reliable than manual WebRTC VAD + Whisper coordination
+"""
+
+import numpy as np
+import io
+import time
+import logging
+from typing import Dict, Any, Optional
+
+try:
+    from faster_whisper import WhisperModel
+    FASTER_WHISPER_AVAILABLE = True
+except ImportError:
+    FASTER_WHISPER_AVAILABLE = False
+    WhisperModel = None
+
+from .base_processor import AudioProcessor
+
+logger = logging.getLogger(__name__)
+
+class FasterWhisperDigitProcessor(AudioProcessor):
+    """
+    Modern 2025 approach using faster-whisper with built-in VAD.
+    Much more reliable than manual WebRTC VAD coordination.
+    """
+
+    def __init__(self):
+        """Initialize faster-whisper processor with built-in VAD."""
+        super().__init__("Faster-Whisper with VAD")
+
+        if not FASTER_WHISPER_AVAILABLE:
+            logger.error("faster-whisper not available. Install with: pip install faster-whisper")
+            self.model = None
+            return
+
+        self.model = None
+        self.device = "cuda" if self._cuda_available() else "cpu"
+
+        # Digit mapping
+        self.digit_map = {
+            "zero": "0", "one": "1", "two": "2", "three": "3",
+            "four": "4", "five": "5", "six": "6", "seven": "7",
+            "eight": "8", "nine": "9",
+            "oh": "0", "o": "0", "for": "4", "fore": "4",
+            "to": "2", "too": "2", "tu": "2", "tree": "3",
+            "free": "3", "ate": "8", "ait": "8"
+        }
+
+        # Statistics
+        self.total_predictions = 0
+        self.successful_predictions = 0
+        self.failed_predictions = 0
+
+        self._initialize_model()
+
+    def _cuda_available(self) -> bool:
+        """Check if CUDA is available."""
+        try:
+            import torch
+            return torch.cuda.is_available()
+        except ImportError:
+            return False
+
+    def _initialize_model(self):
+        """Initialize faster-whisper model with VAD."""
+        if not FASTER_WHISPER_AVAILABLE:
+            return
+
+        try:
+            logger.info("Initializing faster-whisper model with built-in VAD...")
+
+            # Initialize faster-whisper model
+            self.model = WhisperModel(
+                "tiny",  # Use tiny model for speed
+                device=self.device,
+                compute_type="float16" if self.device == "cuda" else "int8"
+            )
+
+            logger.info(f"Faster-Whisper model initialized on {self.device}")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize faster-whisper: {e}")
+            self.model = None
+
+    def is_configured(self) -> bool:
+        """Check if processor is configured."""
+        return self.model is not None and FASTER_WHISPER_AVAILABLE
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio with built-in VAD and return predicted digit.
+
+        Args:
+            audio_data: Raw audio bytes
+
+        Returns:
+            str: Predicted digit (0-9) or error message
+        """
+        if not self.is_configured():
+            return "error: Model not configured"
+
+        try:
+            # Convert audio to numpy array
+            audio_array = self._convert_audio_bytes(audio_data)
+            if audio_array is None:
+                return "error: Audio conversion failed"
+
+            # Use faster-whisper with built-in VAD
+            segments, info = self.model.transcribe(
+                audio_array,
+                language="en",
+                # Built-in VAD parameters - much better than manual VAD
+                vad_filter=True,
+                vad_parameters=dict(
+                    min_silence_duration_ms=100,  # 100ms minimum silence
+                    speech_pad_ms=30  # 30ms padding around speech
+                )
+            )
+
+            # Process transcription results
+            transcriptions = []
+            for segment in segments:
+                text = segment.text.strip().lower()
+                if text:
+                    transcriptions.append(text)
+
+            if not transcriptions:
+                return "error: No speech detected"
+
+            # Combine all segments and extract digit
+            full_text = " ".join(transcriptions)
+            digit = self._text_to_digit(full_text)
+
+            logger.debug(f"Faster-Whisper: '{full_text}' -> '{digit}'")
+
+            if digit in "0123456789":
+                self.successful_predictions += 1
+                return digit
+            else:
+                self.failed_predictions += 1
+                return f"unclear: {full_text}"
+
+        except Exception as e:
+            logger.error(f"Faster-Whisper processing failed: {e}")
+            self.failed_predictions += 1
+            return f"error: {str(e)}"
+        finally:
+            self.total_predictions += 1
+
+    def _convert_audio_bytes(self, audio_data: bytes) -> Optional[np.ndarray]:
+        """Convert audio bytes to numpy array for faster-whisper."""
+        try:
+            # Check if it's a WAV file
+            if audio_data.startswith(b'RIFF'):
+                import soundfile as sf
+                audio_buffer = io.BytesIO(audio_data)
+                audio_array, sample_rate = sf.read(audio_buffer, dtype='float32')
+
+                # Convert stereo to mono if needed
+                if len(audio_array.shape) > 1:
+                    audio_array = np.mean(audio_array, axis=1)
+
+                return audio_array
+            else:
+                # Raw PCM data
+                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
+                return audio_array / 32768.0
+
+        except Exception as e:
+            logger.error(f"Audio conversion failed: {e}")
+            return None
+
+    def _text_to_digit(self, text: str) -> str:
+        """Convert transcribed text to digit."""
+        text = text.strip().lower()
+
+        # Remove common words
+        text = text.replace("the", "").replace("number", "").replace("digit", "")
+        text = text.strip()
+
+        # Direct mapping
+        if text in self.digit_map:
+            return self.digit_map[text]
+
+        # Word-by-word check
+        for word in text.split():
+            if word in self.digit_map:
+                return self.digit_map[word]
+
+        # Check for digits in text
+        digits = [char for char in text if char.isdigit()]
+        if digits:
+            return digits[0]
+
+        return text
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get model information."""
+        return {
+            'model_name': 'faster-whisper-tiny',
+            'model_type': 'Speech-to-Text with VAD',
+            'has_builtin_vad': True,
+            'device': self.device,
+            'available': FASTER_WHISPER_AVAILABLE
+        }
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get processing statistics."""
+        success_rate = self.successful_predictions / max(1, self.total_predictions)
+
+        return {
+            'total_predictions': self.total_predictions,
+            'successful_predictions': self.successful_predictions,
+            'failed_predictions': self.failed_predictions,
+            'success_rate': round(success_rate, 3),
+            'model_available': self.is_configured()
+        }
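
The transcribe call above relies on faster-whisper's bundled Silero VAD rather than a separate WebRTC VAD pass. The same call in isolation, assuming `pip install faster-whisper`; the silent input is a stand-in:

```python
# Standalone sketch of the VAD-filtered transcription; one second of
# silence stands in for real audio, so the segment list comes back empty.
import numpy as np
from faster_whisper import WhisperModel

model = WhisperModel("tiny", device="cpu", compute_type="int8")
audio = np.zeros(16000, dtype=np.float32)  # 1 s of silence at 16 kHz

segments, info = model.transcribe(
    audio,
    language="en",
    vad_filter=True,  # Silero VAD trims non-speech before decoding
    vad_parameters=dict(min_silence_duration_ms=100, speech_pad_ms=30),
)
print([seg.text for seg in segments])  # [] for silent input
```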
audio_processors/local_whisper.py
ADDED
@@ -0,0 +1,158 @@
+import logging
+import numpy as np
+from typing import Optional
+from .base_processor import AudioProcessor
+
+logger = logging.getLogger(__name__)
+
+class LocalWhisperProcessor(AudioProcessor):
+    """
+    Local Whisper model using transformers pipeline.
+    Fallback when API is unavailable.
+    """
+
+    def __init__(self):
+        super().__init__("Local Whisper (Tiny)")
+        self.pipeline = None
+        self.model_name = "openai/whisper-tiny"
+        self.is_initialized = False
+
+    def _initialize_model(self):
+        """Lazy initialization of the model"""
+        if self.is_initialized:
+            return
+
+        try:
+            logger.info(f"Loading local Whisper model: {self.model_name}")
+
+            from transformers import pipeline
+            import torch
+
+            # Use CPU for compatibility, GPU if available
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+
+            self.pipeline = pipeline(
+                "automatic-speech-recognition",
+                model=self.model_name,
+                device=device,
+                torch_dtype=torch.float32,  # Use float32 to avoid dtype issues
+                return_timestamps=False  # We only need text
+            )
+
+            logger.info(f"Local Whisper model loaded on {device}")
+            self.is_initialized = True
+
+        except ImportError as e:
+            logger.error("transformers library not installed. Run: pip install transformers torch")
+            raise Exception("transformers library required for local processing")
+        except Exception as e:
+            logger.error(f"Failed to load local Whisper model: {str(e)}")
+            raise Exception(f"Local model initialization failed: {str(e)}")
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio using local Whisper model.
+
+        Args:
+            audio_data: Raw audio bytes (WAV format preferred)
+
+        Returns:
+            Predicted digit as string ('0'-'9')
+
+        Raises:
+            Exception: If processing fails
+        """
+        try:
+            # Initialize model on first use
+            self._initialize_model()
+
+            # Convert audio bytes to numpy array
+            from utils.audio_utils import audio_to_numpy
+            audio_array, sample_rate = audio_to_numpy(audio_data)
+
+            # Resample to 16kHz if needed (Whisper expects 16kHz)
+            if sample_rate != 16000:
+                logger.debug(f"Resampling from {sample_rate}Hz to 16kHz")
+                import librosa
+                audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
+
+            # Process with pipeline
+            logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz")
+            result = self.pipeline(audio_array)
+
+            if not result or 'text' not in result:
+                logger.error(f"Unexpected pipeline result: {result}")
+                raise Exception("Invalid pipeline output")
+
+            transcribed_text = result['text'].strip().lower()
+            logger.debug(f"Local Whisper transcription: '{transcribed_text}'")
+
+            # Extract digit from transcription
+            predicted_digit = self._extract_digit(transcribed_text)
+
+            if predicted_digit is None:
+                logger.warning(f"No digit found in transcription: '{transcribed_text}'")
+                return "?"
+
+            return predicted_digit
+
+        except Exception as e:
+            logger.error(f"Local Whisper processing failed: {str(e)}")
+            raise Exception(f"Local processing error: {str(e)}")
+
+    def _extract_digit(self, text: str) -> Optional[str]:
+        """
+        Extract digit from transcribed text.
+        Handles both numerical ('1', '2') and word forms ('one', 'two').
+        """
+        import re
+
+        # Word to digit mapping
+        word_to_digit = {
+            'zero': '0', 'oh': '0',
+            'one': '1', 'won': '1',
+            'two': '2', 'to': '2', 'too': '2',
+            'three': '3', 'tree': '3',
+            'four': '4', 'for': '4', 'fore': '4',
+            'five': '5',
+            'six': '6', 'sick': '6',
+            'seven': '7',
+            'eight': '8', 'ate': '8',
+            'nine': '9', 'niner': '9'
+        }
+
+        # First, try to find a direct digit
+        digit_match = re.search(r'\b([0-9])\b', text)
+        if digit_match:
+            return digit_match.group(1)
+
+        # Then try word forms
+        words = text.split()
+        for word in words:
+            clean_word = re.sub(r'[^\w]', '', word.lower())
+            if clean_word in word_to_digit:
+                return word_to_digit[clean_word]
+
+        # Try partial matches for robustness
+        for word, digit in word_to_digit.items():
+            if word in text:
+                return digit
+
+        return None
+
+    def is_configured(self) -> bool:
+        """Check if local model can be initialized."""
+        try:
+            import transformers
+            import torch
+            return True
+        except ImportError:
+            return False
+
+    def test_connection(self) -> bool:
+        """Test local model functionality."""
+        try:
+            self._initialize_model()
+            return True
+        except:
+            return False
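
The resampling branch matters because Whisper checkpoints are trained on 16 kHz audio. That step in isolation (librosa assumed installed; the 44.1 kHz buffer is a stand-in):

```python
# The resample step on its own; the random 44.1 kHz array is a placeholder.
import numpy as np
import librosa

orig_sr = 44100
audio = np.random.randn(orig_sr).astype(np.float32)  # 1 s at 44.1 kHz
audio_16k = librosa.resample(audio, orig_sr=orig_sr, target_sr=16000)
print(len(audio), "->", len(audio_16k))  # 44100 -> 16000
```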
audio_processors/mel_spectrogram.py
ADDED
@@ -0,0 +1,74 @@
+import numpy as np
+import logging
+from .base_processor import AudioProcessor
+
+logger = logging.getLogger(__name__)
+
+class MelSpectrogramProcessor(AudioProcessor):
+    """
+    Mel Spectrogram processor using mel-scale frequency analysis.
+
+    Future implementation will:
+    - Apply mel filterbank to frequency domain representation
+    - Use perceptually-motivated frequency scaling
+    - Feed mel spectrogram features to deep learning model
+
+    Currently returns placeholder '00' for testing UI functionality.
+    """
+
+    def __init__(self):
+        super().__init__("Mel Spectrogram")
+        logger.info("Mel Spectrogram processor initialized (PLACEHOLDER MODE)")
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio using mel-scale spectrogram analysis.
+
+        PLACEHOLDER IMPLEMENTATION:
+        Currently returns '00' for UI testing purposes.
+
+        Future implementation will:
+        1. Convert audio bytes to numpy array
+        2. Compute STFT of the audio signal
+        3. Apply mel filterbank to convert to mel scale
+        4. Take logarithm for perceptual scaling
+        5. Feed to trained neural network (CNN/RNN)
+        6. Return predicted digit
+
+        Args:
+            audio_data: Raw audio bytes
+
+        Returns:
+            Predicted digit as string (currently '00')
+        """
+        logger.debug("Processing audio with Mel Spectrogram (placeholder)")
+
+        # Simulate processing time
+        import time
+        time.sleep(0.15)
+
+        # TODO: Implement actual mel spectrogram processing:
+        # 1. audio_array = np.frombuffer(audio_data, dtype=np.float32)
+        # 2. mel_spec = librosa.feature.melspectrogram(
+        #        y=audio_array,
+        #        sr=sample_rate,
+        #        n_mels=128,
+        #        fmax=8000
+        #    )
+        # 3. mel_db = librosa.power_to_db(mel_spec, ref=np.max)
+        # 4. prediction = self.neural_model.predict(mel_db)
+        # 5. return str(np.argmax(prediction))
+
+        return '00'
+
+    def get_model_info(self) -> dict:
+        """Get information about the mel spectrogram model."""
+        return {
+            'method': 'Mel Spectrogram',
+            'status': 'PLACEHOLDER',
+            'features': 'Mel-scale frequency representation',
+            'classifier': 'CNN/RNN (not implemented)',
+            'n_mels': 128,
+            'fmax': 8000,
+            'expected_inference_time': '<500ms'
+        }
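
A runnable counterpart to the TODO above, stopping at the log-mel matrix a trained CNN/RNN would consume (librosa assumed installed; the random signal is a stand-in):

```python
# Feature-extraction sketch only; no classifier exists yet in this commit.
import numpy as np
import librosa

sample_rate = 16000
audio_array = np.random.randn(sample_rate).astype(np.float32)  # stand-in signal

mel_spec = librosa.feature.melspectrogram(y=audio_array, sr=sample_rate,
                                          n_mels=128, fmax=8000)
mel_db = librosa.power_to_db(mel_spec, ref=np.max)  # log (dB) scaling
print(mel_db.shape)  # (128, n_frames)
```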
audio_processors/mfcc_processor.py
ADDED
@@ -0,0 +1,79 @@
+import numpy as np
+import logging
+from .base_processor import AudioProcessor
+
+logger = logging.getLogger(__name__)
+
+class MFCCProcessor(AudioProcessor):
+    """
+    MFCC (Mel-Frequency Cepstral Coefficients) processor.
+
+    Future implementation will:
+    - Extract MFCC features (typically 12-13 coefficients)
+    - Apply DCT (Discrete Cosine Transform) to mel spectrogram
+    - Use traditional ML classifier (SVM, Random Forest, etc.)
+
+    Currently returns placeholder '00' for testing UI functionality.
+    """
+
+    def __init__(self):
+        super().__init__("MFCC")
+        logger.info("MFCC processor initialized (PLACEHOLDER MODE)")
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio using MFCC feature extraction.
+
+        PLACEHOLDER IMPLEMENTATION:
+        Currently returns '00' for UI testing purposes.
+
+        Future implementation will:
+        1. Convert audio bytes to numpy array
+        2. Compute mel spectrogram of the audio
+        3. Apply DCT to get cepstral coefficients
+        4. Extract first 12-13 MFCC coefficients
+        5. Optionally add delta and delta-delta features
+        6. Feed to trained classifier (SVM/Random Forest)
+        7. Return predicted digit
+
+        Args:
+            audio_data: Raw audio bytes
+
+        Returns:
+            Predicted digit as string (currently '00')
+        """
+        logger.debug("Processing audio with MFCC (placeholder)")
+
+        # Simulate processing time (MFCC should be fastest)
+        import time
+        time.sleep(0.05)
+
+        # TODO: Implement actual MFCC processing:
+        # 1. audio_array = np.frombuffer(audio_data, dtype=np.float32)
+        # 2. mfccs = librosa.feature.mfcc(
+        #        y=audio_array,
+        #        sr=sample_rate,
+        #        n_mfcc=13,
+        #        n_fft=2048,
+        #        hop_length=512
+        #    )
+        # 3. # Optionally add delta features
+        # 4. delta_mfccs = librosa.feature.delta(mfccs)
+        # 5. features = np.concatenate([mfccs, delta_mfccs], axis=0)
+        # 6. prediction = self.svm_model.predict(features.T.flatten().reshape(1, -1))
+        # 7. return str(prediction[0])
+
+        return '00'
+
+    def get_model_info(self) -> dict:
+        """Get information about the MFCC model."""
+        return {
+            'method': 'MFCC (Mel-Frequency Cepstral Coefficients)',
+            'status': 'PLACEHOLDER',
+            'features': 'Cepstral coefficients with delta features',
+            'classifier': 'SVM/Random Forest (not implemented)',
+            'n_mfcc': 13,
+            'n_fft': 2048,
+            'hop_length': 512,
+            'expected_inference_time': '<100ms'
+        }
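
As with the mel processor, a minimal sketch of the MFCC TODO, assuming librosa and a scikit-learn-style `svm_model` (both hypothetical, not part of this commit):

# Hypothetical sketch of the MFCC TODO above; `svm_model` and `sample_rate`
# are assumptions, not part of this commit.
import librosa
import numpy as np

def predict_digit_from_mfcc(audio_data: bytes, svm_model, sample_rate: int) -> str:
    audio_array = np.frombuffer(audio_data, dtype=np.float32)
    # 13 cepstral coefficients, matching the settings in get_model_info()
    mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate,
                                 n_mfcc=13, n_fft=2048, hop_length=512)
    # Delta features capture frame-to-frame dynamics
    delta_mfccs = librosa.feature.delta(mfccs)
    features = np.concatenate([mfccs, delta_mfccs], axis=0)
    # Flatten to one feature vector per clip for a classical classifier
    prediction = svm_model.predict(features.T.flatten().reshape(1, -1))
    return str(prediction[0])

Note that a real classical classifier would need a fixed-length feature vector, so the audio would have to be padded or trimmed to a fixed duration first.
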
audio_processors/ml_mel_cnn_processor.py
ADDED
@@ -0,0 +1,307 @@
+"""
+ML Mel CNN Digit Processor
+Uses the trained Mel Spectrogram + 2D CNN model for digit classification
+"""
+
+import os
+import sys
+import time
+import logging
+from pathlib import Path
+from typing import Dict, Any, Optional, Union
+
+import numpy as np
+from .base_processor import AudioProcessor
+
+# Add project root to path for ML imports
+PROJECT_ROOT = Path(__file__).parent.parent
+sys.path.append(str(PROJECT_ROOT))
+
+# Import ML inference
+from ml_training.inference import load_classifier
+
+logger = logging.getLogger(__name__)
+
+class MLMelCNNProcessor(AudioProcessor):
+    """
+    ML-based Mel CNN digit processor using trained 2D CNN model.
+
+    Performance characteristics (based on training results):
+    - Test accuracy: 97.22%
+    - Inference time: ~3-5ms
+    - Model size: ~2.6MB
+    """
+
+    name = "ML Mel CNN (2D Conv)"
+
+    def __init__(self, model_dir: str = "models", device: str = "auto"):
+        """
+        Initialize ML Mel CNN processor.
+
+        Args:
+            model_dir: Directory containing trained models
+            device: Device to run inference on ('cpu', 'cuda', or 'auto')
+        """
+        super().__init__(self.name)
+
+        self.model_dir = Path(model_dir)
+        self.device = device if device != "auto" else None
+        self.classifier = None
+        self._configured = False
+
+        # Performance tracking
+        self.prediction_count = 0
+        self.total_inference_time = 0.0
+        self.last_prediction_time = None
+
+        # Try to load the model
+        self._initialize_classifier()
+
+        logger.info(f"ML Mel CNN Processor initialized (configured: {self._configured})")
+
+    def _initialize_classifier(self):
+        """Initialize the ML classifier."""
+        try:
+            # Check if model directory exists
+            if not self.model_dir.exists():
+                logger.warning(f"Model directory not found: {self.model_dir}")
+                return
+
+            # Load the Mel CNN classifier
+            self.classifier = load_classifier(
+                model_dir=str(self.model_dir),
+                pipeline_type='mel_cnn',
+                device=self.device
+            )
+
+            self._configured = True
+            logger.info("ML Mel CNN classifier loaded successfully")
+            logger.info(f"  Model device: {self.classifier.device}")
+            logger.info(f"  Parameters: {sum(p.numel() for p in self.classifier.model.parameters()):,}")
+
+        except Exception as e:
+            logger.error(f"Failed to load ML Mel CNN classifier: {str(e)}")
+            self.classifier = None
+            self._configured = False
+
+    def is_configured(self) -> bool:
+        """Check if the processor is properly configured."""
+        return self._configured and self.classifier is not None
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio and return predicted digit (required by base class).
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            predicted_digit: Predicted digit as string
+        """
+        return self.predict(audio_data)
+
+    def predict(self, audio_data: bytes) -> str:
+        """
+        Predict digit from audio data.
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            predicted_digit: Predicted digit as string
+        """
+        if not self.is_configured():
+            raise RuntimeError("ML Mel CNN processor not properly configured")
+
+        try:
+            # Convert audio with optimized format for ML models
+            from utils.audio_utils import convert_for_ml_models
+            optimized_audio = convert_for_ml_models(audio_data, 'mel_cnn')
+
+            # Convert audio bytes to numpy array
+            audio_array = self._bytes_to_audio_array(optimized_audio)
+
+            # Make prediction using ML classifier
+            start_time = time.time()
+            result = self.classifier.predict(
+                audio_array,
+                return_probabilities=True,
+                return_features=False
+            )
+            inference_time = time.time() - start_time
+
+            # Update performance tracking
+            self.prediction_count += 1
+            self.total_inference_time += inference_time
+            self.last_prediction_time = inference_time
+
+            predicted_digit = str(result['predicted_digit'])
+            confidence = result['confidence']
+
+            logger.debug(f"ML Mel CNN prediction: '{predicted_digit}' "
+                         f"(confidence: {confidence:.3f}, time: {inference_time*1000:.1f}ms)")
+
+            return predicted_digit
+
+        except Exception as e:
+            logger.error(f"ML Mel CNN prediction failed: {str(e)}")
+            raise
+
+    def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]:
+        """
+        Predict digit with detailed timing and confidence information.
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            result: Detailed prediction results
+        """
+        if not self.is_configured():
+            return {
+                'success': False,
+                'error': 'ML Mel CNN processor not properly configured',
+                'predicted_digit': None,
+                'inference_time': 0.0
+            }
+
+        try:
+            # Convert audio with optimized format for ML models
+            from utils.audio_utils import convert_for_ml_models
+            optimized_audio = convert_for_ml_models(audio_data, 'mel_cnn')
+
+            # Convert audio bytes to numpy array
+            audio_array = self._bytes_to_audio_array(optimized_audio)
+
+            # Make prediction using ML classifier
+            start_time = time.time()
+            ml_result = self.classifier.predict(
+                audio_array,
+                return_probabilities=True,
+                return_features=False
+            )
+            inference_time = time.time() - start_time
+
+            # Update performance tracking
+            self.prediction_count += 1
+            self.total_inference_time += inference_time
+            self.last_prediction_time = inference_time
+
+            # Format result
+            result = {
+                'success': True,
+                'predicted_digit': str(ml_result['predicted_digit']),
+                'confidence': ml_result['confidence'],
+                'inference_time': inference_time,
+                'class_probabilities': {
+                    str(k): float(v) for k, v in ml_result['class_probabilities'].items()
+                },
+                'top_3_predictions': [
+                    {
+                        'digit': str(pred['digit']),
+                        'probability': pred['probability']
+                    }
+                    for pred in ml_result['top_3_predictions']
+                ],
+                'method': self.name,
+                'model_type': 'ml_mel_cnn',
+                'timestamp': time.time()
+            }
+
+            logger.debug(f"ML Mel CNN detailed prediction: '{result['predicted_digit']}' "
+                         f"(confidence: {result['confidence']:.3f}, "
+                         f"time: {inference_time*1000:.1f}ms)")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"ML Mel CNN prediction with timing failed: {str(e)}")
+            return {
+                'success': False,
+                'error': str(e),
+                'predicted_digit': None,
+                'inference_time': 0.0,
+                'method': self.name,
+                'model_type': 'ml_mel_cnn',
+                'timestamp': time.time()
+            }
+
+    def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray:
+        """Convert audio bytes to numpy array."""
+        try:
+            # Try to interpret as int16 PCM first (most common)
+            audio_array = np.frombuffer(audio_data, dtype=np.int16)
+
+            # Convert to float32 and normalize
+            audio_array = audio_array.astype(np.float32) / 32768.0
+
+            # If the array is too short, pad it
+            if len(audio_array) < 1000:  # Less than ~60ms at 16kHz
+                # Pad with zeros to minimum length
+                audio_array = np.pad(audio_array, (0, 1000 - len(audio_array)))
+
+            return audio_array
+
+        except Exception as e:
+            logger.error(f"Failed to convert audio bytes to array: {str(e)}")
+            # Return a small zero array as fallback
+            return np.zeros(1000, dtype=np.float32)
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get processor performance statistics."""
+        stats = super().get_stats()
+
+        if self.prediction_count > 0:
+            stats.update({
+                'ml_predictions': self.prediction_count,
+                'average_inference_time': self.total_inference_time / self.prediction_count,
+                'last_inference_time': self.last_prediction_time,
+                'throughput_per_second': self.prediction_count / self.total_inference_time if self.total_inference_time > 0 else 0,
+                'model_configured': self.is_configured()
+            })
+
+        if self.classifier:
+            # Get ML classifier performance stats
+            ml_stats = self.classifier.get_performance_stats()
+            stats['ml_classifier_stats'] = ml_stats
+
+        return stats
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get information about the loaded model."""
+        if not self.is_configured():
+            return {'error': 'Model not loaded'}
+
+        try:
+            info = {
+                'pipeline_type': 'mel_cnn',
+                'model_class': self.classifier.model.__class__.__name__,
+                'device': str(self.classifier.device),
+                'parameters': sum(p.numel() for p in self.classifier.model.parameters()),
+                'feature_extractor': self.classifier.feature_extractor.__class__.__name__,
+                'has_scaler': self.classifier.scaler is not None,
+                'expected_sample_rate': 8000,
+                'expected_audio_length': 8000,  # 1 second at 8kHz
+                'input_shape': '(1, 64, 51)',  # Mel spectrogram shape
+                'model_architecture': '2D CNN'
+            }
+
+            if hasattr(self.classifier, 'model_path'):
+                info['model_path'] = str(self.classifier.model_path)
+
+            return info
+
+        except Exception as e:
+            logger.error(f"Failed to get model info: {str(e)}")
+            return {'error': str(e)}
+
+    def benchmark_speed(self, num_samples: int = 100) -> Dict[str, Any]:
+        """Benchmark inference speed."""
+        if not self.is_configured():
+            return {'error': 'Model not configured'}
+
+        try:
+            return self.classifier.benchmark_speed(num_samples)
+        except Exception as e:
+            logger.error(f"Benchmark failed: {str(e)}")
+            return {'error': str(e)}
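
A usage sketch of the processor above. It assumes the checkpoint under models/ loads successfully; `sample.raw` is a hypothetical 16-bit PCM recording, not part of this commit.

# Usage sketch; the input file is hypothetical.
from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor

processor = MLMelCNNProcessor(model_dir="models")
if processor.is_configured():
    with open("sample.raw", "rb") as f:  # hypothetical int16 PCM clip
        result = processor.predict_with_timing(f.read())
    print(result["predicted_digit"], f"{result['confidence']:.3f}")
else:
    print("Model checkpoint not found; predictions unavailable")
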
audio_processors/ml_mfcc_processor.py
ADDED
@@ -0,0 +1,370 @@
+"""
+ML MFCC Digit Processor
+Uses the trained MFCC + Dense NN model for digit classification
+"""
+
+import os
+import sys
+import time
+import logging
+from pathlib import Path
+from typing import Dict, Any, Optional, Union
+
+import numpy as np
+from .base_processor import AudioProcessor
+
+# Add project root to path for ML imports
+PROJECT_ROOT = Path(__file__).parent.parent
+sys.path.append(str(PROJECT_ROOT))
+
+# Import ML inference
+from ml_training.inference import load_classifier
+
+logger = logging.getLogger(__name__)
+
+class MLMFCCProcessor(AudioProcessor):
+    """
+    ML-based MFCC digit processor using trained Dense NN model.
+
+    Performance characteristics (based on training results):
+    - Test accuracy: 98.52%
+    - Inference time: ~1-2ms
+    - Model size: ~0.3MB
+    """
+
+    name = "ML MFCC + Dense NN (Best)"
+
+    def __init__(self, model_dir: str = "models", device: str = "auto"):
+        """
+        Initialize ML MFCC processor.
+
+        Args:
+            model_dir: Directory containing trained models
+            device: Device to run inference on ('cpu', 'cuda', or 'auto')
+        """
+        super().__init__(self.name)
+
+        self.model_dir = Path(model_dir)
+        self.device = device if device != "auto" else None
+        self.classifier = None
+        self._configured = False
+
+        # Performance tracking
+        self.prediction_count = 0
+        self.total_inference_time = 0.0
+        self.last_prediction_time = None
+
+        # Try to load the model
+        self._initialize_classifier()
+
+        logger.info(f"ML MFCC Processor initialized (configured: {self._configured})")
+
+    def _initialize_classifier(self):
+        """Initialize the ML classifier."""
+        try:
+            # Check if model directory exists
+            if not self.model_dir.exists():
+                logger.warning(f"Model directory not found: {self.model_dir}")
+                return
+
+            # Load the MFCC classifier
+            self.classifier = load_classifier(
+                model_dir=str(self.model_dir),
+                pipeline_type='mfcc',
+                device=self.device
+            )
+
+            self._configured = True
+            logger.info("ML MFCC classifier loaded successfully")
+            logger.info(f"  Model device: {self.classifier.device}")
+            logger.info(f"  Parameters: {sum(p.numel() for p in self.classifier.model.parameters()):,}")
+
+        except Exception as e:
+            logger.error(f"Failed to load ML MFCC classifier: {str(e)}")
+            self.classifier = None
+            self._configured = False
+
+    def is_configured(self) -> bool:
+        """Check if the processor is properly configured."""
+        return self._configured and self.classifier is not None
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio and return predicted digit (required by base class).
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            predicted_digit: Predicted digit as string
+        """
+        return self.predict(audio_data)
+
+    def predict(self, audio_data: bytes) -> str:
+        """
+        Predict digit from audio data.
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            predicted_digit: Predicted digit as string
+        """
+        if not self.is_configured():
+            raise RuntimeError("ML MFCC processor not properly configured")
+
+        try:
+            # Convert audio with optimized format for ML models
+            from utils.audio_utils import convert_for_ml_models
+            optimized_audio = convert_for_ml_models(audio_data, 'mfcc')
+
+            # Convert audio bytes to numpy array
+            audio_array = self._bytes_to_audio_array(optimized_audio)
+
+            # No audio preprocessing needed - normalization happens at feature level in ML pipeline
+
+            # Make prediction using ML classifier
+            start_time = time.time()
+            result = self.classifier.predict(
+                audio_array,
+                return_probabilities=True,
+                return_features=False
+            )
+            inference_time = time.time() - start_time
+
+            # Update performance tracking
+            self.prediction_count += 1
+            self.total_inference_time += inference_time
+            self.last_prediction_time = inference_time
+
+            predicted_digit = str(result['predicted_digit'])
+            confidence = result['confidence']
+
+            # Debug logging for predictions (temporary)
+            if 'probabilities' in result:
+                probs = result.get('probabilities', [])
+                if len(probs) >= 10:
+                    top_predictions = sorted(enumerate(probs), key=lambda x: x[1], reverse=True)
+                    logger.debug(f"MFCC Top 3 predictions: {[(str(d), f'{p:.3f}') for d, p in top_predictions[:3]]}")
+
+            logger.debug(f"ML MFCC prediction: '{predicted_digit}' "
+                         f"(confidence: {confidence:.3f}, time: {inference_time*1000:.1f}ms)")
+
+            return predicted_digit
+
+        except Exception as e:
+            logger.error(f"ML MFCC prediction failed: {str(e)}")
+            raise
+
+    def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]:
+        """
+        Predict digit with detailed timing and confidence information.
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            result: Detailed prediction results
+        """
+        if not self.is_configured():
+            return {
+                'success': False,
+                'error': 'ML MFCC processor not properly configured',
+                'predicted_digit': None,
+                'inference_time': 0.0
+            }
+
+        try:
+            # Convert audio with optimized format for ML models
+            from utils.audio_utils import convert_for_ml_models
+            optimized_audio = convert_for_ml_models(audio_data, 'mfcc')
+
+            # Convert audio bytes to numpy array
+            audio_array = self._bytes_to_audio_array(optimized_audio)
+
+            # No audio preprocessing needed - normalization happens at feature level in ML pipeline
+
+            # Make prediction using ML classifier
+            start_time = time.time()
+            ml_result = self.classifier.predict(
+                audio_array,
+                return_probabilities=True,
+                return_features=False
+            )
+            inference_time = time.time() - start_time
+
+            # Update performance tracking
+            self.prediction_count += 1
+            self.total_inference_time += inference_time
+            self.last_prediction_time = inference_time
+
+            # Format result
+            result = {
+                'success': True,
+                'predicted_digit': str(ml_result['predicted_digit']),
+                'confidence': ml_result['confidence'],
+                'inference_time': inference_time,
+                'class_probabilities': {
+                    str(k): float(v) for k, v in ml_result['class_probabilities'].items()
+                },
+                'top_3_predictions': [
+                    {
+                        'digit': str(pred['digit']),
+                        'probability': pred['probability']
+                    }
+                    for pred in ml_result['top_3_predictions']
+                ],
+                'method': self.name,
+                'model_type': 'ml_mfcc',
+                'timestamp': time.time()
+            }
+
+            logger.debug(f"ML MFCC detailed prediction: '{result['predicted_digit']}' "
+                         f"(confidence: {result['confidence']:.3f}, "
+                         f"time: {inference_time*1000:.1f}ms)")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"ML MFCC prediction with timing failed: {str(e)}")
+            return {
+                'success': False,
+                'error': str(e),
+                'predicted_digit': None,
+                'inference_time': 0.0,
+                'method': self.name,
+                'model_type': 'ml_mfcc',
+                'timestamp': time.time()
+            }
+
+    def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray:
+        """Convert audio bytes to numpy array."""
+        try:
+            # Try to interpret as int16 PCM first (most common)
+            audio_array = np.frombuffer(audio_data, dtype=np.int16)
+
+            # Convert to float32 and normalize
+            audio_array = audio_array.astype(np.float32) / 32768.0
+
+            # If the array is too short, pad it
+            if len(audio_array) < 1000:  # Less than ~60ms at 16kHz
+                # Pad with zeros to minimum length
+                audio_array = np.pad(audio_array, (0, 1000 - len(audio_array)))
+
+            return audio_array
+
+        except Exception as e:
+            logger.error(f"Failed to convert audio bytes to array: {str(e)}")
+            # Return a small zero array as fallback
+            return np.zeros(1000, dtype=np.float32)
+
+    def _preprocess_audio_for_mfcc(self, audio_array: np.ndarray) -> np.ndarray:
+        """
+        Apply MFCC-specific audio preprocessing to improve model performance.
+        This compensates for missing scaler normalization.
+
+        Args:
+            audio_array: Raw audio array
+
+        Returns:
+            preprocessed_audio: Audio array optimized for MFCC feature extraction
+        """
+        try:
+            # Remove DC component
+            audio_array = audio_array - np.mean(audio_array)
+
+            # Apply gentle normalization to handle volume variations
+            # This helps compensate for the missing feature scaler
+            max_val = np.max(np.abs(audio_array))
+            if max_val > 0:
+                audio_array = audio_array / max_val * 0.7  # Scale to 70% of max to avoid clipping
+
+            # Apply a gentle high-pass filter to remove low-frequency noise
+            # This improves MFCC feature quality
+            from scipy import signal
+            if len(audio_array) > 100:  # Only apply if we have enough samples
+                # Simple high-pass filter at ~300Hz for 8kHz sample rate
+                sos = signal.butter(2, 300, btype='high', fs=8000, output='sos')
+                audio_array = signal.sosfilt(sos, audio_array)
+
+            # Ensure we don't have any NaN or inf values
+            audio_array = np.nan_to_num(audio_array, nan=0.0, posinf=0.0, neginf=0.0)
+
+            logger.debug(f"MFCC preprocessing applied: range=[{np.min(audio_array):.3f}, {np.max(audio_array):.3f}], "
+                         f"mean={np.mean(audio_array):.3f}, std={np.std(audio_array):.3f}")
+
+            return audio_array
+
+        except ImportError:
+            # Fallback if scipy is not available - just normalize
+            logger.warning("Scipy not available, using basic normalization")
+            audio_array = audio_array - np.mean(audio_array)
+            max_val = np.max(np.abs(audio_array))
+            if max_val > 0:
+                audio_array = audio_array / max_val * 0.7
+            return audio_array
+
+        except Exception as e:
+            logger.error(f"MFCC preprocessing failed: {str(e)}")
+            # Return original array if preprocessing fails
+            return audio_array
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get processor performance statistics."""
+        stats = super().get_stats()
+
+        if self.prediction_count > 0:
+            stats.update({
+                'ml_predictions': self.prediction_count,
+                'average_inference_time': self.total_inference_time / self.prediction_count,
+                'last_inference_time': self.last_prediction_time,
+                'throughput_per_second': self.prediction_count / self.total_inference_time if self.total_inference_time > 0 else 0,
+                'model_configured': self.is_configured()
+            })
+
+        if self.classifier:
+            # Get ML classifier performance stats
+            ml_stats = self.classifier.get_performance_stats()
+            stats['ml_classifier_stats'] = ml_stats
+
+        return stats
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get information about the loaded model."""
+        if not self.is_configured():
+            return {'error': 'Model not loaded'}
+
+        try:
+            info = {
+                'pipeline_type': 'mfcc',
+                'model_class': self.classifier.model.__class__.__name__,
+                'device': str(self.classifier.device),
+                'parameters': sum(p.numel() for p in self.classifier.model.parameters()),
+                'feature_extractor': self.classifier.feature_extractor.__class__.__name__,
+                'has_scaler': self.classifier.scaler is not None,
+                'expected_sample_rate': 8000,
+                'expected_audio_length': 8000  # 1 second at 8kHz
+            }
+
+            if hasattr(self.classifier, 'model_path'):
+                info['model_path'] = str(self.classifier.model_path)
+
+            return info
+
+        except Exception as e:
+            logger.error(f"Failed to get model info: {str(e)}")
+            return {'error': str(e)}
+
+    def benchmark_speed(self, num_samples: int = 100) -> Dict[str, Any]:
+        """Benchmark inference speed."""
+        if not self.is_configured():
+            return {'error': 'Model not configured'}
+
+        try:
+            return self.classifier.benchmark_speed(num_samples)
+        except Exception as e:
+            logger.error(f"Benchmark failed: {str(e)}")
+            return {'error': str(e)}
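
The `_bytes_to_audio_array` helper shared by the ML processors maps int16 PCM onto [-1.0, 1.0) by dividing by 32768. A quick self-contained check of that conversion:

# Verifies the int16 -> float32 normalization used by _bytes_to_audio_array.
import numpy as np

pcm_bytes = np.array([0, 16384, -32768, 32767], dtype=np.int16).tobytes()
audio = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0
print(audio)  # [ 0.         0.5       -1.         0.9999695]
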
audio_processors/ml_raw_cnn_processor.py
ADDED
@@ -0,0 +1,307 @@
+"""
+ML Raw CNN Digit Processor
+Uses the trained Raw Waveform + 1D CNN model for digit classification
+"""
+
+import os
+import sys
+import time
+import logging
+from pathlib import Path
+from typing import Dict, Any, Optional, Union
+
+import numpy as np
+from .base_processor import AudioProcessor
+
+# Add project root to path for ML imports
+PROJECT_ROOT = Path(__file__).parent.parent
+sys.path.append(str(PROJECT_ROOT))
+
+# Import ML inference
+from ml_training.inference import load_classifier
+
+logger = logging.getLogger(__name__)
+
+class MLRawCNNProcessor(AudioProcessor):
+    """
+    ML-based Raw CNN digit processor using trained 1D CNN model.
+
+    Performance characteristics (based on training results):
+    - Test accuracy: 91.30%
+    - Inference time: ~5-8ms
+    - Model size: ~2.6MB
+    """
+
+    name = "ML Raw CNN (1D Conv)"
+
+    def __init__(self, model_dir: str = "models", device: str = "auto"):
+        """
+        Initialize ML Raw CNN processor.
+
+        Args:
+            model_dir: Directory containing trained models
+            device: Device to run inference on ('cpu', 'cuda', or 'auto')
+        """
+        super().__init__(self.name)
+
+        self.model_dir = Path(model_dir)
+        self.device = device if device != "auto" else None
+        self.classifier = None
+        self._configured = False
+
+        # Performance tracking
+        self.prediction_count = 0
+        self.total_inference_time = 0.0
+        self.last_prediction_time = None
+
+        # Try to load the model
+        self._initialize_classifier()
+
+        logger.info(f"ML Raw CNN Processor initialized (configured: {self._configured})")
+
+    def _initialize_classifier(self):
+        """Initialize the ML classifier."""
+        try:
+            # Check if model directory exists
+            if not self.model_dir.exists():
+                logger.warning(f"Model directory not found: {self.model_dir}")
+                return
+
+            # Load the Raw CNN classifier
+            self.classifier = load_classifier(
+                model_dir=str(self.model_dir),
+                pipeline_type='raw_cnn',
+                device=self.device
+            )
+
+            self._configured = True
+            logger.info("ML Raw CNN classifier loaded successfully")
+            logger.info(f"  Model device: {self.classifier.device}")
+            logger.info(f"  Parameters: {sum(p.numel() for p in self.classifier.model.parameters()):,}")
+
+        except Exception as e:
+            logger.error(f"Failed to load ML Raw CNN classifier: {str(e)}")
+            self.classifier = None
+            self._configured = False
+
+    def is_configured(self) -> bool:
+        """Check if the processor is properly configured."""
+        return self._configured and self.classifier is not None
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio and return predicted digit (required by base class).
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            predicted_digit: Predicted digit as string
+        """
+        return self.predict(audio_data)
+
+    def predict(self, audio_data: bytes) -> str:
+        """
+        Predict digit from audio data.
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            predicted_digit: Predicted digit as string
+        """
+        if not self.is_configured():
+            raise RuntimeError("ML Raw CNN processor not properly configured")
+
+        try:
+            # Convert audio with optimized format for ML models
+            from utils.audio_utils import convert_for_ml_models
+            optimized_audio = convert_for_ml_models(audio_data, 'raw_cnn')
+
+            # Convert audio bytes to numpy array
+            audio_array = self._bytes_to_audio_array(optimized_audio)
+
+            # Make prediction using ML classifier
+            start_time = time.time()
+            result = self.classifier.predict(
+                audio_array,
+                return_probabilities=True,
+                return_features=False
+            )
+            inference_time = time.time() - start_time
+
+            # Update performance tracking
+            self.prediction_count += 1
+            self.total_inference_time += inference_time
+            self.last_prediction_time = inference_time
+
+            predicted_digit = str(result['predicted_digit'])
+            confidence = result['confidence']
+
+            logger.debug(f"ML Raw CNN prediction: '{predicted_digit}' "
+                         f"(confidence: {confidence:.3f}, time: {inference_time*1000:.1f}ms)")
+
+            return predicted_digit
+
+        except Exception as e:
+            logger.error(f"ML Raw CNN prediction failed: {str(e)}")
+            raise
+
+    def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]:
+        """
+        Predict digit with detailed timing and confidence information.
+
+        Args:
+            audio_data: Raw audio data in bytes
+
+        Returns:
+            result: Detailed prediction results
+        """
+        if not self.is_configured():
+            return {
+                'success': False,
+                'error': 'ML Raw CNN processor not properly configured',
+                'predicted_digit': None,
+                'inference_time': 0.0
+            }
+
+        try:
+            # Convert audio with optimized format for ML models
+            from utils.audio_utils import convert_for_ml_models
+            optimized_audio = convert_for_ml_models(audio_data, 'raw_cnn')
+
+            # Convert audio bytes to numpy array
+            audio_array = self._bytes_to_audio_array(optimized_audio)
+
+            # Make prediction using ML classifier
+            start_time = time.time()
+            ml_result = self.classifier.predict(
+                audio_array,
+                return_probabilities=True,
+                return_features=False
+            )
+            inference_time = time.time() - start_time
+
+            # Update performance tracking
+            self.prediction_count += 1
+            self.total_inference_time += inference_time
+            self.last_prediction_time = inference_time
+
+            # Format result
+            result = {
+                'success': True,
+                'predicted_digit': str(ml_result['predicted_digit']),
+                'confidence': ml_result['confidence'],
+                'inference_time': inference_time,
+                'class_probabilities': {
+                    str(k): float(v) for k, v in ml_result['class_probabilities'].items()
+                },
+                'top_3_predictions': [
+                    {
+                        'digit': str(pred['digit']),
+                        'probability': pred['probability']
+                    }
+                    for pred in ml_result['top_3_predictions']
+                ],
+                'method': self.name,
+                'model_type': 'ml_raw_cnn',
+                'timestamp': time.time()
+            }
+
+            logger.debug(f"ML Raw CNN detailed prediction: '{result['predicted_digit']}' "
+                         f"(confidence: {result['confidence']:.3f}, "
+                         f"time: {inference_time*1000:.1f}ms)")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"ML Raw CNN prediction with timing failed: {str(e)}")
+            return {
+                'success': False,
+                'error': str(e),
+                'predicted_digit': None,
+                'inference_time': 0.0,
+                'method': self.name,
+                'model_type': 'ml_raw_cnn',
+                'timestamp': time.time()
+            }
+
+    def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray:
+        """Convert audio bytes to numpy array."""
+        try:
+            # Try to interpret as int16 PCM first (most common)
+            audio_array = np.frombuffer(audio_data, dtype=np.int16)
+
+            # Convert to float32 and normalize
+            audio_array = audio_array.astype(np.float32) / 32768.0
+
+            # If the array is too short, pad it
+            if len(audio_array) < 1000:  # Less than ~60ms at 16kHz
+                # Pad with zeros to minimum length
+                audio_array = np.pad(audio_array, (0, 1000 - len(audio_array)))
+
+            return audio_array
+
+        except Exception as e:
+            logger.error(f"Failed to convert audio bytes to array: {str(e)}")
+            # Return a small zero array as fallback
+            return np.zeros(1000, dtype=np.float32)
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get processor performance statistics."""
+        stats = super().get_stats()
+
+        if self.prediction_count > 0:
+            stats.update({
+                'ml_predictions': self.prediction_count,
+                'average_inference_time': self.total_inference_time / self.prediction_count,
+                'last_inference_time': self.last_prediction_time,
+                'throughput_per_second': self.prediction_count / self.total_inference_time if self.total_inference_time > 0 else 0,
+                'model_configured': self.is_configured()
+            })
+
+        if self.classifier:
+            # Get ML classifier performance stats
+            ml_stats = self.classifier.get_performance_stats()
+            stats['ml_classifier_stats'] = ml_stats
+
+        return stats
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get information about the loaded model."""
+        if not self.is_configured():
+            return {'error': 'Model not loaded'}
+
+        try:
+            info = {
+                'pipeline_type': 'raw_cnn',
+                'model_class': self.classifier.model.__class__.__name__,
+                'device': str(self.classifier.device),
+                'parameters': sum(p.numel() for p in self.classifier.model.parameters()),
+                'feature_extractor': None,  # Raw waveforms don't need feature extraction
+                'has_scaler': False,
+                'expected_sample_rate': 8000,
+                'expected_audio_length': 8000,  # 1 second at 8kHz
+                'input_shape': '(1, 1, 8000)',  # Raw waveform shape
+                'model_architecture': '1D CNN'
+            }
+
+            if hasattr(self.classifier, 'model_path'):
+                info['model_path'] = str(self.classifier.model_path)
+
+            return info
+
+        except Exception as e:
+            logger.error(f"Failed to get model info: {str(e)}")
+            return {'error': str(e)}
+
+    def benchmark_speed(self, num_samples: int = 100) -> Dict[str, Any]:
+        """Benchmark inference speed."""
+        if not self.is_configured():
+            return {'error': 'Model not configured'}
+
+        try:
+            return self.classifier.benchmark_speed(num_samples)
+        except Exception as e:
+            logger.error(f"Benchmark failed: {str(e)}")
+            return {'error': str(e)}
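
Since the three ML processors share one interface, they can be compared side by side. A hypothetical sketch, assuming all three checkpoints under models/ load:

# Hypothetical side-by-side check of the three trained pipelines.
from audio_processors.ml_mfcc_processor import MLMFCCProcessor
from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor
from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor

for cls in (MLMFCCProcessor, MLMelCNNProcessor, MLRawCNNProcessor):
    p = cls(model_dir="models")
    print(f"{p.name}: configured={p.is_configured()}")
    if p.is_configured():
        print(p.benchmark_speed(num_samples=20))  # latency stats from the classifier
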
audio_processors/raw_spectrogram.py
ADDED
@@ -0,0 +1,69 @@
+import numpy as np
+import logging
+from .base_processor import AudioProcessor
+
+logger = logging.getLogger(__name__)
+
+class RawSpectrogramProcessor(AudioProcessor):
+    """
+    Raw Spectrogram processor using STFT (Short-Time Fourier Transform).
+
+    Future implementation will:
+    - Apply STFT to audio data for time-frequency representation
+    - Use CNN classifier trained on spectrogram images
+    - Process raw frequency domain features without mel scaling
+
+    Currently returns placeholder '00' for testing UI functionality.
+    """
+
+    def __init__(self):
+        super().__init__("Raw Spectrogram")
+        logger.info("Raw Spectrogram processor initialized (PLACEHOLDER MODE)")
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio using raw spectrogram analysis.
+
+        PLACEHOLDER IMPLEMENTATION:
+        Currently returns '00' for UI testing purposes.
+
+        Future implementation will:
+        1. Convert audio bytes to numpy array
+        2. Apply STFT with appropriate window size and overlap
+        3. Create time-frequency representation
+        4. Normalize spectrogram values
+        5. Feed to trained CNN model
+        6. Return predicted digit
+
+        Args:
+            audio_data: Raw audio bytes
+
+        Returns:
+            Predicted digit as string (currently '00')
+        """
+        logger.debug("Processing audio with Raw Spectrogram (placeholder)")
+
+        # Simulate processing time
+        import time
+        time.sleep(0.1)
+
+        # TODO: Implement actual STFT-based processing:
+        # 1. audio_array = np.frombuffer(audio_data, dtype=np.float32)
+        # 2. stft_result = np.abs(librosa.stft(audio_array, n_fft=2048, hop_length=512))
+        # 3. spectrogram = librosa.amplitude_to_db(stft_result, ref=np.max)
+        # 4. prediction = self.cnn_model.predict(spectrogram)
+        # 5. return str(np.argmax(prediction))
+
+        return '00'
+
+    def get_model_info(self) -> dict:
+        """Get information about the raw spectrogram model."""
+        return {
+            'method': 'Raw Spectrogram (STFT)',
+            'status': 'PLACEHOLDER',
+            'features': 'Time-frequency representation',
+            'classifier': 'CNN (not implemented)',
+            'window_size': 2048,
+            'hop_length': 512,
+            'expected_inference_time': '<1s'
+        }
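
A minimal sketch of the STFT TODO above, assuming librosa and a trained `cnn_model` (neither is part of this commit):

# Hypothetical sketch of the raw-spectrogram TODO above; `cnn_model` is an
# assumption, not part of this commit.
import librosa
import numpy as np

def predict_digit_from_stft(audio_data: bytes, cnn_model) -> str:
    audio_array = np.frombuffer(audio_data, dtype=np.float32)
    # Magnitude STFT with the window/hop settings from get_model_info()
    stft_mag = np.abs(librosa.stft(audio_array, n_fft=2048, hop_length=512))
    # dB scaling normalizes the wide dynamic range of raw magnitudes
    spectrogram = librosa.amplitude_to_db(stft_mag, ref=np.max)
    prediction = cnn_model.predict(spectrogram[np.newaxis, ..., np.newaxis])
    return str(np.argmax(prediction))
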
audio_processors/wav2vec2_processor.py
ADDED
@@ -0,0 +1,170 @@
+import logging
+import numpy as np
+from typing import Optional
+from .base_processor import AudioProcessor
+
+logger = logging.getLogger(__name__)
+
+class Wav2Vec2Processor(AudioProcessor):
+    """
+    Wav2Vec2 model processor for speech recognition.
+    Lightweight alternative to Whisper.
+    """
+
+    def __init__(self):
+        super().__init__("Wav2Vec2 (Facebook)")
+        self.processor = None
+        self.model = None
+        self.model_name = "facebook/wav2vec2-base-960h"
+        self.is_initialized = False
+
+    def _initialize_model(self):
+        """Lazy initialization of the model"""
+        if self.is_initialized:
+            return
+
+        try:
+            logger.info(f"Loading Wav2Vec2 model: {self.model_name}")
+
+            from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+            import torch
+
+            # Load processor and model
+            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
+            self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
+
+            # Move to GPU if available
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.model = self.model.to(device)
+            self.device = device
+
+            logger.info(f"Wav2Vec2 model loaded on {device}")
+            self.is_initialized = True
+
+        except ImportError:
+            logger.error("transformers library not installed. Run: pip install transformers torch")
+            raise Exception("transformers library required for Wav2Vec2 processing")
+        except Exception as e:
+            logger.error(f"Failed to load Wav2Vec2 model: {str(e)}")
+            raise Exception(f"Wav2Vec2 model initialization failed: {str(e)}")
+
+    def process_audio(self, audio_data: bytes) -> str:
+        """
+        Process audio using Wav2Vec2 model.
+
+        Args:
+            audio_data: Raw audio bytes (WAV format preferred)
+
+        Returns:
+            Predicted digit as string ('0'-'9')
+
+        Raises:
+            Exception: If processing fails
+        """
+        try:
+            # Initialize model on first use
+            self._initialize_model()
+
+            # Convert audio bytes to numpy array
+            from utils.audio_utils import audio_to_numpy
+            audio_array, sample_rate = audio_to_numpy(audio_data)
+
+            # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz)
+            if sample_rate != 16000:
+                logger.debug(f"Resampling from {sample_rate}Hz to 16kHz")
+                import librosa
+                audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
+
+            logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz")
+
+            # Process with Wav2Vec2
+            import torch
+
+            # Tokenize audio
+            input_values = self.processor(
+                audio_array,
+                return_tensors="pt",
+                padding="longest",
+                sampling_rate=16000
+            ).input_values.to(self.device)
+
+            # Get logits
+            with torch.no_grad():
+                logits = self.model(input_values).logits
+
+            # Get predicted tokens
+            predicted_ids = torch.argmax(logits, dim=-1)
+
+            # Decode transcription
+            transcription = self.processor.batch_decode(predicted_ids)[0].lower().strip()
+            logger.debug(f"Wav2Vec2 transcription: '{transcription}'")
+
+            # Extract digit from transcription
+            predicted_digit = self._extract_digit(transcription)
+
+            if predicted_digit is None:
+                logger.warning(f"No digit found in transcription: '{transcription}'")
+                return "?"
+
+            return predicted_digit
+
+        except Exception as e:
+            logger.error(f"Wav2Vec2 processing failed: {str(e)}")
+            raise Exception(f"Wav2Vec2 processing error: {str(e)}")
+
+    def _extract_digit(self, text: str) -> Optional[str]:
+        """
+        Extract digit from transcribed text.
+        Handles both numerical ('1', '2') and word forms ('one', 'two').
+        """
+        import re
+
+        # Word to digit mapping
+        word_to_digit = {
+            'zero': '0', 'oh': '0',
+            'one': '1', 'won': '1',
+            'two': '2', 'to': '2', 'too': '2',
+            'three': '3', 'tree': '3',
+            'four': '4', 'for': '4', 'fore': '4', 'full': '4',  # "full" often misheard as "four"
+            'five': '5',
+            'six': '6', 'sick': '6',
+            'seven': '7',
+            'eight': '8', 'ate': '8',
+            'nine': '9', 'niner': '9'
+        }
+
+        # First, try to find a direct digit
+        digit_match = re.search(r'\b([0-9])\b', text)
+        if digit_match:
+            return digit_match.group(1)
+
+        # Then try word forms
+        words = text.split()
+        for word in words:
+            clean_word = re.sub(r'[^\w]', '', word.lower())
+            if clean_word in word_to_digit:
+                return word_to_digit[clean_word]
+
+        # Try partial matches for robustness
+        for word, digit in word_to_digit.items():
+            if word in text:
+                return digit
+
+        return None
+
+    def is_configured(self) -> bool:
+        """Check if Wav2Vec2 model can be initialized."""
+        try:
+            import transformers
+            import torch
+            return True
+        except ImportError:
+            return False
+
+    def test_connection(self) -> bool:
+        """Test Wav2Vec2 model functionality."""
+        try:
+            self._initialize_model()
+            return True
+        except Exception:
+            return False
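
Because `__init__` defers model loading, the transcript-to-digit fallback chain above can be exercised without downloading any weights. A small illustration (the inputs are made up):

# Illustrates the three fallbacks in _extract_digit; no model download needed,
# since the Wav2Vec2 weights load lazily on the first process_audio call.
from audio_processors.wav2vec2_processor import Wav2Vec2Processor

p = Wav2Vec2Processor()
print(p._extract_digit("the number is 7"))  # '7'  -- direct digit match
print(p._extract_digit("tree"))             # '3'  -- mapped common mishearing
print(p._extract_digit("hello"))            # None -- no digit recoverable
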
audio_processors/whisper_digit_processor.py
ADDED
@@ -0,0 +1,429 @@
"""
Whisper-based digit recognition processor
Specialized implementation for spoken digit recognition (0-9)
"""

import numpy as np
import io
import time
import logging
from typing import Dict, Any, Optional
import torch
from transformers import pipeline
import soundfile as sf

from .base_processor import AudioProcessor

logger = logging.getLogger(__name__)

class WhisperDigitProcessor(AudioProcessor):
    """
    Whisper-based digit recognition processor using Hugging Face transformers.
    Optimized for single digit recognition with mapping from text to numbers.
    """

    def __init__(self):
        """Initialize Whisper digit processor with optimized settings."""
        super().__init__("Whisper Digit Recognition")
        self.model = None
        self.device = 0 if torch.cuda.is_available() else -1

        # Digit mapping for text-to-number conversion
        self.digit_map = {
            "zero": "0", "one": "1", "two": "2", "three": "3",
            "four": "4", "five": "5", "six": "6", "seven": "7",
            "eight": "8", "nine": "9",
            # Common variations and alternatives
            "oh": "0", "o": "0",
            "for": "4", "fore": "4", "to": "2", "too": "2", "tu": "2",
            "tree": "3", "free": "3", "ate": "8", "ait": "8"
        }

        # Reverse mapping for validation
        self.number_words = set(self.digit_map.keys())

        # Statistics tracking
        self.total_predictions = 0
        self.successful_predictions = 0
        self.failed_predictions = 0
        self.average_inference_time = 0.0

        self._initialize_model()

    def _initialize_model(self):
        """Initialize the Whisper model with optimal settings for digit recognition."""
        try:
            logger.info("Initializing Whisper model for digit recognition...")

            # Use Whisper tiny model for fast inference
            self.model = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-tiny",
                device=self.device,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                return_timestamps=False  # We don't need timestamps for single digits
            )

            logger.info(f"Whisper model initialized successfully on device: {self.device}")

            # Test model with dummy input
            test_audio = np.random.randn(16000).astype(np.float32)  # 1 second of noise
            try:
                test_result = self.model(test_audio)
                logger.info("Model test successful")
            except Exception as e:
                logger.warning(f"Model test failed but model loaded: {e}")

            return True

        except Exception as e:
            logger.error(f"Failed to initialize Whisper model: {e}")
            return False

    def is_configured(self) -> bool:
        """Check if the processor is properly configured."""
        return self.model is not None

    def process_audio(self, audio_data: bytes) -> str:
        """
        Predict digit from audio data.

        Args:
            audio_data: Raw audio bytes (WAV format preferred)

        Returns:
            str: Predicted digit (0-9) or error message
        """
        if not self.is_configured():
            return "error: Model not configured"

        try:
            # Convert audio bytes to numpy array
            audio_array = self._convert_audio_to_array(audio_data)

            if audio_array is None:
                return "error: Invalid audio format"

            # Ensure proper sample rate and format
            audio_array = self._preprocess_audio(audio_array)

            # Run Whisper inference
            result = self.model(audio_array)
            text = result["text"].strip().lower()

            # Convert text to digit
            digit = self._text_to_digit(text)

            # Enhanced logging to debug transcription issues
            logger.info(f"🎤 Whisper transcription: '{text}' -> digit: '{digit}'")
            logger.info(f"📊 Audio stats: duration={len(audio_array)/16000:.2f}s, samples={len(audio_array)}, max_val={np.max(np.abs(audio_array)):.3f}")

            if digit in "0123456789":
                self.successful_predictions += 1
                return digit
            else:
                self.failed_predictions += 1
                return f"unclear: {text}"

        except Exception as e:
            logger.error(f"Whisper prediction failed: {e}")
            self.failed_predictions += 1
            return f"error: {str(e)}"
        finally:
            self.total_predictions += 1

    def _convert_audio_to_array(self, audio_data: bytes) -> Optional[np.ndarray]:
        """
        Convert audio bytes to numpy array.

        Args:
            audio_data: Raw audio bytes (could be WAV file or raw PCM from VAD)

        Returns:
            np.ndarray: Audio samples or None if conversion failed
        """
        # First check if this looks like raw PCM data from VAD (no file headers)
        if len(audio_data) < 100 or not audio_data.startswith(b'RIFF'):
            # This is likely raw PCM data from WebRTC VAD
            try:
                logger.debug("Processing raw PCM data from VAD segment")
                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
                audio_array = audio_array / 32768.0  # Normalize to [-1, 1]
                self._original_sample_rate = 16000  # WebRTC VAD uses 16kHz
                return audio_array
            except Exception as e:
                logger.error(f"Failed to process raw PCM data: {e}")
                return None

        # This looks like a complete audio file (WAV, etc.)
        try:
            # Try to read as audio file using soundfile
            audio_buffer = io.BytesIO(audio_data)
            audio_array, sample_rate = sf.read(audio_buffer, dtype='float32')

            # Handle stereo to mono conversion
            if len(audio_array.shape) > 1:
                audio_array = np.mean(audio_array, axis=1)

            # Store original sample rate for resampling
            self._original_sample_rate = sample_rate

            logger.debug(f"Successfully loaded audio file: {len(audio_array)} samples at {sample_rate}Hz")
            return audio_array

        except Exception as e:
            logger.warning(f"Audio file conversion failed with soundfile: {e}")

            # Final fallback: treat as raw PCM
            try:
                logger.debug("Fallback: treating as raw PCM data")
                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
                audio_array = audio_array / 32768.0  # Normalize to [-1, 1]
                self._original_sample_rate = 16000  # Assume 16kHz
                return audio_array
            except Exception as e2:
                logger.error(f"All audio conversion methods failed: {e2}")
                return None

    def _preprocess_audio(self, audio_array: np.ndarray) -> np.ndarray:
        """
        Preprocess audio for optimal Whisper performance.

        Args:
            audio_array: Raw audio samples

        Returns:
            np.ndarray: Preprocessed audio
        """
        # Resample to 16kHz if needed (Whisper's expected input)
        target_sample_rate = 16000

        if hasattr(self, '_original_sample_rate') and self._original_sample_rate != target_sample_rate:
            try:
                import librosa
                audio_array = librosa.resample(
                    audio_array,
                    orig_sr=self._original_sample_rate,
                    target_sr=target_sample_rate
                )
                logger.debug(f"Resampled audio from {self._original_sample_rate}Hz to {target_sample_rate}Hz")
            except ImportError:
                logger.warning("librosa not available for resampling, using original audio")
            except Exception as e:
                logger.warning(f"Resampling failed: {e}, using original audio")

        # Trim silence from edges
        audio_array = self._trim_silence(audio_array)

        # Ensure minimum length (Whisper works better with at least 0.1s)
        min_samples = int(0.1 * target_sample_rate)
        if len(audio_array) < min_samples:
            # Pad with silence
            padding = min_samples - len(audio_array)
            audio_array = np.pad(audio_array, (0, padding), mode='constant', constant_values=0)

        # Normalize audio
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:
            audio_array = audio_array / max_val * 0.9  # Prevent clipping

        return audio_array

    def _trim_silence(self, audio_array: np.ndarray, silence_threshold: float = 0.01) -> np.ndarray:
        """
        Trim silence from beginning and end of audio.

        Args:
            audio_array: Audio samples
            silence_threshold: Threshold for silence detection

        Returns:
            np.ndarray: Trimmed audio
        """
        if len(audio_array) == 0:
            return audio_array

        # Find non-silent regions
        energy = audio_array ** 2
        non_silent = energy > silence_threshold

        if not np.any(non_silent):
            return audio_array  # All silence, return as is

        # Find first and last non-silent samples
        first_sound = np.argmax(non_silent)
        last_sound = len(non_silent) - np.argmax(non_silent[::-1]) - 1

        # Add small padding
        padding_samples = int(0.05 * 16000)  # 50ms padding
        first_sound = max(0, first_sound - padding_samples)
        last_sound = min(len(audio_array) - 1, last_sound + padding_samples)

        return audio_array[first_sound:last_sound + 1]

    def _text_to_digit(self, text: str) -> str:
        """
        Convert transcribed text to digit.

        Args:
            text: Transcribed text from Whisper

        Returns:
            str: Digit (0-9) or original text if no match
        """
        # Clean the text
        text = text.strip().lower()

        # Remove common punctuation and extra words
        text = text.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
        text = text.replace("the", "").replace("number", "").replace("digit", "")
        text = text.strip()

        # Try direct mapping
        if text in self.digit_map:
            return self.digit_map[text]

        # Try word-by-word mapping for multi-word responses
        words = text.split()
        for word in words:
            if word in self.digit_map:
                return self.digit_map[word]

        # Check if it's already a digit
        if len(text) == 1 and text.isdigit():
            return text

        # Look for digits in the text
        digits_found = [char for char in text if char.isdigit()]
        if digits_found:
            return digits_found[0]  # Return first digit found

        # No clear digit found
        return text

    def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]:
        """
        Predict digit with detailed timing and confidence metrics.

        Args:
            audio_data: Raw audio bytes

        Returns:
            dict: Prediction results with timing and metadata
        """
        start_time = time.time()

        predicted_digit = self.process_audio(audio_data)

        inference_time = time.time() - start_time

        # Update average inference time
        if self.total_predictions > 0:
            self.average_inference_time = (
                (self.average_inference_time * (self.total_predictions - 1) + inference_time)
                / self.total_predictions
            )

        # Determine success status
        is_successful = predicted_digit in "0123456789"
        confidence_score = 1.0 if is_successful else 0.0

        # Extract any error information
        error_info = None
        if predicted_digit.startswith("error:"):
            error_info = predicted_digit[6:].strip()
            predicted_digit = "unknown"
        elif predicted_digit.startswith("unclear:"):
            error_info = f"Transcription unclear: {predicted_digit[8:].strip()}"
            predicted_digit = "unknown"

        result = {
            'predicted_digit': predicted_digit,
            'confidence_score': confidence_score,
            'inference_time': round(inference_time, 4),
            'success': is_successful,
            'timestamp': time.time(),
            'model': 'openai/whisper-tiny',
            'method': 'whisper_digit'
        }

        if error_info:
            result['error'] = error_info

        return result

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get information about the loaded model.

        Returns:
            dict: Model information
        """
        return {
            'model_name': 'openai/whisper-tiny',
            'model_type': 'Speech-to-Text (ASR)',
            'specialized_for': 'Digit Recognition (0-9)',
            'device': 'GPU' if self.device >= 0 else 'CPU',
            'torch_device': self.device,
            'supports_streaming': False,
            'supported_languages': ['en'],
            'digit_mappings': len(self.digit_map)
        }

    def get_stats(self) -> Dict[str, Any]:
        """
        Get processor statistics.

        Returns:
            dict: Performance statistics
        """
        success_rate = (
            self.successful_predictions / max(1, self.total_predictions)
        )

        return {
            'total_predictions': self.total_predictions,
            'successful_predictions': self.successful_predictions,
            'failed_predictions': self.failed_predictions,
            'success_rate': round(success_rate, 3),
            'average_inference_time': round(self.average_inference_time, 4),
            'model_loaded': self.is_configured()
        }

    def test_with_sample_audio(self) -> Dict[str, Any]:
        """
        Test the processor with generated sample audio.

        Returns:
            dict: Test results
        """
        if not self.is_configured():
            return {'error': 'Model not configured'}

        try:
            # Generate simple test audio (1 second of tone)
            sample_rate = 16000
            duration = 1.0
            frequency = 440  # A note

            t = np.linspace(0, duration, int(sample_rate * duration))
            test_audio = 0.3 * np.sin(2 * np.pi * frequency * t).astype(np.float32)

            # Run prediction
            start_time = time.time()
            result = self.model(test_audio)
            test_time = time.time() - start_time

            return {
                'test_successful': True,
                'test_time': round(test_time, 4),
                'transcription': result.get('text', 'No text'),
                'model_responsive': True
            }

        except Exception as e:
            return {
                'test_successful': False,
                'error': str(e),
                'model_responsive': False
            }
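For a local smoke test, the processor can be driven end to end from a script. A minimal sketch, assuming the package root is on PYTHONPATH and that sample_seven.wav is a hypothetical 16kHz mono recording of a spoken digit:

from audio_processors.whisper_digit_processor import WhisperDigitProcessor

processor = WhisperDigitProcessor()  # loads openai/whisper-tiny on construction

with open("sample_seven.wav", "rb") as f:  # hypothetical test recording
    audio_bytes = f.read()

result = processor.predict_with_timing(audio_bytes)
print(result["predicted_digit"], result["inference_time"], result["success"])
print(processor.get_stats())  # running success/failure counters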
models/mel_cnn_classifier/best_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:784cd9615368040ec7f4fa393f4bbfa8effa8b66b5a526cb2d82f3c526537ae7
size 7876706
models/mfcc_classifier/best_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:35346777b57dd72acf2599359e153336859ae5af05e991e0419a3c0f8fff0248
size 1019362
models/mfcc_classifier/scaler.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c5af2561a5be3934fb43d590605bbb9a6293e93935a975d904c7eac5bfe876c1
size 4202
models/raw_cnn_classifier/best_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fdcae9f8fed4d05a27149a6258ba44e9350924fc571f6fe87deaf9cd4f4a3a0e
size 7728930
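The .pt and .pkl entries above are Git LFS pointer files: the repository tracks only a version line, the SHA-256 object id, and the byte size, while the actual weights live in LFS storage and are fetched at checkout (or via git lfs pull). Once present on disk, a checkpoint loads as usual; a minimal inspection sketch, noting that the saved layout (whole module vs. state_dict) is an assumption here since the classifier definitions live elsewhere in the repo:

import torch

# map_location="cpu" matches the CPU-only torch build pinned in requirements_hf.txt
checkpoint = torch.load("models/mfcc_classifier/best_model.pt", map_location="cpu")

if isinstance(checkpoint, dict):
    # Likely a state_dict (or a dict wrapping one) - list a few parameter names
    print(sorted(checkpoint.keys())[:5])
else:
    # A pickled nn.Module saved whole; switch to inference mode
    checkpoint.eval()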
requirements_hf.txt
ADDED
@@ -0,0 +1,26 @@
# HF Spaces Requirements - Essential packages only
# Core Flask API
Flask==2.3.3
Flask-CORS==4.0.0
requests==2.31.0
python-dotenv==1.0.0

# Audio Processing Core
numpy==1.24.3
librosa==0.10.1
scipy==1.11.4
soundfile==0.12.1

# ML Models - PyTorch (CPU optimized for HF Spaces)
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.0.1+cpu
torchaudio==2.0.2+cpu

# Essential ML utilities
scikit-learn==1.3.2
transformers==4.35.2

# Audio format handling
webrtcvad==2.0.10

# Logging and utilities
tqdm==4.66.1
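pip accepts --extra-index-url only as a standalone line in a requirements file, so the PyTorch CPU index is declared separately from the pinned +cpu wheels; pip install -r requirements_hf.txt then resolves torch and torchaudio from that index. A quick post-install sanity check (illustrative):

import torch

print(torch.__version__)          # expected: "2.0.1+cpu"
print(torch.cuda.is_available())  # False on the CPU-only build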
utils/__init__.py
ADDED
File without changes
utils/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (142 Bytes)
utils/__pycache__/audio_utils.cpython-312.pyc
ADDED
Binary file (20.5 kB)
utils/__pycache__/enhanced_vad.cpython-312.pyc
ADDED
Binary file (25.7 kB)
utils/__pycache__/logging_utils.cpython-312.pyc
ADDED
Binary file (9.6 kB)
utils/__pycache__/noise_utils.cpython-312.pyc
ADDED
Binary file (12.9 kB)
utils/__pycache__/session_manager.cpython-312.pyc
ADDED
Binary file (16.8 kB)
utils/__pycache__/vad_feature_integration.cpython-312.pyc
ADDED
Binary file (21.2 kB)
utils/__pycache__/webm_converter.cpython-312.pyc
ADDED
Binary file (5.77 kB)
utils/__pycache__/webrtc_vad.cpython-312.pyc
ADDED
Binary file (20.6 kB)
utils/audio_utils.py
ADDED
@@ -0,0 +1,427 @@
import numpy as np
import wave
import io
import logging
import subprocess
import tempfile
import os
from pathlib import Path
from typing import Tuple, Optional

logger = logging.getLogger(__name__)

def check_ffmpeg_available() -> bool:
    """Check if ffmpeg is available on the system."""
    try:
        result = subprocess.run(['ffmpeg', '-version'],
                                capture_output=True,
                                text=True,
                                timeout=5)
        return result.returncode == 0
    except (subprocess.SubprocessError, FileNotFoundError, subprocess.TimeoutExpired):
        return False

def convert_with_ffmpeg(audio_data: bytes, target_sr: int = 8000, target_format: str = 'wav') -> Optional[bytes]:
    """
    Convert audio using ffmpeg for high-quality format conversion.

    Args:
        audio_data: Input audio bytes in any format
        target_sr: Target sampling rate (default: 8000 Hz for ML models)
        target_format: Target audio format (default: wav)

    Returns:
        Converted audio bytes or None if conversion fails
    """
    if not check_ffmpeg_available():
        logger.warning("ffmpeg not available for audio conversion")
        return None

    temp_input = None
    temp_output = None

    try:
        # Create temporary files
        with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as temp_input:
            temp_input.write(audio_data)
            temp_input.flush()

        with tempfile.NamedTemporaryFile(suffix=f'.{target_format}', delete=False) as temp_output:
            pass  # Just need the filename

        # Build ffmpeg command for high-quality conversion
        ffmpeg_cmd = [
            'ffmpeg',
            '-i', temp_input.name,
            '-ar', str(target_sr),    # Resample to target sample rate
            '-ac', '1',               # Convert to mono
            '-acodec', 'pcm_s16le',   # 16-bit PCM (standard for ML)
            '-f', target_format,      # Output format
            '-loglevel', 'error',     # Reduce ffmpeg output
            '-y',                     # Overwrite output
            temp_output.name
        ]

        logger.debug(f"Running ffmpeg conversion: {' '.join(ffmpeg_cmd)}")

        # Run ffmpeg conversion
        result = subprocess.run(ffmpeg_cmd,
                                capture_output=True,
                                text=True,
                                timeout=30)

        if result.returncode == 0:
            # Read converted audio
            with open(temp_output.name, 'rb') as f:
                converted_audio = f.read()

            logger.debug(f"ffmpeg conversion successful: "
                         f"{len(audio_data)} -> {len(converted_audio)} bytes "
                         f"({target_sr}Hz, mono, {target_format})")

            return converted_audio
        else:
            logger.error(f"ffmpeg conversion failed: {result.stderr}")
            return None

    except Exception as e:
        logger.error(f"ffmpeg conversion error: {str(e)}")
        return None

    finally:
        # Clean up temporary files
        try:
            if temp_input and os.path.exists(temp_input.name):
                os.unlink(temp_input.name)
            if temp_output and os.path.exists(temp_output.name):
                os.unlink(temp_output.name)
        except Exception as cleanup_error:
            logger.warning(f"Failed to cleanup temp files: {cleanup_error}")

def convert_for_ml_models(audio_data: bytes, pipeline_type: str = 'mfcc') -> bytes:
    """
    Convert audio specifically for ML model requirements.

    Args:
        audio_data: Input audio bytes
        pipeline_type: ML pipeline type ('mfcc', 'mel_cnn', 'raw_cnn')

    Returns:
        Audio bytes optimized for the specific ML model
    """
    # All our ML models expect 8kHz, mono, 16-bit PCM
    target_sr = 8000

    # Try ffmpeg first for best quality
    converted = convert_with_ffmpeg(audio_data, target_sr=target_sr)
    if converted:
        logger.debug(f"Used ffmpeg for {pipeline_type} model audio conversion")
        return converted

    # Fallback to existing conversion methods
    logger.debug(f"Using fallback audio conversion for {pipeline_type} model")
    return convert_audio_format(audio_data)

def validate_audio_format(audio_data: bytes) -> bool:
    """
    Validate that audio data is in a supported format.

    Args:
        audio_data: Raw audio bytes

    Returns:
        True if format is supported, False otherwise
    """
    # Check minimum size
    if len(audio_data) < 44:  # WAV header is 44 bytes
        logger.debug(f"Audio data too small: {len(audio_data)} bytes (minimum 44 for WAV header)")
        return False

    # Check for null/empty data
    if audio_data[:20] == b'\x00' * 20:
        logger.error("Audio data appears to be empty/null bytes")
        return False

    # Check if it starts with RIFF header
    if not audio_data.startswith(b'RIFF'):
        logger.error(f"Audio data does not start with RIFF header. First 8 bytes: {audio_data[:8]}")
        # Try to provide more diagnostic info
        if len(audio_data) > 20:
            logger.error(f"First 20 bytes as hex: {audio_data[:20].hex()}")
        return False

    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_file:
            # Check basic WAV properties
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            frame_rate = wav_file.getframerate()
            frames = wav_file.getnframes()

            logger.debug(f"Audio format: {channels} channels, {sample_width} bytes/sample, {frame_rate} Hz, {frames} frames")

            # Be more lenient with streaming chunks
            if channels not in [1, 2]:
                logger.warning(f"Unusual channel count: {channels}")
                return False
            if sample_width not in [1, 2, 4]:  # 8-bit, 16-bit, 32-bit
                logger.warning(f"Unusual sample width: {sample_width}")
                return False
            if frame_rate < 8000 or frame_rate > 48000:  # Wider range
                logger.warning(f"Unusual frame rate: {frame_rate}")
                return False
            if frames == 0:
                logger.warning("No audio frames found")
                return False

            return True
    except wave.Error as e:
        logger.error(f"WAV format error: {str(e)}")
        logger.error(f"Audio data size: {len(audio_data)} bytes")
        if len(audio_data) > 44:
            logger.error(f"WAV header bytes: {audio_data[:44].hex()}")
        return False
    except Exception as e:
        logger.error(f"Audio validation failed: {str(e)}")
        logger.error(f"Audio data size: {len(audio_data)} bytes")
        return False

def convert_audio_format(audio_data: bytes) -> bytes:
    """
    Convert various audio formats (WebM, OGG, MP3, etc.) to WAV format.

    Args:
        audio_data: Input audio bytes in any supported format

    Returns:
        Converted audio bytes in WAV format

    Raises:
        Exception: If conversion fails
    """
    try:
        # First detect the audio format
        from .webm_converter import detect_audio_format, convert_webm_to_wav

        audio_format = detect_audio_format(audio_data)
        logger.debug(f"Detected audio format: {audio_format}")

        # Handle WebM specifically (common from MediaRecorder)
        if audio_format == 'webm':
            logger.info("Converting WebM audio to WAV (fallback method)")
            converted = convert_webm_to_wav(audio_data)
            if converted:
                return converted
            else:
                raise Exception("WebM conversion failed")

        # Try using pydub for format conversion (handles WebM, OGG, MP3, etc.)
        try:
            from pydub import AudioSegment
            import io

            # Load audio from bytes
            audio = AudioSegment.from_file(io.BytesIO(audio_data))

            # Convert to mono and 16kHz
            audio = audio.set_channels(1)        # Mono
            audio = audio.set_frame_rate(16000)  # 16kHz
            audio = audio.set_sample_width(2)    # 16-bit

            # Export as WAV
            output_buffer = io.BytesIO()
            audio.export(output_buffer, format="wav")
            return output_buffer.getvalue()

        except ImportError:
            logger.warning("pydub not installed, falling back to basic WAV conversion")
            # Fall back to basic WAV processing
            return convert_to_mono_16khz(audio_data)
        except Exception as e:
            logger.warning(f"pydub conversion failed: {str(e)}, trying fallback methods")

            # Try WebM converter as fallback
            if audio_format in ['webm', 'unknown']:
                logger.info("Trying WebM fallback converter")
                converted = convert_webm_to_wav(audio_data)
                if converted:
                    return converted

            # Last resort: basic WAV processing
            return convert_to_mono_16khz(audio_data)

    except Exception as e:
        logger.error(f"All audio conversion methods failed: {str(e)}")
        raise Exception(f"Failed to convert audio format: {str(e)}")

def convert_to_mono_16khz(audio_data: bytes) -> bytes:
    """
    Convert audio to mono, 16kHz format suitable for speech recognition.

    Args:
        audio_data: Input audio bytes (WAV format)

    Returns:
        Converted audio bytes in mono 16kHz WAV format

    Raises:
        Exception: If conversion fails
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as input_wav:
            frames = input_wav.readframes(input_wav.getnframes())
            channels = input_wav.getnchannels()
            sample_width = input_wav.getsampwidth()
            frame_rate = input_wav.getframerate()

            # Convert to numpy array
            if sample_width == 2:
                audio_array = np.frombuffer(frames, dtype=np.int16)
            else:
                raise Exception(f"Unsupported sample width: {sample_width}")

            # Convert stereo to mono if needed
            if channels == 2:
                audio_array = audio_array.reshape(-1, 2)
                audio_array = np.mean(audio_array, axis=1).astype(np.int16)

            # Resample to 16kHz if needed
            if frame_rate != 16000:
                # Simple downsampling (for production, use proper resampling)
                ratio = frame_rate / 16000
                if ratio > 1:
                    # Downsample by taking every nth sample
                    indices = np.arange(0, len(audio_array), ratio).astype(int)
                    audio_array = audio_array[indices]
                else:
                    # Upsample by repeating samples (basic interpolation)
                    audio_array = np.repeat(audio_array, int(1 / ratio))

            # Create output WAV
            output = io.BytesIO()
            with wave.open(output, 'wb') as output_wav:
                output_wav.setnchannels(1)      # Mono
                output_wav.setsampwidth(2)      # 16-bit
                output_wav.setframerate(16000)  # 16kHz
                output_wav.writeframes(audio_array.tobytes())

            return output.getvalue()

    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise Exception(f"Failed to convert audio: {str(e)}")

def get_audio_duration(audio_data: bytes) -> float:
    """
    Get duration of audio in seconds.

    Args:
        audio_data: WAV audio bytes

    Returns:
        Duration in seconds
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_file:
            frames = wav_file.getnframes()
            frame_rate = wav_file.getframerate()
            duration = frames / frame_rate
            return duration
    except Exception as e:
        logger.error(f"Failed to get audio duration: {str(e)}")
        return 0.0

def audio_to_numpy(audio_data: bytes) -> Tuple[np.ndarray, int]:
    """
    Convert WAV audio bytes to numpy array.

    Args:
        audio_data: WAV audio bytes

    Returns:
        Tuple of (audio_array, sample_rate)

    Raises:
        Exception: If conversion fails
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_file:
            frames = wav_file.readframes(wav_file.getnframes())
            sample_rate = wav_file.getframerate()
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()

            if sample_width == 2:
                audio_array = np.frombuffer(frames, dtype=np.int16)
            else:
                raise Exception(f"Unsupported sample width: {sample_width}")

            # Convert to float32 and normalize
            audio_array = audio_array.astype(np.float32) / 32767.0

            # Handle stereo
            if channels == 2:
                audio_array = audio_array.reshape(-1, 2)
                audio_array = np.mean(audio_array, axis=1)

            return audio_array, sample_rate

    except Exception as e:
        logger.error(f"Failed to convert audio to numpy: {str(e)}")
        raise Exception(f"Audio conversion failed: {str(e)}")

def create_test_audio(digit: str, duration: float = 1.0, sample_rate: int = 16000) -> bytes:
    """
    Create test audio data for development purposes.

    Args:
        digit: Digit to simulate ('0'-'9')
        duration: Audio duration in seconds
        sample_rate: Sample rate in Hz

    Returns:
        WAV audio bytes
    """
    try:
        # Create simple tone pattern based on digit
        t = np.linspace(0, duration, int(sample_rate * duration), False)

        # Different frequency patterns for each digit
        freq_map = {
            '0': [400, 600],   # Low frequencies
            '1': [800, 1000],  # Higher frequencies
            '2': [600, 800],
            '3': [700, 900],
            '4': [500, 700],
            '5': [900, 1100],
            '6': [450, 650],
            '7': [750, 950],
            '8': [550, 750],
            '9': [850, 1050]
        }

        freqs = freq_map.get(digit, [440, 880])

        # Generate tone
        signal = np.sin(freqs[0] * 2.0 * np.pi * t) * 0.3 + np.sin(freqs[1] * 2.0 * np.pi * t) * 0.3

        # Add some envelope
        envelope = np.exp(-3 * t)
        signal = signal * envelope

        # Convert to int16
        signal = (signal * 32767).astype(np.int16)

        # Create WAV
        output = io.BytesIO()
        with wave.open(output, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(signal.tobytes())

        return output.getvalue()

    except Exception as e:
        logger.error(f"Failed to create test audio: {str(e)}")
        raise Exception(f"Test audio creation failed: {str(e)}")
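These helpers compose into a quick round trip for local testing: synthesize a tone, validate it, then convert it for the 8kHz ML pipelines. A minimal sketch, assuming the module is importable as utils.audio_utils:

from utils.audio_utils import (create_test_audio, validate_audio_format,
                               convert_for_ml_models, get_audio_duration)

wav_bytes = create_test_audio('7', duration=1.0)   # synthetic 16kHz test tone
assert validate_audio_format(wav_bytes)            # passes the RIFF/WAV checks

ml_bytes = convert_for_ml_models(wav_bytes, pipeline_type='mfcc')  # 8kHz mono PCM via ffmpeg when available
print(get_audio_duration(ml_bytes))                # ~1.0 seconds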
utils/enhanced_vad.py
ADDED
@@ -0,0 +1,571 @@
"""
Enhanced VAD Implementation with ffmpeg support and comprehensive debugging
"""

import numpy as np
import logging
import subprocess
import tempfile
import os
import time
import wave
import io
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
from threading import Thread, Lock
import asyncio
import concurrent.futures

# Try to import WebRTC VAD
try:
    import webrtcvad
    WEBRTC_AVAILABLE = True
except ImportError:
    WEBRTC_AVAILABLE = False
    logging.warning("webrtcvad not available - using fallback VAD implementation")

logger = logging.getLogger(__name__)

class EnhancedVAD:
    """
    Enhanced Voice Activity Detection with ffmpeg integration and comprehensive debugging.

    Features:
    - ffmpeg-based audio preprocessing
    - Multiple VAD implementations (WebRTC, simple energy-based)
    - Comprehensive audio validation and debugging
    - Async audio chunk saving
    - Real-time performance monitoring
    """

    def __init__(self,
                 sample_rate: int = 16000,
                 frame_duration_ms: int = 30,
                 aggressiveness: int = 1,
                 min_speech_duration: float = 0.4,
                 max_speech_duration: float = 3.0,
                 silence_threshold: float = 0.01):
        """
        Initialize Enhanced VAD.

        Args:
            sample_rate: Target sample rate (Hz)
            frame_duration_ms: Frame duration in milliseconds
            aggressiveness: VAD aggressiveness (0-3)
            min_speech_duration: Minimum speech segment duration (seconds)
            max_speech_duration: Maximum speech segment duration (seconds)
            silence_threshold: Energy threshold for silence detection
        """
        self.sample_rate = sample_rate
        self.frame_duration_ms = frame_duration_ms
        self.frame_size = int(sample_rate * frame_duration_ms / 1000)
        self.aggressiveness = aggressiveness
        self.min_speech_duration = min_speech_duration
        self.max_speech_duration = max_speech_duration
        self.silence_threshold = silence_threshold

        # Initialize WebRTC VAD if available
        self.webrtc_vad = None
        if WEBRTC_AVAILABLE:
            try:
                self.webrtc_vad = webrtcvad.Vad(aggressiveness)
                logger.info(f"WebRTC VAD initialized (aggressiveness: {aggressiveness})")
            except Exception as e:
                logger.error(f"Failed to initialize WebRTC VAD: {e}")
                self.webrtc_vad = None

        # Check ffmpeg availability
        self.ffmpeg_available = self._check_ffmpeg_available()

        # Performance tracking
        self.stats = {
            'total_chunks_processed': 0,
            'speech_segments_detected': 0,
            'processing_time_total': 0.0,
            'last_processing_time': 0.0,
            'ffmpeg_conversions': 0,
            'audio_validation_failures': 0,
            'webrtc_available': WEBRTC_AVAILABLE and self.webrtc_vad is not None,
            'ffmpeg_available': self.ffmpeg_available
        }

        # Async processing
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
        self.save_lock = Lock()

        logger.info(f"Enhanced VAD initialized:")
        logger.info(f"  Sample rate: {sample_rate} Hz")
        logger.info(f"  Frame duration: {frame_duration_ms} ms")
        logger.info(f"  WebRTC VAD: {'Available' if self.webrtc_vad else 'Not available'}")
        logger.info(f"  ffmpeg: {'Available' if self.ffmpeg_available else 'Not available'}")

    def _check_ffmpeg_available(self) -> bool:
        """Check if ffmpeg is available."""
        try:
            result = subprocess.run(['ffmpeg', '-version'],
                                    capture_output=True, text=True, timeout=5)
            return result.returncode == 0
        except Exception:
            return False

    def preprocess_audio_with_ffmpeg(self, audio_data: bytes) -> Optional[bytes]:
        """
        Preprocess audio using ffmpeg for optimal VAD performance.

        Args:
            audio_data: Raw audio bytes

        Returns:
            Preprocessed audio bytes or None if processing fails
        """
        if not self.ffmpeg_available:
            logger.debug("ffmpeg not available for audio preprocessing")
            return None

        temp_input = None
        temp_output = None

        try:
            # Create temporary files
            with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as temp_input:
                temp_input.write(audio_data)
                temp_input.flush()

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_output:
                pass

            # ffmpeg command for VAD-optimized preprocessing
            ffmpeg_cmd = [
                'ffmpeg',
                '-i', temp_input.name,
                '-ar', str(self.sample_rate),  # Resample to target rate
                '-ac', '1',                    # Convert to mono
                '-acodec', 'pcm_s16le',        # 16-bit PCM
                '-af', 'highpass=f=80,lowpass=f=8000,dynaudnorm=f=10:g=3',  # Audio filters for speech
                '-f', 'wav',
                '-loglevel', 'error',
                '-y',
                temp_output.name
            ]

            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, timeout=10)

            if result.returncode == 0:
                with open(temp_output.name, 'rb') as f:
                    preprocessed_audio = f.read()

                self.stats['ffmpeg_conversions'] += 1
                logger.debug(f"ffmpeg preprocessing: {len(audio_data)} -> {len(preprocessed_audio)} bytes")
                return preprocessed_audio
            else:
                logger.error(f"ffmpeg preprocessing failed: {result.stderr}")
                return None

        except Exception as e:
            logger.error(f"ffmpeg preprocessing error: {e}")
            return None

        finally:
            # Cleanup
            try:
                if temp_input and os.path.exists(temp_input.name):
                    os.unlink(temp_input.name)
                if temp_output and os.path.exists(temp_output.name):
                    os.unlink(temp_output.name)
            except Exception:
                pass

    def validate_and_debug_audio(self, audio_data: bytes) -> Dict[str, Any]:
        """
        Comprehensive audio validation and debugging.

        Args:
            audio_data: Audio data to validate

        Returns:
            Validation results and debugging information
        """
        debug_info = {
            'size_bytes': len(audio_data),
            'valid_wav': False,
            'sample_rate': None,
            'channels': None,
            'duration': 0.0,
            'energy_level': 0.0,
            'is_silent': True,
            'format_detected': 'unknown',
            'issues': []
        }

        try:
            # Check minimum size
            if len(audio_data) < 44:
                debug_info['issues'].append(f"Too small: {len(audio_data)} bytes (need ≥44 for WAV)")
                return debug_info

            # Detect format by header
            if audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:20]:
                debug_info['format_detected'] = 'wav'
            elif audio_data.startswith(b'OggS'):
                debug_info['format_detected'] = 'ogg'
            elif audio_data.startswith(b'\x1a\x45\xdf\xa3'):
                debug_info['format_detected'] = 'webm'

            # Try to parse as WAV
            try:
                with wave.open(io.BytesIO(audio_data), 'rb') as wav:
                    debug_info['valid_wav'] = True
                    debug_info['sample_rate'] = wav.getframerate()
                    debug_info['channels'] = wav.getnchannels()
                    debug_info['duration'] = wav.getnframes() / wav.getframerate()

                    # Read audio samples for analysis
                    wav.rewind()
                    frames = wav.readframes(wav.getnframes())

                    if len(frames) > 0:
                        # Convert to numpy for analysis
                        audio_array = np.frombuffer(frames, dtype=np.int16)

                        # Calculate energy level
                        energy = np.sqrt(np.mean(audio_array.astype(np.float32) ** 2))
                        debug_info['energy_level'] = float(energy)
                        debug_info['is_silent'] = energy < (self.silence_threshold * 32768)

                        # Check for constant beep (common issue)
                        if len(audio_array) > 100:
                            # Check if audio is a constant tone (beep)
                            diff = np.diff(audio_array)
                            if np.std(diff) < 100:  # Very low variation
                                debug_info['issues'].append("Constant tone/beep detected")

                            # Check dynamic range
                            if np.max(audio_array) - np.min(audio_array) < 1000:
                                debug_info['issues'].append("Very low dynamic range")

            except Exception as wav_error:
                debug_info['issues'].append(f"WAV parsing failed: {wav_error}")

            # Additional format-specific checks
            if debug_info['format_detected'] in ['ogg', 'webm'] and not debug_info['valid_wav']:
                debug_info['issues'].append("Non-WAV format detected - requires conversion")

            logger.debug(f"Audio validation: {debug_info}")

            if debug_info['issues']:
                self.stats['audio_validation_failures'] += 1
                logger.warning(f"Audio validation issues: {debug_info['issues']}")

            return debug_info

        except Exception as e:
            debug_info['issues'].append(f"Validation error: {str(e)}")
            logger.error(f"Audio validation failed: {e}")
            return debug_info

    def detect_speech_segments(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
        """
        Detect speech segments using multiple methods.

        Args:
            audio_data: Input audio data

        Returns:
            List of (segment_audio, segment_info) tuples
        """
        start_time = time.time()

        # Validate and debug audio
        debug_info = self.validate_and_debug_audio(audio_data)

        segments = []

        try:
            # Preprocess with ffmpeg if available
            processed_audio = self.preprocess_audio_with_ffmpeg(audio_data)
            if processed_audio:
                working_audio = processed_audio
                logger.debug("Using ffmpeg-preprocessed audio for VAD")
            else:
                working_audio = audio_data
                logger.debug("Using original audio for VAD")

            # Re-validate processed audio
            if processed_audio:
                processed_debug = self.validate_and_debug_audio(processed_audio)
                logger.debug(f"Processed audio validation: {processed_debug}")

            # Method 1: WebRTC VAD (if available)
            if self.webrtc_vad and debug_info['valid_wav']:
                webrtc_segments = self._webrtc_vad_detection(working_audio)
                segments.extend(webrtc_segments)
                logger.debug(f"WebRTC VAD found {len(webrtc_segments)} segments")

            # Method 2: Energy-based VAD (fallback)
            if not segments or debug_info['issues']:
                energy_segments = self._energy_based_vad(working_audio)
                segments.extend(energy_segments)
                logger.debug(f"Energy VAD found {len(energy_segments)} segments")

            # Method 3: Simple duration-based segmentation (last resort)
            if not segments and len(audio_data) > 8000:  # > 8KB
                fallback_segment = self._create_fallback_segment(working_audio)
                if fallback_segment:
                    segments.append(fallback_segment)
                    logger.debug("Used fallback segmentation")

            processing_time = time.time() - start_time
            self.stats['total_chunks_processed'] += 1
            self.stats['speech_segments_detected'] += len(segments)
            self.stats['processing_time_total'] += processing_time
            self.stats['last_processing_time'] = processing_time

            logger.debug(f"VAD processing complete: {len(segments)} segments in {processing_time:.3f}s")

            return segments

        except Exception as e:
            logger.error(f"Speech segment detection failed: {e}")
            return []

    def _webrtc_vad_detection(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
        """WebRTC-based speech detection."""
        segments = []

        try:
            frame_size_bytes = self.frame_size * 2  # 16-bit = 2 bytes per sample
            frames = []

            # Extract frames
            for i in range(0, len(audio_data) - frame_size_bytes + 1, frame_size_bytes):
                frame = audio_data[i:i + frame_size_bytes]
                if len(frame) == frame_size_bytes:
                    frames.append(frame)

            if len(frames) < 5:  # Need minimum frames
                return segments

            # VAD processing
            speech_frames = []
            for frame in frames:
                try:
                    is_speech = self.webrtc_vad.is_speech(frame, self.sample_rate)
                    speech_frames.append((frame, is_speech))
                except Exception as e:
                    logger.debug(f"WebRTC VAD frame processing failed: {e}")
                    speech_frames.append((frame, False))

            # Group consecutive speech frames
            current_segment = []
            for frame, is_speech in speech_frames:
                if is_speech:
                    current_segment.append(frame)
                else:
                    if len(current_segment) > 0:
                        # End of speech segment
                        segment_audio = b''.join(current_segment)
                        segment_duration = len(current_segment) * self.frame_duration_ms / 1000

                        if segment_duration >= self.min_speech_duration:
                            segments.append((segment_audio, {
                                'duration': segment_duration,
                                'method': 'webrtc_vad',
                                'frames': len(current_segment)
                            }))

                        current_segment = []

            # Handle final segment
            if current_segment:
                segment_audio = b''.join(current_segment)
                segment_duration = len(current_segment) * self.frame_duration_ms / 1000

                if segment_duration >= self.min_speech_duration:
                    segments.append((segment_audio, {
                        'duration': segment_duration,
                        'method': 'webrtc_vad',
                        'frames': len(current_segment)
                    }))

            return segments

        except Exception as e:
            logger.error(f"WebRTC VAD detection failed: {e}")
            return []

    def _energy_based_vad(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
        """Energy-based speech detection."""
        segments = []
+
|
| 400 |
+
try:
|
| 401 |
+
# Try to parse as WAV or raw PCM
|
| 402 |
+
try:
|
| 403 |
+
with wave.open(io.BytesIO(audio_data), 'rb') as wav:
|
| 404 |
+
frames = wav.readframes(wav.getnframes())
|
| 405 |
+
sample_rate = wav.getframerate()
|
| 406 |
+
except:
|
| 407 |
+
# Assume raw 16-bit PCM
|
| 408 |
+
frames = audio_data
|
| 409 |
+
sample_rate = self.sample_rate
|
| 410 |
+
|
| 411 |
+
if len(frames) < 1000: # Too short
|
| 412 |
+
return segments
|
| 413 |
+
|
| 414 |
+
# Convert to numpy array
|
| 415 |
+
audio_samples = np.frombuffer(frames, dtype=np.int16)
|
| 416 |
+
audio_float = audio_samples.astype(np.float32) / 32768.0
|
| 417 |
+
|
| 418 |
+
# Calculate energy in overlapping windows
|
| 419 |
+
window_size = int(sample_rate * 0.1) # 100ms windows
|
| 420 |
+
hop_size = window_size // 2
|
| 421 |
+
|
| 422 |
+
energies = []
|
| 423 |
+
for i in range(0, len(audio_float) - window_size, hop_size):
|
| 424 |
+
window = audio_float[i:i + window_size]
|
| 425 |
+
energy = np.sqrt(np.mean(window ** 2))
|
| 426 |
+
energies.append(energy)
|
| 427 |
+
|
| 428 |
+
if len(energies) < 3:
|
| 429 |
+
return segments
|
| 430 |
+
|
| 431 |
+
# Adaptive threshold
|
| 432 |
+
mean_energy = np.mean(energies)
|
| 433 |
+
threshold = max(self.silence_threshold, mean_energy * 0.3)
|
| 434 |
+
|
| 435 |
+
# Find speech segments
|
| 436 |
+
if isinstance(energies, (list, np.ndarray)):
|
| 437 |
+
energies = np.array(energies) # Ensure it's a numpy array
|
| 438 |
+
speech_windows = energies > threshold
|
| 439 |
+
|
| 440 |
+
# Group consecutive speech windows
|
| 441 |
+
speech_start = None
|
| 442 |
+
for i, is_speech in enumerate(speech_windows):
|
| 443 |
+
if is_speech and speech_start is None:
|
| 444 |
+
speech_start = i
|
| 445 |
+
elif not is_speech and speech_start is not None:
|
| 446 |
+
# End of speech
|
| 447 |
+
start_sample = speech_start * hop_size
|
| 448 |
+
end_sample = min(i * hop_size + window_size, len(audio_samples))
|
| 449 |
+
|
| 450 |
+
segment_samples = audio_samples[start_sample:end_sample]
|
| 451 |
+
segment_duration = len(segment_samples) / sample_rate
|
| 452 |
+
|
| 453 |
+
if segment_duration >= self.min_speech_duration:
|
| 454 |
+
# Convert back to bytes
|
| 455 |
+
segment_audio = segment_samples.tobytes()
|
| 456 |
+
|
| 457 |
+
segments.append((segment_audio, {
|
| 458 |
+
'duration': segment_duration,
|
| 459 |
+
'method': 'energy_based',
|
| 460 |
+
'start_time': start_sample / sample_rate,
|
| 461 |
+
'energy_threshold': threshold,
|
| 462 |
+
'mean_energy': mean_energy
|
| 463 |
+
}))
|
| 464 |
+
|
| 465 |
+
speech_start = None
|
| 466 |
+
|
| 467 |
+
return segments
|
| 468 |
+
|
| 469 |
+
except Exception as e:
|
| 470 |
+
logger.error(f"Energy-based VAD failed: {e}")
|
| 471 |
+
return []
|
| 472 |
+
|
| 473 |
+
def _create_fallback_segment(self, audio_data: bytes) -> Optional[Tuple[bytes, Dict[str, Any]]]:
|
| 474 |
+
"""Create a fallback segment when VAD methods fail."""
|
| 475 |
+
try:
|
| 476 |
+
# Use the entire audio as a segment if it's reasonable length
|
| 477 |
+
debug_info = self.validate_and_debug_audio(audio_data)
|
| 478 |
+
|
| 479 |
+
if debug_info['duration'] > 0:
|
| 480 |
+
duration = debug_info['duration']
|
| 481 |
+
else:
|
| 482 |
+
# Estimate duration based on size (assume 16-bit, mono, 16kHz)
|
| 483 |
+
estimated_samples = len(audio_data) // 2
|
| 484 |
+
duration = estimated_samples / self.sample_rate
|
| 485 |
+
|
| 486 |
+
if self.min_speech_duration <= duration <= self.max_speech_duration:
|
| 487 |
+
return (audio_data, {
|
| 488 |
+
'duration': duration,
|
| 489 |
+
'method': 'fallback',
|
| 490 |
+
'estimated': True,
|
| 491 |
+
'issues': debug_info['issues']
|
| 492 |
+
})
|
| 493 |
+
|
| 494 |
+
return None
|
| 495 |
+
|
| 496 |
+
except Exception as e:
|
| 497 |
+
logger.error(f"Fallback segment creation failed: {e}")
|
| 498 |
+
return None
|
| 499 |
+
|
| 500 |
+
async def save_audio_chunk_async(self, audio_data: bytes, session_id: str,
|
| 501 |
+
chunk_type: str = "vad_chunk") -> Optional[str]:
|
| 502 |
+
"""
|
| 503 |
+
Asynchronously save audio chunk to file.
|
| 504 |
+
|
| 505 |
+
Args:
|
| 506 |
+
audio_data: Audio data to save
|
| 507 |
+
session_id: Session identifier
|
| 508 |
+
chunk_type: Type of chunk (for filename)
|
| 509 |
+
|
| 510 |
+
Returns:
|
| 511 |
+
Path to saved file or None if failed
|
| 512 |
+
"""
|
| 513 |
+
def _save_chunk():
|
| 514 |
+
try:
|
| 515 |
+
with self.save_lock:
|
| 516 |
+
timestamp = int(time.time() * 1000)
|
| 517 |
+
filename = f"{chunk_type}_{session_id}_{timestamp}.wav"
|
| 518 |
+
filepath = Path("output") / filename
|
| 519 |
+
|
| 520 |
+
# Ensure output directory exists
|
| 521 |
+
filepath.parent.mkdir(exist_ok=True)
|
| 522 |
+
|
| 523 |
+
# Save as WAV file
|
| 524 |
+
with open(filepath, 'wb') as f:
|
| 525 |
+
f.write(audio_data)
|
| 526 |
+
|
| 527 |
+
logger.debug(f"Saved audio chunk: {filepath}")
|
| 528 |
+
return str(filepath)
|
| 529 |
+
|
| 530 |
+
except Exception as e:
|
| 531 |
+
logger.error(f"Failed to save audio chunk: {e}")
|
| 532 |
+
return None
|
| 533 |
+
|
| 534 |
+
# Run in executor to avoid blocking
|
| 535 |
+
loop = asyncio.get_event_loop()
|
| 536 |
+
result = await loop.run_in_executor(self.executor, _save_chunk)
|
| 537 |
+
return result
|
| 538 |
+
|
| 539 |
+
def get_stats(self) -> Dict[str, Any]:
|
| 540 |
+
"""Get comprehensive VAD statistics."""
|
| 541 |
+
stats = self.stats.copy()
|
| 542 |
+
|
| 543 |
+
if stats['total_chunks_processed'] > 0:
|
| 544 |
+
stats['average_processing_time'] = stats['processing_time_total'] / stats['total_chunks_processed']
|
| 545 |
+
stats['segments_per_chunk'] = stats['speech_segments_detected'] / stats['total_chunks_processed']
|
| 546 |
+
else:
|
| 547 |
+
stats['average_processing_time'] = 0.0
|
| 548 |
+
stats['segments_per_chunk'] = 0.0
|
| 549 |
+
|
| 550 |
+
return stats
|
| 551 |
+
|
| 552 |
+
def cleanup(self):
|
| 553 |
+
"""Clean up resources."""
|
| 554 |
+
if hasattr(self, 'executor'):
|
| 555 |
+
self.executor.shutdown(wait=True)
|
| 556 |
+
logger.info("Enhanced VAD cleaned up")
|
| 557 |
+
|
| 558 |
+
# Convenience function for creating enhanced VAD
|
| 559 |
+
def create_enhanced_vad(config: Optional[Dict[str, Any]] = None) -> EnhancedVAD:
|
| 560 |
+
"""Create enhanced VAD with optional configuration."""
|
| 561 |
+
if config is None:
|
| 562 |
+
config = {}
|
| 563 |
+
|
| 564 |
+
return EnhancedVAD(
|
| 565 |
+
sample_rate=config.get('sample_rate', 16000),
|
| 566 |
+
frame_duration_ms=config.get('frame_duration_ms', 30),
|
| 567 |
+
aggressiveness=config.get('aggressiveness', 1),
|
| 568 |
+
min_speech_duration=config.get('min_speech_duration', 0.4),
|
| 569 |
+
max_speech_duration=config.get('max_speech_duration', 3.0),
|
| 570 |
+
silence_threshold=config.get('silence_threshold', 0.01)
|
| 571 |
+
)
|
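
A minimal usage sketch for the VAD above (illustrative only, not part of the commit): it assumes a 16 kHz mono WAV file named sample.wav and that the repo root is on PYTHONPATH.

# Hypothetical driver script for EnhancedVAD; sample.wav is a placeholder
from utils.enhanced_vad import create_enhanced_vad

vad = create_enhanced_vad({'aggressiveness': 2, 'min_speech_duration': 0.3})
with open('sample.wav', 'rb') as f:
    audio_bytes = f.read()

for segment_audio, info in vad.detect_speech_segments(audio_bytes):
    print(f"{info['method']}: {info['duration']:.2f}s, {len(segment_audio)} bytes")

print(vad.get_stats())
vad.cleanup()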
utils/logging_utils.py
ADDED
@@ -0,0 +1,201 @@
import logging
import time
from typing import Dict, Any
from collections import defaultdict, deque
import json

class PerformanceLogger:
    """
    Performance logger for tracking audio processing metrics.
    Provides detailed logging and statistics for each processing method.
    """

    def __init__(self, max_history: int = 100):
        self.max_history = max_history
        self.method_stats = defaultdict(lambda: {
            'predictions': deque(maxlen=max_history),
            'inference_times': deque(maxlen=max_history),
            'errors': deque(maxlen=max_history),
            'total_calls': 0,
            'total_errors': 0
        })

        # Setup structured logging
        self.setup_logging()

    def setup_logging(self):
        """Setup structured logging with proper formatting."""
        # Create custom formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

        # Setup console handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)

        # Setup file handler
        file_handler = logging.FileHandler('audio_digit_classifier.log')
        file_handler.setFormatter(formatter)

        # Configure root logger
        logging.basicConfig(
            level=logging.DEBUG,
            handlers=[console_handler, file_handler]
        )

        self.logger = logging.getLogger(__name__)

    def log_prediction(self, method: str, result: Dict[str, Any]):
        """
        Log a prediction result with performance metrics.

        Args:
            method: Processing method name
            result: Prediction result dictionary
        """
        stats = self.method_stats[method]
        stats['total_calls'] += 1

        if result.get('success', True):
            stats['predictions'].append({
                'digit': result.get('predicted_digit'),
                'timestamp': result.get('timestamp', time.time()),
                'inference_time': result.get('inference_time', 0)
            })
            stats['inference_times'].append(result.get('inference_time', 0))

            self.logger.info(json.dumps({
                'event': 'prediction',
                'method': method,
                'digit': result.get('predicted_digit'),
                'inference_time': result.get('inference_time'),
                'timestamp': result.get('timestamp')
            }))
        else:
            stats['total_errors'] += 1
            stats['errors'].append({
                'error': result.get('error'),
                'timestamp': result.get('timestamp', time.time()),
                'inference_time': result.get('inference_time', 0)
            })

            self.logger.error(json.dumps({
                'event': 'error',
                'method': method,
                'error': result.get('error'),
                'timestamp': result.get('timestamp')
            }))

    def get_method_stats(self, method: str) -> Dict[str, Any]:
        """
        Get performance statistics for a specific method.

        Args:
            method: Processing method name

        Returns:
            Dictionary with performance statistics
        """
        stats = self.method_stats[method]
        inference_times = list(stats['inference_times'])

        if not inference_times:
            return {
                'method': method,
                'total_calls': stats['total_calls'],
                'successful_predictions': 0,
                'error_rate': 0.0,
                'avg_inference_time': 0.0,
                'min_inference_time': 0.0,
                'max_inference_time': 0.0
            }

        successful_predictions = len(inference_times)
        error_rate = stats['total_errors'] / stats['total_calls'] if stats['total_calls'] > 0 else 0

        return {
            'method': method,
            'total_calls': stats['total_calls'],
            'successful_predictions': successful_predictions,
            'error_rate': round(error_rate * 100, 2),
            'avg_inference_time': round(sum(inference_times) / len(inference_times), 3),
            'min_inference_time': round(min(inference_times), 3),
            'max_inference_time': round(max(inference_times), 3),
            'recent_predictions': list(stats['predictions'])[-10:]  # Last 10 predictions
        }

    def get_all_stats(self) -> Dict[str, Any]:
        """Get statistics for all processing methods."""
        all_stats = {}
        for method in self.method_stats.keys():
            all_stats[method] = self.get_method_stats(method)

        return all_stats

    def get_comparison_report(self) -> str:
        """
        Generate a comparison report of all processing methods.

        Returns:
            Formatted string with method comparison
        """
        all_stats = self.get_all_stats()

        if not all_stats:
            return "No statistics available yet."

        report = "\n=== Audio Processing Method Comparison ===\n\n"

        for method, stats in all_stats.items():
            report += f"Method: {method}\n"
            report += f"  Total Calls: {stats['total_calls']}\n"
            report += f"  Successful: {stats['successful_predictions']}\n"
            report += f"  Error Rate: {stats['error_rate']}%\n"
            report += f"  Avg Time: {stats['avg_inference_time']}s\n"
            report += f"  Min/Max: {stats['min_inference_time']}s / {stats['max_inference_time']}s\n"
            report += "\n"

        # Find best performing methods
        if len(all_stats) > 1:
            best_speed = min(all_stats.items(), key=lambda x: x[1]['avg_inference_time'])
            best_accuracy = min(all_stats.items(), key=lambda x: x[1]['error_rate'])

            report += f"Fastest Method: {best_speed[0]} ({best_speed[1]['avg_inference_time']}s avg)\n"
            report += f"Most Accurate: {best_accuracy[0]} ({best_accuracy[1]['error_rate']}% error rate)\n"

        return report

    def log_system_info(self, info: Dict[str, Any]):
        """Log system information for debugging."""
        self.logger.info(json.dumps({
            'event': 'system_info',
            'timestamp': time.time(),
            **info
        }))

    def log_audio_info(self, duration: float, format_info: Dict[str, Any]):
        """Log audio input information."""
        self.logger.debug(json.dumps({
            'event': 'audio_input',
            'duration': duration,
            'format': format_info,
            'timestamp': time.time()
        }))

# Global performance logger instance
performance_logger = PerformanceLogger()

def setup_flask_logging(app):
    """Setup logging configuration for Flask application."""
    if not app.debug:
        # Production logging
        file_handler = logging.FileHandler('flask_app.log')
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s %(levelname)s %(name)s %(message)s'
        ))
        file_handler.setLevel(logging.INFO)
        app.logger.addHandler(file_handler)
        app.logger.setLevel(logging.INFO)

    app.logger.info('Audio Digit Classifier startup')
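
A quick sketch of how a processor is expected to feed this logger (illustrative only, not part of the commit; the result dicts below are hypothetical but mirror the keys log_prediction reads):

from utils.logging_utils import performance_logger

# Hypothetical prediction results for a method named 'ml_mfcc'
performance_logger.log_prediction('ml_mfcc', {
    'success': True, 'predicted_digit': 7,
    'inference_time': 0.042, 'timestamp': 1700000000.0
})
performance_logger.log_prediction('ml_mfcc', {
    'success': False, 'error': 'empty audio', 'timestamp': 1700000001.0
})

print(performance_logger.get_method_stats('ml_mfcc')['error_rate'])  # 50.0
print(performance_logger.get_comparison_report())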
utils/noise_utils.py
ADDED
@@ -0,0 +1,292 @@
import numpy as np
import wave
import io
import logging
from typing import Literal

logger = logging.getLogger(__name__)

# 'speech' is reserved for future use; the generators below do not implement it.
NoiseType = Literal['white', 'pink', 'brown', 'gaussian', 'background', 'speech']

class NoiseGenerator:
    """
    Audio noise generator for robustness testing.
    Supports various types of noise injection for testing digit recognition.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def generate_white_noise(self, duration: float, sample_rate: int = 16000,
                             amplitude: float = 0.1) -> np.ndarray:
        """
        Generate a white noise signal.

        Args:
            duration: Duration in seconds
            sample_rate: Sample rate in Hz
            amplitude: Noise amplitude (0.0 to 1.0)

        Returns:
            Numpy array of white noise
        """
        samples = int(duration * sample_rate)
        noise = np.random.normal(0, amplitude, samples)
        return noise.astype(np.float32)

    def generate_pink_noise(self, duration: float, sample_rate: int = 16000,
                            amplitude: float = 0.1) -> np.ndarray:
        """
        Generate pink noise (1/f noise).

        Args:
            duration: Duration in seconds
            sample_rate: Sample rate in Hz
            amplitude: Noise amplitude

        Returns:
            Numpy array of pink noise
        """
        samples = int(duration * sample_rate)

        # Generate white noise
        white = np.random.randn(samples)

        # Build a 1/f amplitude filter in the frequency domain
        freqs = np.fft.fftfreq(samples, 1 / sample_rate)
        freqs[0] = 1  # Avoid division by zero at DC

        filter_response = 1.0 / np.sqrt(np.abs(freqs))
        filter_response[0] = 0

        # Apply the filter
        white_fft = np.fft.fft(white)
        pink_fft = white_fft * filter_response
        pink = np.real(np.fft.ifft(pink_fft))

        # Normalize and scale
        pink = pink / np.std(pink) * amplitude
        return pink.astype(np.float32)

    def generate_brown_noise(self, duration: float, sample_rate: int = 16000,
                             amplitude: float = 0.1) -> np.ndarray:
        """
        Generate brown noise (1/f^2 noise).

        Args:
            duration: Duration in seconds
            sample_rate: Sample rate in Hz
            amplitude: Noise amplitude

        Returns:
            Numpy array of brown noise
        """
        samples = int(duration * sample_rate)

        # Generate white noise and integrate it (cumulative sum)
        white = np.random.randn(samples)
        brown = np.cumsum(white)

        # Normalize and scale
        brown = brown / np.std(brown) * amplitude
        return brown.astype(np.float32)

    def generate_gaussian_noise(self, duration: float, sample_rate: int = 16000,
                                amplitude: float = 0.1) -> np.ndarray:
        """
        Generate Gaussian (normal distribution) noise.

        Args:
            duration: Duration in seconds
            sample_rate: Sample rate in Hz
            amplitude: Noise amplitude (standard deviation)

        Returns:
            Numpy array of Gaussian noise
        """
        samples = int(duration * sample_rate)
        noise = np.random.normal(0, amplitude, samples)
        return noise.astype(np.float32)

    def generate_background_noise(self, duration: float, sample_rate: int = 16000,
                                  amplitude: float = 0.05) -> np.ndarray:
        """
        Generate realistic background noise (a mixture of different noise types).

        Args:
            duration: Duration in seconds
            sample_rate: Sample rate in Hz
            amplitude: Noise amplitude

        Returns:
            Numpy array of background noise
        """
        # Mix different types of noise
        white = self.generate_white_noise(duration, sample_rate, amplitude * 0.3)
        pink = self.generate_pink_noise(duration, sample_rate, amplitude * 0.5)

        # Add some low-frequency rumble
        t = np.linspace(0, duration, int(sample_rate * duration), False)
        rumble = amplitude * 0.2 * np.sin(2 * np.pi * 60 * t)  # 60 Hz hum

        background = white + pink + rumble
        return background.astype(np.float32)

    def _generate_noise(self, noise_type: NoiseType, duration: float,
                        sample_rate: int, amplitude: float) -> np.ndarray:
        """Dispatch to the generator for the requested noise type."""
        generators = {
            'white': self.generate_white_noise,
            'pink': self.generate_pink_noise,
            'brown': self.generate_brown_noise,
            'gaussian': self.generate_gaussian_noise,
            'background': self.generate_background_noise,
        }
        if noise_type not in generators:
            raise Exception(f"Unsupported noise type: {noise_type}")
        return generators[noise_type](duration, sample_rate, amplitude)

    def _match_length(self, noise: np.ndarray, target_length: int) -> np.ndarray:
        """Truncate or zero-pad noise to exactly target_length samples."""
        if len(noise) > target_length:
            return noise[:target_length]
        if len(noise) < target_length:
            return np.pad(noise, (0, target_length - len(noise)))
        return noise

    def inject_noise(self, audio_data: bytes, noise_type: NoiseType,
                     noise_level: float = 0.1) -> bytes:
        """
        Inject noise into existing audio data.

        Args:
            audio_data: Original audio bytes (WAV format)
            noise_type: Type of noise to inject
            noise_level: Noise level relative to the signal (0.0 to 1.0)

        Returns:
            Audio bytes with noise injected

        Raises:
            Exception: If noise injection fails
        """
        try:
            # Decode the input WAV
            with wave.open(io.BytesIO(audio_data), 'rb') as wav_file:
                frames = wav_file.readframes(wav_file.getnframes())
                sample_rate = wav_file.getframerate()
                channels = wav_file.getnchannels()
                sample_width = wav_file.getsampwidth()

            if sample_width != 2:
                raise Exception(f"Unsupported sample width: {sample_width}")

            audio_array = np.frombuffer(frames, dtype=np.int16)

            # Convert to float in [-1, 1]
            audio_float = audio_array.astype(np.float32) / 32767.0

            if channels == 2:
                # Process each stereo channel separately
                audio_float = audio_float.reshape(-1, 2)
                for ch in range(2):
                    channel_data = audio_float[:, ch]
                    duration = len(channel_data) / sample_rate

                    # Generate noise and guard against rounding length mismatches
                    noise = self._generate_noise(noise_type, duration, sample_rate, noise_level)
                    noise = self._match_length(noise, len(channel_data))

                    # Add noise
                    audio_float[:, ch] = channel_data + noise

                # Flatten back to interleaved samples
                audio_float = audio_float.flatten()
            else:
                # Mono processing
                duration = len(audio_float) / sample_rate
                noise = self._generate_noise(noise_type, duration, sample_rate, noise_level)
                noise = self._match_length(noise, len(audio_float))

                # Add noise
                audio_float = audio_float + noise

            # Clip to prevent overflow
            audio_float = np.clip(audio_float, -1.0, 1.0)

            # Convert back to int16
            audio_int16 = (audio_float * 32767).astype(np.int16)

            # Encode the output WAV
            output = io.BytesIO()
            with wave.open(output, 'wb') as output_wav:
                output_wav.setnchannels(channels)
                output_wav.setsampwidth(sample_width)
                output_wav.setframerate(sample_rate)
                output_wav.writeframes(audio_int16.tobytes())

            self.logger.debug(f"Injected {noise_type} noise at level {noise_level}")
            return output.getvalue()

        except Exception as e:
            self.logger.error(f"Noise injection failed: {str(e)}")
            raise Exception(f"Failed to inject noise: {str(e)}")

    def create_pure_noise(self, noise_type: NoiseType, duration: float = 1.0,
                          sample_rate: int = 16000, amplitude: float = 0.3) -> bytes:
        """
        Create a pure noise audio file for testing.

        Args:
            noise_type: Type of noise to generate
            duration: Duration in seconds
            sample_rate: Sample rate in Hz
            amplitude: Noise amplitude

        Returns:
            WAV audio bytes containing pure noise
        """
        try:
            # Generate the noise signal
            noise = self._generate_noise(noise_type, duration, sample_rate, amplitude)

            # Convert to int16
            noise_int16 = (np.clip(noise, -1.0, 1.0) * 32767).astype(np.int16)

            # Encode as WAV
            output = io.BytesIO()
            with wave.open(output, 'wb') as wav_file:
                wav_file.setnchannels(1)  # Mono
                wav_file.setsampwidth(2)  # 16-bit
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(noise_int16.tobytes())

            return output.getvalue()

        except Exception as e:
            self.logger.error(f"Pure noise generation failed: {str(e)}")
            raise Exception(f"Failed to create pure noise: {str(e)}")

# Global noise generator instance
noise_generator = NoiseGenerator()
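
An illustrative robustness-test snippet (not part of the commit; digit.wav is a placeholder for any 16-bit PCM WAV recording):

from utils.noise_utils import noise_generator

# digit.wav is a hypothetical clean recording of a spoken digit
with open('digit.wav', 'rb') as f:
    clean = f.read()

# Degrade the clean recording with pink noise at 5% amplitude
noisy = noise_generator.inject_noise(clean, 'pink', noise_level=0.05)
with open('digit_pink.wav', 'wb') as f:
    f.write(noisy)

# Pure noise clips serve as non-speech negative test inputs
noise_clip = noise_generator.create_pure_noise('background', duration=1.0)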
utils/session_manager.py
ADDED
@@ -0,0 +1,340 @@
"""
Session Management for Audio Chunk Storage
Handles session creation, audio chunk saving, and folder organization
"""

import time
import uuid
import logging
import wave
import shutil
from typing import Dict, Optional, List
from pathlib import Path
import json
import threading

logger = logging.getLogger(__name__)

class SessionManager:
    """
    Manages audio recording sessions with systematic file storage.
    Each session gets a unique ID and folder for organized chunk storage.
    """

    def __init__(self, base_output_dir: str = "output"):
        """
        Initialize session manager.

        Args:
            base_output_dir: Base directory for all session outputs
        """
        self.base_output_dir = Path(base_output_dir)
        self.base_output_dir.mkdir(exist_ok=True)

        # Active sessions tracking
        self.active_sessions: Dict[str, 'AudioSession'] = {}
        self.lock = threading.Lock()

        logger.info(f"Session manager initialized with output directory: {self.base_output_dir}")

    def create_session(self, session_id: Optional[str] = None) -> str:
        """
        Create a new audio recording session.

        Args:
            session_id: Optional custom session ID, otherwise auto-generated

        Returns:
            str: Session ID
        """
        if not session_id:
            # Generate session ID with timestamp and short UUID
            timestamp = int(time.time())
            short_uuid = str(uuid.uuid4())[:8]
            session_id = f"session{timestamp}_{short_uuid}"

        with self.lock:
            if session_id in self.active_sessions:
                logger.warning(f"Session {session_id} already exists, returning existing session")
                return session_id

            # Create session object
            session = AudioSession(session_id, self.base_output_dir)
            self.active_sessions[session_id] = session

            logger.info(f"Created new session: {session_id}")
            return session_id

    def get_session(self, session_id: str) -> Optional['AudioSession']:
        """Get an existing session by ID."""
        with self.lock:
            return self.active_sessions.get(session_id)

    def close_session(self, session_id: str) -> bool:
        """
        Close and finalize a session.

        Args:
            session_id: Session to close

        Returns:
            bool: True if the session was closed successfully
        """
        with self.lock:
            if session_id not in self.active_sessions:
                logger.warning(f"Session {session_id} not found")
                return False

            session = self.active_sessions[session_id]
            session.finalize()
            del self.active_sessions[session_id]

            logger.info(f"Closed session: {session_id} ({session.chunk_count} chunks saved)")
            return True

    def cleanup_old_sessions(self, max_age_hours: int = 24) -> int:
        """
        Clean up sessions older than the specified age.

        Args:
            max_age_hours: Maximum age in hours before cleanup

        Returns:
            int: Number of sessions cleaned up
        """
        cutoff_time = time.time() - (max_age_hours * 3600)
        cleaned_count = 0

        # Find old session folders
        for session_dir in self.base_output_dir.iterdir():
            if not session_dir.is_dir() or not session_dir.name.startswith('session'):
                continue

            try:
                # Check if the session has a metadata file with a creation time
                metadata_file = session_dir / "session_info.json"
                if metadata_file.exists():
                    with open(metadata_file, 'r') as f:
                        metadata = json.load(f)
                    if metadata.get('created_at', 0) < cutoff_time:
                        shutil.rmtree(session_dir)
                        cleaned_count += 1
                        logger.info(f"Cleaned up old session: {session_dir.name}")
                else:
                    # Fall back to the directory modification time
                    if session_dir.stat().st_mtime < cutoff_time:
                        shutil.rmtree(session_dir)
                        cleaned_count += 1
                        logger.info(f"Cleaned up old session: {session_dir.name}")

            except Exception as e:
                logger.error(f"Error cleaning up session {session_dir.name}: {e}")

        if cleaned_count > 0:
            logger.info(f"Cleaned up {cleaned_count} old sessions")

        return cleaned_count

    def get_session_stats(self) -> Dict:
        """Get statistics about all sessions."""
        with self.lock:
            stats = {
                'active_sessions': len(self.active_sessions),
                'total_chunks_active': sum(s.chunk_count for s in self.active_sessions.values()),
                'session_details': {
                    sid: {
                        'chunk_count': session.chunk_count,
                        'created_at': session.created_at,
                        'folder_path': str(session.session_dir)
                    }
                    for sid, session in self.active_sessions.items()
                }
            }

        # Count total session folders
        total_session_dirs = len([
            d for d in self.base_output_dir.iterdir()
            if d.is_dir() and d.name.startswith('session')
        ])
        stats['total_session_folders'] = total_session_dirs

        return stats


class AudioSession:
    """
    Represents a single audio recording session with systematic chunk storage.
    """

    def __init__(self, session_id: str, base_output_dir: Path):
        """
        Initialize audio session.

        Args:
            session_id: Unique session identifier
            base_output_dir: Base directory for output
        """
        self.session_id = session_id
        self.created_at = time.time()
        self.chunk_count = 0

        # Create session directory
        self.session_dir = base_output_dir / session_id
        self.session_dir.mkdir(exist_ok=True)

        # Create subdirectories
        self.chunks_dir = self.session_dir / "chunks"
        self.chunks_dir.mkdir(exist_ok=True)

        # Session metadata
        self.metadata = {
            'session_id': session_id,
            'created_at': self.created_at,
            'created_at_human': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.created_at)),
            'chunk_count': 0,
            'chunks': []
        }

        self._save_metadata()
        logger.info(f"Session folder created: {self.session_dir}")

    def save_audio_chunk(self, audio_data: bytes, prediction_result: Optional[Dict] = None,
                         chunk_type: str = "speech") -> str:
        """
        Save an audio chunk to the session folder.

        Args:
            audio_data: Raw audio bytes (WAV format preferred)
            prediction_result: Optional prediction results to save alongside
            chunk_type: Type of chunk ("speech", "vad_segment", "raw", etc.)

        Returns:
            str: Path to saved chunk file
        """
        self.chunk_count += 1

        # Generate chunk filename
        chunk_filename = f"{self.chunk_count:03d}.wav"
        chunk_path = self.chunks_dir / chunk_filename

        try:
            # Save audio data
            if self._is_wav_format(audio_data):
                # Already WAV format, save directly
                with open(chunk_path, 'wb') as f:
                    f.write(audio_data)
                logger.debug(f"Saved WAV chunk: {chunk_path}")
            else:
                # Convert raw PCM to WAV
                self._save_pcm_as_wav(audio_data, chunk_path)
                logger.debug(f"Converted and saved PCM chunk: {chunk_path}")

            # Update metadata
            chunk_info = {
                'chunk_id': self.chunk_count,
                'filename': chunk_filename,
                'chunk_type': chunk_type,
                'size_bytes': len(audio_data),
                'saved_at': time.time(),
                'saved_at_human': time.strftime('%Y-%m-%d %H:%M:%S'),
                'audio_format': 'wav' if self._is_wav_format(audio_data) else 'pcm_converted'
            }

            # Add prediction results if provided
            if prediction_result:
                chunk_info['prediction'] = prediction_result

            self.metadata['chunks'].append(chunk_info)
            self.metadata['chunk_count'] = self.chunk_count
            self._save_metadata()

            logger.info(f"Saved audio chunk {self.chunk_count}: {chunk_path}")
            return str(chunk_path)

        except Exception as e:
            logger.error(f"Failed to save audio chunk {self.chunk_count}: {e}")
            # Roll back the chunk count on failure
            self.chunk_count -= 1
            raise

    def _is_wav_format(self, audio_data: bytes) -> bool:
        """Check if audio data is in WAV format."""
        return audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:12]

    def _save_pcm_as_wav(self, pcm_data: bytes, output_path: Path,
                         sample_rate: int = 16000, channels: int = 1, sample_width: int = 2):
        """
        Convert raw PCM data to WAV format and save.

        Args:
            pcm_data: Raw PCM bytes
            output_path: Output WAV file path
            sample_rate: Sample rate (default 16kHz for speech)
            channels: Number of channels (default mono)
            sample_width: Sample width in bytes (default 16-bit)
        """
        try:
            with wave.open(str(output_path), 'wb') as wav_file:
                wav_file.setnchannels(channels)
                wav_file.setsampwidth(sample_width)
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(pcm_data)

        except Exception as e:
            logger.error(f"PCM to WAV conversion failed: {e}")
            # Fallback: save as raw PCM with a .pcm extension
            raw_path = output_path.with_suffix('.pcm')
            with open(raw_path, 'wb') as f:
                f.write(pcm_data)
            logger.warning(f"Saved as raw PCM instead: {raw_path}")

    def _save_metadata(self):
        """Save session metadata to JSON file."""
        try:
            metadata_path = self.session_dir / "session_info.json"
            with open(metadata_path, 'w') as f:
                json.dump(self.metadata, f, indent=2, default=str)
        except Exception as e:
            logger.error(f"Failed to save session metadata: {e}")

    def finalize(self):
        """Finalize the session and save final metadata."""
        self.metadata['finalized_at'] = time.time()
        self.metadata['finalized_at_human'] = time.strftime('%Y-%m-%d %H:%M:%S')
        self.metadata['final_chunk_count'] = self.chunk_count
        self._save_metadata()

        logger.info(f"📋 Finalized session {self.session_id}: {self.chunk_count} chunks saved")

    def get_chunk_list(self) -> List[str]:
        """Get list of all chunk files in order."""
        chunk_files = []
        for i in range(1, self.chunk_count + 1):
            chunk_file = self.chunks_dir / f"{i:03d}.wav"
            if chunk_file.exists():
                chunk_files.append(str(chunk_file))
            else:
                # Check for a .pcm fallback
                pcm_file = self.chunks_dir / f"{i:03d}.pcm"
                if pcm_file.exists():
                    chunk_files.append(str(pcm_file))
        return chunk_files

    def get_session_summary(self) -> Dict:
        """Get comprehensive session summary."""
        return {
            'session_id': self.session_id,
            'created_at': self.created_at,
            'chunk_count': self.chunk_count,
            'session_dir': str(self.session_dir),
            'chunks_dir': str(self.chunks_dir),
            'chunk_files': self.get_chunk_list(),
            'metadata': self.metadata
        }


# Global session manager instance
session_manager = SessionManager()
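
An illustrative end-to-end session sketch (not part of the commit): one session per recording, with chunks numbered 001.wav, 002.wav, ... under output/<session_id>/chunks/.

import numpy as np
from utils.session_manager import session_manager

# One second of silence as raw 16-bit PCM; save_audio_chunk wraps it in WAV
pcm_bytes = np.zeros(16000, dtype=np.int16).tobytes()

sid = session_manager.create_session()
session = session_manager.get_session(sid)
path = session.save_audio_chunk(pcm_bytes, prediction_result={'predicted_digit': 3})
print(path)  # e.g. output/session<timestamp>_<uuid>/chunks/001.wav
session_manager.close_session(sid)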