diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000000000000000000000000000000000000..d2864ab19aaeb74650fac16088b25523ac460376
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,21 @@
+# Environment variables for HF Spaces deployment
+# Copy this to .env and set your values
+
+# Flask Configuration
+SECRET_KEY=your_secret_key_here
+FLASK_ENV=production
+
+# External API Keys (optional - for external processors)
+HUGGING_FACE_TOKEN=your_huggingface_token_here
+OPENAI_API_KEY=your_openai_key_here
+
+# Model Configuration
+DEFAULT_ML_MODEL=ml_mfcc
+ENABLE_EXTERNAL_API=false
+
+# Performance Settings
+MAX_AUDIO_DURATION=10
+MAX_FILE_SIZE=10485760
+
+# Logging
+LOG_LEVEL=INFO
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..65ce1884825f19b8e634a3d5a2453640800f4d5d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+# Use Python 3.9 as recommended by HF Spaces
+FROM python:3.9-slim
+
+# Create user for HF Spaces (required)
+RUN useradd -m -u 1000 user
+USER user
+
+# Set environment variables
+ENV PATH="/home/user/.local/bin:$PATH"
+ENV PYTHONPATH="/app:$PYTHONPATH"
+ENV PYTHONUNBUFFERED=1
+
+# Set work directory
+WORKDIR /app
+
+# Install system dependencies (as user, limited packages)
+# Note: HF Spaces has restrictions on system packages
+COPY --chown=user ./requirements_hf.txt requirements.txt
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Copy application files (essential files only)
+COPY --chown=user ./app.py ./app.py
+COPY --chown=user ./audio_processors ./audio_processors
+COPY --chown=user ./utils ./utils
+COPY --chown=user ./models ./models
+
+# Copy environment template (users can set their own HUGGING_FACE_TOKEN)
+COPY --chown=user ./.env.example ./.env
+
+# Create log directory
+RUN mkdir -p /app/logs
+
+# Expose port (HF Spaces requires 7860)
+EXPOSE 7860
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD python -c "import requests; requests.get('http://localhost:7860/api/health').raise_for_status()" || exit 1
+
+# Run the application
+CMD ["python", "app.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 12cb2b255b7c4d309484c9740d04aa62caac792b..ef12b6888ec0526203a65e4ab4de9dfc392e785a 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,30 @@
 ---
-title: Streaming Digit Classifier
-emoji: 🦀
-colorFrom: gray
-colorTo: red
+title: Streaming Digit Classifier API
+emoji: 🎤
+colorFrom: green
+colorTo: blue
 sdk: docker
 pinned: false
-license: mit
-short_description: Real-time spoken digit recognition API with 4 ML approaches
+app_port: 7860
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Streaming Digit Classifier API
+
+Backend API for real-time spoken digit recognition (0-9).
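+
+## Quick start
+
+A minimal request from Python (replace the Space URL and the audio file name with your own; the full endpoint list is below):
+
+```python
+import requests
+
+API_URL = "https://YOUR-SPACE.hf.space"  # placeholder - use your own Space URL
+
+with open("seven.wav", "rb") as f:  # any supported format: wav, mp3, ogg, m4a, webm
+    resp = requests.post(
+        f"{API_URL}/api/process_audio",
+        files={"audio": ("seven.wav", f, "audio/wav")},
+        data={"method": "ml_mfcc"},  # or ml_mel_cnn / ml_raw_cnn
+    )
+resp.raise_for_status()
+print(resp.json()["predicted_digit"])
+```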
+ +## Features + +- ML Models: MFCC + Dense NN, Mel CNN, Raw CNN +- External API integration (Whisper) +- Real-time audio processing +- RESTful API endpoints + +## API Endpoints + +- \`GET /\` - API status +- \`POST /api/process_audio\` - Process audio file +- \`POST /api/process_audio_chunk\` - Process streaming chunk +- \`GET /api/health\` - Health check +- \`GET /api/processors\` - Available processors + +Frontend: [Deployed on Vercel](https://your-frontend-url.vercel.app) diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..2174c793a65a0ec518251e683896a72bc67689a3 --- /dev/null +++ b/app.py @@ -0,0 +1,361 @@ +""" +Audio Digit Classification API for Hugging Face Spaces +Backend API for spoken digit recognition (0-9) - HF Spaces deployment +""" + +from flask import Flask, request, jsonify +from flask_cors import CORS +import os +import time +import logging +from typing import Dict, Any, Optional +from dotenv import load_dotenv +import numpy as np + +# Import audio processors (only essential ones for deployment) +from audio_processors.external_api import ExternalAPIProcessor +from audio_processors.whisper_digit_processor import WhisperDigitProcessor +from audio_processors.ml_mfcc_processor import MLMFCCProcessor +from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor +from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor + +# Import utilities +from utils.audio_utils import validate_audio_format, convert_audio_format, get_audio_duration, convert_for_ml_models +from utils.logging_utils import performance_logger, setup_flask_logging + +# Load environment variables +load_dotenv() + +# Initialize Flask app +app = Flask(__name__) +app.secret_key = os.getenv('SECRET_KEY', 'hf_spaces_deployment_key') + +# Enable CORS for frontend requests from Vercel +CORS(app, origins=['*']) # In production, specify your Vercel domain + +# Setup logging +setup_flask_logging(app) + +# Configuration for HF Spaces +MAX_AUDIO_DURATION = 10 # seconds +MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB +ALLOWED_EXTENSIONS = {'wav', 'mp3', 'ogg', 'm4a', 'webm'} + +def allowed_file(filename: str) -> bool: + """Check if file extension is allowed.""" + return '.' 
in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + +def initialize_processors(): + """Initialize audio processors optimized for HF Spaces deployment.""" + procs = {} + + # ML-trained processors (high priority - use best models only) + ml_processors = [ + ('ml_mfcc', MLMFCCProcessor, 'ML MFCC + Dense NN (Best - 98.52%)'), + ('ml_mel_cnn', MLMelCNNProcessor, 'ML Mel CNN (Good - 97.22%)'), + ('ml_raw_cnn', MLRawCNNProcessor, 'ML Raw CNN (Fair - 91.30%)') + ] + + ml_working_count = 0 + for proc_key, proc_class, proc_name in ml_processors: + try: + processor = proc_class() + if processor.is_configured(): + procs[proc_key] = processor + ml_working_count += 1 + app.logger.info(f"[OK] {proc_name} loaded successfully") + else: + app.logger.warning(f"[WARN] {proc_name} not configured (model files missing)") + except Exception as e: + app.logger.error(f"[FAIL] Failed to initialize {proc_name}: {str(e)}") + + # External API processor as fallback + try: + external_processor = ExternalAPIProcessor() + if external_processor.is_configured(): + procs['external_api'] = external_processor + app.logger.info("[OK] External API processor initialized") + else: + app.logger.warning("[WARN] External API not configured") + except Exception as e: + app.logger.error(f"[FAIL] Failed to initialize External API: {str(e)}") + + # Whisper digit processor as another fallback + try: + whisper_processor = WhisperDigitProcessor() + if whisper_processor.is_configured(): + procs['whisper_digit'] = whisper_processor + app.logger.info("[OK] Whisper digit processor initialized") + except Exception as e: + app.logger.error(f"[FAIL] Failed to initialize Whisper: {str(e)}") + + app.logger.info(f"Processor initialization complete:") + app.logger.info(f" ML Models loaded: {ml_working_count}/3") + app.logger.info(f" Total processors: {len(procs)}") + + return procs + +processors = initialize_processors() + +@app.route('/') +def index(): + """API status endpoint.""" + return jsonify({ + 'message': 'Streaming Digit Classifier API', + 'status': 'running', + 'version': '1.0.0', + 'available_processors': list(processors.keys()), + 'documentation': 'Frontend at Vercel, Backend API at HF Spaces' + }) + +@app.route('/api/process_audio', methods=['POST']) +def process_audio(): + """ + Process audio file with selected method and return digit prediction. + Expects multipart form data with 'audio' file and 'method' selection. + """ + try: + # Validate request + if 'audio' not in request.files: + return jsonify({'error': 'No audio file provided'}), 400 + + if 'method' not in request.form: + return jsonify({'error': 'No processing method specified'}), 400 + + audio_file = request.files['audio'] + method = request.form['method'] + + # Validate audio file + if audio_file.filename == '': + return jsonify({'error': 'No file selected'}), 400 + + if not allowed_file(audio_file.filename): + return jsonify({'error': 'Unsupported file format'}), 400 + + # Validate method + if method not in processors: + return jsonify({'error': f'Unknown processing method: {method}'}), 400 + + # Read audio data + audio_data = audio_file.read() + + # Check file size + if len(audio_data) > MAX_FILE_SIZE: + return jsonify({'error': 'Audio file too large'}), 400 + + # Convert to standard format + try: + app.logger.debug(f"Converting audio format. 
Original size: {len(audio_data)} bytes") + standardized_audio = convert_audio_format(audio_data) + app.logger.debug(f"Converted audio size: {len(standardized_audio)} bytes") + except Exception as e: + app.logger.error(f"Audio conversion failed: {str(e)}") + return jsonify({'error': 'Failed to process audio format - unsupported format or corrupted file'}), 400 + + # Check audio duration + duration = get_audio_duration(standardized_audio) + if duration > MAX_AUDIO_DURATION: + return jsonify({ + 'error': f'Audio too long: {duration:.1f}s (max: {MAX_AUDIO_DURATION}s)' + }), 400 + + if duration < 0.1: + return jsonify({'error': 'Audio too short (minimum: 0.1s)'}), 400 + + # Log audio input info + performance_logger.log_audio_info(duration, { + 'filename': audio_file.filename, + 'size_bytes': len(audio_data), + 'converted_size': len(standardized_audio), + 'method': method + }) + + # Process with selected method + processor = processors[method] + result = processor.predict_with_timing(standardized_audio) + + # Log performance + performance_logger.log_prediction(method, result) + + # Add additional metadata + result.update({ + 'audio_duration': round(duration, 3), + 'file_size': len(audio_data), + 'api_version': '1.0.0' + }) + + app.logger.info(f"Processed audio with {method}: '{result['predicted_digit']}' in {result['inference_time']}s") + + return jsonify(result) + + except Exception as e: + app.logger.error(f"Audio processing error: {str(e)}") + return jsonify({ + 'error': 'Internal processing error', + 'success': False, + 'timestamp': time.time() + }), 500 + +@app.route('/api/process_audio_chunk', methods=['POST']) +def process_audio_chunk(): + """ + Process streaming audio chunk for real-time digit recognition. + """ + try: + # Validate request + if 'audio' not in request.files: + return jsonify({'error': 'No audio chunk provided'}), 400 + + audio_file = request.files['audio'] + method = request.form.get('method', 'ml_mfcc') # Default to best ML model + + # Validate method + if method not in processors: + return jsonify({'error': f'Unknown processing method: {method}'}), 400 + + # Read audio data + audio_data = audio_file.read() + + # Check chunk size + if len(audio_data) > MAX_FILE_SIZE: + return jsonify({'error': 'Audio chunk too large'}), 400 + + if len(audio_data) < 100: + return jsonify({'error': 'Audio chunk too small'}), 400 + + # Convert to standardized format + try: + standardized_audio = convert_for_ml_models(audio_data, 'streaming') + except Exception as e: + app.logger.error(f"Audio conversion failed for chunk: {str(e)}") + return jsonify({'error': 'Failed to process audio chunk format'}), 400 + + # Process audio chunk + processor = processors[method] + result = processor.predict_with_timing(standardized_audio) + + # Add streaming metadata + result.update({ + 'segment_index': 0, + 'segment_size': len(standardized_audio), + 'is_streaming': True, + 'api_version': '1.0.0' + }) + + app.logger.info(f"Streaming prediction: '{result['predicted_digit']}' " + f"(Inference: {result['inference_time']}s)") + + return jsonify({ + 'success': True, + 'segments_detected': 1, + 'total_results': 1, + 'results': [result], + 'timestamp': time.time(), + 'has_fallback': False + }) + + except Exception as e: + app.logger.error(f"Streaming audio processing error: {str(e)}") + return jsonify({ + 'error': 'Internal streaming processing error', + 'success': False, + 'timestamp': time.time() + }), 500 + +@app.route('/api/processors') +def get_processors(): + """Get information about available processors.""" 
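+    # Example response shape (one JSON object per loaded processor), e.g.:
+    #   {"ml_mfcc": {"name": "ML MFCC + Dense NN (Best)", "method": "ml_mfcc", "configured": true, ...}}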
+ try: + processor_info = {} + for name, processor in processors.items(): + info = { + 'name': processor.name, + 'method': name, + 'configured': getattr(processor, 'is_configured', lambda: True)() + } + + # Add model-specific info if available + if hasattr(processor, 'get_model_info'): + info.update(processor.get_model_info()) + + processor_info[name] = info + + return jsonify(processor_info) + + except Exception as e: + app.logger.error(f"Error getting processors: {str(e)}") + return jsonify({'error': 'Failed to retrieve processor information'}), 500 + +@app.route('/api/health') +def health_check(): + """Health check endpoint.""" + try: + # Check processor availability + processor_health = {} + for name, processor in processors.items(): + processor_health[name] = { + 'available': True, + 'configured': getattr(processor, 'is_configured', lambda: True)() + } + + return jsonify({ + 'status': 'healthy', + 'timestamp': time.time(), + 'processors': processor_health, + 'version': '1.0.0', + 'deployment': 'huggingface-spaces' + }) + + except Exception as e: + app.logger.error(f"Health check failed: {str(e)}") + return jsonify({ + 'status': 'unhealthy', + 'error': str(e), + 'timestamp': time.time() + }), 500 + +@app.errorhandler(404) +def not_found_error(error): + """Handle 404 errors.""" + return jsonify({'error': 'Endpoint not found', 'status': 404}), 404 + +@app.errorhandler(500) +def internal_error(error): + """Handle 500 errors.""" + app.logger.error(f"Internal error: {str(error)}") + return jsonify({'error': 'Internal server error', 'status': 500}), 500 + +@app.errorhandler(413) +def too_large_error(error): + """Handle file too large errors.""" + return jsonify({'error': 'File too large', 'status': 413}), 413 + +if __name__ == '__main__': + # Log startup information + try: + import importlib.metadata + flask_version = importlib.metadata.version('flask') + except: + flask_version = 'unknown' + + performance_logger.log_system_info({ + 'python_version': os.sys.version, + 'flask_version': flask_version, + 'processors_loaded': list(processors.keys()), + 'max_audio_duration': MAX_AUDIO_DURATION, + 'max_file_size': MAX_FILE_SIZE, + 'deployment': 'huggingface-spaces' + }) + + # Run server (HF Spaces requires port 7860) + port = int(os.getenv('PORT', 7860)) + + app.logger.info(f"Starting Audio Digit Classifier API on port {port}") + app.logger.info("Deployment: Hugging Face Spaces") + + app.run( + host='0.0.0.0', + port=port, + debug=False, # Disable debug in production + threaded=True + ) \ No newline at end of file diff --git a/audio_processors/__init__.py b/audio_processors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_processors/__pycache__/__init__.cpython-312.pyc b/audio_processors/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24b6ed93fd448c3a1c46ef9e08f316946dbe5b71 Binary files /dev/null and b/audio_processors/__pycache__/__init__.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/base_processor.cpython-312.pyc b/audio_processors/__pycache__/base_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..217f47741134a9965da1240d675e2848495f6f94 Binary files /dev/null and b/audio_processors/__pycache__/base_processor.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/external_api.cpython-312.pyc b/audio_processors/__pycache__/external_api.cpython-312.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..0d1f3157a6dda39e65e0a3ed609d120574571c16 Binary files /dev/null and b/audio_processors/__pycache__/external_api.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/faster_whisper_processor.cpython-312.pyc b/audio_processors/__pycache__/faster_whisper_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2c53727aca136ee9fba87bd63110648f667626c Binary files /dev/null and b/audio_processors/__pycache__/faster_whisper_processor.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/local_whisper.cpython-312.pyc b/audio_processors/__pycache__/local_whisper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..981414f8098a84a284e0108aa5035b299c457747 Binary files /dev/null and b/audio_processors/__pycache__/local_whisper.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/mel_spectrogram.cpython-312.pyc b/audio_processors/__pycache__/mel_spectrogram.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e50d4e8ae7c25eac3fc460d42f6970bff076395 Binary files /dev/null and b/audio_processors/__pycache__/mel_spectrogram.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/mfcc_processor.cpython-312.pyc b/audio_processors/__pycache__/mfcc_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca3427be1de4b342df50aaa1178107a2798a85e7 Binary files /dev/null and b/audio_processors/__pycache__/mfcc_processor.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/ml_mel_cnn_processor.cpython-312.pyc b/audio_processors/__pycache__/ml_mel_cnn_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8173a3e88ecf3d01ef296c6b49bb373d7c584f89 Binary files /dev/null and b/audio_processors/__pycache__/ml_mel_cnn_processor.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/ml_mfcc_processor.cpython-312.pyc b/audio_processors/__pycache__/ml_mfcc_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bc745aa7c35b84b5245701baaaee231fd7c1ced Binary files /dev/null and b/audio_processors/__pycache__/ml_mfcc_processor.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/ml_raw_cnn_processor.cpython-312.pyc b/audio_processors/__pycache__/ml_raw_cnn_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9946a2168b54b7f46205f707e6e898c5b54bc42 Binary files /dev/null and b/audio_processors/__pycache__/ml_raw_cnn_processor.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/raw_spectrogram.cpython-312.pyc b/audio_processors/__pycache__/raw_spectrogram.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bc176484db5ec43ae97ac994f632504ea7449c9 Binary files /dev/null and b/audio_processors/__pycache__/raw_spectrogram.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/wav2vec2_processor.cpython-312.pyc b/audio_processors/__pycache__/wav2vec2_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c8772577f97d26a25bc951cbcbaa2ce490d20d5 Binary files /dev/null and b/audio_processors/__pycache__/wav2vec2_processor.cpython-312.pyc differ diff --git a/audio_processors/__pycache__/whisper_digit_processor.cpython-312.pyc b/audio_processors/__pycache__/whisper_digit_processor.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..29c6f1ada3829aa0bf68f71c1ce6b2b47c1aeb79 Binary files /dev/null and b/audio_processors/__pycache__/whisper_digit_processor.cpython-312.pyc differ diff --git a/audio_processors/base_processor.py b/audio_processors/base_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0c13380820574af47c3d61124ca6b376990dab --- /dev/null +++ b/audio_processors/base_processor.py @@ -0,0 +1,85 @@ +from abc import ABC, abstractmethod +from typing import Union, Dict, Any +import time +import logging + +logger = logging.getLogger(__name__) + +class AudioProcessor(ABC): + """ + Abstract base class for all audio digit classification processors. + Provides common interface and logging functionality. + """ + + def __init__(self, name: str): + self.name = name + self.total_predictions = 0 + self.total_inference_time = 0.0 + + @abstractmethod + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio data and return predicted digit as string. + + Args: + audio_data: Raw audio bytes + + Returns: + Predicted digit as string ('0'-'9') + """ + pass + + def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]: + """ + Process audio and return prediction with timing information. + + Args: + audio_data: Raw audio bytes + + Returns: + Dictionary with prediction, timing, and method info + """ + start_time = time.time() + + try: + predicted_digit = self.process_audio(audio_data) + inference_time = time.time() - start_time + + self.total_predictions += 1 + self.total_inference_time += inference_time + + result = { + 'predicted_digit': predicted_digit, + 'inference_time': round(inference_time, 3), + 'method': self.name, + 'timestamp': time.time(), + 'average_time': round(self.total_inference_time / self.total_predictions, 3), + 'success': True + } + + logger.info(f"{self.name}: Predicted '{predicted_digit}' in {inference_time:.3f}s") + return result + + except Exception as e: + inference_time = time.time() - start_time + logger.error(f"{self.name}: Error processing audio: {str(e)}") + + return { + 'predicted_digit': 'ERROR', + 'inference_time': round(inference_time, 3), + 'method': self.name, + 'timestamp': time.time(), + 'success': False, + 'error': str(e) + } + + def get_stats(self) -> Dict[str, float]: + """Get processor statistics.""" + if self.total_predictions == 0: + return {'total_predictions': 0, 'average_time': 0.0} + + return { + 'total_predictions': self.total_predictions, + 'total_time': round(self.total_inference_time, 3), + 'average_time': round(self.total_inference_time / self.total_predictions, 3) + } \ No newline at end of file diff --git a/audio_processors/external_api.py b/audio_processors/external_api.py new file mode 100644 index 0000000000000000000000000000000000000000..86a300066ba8427302f44824e7dcb35de502681d --- /dev/null +++ b/audio_processors/external_api.py @@ -0,0 +1,153 @@ +import requests +import os +import re +import logging +from typing import Optional +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class ExternalAPIProcessor(AudioProcessor): + """ + Hugging Face Whisper API integration for digit classification. + Uses openai/whisper-base model for speech-to-text conversion. 
+ """ + + def __init__(self): + super().__init__("External API (Whisper)") + # Try alternative Whisper model that should be available + self.api_url = "https://api-inference.huggingface.co/models/openai/whisper-small" + self.token = os.getenv('HUGGING_FACE_TOKEN') + self.headers = {"Authorization": f"Bearer {self.token}"} if self.token else {} + + if not self.token: + logger.warning("HUGGING_FACE_TOKEN not found in environment variables") + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio using Hugging Face Whisper API. + + Args: + audio_data: Raw audio bytes (WAV format preferred) + + Returns: + Predicted digit as string ('0'-'9') + + Raises: + Exception: If API call fails or no digit found in response + """ + if not self.token: + raise Exception("Hugging Face API token not configured") + + try: + # Make API request + response = requests.post( + self.api_url, + headers=self.headers, + data=audio_data, + timeout=15 # Increased timeout + ) + + if response.status_code == 401: + logger.error("Hugging Face API token is invalid or expired") + raise Exception("Invalid or expired API token - please update HUGGING_FACE_TOKEN") + elif response.status_code == 404: + logger.error(f"Model not found or unavailable: {self.api_url}") + raise Exception("API model unavailable - may be loading or deprecated") + elif response.status_code == 503: + logger.warning("Model is loading, this may take a few moments") + raise Exception("API model is loading - please try again in a moment") + elif response.status_code != 200: + logger.error(f"API request failed: {response.status_code} - {response.text}") + raise Exception(f"API error {response.status_code}: {response.text[:100]}") + + # Parse response + result = response.json() + + if 'text' not in result: + logger.error(f"Unexpected API response format: {result}") + raise Exception("Invalid API response format") + + transcribed_text = result['text'].strip().lower() + logger.debug(f"Whisper transcription: '{transcribed_text}'") + + # Extract digit from transcription + predicted_digit = self._extract_digit(transcribed_text) + + if predicted_digit is None: + logger.warning(f"No digit found in transcription: '{transcribed_text}'") + return "?" + + return predicted_digit + + except requests.exceptions.Timeout: + raise Exception("API request timeout (15s) - service may be slow") + except requests.exceptions.RequestException as e: + raise Exception(f"API request failed: {str(e)}") + except Exception as e: + logger.error(f"Unexpected error in external API processing: {str(e)}") + raise + + def _extract_digit(self, text: str) -> Optional[str]: + """ + Extract digit from transcribed text. + Handles both numerical ('1', '2') and word forms ('one', 'two'). 
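+        For example, 'the number four' maps to '4' and 'too' maps to '2'.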
+ + Args: + text: Transcribed text from Whisper + + Returns: + Digit as string ('0'-'9') or None if not found + """ + # Word to digit mapping + word_to_digit = { + 'zero': '0', 'oh': '0', + 'one': '1', 'won': '1', + 'two': '2', 'to': '2', 'too': '2', + 'three': '3', 'tree': '3', + 'four': '4', 'for': '4', 'fore': '4', + 'five': '5', + 'six': '6', 'sick': '6', + 'seven': '7', + 'eight': '8', 'ate': '8', + 'nine': '9', 'niner': '9' + } + + # First, try to find a direct digit + digit_match = re.search(r'\b([0-9])\b', text) + if digit_match: + return digit_match.group(1) + + # Then try word forms + words = text.split() + for word in words: + clean_word = re.sub(r'[^\w]', '', word.lower()) + if clean_word in word_to_digit: + return word_to_digit[clean_word] + + # Try partial matches for robustness + for word, digit in word_to_digit.items(): + if word in text: + return digit + + return None + + def is_configured(self) -> bool: + """Check if API is properly configured.""" + return bool(self.token) + + def test_connection(self) -> bool: + """Test API connection with a simple request.""" + if not self.is_configured(): + return False + + try: + # Test with minimal audio data + test_response = requests.get( + self.api_url, + headers=self.headers, + timeout=5 + ) + return test_response.status_code == 200 + except: + return False \ No newline at end of file diff --git a/audio_processors/faster_whisper_processor.py b/audio_processors/faster_whisper_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..13b4e33c5270b99d51424a9cffb418b9f7336aa0 --- /dev/null +++ b/audio_processors/faster_whisper_processor.py @@ -0,0 +1,219 @@ +""" +Faster-Whisper processor with built-in VAD (2025 approach) +More reliable than manual WebRTC VAD + Whisper coordination +""" + +import numpy as np +import io +import time +import logging +from typing import Dict, Any, Optional + +try: + from faster_whisper import WhisperModel + FASTER_WHISPER_AVAILABLE = True +except ImportError: + FASTER_WHISPER_AVAILABLE = False + WhisperModel = None + +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class FasterWhisperDigitProcessor(AudioProcessor): + """ + Modern 2025 approach using faster-whisper with built-in VAD. + Much more reliable than manual WebRTC VAD coordination. + """ + + def __init__(self): + """Initialize faster-whisper processor with built-in VAD.""" + super().__init__("Faster-Whisper with VAD") + + if not FASTER_WHISPER_AVAILABLE: + logger.error("faster-whisper not available. 
Install with: pip install faster-whisper") + self.model = None + return + + self.model = None + self.device = "cuda" if self._cuda_available() else "cpu" + + # Digit mapping + self.digit_map = { + "zero": "0", "one": "1", "two": "2", "three": "3", + "four": "4", "five": "5", "six": "6", "seven": "7", + "eight": "8", "nine": "9", + "oh": "0", "o": "0", "for": "4", "fore": "4", + "to": "2", "too": "2", "tu": "2", "tree": "3", + "free": "3", "ate": "8", "ait": "8" + } + + # Statistics + self.total_predictions = 0 + self.successful_predictions = 0 + self.failed_predictions = 0 + + self._initialize_model() + + def _cuda_available(self) -> bool: + """Check if CUDA is available.""" + try: + import torch + return torch.cuda.is_available() + except ImportError: + return False + + def _initialize_model(self): + """Initialize faster-whisper model with VAD.""" + if not FASTER_WHISPER_AVAILABLE: + return + + try: + logger.info("Initializing faster-whisper model with built-in VAD...") + + # Initialize faster-whisper model + self.model = WhisperModel( + "tiny", # Use tiny model for speed + device=self.device, + compute_type="float16" if self.device == "cuda" else "int8" + ) + + logger.info(f"Faster-Whisper model initialized on {self.device}") + + except Exception as e: + logger.error(f"Failed to initialize faster-whisper: {e}") + self.model = None + + def is_configured(self) -> bool: + """Check if processor is configured.""" + return self.model is not None and FASTER_WHISPER_AVAILABLE + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio with built-in VAD and return predicted digit. + + Args: + audio_data: Raw audio bytes + + Returns: + str: Predicted digit (0-9) or error message + """ + if not self.is_configured(): + return "error: Model not configured" + + try: + # Convert audio to numpy array + audio_array = self._convert_audio_bytes(audio_data) + if audio_array is None: + return "error: Audio conversion failed" + + # Use faster-whisper with built-in VAD + segments, info = self.model.transcribe( + audio_array, + language="en", + # Built-in VAD parameters - much better than manual VAD + vad_filter=True, + vad_parameters=dict( + min_silence_duration_ms=100, # 100ms minimum silence + speech_pad_ms=30 # 30ms padding around speech + ) + ) + + # Process transcription results + transcriptions = [] + for segment in segments: + text = segment.text.strip().lower() + if text: + transcriptions.append(text) + + if not transcriptions: + return "error: No speech detected" + + # Combine all segments and extract digit + full_text = " ".join(transcriptions) + digit = self._text_to_digit(full_text) + + logger.debug(f"Faster-Whisper: '{full_text}' -> '{digit}'") + + if digit in "0123456789": + self.successful_predictions += 1 + return digit + else: + self.failed_predictions += 1 + return f"unclear: {full_text}" + + except Exception as e: + logger.error(f"Faster-Whisper processing failed: {e}") + self.failed_predictions += 1 + return f"error: {str(e)}" + finally: + self.total_predictions += 1 + + def _convert_audio_bytes(self, audio_data: bytes) -> Optional[np.ndarray]: + """Convert audio bytes to numpy array for faster-whisper.""" + try: + # Check if it's a WAV file + if audio_data.startswith(b'RIFF'): + import soundfile as sf + audio_buffer = io.BytesIO(audio_data) + audio_array, sample_rate = sf.read(audio_buffer, dtype='float32') + + # Convert stereo to mono if needed + if len(audio_array.shape) > 1: + audio_array = np.mean(audio_array, axis=1) + + return audio_array + else: + # Raw PCM data + 
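+                # Assumes 16-bit little-endian PCM; samples are converted to float32 and scaled to [-1.0, 1.0] by dividing by 32768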
audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) + return audio_array / 32768.0 + + except Exception as e: + logger.error(f"Audio conversion failed: {e}") + return None + + def _text_to_digit(self, text: str) -> str: + """Convert transcribed text to digit.""" + text = text.strip().lower() + + # Remove common words + text = text.replace("the", "").replace("number", "").replace("digit", "") + text = text.strip() + + # Direct mapping + if text in self.digit_map: + return self.digit_map[text] + + # Word-by-word check + for word in text.split(): + if word in self.digit_map: + return self.digit_map[word] + + # Check for digits in text + digits = [char for char in text if char.isdigit()] + if digits: + return digits[0] + + return text + + def get_model_info(self) -> Dict[str, Any]: + """Get model information.""" + return { + 'model_name': 'faster-whisper-tiny', + 'model_type': 'Speech-to-Text with VAD', + 'has_builtin_vad': True, + 'device': self.device, + 'available': FASTER_WHISPER_AVAILABLE + } + + def get_stats(self) -> Dict[str, Any]: + """Get processing statistics.""" + success_rate = self.successful_predictions / max(1, self.total_predictions) + + return { + 'total_predictions': self.total_predictions, + 'successful_predictions': self.successful_predictions, + 'failed_predictions': self.failed_predictions, + 'success_rate': round(success_rate, 3), + 'model_available': self.is_configured() + } \ No newline at end of file diff --git a/audio_processors/local_whisper.py b/audio_processors/local_whisper.py new file mode 100644 index 0000000000000000000000000000000000000000..01751962db3334f2f9b9b494393f427f71ee3c42 --- /dev/null +++ b/audio_processors/local_whisper.py @@ -0,0 +1,158 @@ +import logging +import numpy as np +from typing import Optional +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class LocalWhisperProcessor(AudioProcessor): + """ + Local Whisper model using transformers pipeline. + Fallback when API is unavailable. + """ + + def __init__(self): + super().__init__("Local Whisper (Tiny)") + self.pipeline = None + self.model_name = "openai/whisper-tiny" + self.is_initialized = False + + def _initialize_model(self): + """Lazy initialization of the model""" + if self.is_initialized: + return + + try: + logger.info(f"Loading local Whisper model: {self.model_name}") + + from transformers import pipeline + import torch + + # Use CPU for compatibility, GPU if available + device = "cuda" if torch.cuda.is_available() else "cpu" + + self.pipeline = pipeline( + "automatic-speech-recognition", + model=self.model_name, + device=device, + torch_dtype=torch.float32, # Use float32 to avoid dtype issues + return_timestamps=False # We only need text + ) + + logger.info(f"Local Whisper model loaded on {device}") + self.is_initialized = True + + except ImportError as e: + logger.error("transformers library not installed. Run: pip install transformers torch") + raise Exception("transformers library required for local processing") + except Exception as e: + logger.error(f"Failed to load local Whisper model: {str(e)}") + raise Exception(f"Local model initialization failed: {str(e)}") + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio using local Whisper model. 
+ + Args: + audio_data: Raw audio bytes (WAV format preferred) + + Returns: + Predicted digit as string ('0'-'9') + + Raises: + Exception: If processing fails + """ + try: + # Initialize model on first use + self._initialize_model() + + # Convert audio bytes to numpy array + from utils.audio_utils import audio_to_numpy + audio_array, sample_rate = audio_to_numpy(audio_data) + + # Resample to 16kHz if needed (Whisper expects 16kHz) + if sample_rate != 16000: + logger.debug(f"Resampling from {sample_rate}Hz to 16kHz") + import librosa + audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000) + + # Process with pipeline + logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz") + result = self.pipeline(audio_array) + + if not result or 'text' not in result: + logger.error(f"Unexpected pipeline result: {result}") + raise Exception("Invalid pipeline output") + + transcribed_text = result['text'].strip().lower() + logger.debug(f"Local Whisper transcription: '{transcribed_text}'") + + # Extract digit from transcription + predicted_digit = self._extract_digit(transcribed_text) + + if predicted_digit is None: + logger.warning(f"No digit found in transcription: '{transcribed_text}'") + return "?" + + return predicted_digit + + except Exception as e: + logger.error(f"Local Whisper processing failed: {str(e)}") + raise Exception(f"Local processing error: {str(e)}") + + def _extract_digit(self, text: str) -> Optional[str]: + """ + Extract digit from transcribed text. + Handles both numerical ('1', '2') and word forms ('one', 'two'). + """ + import re + + # Word to digit mapping + word_to_digit = { + 'zero': '0', 'oh': '0', + 'one': '1', 'won': '1', + 'two': '2', 'to': '2', 'too': '2', + 'three': '3', 'tree': '3', + 'four': '4', 'for': '4', 'fore': '4', + 'five': '5', + 'six': '6', 'sick': '6', + 'seven': '7', + 'eight': '8', 'ate': '8', + 'nine': '9', 'niner': '9' + } + + # First, try to find a direct digit + digit_match = re.search(r'\b([0-9])\b', text) + if digit_match: + return digit_match.group(1) + + # Then try word forms + words = text.split() + for word in words: + clean_word = re.sub(r'[^\w]', '', word.lower()) + if clean_word in word_to_digit: + return word_to_digit[clean_word] + + # Try partial matches for robustness + for word, digit in word_to_digit.items(): + if word in text: + return digit + + return None + + def is_configured(self) -> bool: + """Check if local model can be initialized.""" + try: + import transformers + import torch + return True + except ImportError: + return False + + def test_connection(self) -> bool: + """Test local model functionality.""" + try: + self._initialize_model() + return True + except: + return False \ No newline at end of file diff --git a/audio_processors/mel_spectrogram.py b/audio_processors/mel_spectrogram.py new file mode 100644 index 0000000000000000000000000000000000000000..8f147aa3cdf8fef29eb1657ee4b6625712e3eca0 --- /dev/null +++ b/audio_processors/mel_spectrogram.py @@ -0,0 +1,74 @@ +import numpy as np +import logging +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class MelSpectrogramProcessor(AudioProcessor): + """ + Mel Spectrogram processor using mel-scale frequency analysis. + + Future implementation will: + - Apply mel filterbank to frequency domain representation + - Use perceptually-motivated frequency scaling + - Feed mel spectrogram features to deep learning model + + Currently returns placeholder '00' for testing UI functionality. 
+ """ + + def __init__(self): + super().__init__("Mel Spectrogram") + logger.info("Mel Spectrogram processor initialized (PLACEHOLDER MODE)") + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio using mel-scale spectrogram analysis. + + PLACEHOLDER IMPLEMENTATION: + Currently returns '00' for UI testing purposes. + + Future implementation will: + 1. Convert audio bytes to numpy array + 2. Compute STFT of the audio signal + 3. Apply mel filterbank to convert to mel scale + 4. Take logarithm for perceptual scaling + 5. Feed to trained neural network (CNN/RNN) + 6. Return predicted digit + + Args: + audio_data: Raw audio bytes + + Returns: + Predicted digit as string (currently '00') + """ + logger.debug("Processing audio with Mel Spectrogram (placeholder)") + + # Simulate processing time + import time + time.sleep(0.15) + + # TODO: Implement actual mel spectrogram processing: + # 1. audio_array = np.frombuffer(audio_data, dtype=np.float32) + # 2. mel_spec = librosa.feature.melspectrogram( + # y=audio_array, + # sr=sample_rate, + # n_mels=128, + # fmax=8000 + # ) + # 3. mel_db = librosa.power_to_db(mel_spec, ref=np.max) + # 4. prediction = self.neural_model.predict(mel_db) + # 5. return str(np.argmax(prediction)) + + return '00' + + def get_model_info(self) -> dict: + """Get information about the mel spectrogram model.""" + return { + 'method': 'Mel Spectrogram', + 'status': 'PLACEHOLDER', + 'features': 'Mel-scale frequency representation', + 'classifier': 'CNN/RNN (not implemented)', + 'n_mels': 128, + 'fmax': 8000, + 'expected_inference_time': '<500ms' + } \ No newline at end of file diff --git a/audio_processors/mfcc_processor.py b/audio_processors/mfcc_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..4c7165121253b452bc65c107984c3c40973b9b1e --- /dev/null +++ b/audio_processors/mfcc_processor.py @@ -0,0 +1,79 @@ +import numpy as np +import logging +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class MFCCProcessor(AudioProcessor): + """ + MFCC (Mel-Frequency Cepstral Coefficients) processor. + + Future implementation will: + - Extract MFCC features (typically 12-13 coefficients) + - Apply DCT (Discrete Cosine Transform) to mel spectrogram + - Use traditional ML classifier (SVM, Random Forest, etc.) + + Currently returns placeholder '00' for testing UI functionality. + """ + + def __init__(self): + super().__init__("MFCC") + logger.info("MFCC processor initialized (PLACEHOLDER MODE)") + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio using MFCC feature extraction. + + PLACEHOLDER IMPLEMENTATION: + Currently returns '00' for UI testing purposes. + + Future implementation will: + 1. Convert audio bytes to numpy array + 2. Compute mel spectrogram of the audio + 3. Apply DCT to get cepstral coefficients + 4. Extract first 12-13 MFCC coefficients + 5. Optionally add delta and delta-delta features + 6. Feed to trained classifier (SVM/Random Forest) + 7. Return predicted digit + + Args: + audio_data: Raw audio bytes + + Returns: + Predicted digit as string (currently '00') + """ + logger.debug("Processing audio with MFCC (placeholder)") + + # Simulate processing time (MFCC should be fastest) + import time + time.sleep(0.05) + + # TODO: Implement actual MFCC processing: + # 1. audio_array = np.frombuffer(audio_data, dtype=np.float32) + # 2. mfccs = librosa.feature.mfcc( + # y=audio_array, + # sr=sample_rate, + # n_mfcc=13, + # n_fft=2048, + # hop_length=512 + # ) + # 3. 
# Optionally add delta features + # 4. delta_mfccs = librosa.feature.delta(mfccs) + # 5. features = np.concatenate([mfccs, delta_mfccs], axis=0) + # 6. prediction = self.svm_model.predict(features.T.flatten().reshape(1, -1)) + # 7. return str(prediction[0]) + + return '00' + + def get_model_info(self) -> dict: + """Get information about the MFCC model.""" + return { + 'method': 'MFCC (Mel-Frequency Cepstral Coefficients)', + 'status': 'PLACEHOLDER', + 'features': 'Cepstral coefficients with delta features', + 'classifier': 'SVM/Random Forest (not implemented)', + 'n_mfcc': 13, + 'n_fft': 2048, + 'hop_length': 512, + 'expected_inference_time': '<100ms' + } \ No newline at end of file diff --git a/audio_processors/ml_mel_cnn_processor.py b/audio_processors/ml_mel_cnn_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..629287261ed3f257dbfc911328f90c6bbdb4cdb8 --- /dev/null +++ b/audio_processors/ml_mel_cnn_processor.py @@ -0,0 +1,307 @@ +""" +ML Mel CNN Digit Processor +Uses the trained Mel Spectrogram + 2D CNN model for digit classification +""" + +import os +import sys +import time +import logging +from pathlib import Path +from typing import Dict, Any, Optional, Union + +import numpy as np +from .base_processor import AudioProcessor + +# Add project root to path for ML imports +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.append(str(PROJECT_ROOT)) + +# Import ML inference +from ml_training.inference import load_classifier + +logger = logging.getLogger(__name__) + +class MLMelCNNProcessor(AudioProcessor): + """ + ML-based Mel CNN digit processor using trained 2D CNN model. + + Performance characteristics (based on training results): + - Test accuracy: 97.22% + - Inference time: ~3-5ms + - Model size: ~2.6MB + """ + + name = "ML Mel CNN (2D Conv)" + + def __init__(self, model_dir: str = "models", device: str = "auto"): + """ + Initialize ML Mel CNN processor. + + Args: + model_dir: Directory containing trained models + device: Device to run inference on ('cpu', 'cuda', or 'auto') + """ + super().__init__(self.name) + + self.model_dir = Path(model_dir) + self.device = device if device != "auto" else None + self.classifier = None + self._configured = False + + # Performance tracking + self.prediction_count = 0 + self.total_inference_time = 0.0 + self.last_prediction_time = None + + # Try to load the model + self._initialize_classifier() + + logger.info(f"ML Mel CNN Processor initialized (configured: {self._configured})") + + def _initialize_classifier(self): + """Initialize the ML classifier.""" + try: + # Check if model directory exists + if not self.model_dir.exists(): + logger.warning(f"Model directory not found: {self.model_dir}") + return + + # Load the Mel CNN classifier + self.classifier = load_classifier( + model_dir=str(self.model_dir), + pipeline_type='mel_cnn', + device=self.device + ) + + self._configured = True + logger.info("ML Mel CNN classifier loaded successfully") + logger.info(f" Model device: {self.classifier.device}") + logger.info(f" Parameters: {sum(p.numel() for p in self.classifier.model.parameters()):,}") + + except Exception as e: + logger.error(f"Failed to load ML Mel CNN classifier: {str(e)}") + self.classifier = None + self._configured = False + + def is_configured(self) -> bool: + """Check if the processor is properly configured.""" + return self._configured and self.classifier is not None + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio and return predicted digit (required by base class). 
+ + Args: + audio_data: Raw audio data in bytes + + Returns: + predicted_digit: Predicted digit as string + """ + return self.predict(audio_data) + + def predict(self, audio_data: bytes) -> str: + """ + Predict digit from audio data. + + Args: + audio_data: Raw audio data in bytes + + Returns: + predicted_digit: Predicted digit as string + """ + if not self.is_configured(): + raise RuntimeError("ML Mel CNN processor not properly configured") + + try: + # Convert audio with optimized format for ML models + from utils.audio_utils import convert_for_ml_models + optimized_audio = convert_for_ml_models(audio_data, 'mel_cnn') + + # Convert audio bytes to numpy array + audio_array = self._bytes_to_audio_array(optimized_audio) + + # Make prediction using ML classifier + start_time = time.time() + result = self.classifier.predict( + audio_array, + return_probabilities=True, + return_features=False + ) + inference_time = time.time() - start_time + + # Update performance tracking + self.prediction_count += 1 + self.total_inference_time += inference_time + self.last_prediction_time = inference_time + + predicted_digit = str(result['predicted_digit']) + confidence = result['confidence'] + + logger.debug(f"ML Mel CNN prediction: '{predicted_digit}' " + f"(confidence: {confidence:.3f}, time: {inference_time*1000:.1f}ms)") + + return predicted_digit + + except Exception as e: + logger.error(f"ML Mel CNN prediction failed: {str(e)}") + raise + + def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]: + """ + Predict digit with detailed timing and confidence information. + + Args: + audio_data: Raw audio data in bytes + + Returns: + result: Detailed prediction results + """ + if not self.is_configured(): + return { + 'success': False, + 'error': 'ML Mel CNN processor not properly configured', + 'predicted_digit': None, + 'inference_time': 0.0 + } + + try: + # Convert audio with optimized format for ML models + from utils.audio_utils import convert_for_ml_models + optimized_audio = convert_for_ml_models(audio_data, 'mel_cnn') + + # Convert audio bytes to numpy array + audio_array = self._bytes_to_audio_array(optimized_audio) + + # Make prediction using ML classifier + start_time = time.time() + ml_result = self.classifier.predict( + audio_array, + return_probabilities=True, + return_features=False + ) + inference_time = time.time() - start_time + + # Update performance tracking + self.prediction_count += 1 + self.total_inference_time += inference_time + self.last_prediction_time = inference_time + + # Format result + result = { + 'success': True, + 'predicted_digit': str(ml_result['predicted_digit']), + 'confidence': ml_result['confidence'], + 'inference_time': inference_time, + 'class_probabilities': { + str(k): float(v) for k, v in ml_result['class_probabilities'].items() + }, + 'top_3_predictions': [ + { + 'digit': str(pred['digit']), + 'probability': pred['probability'] + } + for pred in ml_result['top_3_predictions'] + ], + 'method': self.name, + 'model_type': 'ml_mel_cnn', + 'timestamp': time.time() + } + + logger.debug(f"ML Mel CNN detailed prediction: '{result['predicted_digit']}' " + f"(confidence: {result['confidence']:.3f}, " + f"time: {inference_time*1000:.1f}ms)") + + return result + + except Exception as e: + logger.error(f"ML Mel CNN prediction with timing failed: {str(e)}") + return { + 'success': False, + 'error': str(e), + 'predicted_digit': None, + 'inference_time': 0.0, + 'method': self.name, + 'model_type': 'ml_mel_cnn', + 'timestamp': time.time() + } + + def 
_bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray: + """Convert audio bytes to numpy array.""" + try: + # Try to interpret as int16 PCM first (most common) + audio_array = np.frombuffer(audio_data, dtype=np.int16) + + # Convert to float32 and normalize + audio_array = audio_array.astype(np.float32) / 32768.0 + + # If the array is too short, pad it + if len(audio_array) < 1000: # Less than ~60ms at 16kHz + # Pad with zeros to minimum length + audio_array = np.pad(audio_array, (0, 1000 - len(audio_array))) + + return audio_array + + except Exception as e: + logger.error(f"Failed to convert audio bytes to array: {str(e)}") + # Return a small zero array as fallback + return np.zeros(1000, dtype=np.float32) + + def get_stats(self) -> Dict[str, Any]: + """Get processor performance statistics.""" + stats = super().get_stats() + + if self.prediction_count > 0: + stats.update({ + 'ml_predictions': self.prediction_count, + 'average_inference_time': self.total_inference_time / self.prediction_count, + 'last_inference_time': self.last_prediction_time, + 'throughput_per_second': self.prediction_count / self.total_inference_time if self.total_inference_time > 0 else 0, + 'model_configured': self.is_configured() + }) + + if self.classifier: + # Get ML classifier performance stats + ml_stats = self.classifier.get_performance_stats() + stats['ml_classifier_stats'] = ml_stats + + return stats + + def get_model_info(self) -> Dict[str, Any]: + """Get information about the loaded model.""" + if not self.is_configured(): + return {'error': 'Model not loaded'} + + try: + info = { + 'pipeline_type': 'mel_cnn', + 'model_class': self.classifier.model.__class__.__name__, + 'device': str(self.classifier.device), + 'parameters': sum(p.numel() for p in self.classifier.model.parameters()), + 'feature_extractor': self.classifier.feature_extractor.__class__.__name__, + 'has_scaler': self.classifier.scaler is not None, + 'expected_sample_rate': 8000, + 'expected_audio_length': 8000, # 1 second at 8kHz + 'input_shape': '(1, 64, 51)', # Mel spectrogram shape + 'model_architecture': '2D CNN' + } + + if hasattr(self.classifier, 'model_path'): + info['model_path'] = str(self.classifier.model_path) + + return info + + except Exception as e: + logger.error(f"Failed to get model info: {str(e)}") + return {'error': str(e)} + + def benchmark_speed(self, num_samples: int = 100) -> Dict[str, Any]: + """Benchmark inference speed.""" + if not self.is_configured(): + return {'error': 'Model not configured'} + + try: + return self.classifier.benchmark_speed(num_samples) + except Exception as e: + logger.error(f"Benchmark failed: {str(e)}") + return {'error': str(e)} \ No newline at end of file diff --git a/audio_processors/ml_mfcc_processor.py b/audio_processors/ml_mfcc_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..606c5d1bd47d5365b54ddf07fce16ab6e6e25518 --- /dev/null +++ b/audio_processors/ml_mfcc_processor.py @@ -0,0 +1,370 @@ +""" +ML MFCC Digit Processor +Uses the trained MFCC + Dense NN model for digit classification +""" + +import os +import sys +import time +import logging +from pathlib import Path +from typing import Dict, Any, Optional, Union + +import numpy as np +from .base_processor import AudioProcessor + +# Add project root to path for ML imports +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.append(str(PROJECT_ROOT)) + +# Import ML inference +from ml_training.inference import load_classifier + +logger = logging.getLogger(__name__) + +class 
MLMFCCProcessor(AudioProcessor): + """ + ML-based MFCC digit processor using trained Dense NN model. + + Performance characteristics (based on training results): + - Test accuracy: 98.52% + - Inference time: ~1-2ms + - Model size: ~0.3MB + """ + + name = "ML MFCC + Dense NN (Best)" + + def __init__(self, model_dir: str = "models", device: str = "auto"): + """ + Initialize ML MFCC processor. + + Args: + model_dir: Directory containing trained models + device: Device to run inference on ('cpu', 'cuda', or 'auto') + """ + super().__init__(self.name) + + self.model_dir = Path(model_dir) + self.device = device if device != "auto" else None + self.classifier = None + self._configured = False + + # Performance tracking + self.prediction_count = 0 + self.total_inference_time = 0.0 + self.last_prediction_time = None + + # Try to load the model + self._initialize_classifier() + + logger.info(f"ML MFCC Processor initialized (configured: {self._configured})") + + def _initialize_classifier(self): + """Initialize the ML classifier.""" + try: + # Check if model directory exists + if not self.model_dir.exists(): + logger.warning(f"Model directory not found: {self.model_dir}") + return + + # Load the MFCC classifier + self.classifier = load_classifier( + model_dir=str(self.model_dir), + pipeline_type='mfcc', + device=self.device + ) + + self._configured = True + logger.info("ML MFCC classifier loaded successfully") + logger.info(f" Model device: {self.classifier.device}") + logger.info(f" Parameters: {sum(p.numel() for p in self.classifier.model.parameters()):,}") + + except Exception as e: + logger.error(f"Failed to load ML MFCC classifier: {str(e)}") + self.classifier = None + self._configured = False + + def is_configured(self) -> bool: + """Check if the processor is properly configured.""" + return self._configured and self.classifier is not None + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio and return predicted digit (required by base class). + + Args: + audio_data: Raw audio data in bytes + + Returns: + predicted_digit: Predicted digit as string + """ + return self.predict(audio_data) + + def predict(self, audio_data: bytes) -> str: + """ + Predict digit from audio data. 
+ + Args: + audio_data: Raw audio data in bytes + + Returns: + predicted_digit: Predicted digit as string + """ + if not self.is_configured(): + raise RuntimeError("ML MFCC processor not properly configured") + + try: + # Convert audio with optimized format for ML models + from utils.audio_utils import convert_for_ml_models + optimized_audio = convert_for_ml_models(audio_data, 'mfcc') + + # Convert audio bytes to numpy array + audio_array = self._bytes_to_audio_array(optimized_audio) + + # No audio preprocessing needed - normalization happens at feature level in ML pipeline + + # Make prediction using ML classifier + start_time = time.time() + result = self.classifier.predict( + audio_array, + return_probabilities=True, + return_features=False + ) + inference_time = time.time() - start_time + + # Update performance tracking + self.prediction_count += 1 + self.total_inference_time += inference_time + self.last_prediction_time = inference_time + + predicted_digit = str(result['predicted_digit']) + confidence = result['confidence'] + + # Debug logging for predictions (temporary) + if hasattr(result, 'probabilities') or 'probabilities' in result: + probs = result.get('probabilities', []) + if len(probs) >= 10: + top_predictions = [(i, p) for i, p in enumerate(probs)] + top_predictions.sort(key=lambda x: x[1], reverse=True) + logger.debug(f"MFCC Top 3 predictions: {[(str(d), f'{p:.3f}') for d, p in top_predictions[:3]]}") + + logger.debug(f"MFCC predicted '{predicted_digit}' with confidence {confidence:.3f} in {inference_time:.3f}s") + + logger.debug(f"ML MFCC prediction: '{predicted_digit}' " + f"(confidence: {confidence:.3f}, time: {inference_time*1000:.1f}ms)") + + return predicted_digit + + except Exception as e: + logger.error(f"ML MFCC prediction failed: {str(e)}") + raise + + def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]: + """ + Predict digit with detailed timing and confidence information. 
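+        On success, the returned dict includes 'success', 'predicted_digit', 'confidence',
+        'inference_time', 'class_probabilities', 'top_3_predictions', 'method', 'model_type'
+        and 'timestamp'; on failure it contains 'success': False and an 'error' message.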
+ + Args: + audio_data: Raw audio data in bytes + + Returns: + result: Detailed prediction results + """ + if not self.is_configured(): + return { + 'success': False, + 'error': 'ML MFCC processor not properly configured', + 'predicted_digit': None, + 'inference_time': 0.0 + } + + try: + # Convert audio with optimized format for ML models + from utils.audio_utils import convert_for_ml_models + optimized_audio = convert_for_ml_models(audio_data, 'mfcc') + + # Convert audio bytes to numpy array + audio_array = self._bytes_to_audio_array(optimized_audio) + + # No audio preprocessing needed - normalization happens at feature level in ML pipeline + + # Make prediction using ML classifier + start_time = time.time() + ml_result = self.classifier.predict( + audio_array, + return_probabilities=True, + return_features=False + ) + inference_time = time.time() - start_time + + # Update performance tracking + self.prediction_count += 1 + self.total_inference_time += inference_time + self.last_prediction_time = inference_time + + # Format result + result = { + 'success': True, + 'predicted_digit': str(ml_result['predicted_digit']), + 'confidence': ml_result['confidence'], + 'inference_time': inference_time, + 'class_probabilities': { + str(k): float(v) for k, v in ml_result['class_probabilities'].items() + }, + 'top_3_predictions': [ + { + 'digit': str(pred['digit']), + 'probability': pred['probability'] + } + for pred in ml_result['top_3_predictions'] + ], + 'method': self.name, + 'model_type': 'ml_mfcc', + 'timestamp': time.time() + } + + logger.debug(f"ML MFCC detailed prediction: '{result['predicted_digit']}' " + f"(confidence: {result['confidence']:.3f}, " + f"time: {inference_time*1000:.1f}ms)") + + return result + + except Exception as e: + logger.error(f"ML MFCC prediction with timing failed: {str(e)}") + return { + 'success': False, + 'error': str(e), + 'predicted_digit': None, + 'inference_time': 0.0, + 'method': self.name, + 'model_type': 'ml_mfcc', + 'timestamp': time.time() + } + + def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray: + """Convert audio bytes to numpy array.""" + try: + # Try to interpret as int16 PCM first (most common) + audio_array = np.frombuffer(audio_data, dtype=np.int16) + + # Convert to float32 and normalize + audio_array = audio_array.astype(np.float32) / 32768.0 + + # If the array is too short, pad it + if len(audio_array) < 1000: # Less than ~60ms at 16kHz + # Pad with zeros to minimum length + audio_array = np.pad(audio_array, (0, 1000 - len(audio_array))) + + return audio_array + + except Exception as e: + logger.error(f"Failed to convert audio bytes to array: {str(e)}") + # Return a small zero array as fallback + return np.zeros(1000, dtype=np.float32) + + def _preprocess_audio_for_mfcc(self, audio_array: np.ndarray) -> np.ndarray: + """ + Apply MFCC-specific audio preprocessing to improve model performance. + This compensates for missing scaler normalization. 
+ + Args: + audio_array: Raw audio array + + Returns: + preprocessed_audio: Audio array optimized for MFCC feature extraction + """ + try: + # Remove DC component + audio_array = audio_array - np.mean(audio_array) + + # Apply gentle normalization to handle volume variations + # This helps compensate for the missing feature scaler + max_val = np.max(np.abs(audio_array)) + if max_val > 0: + audio_array = audio_array / max_val * 0.7 # Scale to 70% of max to avoid clipping + + # Apply a gentle high-pass filter to remove low-frequency noise + # This improves MFCC feature quality + from scipy import signal + if len(audio_array) > 100: # Only apply if we have enough samples + # Simple high-pass filter at ~300Hz for 8kHz sample rate + sos = signal.butter(2, 300, btype='high', fs=8000, output='sos') + audio_array = signal.sosfilt(sos, audio_array) + + # Ensure we don't have any NaN or inf values + audio_array = np.nan_to_num(audio_array, nan=0.0, posinf=0.0, neginf=0.0) + + logger.debug(f"MFCC preprocessing applied: range=[{np.min(audio_array):.3f}, {np.max(audio_array):.3f}], " + f"mean={np.mean(audio_array):.3f}, std={np.std(audio_array):.3f}") + + return audio_array + + except ImportError: + # Fallback if scipy is not available - just normalize + logger.warning("Scipy not available, using basic normalization") + audio_array = audio_array - np.mean(audio_array) + max_val = np.max(np.abs(audio_array)) + if max_val > 0: + audio_array = audio_array / max_val * 0.7 + return audio_array + + except Exception as e: + logger.error(f"MFCC preprocessing failed: {str(e)}") + # Return original array if preprocessing fails + return audio_array + + def get_stats(self) -> Dict[str, Any]: + """Get processor performance statistics.""" + stats = super().get_stats() + + if self.prediction_count > 0: + stats.update({ + 'ml_predictions': self.prediction_count, + 'average_inference_time': self.total_inference_time / self.prediction_count, + 'last_inference_time': self.last_prediction_time, + 'throughput_per_second': self.prediction_count / self.total_inference_time if self.total_inference_time > 0 else 0, + 'model_configured': self.is_configured() + }) + + if self.classifier: + # Get ML classifier performance stats + ml_stats = self.classifier.get_performance_stats() + stats['ml_classifier_stats'] = ml_stats + + return stats + + def get_model_info(self) -> Dict[str, Any]: + """Get information about the loaded model.""" + if not self.is_configured(): + return {'error': 'Model not loaded'} + + try: + info = { + 'pipeline_type': 'mfcc', + 'model_class': self.classifier.model.__class__.__name__, + 'device': str(self.classifier.device), + 'parameters': sum(p.numel() for p in self.classifier.model.parameters()), + 'feature_extractor': self.classifier.feature_extractor.__class__.__name__, + 'has_scaler': self.classifier.scaler is not None, + 'expected_sample_rate': 8000, + 'expected_audio_length': 8000 # 1 second at 8kHz + } + + if hasattr(self.classifier, 'model_path'): + info['model_path'] = str(self.classifier.model_path) + + return info + + except Exception as e: + logger.error(f"Failed to get model info: {str(e)}") + return {'error': str(e)} + + def benchmark_speed(self, num_samples: int = 100) -> Dict[str, Any]: + """Benchmark inference speed.""" + if not self.is_configured(): + return {'error': 'Model not configured'} + + try: + return self.classifier.benchmark_speed(num_samples) + except Exception as e: + logger.error(f"Benchmark failed: {str(e)}") + return {'error': str(e)} \ No newline at end of file diff --git 
a/audio_processors/ml_raw_cnn_processor.py b/audio_processors/ml_raw_cnn_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..a88a412d6f8441a4b542fe3501513c515cdd6d5c --- /dev/null +++ b/audio_processors/ml_raw_cnn_processor.py @@ -0,0 +1,307 @@ +""" +ML Raw CNN Digit Processor +Uses the trained Raw Waveform + 1D CNN model for digit classification +""" + +import os +import sys +import time +import logging +from pathlib import Path +from typing import Dict, Any, Optional, Union + +import numpy as np +from .base_processor import AudioProcessor + +# Add project root to path for ML imports +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.append(str(PROJECT_ROOT)) + +# Import ML inference +from ml_training.inference import load_classifier + +logger = logging.getLogger(__name__) + +class MLRawCNNProcessor(AudioProcessor): + """ + ML-based Raw CNN digit processor using trained 1D CNN model. + + Performance characteristics (based on training results): + - Test accuracy: 91.30% + - Inference time: ~5-8ms + - Model size: ~2.6MB + """ + + name = "ML Raw CNN (1D Conv)" + + def __init__(self, model_dir: str = "models", device: str = "auto"): + """ + Initialize ML Raw CNN processor. + + Args: + model_dir: Directory containing trained models + device: Device to run inference on ('cpu', 'cuda', or 'auto') + """ + super().__init__(self.name) + + self.model_dir = Path(model_dir) + self.device = device if device != "auto" else None + self.classifier = None + self._configured = False + + # Performance tracking + self.prediction_count = 0 + self.total_inference_time = 0.0 + self.last_prediction_time = None + + # Try to load the model + self._initialize_classifier() + + logger.info(f"ML Raw CNN Processor initialized (configured: {self._configured})") + + def _initialize_classifier(self): + """Initialize the ML classifier.""" + try: + # Check if model directory exists + if not self.model_dir.exists(): + logger.warning(f"Model directory not found: {self.model_dir}") + return + + # Load the Raw CNN classifier + self.classifier = load_classifier( + model_dir=str(self.model_dir), + pipeline_type='raw_cnn', + device=self.device + ) + + self._configured = True + logger.info("ML Raw CNN classifier loaded successfully") + logger.info(f" Model device: {self.classifier.device}") + logger.info(f" Parameters: {sum(p.numel() for p in self.classifier.model.parameters()):,}") + + except Exception as e: + logger.error(f"Failed to load ML Raw CNN classifier: {str(e)}") + self.classifier = None + self._configured = False + + def is_configured(self) -> bool: + """Check if the processor is properly configured.""" + return self._configured and self.classifier is not None + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio and return predicted digit (required by base class). + + Args: + audio_data: Raw audio data in bytes + + Returns: + predicted_digit: Predicted digit as string + """ + return self.predict(audio_data) + + def predict(self, audio_data: bytes) -> str: + """ + Predict digit from audio data. 
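# Editorial illustration (not part of the diff): standalone use of this processor,
# assuming the weights shipped in this PR under models/raw_cnn_classifier/ are present
# and 'clip.wav' is a hypothetical 1 s, 8 kHz recording of a spoken digit.
from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor
proc = MLRawCNNProcessor(model_dir="models", device="cpu")
if proc.is_configured():
    with open("clip.wav", "rb") as f:
        print(proc.predict(f.read()))  # -> e.g. '3'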
+ + Args: + audio_data: Raw audio data in bytes + + Returns: + predicted_digit: Predicted digit as string + """ + if not self.is_configured(): + raise RuntimeError("ML Raw CNN processor not properly configured") + + try: + # Convert audio with optimized format for ML models + from utils.audio_utils import convert_for_ml_models + optimized_audio = convert_for_ml_models(audio_data, 'raw_cnn') + + # Convert audio bytes to numpy array + audio_array = self._bytes_to_audio_array(optimized_audio) + + # Make prediction using ML classifier + start_time = time.time() + result = self.classifier.predict( + audio_array, + return_probabilities=True, + return_features=False + ) + inference_time = time.time() - start_time + + # Update performance tracking + self.prediction_count += 1 + self.total_inference_time += inference_time + self.last_prediction_time = inference_time + + predicted_digit = str(result['predicted_digit']) + confidence = result['confidence'] + + logger.debug(f"ML Raw CNN prediction: '{predicted_digit}' " + f"(confidence: {confidence:.3f}, time: {inference_time*1000:.1f}ms)") + + return predicted_digit + + except Exception as e: + logger.error(f"ML Raw CNN prediction failed: {str(e)}") + raise + + def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]: + """ + Predict digit with detailed timing and confidence information. + + Args: + audio_data: Raw audio data in bytes + + Returns: + result: Detailed prediction results + """ + if not self.is_configured(): + return { + 'success': False, + 'error': 'ML Raw CNN processor not properly configured', + 'predicted_digit': None, + 'inference_time': 0.0 + } + + try: + # Convert audio with optimized format for ML models + from utils.audio_utils import convert_for_ml_models + optimized_audio = convert_for_ml_models(audio_data, 'raw_cnn') + + # Convert audio bytes to numpy array + audio_array = self._bytes_to_audio_array(optimized_audio) + + # Make prediction using ML classifier + start_time = time.time() + ml_result = self.classifier.predict( + audio_array, + return_probabilities=True, + return_features=False + ) + inference_time = time.time() - start_time + + # Update performance tracking + self.prediction_count += 1 + self.total_inference_time += inference_time + self.last_prediction_time = inference_time + + # Format result + result = { + 'success': True, + 'predicted_digit': str(ml_result['predicted_digit']), + 'confidence': ml_result['confidence'], + 'inference_time': inference_time, + 'class_probabilities': { + str(k): float(v) for k, v in ml_result['class_probabilities'].items() + }, + 'top_3_predictions': [ + { + 'digit': str(pred['digit']), + 'probability': pred['probability'] + } + for pred in ml_result['top_3_predictions'] + ], + 'method': self.name, + 'model_type': 'ml_raw_cnn', + 'timestamp': time.time() + } + + logger.debug(f"ML Raw CNN detailed prediction: '{result['predicted_digit']}' " + f"(confidence: {result['confidence']:.3f}, " + f"time: {inference_time*1000:.1f}ms)") + + return result + + except Exception as e: + logger.error(f"ML Raw CNN prediction with timing failed: {str(e)}") + return { + 'success': False, + 'error': str(e), + 'predicted_digit': None, + 'inference_time': 0.0, + 'method': self.name, + 'model_type': 'ml_raw_cnn', + 'timestamp': time.time() + } + + def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray: + """Convert audio bytes to numpy array.""" + try: + # Try to interpret as int16 PCM first (most common) + audio_array = np.frombuffer(audio_data, dtype=np.int16) + + # Convert to float32 
and normalize + audio_array = audio_array.astype(np.float32) / 32768.0 + + # If the array is too short, pad it + if len(audio_array) < 1000: # Less than ~60ms at 16kHz + # Pad with zeros to minimum length + audio_array = np.pad(audio_array, (0, 1000 - len(audio_array))) + + return audio_array + + except Exception as e: + logger.error(f"Failed to convert audio bytes to array: {str(e)}") + # Return a small zero array as fallback + return np.zeros(1000, dtype=np.float32) + + def get_stats(self) -> Dict[str, Any]: + """Get processor performance statistics.""" + stats = super().get_stats() + + if self.prediction_count > 0: + stats.update({ + 'ml_predictions': self.prediction_count, + 'average_inference_time': self.total_inference_time / self.prediction_count, + 'last_inference_time': self.last_prediction_time, + 'throughput_per_second': self.prediction_count / self.total_inference_time if self.total_inference_time > 0 else 0, + 'model_configured': self.is_configured() + }) + + if self.classifier: + # Get ML classifier performance stats + ml_stats = self.classifier.get_performance_stats() + stats['ml_classifier_stats'] = ml_stats + + return stats + + def get_model_info(self) -> Dict[str, Any]: + """Get information about the loaded model.""" + if not self.is_configured(): + return {'error': 'Model not loaded'} + + try: + info = { + 'pipeline_type': 'raw_cnn', + 'model_class': self.classifier.model.__class__.__name__, + 'device': str(self.classifier.device), + 'parameters': sum(p.numel() for p in self.classifier.model.parameters()), + 'feature_extractor': None, # Raw waveforms don't need feature extraction + 'has_scaler': False, + 'expected_sample_rate': 8000, + 'expected_audio_length': 8000, # 1 second at 8kHz + 'input_shape': '(1, 1, 8000)', # Raw waveform shape + 'model_architecture': '1D CNN' + } + + if hasattr(self.classifier, 'model_path'): + info['model_path'] = str(self.classifier.model_path) + + return info + + except Exception as e: + logger.error(f"Failed to get model info: {str(e)}") + return {'error': str(e)} + + def benchmark_speed(self, num_samples: int = 100) -> Dict[str, Any]: + """Benchmark inference speed.""" + if not self.is_configured(): + return {'error': 'Model not configured'} + + try: + return self.classifier.benchmark_speed(num_samples) + except Exception as e: + logger.error(f"Benchmark failed: {str(e)}") + return {'error': str(e)} \ No newline at end of file diff --git a/audio_processors/raw_spectrogram.py b/audio_processors/raw_spectrogram.py new file mode 100644 index 0000000000000000000000000000000000000000..64ad9d4316aff96f92fd2d808251d5c0a9178cbb --- /dev/null +++ b/audio_processors/raw_spectrogram.py @@ -0,0 +1,69 @@ +import numpy as np +import logging +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class RawSpectrogramProcessor(AudioProcessor): + """ + Raw Spectrogram processor using STFT (Short-Time Fourier Transform). + + Future implementation will: + - Apply STFT to audio data for time-frequency representation + - Use CNN classifier trained on spectrogram images + - Process raw frequency domain features without mel scaling + + Currently returns placeholder '00' for testing UI functionality. + """ + + def __init__(self): + super().__init__("Raw Spectrogram") + logger.info("Raw Spectrogram processor initialized (PLACEHOLDER MODE)") + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio using raw spectrogram analysis. + + PLACEHOLDER IMPLEMENTATION: + Currently returns '00' for UI testing purposes. 
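# Editorial illustration (not part of the diff): a minimal sketch of the STFT step
# described above, assuming librosa is available; n_fft and hop_length are illustrative.
import numpy as np
import librosa
audio = np.random.randn(8000).astype(np.float32)                # 1 s of noise at 8 kHz
spec = np.abs(librosa.stft(audio, n_fft=2048, hop_length=512))  # magnitude spectrogram
spec_db = librosa.amplitude_to_db(spec, ref=np.max)             # log scale, CNN-friendly
print(spec_db.shape)                                            # (1025, 16): freq bins x frames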
+ + Future implementation will: + 1. Convert audio bytes to numpy array + 2. Apply STFT with appropriate window size and overlap + 3. Create time-frequency representation + 4. Normalize spectrogram values + 5. Feed to trained CNN model + 6. Return predicted digit + + Args: + audio_data: Raw audio bytes + + Returns: + Predicted digit as string (currently '00') + """ + logger.debug("Processing audio with Raw Spectrogram (placeholder)") + + # Simulate processing time + import time + time.sleep(0.1) + + # TODO: Implement actual STFT-based processing: + # 1. audio_array = np.frombuffer(audio_data, dtype=np.float32) + # 2. stft_result = np.abs(librosa.stft(audio_array, n_fft=2048, hop_length=512)) + # 3. spectrogram = librosa.amplitude_to_db(stft_result, ref=np.max) + # 4. prediction = self.cnn_model.predict(spectrogram) + # 5. return str(np.argmax(prediction)) + + return '00' + + def get_model_info(self) -> dict: + """Get information about the raw spectrogram model.""" + return { + 'method': 'Raw Spectrogram (STFT)', + 'status': 'PLACEHOLDER', + 'features': 'Time-frequency representation', + 'classifier': 'CNN (not implemented)', + 'window_size': 2048, + 'hop_length': 512, + 'expected_inference_time': '<1s' + } \ No newline at end of file diff --git a/audio_processors/wav2vec2_processor.py b/audio_processors/wav2vec2_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..486b990827f812e2f24c679b12d24174f902deff --- /dev/null +++ b/audio_processors/wav2vec2_processor.py @@ -0,0 +1,170 @@ +import logging +import numpy as np +from typing import Optional +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class Wav2Vec2Processor(AudioProcessor): + """ + Wav2Vec2 model processor for speech recognition. + Lightweight alternative to Whisper. + """ + + def __init__(self): + super().__init__("Wav2Vec2 (Facebook)") + self.processor = None + self.model = None + self.model_name = "facebook/wav2vec2-base-960h" + self.is_initialized = False + + def _initialize_model(self): + """Lazy initialization of the model""" + if self.is_initialized: + return + + try: + logger.info(f"Loading Wav2Vec2 model: {self.model_name}") + + from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC + import torch + + # Load processor and model + self.processor = Wav2Vec2Processor.from_pretrained(self.model_name) + self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name) + + # Move to GPU if available + device = "cuda" if torch.cuda.is_available() else "cpu" + self.model = self.model.to(device) + self.device = device + + logger.info(f"Wav2Vec2 model loaded on {device}") + self.is_initialized = True + + except ImportError as e: + logger.error("transformers library not installed. Run: pip install transformers torch") + raise Exception("transformers library required for Wav2Vec2 processing") + except Exception as e: + logger.error(f"Failed to load Wav2Vec2 model: {str(e)}") + raise Exception(f"Wav2Vec2 model initialization failed: {str(e)}") + + def process_audio(self, audio_data: bytes) -> str: + """ + Process audio using Wav2Vec2 model. 
+ + Args: + audio_data: Raw audio bytes (WAV format preferred) + + Returns: + Predicted digit as string ('0'-'9') + + Raises: + Exception: If processing fails + """ + try: + # Initialize model on first use + self._initialize_model() + + # Convert audio bytes to numpy array + from utils.audio_utils import audio_to_numpy + audio_array, sample_rate = audio_to_numpy(audio_data) + + # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz) + if sample_rate != 16000: + logger.debug(f"Resampling from {sample_rate}Hz to 16kHz") + import librosa + audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000) + + logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz") + + # Process with Wav2Vec2 + import torch + + # Tokenize audio + input_values = self.processor( + audio_array, + return_tensors="pt", + padding="longest", + sampling_rate=16000 + ).input_values.to(self.device) + + # Get logits + with torch.no_grad(): + logits = self.model(input_values).logits + + # Get predicted tokens + predicted_ids = torch.argmax(logits, dim=-1) + + # Decode transcription + transcription = self.processor.batch_decode(predicted_ids)[0].lower().strip() + logger.debug(f"Wav2Vec2 transcription: '{transcription}'") + + # Extract digit from transcription + predicted_digit = self._extract_digit(transcription) + + if predicted_digit is None: + logger.warning(f"No digit found in transcription: '{transcription}'") + return "?" + + return predicted_digit + + except Exception as e: + logger.error(f"Wav2Vec2 processing failed: {str(e)}") + raise Exception(f"Wav2Vec2 processing error: {str(e)}") + + def _extract_digit(self, text: str) -> Optional[str]: + """ + Extract digit from transcribed text. + Handles both numerical ('1', '2') and word forms ('one', 'two'). 
+ """ + import re + + # Word to digit mapping + word_to_digit = { + 'zero': '0', 'oh': '0', + 'one': '1', 'won': '1', + 'two': '2', 'to': '2', 'too': '2', + 'three': '3', 'tree': '3', + 'four': '4', 'for': '4', 'fore': '4', 'full': '4', # "full" often misheard as "four" + 'five': '5', + 'six': '6', 'sick': '6', + 'seven': '7', + 'eight': '8', 'ate': '8', + 'nine': '9', 'niner': '9' + } + + # First, try to find a direct digit + digit_match = re.search(r'\b([0-9])\b', text) + if digit_match: + return digit_match.group(1) + + # Then try word forms + words = text.split() + for word in words: + clean_word = re.sub(r'[^\w]', '', word.lower()) + if clean_word in word_to_digit: + return word_to_digit[clean_word] + + # Try partial matches for robustness + for word, digit in word_to_digit.items(): + if word in text: + return digit + + return None + + def is_configured(self) -> bool: + """Check if Wav2Vec2 model can be initialized.""" + try: + import transformers + import torch + return True + except ImportError: + return False + + def test_connection(self) -> bool: + """Test Wav2Vec2 model functionality.""" + try: + self._initialize_model() + return True + except: + return False \ No newline at end of file diff --git a/audio_processors/whisper_digit_processor.py b/audio_processors/whisper_digit_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..43319cce4bf9ebd831b7e088be5e86304a69c7b0 --- /dev/null +++ b/audio_processors/whisper_digit_processor.py @@ -0,0 +1,429 @@ +""" +Whisper-based digit recognition processor +Specialized implementation for spoken digit recognition (0-9) +""" + +import numpy as np +import io +import time +import logging +from typing import Dict, Any, Optional +import torch +from transformers import pipeline +import soundfile as sf + +from .base_processor import AudioProcessor + +logger = logging.getLogger(__name__) + +class WhisperDigitProcessor(AudioProcessor): + """ + Whisper-based digit recognition processor using Hugging Face transformers. + Optimized for single digit recognition with mapping from text to numbers. 
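# Editorial illustration (not part of the diff): how the word-to-digit lookup used by
# these processors resolves a noisy transcription (reduced mapping shown).
import re
word_to_digit = {'zero': '0', 'oh': '0', 'for': '4', 'four': '4', 'ate': '8', 'eight': '8'}
text = "the number for"
m = re.search(r'\b([0-9])\b', text)                  # prefer an explicit digit
digit = m.group(1) if m else next(
    (word_to_digit[w] for w in text.split() if w in word_to_digit), None)
print(digit)  # '4'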
+ """ + + def __init__(self): + """Initialize Whisper digit processor with optimized settings.""" + super().__init__("Whisper Digit Recognition") + self.model = None + self.device = 0 if torch.cuda.is_available() else -1 + + # Digit mapping for text-to-number conversion + self.digit_map = { + "zero": "0", "one": "1", "two": "2", "three": "3", + "four": "4", "five": "5", "six": "6", "seven": "7", + "eight": "8", "nine": "9", + # Common variations and alternatives + "oh": "0", "o": "0", + "for": "4", "fore": "4", "to": "2", "too": "2", "tu": "2", + "tree": "3", "free": "3", "ate": "8", "ait": "8" + } + + # Reverse mapping for validation + self.number_words = set(self.digit_map.keys()) + + # Statistics tracking + self.total_predictions = 0 + self.successful_predictions = 0 + self.failed_predictions = 0 + self.average_inference_time = 0.0 + + self._initialize_model() + + def _initialize_model(self): + """Initialize the Whisper model with optimal settings for digit recognition.""" + try: + logger.info("Initializing Whisper model for digit recognition...") + + # Use Whisper tiny model for fast inference + self.model = pipeline( + "automatic-speech-recognition", + model="openai/whisper-tiny", + device=self.device, + torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, + return_timestamps=False # We don't need timestamps for single digits + ) + + logger.info(f"Whisper model initialized successfully on device: {self.device}") + + # Test model with dummy input + test_audio = np.random.randn(16000).astype(np.float32) # 1 second of noise + try: + test_result = self.model(test_audio) + logger.info("Model test successful") + except Exception as e: + logger.warning(f"Model test failed but model loaded: {e}") + + return True + + except Exception as e: + logger.error(f"Failed to initialize Whisper model: {e}") + return False + + def is_configured(self) -> bool: + """Check if the processor is properly configured.""" + return self.model is not None + + def process_audio(self, audio_data: bytes) -> str: + """ + Predict digit from audio data. + + Args: + audio_data: Raw audio bytes (WAV format preferred) + + Returns: + str: Predicted digit (0-9) or error message + """ + if not self.is_configured(): + return "error: Model not configured" + + try: + # Convert audio bytes to numpy array + audio_array = self._convert_audio_to_array(audio_data) + + if audio_array is None: + return "error: Invalid audio format" + + # Ensure proper sample rate and format + audio_array = self._preprocess_audio(audio_array) + + # Run Whisper inference + result = self.model(audio_array) + text = result["text"].strip().lower() + + # Convert text to digit + digit = self._text_to_digit(text) + + # Enhanced logging to debug transcription issues + logger.info(f"🎤 Whisper transcription: '{text}' -> digit: '{digit}'") + logger.info(f"📊 Audio stats: duration={len(audio_array)/16000:.2f}s, samples={len(audio_array)}, max_val={np.max(np.abs(audio_array)):.3f}") + + if digit in "0123456789": + self.successful_predictions += 1 + return digit + else: + self.failed_predictions += 1 + return f"unclear: {text}" + + except Exception as e: + logger.error(f"Whisper prediction failed: {e}") + self.failed_predictions += 1 + return f"error: {str(e)}" + finally: + self.total_predictions += 1 + + def _convert_audio_to_array(self, audio_data: bytes) -> Optional[np.ndarray]: + """ + Convert audio bytes to numpy array. 
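# Editorial illustration (not part of the diff): standalone use of this processor, where
# 'clip.wav' is a hypothetical short recording of a spoken digit; openai/whisper-tiny is
# downloaded on first use.
from audio_processors.whisper_digit_processor import WhisperDigitProcessor
proc = WhisperDigitProcessor()
with open("clip.wav", "rb") as f:
    result = proc.predict_with_timing(f.read())
print(result['predicted_digit'], result['inference_time'])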
+ + Args: + audio_data: Raw audio bytes (could be WAV file or raw PCM from VAD) + + Returns: + np.ndarray: Audio samples or None if conversion failed + """ + # First check if this looks like raw PCM data from VAD (no file headers) + if len(audio_data) < 100 or not audio_data.startswith(b'RIFF'): + # This is likely raw PCM data from WebRTC VAD + try: + logger.debug("Processing raw PCM data from VAD segment") + audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) + audio_array = audio_array / 32768.0 # Normalize to [-1, 1] + self._original_sample_rate = 16000 # WebRTC VAD uses 16kHz + return audio_array + except Exception as e: + logger.error(f"Failed to process raw PCM data: {e}") + return None + + # This looks like a complete audio file (WAV, etc.) + try: + # Try to read as audio file using soundfile + audio_buffer = io.BytesIO(audio_data) + audio_array, sample_rate = sf.read(audio_buffer, dtype='float32') + + # Handle stereo to mono conversion + if len(audio_array.shape) > 1: + audio_array = np.mean(audio_array, axis=1) + + # Store original sample rate for resampling + self._original_sample_rate = sample_rate + + logger.debug(f"Successfully loaded audio file: {len(audio_array)} samples at {sample_rate}Hz") + return audio_array + + except Exception as e: + logger.warning(f"Audio file conversion failed with soundfile: {e}") + + # Final fallback: treat as raw PCM + try: + logger.debug("Fallback: treating as raw PCM data") + audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) + audio_array = audio_array / 32768.0 # Normalize to [-1, 1] + self._original_sample_rate = 16000 # Assume 16kHz + return audio_array + except Exception as e2: + logger.error(f"All audio conversion methods failed: {e2}") + return None + + def _preprocess_audio(self, audio_array: np.ndarray) -> np.ndarray: + """ + Preprocess audio for optimal Whisper performance. + + Args: + audio_array: Raw audio samples + + Returns: + np.ndarray: Preprocessed audio + """ + # Resample to 16kHz if needed (Whisper's expected input) + target_sample_rate = 16000 + + if hasattr(self, '_original_sample_rate') and self._original_sample_rate != target_sample_rate: + try: + import librosa + audio_array = librosa.resample( + audio_array, + orig_sr=self._original_sample_rate, + target_sr=target_sample_rate + ) + logger.debug(f"Resampled audio from {self._original_sample_rate}Hz to {target_sample_rate}Hz") + except ImportError: + logger.warning("librosa not available for resampling, using original audio") + except Exception as e: + logger.warning(f"Resampling failed: {e}, using original audio") + + # Trim silence from edges + audio_array = self._trim_silence(audio_array) + + # Ensure minimum length (Whisper works better with at least 0.1s) + min_samples = int(0.1 * target_sample_rate) + if len(audio_array) < min_samples: + # Pad with silence + padding = min_samples - len(audio_array) + audio_array = np.pad(audio_array, (0, padding), mode='constant', constant_values=0) + + # Normalize audio + max_val = np.max(np.abs(audio_array)) + if max_val > 0: + audio_array = audio_array / max_val * 0.9 # Prevent clipping + + return audio_array + + def _trim_silence(self, audio_array: np.ndarray, silence_threshold: float = 0.01) -> np.ndarray: + """ + Trim silence from beginning and end of audio. 
+ + Args: + audio_array: Audio samples + silence_threshold: Threshold for silence detection + + Returns: + np.ndarray: Trimmed audio + """ + if len(audio_array) == 0: + return audio_array + + # Find non-silent regions + energy = audio_array ** 2 + non_silent = energy > silence_threshold + + if not np.any(non_silent): + return audio_array # All silence, return as is + + # Find first and last non-silent samples + first_sound = np.argmax(non_silent) + last_sound = len(non_silent) - np.argmax(non_silent[::-1]) - 1 + + # Add small padding + padding_samples = int(0.05 * 16000) # 50ms padding + first_sound = max(0, first_sound - padding_samples) + last_sound = min(len(audio_array) - 1, last_sound + padding_samples) + + return audio_array[first_sound:last_sound + 1] + + def _text_to_digit(self, text: str) -> str: + """ + Convert transcribed text to digit. + + Args: + text: Transcribed text from Whisper + + Returns: + str: Digit (0-9) or original text if no match + """ + # Clean the text + text = text.strip().lower() + + # Remove common punctuation and extra words + text = text.replace(",", "").replace(".", "").replace("!", "").replace("?", "") + text = text.replace("the", "").replace("number", "").replace("digit", "") + text = text.strip() + + # Try direct mapping + if text in self.digit_map: + return self.digit_map[text] + + # Try word-by-word mapping for multi-word responses + words = text.split() + for word in words: + if word in self.digit_map: + return self.digit_map[word] + + # Check if it's already a digit + if len(text) == 1 and text.isdigit(): + return text + + # Look for digits in the text + digits_found = [char for char in text if char.isdigit()] + if digits_found: + return digits_found[0] # Return first digit found + + # No clear digit found + return text + + def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]: + """ + Predict digit with detailed timing and confidence metrics. + + Args: + audio_data: Raw audio bytes + + Returns: + dict: Prediction results with timing and metadata + """ + start_time = time.time() + + predicted_digit = self.process_audio(audio_data) + + inference_time = time.time() - start_time + + # Update average inference time + if self.total_predictions > 0: + self.average_inference_time = ( + (self.average_inference_time * (self.total_predictions - 1) + inference_time) + / self.total_predictions + ) + + # Determine success status + is_successful = predicted_digit in "0123456789" + confidence_score = 1.0 if is_successful else 0.0 + + # Extract any error information + error_info = None + if predicted_digit.startswith("error:"): + error_info = predicted_digit[6:].strip() + predicted_digit = "unknown" + elif predicted_digit.startswith("unclear:"): + error_info = f"Transcription unclear: {predicted_digit[8:].strip()}" + predicted_digit = "unknown" + + result = { + 'predicted_digit': predicted_digit, + 'confidence_score': confidence_score, + 'inference_time': round(inference_time, 4), + 'success': is_successful, + 'timestamp': time.time(), + 'model': 'openai/whisper-tiny', + 'method': 'whisper_digit' + } + + if error_info: + result['error'] = error_info + + return result + + def get_model_info(self) -> Dict[str, Any]: + """ + Get information about the loaded model. 
+ + Returns: + dict: Model information + """ + return { + 'model_name': 'openai/whisper-tiny', + 'model_type': 'Speech-to-Text (ASR)', + 'specialized_for': 'Digit Recognition (0-9)', + 'device': 'GPU' if self.device >= 0 else 'CPU', + 'torch_device': self.device, + 'supports_streaming': False, + 'supported_languages': ['en'], + 'digit_mappings': len(self.digit_map) + } + + def get_stats(self) -> Dict[str, Any]: + """ + Get processor statistics. + + Returns: + dict: Performance statistics + """ + success_rate = ( + self.successful_predictions / max(1, self.total_predictions) + ) + + return { + 'total_predictions': self.total_predictions, + 'successful_predictions': self.successful_predictions, + 'failed_predictions': self.failed_predictions, + 'success_rate': round(success_rate, 3), + 'average_inference_time': round(self.average_inference_time, 4), + 'model_loaded': self.is_configured() + } + + def test_with_sample_audio(self) -> Dict[str, Any]: + """ + Test the processor with generated sample audio. + + Returns: + dict: Test results + """ + if not self.is_configured(): + return {'error': 'Model not configured'} + + try: + # Generate simple test audio (1 second of tone) + sample_rate = 16000 + duration = 1.0 + frequency = 440 # A note + + t = np.linspace(0, duration, int(sample_rate * duration)) + test_audio = 0.3 * np.sin(2 * np.pi * frequency * t).astype(np.float32) + + # Run prediction + start_time = time.time() + result = self.model(test_audio) + test_time = time.time() - start_time + + return { + 'test_successful': True, + 'test_time': round(test_time, 4), + 'transcription': result.get('text', 'No text'), + 'model_responsive': True + } + + except Exception as e: + return { + 'test_successful': False, + 'error': str(e), + 'model_responsive': False + } \ No newline at end of file diff --git a/models/mel_cnn_classifier/best_model.pt b/models/mel_cnn_classifier/best_model.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc0372d0308efd9f20518976a3e23c8208ce1706 --- /dev/null +++ b/models/mel_cnn_classifier/best_model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:784cd9615368040ec7f4fa393f4bbfa8effa8b66b5a526cb2d82f3c526537ae7 +size 7876706 diff --git a/models/mfcc_classifier/best_model.pt b/models/mfcc_classifier/best_model.pt new file mode 100644 index 0000000000000000000000000000000000000000..82fc351bbca66baedbe7db38efba3fc250230211 --- /dev/null +++ b/models/mfcc_classifier/best_model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35346777b57dd72acf2599359e153336859ae5af05e991e0419a3c0f8fff0248 +size 1019362 diff --git a/models/mfcc_classifier/scaler.pkl b/models/mfcc_classifier/scaler.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e980b9599494ed17caf7a4265df1f8d31d40d6a2 --- /dev/null +++ b/models/mfcc_classifier/scaler.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5af2561a5be3934fb43d590605bbb9a6293e93935a975d904c7eac5bfe876c1 +size 4202 diff --git a/models/raw_cnn_classifier/best_model.pt b/models/raw_cnn_classifier/best_model.pt new file mode 100644 index 0000000000000000000000000000000000000000..4130980e2ed0b7400c12aa148e5604e755e54d4f --- /dev/null +++ b/models/raw_cnn_classifier/best_model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdcae9f8fed4d05a27149a6258ba44e9350924fc571f6fe87deaf9cd4f4a3a0e +size 7728930 diff --git a/requirements_hf.txt b/requirements_hf.txt new file mode 100644 index 
0000000000000000000000000000000000000000..3188d426cdb411da3b0440b2f4893f92d460bed3 --- /dev/null +++ b/requirements_hf.txt @@ -0,0 +1,26 @@ +# HF Spaces Requirements - Essential packages only +# Core Flask API +Flask==2.3.3 +Flask-CORS==4.0.0 +requests==2.31.0 +python-dotenv==1.0.0 + +# Audio Processing Core +numpy==1.24.3 +librosa==0.10.1 +scipy==1.11.4 +soundfile==0.12.1 + +# ML Models - PyTorch (CPU optimized for HF Spaces) +torch==2.0.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu +torchaudio==2.0.2+cpu --extra-index-url https://download.pytorch.org/whl/cpu + +# Essential ML utilities +scikit-learn==1.3.2 +transformers==4.35.2 + +# Audio format handling +webrtcvad==2.0.10 + +# Logging and utilities +tqdm==4.66.1 \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/__pycache__/__init__.cpython-312.pyc b/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f3ee8c9c7d22a19eff4326e623f7532f0ac2b4d Binary files /dev/null and b/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/utils/__pycache__/audio_utils.cpython-312.pyc b/utils/__pycache__/audio_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38c36b3927945b1e74a1a18bebdb632ce932ce52 Binary files /dev/null and b/utils/__pycache__/audio_utils.cpython-312.pyc differ diff --git a/utils/__pycache__/enhanced_vad.cpython-312.pyc b/utils/__pycache__/enhanced_vad.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9227d826cd04cd8396e36f586eb97c40749bca08 Binary files /dev/null and b/utils/__pycache__/enhanced_vad.cpython-312.pyc differ diff --git a/utils/__pycache__/logging_utils.cpython-312.pyc b/utils/__pycache__/logging_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d478903555fb56fd36b9545188c54ef4aee50069 Binary files /dev/null and b/utils/__pycache__/logging_utils.cpython-312.pyc differ diff --git a/utils/__pycache__/noise_utils.cpython-312.pyc b/utils/__pycache__/noise_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ca6563c31452a91c01ed5e2723ca74300157bae Binary files /dev/null and b/utils/__pycache__/noise_utils.cpython-312.pyc differ diff --git a/utils/__pycache__/session_manager.cpython-312.pyc b/utils/__pycache__/session_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f7d12a4a79054783ac8e34c0d86d375b721e0f3 Binary files /dev/null and b/utils/__pycache__/session_manager.cpython-312.pyc differ diff --git a/utils/__pycache__/vad_feature_integration.cpython-312.pyc b/utils/__pycache__/vad_feature_integration.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11a5b169275570f8dbbd365346510f59978c013a Binary files /dev/null and b/utils/__pycache__/vad_feature_integration.cpython-312.pyc differ diff --git a/utils/__pycache__/webm_converter.cpython-312.pyc b/utils/__pycache__/webm_converter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b3ce80b5d18439b6285dad6236d45fcb63aaa1f Binary files /dev/null and b/utils/__pycache__/webm_converter.cpython-312.pyc differ diff --git a/utils/__pycache__/webrtc_vad.cpython-312.pyc b/utils/__pycache__/webrtc_vad.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d4ac535b78621485983b708deef863d26bf28cd1 Binary files /dev/null and b/utils/__pycache__/webrtc_vad.cpython-312.pyc differ diff --git a/utils/audio_utils.py b/utils/audio_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a49672e38f419a36c8bfc4023d757979085d985e --- /dev/null +++ b/utils/audio_utils.py @@ -0,0 +1,427 @@ +import numpy as np +import wave +import io +import logging +import subprocess +import tempfile +import os +from pathlib import Path +from typing import Tuple, Optional + +logger = logging.getLogger(__name__) + +def check_ffmpeg_available() -> bool: + """Check if ffmpeg is available on the system.""" + try: + result = subprocess.run(['ffmpeg', '-version'], + capture_output=True, + text=True, + timeout=5) + return result.returncode == 0 + except (subprocess.SubprocessError, FileNotFoundError, subprocess.TimeoutExpired): + return False + +def convert_with_ffmpeg(audio_data: bytes, target_sr: int = 8000, target_format: str = 'wav') -> Optional[bytes]: + """ + Convert audio using ffmpeg for high-quality format conversion. + + Args: + audio_data: Input audio bytes in any format + target_sr: Target sampling rate (default: 8000 Hz for ML models) + target_format: Target audio format (default: wav) + + Returns: + Converted audio bytes or None if conversion fails + """ + if not check_ffmpeg_available(): + logger.warning("ffmpeg not available for audio conversion") + return None + + temp_input = None + temp_output = None + + try: + # Create temporary files + with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as temp_input: + temp_input.write(audio_data) + temp_input.flush() + + with tempfile.NamedTemporaryFile(suffix=f'.{target_format}', delete=False) as temp_output: + pass # Just need the filename + + # Build ffmpeg command for high-quality conversion + ffmpeg_cmd = [ + 'ffmpeg', + '-i', temp_input.name, + '-ar', str(target_sr), # Resample to target sample rate + '-ac', '1', # Convert to mono + '-acodec', 'pcm_s16le', # 16-bit PCM (standard for ML) + '-f', target_format, # Output format + '-loglevel', 'error', # Reduce ffmpeg output + '-y', # Overwrite output + temp_output.name + ] + + logger.debug(f"Running ffmpeg conversion: {' '.join(ffmpeg_cmd)}") + + # Run ffmpeg conversion + result = subprocess.run(ffmpeg_cmd, + capture_output=True, + text=True, + timeout=30) + + if result.returncode == 0: + # Read converted audio + with open(temp_output.name, 'rb') as f: + converted_audio = f.read() + + logger.debug(f"ffmpeg conversion successful: " + f"{len(audio_data)} -> {len(converted_audio)} bytes " + f"({target_sr}Hz, mono, {target_format})") + + return converted_audio + else: + logger.error(f"ffmpeg conversion failed: {result.stderr}") + return None + + except Exception as e: + logger.error(f"ffmpeg conversion error: {str(e)}") + return None + + finally: + # Clean up temporary files + try: + if temp_input and os.path.exists(temp_input.name): + os.unlink(temp_input.name) + if temp_output and os.path.exists(temp_output.name): + os.unlink(temp_output.name) + except Exception as cleanup_error: + logger.warning(f"Failed to cleanup temp files: {cleanup_error}") + +def convert_for_ml_models(audio_data: bytes, pipeline_type: str = 'mfcc') -> bytes: + """ + Convert audio specifically for ML model requirements. 
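# Editorial illustration (not part of the diff): the one-off shell equivalent of the
# ffmpeg invocation built above (8 kHz, mono, 16-bit PCM WAV), handy for manual debugging;
# the filenames are hypothetical.
import subprocess
subprocess.run(
    ["ffmpeg", "-i", "input.webm", "-ar", "8000", "-ac", "1",
     "-acodec", "pcm_s16le", "-f", "wav", "-y", "output.wav"],
    check=True,
)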
+ + Args: + audio_data: Input audio bytes + pipeline_type: ML pipeline type ('mfcc', 'mel_cnn', 'raw_cnn') + + Returns: + Audio bytes optimized for the specific ML model + """ + # All our ML models expect 8kHz, mono, 16-bit PCM + target_sr = 8000 + + # Try ffmpeg first for best quality + converted = convert_with_ffmpeg(audio_data, target_sr=target_sr) + if converted: + logger.debug(f"Used ffmpeg for {pipeline_type} model audio conversion") + return converted + + # Fallback to existing conversion methods + logger.debug(f"Using fallback audio conversion for {pipeline_type} model") + return convert_audio_format(audio_data) + +def validate_audio_format(audio_data: bytes) -> bool: + """ + Validate that audio data is in a supported format. + + Args: + audio_data: Raw audio bytes + + Returns: + True if format is supported, False otherwise + """ + # Check minimum size + if len(audio_data) < 44: # WAV header is 44 bytes + logger.debug(f"Audio data too small: {len(audio_data)} bytes (minimum 44 for WAV header)") + return False + + # Check for null/empty data + if audio_data[:20] == b'\x00' * 20: + logger.error("Audio data appears to be empty/null bytes") + return False + + # Check if it starts with RIFF header + if not audio_data.startswith(b'RIFF'): + logger.error(f"Audio data does not start with RIFF header. First 8 bytes: {audio_data[:8]}") + # Try to provide more diagnostic info + if len(audio_data) > 20: + logger.error(f"First 20 bytes as hex: {audio_data[:20].hex()}") + return False + + try: + with wave.open(io.BytesIO(audio_data), 'rb') as wav_file: + # Check basic WAV properties + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + frame_rate = wav_file.getframerate() + frames = wav_file.getnframes() + + logger.debug(f"Audio format: {channels} channels, {sample_width} bytes/sample, {frame_rate} Hz, {frames} frames") + + # Be more lenient with streaming chunks + if channels not in [1, 2]: + logger.warning(f"Unusual channel count: {channels}") + return False + if sample_width not in [1, 2, 4]: # 8-bit, 16-bit, 32-bit + logger.warning(f"Unusual sample width: {sample_width}") + return False + if frame_rate < 8000 or frame_rate > 48000: # Wider range + logger.warning(f"Unusual frame rate: {frame_rate}") + return False + if frames == 0: + logger.warning("No audio frames found") + return False + + return True + except wave.Error as e: + logger.error(f"WAV format error: {str(e)}") + logger.error(f"Audio data size: {len(audio_data)} bytes") + if len(audio_data) > 44: + logger.error(f"WAV header bytes: {audio_data[:44].hex()}") + return False + except Exception as e: + logger.error(f"Audio validation failed: {str(e)}") + logger.error(f"Audio data size: {len(audio_data)} bytes") + return False + +def convert_audio_format(audio_data: bytes) -> bytes: + """ + Convert various audio formats (WebM, OGG, MP3, etc.) to WAV format. 
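# Editorial illustration (not part of the diff): converting uploaded bytes for the MFCC
# pipeline and sanity-checking the result ('upload.webm' is a hypothetical input file).
from utils.audio_utils import convert_for_ml_models, validate_audio_format, get_audio_duration
with open("upload.webm", "rb") as f:
    wav_bytes = convert_for_ml_models(f.read(), pipeline_type='mfcc')  # 8 kHz mono PCM WAV
print(validate_audio_format(wav_bytes), get_audio_duration(wav_bytes))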
+ + Args: + audio_data: Input audio bytes in any supported format + + Returns: + Converted audio bytes in WAV format + + Raises: + Exception: If conversion fails + """ + try: + # First detect the audio format + from .webm_converter import detect_audio_format, convert_webm_to_wav + + audio_format = detect_audio_format(audio_data) + logger.debug(f"Detected audio format: {audio_format}") + + # Handle WebM specifically (common from MediaRecorder) + if audio_format == 'webm': + logger.info("Converting WebM audio to WAV (fallback method)") + converted = convert_webm_to_wav(audio_data) + if converted: + return converted + else: + raise Exception("WebM conversion failed") + + # Try using pydub for format conversion (handles WebM, OGG, MP3, etc.) + try: + from pydub import AudioSegment + import io + + # Load audio from bytes + audio = AudioSegment.from_file(io.BytesIO(audio_data)) + + # Convert to mono and 16kHz + audio = audio.set_channels(1) # Mono + audio = audio.set_frame_rate(16000) # 16kHz + audio = audio.set_sample_width(2) # 16-bit + + # Export as WAV + output_buffer = io.BytesIO() + audio.export(output_buffer, format="wav") + return output_buffer.getvalue() + + except ImportError: + logger.warning("pydub not installed, falling back to basic WAV conversion") + # Fall back to basic WAV processing + return convert_to_mono_16khz(audio_data) + except Exception as e: + logger.warning(f"pydub conversion failed: {str(e)}, trying fallback methods") + + # Try WebM converter as fallback + if audio_format in ['webm', 'unknown']: + logger.info("Trying WebM fallback converter") + converted = convert_webm_to_wav(audio_data) + if converted: + return converted + + # Last resort: basic WAV processing + return convert_to_mono_16khz(audio_data) + + except Exception as e: + logger.error(f"All audio conversion methods failed: {str(e)}") + raise Exception(f"Failed to convert audio format: {str(e)}") + +def convert_to_mono_16khz(audio_data: bytes) -> bytes: + """ + Convert audio to mono, 16kHz format suitable for speech recognition. 
+ + Args: + audio_data: Input audio bytes (WAV format) + + Returns: + Converted audio bytes in mono 16kHz WAV format + + Raises: + Exception: If conversion fails + """ + try: + with wave.open(io.BytesIO(audio_data), 'rb') as input_wav: + frames = input_wav.readframes(input_wav.getnframes()) + channels = input_wav.getnchannels() + sample_width = input_wav.getsampwidth() + frame_rate = input_wav.getframerate() + + # Convert to numpy array + if sample_width == 2: + audio_array = np.frombuffer(frames, dtype=np.int16) + else: + raise Exception(f"Unsupported sample width: {sample_width}") + + # Convert stereo to mono if needed + if channels == 2: + audio_array = audio_array.reshape(-1, 2) + audio_array = np.mean(audio_array, axis=1).astype(np.int16) + + # Resample to 16kHz if needed + if frame_rate != 16000: + # Simple downsampling (for production, use proper resampling) + ratio = frame_rate / 16000 + if ratio > 1: + # Downsample by taking every nth sample + indices = np.arange(0, len(audio_array), ratio).astype(int) + audio_array = audio_array[indices] + else: + # Upsample by repeating samples (basic interpolation) + audio_array = np.repeat(audio_array, int(1/ratio)) + + # Create output WAV + output = io.BytesIO() + with wave.open(output, 'wb') as output_wav: + output_wav.setnchannels(1) # Mono + output_wav.setsampwidth(2) # 16-bit + output_wav.setframerate(16000) # 16kHz + output_wav.writeframes(audio_array.tobytes()) + + return output.getvalue() + + except Exception as e: + logger.error(f"Audio conversion failed: {str(e)}") + raise Exception(f"Failed to convert audio: {str(e)}") + +def get_audio_duration(audio_data: bytes) -> float: + """ + Get duration of audio in seconds. + + Args: + audio_data: WAV audio bytes + + Returns: + Duration in seconds + """ + try: + with wave.open(io.BytesIO(audio_data), 'rb') as wav_file: + frames = wav_file.getnframes() + frame_rate = wav_file.getframerate() + duration = frames / frame_rate + return duration + except Exception as e: + logger.error(f"Failed to get audio duration: {str(e)}") + return 0.0 + +def audio_to_numpy(audio_data: bytes) -> Tuple[np.ndarray, int]: + """ + Convert WAV audio bytes to numpy array. + + Args: + audio_data: WAV audio bytes + + Returns: + Tuple of (audio_array, sample_rate) + + Raises: + Exception: If conversion fails + """ + try: + with wave.open(io.BytesIO(audio_data), 'rb') as wav_file: + frames = wav_file.readframes(wav_file.getnframes()) + sample_rate = wav_file.getframerate() + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + + if sample_width == 2: + audio_array = np.frombuffer(frames, dtype=np.int16) + else: + raise Exception(f"Unsupported sample width: {sample_width}") + + # Convert to float32 and normalize + audio_array = audio_array.astype(np.float32) / 32767.0 + + # Handle stereo + if channels == 2: + audio_array = audio_array.reshape(-1, 2) + audio_array = np.mean(audio_array, axis=1) + + return audio_array, sample_rate + + except Exception as e: + logger.error(f"Failed to convert audio to numpy: {str(e)}") + raise Exception(f"Audio conversion failed: {str(e)}") + +def create_test_audio(digit: str, duration: float = 1.0, sample_rate: int = 16000) -> bytes: + """ + Create test audio data for development purposes. 
+ + Args: + digit: Digit to simulate ('0'-'9') + duration: Audio duration in seconds + sample_rate: Sample rate in Hz + + Returns: + WAV audio bytes + """ + try: + # Create simple tone pattern based on digit + t = np.linspace(0, duration, int(sample_rate * duration), False) + + # Different frequency patterns for each digit + freq_map = { + '0': [400, 600], # Low frequencies + '1': [800, 1000], # Higher frequencies + '2': [600, 800], + '3': [700, 900], + '4': [500, 700], + '5': [900, 1100], + '6': [450, 650], + '7': [750, 950], + '8': [550, 750], + '9': [850, 1050] + } + + freqs = freq_map.get(digit, [440, 880]) + + # Generate tone + signal = np.sin(freqs[0] * 2.0 * np.pi * t) * 0.3 + np.sin(freqs[1] * 2.0 * np.pi * t) * 0.3 + + # Add some envelope + envelope = np.exp(-3 * t) + signal = signal * envelope + + # Convert to int16 + signal = (signal * 32767).astype(np.int16) + + # Create WAV + output = io.BytesIO() + with wave.open(output, 'wb') as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(signal.tobytes()) + + return output.getvalue() + + except Exception as e: + logger.error(f"Failed to create test audio: {str(e)}") + raise Exception(f"Test audio creation failed: {str(e)}") \ No newline at end of file diff --git a/utils/enhanced_vad.py b/utils/enhanced_vad.py new file mode 100644 index 0000000000000000000000000000000000000000..a59c9033022e4eb0804c1cceb07226d36c254e68 --- /dev/null +++ b/utils/enhanced_vad.py @@ -0,0 +1,571 @@ +""" +Enhanced VAD Implementation with ffmpeg support and comprehensive debugging +""" + +import numpy as np +import logging +import subprocess +import tempfile +import os +import time +import wave +import io +from pathlib import Path +from typing import Dict, List, Tuple, Optional, Any +from threading import Thread, Lock +import asyncio +import concurrent.futures + +# Try to import WebRTC VAD +try: + import webrtcvad + WEBRTC_AVAILABLE = True +except ImportError: + WEBRTC_AVAILABLE = False + logging.warning("webrtcvad not available - using fallback VAD implementation") + +logger = logging.getLogger(__name__) + +class EnhancedVAD: + """ + Enhanced Voice Activity Detection with ffmpeg integration and comprehensive debugging. + + Features: + - ffmpeg-based audio preprocessing + - Multiple VAD implementations (WebRTC, simple energy-based) + - Comprehensive audio validation and debugging + - Async audio chunk saving + - Real-time performance monitoring + """ + + def __init__(self, + sample_rate: int = 16000, + frame_duration_ms: int = 30, + aggressiveness: int = 1, + min_speech_duration: float = 0.4, + max_speech_duration: float = 3.0, + silence_threshold: float = 0.01): + """ + Initialize Enhanced VAD. 
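# Editorial illustration (not part of the diff): generating a synthetic clip with the
# helper above and running it through the VAD class defined below.
from utils.audio_utils import create_test_audio
from utils.enhanced_vad import EnhancedVAD
test_wav = create_test_audio('7', duration=1.0, sample_rate=16000)
vad = EnhancedVAD(sample_rate=16000, aggressiveness=1)
segments = vad.detect_speech_segments(test_wav)   # list of (segment_bytes, info) tuples
print(len(segments))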
+ + Args: + sample_rate: Target sample rate (Hz) + frame_duration_ms: Frame duration in milliseconds + aggressiveness: VAD aggressiveness (0-3) + min_speech_duration: Minimum speech segment duration (seconds) + max_speech_duration: Maximum speech segment duration (seconds) + silence_threshold: Energy threshold for silence detection + """ + self.sample_rate = sample_rate + self.frame_duration_ms = frame_duration_ms + self.frame_size = int(sample_rate * frame_duration_ms / 1000) + self.aggressiveness = aggressiveness + self.min_speech_duration = min_speech_duration + self.max_speech_duration = max_speech_duration + self.silence_threshold = silence_threshold + + # Initialize WebRTC VAD if available + self.webrtc_vad = None + if WEBRTC_AVAILABLE: + try: + self.webrtc_vad = webrtcvad.Vad(aggressiveness) + logger.info(f"WebRTC VAD initialized (aggressiveness: {aggressiveness})") + except Exception as e: + logger.error(f"Failed to initialize WebRTC VAD: {e}") + self.webrtc_vad = None + + # Check ffmpeg availability + self.ffmpeg_available = self._check_ffmpeg_available() + + # Performance tracking + self.stats = { + 'total_chunks_processed': 0, + 'speech_segments_detected': 0, + 'processing_time_total': 0.0, + 'last_processing_time': 0.0, + 'ffmpeg_conversions': 0, + 'audio_validation_failures': 0, + 'webrtc_available': WEBRTC_AVAILABLE and self.webrtc_vad is not None, + 'ffmpeg_available': self.ffmpeg_available + } + + # Async processing + self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=2) + self.save_lock = Lock() + + logger.info(f"Enhanced VAD initialized:") + logger.info(f" Sample rate: {sample_rate} Hz") + logger.info(f" Frame duration: {frame_duration_ms} ms") + logger.info(f" WebRTC VAD: {'Available' if self.webrtc_vad else 'Not available'}") + logger.info(f" ffmpeg: {'Available' if self.ffmpeg_available else 'Not available'}") + + def _check_ffmpeg_available(self) -> bool: + """Check if ffmpeg is available.""" + try: + result = subprocess.run(['ffmpeg', '-version'], + capture_output=True, text=True, timeout=5) + return result.returncode == 0 + except Exception: + return False + + def preprocess_audio_with_ffmpeg(self, audio_data: bytes) -> Optional[bytes]: + """ + Preprocess audio using ffmpeg for optimal VAD performance. 
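# Editorial illustration (not part of the diff): the frame-size arithmetic used above,
# with the default settings (16 kHz, 30 ms frames, 16-bit samples).
sample_rate, frame_duration_ms = 16000, 30
frame_size = int(sample_rate * frame_duration_ms / 1000)  # 480 samples per frame
frame_size_bytes = frame_size * 2                         # 960 bytes per WebRTC VAD frame
print(frame_size, frame_size_bytes)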
+ + Args: + audio_data: Raw audio bytes + + Returns: + Preprocessed audio bytes or None if processing fails + """ + if not self.ffmpeg_available: + logger.debug("ffmpeg not available for audio preprocessing") + return None + + temp_input = None + temp_output = None + + try: + # Create temporary files + with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as temp_input: + temp_input.write(audio_data) + temp_input.flush() + + with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_output: + pass + + # ffmpeg command for VAD-optimized preprocessing + ffmpeg_cmd = [ + 'ffmpeg', + '-i', temp_input.name, + '-ar', str(self.sample_rate), # Resample to target rate + '-ac', '1', # Convert to mono + '-acodec', 'pcm_s16le', # 16-bit PCM + '-af', 'highpass=f=80,lowpass=f=8000,dynaudnorm=f=10:g=3', # Audio filters for speech + '-f', 'wav', + '-loglevel', 'error', + '-y', + temp_output.name + ] + + result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, timeout=10) + + if result.returncode == 0: + with open(temp_output.name, 'rb') as f: + preprocessed_audio = f.read() + + self.stats['ffmpeg_conversions'] += 1 + logger.debug(f"ffmpeg preprocessing: {len(audio_data)} -> {len(preprocessed_audio)} bytes") + return preprocessed_audio + else: + logger.error(f"ffmpeg preprocessing failed: {result.stderr}") + return None + + except Exception as e: + logger.error(f"ffmpeg preprocessing error: {e}") + return None + + finally: + # Cleanup + try: + if temp_input and os.path.exists(temp_input.name): + os.unlink(temp_input.name) + if temp_output and os.path.exists(temp_output.name): + os.unlink(temp_output.name) + except Exception: + pass + + def validate_and_debug_audio(self, audio_data: bytes) -> Dict[str, Any]: + """ + Comprehensive audio validation and debugging. 
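# Editorial illustration (not part of the diff): the RMS-energy silence check applied by
# the validator defined below, on a toy int16 buffer (threshold 0.01 in full-scale units).
import numpy as np
samples = np.array([0, 200, -150, 50], dtype=np.int16)
rms = np.sqrt(np.mean(samples.astype(np.float32) ** 2))  # ~127.5
print(rms < 0.01 * 32768)                                 # True -> treated as silence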
+ + Args: + audio_data: Audio data to validate + + Returns: + Validation results and debugging information + """ + debug_info = { + 'size_bytes': len(audio_data), + 'valid_wav': False, + 'sample_rate': None, + 'channels': None, + 'duration': 0.0, + 'energy_level': 0.0, + 'is_silent': True, + 'format_detected': 'unknown', + 'issues': [] + } + + try: + # Check minimum size + if len(audio_data) < 44: + debug_info['issues'].append(f"Too small: {len(audio_data)} bytes (need ≥44 for WAV)") + return debug_info + + # Detect format by header + if audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:20]: + debug_info['format_detected'] = 'wav' + elif audio_data.startswith(b'OggS'): + debug_info['format_detected'] = 'ogg' + elif audio_data.startswith(b'\x1a\x45\xdf\xa3'): + debug_info['format_detected'] = 'webm' + + # Try to parse as WAV + try: + with wave.open(io.BytesIO(audio_data), 'rb') as wav: + debug_info['valid_wav'] = True + debug_info['sample_rate'] = wav.getframerate() + debug_info['channels'] = wav.getnchannels() + debug_info['duration'] = wav.getnframes() / wav.getframerate() + + # Read audio samples for analysis + wav.rewind() + frames = wav.readframes(wav.getnframes()) + + if len(frames) > 0: + # Convert to numpy for analysis + audio_array = np.frombuffer(frames, dtype=np.int16) + + # Calculate energy level + energy = np.sqrt(np.mean(audio_array.astype(np.float32) ** 2)) + debug_info['energy_level'] = float(energy) + debug_info['is_silent'] = energy < (self.silence_threshold * 32768) + + # Check for constant beep (common issue) + if len(audio_array) > 100: + # Check if audio is a constant tone (beep) + diff = np.diff(audio_array) + if np.std(diff) < 100: # Very low variation + debug_info['issues'].append("Constant tone/beep detected") + + # Check dynamic range + if np.max(audio_array) - np.min(audio_array) < 1000: + debug_info['issues'].append("Very low dynamic range") + + except Exception as wav_error: + debug_info['issues'].append(f"WAV parsing failed: {wav_error}") + + # Additional format-specific checks + if debug_info['format_detected'] in ['ogg', 'webm'] and not debug_info['valid_wav']: + debug_info['issues'].append("Non-WAV format detected - requires conversion") + + logger.debug(f"Audio validation: {debug_info}") + + if debug_info['issues']: + self.stats['audio_validation_failures'] += 1 + logger.warning(f"Audio validation issues: {debug_info['issues']}") + + return debug_info + + except Exception as e: + debug_info['issues'].append(f"Validation error: {str(e)}") + logger.error(f"Audio validation failed: {e}") + return debug_info + + def detect_speech_segments(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]: + """ + Detect speech segments using multiple methods. 
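# Editorial illustration (not part of the diff): the header sniffing used by the
# validator above to guess the container format.
def sniff(b: bytes) -> str:
    if b.startswith(b'RIFF') and b'WAVE' in b[:20]:
        return 'wav'
    if b.startswith(b'OggS'):
        return 'ogg'
    if b.startswith(b'\x1a\x45\xdf\xa3'):
        return 'webm'
    return 'unknown'
print(sniff(b'RIFF$\x00\x00\x00WAVEfmt '))  # -> 'wav'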
+ + Args: + audio_data: Input audio data + + Returns: + List of (segment_audio, segment_info) tuples + """ + start_time = time.time() + + # Validate and debug audio + debug_info = self.validate_and_debug_audio(audio_data) + + segments = [] + + try: + # Preprocess with ffmpeg if available + processed_audio = self.preprocess_audio_with_ffmpeg(audio_data) + if processed_audio: + working_audio = processed_audio + logger.debug("Using ffmpeg-preprocessed audio for VAD") + else: + working_audio = audio_data + logger.debug("Using original audio for VAD") + + # Re-validate processed audio + if processed_audio: + processed_debug = self.validate_and_debug_audio(processed_audio) + logger.debug(f"Processed audio validation: {processed_debug}") + + # Method 1: WebRTC VAD (if available) + if self.webrtc_vad and debug_info['valid_wav']: + webrtc_segments = self._webrtc_vad_detection(working_audio) + segments.extend(webrtc_segments) + logger.debug(f"WebRTC VAD found {len(webrtc_segments)} segments") + + # Method 2: Energy-based VAD (fallback) + if not segments or debug_info['issues']: + energy_segments = self._energy_based_vad(working_audio) + segments.extend(energy_segments) + logger.debug(f"Energy VAD found {len(energy_segments)} segments") + + # Method 3: Simple duration-based segmentation (last resort) + if not segments and len(audio_data) > 8000: # > 8KB + fallback_segment = self._create_fallback_segment(working_audio) + if fallback_segment: + segments.append(fallback_segment) + logger.debug("Used fallback segmentation") + + processing_time = time.time() - start_time + self.stats['total_chunks_processed'] += 1 + self.stats['speech_segments_detected'] += len(segments) + self.stats['processing_time_total'] += processing_time + self.stats['last_processing_time'] = processing_time + + logger.debug(f"VAD processing complete: {len(segments)} segments in {processing_time:.3f}s") + + return segments + + except Exception as e: + logger.error(f"Speech segment detection failed: {e}") + return [] + + def _webrtc_vad_detection(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]: + """WebRTC-based speech detection.""" + segments = [] + + try: + frame_size_bytes = self.frame_size * 2 # 16-bit = 2 bytes per sample + frames = [] + + # Extract frames + for i in range(0, len(audio_data) - frame_size_bytes + 1, frame_size_bytes): + frame = audio_data[i:i + frame_size_bytes] + if len(frame) == frame_size_bytes: + frames.append(frame) + + if len(frames) < 5: # Need minimum frames + return segments + + # VAD processing + speech_frames = [] + for frame in frames: + try: + is_speech = self.webrtc_vad.is_speech(frame, self.sample_rate) + speech_frames.append((frame, is_speech)) + except Exception as e: + logger.debug(f"WebRTC VAD frame processing failed: {e}") + speech_frames.append((frame, False)) + + # Group consecutive speech frames + current_segment = [] + for frame, is_speech in speech_frames: + if is_speech: + current_segment.append(frame) + else: + if len(current_segment) > 0: + # End of speech segment + segment_audio = b''.join(current_segment) + segment_duration = len(current_segment) * self.frame_duration_ms / 1000 + + if segment_duration >= self.min_speech_duration: + segments.append((segment_audio, { + 'duration': segment_duration, + 'method': 'webrtc_vad', + 'frames': len(current_segment) + })) + + current_segment = [] + + # Handle final segment + if current_segment: + segment_audio = b''.join(current_segment) + segment_duration = len(current_segment) * self.frame_duration_ms / 1000 + + if 
segment_duration >= self.min_speech_duration: + segments.append((segment_audio, { + 'duration': segment_duration, + 'method': 'webrtc_vad', + 'frames': len(current_segment) + })) + + return segments + + except Exception as e: + logger.error(f"WebRTC VAD detection failed: {e}") + return [] + + def _energy_based_vad(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]: + """Energy-based speech detection.""" + segments = [] + + try: + # Try to parse as WAV or raw PCM + try: + with wave.open(io.BytesIO(audio_data), 'rb') as wav: + frames = wav.readframes(wav.getnframes()) + sample_rate = wav.getframerate() + except: + # Assume raw 16-bit PCM + frames = audio_data + sample_rate = self.sample_rate + + if len(frames) < 1000: # Too short + return segments + + # Convert to numpy array + audio_samples = np.frombuffer(frames, dtype=np.int16) + audio_float = audio_samples.astype(np.float32) / 32768.0 + + # Calculate energy in overlapping windows + window_size = int(sample_rate * 0.1) # 100ms windows + hop_size = window_size // 2 + + energies = [] + for i in range(0, len(audio_float) - window_size, hop_size): + window = audio_float[i:i + window_size] + energy = np.sqrt(np.mean(window ** 2)) + energies.append(energy) + + if len(energies) < 3: + return segments + + # Adaptive threshold + mean_energy = np.mean(energies) + threshold = max(self.silence_threshold, mean_energy * 0.3) + + # Find speech segments + if isinstance(energies, (list, np.ndarray)): + energies = np.array(energies) # Ensure it's a numpy array + speech_windows = energies > threshold + + # Group consecutive speech windows + speech_start = None + for i, is_speech in enumerate(speech_windows): + if is_speech and speech_start is None: + speech_start = i + elif not is_speech and speech_start is not None: + # End of speech + start_sample = speech_start * hop_size + end_sample = min(i * hop_size + window_size, len(audio_samples)) + + segment_samples = audio_samples[start_sample:end_sample] + segment_duration = len(segment_samples) / sample_rate + + if segment_duration >= self.min_speech_duration: + # Convert back to bytes + segment_audio = segment_samples.tobytes() + + segments.append((segment_audio, { + 'duration': segment_duration, + 'method': 'energy_based', + 'start_time': start_sample / sample_rate, + 'energy_threshold': threshold, + 'mean_energy': mean_energy + })) + + speech_start = None + + return segments + + except Exception as e: + logger.error(f"Energy-based VAD failed: {e}") + return [] + + def _create_fallback_segment(self, audio_data: bytes) -> Optional[Tuple[bytes, Dict[str, Any]]]: + """Create a fallback segment when VAD methods fail.""" + try: + # Use the entire audio as a segment if it's reasonable length + debug_info = self.validate_and_debug_audio(audio_data) + + if debug_info['duration'] > 0: + duration = debug_info['duration'] + else: + # Estimate duration based on size (assume 16-bit, mono, 16kHz) + estimated_samples = len(audio_data) // 2 + duration = estimated_samples / self.sample_rate + + if self.min_speech_duration <= duration <= self.max_speech_duration: + return (audio_data, { + 'duration': duration, + 'method': 'fallback', + 'estimated': True, + 'issues': debug_info['issues'] + }) + + return None + + except Exception as e: + logger.error(f"Fallback segment creation failed: {e}") + return None + + async def save_audio_chunk_async(self, audio_data: bytes, session_id: str, + chunk_type: str = "vad_chunk") -> Optional[str]: + """ + Asynchronously save audio chunk to file. 
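+
+ Illustrative usage (must be awaited inside an asyncio event loop; the session id
+ shown is an arbitrary example):
+     path = await vad.save_audio_chunk_async(segment_audio, session_id="sess_001")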
+ + Args: + audio_data: Audio data to save + session_id: Session identifier + chunk_type: Type of chunk (for filename) + + Returns: + Path to saved file or None if failed + """ + def _save_chunk(): + try: + with self.save_lock: + timestamp = int(time.time() * 1000) + filename = f"{chunk_type}_{session_id}_{timestamp}.wav" + filepath = Path("output") / filename + + # Ensure output directory exists + filepath.parent.mkdir(exist_ok=True) + + # Save as WAV file + with open(filepath, 'wb') as f: + f.write(audio_data) + + logger.debug(f"Saved audio chunk: {filepath}") + return str(filepath) + + except Exception as e: + logger.error(f"Failed to save audio chunk: {e}") + return None + + # Run in executor to avoid blocking + loop = asyncio.get_event_loop() + result = await loop.run_in_executor(self.executor, _save_chunk) + return result + + def get_stats(self) -> Dict[str, Any]: + """Get comprehensive VAD statistics.""" + stats = self.stats.copy() + + if stats['total_chunks_processed'] > 0: + stats['average_processing_time'] = stats['processing_time_total'] / stats['total_chunks_processed'] + stats['segments_per_chunk'] = stats['speech_segments_detected'] / stats['total_chunks_processed'] + else: + stats['average_processing_time'] = 0.0 + stats['segments_per_chunk'] = 0.0 + + return stats + + def cleanup(self): + """Clean up resources.""" + if hasattr(self, 'executor'): + self.executor.shutdown(wait=True) + logger.info("Enhanced VAD cleaned up") + +# Convenience function for creating enhanced VAD +def create_enhanced_vad(config: Optional[Dict[str, Any]] = None) -> EnhancedVAD: + """Create enhanced VAD with optional configuration.""" + if config is None: + config = {} + + return EnhancedVAD( + sample_rate=config.get('sample_rate', 16000), + frame_duration_ms=config.get('frame_duration_ms', 30), + aggressiveness=config.get('aggressiveness', 1), + min_speech_duration=config.get('min_speech_duration', 0.4), + max_speech_duration=config.get('max_speech_duration', 3.0), + silence_threshold=config.get('silence_threshold', 0.01) + ) \ No newline at end of file diff --git a/utils/logging_utils.py b/utils/logging_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e2d6a45e833ae0857adec9c1a71a10d3bf408cc4 --- /dev/null +++ b/utils/logging_utils.py @@ -0,0 +1,201 @@ +import logging +import time +from typing import Dict, List, Any +from collections import defaultdict, deque +import json + +class PerformanceLogger: + """ + Performance logger for tracking audio processing metrics. + Provides detailed logging and statistics for each processing method. 
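+
+ Illustrative usage (field names mirror what log_prediction reads; values are examples):
+     performance_logger.log_prediction('ml_mfcc', {
+         'success': True, 'predicted_digit': 7,
+         'inference_time': 0.042, 'timestamp': time.time()})
+     print(performance_logger.get_comparison_report())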
+ """ + + def __init__(self, max_history: int = 100): + self.max_history = max_history + self.method_stats = defaultdict(lambda: { + 'predictions': deque(maxlen=max_history), + 'inference_times': deque(maxlen=max_history), + 'errors': deque(maxlen=max_history), + 'total_calls': 0, + 'total_errors': 0 + }) + + # Setup structured logging + self.setup_logging() + + def setup_logging(self): + """Setup structured logging with proper formatting.""" + # Create custom formatter + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + # Setup console handler + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + + # Setup file handler + file_handler = logging.FileHandler('audio_digit_classifier.log') + file_handler.setFormatter(formatter) + + # Configure root logger + logging.basicConfig( + level=logging.DEBUG, + handlers=[console_handler, file_handler] + ) + + self.logger = logging.getLogger(__name__) + + def log_prediction(self, method: str, result: Dict[str, Any]): + """ + Log a prediction result with performance metrics. + + Args: + method: Processing method name + result: Prediction result dictionary + """ + stats = self.method_stats[method] + stats['total_calls'] += 1 + + if result.get('success', True): + stats['predictions'].append({ + 'digit': result.get('predicted_digit'), + 'timestamp': result.get('timestamp', time.time()), + 'inference_time': result.get('inference_time', 0) + }) + stats['inference_times'].append(result.get('inference_time', 0)) + + self.logger.info(json.dumps({ + 'event': 'prediction', + 'method': method, + 'digit': result.get('predicted_digit'), + 'inference_time': result.get('inference_time'), + 'timestamp': result.get('timestamp') + })) + else: + stats['total_errors'] += 1 + stats['errors'].append({ + 'error': result.get('error'), + 'timestamp': result.get('timestamp', time.time()), + 'inference_time': result.get('inference_time', 0) + }) + + self.logger.error(json.dumps({ + 'event': 'error', + 'method': method, + 'error': result.get('error'), + 'timestamp': result.get('timestamp') + })) + + def get_method_stats(self, method: str) -> Dict[str, Any]: + """ + Get performance statistics for a specific method. 
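+
+ Illustrative return shape (example values only; keys match the dict built below):
+     {'method': 'ml_mfcc', 'total_calls': 42, 'successful_predictions': 40,
+      'error_rate': 4.76, 'avg_inference_time': 0.051, ...}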
+ + Args: + method: Processing method name + + Returns: + Dictionary with performance statistics + """ + stats = self.method_stats[method] + inference_times = list(stats['inference_times']) + + if not inference_times: + return { + 'method': method, + 'total_calls': stats['total_calls'], + 'successful_predictions': 0, + 'error_rate': 0.0, + 'avg_inference_time': 0.0, + 'min_inference_time': 0.0, + 'max_inference_time': 0.0 + } + + successful_predictions = len(inference_times) + error_rate = stats['total_errors'] / stats['total_calls'] if stats['total_calls'] > 0 else 0 + + return { + 'method': method, + 'total_calls': stats['total_calls'], + 'successful_predictions': successful_predictions, + 'error_rate': round(error_rate * 100, 2), + 'avg_inference_time': round(sum(inference_times) / len(inference_times), 3), + 'min_inference_time': round(min(inference_times), 3), + 'max_inference_time': round(max(inference_times), 3), + 'recent_predictions': list(stats['predictions'])[-10:] # Last 10 predictions + } + + def get_all_stats(self) -> Dict[str, Any]: + """Get statistics for all processing methods.""" + all_stats = {} + for method in self.method_stats.keys(): + all_stats[method] = self.get_method_stats(method) + + return all_stats + + def get_comparison_report(self) -> str: + """ + Generate a comparison report of all processing methods. + + Returns: + Formatted string with method comparison + """ + all_stats = self.get_all_stats() + + if not all_stats: + return "No statistics available yet." + + report = "\n=== Audio Processing Method Comparison ===\n\n" + + for method, stats in all_stats.items(): + report += f"Method: {method}\n" + report += f" Total Calls: {stats['total_calls']}\n" + report += f" Successful: {stats['successful_predictions']}\n" + report += f" Error Rate: {stats['error_rate']}%\n" + report += f" Avg Time: {stats['avg_inference_time']}s\n" + report += f" Min/Max: {stats['min_inference_time']}s / {stats['max_inference_time']}s\n" + report += "\n" + + # Find best performing method + if len(all_stats) > 1: + best_speed = min(all_stats.items(), key=lambda x: x[1]['avg_inference_time']) + best_accuracy = min(all_stats.items(), key=lambda x: x[1]['error_rate']) + + report += f"Fastest Method: {best_speed[0]} ({best_speed[1]['avg_inference_time']}s avg)\n" + report += f"Most Accurate: {best_accuracy[0]} ({best_accuracy[1]['error_rate']}% error rate)\n" + + return report + + def log_system_info(self, info: Dict[str, Any]): + """Log system information for debugging.""" + self.logger.info(json.dumps({ + 'event': 'system_info', + 'timestamp': time.time(), + **info + })) + + def log_audio_info(self, duration: float, format_info: Dict[str, Any]): + """Log audio input information.""" + self.logger.debug(json.dumps({ + 'event': 'audio_input', + 'duration': duration, + 'format': format_info, + 'timestamp': time.time() + })) + +# Global performance logger instance +performance_logger = PerformanceLogger() + +def setup_flask_logging(app): + """Setup logging configuration for Flask application.""" + if not app.debug: + # Production logging + file_handler = logging.FileHandler('flask_app.log') + file_handler.setFormatter(logging.Formatter( + '%(asctime)s %(levelname)s %(name)s %(message)s' + )) + file_handler.setLevel(logging.INFO) + app.logger.addHandler(file_handler) + app.logger.setLevel(logging.INFO) + + app.logger.info('Audio Digit Classifier startup') \ No newline at end of file diff --git a/utils/noise_utils.py b/utils/noise_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..301976ee022463921592fcfbac48d81fe62fb894 --- /dev/null +++ b/utils/noise_utils.py @@ -0,0 +1,292 @@ +import numpy as np +import wave +import io +import logging +from typing import Literal, Optional + +logger = logging.getLogger(__name__) + +NoiseType = Literal['white', 'pink', 'brown', 'gaussian', 'background', 'speech'] + +class NoiseGenerator: + """ + Audio noise generator for robustness testing. + Supports various types of noise injection for testing digit recognition. + """ + + def __init__(self): + self.logger = logging.getLogger(__name__) + + def generate_white_noise(self, duration: float, sample_rate: int = 16000, + amplitude: float = 0.1) -> np.ndarray: + """ + Generate white noise signal. + + Args: + duration: Duration in seconds + sample_rate: Sample rate in Hz + amplitude: Noise amplitude (0.0 to 1.0) + + Returns: + Numpy array of white noise + """ + samples = int(duration * sample_rate) + noise = np.random.normal(0, amplitude, samples) + return noise.astype(np.float32) + + def generate_pink_noise(self, duration: float, sample_rate: int = 16000, + amplitude: float = 0.1) -> np.ndarray: + """ + Generate pink noise (1/f noise). + + Args: + duration: Duration in seconds + sample_rate: Sample rate in Hz + amplitude: Noise amplitude + + Returns: + Numpy array of pink noise + """ + samples = int(duration * sample_rate) + + # Generate white noise + white = np.random.randn(samples) + + # Apply 1/f filter in frequency domain + freqs = np.fft.fftfreq(samples, 1/sample_rate) + freqs[0] = 1 # Avoid division by zero + + # 1/f filter + filter_response = 1.0 / np.sqrt(np.abs(freqs)) + filter_response[0] = 0 + + # Apply filter + white_fft = np.fft.fft(white) + pink_fft = white_fft * filter_response + pink = np.real(np.fft.ifft(pink_fft)) + + # Normalize and scale + pink = pink / np.std(pink) * amplitude + return pink.astype(np.float32) + + def generate_brown_noise(self, duration: float, sample_rate: int = 16000, + amplitude: float = 0.1) -> np.ndarray: + """ + Generate brown noise (1/f^2 noise). + + Args: + duration: Duration in seconds + sample_rate: Sample rate in Hz + amplitude: Noise amplitude + + Returns: + Numpy array of brown noise + """ + samples = int(duration * sample_rate) + + # Generate white noise and integrate (cumulative sum) + white = np.random.randn(samples) + brown = np.cumsum(white) + + # Normalize and scale + brown = brown / np.std(brown) * amplitude + return brown.astype(np.float32) + + def generate_gaussian_noise(self, duration: float, sample_rate: int = 16000, + amplitude: float = 0.1) -> np.ndarray: + """ + Generate Gaussian (normal distribution) noise. + + Args: + duration: Duration in seconds + sample_rate: Sample rate in Hz + amplitude: Noise amplitude (standard deviation) + + Returns: + Numpy array of Gaussian noise + """ + samples = int(duration * sample_rate) + noise = np.random.normal(0, amplitude, samples) + return noise.astype(np.float32) + + def generate_background_noise(self, duration: float, sample_rate: int = 16000, + amplitude: float = 0.05) -> np.ndarray: + """ + Generate realistic background noise (mixture of different noise types). 
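+
+ Illustrative usage (1 s at 16 kHz yields a float32 array of 16000 samples):
+     noise = NoiseGenerator().generate_background_noise(1.0, 16000, amplitude=0.05)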
+ + Args: + duration: Duration in seconds + sample_rate: Sample rate in Hz + amplitude: Noise amplitude + + Returns: + Numpy array of background noise + """ + # Mix different types of noise + white = self.generate_white_noise(duration, sample_rate, amplitude * 0.3) + pink = self.generate_pink_noise(duration, sample_rate, amplitude * 0.5) + + # Add some low-frequency rumble + t = np.linspace(0, duration, int(sample_rate * duration), False) + rumble = amplitude * 0.2 * np.sin(2 * np.pi * 60 * t) # 60 Hz hum + + background = white + pink + rumble + return background.astype(np.float32) + + def inject_noise(self, audio_data: bytes, noise_type: NoiseType, + noise_level: float = 0.1) -> bytes: + """ + Inject noise into existing audio data. + + Args: + audio_data: Original audio bytes (WAV format) + noise_type: Type of noise to inject + noise_level: Noise level relative to signal (0.0 to 1.0) + + Returns: + Audio bytes with noise injected + + Raises: + Exception: If noise injection fails + """ + try: + # Convert input audio to numpy + with wave.open(io.BytesIO(audio_data), 'rb') as wav_file: + frames = wav_file.readframes(wav_file.getnframes()) + sample_rate = wav_file.getframerate() + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + + if sample_width != 2: + raise Exception(f"Unsupported sample width: {sample_width}") + + audio_array = np.frombuffer(frames, dtype=np.int16) + + # Convert to float + audio_float = audio_array.astype(np.float32) / 32767.0 + + # Handle stereo + if channels == 2: + audio_float = audio_float.reshape(-1, 2) + # Process each channel separately + for ch in range(2): + channel_data = audio_float[:, ch] + duration = len(channel_data) / sample_rate + + # Generate appropriate noise + if noise_type == 'white': + noise = self.generate_white_noise(duration, sample_rate, noise_level) + elif noise_type == 'pink': + noise = self.generate_pink_noise(duration, sample_rate, noise_level) + elif noise_type == 'brown': + noise = self.generate_brown_noise(duration, sample_rate, noise_level) + elif noise_type == 'gaussian': + noise = self.generate_gaussian_noise(duration, sample_rate, noise_level) + elif noise_type == 'background': + noise = self.generate_background_noise(duration, sample_rate, noise_level) + else: + raise Exception(f"Unsupported noise type: {noise_type}") + + # Ensure same length + if len(noise) != len(channel_data): + noise = noise[:len(channel_data)] + + # Add noise + audio_float[:, ch] = channel_data + noise + + # Flatten back + audio_float = audio_float.flatten() + else: + # Mono processing + duration = len(audio_float) / sample_rate + + # Generate noise + if noise_type == 'white': + noise = self.generate_white_noise(duration, sample_rate, noise_level) + elif noise_type == 'pink': + noise = self.generate_pink_noise(duration, sample_rate, noise_level) + elif noise_type == 'brown': + noise = self.generate_brown_noise(duration, sample_rate, noise_level) + elif noise_type == 'gaussian': + noise = self.generate_gaussian_noise(duration, sample_rate, noise_level) + elif noise_type == 'background': + noise = self.generate_background_noise(duration, sample_rate, noise_level) + else: + raise Exception(f"Unsupported noise type: {noise_type}") + + # Ensure same length + if len(noise) != len(audio_float): + noise = noise[:len(audio_float)] + + # Add noise + audio_float = audio_float + noise + + # Clip to prevent overflow + audio_float = np.clip(audio_float, -1.0, 1.0) + + # Convert back to int16 + audio_int16 = (audio_float * 32767).astype(np.int16) + + 
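+ # Note: the clip to [-1.0, 1.0] above guarantees the int16 conversion cannot
+ # overflow or wrap, even when noise_level pushes peaks past full scale.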
# Create output WAV + output = io.BytesIO() + with wave.open(output, 'wb') as output_wav: + output_wav.setnchannels(channels) + output_wav.setsampwidth(sample_width) + output_wav.setframerate(sample_rate) + output_wav.writeframes(audio_int16.tobytes()) + + self.logger.debug(f"Injected {noise_type} noise at level {noise_level}") + return output.getvalue() + + except Exception as e: + self.logger.error(f"Noise injection failed: {str(e)}") + raise Exception(f"Failed to inject noise: {str(e)}") + + def create_pure_noise(self, noise_type: NoiseType, duration: float = 1.0, + sample_rate: int = 16000, amplitude: float = 0.3) -> bytes: + """ + Create pure noise audio file for testing. + + Args: + noise_type: Type of noise to generate + duration: Duration in seconds + sample_rate: Sample rate in Hz + amplitude: Noise amplitude + + Returns: + WAV audio bytes containing pure noise + """ + try: + # Generate noise + if noise_type == 'white': + noise = self.generate_white_noise(duration, sample_rate, amplitude) + elif noise_type == 'pink': + noise = self.generate_pink_noise(duration, sample_rate, amplitude) + elif noise_type == 'brown': + noise = self.generate_brown_noise(duration, sample_rate, amplitude) + elif noise_type == 'gaussian': + noise = self.generate_gaussian_noise(duration, sample_rate, amplitude) + elif noise_type == 'background': + noise = self.generate_background_noise(duration, sample_rate, amplitude) + else: + raise Exception(f"Unsupported noise type: {noise_type}") + + # Convert to int16 + noise_int16 = (np.clip(noise, -1.0, 1.0) * 32767).astype(np.int16) + + # Create WAV + output = io.BytesIO() + with wave.open(output, 'wb') as wav_file: + wav_file.setnchannels(1) # Mono + wav_file.setsampwidth(2) # 16-bit + wav_file.setframerate(sample_rate) + wav_file.writeframes(noise_int16.tobytes()) + + return output.getvalue() + + except Exception as e: + self.logger.error(f"Pure noise generation failed: {str(e)}") + raise Exception(f"Failed to create pure noise: {str(e)}") + +# Global noise generator instance +noise_generator = NoiseGenerator() \ No newline at end of file diff --git a/utils/session_manager.py b/utils/session_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4b98e348841119a4f5ca010e7715dffeec76a0 --- /dev/null +++ b/utils/session_manager.py @@ -0,0 +1,340 @@ +""" +Session Management for Audio Chunk Storage +Handles session creation, audio chunk saving, and folder organization +""" + +import os +import time +import uuid +import logging +import wave +import numpy as np +from typing import Dict, Optional, List +from pathlib import Path +import json +import threading + +logger = logging.getLogger(__name__) + +class SessionManager: + """ + Manages audio recording sessions with systematic file storage. + Each session gets a unique ID and folder for organized chunk storage. + """ + + def __init__(self, base_output_dir: str = "output"): + """ + Initialize session manager. + + Args: + base_output_dir: Base directory for all session outputs + """ + self.base_output_dir = Path(base_output_dir) + self.base_output_dir.mkdir(exist_ok=True) + + # Active sessions tracking + self.active_sessions: Dict[str, 'AudioSession'] = {} + self.lock = threading.Lock() + + logger.info(f"Session manager initialized with output directory: {self.base_output_dir}") + + def create_session(self, session_id: Optional[str] = None) -> str: + """ + Create a new audio recording session. 
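+
+ Illustrative usage (auto-generated IDs look like "session<timestamp>_<8-char uuid>"):
+     sid = session_manager.create_session()
+     session = session_manager.get_session(sid)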
+ + Args: + session_id: Optional custom session ID, otherwise auto-generated + + Returns: + str: Session ID + """ + if not session_id: + # Generate session ID with timestamp and short UUID + timestamp = int(time.time()) + short_uuid = str(uuid.uuid4())[:8] + session_id = f"session{timestamp}_{short_uuid}" + + with self.lock: + if session_id in self.active_sessions: + logger.warning(f"Session {session_id} already exists, returning existing session") + return session_id + + # Create session object + session = AudioSession(session_id, self.base_output_dir) + self.active_sessions[session_id] = session + + logger.info(f"Created new session: {session_id}") + return session_id + + def get_session(self, session_id: str) -> Optional['AudioSession']: + """Get an existing session by ID.""" + with self.lock: + return self.active_sessions.get(session_id) + + def close_session(self, session_id: str) -> bool: + """ + Close and finalize a session. + + Args: + session_id: Session to close + + Returns: + bool: True if session was closed successfully + """ + with self.lock: + if session_id not in self.active_sessions: + logger.warning(f"Session {session_id} not found") + return False + + session = self.active_sessions[session_id] + session.finalize() + del self.active_sessions[session_id] + + logger.info(f"Closed session: {session_id} ({session.chunk_count} chunks saved)") + return True + + def cleanup_old_sessions(self, max_age_hours: int = 24) -> int: + """ + Clean up sessions older than specified hours. + + Args: + max_age_hours: Maximum age in hours before cleanup + + Returns: + int: Number of sessions cleaned up + """ + cutoff_time = time.time() - (max_age_hours * 3600) + cleaned_count = 0 + + # Find old session folders + for session_dir in self.base_output_dir.iterdir(): + if not session_dir.is_dir() or not session_dir.name.startswith('session'): + continue + + try: + # Check if session has a metadata file with creation time + metadata_file = session_dir / "session_info.json" + if metadata_file.exists(): + with open(metadata_file, 'r') as f: + metadata = json.load(f) + if metadata.get('created_at', 0) < cutoff_time: + import shutil + shutil.rmtree(session_dir) + cleaned_count += 1 + logger.info(f"Cleaned up old session: {session_dir.name}") + else: + # Fallback to directory modification time + if session_dir.stat().st_mtime < cutoff_time: + import shutil + shutil.rmtree(session_dir) + cleaned_count += 1 + logger.info(f"Cleaned up old session: {session_dir.name}") + + except Exception as e: + logger.error(f"Error cleaning up session {session_dir.name}: {e}") + + if cleaned_count > 0: + logger.info(f"Cleaned up {cleaned_count} old sessions") + + return cleaned_count + + def get_session_stats(self) -> Dict: + """Get statistics about all sessions.""" + with self.lock: + stats = { + 'active_sessions': len(self.active_sessions), + 'total_chunks_active': sum(s.chunk_count for s in self.active_sessions.values()), + 'session_details': { + sid: { + 'chunk_count': session.chunk_count, + 'created_at': session.created_at, + 'folder_path': str(session.session_dir) + } + for sid, session in self.active_sessions.items() + } + } + + # Count total session folders + total_session_dirs = len([ + d for d in self.base_output_dir.iterdir() + if d.is_dir() and d.name.startswith('session') + ]) + stats['total_session_folders'] = total_session_dirs + + return stats + + +class AudioSession: + """ + Represents a single audio recording session with systematic chunk storage. 
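+
+ Illustrative on-disk layout for one session (timestamp/uuid values are examples):
+     output/session1700000000_ab12cd34/
+         session_info.json
+         chunks/001.wav
+         chunks/002.wav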
+ """ + + def __init__(self, session_id: str, base_output_dir: Path): + """ + Initialize audio session. + + Args: + session_id: Unique session identifier + base_output_dir: Base directory for output + """ + self.session_id = session_id + self.created_at = time.time() + self.chunk_count = 0 + + # Create session directory + self.session_dir = base_output_dir / session_id + self.session_dir.mkdir(exist_ok=True) + + # Create subdirectories + self.chunks_dir = self.session_dir / "chunks" + self.chunks_dir.mkdir(exist_ok=True) + + # Session metadata + self.metadata = { + 'session_id': session_id, + 'created_at': self.created_at, + 'created_at_human': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.created_at)), + 'chunk_count': 0, + 'chunks': [] + } + + self._save_metadata() + logger.info(f"Session folder created: {self.session_dir}") + + def save_audio_chunk(self, audio_data: bytes, prediction_result: Optional[Dict] = None, + chunk_type: str = "speech") -> str: + """ + Save an audio chunk to the session folder. + + Args: + audio_data: Raw audio bytes (WAV format preferred) + prediction_result: Optional prediction results to save alongside + chunk_type: Type of chunk ("speech", "vad_segment", "raw", etc.) + + Returns: + str: Path to saved chunk file + """ + self.chunk_count += 1 + + # Generate chunk filename + chunk_filename = f"{self.chunk_count:03d}.wav" + chunk_path = self.chunks_dir / chunk_filename + + try: + # Save audio data + if self._is_wav_format(audio_data): + # Already WAV format, save directly + with open(chunk_path, 'wb') as f: + f.write(audio_data) + logger.debug(f"Saved WAV chunk: {chunk_path}") + else: + # Convert raw PCM to WAV + self._save_pcm_as_wav(audio_data, chunk_path) + logger.debug(f"Converted and saved PCM chunk: {chunk_path}") + + # Update metadata + chunk_info = { + 'chunk_id': self.chunk_count, + 'filename': chunk_filename, + 'chunk_type': chunk_type, + 'size_bytes': len(audio_data), + 'saved_at': time.time(), + 'saved_at_human': time.strftime('%Y-%m-%d %H:%M:%S'), + 'audio_format': 'wav' if self._is_wav_format(audio_data) else 'pcm_converted' + } + + # Add prediction results if provided + if prediction_result: + chunk_info['prediction'] = prediction_result + + self.metadata['chunks'].append(chunk_info) + self.metadata['chunk_count'] = self.chunk_count + self._save_metadata() + + logger.info(f"Saved audio chunk {self.chunk_count}: {chunk_path}") + return str(chunk_path) + + except Exception as e: + logger.error(f"Failed to save audio chunk {self.chunk_count}: {e}") + # Rollback chunk count on failure + self.chunk_count -= 1 + raise + + def _is_wav_format(self, audio_data: bytes) -> bool: + """Check if audio data is in WAV format.""" + return audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:12] + + def _save_pcm_as_wav(self, pcm_data: bytes, output_path: Path, + sample_rate: int = 16000, channels: int = 1, sample_width: int = 2): + """ + Convert raw PCM data to WAV format and save. 
+ + Args: + pcm_data: Raw PCM bytes + output_path: Output WAV file path + sample_rate: Sample rate (default 16kHz for speech) + channels: Number of channels (default mono) + sample_width: Sample width in bytes (default 16-bit) + """ + try: + with wave.open(str(output_path), 'wb') as wav_file: + wav_file.setnchannels(channels) + wav_file.setsampwidth(sample_width) + wav_file.setframerate(sample_rate) + wav_file.writeframes(pcm_data) + + except Exception as e: + logger.error(f"PCM to WAV conversion failed: {e}") + # Fallback: save as raw PCM with .pcm extension + raw_path = output_path.with_suffix('.pcm') + with open(raw_path, 'wb') as f: + f.write(pcm_data) + logger.warning(f"Saved as raw PCM instead: {raw_path}") + + def _save_metadata(self): + """Save session metadata to JSON file.""" + try: + metadata_path = self.session_dir / "session_info.json" + with open(metadata_path, 'w') as f: + json.dump(self.metadata, f, indent=2, default=str) + except Exception as e: + logger.error(f"Failed to save session metadata: {e}") + + def finalize(self): + """Finalize the session and save final metadata.""" + self.metadata['finalized_at'] = time.time() + self.metadata['finalized_at_human'] = time.strftime('%Y-%m-%d %H:%M:%S') + self.metadata['final_chunk_count'] = self.chunk_count + self._save_metadata() + + logger.info(f"📋 Finalized session {self.session_id}: {self.chunk_count} chunks saved") + + def get_chunk_list(self) -> List[str]: + """Get list of all chunk files in order.""" + chunk_files = [] + for i in range(1, self.chunk_count + 1): + chunk_file = self.chunks_dir / f"{i:03d}.wav" + if chunk_file.exists(): + chunk_files.append(str(chunk_file)) + else: + # Check for .pcm fallback + pcm_file = self.chunks_dir / f"{i:03d}.pcm" + if pcm_file.exists(): + chunk_files.append(str(pcm_file)) + return chunk_files + + def get_session_summary(self) -> Dict: + """Get comprehensive session summary.""" + return { + 'session_id': self.session_id, + 'created_at': self.created_at, + 'chunk_count': self.chunk_count, + 'session_dir': str(self.session_dir), + 'chunks_dir': str(self.chunks_dir), + 'chunk_files': self.get_chunk_list(), + 'metadata': self.metadata + } + + +# Global session manager instance +session_manager = SessionManager() \ No newline at end of file diff --git a/utils/vad.py b/utils/vad.py new file mode 100644 index 0000000000000000000000000000000000000000..b68982a1d200c84f57a4e29fdf054fb8ee77473e --- /dev/null +++ b/utils/vad.py @@ -0,0 +1,149 @@ +""" +Voice Activity Detection (VAD) for streaming audio processing +Detects speech segments and trims silence +""" + +import numpy as np +import logging + +logger = logging.getLogger(__name__) + +class VoiceActivityDetector: + """Simple voice activity detector based on energy and zero-crossing rate.""" + + def __init__(self): + self.sample_rate = 16000 + self.frame_size = 512 # ~32ms frames at 16kHz + self.hop_size = 256 # 50% overlap + + # VAD thresholds + self.energy_threshold = 0.01 # Minimum energy for speech + self.zcr_threshold = 0.3 # Zero crossing rate threshold + self.min_speech_frames = 5 # Minimum frames for speech detection + self.min_silence_frames = 8 # Minimum silence frames to end speech + + # State tracking + self.is_speech_active = False + self.speech_frames = 0 + self.silence_frames = 0 + self.speech_buffer = [] + + logger.info("Voice Activity Detector initialized") + + def reset(self): + """Reset VAD state.""" + self.is_speech_active = False + self.speech_frames = 0 + self.silence_frames = 0 + self.speech_buffer = [] + + def 
compute_energy(self, frame): + """Compute energy of audio frame.""" + return np.mean(frame ** 2) + + def compute_zcr(self, frame): + """Compute zero crossing rate of audio frame.""" + zcr = np.sum(np.abs(np.diff(np.sign(frame)))) / (2 * len(frame)) + return zcr + + def is_speech_frame(self, frame): + """Determine if frame contains speech.""" + energy = self.compute_energy(frame) + zcr = self.compute_zcr(frame) + + # Simple rule: speech has moderate energy and ZCR + has_energy = energy > self.energy_threshold + has_reasonable_zcr = zcr < self.zcr_threshold + + return has_energy and has_reasonable_zcr + + def process_chunk(self, audio_data): + """ + Process audio chunk and return speech segments. + + Args: + audio_data: numpy array of audio samples + + Returns: + List of (start_sample, end_sample) tuples for speech segments + """ + if len(audio_data) == 0: + return [] + + speech_segments = [] + num_frames = (len(audio_data) - self.frame_size) // self.hop_size + 1 + + for i in range(num_frames): + start_idx = i * self.hop_size + end_idx = start_idx + self.frame_size + + if end_idx > len(audio_data): + break + + frame = audio_data[start_idx:end_idx] + is_speech = self.is_speech_frame(frame) + + if is_speech: + self.speech_frames += 1 + self.silence_frames = 0 + + if not self.is_speech_active and self.speech_frames >= self.min_speech_frames: + # Speech started + self.is_speech_active = True + self.speech_start_idx = max(0, start_idx - self.min_speech_frames * self.hop_size) + logger.debug(f"Speech started at sample {self.speech_start_idx}") + + else: + self.silence_frames += 1 + + if self.is_speech_active and self.silence_frames >= self.min_silence_frames: + # Speech ended + speech_end_idx = start_idx + speech_segments.append((self.speech_start_idx, speech_end_idx)) + logger.debug(f"Speech ended at sample {speech_end_idx}") + + # Reset for next speech segment + self.is_speech_active = False + self.speech_frames = 0 + self.silence_frames = 0 + + return speech_segments + + def extract_speech_segments(self, audio_data, segments): + """Extract speech segments from audio data.""" + speech_chunks = [] + + for start_idx, end_idx in segments: + if end_idx > start_idx: + segment = audio_data[start_idx:end_idx] + # Trim silence from edges + segment = self.trim_silence(segment) + if len(segment) > self.sample_rate * 0.3: # At least 300ms + speech_chunks.append(segment) + + return speech_chunks + + def trim_silence(self, audio_data, silence_threshold=0.01): + """Trim silence from beginning and end of audio.""" + if len(audio_data) == 0: + return audio_data + + # Find first and last non-silent samples + energy = audio_data ** 2 + non_silent = energy > silence_threshold + + if not np.any(non_silent): + return audio_data # All silence, return as is + + first_sound = np.argmax(non_silent) + last_sound = len(non_silent) - np.argmax(non_silent[::-1]) - 1 + + return audio_data[first_sound:last_sound + 1] + + def get_current_speech_segment(self, audio_data): + """Get current ongoing speech segment if any.""" + if self.is_speech_active and len(audio_data) > 0: + current_segment = audio_data[self.speech_start_idx:] + if len(current_segment) > self.sample_rate * 0.5: # At least 500ms + return self.trim_silence(current_segment) + return None \ No newline at end of file diff --git a/utils/vad_feature_integration.py b/utils/vad_feature_integration.py new file mode 100644 index 0000000000000000000000000000000000000000..5583a1da98375aa31edca7bf591a1647f32f6d95 --- /dev/null +++ b/utils/vad_feature_integration.py @@ -0,0 
+1,483 @@ +""" +Integration module for WebRTC VAD with MFCC and Spectrogram processors +Combines voice activity detection with real-time feature extraction +""" + +import numpy as np +import librosa +import logging +from typing import Dict, List, Optional, Tuple +import time +from collections import deque +import threading +import queue + +from utils.webrtc_vad import WebRTCVADProcessor +from audio_processors.mfcc_processor import MFCCProcessor +from audio_processors.mel_spectrogram import MelSpectrogramProcessor +from audio_processors.raw_spectrogram import RawSpectrogramProcessor + +logger = logging.getLogger(__name__) + +class StreamingFeatureExtractor: + """ + Real-time feature extraction with VAD integration. + Combines WebRTC VAD with MFCC, Mel Spectrogram, and Raw Spectrogram processing. + """ + + def __init__(self, sample_rate=16000, n_mfcc=13, n_fft=2048, hop_length=512): + """ + Initialize streaming feature extractor. + + Args: + sample_rate: Audio sample rate + n_mfcc: Number of MFCC coefficients + n_fft: FFT window size + hop_length: Hop length for STFT + """ + self.sample_rate = sample_rate + self.n_mfcc = n_mfcc + self.n_fft = n_fft + self.hop_length = hop_length + + # Initialize VAD processor + self.vad_processor = WebRTCVADProcessor( + aggressiveness=2, + sample_rate=sample_rate, + frame_duration=30 + ) + + # Initialize feature processors + self.mfcc_processor = MFCCProcessor() + self.mel_processor = MelSpectrogramProcessor() + self.raw_spec_processor = RawSpectrogramProcessor() + + # Buffers for overlapped processing + self.audio_buffer = deque(maxlen=sample_rate * 2) # 2 second buffer + self.feature_buffer = deque(maxlen=100) # Store recent feature vectors + + # Threading for real-time processing + self.processing_queue = queue.Queue() + self.feature_queue = queue.Queue() + self.is_processing = False + self.processing_thread = None + + # Statistics + self.total_chunks_processed = 0 + self.features_extracted = 0 + self.speech_segments_processed = 0 + + logger.info("Streaming Feature Extractor initialized") + + def extract_features_realtime(self, audio_chunk: bytes) -> Dict[str, np.ndarray]: + """ + Extract features from streaming audio chunk with VAD. + + Args: + audio_chunk: Raw audio bytes + + Returns: + dict: Extracted features for detected speech segments + """ + # Process with VAD first + speech_segments = self.vad_processor.process_audio_chunk(audio_chunk) + + features_list = [] + + for segment in speech_segments: + # Convert bytes to numpy array + audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0 + + # Extract comprehensive features + features = self._compute_streaming_features(audio_array) + + if features: + features_list.append(features) + self.features_extracted += 1 + + self.total_chunks_processed += 1 + + if speech_segments: + self.speech_segments_processed += len(speech_segments) + logger.debug(f"Extracted features from {len(speech_segments)} speech segments") + + return features_list + + def _compute_streaming_features(self, audio_data: np.ndarray) -> Optional[Dict[str, np.ndarray]]: + """ + Compute comprehensive feature set optimized for streaming. 
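+
+ Illustrative access (`extractor` stands for a StreamingFeatureExtractor instance;
+ 'mfcc_mean' has length n_mfcc and 'mel_spec_mean' length 40 with the defaults below):
+     feats = extractor._compute_streaming_features(audio_array)
+     vec = np.concatenate([feats['mfcc_mean'], feats['mel_spec_mean']])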
+ + Args: + audio_data: Audio samples as numpy array + + Returns: + dict: Feature dictionary or None if extraction fails + """ + try: + if len(audio_data) < self.n_fft: + logger.debug("Audio segment too short for feature extraction") + return None + + features = {} + + # Core MFCC features + mfccs = librosa.feature.mfcc( + y=audio_data, + sr=self.sample_rate, + n_mfcc=self.n_mfcc, + n_fft=self.n_fft, + hop_length=self.hop_length + ) + + # Statistical summaries for streaming + features['mfcc_mean'] = np.mean(mfccs, axis=1) + features['mfcc_std'] = np.std(mfccs, axis=1) + features['mfcc_delta'] = np.mean(librosa.feature.delta(mfccs), axis=1) + features['mfcc_delta2'] = np.mean(librosa.feature.delta(mfccs, order=2), axis=1) + + # Spectral features + features['spectral_centroid'] = np.mean( + librosa.feature.spectral_centroid(y=audio_data, sr=self.sample_rate) + ) + features['spectral_bandwidth'] = np.mean( + librosa.feature.spectral_bandwidth(y=audio_data, sr=self.sample_rate) + ) + features['spectral_rolloff'] = np.mean( + librosa.feature.spectral_rolloff(y=audio_data, sr=self.sample_rate) + ) + features['zero_crossing_rate'] = np.mean( + librosa.feature.zero_crossing_rate(audio_data) + ) + + # Energy features + features['rms_energy'] = np.mean(librosa.feature.rms(y=audio_data)) + + # Mel spectrogram features + mel_spec = librosa.feature.melspectrogram( + y=audio_data, + sr=self.sample_rate, + n_mels=40, # Reduced for streaming + n_fft=self.n_fft, + hop_length=self.hop_length + ) + features['mel_spec_mean'] = np.mean(mel_spec, axis=1) + features['mel_spec_std'] = np.std(mel_spec, axis=1) + + # Raw spectrogram features + stft = librosa.stft(audio_data, n_fft=self.n_fft, hop_length=self.hop_length) + magnitude_spec = np.abs(stft) + features['raw_spec_mean'] = np.mean(magnitude_spec, axis=1) + features['raw_spec_std'] = np.std(magnitude_spec, axis=1) + + # Harmonic and percussive components + harmonic, percussive = librosa.effects.hpss(audio_data) + features['harmonic_ratio'] = np.mean(harmonic ** 2) / (np.mean(audio_data ** 2) + 1e-8) + features['percussive_ratio'] = np.mean(percussive ** 2) / (np.mean(audio_data ** 2) + 1e-8) + + # Tempo and rhythm features (simplified for streaming) + tempo, _ = librosa.beat.beat_track(y=audio_data, sr=self.sample_rate) + features['tempo'] = tempo + + # Add metadata + features['_metadata'] = { + 'duration': len(audio_data) / self.sample_rate, + 'sample_rate': self.sample_rate, + 'n_samples': len(audio_data), + 'extraction_timestamp': time.time() + } + + return features + + except Exception as e: + logger.error(f"Feature extraction error: {e}") + return None + + def extract_mfcc_features(self, audio_data: np.ndarray) -> Optional[np.ndarray]: + """ + Extract only MFCC features for lightweight processing. + + Args: + audio_data: Audio samples + + Returns: + np.ndarray: MFCC feature vector + """ + try: + mfccs = librosa.feature.mfcc( + y=audio_data, + sr=self.sample_rate, + n_mfcc=self.n_mfcc, + n_fft=self.n_fft, + hop_length=self.hop_length + ) + return np.mean(mfccs, axis=1) + except Exception as e: + logger.error(f"MFCC extraction error: {e}") + return None + + def extract_spectrogram_features(self, audio_data: np.ndarray) -> Optional[Dict[str, np.ndarray]]: + """ + Extract spectrogram-based features. 
+ + Args: + audio_data: Audio samples + + Returns: + dict: Spectrogram features + """ + try: + # Mel spectrogram + mel_spec = librosa.feature.melspectrogram( + y=audio_data, + sr=self.sample_rate, + n_mels=80, + n_fft=self.n_fft, + hop_length=self.hop_length + ) + + # Raw spectrogram + stft = librosa.stft(audio_data, n_fft=self.n_fft, hop_length=self.hop_length) + magnitude_spec = np.abs(stft) + + return { + 'mel_spectrogram': mel_spec, + 'mel_spec_db': librosa.power_to_db(mel_spec), + 'raw_spectrogram': magnitude_spec, + 'raw_spec_db': librosa.amplitude_to_db(magnitude_spec) + } + except Exception as e: + logger.error(f"Spectrogram extraction error: {e}") + return None + + def process_with_vad_and_features(self, audio_chunk: bytes, feature_type: str = 'all') -> List[Dict]: + """ + Process audio chunk with VAD and extract specified features. + + Args: + audio_chunk: Raw audio bytes + feature_type: Type of features to extract ('mfcc', 'spectrogram', 'all') + + Returns: + List[dict]: Feature results for each speech segment + """ + # Get speech segments from VAD + speech_segments = self.vad_processor.process_audio_chunk(audio_chunk) + + results = [] + + for i, segment in enumerate(speech_segments): + # Convert to numpy array + audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0 + + segment_result = { + 'segment_index': i, + 'segment_duration': len(audio_array) / self.sample_rate, + 'segment_samples': len(audio_array) + } + + # Extract requested features + if feature_type == 'mfcc': + mfcc_features = self.extract_mfcc_features(audio_array) + if mfcc_features is not None: + segment_result['mfcc'] = mfcc_features + + elif feature_type == 'spectrogram': + spec_features = self.extract_spectrogram_features(audio_array) + if spec_features is not None: + segment_result.update(spec_features) + + elif feature_type == 'all': + comprehensive_features = self._compute_streaming_features(audio_array) + if comprehensive_features is not None: + segment_result.update(comprehensive_features) + + results.append(segment_result) + + return results + + def start_streaming_processing(self): + """Start background thread for streaming processing.""" + if self.is_processing: + return + + self.is_processing = True + self.processing_thread = threading.Thread(target=self._streaming_worker, daemon=True) + self.processing_thread.start() + logger.info("Started streaming feature processing") + + def stop_streaming_processing(self): + """Stop background streaming processing.""" + self.is_processing = False + if self.processing_thread: + self.processing_thread.join(timeout=1.0) + logger.info("Stopped streaming feature processing") + + def add_audio_chunk(self, audio_chunk: bytes, feature_type: str = 'all'): + """ + Add audio chunk to processing queue. + + Args: + audio_chunk: Raw audio bytes + feature_type: Type of features to extract + """ + if self.is_processing: + try: + self.processing_queue.put_nowait((audio_chunk, feature_type)) + except queue.Full: + logger.warning("Processing queue full, dropping chunk") + + def get_feature_results(self) -> List[Dict]: + """ + Get all available feature extraction results. 
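+
+ Illustrative polling loop (assumes start_streaming_processing() is running and
+ chunks are fed through add_audio_chunk):
+     extractor.add_audio_chunk(chunk_bytes)
+     for result in extractor.get_feature_results():
+         print(result.get('segment_duration'), result.get('processing_time'))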
+ + Returns: + List[dict]: Available feature results + """ + results = [] + try: + while True: + result = self.feature_queue.get_nowait() + results.append(result) + except queue.Empty: + pass + return results + + def _streaming_worker(self): + """Background worker for streaming feature processing.""" + while self.is_processing: + try: + # Get audio chunk with timeout + audio_chunk, feature_type = self.processing_queue.get(timeout=0.1) + + # Process chunk + start_time = time.time() + results = self.process_with_vad_and_features(audio_chunk, feature_type) + processing_time = time.time() - start_time + + # Add processing metadata + for result in results: + result['processing_time'] = processing_time + result['timestamp'] = time.time() + + # Add results to output queue + for result in results: + try: + self.feature_queue.put_nowait(result) + except queue.Full: + logger.warning("Feature queue full, dropping result") + + except queue.Empty: + continue + except Exception as e: + logger.error(f"Streaming feature processing error: {e}") + + def get_stats(self) -> Dict: + """ + Get feature extraction statistics. + + Returns: + dict: Processing statistics + """ + vad_stats = self.vad_processor.get_stats() + + return { + 'total_chunks_processed': self.total_chunks_processed, + 'features_extracted': self.features_extracted, + 'speech_segments_processed': self.speech_segments_processed, + 'vad_stats': vad_stats, + 'is_processing': self.is_processing, + 'queue_sizes': { + 'processing_queue': self.processing_queue.qsize(), + 'feature_queue': self.feature_queue.qsize() + } + } + + def reset_state(self): + """Reset all processing state.""" + self.vad_processor.reset_state() + self.audio_buffer.clear() + self.feature_buffer.clear() + + # Clear queues + while not self.processing_queue.empty(): + try: + self.processing_queue.get_nowait() + except queue.Empty: + break + + while not self.feature_queue.empty(): + try: + self.feature_queue.get_nowait() + except queue.Empty: + break + + logger.info("Feature extractor state reset") + +class VADMFCCProcessor: + """ + Simplified VAD + MFCC processor for digit recognition. + Optimized for low-latency real-time processing. + """ + + def __init__(self, sample_rate=16000, n_mfcc=13): + """Initialize VAD + MFCC processor.""" + self.sample_rate = sample_rate + self.n_mfcc = n_mfcc + + self.vad_processor = WebRTCVADProcessor( + aggressiveness=1, # Less aggressive for better digit detection + sample_rate=sample_rate, + frame_duration=30 + ) + + self.features_extracted = 0 + + logger.info("VAD-MFCC processor initialized") + + def process_audio_for_digit_recognition(self, audio_chunk: bytes) -> List[np.ndarray]: + """ + Process audio chunk and extract MFCC features from speech segments. 
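+
+ Illustrative usage (`proc` is a VADMFCCProcessor and `classifier` is a placeholder
+ for whatever digit model consumes the n_mfcc-dimensional vectors returned below):
+     for mfcc_vec in proc.process_audio_for_digit_recognition(chunk_bytes):
+         digit = classifier.predict(mfcc_vec.reshape(1, -1))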
+ + Args: + audio_chunk: Raw audio bytes + + Returns: + List[np.ndarray]: MFCC feature vectors for each speech segment + """ + # Get speech segments + speech_segments = self.vad_processor.process_audio_chunk(audio_chunk) + + mfcc_features = [] + + for segment in speech_segments: + # Convert to numpy array + audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0 + + # Extract MFCC features + try: + mfccs = librosa.feature.mfcc( + y=audio_array, + sr=self.sample_rate, + n_mfcc=self.n_mfcc, + n_fft=1024, # Smaller FFT for faster processing + hop_length=256 + ) + + # Use mean across time for simplicity + mfcc_mean = np.mean(mfccs, axis=1) + mfcc_features.append(mfcc_mean) + self.features_extracted += 1 + + except Exception as e: + logger.error(f"MFCC extraction failed: {e}") + + return mfcc_features + + def get_stats(self) -> Dict: + """Get processing statistics.""" + vad_stats = self.vad_processor.get_stats() + + return { + 'features_extracted': self.features_extracted, + 'vad_stats': vad_stats + } \ No newline at end of file diff --git a/utils/webm_converter.py b/utils/webm_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..ea653bf39312c4c5c035301053d58f34c511ab36 --- /dev/null +++ b/utils/webm_converter.py @@ -0,0 +1,127 @@ +""" +WebM to WAV converter without FFmpeg dependency +Uses basic audio processing for WebM/OGG streams +""" + +import logging +import io +import struct +from typing import Optional + +logger = logging.getLogger(__name__) + +def convert_webm_to_wav(webm_data: bytes) -> Optional[bytes]: + """ + Convert WebM audio data to WAV format. + This is a simplified converter for basic WebM streams. + + Args: + webm_data: Raw WebM audio bytes + + Returns: + WAV audio bytes or None if conversion fails + """ + try: + return create_fallback_wav(webm_data) + + except Exception as e: + logger.error(f"WebM conversion failed: {str(e)}") + return None + +def create_fallback_wav(webm_data): + """Properly convert WebM to WAV using subprocess""" + import subprocess + import tempfile + import os + + webm_path = None + wav_path = None + + try: + # Write WebM data to temp file + with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as webm_file: + webm_file.write(webm_data) + webm_path = webm_file.name + + # Output WAV path + wav_path = webm_path.replace('.webm', '.wav') + + # Use ffmpeg directly via subprocess + cmd = [ + 'ffmpeg', + '-i', webm_path, + '-ar', '16000', + '-ac', '1', + '-f', 'wav', + '-acodec', 'pcm_s16le', + wav_path, + '-y' + ] + + result = subprocess.run(cmd, capture_output=True, timeout=5) + + if result.returncode == 0 and os.path.exists(wav_path): + with open(wav_path, 'rb') as f: + wav_data = f.read() + + logger.info(f"Successfully converted WebM to WAV: {len(wav_data)} bytes") + return wav_data + else: + logger.error(f"FFmpeg conversion failed: {result.stderr.decode()}") + return None + + except Exception as e: + logger.error(f"WebM conversion error: {e}") + return None + finally: + # Cleanup temp files + for path in [webm_path, wav_path]: + if path and os.path.exists(path): + try: + os.unlink(path) + except: + pass + +def create_wav_header(data_size: int, sample_rate: int = 16000, channels: int = 1, bits_per_sample: int = 16) -> bytes: + """Create a standard WAV file header.""" + + # WAV file header structure + header = bytearray(44) + + # RIFF chunk descriptor + header[0:4] = b'RIFF' + header[4:8] = struct.pack(' str: + """Detect audio format from header bytes.""" + if len(data) < 8: + return 'unknown' + + 
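+ # Header magic bytes checked below: 'RIFF'+'WAVE' (WAV), 'OggS' (Ogg),
+ # the 0x1A45DFA3 EBML signature (WebM/Matroska), and 'ID3' tag or
+ # 0xFFFB/0xFFF3 frame sync (MP3).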
# Check for various audio formats + if data.startswith(b'RIFF') and b'WAVE' in data[:12]: + return 'wav' + elif data.startswith(b'OggS'): + return 'ogg' + elif data.startswith(b'\x1a\x45\xdf\xa3'): + return 'webm' + elif data.startswith(b'ID3') or data.startswith(b'\xff\xfb') or data.startswith(b'\xff\xf3'): + return 'mp3' + else: + return 'unknown' \ No newline at end of file diff --git a/utils/webrtc_vad.py b/utils/webrtc_vad.py new file mode 100644 index 0000000000000000000000000000000000000000..3ff0ade7f8c10fd288c5f09133267450d5584cd4 --- /dev/null +++ b/utils/webrtc_vad.py @@ -0,0 +1,442 @@ +""" +WebRTC VAD implementation for streaming audio processing +Provides high-performance voice activity detection with proper audio chunking +""" + +import webrtcvad +import collections +import numpy as np +import logging +from typing import List, Tuple, Optional, Generator +import struct +import threading +import queue +import time + +logger = logging.getLogger(__name__) + +class WebRTCVADProcessor: + """ + WebRTC-based Voice Activity Detection processor for streaming audio. + + Features: + - Real-time VAD processing with WebRTC library + - Proper audio chunking and buffering + - Speech segment detection and extraction + - Thread-safe operation for streaming applications + """ + + def __init__(self, aggressiveness=2, sample_rate=16000, frame_duration=30): + """ + Initialize WebRTC VAD processor. + + Args: + aggressiveness: VAD aggressiveness mode (0-3, higher = more aggressive) + sample_rate: Audio sample rate (8000, 16000, 32000, or 48000 Hz) + frame_duration: Frame duration in milliseconds (10, 20, or 30 ms) + """ + self.vad = webrtcvad.Vad(aggressiveness) + self.sample_rate = sample_rate + self.frame_duration = frame_duration + self.frame_size = int(sample_rate * frame_duration / 1000) + + # Circular buffer for frame management + self.ring_buffer_size = max(10, int(500 / frame_duration)) # ~500ms buffer + self.ring_buffer = collections.deque(maxlen=self.ring_buffer_size) + + # State tracking + self.triggered = False + self.speech_buffer = collections.deque() + self.is_recording = False + self.current_utterance_start = None + + # Configuration parameters + self.silence_threshold = 0.8 # Ratio of silence frames to trigger end + self.speech_threshold = 0.5 # Ratio of speech frames to trigger start + self.min_speech_duration = 0.5 # Minimum speech duration in seconds + self.max_speech_duration = 10.0 # Maximum speech duration in seconds + self.max_silence_duration = 2.0 # Maximum silence before reset + + # Performance tracking + self.total_frames_processed = 0 + self.speech_frames_detected = 0 + self.segments_extracted = 0 + + # Thread-safe queue for streaming chunks + self.audio_queue = queue.Queue() + self.output_queue = queue.Queue() + self.processing = False + + logger.info(f"WebRTC VAD initialized: aggressiveness={aggressiveness}, " + f"sample_rate={sample_rate}Hz, frame_duration={frame_duration}ms") + + def reset_state(self): + """Reset VAD state for new processing session.""" + self.triggered = False + self.is_recording = False + self.ring_buffer.clear() + self.speech_buffer.clear() + self.current_utterance_start = None + logger.debug("VAD state reset") + + def convert_audio_to_frames(self, audio_data: bytes) -> Generator[bytes, None, None]: + """ + Convert audio data to properly sized frames for WebRTC VAD. 
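+
+ Sizing example: with the 16 kHz default and frame_duration=30 ms, frame_size is
+ 480 samples, so each yielded frame is 960 bytes of 16-bit mono PCM.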
+ + Args: + audio_data: Raw audio bytes (16-bit PCM) + + Yields: + bytes: Frame data suitable for VAD processing + """ + frame_size_bytes = self.frame_size * 2 # 16-bit = 2 bytes per sample + + for i in range(0, len(audio_data) - frame_size_bytes + 1, frame_size_bytes): + frame = audio_data[i:i + frame_size_bytes] + if len(frame) == frame_size_bytes: + yield frame + + def is_speech_frame(self, frame: bytes) -> bool: + """ + Determine if a frame contains speech using WebRTC VAD. + + Args: + frame: Audio frame bytes + + Returns: + bool: True if frame contains speech + """ + try: + if len(frame) != self.frame_size * 2: + return False + return self.vad.is_speech(frame, self.sample_rate) + except Exception as e: + logger.warning(f"VAD frame analysis failed: {e}") + return False + + def process_audio_chunk(self, audio_data: bytes) -> List[bytes]: + """ + Process audio chunk and return complete speech segments. + + Args: + audio_data: Raw audio bytes (16-bit PCM) + + Returns: + List[bytes]: List of detected speech segments + """ + speech_segments = [] + + for frame in self.convert_audio_to_frames(audio_data): + self.total_frames_processed += 1 + is_speech = self.is_speech_frame(frame) + + if is_speech: + self.speech_frames_detected += 1 + + # Process frame through VAD collector + collected_audio = self._vad_collector_step(frame, is_speech) + + if collected_audio is not None: + # Complete speech segment detected + speech_segments.append(collected_audio) + self.segments_extracted += 1 + logger.debug(f"Speech segment extracted: {len(collected_audio)} bytes") + + return speech_segments + + def _vad_collector_step(self, frame: bytes, is_speech: bool) -> Optional[bytes]: + """ + Single step of VAD collection algorithm. + + Args: + frame: Audio frame + is_speech: Whether frame contains speech + + Returns: + bytes: Complete speech segment if detected, None otherwise + """ + if not self.triggered: + # Not currently in speech mode + self.ring_buffer.append((frame, is_speech)) + num_voiced = sum(1 for f, speech in self.ring_buffer if speech) + + # Check if we should trigger speech detection + if len(self.ring_buffer) == self.ring_buffer.maxlen: + if num_voiced >= self.speech_threshold * self.ring_buffer.maxlen: + self.triggered = True + self.is_recording = True + self.current_utterance_start = time.time() + + # Output buffered frames to start speech segment + self.speech_buffer.clear() + for f, s in self.ring_buffer: + self.speech_buffer.append(f) + + self.ring_buffer.clear() + logger.debug("Speech triggered - starting collection") + + else: + # Currently in speech mode + self.speech_buffer.append(frame) + self.ring_buffer.append((frame, is_speech)) + + # Check duration limits + if self.current_utterance_start: + utterance_duration = time.time() - self.current_utterance_start + + if utterance_duration > self.max_speech_duration: + # Force end due to maximum duration + logger.debug("Speech segment ended due to max duration") + return self._finalize_speech_segment() + + # Check for end of speech + if len(self.ring_buffer) == self.ring_buffer.maxlen: + num_unvoiced = sum(1 for f, speech in self.ring_buffer if not speech) + + if num_unvoiced >= self.silence_threshold * self.ring_buffer.maxlen: + # End of speech detected + logger.debug("Speech segment ended due to silence") + return self._finalize_speech_segment() + + return None + + def _finalize_speech_segment(self) -> Optional[bytes]: + """ + Finalize and return current speech segment. 
+ + Returns: + bytes: Complete speech segment or None if too short + """ + if not self.speech_buffer: + self.triggered = False + self.is_recording = False + return None + + # Calculate duration + total_frames = len(self.speech_buffer) + duration = total_frames * self.frame_duration / 1000.0 + + # Apply stricter minimum duration filter (0.1s minimum) + min_duration = max(self.min_speech_duration, 0.1) # At least 100ms + + # Check minimum duration + if duration < min_duration: + logger.debug(f"Speech segment too short: {duration:.2f}s < {min_duration}s") + self.triggered = False + self.is_recording = False + self.speech_buffer.clear() + self.ring_buffer.clear() + return None + + # Create complete audio segment + speech_data = b''.join(self.speech_buffer) + + # Reset state + self.triggered = False + self.is_recording = False + self.speech_buffer.clear() + self.ring_buffer.clear() + self.current_utterance_start = None + + logger.info(f"Speech segment finalized: {duration:.2f}s, {len(speech_data)} bytes") + return speech_data + + def process_numpy_audio(self, audio_array: np.ndarray) -> List[bytes]: + """ + Process numpy audio array. + + Args: + audio_array: Audio data as numpy array (float32, -1 to 1 range) + + Returns: + List[bytes]: List of detected speech segments + """ + # Convert to 16-bit PCM bytes + if audio_array.dtype != np.int16: + # Normalize and convert to int16 + audio_normalized = np.clip(audio_array, -1.0, 1.0) + audio_int16 = (audio_normalized * 32767).astype(np.int16) + else: + audio_int16 = audio_array + + # Convert to bytes + audio_bytes = audio_int16.tobytes() + + return self.process_audio_chunk(audio_bytes) + + def get_current_segment(self) -> Optional[bytes]: + """ + Get current ongoing speech segment if any. + + Returns: + bytes: Current speech segment or None + """ + if self.is_recording and self.speech_buffer: + current_duration = len(self.speech_buffer) * self.frame_duration / 1000.0 + if current_duration >= self.min_speech_duration: + return b''.join(self.speech_buffer) + return None + + def start_streaming_processing(self): + """Start background thread for streaming audio processing.""" + if self.processing: + return + + self.processing = True + self.processing_thread = threading.Thread(target=self._streaming_worker, daemon=True) + self.processing_thread.start() + logger.info("Started streaming VAD processing") + + def stop_streaming_processing(self): + """Stop background streaming processing.""" + self.processing = False + if hasattr(self, 'processing_thread'): + self.processing_thread.join(timeout=1.0) + logger.info("Stopped streaming VAD processing") + + def add_audio_chunk(self, audio_data: bytes): + """ + Add audio chunk to processing queue (thread-safe). + + Args: + audio_data: Raw audio bytes + """ + if self.processing: + try: + self.audio_queue.put_nowait(audio_data) + except queue.Full: + logger.warning("Audio queue full, dropping chunk") + + def get_speech_segments(self) -> List[bytes]: + """ + Get all available speech segments from processing queue. 
+ + Returns: + List[bytes]: Available speech segments + """ + segments = [] + try: + while True: + segment = self.output_queue.get_nowait() + segments.append(segment) + except queue.Empty: + pass + return segments + + def _streaming_worker(self): + """Background worker for streaming audio processing.""" + while self.processing: + try: + # Get audio chunk with timeout + audio_chunk = self.audio_queue.get(timeout=0.1) + + # Process chunk + segments = self.process_audio_chunk(audio_chunk) + + # Add segments to output queue + for segment in segments: + try: + self.output_queue.put_nowait(segment) + except queue.Full: + logger.warning("Output queue full, dropping segment") + + except queue.Empty: + continue + except Exception as e: + logger.error(f"Streaming processing error: {e}") + + def get_stats(self) -> dict: + """ + Get VAD processing statistics. + + Returns: + dict: Processing statistics + """ + return { + 'total_frames_processed': self.total_frames_processed, + 'speech_frames_detected': self.speech_frames_detected, + 'segments_extracted': self.segments_extracted, + 'speech_ratio': ( + self.speech_frames_detected / max(1, self.total_frames_processed) + ), + 'is_recording': self.is_recording, + 'triggered': self.triggered, + 'buffer_size': len(self.speech_buffer), + 'ring_buffer_size': len(self.ring_buffer), + 'configuration': { + 'sample_rate': self.sample_rate, + 'frame_duration': self.frame_duration, + 'min_speech_duration': self.min_speech_duration, + 'max_speech_duration': self.max_speech_duration + } + } + +class StreamingAudioBuffer: + """ + Optimized audio buffer for streaming VAD processing. + Thread-safe with memory pool for high performance. + """ + + def __init__(self, sample_rate=16000, max_duration=30): + self.sample_rate = sample_rate + self.max_samples = sample_rate * max_duration + + # Thread-safe circular buffer + self.buffer = collections.deque(maxlen=self.max_samples) + self.buffer_lock = threading.RLock() + + # Performance tracking + self.total_samples_added = 0 + self.buffer_overruns = 0 + + def add_audio(self, audio_data: np.ndarray): + """ + Add audio data to buffer (thread-safe). + + Args: + audio_data: Audio samples as numpy array + """ + with self.buffer_lock: + if len(self.buffer) + len(audio_data) > self.max_samples: + self.buffer_overruns += 1 + # Remove old samples to make room + samples_to_remove = len(audio_data) + for _ in range(min(samples_to_remove, len(self.buffer))): + self.buffer.popleft() + + self.buffer.extend(audio_data) + self.total_samples_added += len(audio_data) + + def get_recent_audio(self, duration_ms: int = 1000) -> np.ndarray: + """ + Get recent audio with specified duration. + + Args: + duration_ms: Duration in milliseconds + + Returns: + np.ndarray: Recent audio samples + """ + samples_needed = int(self.sample_rate * duration_ms / 1000) + + with self.buffer_lock: + if len(self.buffer) >= samples_needed: + return np.array(list(self.buffer)[-samples_needed:], dtype=np.float32) + else: + return np.array(list(self.buffer), dtype=np.float32) + + def clear(self): + """Clear buffer contents.""" + with self.buffer_lock: + self.buffer.clear() + + def get_stats(self) -> dict: + """Get buffer statistics.""" + with self.buffer_lock: + return { + 'buffer_size': len(self.buffer), + 'max_size': self.max_samples, + 'utilization': len(self.buffer) / self.max_samples, + 'total_added': self.total_samples_added, + 'overruns': self.buffer_overruns + } \ No newline at end of file
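For quick local verification of the streaming pieces in this patch, the sketch below drives `WebRTCVADProcessor` from `utils/webrtc_vad.py` directly, outside Flask. It is a minimal sketch under stated assumptions, not part of the patch: the synthetic 440 Hz tone, the 300 ms chunking loop, and the printed summary are illustrative stand-ins for real microphone PCM streamed by the frontend, and whether the tone actually trips the VAD depends on the aggressiveness mode.

```python
# Illustrative driver for the VAD utilities above (assumed to be run from the
# repository root so that utils.webrtc_vad is importable). Real spoken digits
# should surface as one PCM segment per utterance; the sine burst is only a
# stand-in for streamed audio.
import numpy as np
from utils.webrtc_vad import WebRTCVADProcessor

vad = WebRTCVADProcessor(aggressiveness=2, sample_rate=16000, frame_duration=30)

# ~2.4 s of fake audio: 0.6 s silence, 1.2 s of a 440 Hz tone, 0.6 s silence.
sr = 16000
t = np.arange(int(1.2 * sr)) / sr
tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
silence = np.zeros(int(0.6 * sr), dtype=np.float32)
audio = np.concatenate([silence, tone, silence])

# Feed the stream in 300 ms chunks (a multiple of the 30 ms VAD frame), as a
# streaming client might, and collect any finished speech segments.
segments = []
chunk = 10 * vad.frame_size  # 10 frames = 300 ms of samples
for start in range(0, len(audio), chunk):
    segments.extend(vad.process_numpy_audio(audio[start:start + chunk]))

print(f"segments detected: {len(segments)}")
print(vad.get_stats())
```

Each returned segment is 16-bit little-endian PCM at the configured sample rate, which is the same layout the MFCC feature extractor consumes and that `create_wav_header()` above describes.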