""" ML Mel CNN Digit Processor Uses the trained Mel Spectrogram + 2D CNN model for digit classification """ import os import sys import time import logging from pathlib import Path from typing import Dict, Any, Optional, Union import numpy as np from .base_processor import AudioProcessor # Add project root to path for ML imports PROJECT_ROOT = Path(__file__).parent.parent sys.path.append(str(PROJECT_ROOT)) # Import ML inference from ml_training.inference import load_classifier logger = logging.getLogger(__name__) class MLMelCNNProcessor(AudioProcessor): """ ML-based Mel CNN digit processor using trained 2D CNN model. Performance characteristics (based on training results): - Test accuracy: 97.22% - Inference time: ~3-5ms - Model size: ~2.6MB """ name = "ML Mel CNN (2D Conv)" def __init__(self, model_dir: str = "models", device: str = "auto"): """ Initialize ML Mel CNN processor. Args: model_dir: Directory containing trained models device: Device to run inference on ('cpu', 'cuda', or 'auto') """ super().__init__(self.name) self.model_dir = Path(model_dir) self.device = device if device != "auto" else None self.classifier = None self._configured = False # Performance tracking self.prediction_count = 0 self.total_inference_time = 0.0 self.last_prediction_time = None # Try to load the model self._initialize_classifier() logger.info(f"ML Mel CNN Processor initialized (configured: {self._configured})") def _initialize_classifier(self): """Initialize the ML classifier.""" try: # Check if model directory exists if not self.model_dir.exists(): logger.warning(f"Model directory not found: {self.model_dir}") return # Load the Mel CNN classifier self.classifier = load_classifier( model_dir=str(self.model_dir), pipeline_type='mel_cnn', device=self.device ) self._configured = True logger.info("ML Mel CNN classifier loaded successfully") logger.info(f" Model device: {self.classifier.device}") logger.info(f" Parameters: {sum(p.numel() for p in self.classifier.model.parameters()):,}") except Exception as e: logger.error(f"Failed to load ML Mel CNN classifier: {str(e)}") self.classifier = None self._configured = False def is_configured(self) -> bool: """Check if the processor is properly configured.""" return self._configured and self.classifier is not None def process_audio(self, audio_data: bytes) -> str: """ Process audio and return predicted digit (required by base class). Args: audio_data: Raw audio data in bytes Returns: predicted_digit: Predicted digit as string """ return self.predict(audio_data) def predict(self, audio_data: bytes) -> str: """ Predict digit from audio data. Args: audio_data: Raw audio data in bytes Returns: predicted_digit: Predicted digit as string """ if not self.is_configured(): raise RuntimeError("ML Mel CNN processor not properly configured") try: # Convert audio with optimized format for ML models from utils.audio_utils import convert_for_ml_models optimized_audio = convert_for_ml_models(audio_data, 'mel_cnn') # Convert audio bytes to numpy array audio_array = self._bytes_to_audio_array(optimized_audio) # Make prediction using ML classifier start_time = time.time() result = self.classifier.predict( audio_array, return_probabilities=True, return_features=False ) inference_time = time.time() - start_time # Update performance tracking self.prediction_count += 1 self.total_inference_time += inference_time self.last_prediction_time = inference_time predicted_digit = str(result['predicted_digit']) confidence = result['confidence'] logger.debug(f"ML Mel CNN prediction: '{predicted_digit}' " f"(confidence: {confidence:.3f}, time: {inference_time*1000:.1f}ms)") return predicted_digit except Exception as e: logger.error(f"ML Mel CNN prediction failed: {str(e)}") raise def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]: """ Predict digit with detailed timing and confidence information. Args: audio_data: Raw audio data in bytes Returns: result: Detailed prediction results """ if not self.is_configured(): return { 'success': False, 'error': 'ML Mel CNN processor not properly configured', 'predicted_digit': None, 'inference_time': 0.0 } try: # Convert audio with optimized format for ML models from utils.audio_utils import convert_for_ml_models optimized_audio = convert_for_ml_models(audio_data, 'mel_cnn') # Convert audio bytes to numpy array audio_array = self._bytes_to_audio_array(optimized_audio) # Make prediction using ML classifier start_time = time.time() ml_result = self.classifier.predict( audio_array, return_probabilities=True, return_features=False ) inference_time = time.time() - start_time # Update performance tracking self.prediction_count += 1 self.total_inference_time += inference_time self.last_prediction_time = inference_time # Format result result = { 'success': True, 'predicted_digit': str(ml_result['predicted_digit']), 'confidence': ml_result['confidence'], 'inference_time': inference_time, 'class_probabilities': { str(k): float(v) for k, v in ml_result['class_probabilities'].items() }, 'top_3_predictions': [ { 'digit': str(pred['digit']), 'probability': pred['probability'] } for pred in ml_result['top_3_predictions'] ], 'method': self.name, 'model_type': 'ml_mel_cnn', 'timestamp': time.time() } logger.debug(f"ML Mel CNN detailed prediction: '{result['predicted_digit']}' " f"(confidence: {result['confidence']:.3f}, " f"time: {inference_time*1000:.1f}ms)") return result except Exception as e: logger.error(f"ML Mel CNN prediction with timing failed: {str(e)}") return { 'success': False, 'error': str(e), 'predicted_digit': None, 'inference_time': 0.0, 'method': self.name, 'model_type': 'ml_mel_cnn', 'timestamp': time.time() } def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray: """Convert audio bytes to numpy array.""" try: # Try to interpret as int16 PCM first (most common) audio_array = np.frombuffer(audio_data, dtype=np.int16) # Convert to float32 and normalize audio_array = audio_array.astype(np.float32) / 32768.0 # If the array is too short, pad it if len(audio_array) < 1000: # Less than ~60ms at 16kHz # Pad with zeros to minimum length audio_array = np.pad(audio_array, (0, 1000 - len(audio_array))) return audio_array except Exception as e: logger.error(f"Failed to convert audio bytes to array: {str(e)}") # Return a small zero array as fallback return np.zeros(1000, dtype=np.float32) def get_stats(self) -> Dict[str, Any]: """Get processor performance statistics.""" stats = super().get_stats() if self.prediction_count > 0: stats.update({ 'ml_predictions': self.prediction_count, 'average_inference_time': self.total_inference_time / self.prediction_count, 'last_inference_time': self.last_prediction_time, 'throughput_per_second': self.prediction_count / self.total_inference_time if self.total_inference_time > 0 else 0, 'model_configured': self.is_configured() }) if self.classifier: # Get ML classifier performance stats ml_stats = self.classifier.get_performance_stats() stats['ml_classifier_stats'] = ml_stats return stats def get_model_info(self) -> Dict[str, Any]: """Get information about the loaded model.""" if not self.is_configured(): return {'error': 'Model not loaded'} try: info = { 'pipeline_type': 'mel_cnn', 'model_class': self.classifier.model.__class__.__name__, 'device': str(self.classifier.device), 'parameters': sum(p.numel() for p in self.classifier.model.parameters()), 'feature_extractor': self.classifier.feature_extractor.__class__.__name__, 'has_scaler': self.classifier.scaler is not None, 'expected_sample_rate': 8000, 'expected_audio_length': 8000, # 1 second at 8kHz 'input_shape': '(1, 64, 51)', # Mel spectrogram shape 'model_architecture': '2D CNN' } if hasattr(self.classifier, 'model_path'): info['model_path'] = str(self.classifier.model_path) return info except Exception as e: logger.error(f"Failed to get model info: {str(e)}") return {'error': str(e)} def benchmark_speed(self, num_samples: int = 100) -> Dict[str, Any]: """Benchmark inference speed.""" if not self.is_configured(): return {'error': 'Model not configured'} try: return self.classifier.benchmark_speed(num_samples) except Exception as e: logger.error(f"Benchmark failed: {str(e)}") return {'error': str(e)}