Dreemer
/

samacebuV0

Keras

Model card Files Files and versions

xet

Community

dreemer09 committed on Apr 3, 2025

Commit

23d869f

1 Parent(s): 7cd7278

ahsdjkhakdaklshd

Browse files

Files changed (1) hide show

handler.py +90 -134

handler.py CHANGED Viewed

@@ -1,20 +1,14 @@
 import tensorflow as tf
 import numpy as np
 import os
-import io
 import tempfile
 import logging
 import time
-import json
 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 from tensorflow.keras.models import load_model
-from tensorflow.keras.layers import (
-    Input, Conv2D, GlobalAveragePooling2D, Dense, Dropout, Add, LeakyReLU,
-    MaxPooling2D, SpatialDropout2D, LayerNormalization, Layer, Multiply, Reshape,
-    InputLayer
-)
 # Configure logging
 logging.basicConfig(
@@ -24,99 +18,13 @@ logging.basicConfig(
         logging.StreamHandler()
     ]
 )
-logger = logging.getLogger('speech_recognition')
-# Custom InputLayer to handle batch_shape compatibility issue
-class CustomInputLayer(InputLayer):
-    @classmethod
-    def from_config(cls, config):
-        # Convert batch_shape to input_shape if present
-        if 'batch_shape' in config:
-            config['input_shape'] = config['batch_shape'][1:]
-            del config['batch_shape']
-        return cls(**config)
-class AudioPreprocessingLayer(Layer):
-    def __init__(self, sample_rate=16000, n_mels=128, fft_size=1024, hop_size=512, **kwargs):
-        super(AudioPreprocessingLayer, self).__init__(**kwargs)
-        self.sample_rate = sample_rate
-        self.n_mels = n_mels
-        self.fft_size = fft_size
-        self.hop_size = hop_size
-    def call(self, inputs):
-        def process_audio(input_path):
-            logger.debug(f"Processing audio file: {input_path}")
-            try:
-                audio = tf.io.read_file(input_path)
-                audio, sr = tf.audio.decode_wav(audio, desired_channels=1)
-                logger.debug(f"Decoded WAV file with sample rate: {sr}, shape: {audio.shape}")
-                audio = tf.squeeze(audio, axis=-1)
-                stft = tf.signal.stft(audio, frame_length=self.fft_size, frame_step=self.hop_size)
-                logger.debug(f"STFT shape: {stft.shape}")
-                spectrogram = tf.abs(stft) ** 2
-                # Create mel filter bank
-                mel_weights = tf.signal.linear_to_mel_weight_matrix(
-                    self.n_mels, self.fft_size // 2 + 1, self.sample_rate, 20.0, 4000.0
-                )
-                mel_spectrogram = tf.tensordot(spectrogram, mel_weights, axes=1)
-                mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)
-                logger.debug(f"Mel spectrogram shape: {mel_spectrogram.shape}")
-                # Resize to model's expected input size and keep as single channel
-                mel_spectrogram = tf.image.resize(mel_spectrogram[..., tf.newaxis], [128, 128])
-                logger.debug(f"Final mel spectrogram shape: {mel_spectrogram.shape}")
-                # Normalize to range 0-1
-                mel_spectrogram = (mel_spectrogram - tf.reduce_min(mel_spectrogram)) / (
-                    tf.reduce_max(mel_spectrogram) - tf.reduce_min(mel_spectrogram) + 1e-6)
-                return mel_spectrogram
-            except Exception as e:
-                logger.error(f"Error in process_audio: {str(e)}")
-                raise
-        return tf.map_fn(process_audio, inputs, dtype=tf.float32)
-    def get_config(self):
-        config = super(AudioPreprocessingLayer, self).get_config()
-        config.update({
-            "sample_rate": self.sample_rate,
-            "n_mels": self.n_mels,
-            "fft_size": self.fft_size,
-            "hop_size": self.hop_size
-        })
-        return config
-# Define model architecture components for loading
-def se_block(x, ratio=8):
-    filters = x.shape[-1]
-    squeeze = GlobalAveragePooling2D()(x)
-    excitation = Dense(filters // ratio, activation="relu")(squeeze)
-    excitation = Dense(filters, activation="sigmoid")(excitation)
-    excitation = Reshape((1, 1, filters))(excitation)
-    return Multiply()([x, excitation])
-def residual_block(x, filters):
-    shortcut = x
-    x = Conv2D(filters, (3, 3), padding="same", use_bias=False)(x)
-    x = LayerNormalization()(x)
-    x = LeakyReLU()(x)
-    x = Conv2D(filters, (3, 3), padding="same", use_bias=False)(x)
-    x = LayerNormalization()(x)
-    x = se_block(x)
-    if shortcut.shape[-1] != filters:
-        shortcut = Conv2D(filters, (1, 1), padding="same", use_bias=False)(shortcut)
-        shortcut = LayerNormalization()(shortcut)
-    x = Add()([x, shortcut])
-    x = LeakyReLU()(x)
-    x = SpatialDropout2D(0.2)(x)
-    return x
 class EndpointHandler:
     def __init__(self, model_dir):
@@ -128,75 +36,123 @@ class EndpointHandler:
             logger.info(f"Using provided model directory: {model_dir}")
         # Load the model
-        model_path = os.path.join(model_dir, "model/speechModelv2.keras")
         logger.info(f"Loading model from: {model_path}")
         try:
-            # Load the model with custom objects
-            custom_objects = {
-                "AudioPreprocessingLayer": AudioPreprocessingLayer,
-                "InputLayer": CustomInputLayer
-            }
-            self.model = load_model(model_path, custom_objects=custom_objects)
-            logger.info(f"Model loaded successfully with input shape: {self.model.input_shape}")
         except Exception as e:
-            logger.error(f"Failed to initialize endpoint: {str(e)}", exc_info=True)
             raise
     def __call__(self, requests):
         start_time = time.time()
-        logger.info("Processing speech recognition request")
         temp_dir = None
         temp_wav_path = None
         try:
-            # Extract input audio bytes
-            input_audio = requests.get('inputs', None)
-            if input_audio is None:
-                logger.error("No input data provided")
-                return [{"error": "No input data provided"}]
-            if not isinstance(input_audio, bytes):
-                logger.error(f"Expected bytes input, got {type(input_audio)}")
-                return [{"error": f"Invalid input type: {type(input_audio)}, expected bytes"}]
-            # Create temporary file for audio processing
             temp_dir = tempfile.mkdtemp()
-            temp_wav_path = os.path.join(temp_dir, "speech_input.wav")
             logger.info(f"Created temporary directory: {temp_dir}")
-            # Write audio bytes to temporary file
-            logger.debug(f"Writing {len(input_audio)} bytes to temporary file: {temp_wav_path}")
             with open(temp_wav_path, "wb") as f:
-                f.write(input_audio)
             if not os.path.exists(temp_wav_path):
                 logger.error(f"Failed to create temporary WAV file: {temp_wav_path}")
                 return [{"error": "Failed to create temporary WAV file"}]
-            logger.debug(f"File size: {os.path.getsize(temp_wav_path)} bytes")
-            # Preprocess and run inference
-            inputs = tf.constant([temp_wav_path])
             logger.info("Running model prediction")
-            predictions = self.model.predict(inputs)
             logger.debug(f"Raw predictions shape: {predictions.shape}")
             # Process results
             results = []
             for i, prediction in enumerate(predictions):
-                # Get top 3 predictions
-                top_indices = np.argsort(prediction)[-3:][::-1]
-                results.append({
-                    "prediction": int(top_indices[0]),
-                    "confidence": float(prediction[top_indices[0]])
-                })
             elapsed_time = time.time() - start_time
-            logger.info(f"Speech recognition completed in {elapsed_time:.3f} seconds")
             return results
         except Exception as e:

 import tensorflow as tf
 import numpy as np
 import os
+import librosa
 import tempfile
 import logging
 import time
 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 from tensorflow.keras.models import load_model
 # Configure logging
 logging.basicConfig(
         logging.StreamHandler()
     ]
 )
+logger = logging.getLogger('speech_recognition_inference')
+# Constants for audio preprocessing
+SAMPLE_RATE = 16000
+N_MELS = 128
+FFT_SIZE = 1024
+HOP_SIZE = 512
 class EndpointHandler:
     def __init__(self, model_dir):
             logger.info(f"Using provided model directory: {model_dir}")
         # Load the model
+        model_path = os.path.join(model_dir, "model/speech_modelv2.keras")
         logger.info(f"Loading model from: {model_path}")
         try:
+            self.model = load_model(model_path)
+            logger.info(f"Model loaded successfully")
+            logger.debug(f"Model summary: {self.model.summary()}")
+        except Exception as e:
+            logger.error(f"Failed to load model: {str(e)}")
+            raise
+    def preprocess_audio(self, file_path):
+        """
+        Process audio file to match the training preprocessing exactly
+        """
+        logger.debug(f"Processing audio file: {file_path}")
+        try:
+            # Load audio using librosa (same as training)
+            audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
+            # Convert to Mel spectrogram (matching training parameters)
+            mel_spectrogram = librosa.feature.melspectrogram(
+                y=audio,
+                sr=sr,
+                n_mels=N_MELS,
+                n_fft=FFT_SIZE,
+                hop_length=HOP_SIZE
+            )
+            log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
+            # Ensure fixed size (128x128)
+            if log_mel_spectrogram.shape[1] < 128:
+                log_mel_spectrogram = np.pad(
+                    log_mel_spectrogram,
+                    ((0, 0), (0, 128 - log_mel_spectrogram.shape[1])),
+                    mode='constant'
+                )
+            else:
+                log_mel_spectrogram = log_mel_spectrogram[:, :128]
+            # Expand dimensions for CNN input (128x128x1)
+            mel_spectrogram_processed = np.expand_dims(log_mel_spectrogram, axis=-1)
+            # Convert to RGB by duplicating channels (128x128x3)
+            # Matching the model's expectation of RGB input
+            mel_spectrogram_rgb = np.repeat(mel_spectrogram_processed, 3, axis=2)
+            logger.debug(f"Final mel spectrogram shape: {mel_spectrogram_rgb.shape}")
+            return mel_spectrogram_rgb
         except Exception as e:
+            logger.error(f"Error in preprocess_audio: {str(e)}")
             raise
     def __call__(self, requests):
         start_time = time.time()
+        logger.info("Processing speech recognition inference request")
         temp_dir = None
         temp_wav_path = None
+        audio_data = requests.get('inputs', None)
         try:
+            # Validate input
+            if not audio_data:
+                logger.error("No 'inputs' field found in the request")
+                return [{"error": "No audio data provided in 'inputs' field"}]
+            if not isinstance(audio_data, bytes):
+                logger.error(f"Expected bytes, got {type(audio_data)}")
+                return [{"error": f"Invalid input type: {type(audio_data)}, expected bytes"}]
+            # Create temporary file for the audio
             temp_dir = tempfile.mkdtemp()
+            temp_wav_path = os.path.join(temp_dir, "wav_input.wav")
             logger.info(f"Created temporary directory: {temp_dir}")
+            # Write audio data to file
+            logger.debug(f"Writing {len(audio_data)} bytes to temporary file: {temp_wav_path}")
             with open(temp_wav_path, "wb") as f:
+                f.write(audio_data)
+            # Verify file was created
             if not os.path.exists(temp_wav_path):
                 logger.error(f"Failed to create temporary WAV file: {temp_wav_path}")
                 return [{"error": "Failed to create temporary WAV file"}]
+            # Preprocess audio
+            logger.info("Preprocessing audio")
+            try:
+                preprocessed_audio = self.preprocess_audio(temp_wav_path)
+                # Add batch dimension
+                preprocessed_input = np.expand_dims(preprocessed_audio, axis=0)
+            except Exception as e:
+                logger.error(f"Error during preprocessing: {str(e)}")
+                return [{"error": f"Preprocessing failed: {str(e)}"}]
+            # Run prediction
             logger.info("Running model prediction")
+            predictions = self.model.predict(preprocessed_input)
             logger.debug(f"Raw predictions shape: {predictions.shape}")
             # Process results
             results = []
             for i, prediction in enumerate(predictions):
+                predicted_class_index = int(np.argmax(prediction))
+                confidence = float(prediction[predicted_class_index])
+                result = {
+                    "word": predicted_class_index,
+                    "confidence": confidence
+                }
+                logger.info(f"Result {i}: class={predicted_class_index}, confidence={confidence:.4f}")
+                results.append(result)
             elapsed_time = time.time() - start_time
+            logger.info(f"Inference completed in {elapsed_time:.3f} seconds")
             return results
         except Exception as e: