Dreemer
/

samacebuV0

Keras

Model card Files Files and versions

xet

Community

dreemer09 committed on Apr 3, 2025

Commit

a806fea

1 Parent(s): 7e66a7c

alksdhlahk

Browse files

Files changed (1) hide show

handler.py +180 -45

handler.py CHANGED Viewed

@@ -1,71 +1,206 @@
 import tensorflow as tf
-import os
-import librosa
 import numpy as np
-import time
 import tempfile
 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 class EndpointHandler:
     def __init__(self, model_dir):
         if model_dir is None:
             model_dir = os.path.dirname(os.path.abspath(__file__))
-        # Model path
-        model_path = os.path.join(model_dir, "model/speechModelv2.keras")
-        # Load the model with custom_objects to handle any custom layers
-        self.model = tf.keras.models.load_model(model_path)
-    def preprocess_audio(self, audio_path):
-        SAMPLE_RATE = 16000
-        N_MELS = 128
-        FFT_SIZE = 1024
-        HOP_SIZE = 512
-        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
-        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=N_MELS, n_fft=FFT_SIZE, hop_length=HOP_SIZE)
-        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
-        # Ensure fixed size (128x128)
-        if log_mel_spectrogram.shape[1] < 128:
-            log_mel_spectrogram = np.pad(log_mel_spectrogram, ((0, 0), (0, 128 - log_mel_spectrogram.shape[1])), mode='constant')
         else:
-            log_mel_spectrogram = log_mel_spectrogram[:, :128]
-        return np.expand_dims(log_mel_spectrogram, axis=[0, -1])
     def __call__(self, requests):
         temp_dir = None
         temp_wav_path = None
         try:
-            # Create temporary directory and file
             temp_dir = tempfile.mkdtemp()
-            temp_wav_path = os.path.join(temp_dir, "wav_input.wav")
-            # Write audio data to temporary file
             with open(temp_wav_path, "wb") as f:
-                f.write(requests)
-            # Preprocess audio
-            input_data = self.preprocess_audio(temp_wav_path)
-            predictions = self.model.predict(input_data)
-            predicted_class = int(np.argmax(predictions, axis=1)[0])
-            confidence = float(predictions[0][predicted_class])
-            # Prepare response
-            response = {"class_id": predicted_class, "confidence": confidence}
-            return response
         except Exception as e:
-            return {"error": str(e)}
         finally:
             # Clean up temporary files
-            if temp_wav_path and os.path.exists(temp_wav_path):
-                os.remove(temp_wav_path)
-            if temp_dir and os.path.exists(temp_dir):
-                os.rmdir(temp_dir)

 import tensorflow as tf
 import numpy as np
+import os
+import io
 import tempfile
+import logging
+import time
+import json
 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+from tensorflow.keras.models import load_model
+from tensorflow.keras.layers import (
+    Input, Conv2D, GlobalAveragePooling2D, Dense, Dropout, Add, LeakyReLU,
+    MaxPooling2D, SpatialDropout2D, LayerNormalization, Layer, Multiply, Reshape
+)
+# Emit INFO-and-above records through one stream handler, then create the
+# named logger that the rest of this module writes to.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler()],
+)
+logger = logging.getLogger('speech_recognition')
+class AudioPreprocessingLayer(Layer):
+    """Keras layer that converts WAV file paths into normalized log-mel feature maps.
+
+    Each input element is read from disk, decoded as mono WAV, turned into a
+    power spectrogram with an STFT, projected onto a mel filter bank covering
+    20-4000 Hz, log-scaled, resized to 128 x 128 x 1 and min-max normalized.
+
+    Args:
+        sample_rate: Sample rate in Hz used to build the mel filter bank.
+        n_mels: Number of mel bins.
+        fft_size: STFT frame length and FFT size, in samples.
+        hop_size: STFT frame step, in samples.
+        **kwargs: Forwarded to the base ``Layer`` constructor.
+    """
+    def __init__(self, sample_rate=16000, n_mels=128, fft_size=1024, hop_size=512, **kwargs):
+        super().__init__(**kwargs)
+        self.sample_rate = sample_rate
+        self.n_mels = n_mels
+        self.fft_size = fft_size
+        self.hop_size = hop_size
+    def call(self, inputs):
+        """Return one float32 feature map per WAV path contained in ``inputs``."""
+        def process_audio(input_path):
+            logger.debug(f"Processing audio file: {input_path}")
+            try:
+                audio = tf.io.read_file(input_path)
+                # NOTE(review): the decoded rate ``sr`` is only logged; the mel
+                # filter bank below is built from ``self.sample_rate`` instead.
+                audio, sr = tf.audio.decode_wav(audio, desired_channels=1)
+                logger.debug(f"Decoded WAV file with sample rate: {sr}, shape: {audio.shape}")
+                audio = tf.squeeze(audio, axis=-1)
+                # Pin fft_length so the number of frequency bins is always
+                # fft_size // 2 + 1, which is what the mel matrix below expects.
+                # (The default rounds up to the next power of two.)
+                stft = tf.signal.stft(
+                    audio,
+                    frame_length=self.fft_size,
+                    frame_step=self.hop_size,
+                    fft_length=self.fft_size,
+                )
+                logger.debug(f"STFT shape: {stft.shape}")
+                spectrogram = tf.abs(stft) ** 2
+                # Create mel filter bank
+                mel_weights = tf.signal.linear_to_mel_weight_matrix(
+                    self.n_mels, self.fft_size // 2 + 1, self.sample_rate, 20.0, 4000.0
+                )
+                mel_spectrogram = tf.tensordot(spectrogram, mel_weights, axes=1)
+                # The small offset keeps the log finite for silent bins.
+                mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)
+                logger.debug(f"Mel spectrogram shape: {mel_spectrogram.shape}")
+                # Resize to model's expected input size and keep as single channel
+                mel_spectrogram = tf.image.resize(mel_spectrogram[..., tf.newaxis], [128, 128])
+                logger.debug(f"Final mel spectrogram shape: {mel_spectrogram.shape}")
+                # Normalize to range 0-1; the offset avoids dividing by zero
+                # when the spectrogram is constant.
+                lowest = tf.reduce_min(mel_spectrogram)
+                value_range = tf.reduce_max(mel_spectrogram) - lowest + 1e-6
+                return (mel_spectrogram - lowest) / value_range
+            except Exception as e:
+                logger.error(f"Error in process_audio: {str(e)}")
+                raise
+        # ``dtype`` is deprecated in tf.map_fn; ``fn_output_signature`` replaces it.
+        return tf.map_fn(process_audio, inputs, fn_output_signature=tf.float32)
+    def get_config(self):
+        """Return the base layer config extended with this layer's audio parameters."""
+        config = super().get_config()
+        config.update({
+            "sample_rate": self.sample_rate,
+            "n_mels": self.n_mels,
+            "fft_size": self.fft_size,
+            "hop_size": self.hop_size
+        })
+        return config
+# Define model architecture components for loading
+def se_block(x, ratio=8):
+    """Rescale the channels of ``x`` by per-channel sigmoid gates.
+
+    The gates come from global average pooling followed by a two-layer
+    bottleneck whose hidden width is ``channels // ratio``.
+    """
+    channels = x.shape[-1]
+    gate = GlobalAveragePooling2D()(x)
+    gate = Dense(channels // ratio, activation="relu")(gate)
+    gate = Dense(channels, activation="sigmoid")(gate)
+    # Give the gates shape (1, 1, channels) so they broadcast over both spatial axes.
+    gate = Reshape((1, 1, channels))(gate)
+    return Multiply()([x, gate])
+def residual_block(x, filters):
+    """Apply two 3x3 conv stages with channel gating and a skip connection.
+
+    When the input's channel count differs from ``filters``, the skip path is
+    projected with a 1x1 convolution so both branches can be added.
+    """
+    skip = x
+    # Main branch: conv -> norm -> activation -> conv -> norm -> channel gating.
+    out = Conv2D(filters, (3, 3), padding="same", use_bias=False)(x)
+    out = LayerNormalization()(out)
+    out = LeakyReLU()(out)
+    out = Conv2D(filters, (3, 3), padding="same", use_bias=False)(out)
+    out = LayerNormalization()(out)
+    out = se_block(out)
+    # Skip branch: only project when the channel counts disagree.
+    channels_match = skip.shape[-1] == filters
+    if not channels_match:
+        skip = Conv2D(filters, (1, 1), padding="same", use_bias=False)(skip)
+        skip = LayerNormalization()(skip)
+    merged = Add()([out, skip])
+    merged = LeakyReLU()(merged)
+    return SpatialDropout2D(0.2)(merged)
 class EndpointHandler:
+    """Inference handler that classifies raw WAV bytes with a saved Keras model.
+
+    The model is fed a tensor of file paths, so each request's audio bytes are
+    first written to a temporary WAV file.
+    """
     def __init__(self, model_dir):
+        """Load ``model/speech_model.keras`` from ``model_dir``.
+
+        Args:
+            model_dir: Directory containing the ``model`` folder. When ``None``,
+                the directory of this file is used.
+
+        Raises:
+            Exception: Any error raised while loading the model is logged and
+                re-raised unchanged.
+        """
+        logger.info("Initializing Speech Recognition EndpointHandler")
         if model_dir is None:
             model_dir = os.path.dirname(os.path.abspath(__file__))
+            logger.info(f"Model directory not provided, using current directory: {model_dir}")
         else:
+            logger.info(f"Using provided model directory: {model_dir}")
+        # Load the model
+        model_path = os.path.join(model_dir, "model/speech_model.keras")
+        logger.info(f"Loading model from: {model_path}")
+        try:
+            # The saved model references the custom preprocessing layer by name,
+            # so it has to be registered for deserialization.
+            custom_objects = {
+                "AudioPreprocessingLayer": AudioPreprocessingLayer
+            }
+            self.model = load_model(model_path, custom_objects=custom_objects)
+            logger.info(f"Model loaded successfully with input shape: {self.model.input_shape}")
+        except Exception as e:
+            logger.error(f"Failed to initialize endpoint: {str(e)}", exc_info=True)
+            raise
     def __call__(self, requests):
+        """Classify the WAV audio carried in ``requests['inputs']``.
+
+        Args:
+            requests: Mapping whose ``'inputs'`` entry holds the WAV file
+                content as ``bytes``.
+
+        Returns:
+            A list with one ``{"word": int, "confidence": float}`` dict per
+            model output row (the highest-scoring class index and its score),
+            or a single-element list ``[{"error": str}]`` when the input is
+            missing or invalid, or inference fails.
+        """
+        start_time = time.time()
+        logger.info("Processing speech recognition request")
         temp_dir = None
         temp_wav_path = None
         try:
+            # Extract input audio bytes
+            input_audio = requests.get('inputs', None)
+            if input_audio is None:
+                logger.error("No input data provided")
+                return [{"error": "No input data provided"}]
+            if not isinstance(input_audio, bytes):
+                logger.error(f"Expected bytes input, got {type(input_audio)}")
+                return [{"error": f"Invalid input type: {type(input_audio)}, expected bytes"}]
+            # Create temporary file for audio processing
             temp_dir = tempfile.mkdtemp()
+            temp_wav_path = os.path.join(temp_dir, "speech_input.wav")
+            logger.info(f"Created temporary directory: {temp_dir}")
+            # Write audio bytes to temporary file
+            logger.debug(f"Writing {len(input_audio)} bytes to temporary file: {temp_wav_path}")
             with open(temp_wav_path, "wb") as f:
+                f.write(input_audio)
+            if not os.path.exists(temp_wav_path):
+                logger.error(f"Failed to create temporary WAV file: {temp_wav_path}")
+                return [{"error": "Failed to create temporary WAV file"}]
+            logger.debug(f"File size: {os.path.getsize(temp_wav_path)} bytes")
+            # Preprocess and run inference
+            inputs = tf.constant([temp_wav_path])
+            logger.info("Running model prediction")
+            predictions = self.model.predict(inputs)
+            logger.debug(f"Raw predictions shape: {predictions.shape}")
+            # Process results: report the best-scoring class for each output row
+            results = []
+            for prediction in predictions:
+                # argsort is ascending, so its last entry is the top-scoring index
+                best_index = np.argsort(prediction)[-1]
+                results.append({
+                    "word": int(best_index),
+                    "confidence": float(prediction[best_index])
+                })
+            elapsed_time = time.time() - start_time
+            logger.info(f"Speech recognition completed in {elapsed_time:.3f} seconds")
+            return results
         except Exception as e:
+            logger.error(f"Error during inference: {str(e)}", exc_info=True)
+            return [{"error": str(e)}]
         finally:
             # Clean up temporary files
+            # Cleanup is best-effort: a failure here is logged, never raised.
+            try:
+                if temp_wav_path and os.path.exists(temp_wav_path):
+                    os.remove(temp_wav_path)
+                    logger.debug(f"Removed temporary file: {temp_wav_path}")
+                if temp_dir and os.path.exists(temp_dir):
+                    os.rmdir(temp_dir)
+                    logger.debug(f"Removed temporary directory: {temp_dir}")
+            except Exception as cleanup_error:
+                logger.error(f"Error during cleanup: {str(cleanup_error)}")