dreemer09 committed on
Commit
bf1c3a7
·
1 Parent(s): 70ff895

agafgfgdgs

Browse files
.gitattributes CHANGED
@@ -23,6 +23,7 @@
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
 
26
  *.tar.* filter=lfs diff=lfs merge=lfs -text
27
  *.tar filter=lfs diff=lfs merge=lfs -text
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
@@ -32,4 +33,5 @@
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
- *.keras filter=lfs diff=lfs merge=lfs -text
 
 
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
  *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ best_model.keras filter=lfs diff=lfs merge=lfs -text
37
+ bestModel.keras filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv/
handler.py CHANGED
@@ -1,14 +1,15 @@
1
  import tensorflow as tf
2
  import numpy as np
3
  import os
4
- import librosa
5
  import tempfile
6
  import logging
7
  import time
8
  os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
9
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
10
 
11
- from tensorflow.keras.models import load_model
 
12
 
13
  # Configure logging
14
  logging.basicConfig(
@@ -18,138 +19,116 @@ logging.basicConfig(
18
  logging.StreamHandler()
19
  ]
20
  )
21
- logger = logging.getLogger('speech_recognition_inference')
22
 
23
- # Constants for audio preprocessing
24
- SAMPLE_RATE = 16000
25
- N_MELS = 128
26
- FFT_SIZE = 1024
27
- HOP_SIZE = 512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  class EndpointHandler:
30
  def __init__(self, model_dir):
31
- logger.info("Initializing Speech Recognition EndpointHandler")
32
  if model_dir is None:
33
  model_dir = os.path.dirname(os.path.abspath(__file__))
34
  logger.info(f"Model directory not provided, using current directory: {model_dir}")
35
  else:
36
  logger.info(f"Using provided model directory: {model_dir}")
37
 
38
- # Load the model
39
- model_path = os.path.join(model_dir, "model/speechModelv2.keras")
40
  logger.info(f"Loading model from: {model_path}")
41
 
42
  try:
43
- self.model = load_model(model_path)
44
- logger.info(f"Model loaded successfully")
45
- logger.debug(f"Model summary: {self.model.summary()}")
46
  except Exception as e:
47
  logger.error(f"Failed to load model: {str(e)}")
48
  raise
49
 
50
- def preprocess_audio(self, file_path):
51
- """
52
- Process audio file to match the training preprocessing exactly
53
- """
54
- logger.debug(f"Processing audio file: {file_path}")
55
- try:
56
- # Load audio using librosa (same as training)
57
- audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
58
-
59
- # Convert to Mel spectrogram (matching training parameters)
60
- mel_spectrogram = librosa.feature.melspectrogram(
61
- y=audio,
62
- sr=sr,
63
- n_mels=N_MELS,
64
- n_fft=FFT_SIZE,
65
- hop_length=HOP_SIZE
66
- )
67
- log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
68
-
69
- # Ensure fixed size (128x128)
70
- if log_mel_spectrogram.shape[1] < 128:
71
- log_mel_spectrogram = np.pad(
72
- log_mel_spectrogram,
73
- ((0, 0), (0, 128 - log_mel_spectrogram.shape[1])),
74
- mode='constant'
75
- )
76
- else:
77
- log_mel_spectrogram = log_mel_spectrogram[:, :128]
78
-
79
- # Expand dimensions for CNN input (128x128x1)
80
- mel_spectrogram_processed = np.expand_dims(log_mel_spectrogram, axis=-1)
81
-
82
- # Convert to RGB by duplicating channels (128x128x3)
83
- # Matching the model's expectation of RGB input
84
- mel_spectrogram_rgb = np.repeat(mel_spectrogram_processed, 3, axis=2)
85
-
86
- logger.debug(f"Final mel spectrogram shape: {mel_spectrogram_rgb.shape}")
87
- return mel_spectrogram_rgb
88
-
89
- except Exception as e:
90
- logger.error(f"Error in preprocess_audio: {str(e)}")
91
- raise
92
-
93
  def __call__(self, requests):
94
  start_time = time.time()
95
- logger.info("Processing speech recognition inference request")
96
  temp_dir = None
97
  temp_wav_path = None
98
- audio_data = requests.get('inputs', None)
99
-
100
  try:
101
- # Validate input
102
- if not audio_data:
103
- logger.error("No 'inputs' field found in the request")
104
- return [{"error": "No audio data provided in 'inputs' field"}]
105
-
106
- if not isinstance(audio_data, bytes):
107
- logger.error(f"Expected bytes, got {type(audio_data)}")
108
- return [{"error": f"Invalid input type: {type(audio_data)}, expected bytes"}]
109
-
110
- # Create temporary file for the audio
111
  temp_dir = tempfile.mkdtemp()
112
  temp_wav_path = os.path.join(temp_dir, "wav_input.wav")
113
  logger.info(f"Created temporary directory: {temp_dir}")
114
 
115
- # Write audio data to file
116
- logger.debug(f"Writing {len(audio_data)} bytes to temporary file: {temp_wav_path}")
 
 
 
 
 
117
  with open(temp_wav_path, "wb") as f:
118
- f.write(audio_data)
119
 
120
- # Verify file was created
121
  if not os.path.exists(temp_wav_path):
122
  logger.error(f"Failed to create temporary WAV file: {temp_wav_path}")
123
  return [{"error": "Failed to create temporary WAV file"}]
124
 
125
- # Preprocess audio
126
- logger.info("Preprocessing audio")
127
- try:
128
- preprocessed_audio = self.preprocess_audio(temp_wav_path)
129
- # Add batch dimension
130
- preprocessed_input = np.expand_dims(preprocessed_audio, axis=0)
131
- except Exception as e:
132
- logger.error(f"Error during preprocessing: {str(e)}")
133
- return [{"error": f"Preprocessing failed: {str(e)}"}]
134
-
135
- # Run prediction
136
  logger.info("Running model prediction")
137
- predictions = self.model.predict(preprocessed_input)
138
- logger.debug(f"Raw predictions shape: {predictions.shape}")
139
 
140
- # Process results
 
 
141
  results = []
142
  for i, prediction in enumerate(predictions):
143
- predicted_class_index = int(np.argmax(prediction))
144
  confidence = float(prediction[predicted_class_index])
145
-
146
- result = {
147
- "word": predicted_class_index,
148
- "confidence": confidence
149
- }
150
-
151
  logger.info(f"Result {i}: class={predicted_class_index}, confidence={confidence:.4f}")
152
- results.append(result)
153
 
154
  elapsed_time = time.time() - start_time
155
  logger.info(f"Inference completed in {elapsed_time:.3f} seconds")
@@ -160,7 +139,6 @@ class EndpointHandler:
160
  return [{"error": str(e)}]
161
 
162
  finally:
163
- # Clean up temporary files
164
  try:
165
  if temp_wav_path and os.path.exists(temp_wav_path):
166
  os.remove(temp_wav_path)
 
1
  import tensorflow as tf
2
  import numpy as np
3
  import os
4
+ import io
5
  import tempfile
6
  import logging
7
  import time
8
  os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
9
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
10
 
11
+ from keras.models import load_model
12
+ from keras.layers import Layer
13
 
14
  # Configure logging
15
  logging.basicConfig(
 
19
  logging.StreamHandler()
20
  ]
21
  )
22
+ logger = logging.getLogger('audio_inference')
23
 
24
class WavToMelLayer(Layer):
    """Keras layer that maps WAV file paths to log-mel spectrogram images.

    Every element of the layer input is treated as the path of a WAV file.
    The file is read and decoded to a single channel, transformed into a
    log-scaled mel spectrogram, resized to ``IMAGE_SIZE`` and expanded from
    one channel to three.

    Args:
        sample_rate: Sample rate in Hz used to build the mel filter bank.
        n_mels: Number of mel bands.
        fft_size: STFT frame length in samples.
        hop_size: STFT frame step in samples.
        **kwargs: Forwarded unchanged to ``Layer.__init__``.
    """

    # Values that were previously inline literals in ``call``.
    MEL_LOWER_HZ = 20.0      # lower edge of the mel filter bank
    MEL_UPPER_HZ = 4000.0    # upper edge of the mel filter bank
    IMAGE_SIZE = (128, 128)  # (height, width) handed to tf.image.resize
    LOG_OFFSET = 1e-6        # keeps tf.math.log away from log(0)

    def __init__(self, sample_rate=16000, n_mels=128, fft_size=1024, hop_size=512, **kwargs):
        super().__init__(**kwargs)
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.fft_size = fft_size
        self.hop_size = hop_size

    def call(self, inputs):
        """Convert a batch of WAV paths into a batch of spectrogram images.

        Args:
            inputs: Tensor of WAV file paths; one image is produced per element.

        Returns:
            The float32 result of mapping the per-file conversion over
            ``inputs``.
        """
        # The filter bank depends only on the layer configuration, so it is
        # built once here instead of once per mapped element.
        mel_weights = tf.signal.linear_to_mel_weight_matrix(
            self.n_mels,
            self.fft_size // 2 + 1,
            self.sample_rate,
            self.MEL_LOWER_HZ,
            self.MEL_UPPER_HZ,
        )

        def process_audio(input_path):
            logger.debug("Processing audio file: %s", input_path)
            try:
                audio = tf.io.read_file(input_path)
                audio, sr = tf.audio.decode_wav(audio, desired_channels=1)
                # NOTE(review): the decoded rate `sr` is only logged; the mel
                # filter bank uses self.sample_rate. Confirm that incoming
                # files are always recorded at that rate.
                logger.debug("Decoded WAV file with sample rate: %s, shape: %s", sr, audio.shape)
                audio = tf.squeeze(audio, axis=-1)

                stft = tf.signal.stft(audio, frame_length=self.fft_size, frame_step=self.hop_size)
                logger.debug("STFT shape: %s", stft.shape)
                spectrogram = tf.abs(stft) ** 2

                mel_spectrogram = tf.tensordot(spectrogram, mel_weights, axes=1)
                mel_spectrogram = tf.math.log(mel_spectrogram + self.LOG_OFFSET)
                logger.debug("Mel spectrogram shape: %s", mel_spectrogram.shape)

                # Add a channel axis, resize to the fixed image size, then
                # replicate the single channel to three.
                mel_spectrogram = tf.image.resize(
                    mel_spectrogram[..., tf.newaxis], list(self.IMAGE_SIZE)
                )
                mel_spectrogram = tf.image.grayscale_to_rgb(mel_spectrogram)
                logger.debug("Final mel spectrogram shape: %s", mel_spectrogram.shape)

                return mel_spectrogram
            except Exception as e:
                logger.error(f"Error in process_audio: {str(e)}")
                raise

        # `dtype=` is deprecated in tf.map_fn; `fn_output_signature` is the
        # supported way to declare the per-element output type.
        return tf.map_fn(process_audio, inputs, fn_output_signature=tf.float32)

    def get_config(self):
        """Return the layer config, including the spectrogram parameters."""
        config = super().get_config()
        config.update({
            "sample_rate": self.sample_rate,
            "n_mels": self.n_mels,
            "fft_size": self.fft_size,
            "hop_size": self.hop_size,
        })
        return config
72
 
73
  class EndpointHandler:
74
  def __init__(self, model_dir):
75
+ logger.info("Initializing EndpointHandler")
76
  if model_dir is None:
77
  model_dir = os.path.dirname(os.path.abspath(__file__))
78
  logger.info(f"Model directory not provided, using current directory: {model_dir}")
79
  else:
80
  logger.info(f"Using provided model directory: {model_dir}")
81
 
82
+ model_path = os.path.join(model_dir, "model/bestModel.keras")
 
83
  logger.info(f"Loading model from: {model_path}")
84
 
85
  try:
86
+ self.model = load_model(model_path, custom_objects={"WavToMelLayer": WavToMelLayer})
87
+ logger.info(f"Model loaded successfully: {self.model.summary()}")
 
88
  except Exception as e:
89
  logger.error(f"Failed to load model: {str(e)}")
90
  raise
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def __call__(self, requests):
93
  start_time = time.time()
94
+ logger.info("Processing inference request")
95
  temp_dir = None
96
  temp_wav_path = None
97
+ input_yeah = requests['inputs']
98
+
99
  try:
 
 
 
 
 
 
 
 
 
 
100
  temp_dir = tempfile.mkdtemp()
101
  temp_wav_path = os.path.join(temp_dir, "wav_input.wav")
102
  logger.info(f"Created temporary directory: {temp_dir}")
103
 
104
+ logger.info(requests)
105
+
106
+ if not isinstance(input_yeah, bytes):
107
+ logger.error(f"Expected bytes, got {type(input_yeah)}")
108
+ return [{"error": f"Invalid input type: {type(input_yeah)}, expected bytes"}]
109
+
110
+ logger.debug(f"Writing {len(input_yeah)} bytes to temporary file: {temp_wav_path}")
111
  with open(temp_wav_path, "wb") as f:
112
+ f.write(input_yeah)
113
 
 
114
  if not os.path.exists(temp_wav_path):
115
  logger.error(f"Failed to create temporary WAV file: {temp_wav_path}")
116
  return [{"error": "Failed to create temporary WAV file"}]
117
 
118
+ logger.debug(f"File size: {os.path.getsize(temp_wav_path)} bytes")
119
+
120
+ inputs = tf.constant([temp_wav_path])
 
 
 
 
 
 
 
 
121
  logger.info("Running model prediction")
 
 
122
 
123
+ predictions = self.model.predict(inputs)
124
+ logger.debug(f"Raw predictions: {predictions}")
125
+
126
  results = []
127
  for i, prediction in enumerate(predictions):
128
+ predicted_class_index = np.argmax(prediction)
129
  confidence = float(prediction[predicted_class_index])
 
 
 
 
 
 
130
  logger.info(f"Result {i}: class={predicted_class_index}, confidence={confidence:.4f}")
131
+ results.append({"word": int(predicted_class_index), "confidence": confidence})
132
 
133
  elapsed_time = time.time() - start_time
134
  logger.info(f"Inference completed in {elapsed_time:.3f} seconds")
 
139
  return [{"error": str(e)}]
140
 
141
  finally:
 
142
  try:
143
  if temp_wav_path and os.path.exists(temp_wav_path):
144
  os.remove(temp_wav_path)
model/{speechModelv2.keras → bestModel.keras} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6c61ea34bb78345728320652dc98ddecf0278bc8d330d86b962cdfd70f71a7b
3
- size 11710468
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a1640a38b2fe403afaf62b04f667e2b1f375434323dcae34e5b9dd8bdc4f62b
3
+ size 11741036
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  tensorflow
2
  tensorflow-cpu==2.15.0
3
  tf-keras
4
- numpy
5
- librosa
 
1
  tensorflow
2
  tensorflow-cpu==2.15.0
3
  tf-keras
4
+ numpy