amritn8 commited on
Commit
b02773c
·
verified ·
1 Parent(s): b2f5328

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -37
app.py CHANGED
@@ -5,72 +5,80 @@ import gradio as gr
5
  from scipy.io import wavfile
6
  import os
7
 
8
- # Load model and label encoder
9
  model = tf.keras.models.load_model("animal_sound_cnn.keras")
10
  label_encoder = joblib.load("label_encoder.joblib")
11
 
 
 
 
 
 
 
 
 
12
  def preprocess_audio(audio_path):
13
- """
14
- Processes audio to match model's expected input shape
15
- Returns: (1, 384) shaped array ready for model prediction
16
- """
17
  try:
18
- # 1. Read and normalize audio
19
  sr, y = wavfile.read(audio_path)
20
- if len(y.shape) > 1: # Convert stereo to mono
21
- y = y.mean(axis=1)
22
  y = y.astype(np.float32) / np.max(np.abs(y))
23
 
24
- # 2. Create spectrogram (adjust parameters to match your model's training)
25
- spectrogram = tf.signal.stft(y, frame_length=256, frame_step=128, fft_length=256)
26
- spectrogram = tf.abs(spectrogram)
 
 
27
 
28
- # 3. Flatten to match model's expected input shape (1, 384)
29
- flattened = tf.reshape(spectrogram, (1, -1)) # Flatten all dimensions
30
 
31
- # 4. Pad or trim to exactly 384 features
32
- if flattened.shape[1] < 384:
33
- flattened = tf.pad(flattened, [[0, 0], [0, 384 - flattened.shape[1]]])
34
- else:
35
- flattened = flattened[:, :384]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- return flattened.numpy().astype(np.float32)
38
-
39
  except Exception as e:
40
  print(f"Preprocessing error: {str(e)}")
41
  return None
42
 
43
  def predict(audio_path):
44
  try:
45
- # 1. Preprocess audio
46
  processed = preprocess_audio(audio_path)
47
  if processed is None:
48
- return "Error processing audio"
49
 
50
- # Debug output
51
- print(f"Model input shape: {processed.shape}")
52
 
53
- # 2. Predict
54
  pred = model.predict(processed)
55
- animal = label_encoder.inverse_transform([np.argmax(pred)])[0]
56
- return animal
57
 
58
  except Exception as e:
59
- return f"Prediction error: {str(e)}"
60
-
61
- # Minimal requirements.txt
62
- # tensorflow>=2.16.0
63
- # scikit-learn
64
- # joblib
65
- # numpy
66
- # gradio
67
- # scipy
68
 
69
  gr.Interface(
70
  fn=predict,
71
  inputs=gr.Audio(type="filepath"),
72
  outputs="label",
73
  title="Animal Sound Classifier",
74
- description="Upload a short animal sound clip (2-5 seconds)",
75
  examples=["example.wav"] if os.path.exists("example.wav") else None
76
  ).launch()
 
5
  from scipy.io import wavfile
6
  import os
7
 
8
+ # Load assets
9
  model = tf.keras.models.load_model("animal_sound_cnn.keras")
10
  label_encoder = joblib.load("label_encoder.joblib")
11
 
12
def get_model_input_shape(net=None):
    """Return the model's expected per-sample input shape.

    Args:
        net: Keras model to inspect; defaults to the module-level ``model``.

    Returns:
        A tuple of the non-batch dimensions — ``(384,)`` for a flat
        ``(None, 384)`` input, ``(64, 64, 1)`` for an image-like
        ``(None, 64, 64, 1)`` input — or ``None`` for unsupported ranks.
    """
    net = model if net is None else net
    shape = net.input_shape
    if len(shape) == 2:
        # Return a 1-tuple rather than a bare int so callers can uniformly
        # apply len(...) and [...] indexing; the previous bare-int return
        # made the caller's len(expected_shape) raise TypeError, silently
        # breaking the flat-input path.
        return (shape[1],)
    if len(shape) == 4:
        return shape[1:]
    return None
19
+
20
def preprocess_audio(audio_path):
    """Load a WAV file and shape it into a batch of one model input.

    Adapts to the model's reported input shape: produces either a
    padded/trimmed flat feature vector (e.g. ``(1, 384)``) or a 64x64
    log-mel "image" (``(1, 64, 64, 1)``).

    Args:
        audio_path: Path to a WAV file readable by ``scipy.io.wavfile``.

    Returns:
        A float32 numpy array ready for ``model.predict``, or ``None`` if
        preprocessing fails.
    """
    try:
        # 1. Load and normalize audio
        sr, y = wavfile.read(audio_path)
        if y.ndim > 1:  # stereo -> mono
            y = np.mean(y, axis=1)
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:  # guard: a silent clip would otherwise divide by zero
            y = y / peak

        # 2. Magnitude spectrogram via STFT
        n_fft = 512
        hop_length = 256
        stft = tf.signal.stft(y, frame_length=n_fft, frame_step=hop_length,
                              fft_length=n_fft)
        spectrogram = tf.abs(stft)

        # 3. Reshape according to what the model reports it expects.
        expected_shape = get_model_input_shape()
        if isinstance(expected_shape, int):
            # Tolerate a helper that returns a bare int for flat inputs,
            # so the len()/indexing below is always valid.
            expected_shape = (expected_shape,)

        if expected_shape and len(expected_shape) == 1:  # flat input, e.g. (384,)
            target = expected_shape[0]
            flattened = tf.reshape(spectrogram, (1, -1))
            if flattened.shape[1] < target:
                flattened = tf.pad(
                    flattened, [[0, 0], [0, target - flattened.shape[1]]])
            else:
                flattened = flattened[:, :target]
            return flattened.numpy().astype(np.float32)

        # Image-like input (e.g. (64, 64, 1)): build a log-mel spectrogram.
        # NOTE(review): assumes sr >= 15000 so the 7500 Hz upper edge is
        # below Nyquist — confirm against the expected input clips.
        linear_to_mel = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=64,
            num_spectrogram_bins=spectrogram.shape[-1],
            sample_rate=sr,  # use the file's real rate, not a hard-coded 22050
            lower_edge_hertz=125,
            upper_edge_hertz=7500)
        mel_spectrogram = tf.tensordot(spectrogram, linear_to_mel, 1)
        log_mel = tf.math.log(mel_spectrogram + 1e-6)

        # Resize to 64x64, then add channel and batch dimensions.
        resized = tf.image.resize(tf.expand_dims(log_mel, -1), (64, 64))
        return tf.expand_dims(resized, 0).numpy().astype(np.float32)

    except Exception as e:
        # Best-effort contract: the caller treats None as "invalid audio".
        print(f"Preprocessing error: {str(e)}")
        return None
63
 
64
def predict(audio_path):
    """Classify an animal sound clip and return the predicted label.

    Returns the decoded class name on success, or a human-readable error
    string when preprocessing or inference fails.
    """
    try:
        features = preprocess_audio(audio_path)
        if features is None:
            return "Error: Invalid audio input"

        # Debug aid: show exactly what shape reaches the model.
        print(f"Final input shape: {features.shape}")

        scores = model.predict(features)
        best = np.argmax(scores)
        return label_encoder.inverse_transform([best])[0]

    except Exception as e:
        return f"Prediction failed: {str(e)}"
 
 
 
 
 
 
 
 
77
 
78
# Build and launch the Gradio UI: an audio file in, a predicted label out.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),
    outputs="label",
    title="Animal Sound Classifier",
    examples=["example.wav"] if os.path.exists("example.wav") else None,
)
demo.launch()