amritn8 commited on
Commit
b02773c
·
verified ·
1 Parent(s): b2f5328

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -37
app.py CHANGED
@@ -5,72 +5,80 @@ import gradio as gr
5
  from scipy.io import wavfile
6
  import os
7
 
8
- # Load model and label encoder
9
  model = tf.keras.models.load_model("animal_sound_cnn.keras")
10
  label_encoder = joblib.load("label_encoder.joblib")
11
 
 
 
 
 
 
 
 
 
12
  def preprocess_audio(audio_path):
13
- """
14
- Processes audio to match model's expected input shape
15
- Returns: (1, 384) shaped array ready for model prediction
16
- """
17
  try:
18
- # 1. Read and normalize audio
19
  sr, y = wavfile.read(audio_path)
20
- if len(y.shape) > 1: # Convert stereo to mono
21
- y = y.mean(axis=1)
22
  y = y.astype(np.float32) / np.max(np.abs(y))
23
 
24
- # 2. Create spectrogram (adjust parameters to match your model's training)
25
- spectrogram = tf.signal.stft(y, frame_length=256, frame_step=128, fft_length=256)
26
- spectrogram = tf.abs(spectrogram)
 
 
27
 
28
- # 3. Flatten to match model's expected input shape (1, 384)
29
- flattened = tf.reshape(spectrogram, (1, -1)) # Flatten all dimensions
30
 
31
- # 4. Pad or trim to exactly 384 features
32
- if flattened.shape[1] < 384:
33
- flattened = tf.pad(flattened, [[0, 0], [0, 384 - flattened.shape[1]]])
34
- else:
35
- flattened = flattened[:, :384]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- return flattened.numpy().astype(np.float32)
38
-
39
  except Exception as e:
40
  print(f"Preprocessing error: {str(e)}")
41
  return None
42
 
43
  def predict(audio_path):
44
  try:
45
- # 1. Preprocess audio
46
  processed = preprocess_audio(audio_path)
47
  if processed is None:
48
- return "Error processing audio"
49
 
50
- # Debug output
51
- print(f"Model input shape: {processed.shape}")
52
 
53
- # 2. Predict
54
  pred = model.predict(processed)
55
- animal = label_encoder.inverse_transform([np.argmax(pred)])[0]
56
- return animal
57
 
58
  except Exception as e:
59
- return f"Prediction error: {str(e)}"
60
-
61
- # Minimal requirements.txt
62
- # tensorflow>=2.16.0
63
- # scikit-learn
64
- # joblib
65
- # numpy
66
- # gradio
67
- # scipy
68
 
69
  gr.Interface(
70
  fn=predict,
71
  inputs=gr.Audio(type="filepath"),
72
  outputs="label",
73
  title="Animal Sound Classifier",
74
- description="Upload a short animal sound clip (2-5 seconds)",
75
  examples=["example.wav"] if os.path.exists("example.wav") else None
76
  ).launch()
 
5
  from scipy.io import wavfile
6
  import os
7
 
8
+ # Load assets
9
  model = tf.keras.models.load_model("animal_sound_cnn.keras")
10
  label_encoder = joblib.load("label_encoder.joblib")
11
 
12
def get_model_input_shape(net=None):
    """Return the model's expected per-sample input shape.

    Args:
        net: Keras model to inspect; defaults to the module-level ``model``.

    Returns:
        A tuple of the non-batch dimensions — ``(384,)`` for a flat
        ``(None, 384)`` input, ``(64, 64, 1)`` for an image-like
        ``(None, 64, 64, 1)`` input — or ``None`` for unsupported ranks.
    """
    net = model if net is None else net
    shape = net.input_shape
    if len(shape) == 2:
        # Return a 1-tuple rather than a bare int so callers can uniformly
        # apply len(...) and [...] indexing; the previous bare-int return
        # made the caller's len(expected_shape) raise TypeError, silently
        # breaking the flat-input path.
        return (shape[1],)
    if len(shape) == 4:
        return shape[1:]
    return None
19
+
20
def preprocess_audio(audio_path):
    """Load a WAV file and shape it into a batch of one model input.

    Adapts to the model's reported input shape: produces either a
    padded/trimmed flat feature vector (e.g. ``(1, 384)``) or a 64x64
    log-mel "image" (``(1, 64, 64, 1)``).

    Args:
        audio_path: Path to a WAV file readable by ``scipy.io.wavfile``.

    Returns:
        A float32 numpy array ready for ``model.predict``, or ``None`` if
        preprocessing fails.
    """
    try:
        # 1. Load and normalize audio
        sr, y = wavfile.read(audio_path)
        if y.ndim > 1:  # stereo -> mono
            y = np.mean(y, axis=1)
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:  # guard: a silent clip would otherwise divide by zero
            y = y / peak

        # 2. Magnitude spectrogram via STFT
        n_fft = 512
        hop_length = 256
        stft = tf.signal.stft(y, frame_length=n_fft, frame_step=hop_length,
                              fft_length=n_fft)
        spectrogram = tf.abs(stft)

        # 3. Reshape according to what the model reports it expects.
        expected_shape = get_model_input_shape()
        if isinstance(expected_shape, int):
            # Tolerate a helper that returns a bare int for flat inputs,
            # so the len()/indexing below is always valid.
            expected_shape = (expected_shape,)

        if expected_shape and len(expected_shape) == 1:  # flat input, e.g. (384,)
            target = expected_shape[0]
            flattened = tf.reshape(spectrogram, (1, -1))
            if flattened.shape[1] < target:
                flattened = tf.pad(
                    flattened, [[0, 0], [0, target - flattened.shape[1]]])
            else:
                flattened = flattened[:, :target]
            return flattened.numpy().astype(np.float32)

        # Image-like input (e.g. (64, 64, 1)): build a log-mel spectrogram.
        # NOTE(review): assumes sr >= 15000 so the 7500 Hz upper edge is
        # below Nyquist — confirm against the expected input clips.
        linear_to_mel = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=64,
            num_spectrogram_bins=spectrogram.shape[-1],
            sample_rate=sr,  # use the file's real rate, not a hard-coded 22050
            lower_edge_hertz=125,
            upper_edge_hertz=7500)
        mel_spectrogram = tf.tensordot(spectrogram, linear_to_mel, 1)
        log_mel = tf.math.log(mel_spectrogram + 1e-6)

        # Resize to 64x64, then add channel and batch dimensions.
        resized = tf.image.resize(tf.expand_dims(log_mel, -1), (64, 64))
        return tf.expand_dims(resized, 0).numpy().astype(np.float32)

    except Exception as e:
        # Best-effort contract: the caller treats None as "invalid audio".
        print(f"Preprocessing error: {str(e)}")
        return None
63
 
64
def predict(audio_path):
    """Classify an animal sound clip and return the predicted label.

    Returns the decoded class name on success, or a human-readable error
    string when preprocessing or inference fails.
    """
    try:
        features = preprocess_audio(audio_path)
        if features is None:
            return "Error: Invalid audio input"

        # Debug aid: show exactly what shape reaches the model.
        print(f"Final input shape: {features.shape}")

        scores = model.predict(features)
        best = np.argmax(scores)
        return label_encoder.inverse_transform([best])[0]

    except Exception as e:
        return f"Prediction failed: {str(e)}"
 
 
 
 
 
 
 
 
77
 
78
# Build and launch the Gradio UI: an audio file in, a predicted label out.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),
    outputs="label",
    title="Animal Sound Classifier",
    examples=["example.wav"] if os.path.exists("example.wav") else None,
)
demo.launch()