Spaces:

AKMESSI
/

bird-identifier

Sleeping

App Files Community

AKMESSI committed on Jan 5

Commit

44dff6a

verified ·

1 Parent(s): f2d115f

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -5

app.py CHANGED Viewed

@@ -32,20 +32,63 @@ def load_model_and_map():
     # Load the checkpoint
     checkpoint = torch.load("multi_species_model.pth", map_location="cpu")
     # Create model directly from torchvision instead of torch.hub
     model = models.mobilenet_v3_small(pretrained=False)
-    num_classes = len(checkpoint['label_map'])
     model.classifier[3] = torch.nn.Linear(model.classifier[3].in_features, num_classes)
-    model.load_state_dict(checkpoint['model_state_dict'])
     model.eval()
-    # Get class names (scientific names)
-    class_names = list(checkpoint['label_map'].keys())
     return model, class_names
 model, class_names = load_model_and_map()
 # Show status of audio backend
 if not TORCHAUDIO_AVAILABLE:
     st.info("ℹ️ Using soundfile backend for audio processing (torchaudio not available)")
@@ -124,6 +167,9 @@ if audio_data:
             audio_bytes = audio_data.read()
             audio_data.seek(0)  # Reset file pointer
             if TORCHAUDIO_AVAILABLE:
                 try:
                     waveform, original_sr = torchaudio.load(io.BytesIO(audio_bytes))
@@ -143,6 +189,10 @@ if audio_data:
                     waveform = waveform.mean(dim=1)
                 waveform = waveform.unsqueeze(0)
             # Resample to 22050 if needed
             if original_sr != 22050:
                 if TORCHAUDIO_AVAILABLE:
@@ -170,6 +220,8 @@ if audio_data:
             else:
                 waveform = waveform[:, :target_samples]
             # Compute Mel spectrogram
             if TORCHAUDIO_AVAILABLE:
                 mel = full_transform(waveform)  # (1, 128, time)
@@ -178,6 +230,16 @@ if audio_data:
                 mel = full_transform(waveform)  # (1, 128, time)
                 mel = mel.squeeze(0)  # (128, time)
             # Normalize for visualization
             mel_min = mel.min()
             mel_max = mel.max()
@@ -185,15 +247,33 @@ if audio_data:
             # Prepare for model: resize to 224x224, add batch & RGB channels
             mel_input = mel.unsqueeze(0).unsqueeze(0)  # (1, 1, 128, time)
             mel_input = torch.nn.functional.interpolate(mel_input, size=(224, 224), mode='bilinear', align_corners=False)
             mel_input = mel_input.repeat(1, 3, 1, 1)  # to RGB
             # Inference
             with torch.no_grad():
                 output = model(mel_input)
                 probs = torch.nn.functional.softmax(output[0], dim=0)
                 top5_probs, top5_idx = torch.topk(probs, 5)
             # Determine confidence level
             top1_confidence = top5_probs[0].item()
             top1_species = class_names[top5_idx[0]]
@@ -258,7 +338,7 @@ if audio_data:
             st.markdown("---")
             with st.expander("📊 View Audio Spectrogram"):
                 mel_vis = mel_norm.cpu().numpy()
-                st.image(mel_vis, caption="Mel Spectrogram of your audio", use_column_width=True, clamp=True)
                 st.caption("This visualization shows the frequency content of the bird call over time.")
         except Exception as e:

     # Load the checkpoint
     checkpoint = torch.load("multi_species_model.pth", map_location="cpu")
+    # Debug: Check what's in the checkpoint
+    st.write("🔍 **Checkpoint Keys:**", list(checkpoint.keys()))
+    # Get label map
+    label_map = checkpoint['label_map']
+    st.write(f"📋 **Number of classes in checkpoint:** {len(label_map)}")
+    st.write(f"📝 **First 5 species in label_map:**", list(label_map.keys())[:5])
+    st.write(f"🔢 **Label map type:**", type(label_map))
     # Create model directly from torchvision instead of torch.hub
     model = models.mobilenet_v3_small(pretrained=False)
+    num_classes = len(label_map)
+    st.write(f"🧠 **Model output classes:** {num_classes}")
+    st.write(f"🔧 **Original classifier final layer:** {model.classifier[3]}")
+    # Replace final layer
     model.classifier[3] = torch.nn.Linear(model.classifier[3].in_features, num_classes)
+    st.write(f"✅ **New classifier final layer:** {model.classifier[3]}")
+    # Load state dict
+    try:
+        model.load_state_dict(checkpoint['model_state_dict'])
+        st.success(f"✅ Model weights loaded successfully!")
+    except Exception as e:
+        st.error(f"❌ Error loading model weights: {e}")
+        st.stop()
     model.eval()
+    # Get class names - THIS IS CRITICAL
+    # The label_map from your checkpoint should be {species_name: index}
+    # We need to create a list where list[index] = species_name
+    if isinstance(list(label_map.keys())[0], str):
+        # label_map is {species_name: index}, need to invert it
+        st.info("📖 Label map format: {species_name: index}")
+        # Create inverse mapping: index -> species_name
+        index_to_species = {v: k for k, v in label_map.items()}
+        # Create ordered list by index
+        class_names = [index_to_species[i] for i in range(len(label_map))]
+    else:
+        # label_map is {index: species_name}
+        st.info("📖 Label map format: {index: species_name}")
+        class_names = [label_map[i] for i in sorted(label_map.keys())]
+    st.write(f"🐦 **Total species loaded:** {len(class_names)}")
+    st.write(f"🔤 **Class names sample (indices 0-4):**")
+    for i in range(min(5, len(class_names))):
+        st.write(f"   Index {i}: {class_names[i]}")
     return model, class_names
 model, class_names = load_model_and_map()
+st.markdown("---")
 # Show status of audio backend
 if not TORCHAUDIO_AVAILABLE:
     st.info("ℹ️ Using soundfile backend for audio processing (torchaudio not available)")
             audio_bytes = audio_data.read()
             audio_data.seek(0)  # Reset file pointer
+            # Debug: Show file info
+            st.info(f"📁 File size: {len(audio_bytes) / 1024:.1f} KB")
             if TORCHAUDIO_AVAILABLE:
                 try:
                     waveform, original_sr = torchaudio.load(io.BytesIO(audio_bytes))
                     waveform = waveform.mean(dim=1)
                 waveform = waveform.unsqueeze(0)
+            # Debug info
+            st.info(f"🎵 Original sample rate: {original_sr} Hz, Duration: {waveform.shape[1] / original_sr:.2f} seconds")
+            st.info(f"📊 Waveform shape: {waveform.shape}")
             # Resample to 22050 if needed
             if original_sr != 22050:
                 if TORCHAUDIO_AVAILABLE:
             else:
                 waveform = waveform[:, :target_samples]
+            st.info(f"✂️ Processed to 5 seconds: {waveform.shape}")
             # Compute Mel spectrogram
             if TORCHAUDIO_AVAILABLE:
                 mel = full_transform(waveform)  # (1, 128, time)
                 mel = full_transform(waveform)  # (1, 128, time)
                 mel = mel.squeeze(0)  # (128, time)
+            st.info(f"🎼 Mel spectrogram shape: {mel.shape}")
+            # Check if mel spectrogram is valid
+            if torch.isnan(mel).any() or torch.isinf(mel).any():
+                st.error("⚠️ Invalid mel spectrogram detected (NaN or Inf values)")
+                st.stop()
+            # Show mel spectrogram statistics
+            st.info(f"📈 Mel stats - Min: {mel.min():.2f}, Max: {mel.max():.2f}, Mean: {mel.mean():.2f}")
             # Normalize for visualization
             mel_min = mel.min()
             mel_max = mel.max()
             # Prepare for model: resize to 224x224, add batch & RGB channels
             mel_input = mel.unsqueeze(0).unsqueeze(0)  # (1, 1, 128, time)
+            st.info(f"🔧 Before resize: {mel_input.shape}")
             mel_input = torch.nn.functional.interpolate(mel_input, size=(224, 224), mode='bilinear', align_corners=False)
+            st.info(f"📐 After resize to 224x224: {mel_input.shape}")
             mel_input = mel_input.repeat(1, 3, 1, 1)  # to RGB
+            st.info(f"🎨 After RGB conversion: {mel_input.shape}")
+            # Show input statistics
+            st.info(f"🔢 Model input stats - Min: {mel_input.min():.2f}, Max: {mel_input.max():.2f}, Mean: {mel_input.mean():.2f}")
             # Inference
             with torch.no_grad():
                 output = model(mel_input)
+                st.info(f"🧠 Raw model output shape: {output.shape}")
+                st.info(f"📊 Raw output stats - Min: {output.min():.2f}, Max: {output.max():.2f}")
                 probs = torch.nn.functional.softmax(output[0], dim=0)
+                st.info(f"🎲 Probabilities sum: {probs.sum():.4f} (should be ~1.0)")
                 top5_probs, top5_idx = torch.topk(probs, 5)
+            # Show raw top 5 for debugging
+            with st.expander("🔍 DEBUG: Raw Top 5 Predictions"):
+                for i in range(5):
+                    st.write(f"{i+1}. Index: {top5_idx[i].item()}, Prob: {top5_probs[i].item():.4f}, Species: {class_names[top5_idx[i]]}")
             # Determine confidence level
             top1_confidence = top5_probs[0].item()
             top1_species = class_names[top5_idx[0]]
             st.markdown("---")
             with st.expander("📊 View Audio Spectrogram"):
                 mel_vis = mel_norm.cpu().numpy()
+                st.image(mel_vis, caption="Mel Spectrogram of your audio", use_container_width=True, clamp=True)
                 st.caption("This visualization shows the frequency content of the bird call over time.")
         except Exception as e: