Spaces:

fevot
/

iti110

Build error

App Files Files Community

fevot committed on Feb 26, 2025

Commit

5cc1efc

verified ·

1 Parent(s): 1e646fa

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -53

app.py CHANGED Viewed

@@ -1,83 +1,100 @@
-import gradio as gr
 import torch
-from torch import nn
-import cv2
 import numpy as np
 import json
-from torchvision import models
-import librosa
 class BirdCallRNN(nn.Module):
     def __init__(self, resnet, num_classes):
-        super(BirdCallRNN, self).__init__()
         self.resnet = resnet
-        num_features = self.resnet.fc.in_features
-        self.resnet.fc = nn.Identity()
-        self.rnn = nn.LSTM(input_size=num_features, hidden_size=256, num_layers=2, batch_first=True, bidirectional=True)
-        self.fc = nn.Linear(512, num_classes)
     def forward(self, x):
         batch, seq_len, C, H, W = x.size()
-        x = x.view(batch * seq_len, C, H, W)
-        features = self.resnet(x)
-        features = features.view(batch, seq_len, -1)
-        rnn_out, _ = self.rnn(features)
-        output = self.fc(rnn_out[:, -1, :])
         return output
-def mp3_to_mel_spectrogram(mp3_file, target_shape=(128, 500), resize_shape=(224, 224)):
-    y, sr = librosa.load(mp3_file, sr=None)
-    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
-    log_S = librosa.power_to_db(S, ref=np.max)
-    current_time_steps = log_S.shape[1]
-    target_time_steps = target_shape[1]
-    if current_time_steps < target_time_steps:
-        pad_width = target_time_steps - current_time_steps
-        log_S_resized = np.pad(log_S, ((0, 0), (0, pad_width)), mode='constant')
-    elif current_time_steps > target_time_steps:
-        log_S_resized = log_S[:, :target_time_steps]
-    else:
-        log_S_resized = log_S
-    log_S_resized = cv2.resize(log_S_resized, resize_shape, interpolation=cv2.INTER_CUBIC)
-    return log_S_resized
-def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
-    model.eval()
     y, sr = librosa.load(mp3_file, sr=None)
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     log_S = librosa.power_to_db(S, ref=np.max)
     num_segments = log_S.shape[1] // segment_length
     if num_segments == 0:
         segments = [log_S]
     else:
         segments = [log_S[:, i * segment_length:(i + 1) * segment_length] for i in range(num_segments)]
     segment_tensors = []
     for seg in segments:
         seg_resized = cv2.resize(seg, (224, 224), interpolation=cv2.INTER_CUBIC)
         seg_rgb = np.repeat(seg_resized[:, :, np.newaxis], 3, axis=-1)
-        seg_tensor = torch.Tensor(seg_rgb).permute(2, 0, 1).float()
         segment_tensors.append(seg_tensor)
     sequence = torch.stack(segment_tensors, dim=0).unsqueeze(0).to(device)
-    output = model(sequence)
-    pred = torch.max(output, dim=1)[1].cpu().numpy()[0]
-    with open('class_names.json', 'r') as f:
-        class_names = json.load(f)
-    predicted_bird = class_names[pred]
-    return predicted_bird
-# Load model and set up
-resnet = models.resnet50(weights='IMAGENET1K_V2')
-with open('class_names.json', 'r') as f:
-    class_names = json.load(f)
-num_classes = len(class_names)
-model = BirdCallRNN(resnet, num_classes)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-model.load_state_dict(torch.load('birdcall_model.pth', map_location=device))
-model.eval()
-def predict_bird(file_path):
-    return infer_birdcall(model, file_path, segment_length=500, device=str(device))
-interface = gr.Interface(fn=predict_bird, inputs=gr.File(label="Upload MP3 file", file_types=['.mp3']), outputs=gr.Textbox(label="Predicted Bird Species"))
-interface.launch()

 import torch
+import torch.nn as nn
+import torchvision.models as models
+import librosa
 import numpy as np
+import cv2
 import json
+import gradio as gr
+# --------------------------
+# Define the Model Architecture
+# --------------------------
 class BirdCallRNN(nn.Module):
+    """
+    CNN backbone followed by a 2-layer bidirectional LSTM and a linear classifier.
+    Every element of the input sequence goes through `resnet`; the per-step features
+    feed the LSTM, and the LSTM output at the last time step is mapped to class scores.
+    """
     def __init__(self, resnet, num_classes):
+        # resnet: backbone module exposing an `fc` attribute, either the original head
+        #         (anything with `in_features`) or an already-stripped replacement.
+        # num_classes: number of output scores produced by the final linear layer.
+        super(BirdCallRNN, self).__init__()
         self.resnet = resnet
+        head = resnet.fc
+        if hasattr(head, "in_features"):
+            # Head still attached: take the feature size from it, then strip it so the
+            # backbone emits feature vectors instead of class logits.
+            feature_dim = head.in_features
+            resnet.fc = nn.Identity()
+        else:
+            # Head already replaced (e.g. by nn.Identity, which has no `in_features`):
+            # measure the feature size instead of failing with AttributeError.
+            feature_dim = self._probe_feature_dim(resnet)
+        # RNN expects input of shape (batch, seq_len, feature_dim)
+        self.rnn = nn.LSTM(input_size=feature_dim, hidden_size=256, num_layers=2, batch_first=True, bidirectional=True)
+        self.fc = nn.Linear(512, num_classes)  # 512 = 2 * hidden_size (bidirectional)
+    @staticmethod
+    def _probe_feature_dim(backbone):
+        # Return the size of the backbone's output vector, found with one dummy forward pass.
+        was_training = backbone.training
+        backbone.eval()  # keep BatchNorm running statistics untouched by the probe
+        try:
+            param = next(backbone.parameters(), None)
+            if param is None:
+                probe = torch.zeros(1, 3, 224, 224)
+            else:
+                probe = torch.zeros(1, 3, 224, 224, device=param.device, dtype=param.dtype)
+            with torch.no_grad():
+                feature_dim = backbone(probe).shape[-1]
+        finally:
+            backbone.train(was_training)  # restore the caller's train/eval mode
+        return feature_dim
     def forward(self, x):
+        # x shape: (batch, seq_len, 3, 224, 224)
         batch, seq_len, C, H, W = x.size()
+        x = x.view(batch * seq_len, C, H, W)  # (batch * seq_len, 3, 224, 224)
+        features = self.resnet(x)             # (batch * seq_len, feature_dim)
+        features = features.view(batch, seq_len, -1)  # (batch, seq_len, feature_dim)
+        rnn_out, _ = self.rnn(features)       # (batch, seq_len, 512)
+        output = self.fc(rnn_out[:, -1, :])     # Use last time step for classification
         return output
+# --------------------------
+# Load Model Weights and Class Mapping
+# --------------------------
+# Load class mapping from JSON file (index -> class name)
+with open("class_mapping.json", "r", encoding="utf-8") as f:
+    class_mapping = json.load(f)
+num_classes = len(class_mapping)
+# Load pre-trained ResNet50. Its classification head must still be attached at this
+# point: BirdCallRNN.__init__ reads resnet.fc.in_features to size the LSTM input.
+resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
+# Initialize the BirdCallRNN model, then remove the ResNet classification head so the
+# backbone emits feature vectors (model.resnet is this same resnet object).
+model = BirdCallRNN(resnet, num_classes)
+resnet.fc = nn.Identity()
+# Load the trained weights onto the selected device and switch to inference mode
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+model.load_state_dict(torch.load("model_weights.pth", map_location=device))
+model.eval()
+# --------------------------
+# Inference Function
+# --------------------------
+def predict_bird(mp3_file):
+    """
+    Given an uploaded MP3 file, process it and predict the bird species.
+    mp3_file: path of the audio file; it is handed to librosa.load unchanged.
+    Returns the name stored in class_mapping under str(predicted index), "Unknown" when
+    that key is missing, or a short notice when no file path was supplied.
+    """
+    # No path supplied (None or empty string; presumably what Gradio sends when the form
+    # is submitted without an upload): report it instead of failing inside librosa.load.
+    if not mp3_file:
+        return "No audio file provided."
+    # Load the audio file (Gradio provides a temporary file path)
     y, sr = librosa.load(mp3_file, sr=None)
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     log_S = librosa.power_to_db(S, ref=np.max)
+    # Define segment length and segment the spectrogram.
+    # Only whole segments are kept: frames past num_segments * segment_length are dropped,
+    # and a clip shorter than one segment is used whole as a single segment.
+    segment_length = 500
     num_segments = log_S.shape[1] // segment_length
     if num_segments == 0:
         segments = [log_S]
     else:
         segments = [log_S[:, i * segment_length:(i + 1) * segment_length] for i in range(num_segments)]
     segment_tensors = []
     for seg in segments:
+        # Resize each segment to 224x224 and replicate the single channel to 3 channels
         seg_resized = cv2.resize(seg, (224, 224), interpolation=cv2.INTER_CUBIC)
         seg_rgb = np.repeat(seg_resized[:, :, np.newaxis], 3, axis=-1)
+        seg_tensor = torch.tensor(seg_rgb, dtype=torch.float32).permute(2, 0, 1)  # (3, 224, 224)
         segment_tensors.append(seg_tensor)
+    # Stack segments to form a sequence: (1, seq_len, 3, 224, 224)
     sequence = torch.stack(segment_tensors, dim=0).unsqueeze(0).to(device)
+    # Inference only: no gradients are needed
+    with torch.no_grad():
+        output = model(sequence)
+        pred = torch.argmax(output, dim=1).cpu().numpy()[0]
+    # Look up the predicted class name (class_mapping is read from JSON, so the lookup uses a string key)
+    predicted_bird = class_mapping.get(str(pred), "Unknown")
+    return predicted_bird
+# --------------------------
+# Create Gradio Interface
+# --------------------------
+# Build the web UI: one audio input whose saved file path is passed to predict_bird,
+# and a plain-text output showing the predicted species.
+iface = gr.Interface(
+    fn=predict_bird,
+    # The `source="upload"` keyword is rejected by newer Gradio releases (it was replaced
+    # by `sources`); omitting it keeps file upload available on either API.
+    inputs=gr.Audio(type="filepath"),
+    outputs="text",
+    title="BirdCall Classification",
+    description="Upload an MP3 file of a bird call to classify the bird species.",
+)
+# Start the server only when this file is run as a script, not when it is imported
+if __name__ == "__main__":
+    iface.launch()