Spaces:

fevot
/

iti110

Build error

App Files Community

fevot committed on Feb 26, 2025

Commit

a9eca6f

verified ·

1 Parent(s): 41701b7

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -73

app.py CHANGED Viewed

@@ -1,102 +1,82 @@
 import torch
-import torch.nn as nn
-import torchvision.models as models
-import librosa
-import numpy as np
 import cv2
 import json
-import gradio as gr
# --------------------------
# Define the Model Architecture
# --------------------------
class BirdCallRNN(nn.Module):
    """Classify a sequence of spectrogram images with a CNN backbone + BiLSTM.

    Every image in the sequence is passed through ``resnet`` to obtain one
    feature vector per time step; the vectors are fed to a bidirectional
    LSTM and the LSTM output at the last time step is projected to class
    logits.

    Args:
        resnet: Feature extractor mapping ``(N, C, H, W)`` images to
            ``(N, num_features)`` vectors.
        num_classes: Number of output logits.
        num_features: Size of the feature vector produced by ``resnet``.
        hidden_size: Hidden size of each LSTM direction (default 256).
        num_layers: Number of stacked LSTM layers (default 2).
    """

    def __init__(self, resnet, num_classes, num_features, hidden_size=256, num_layers=2):
        super().__init__()
        self.resnet = resnet
        # RNN expects input of shape (batch, seq_len, feature_dim)
        self.rnn = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # A bidirectional LSTM concatenates both directions, so the classifier
        # input is derived from hidden_size instead of a hard-coded 512.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, C, H, W)``.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                f"expected input of shape (batch, seq_len, C, H, W), got {tuple(x.shape)}"
            )
        batch, seq_len, C, H, W = x.size()
        # reshape (not view) so non-contiguous inputs, e.g. a transposed
        # (seq_len, batch, ...) tensor, are accepted as well.
        frames = x.reshape(batch * seq_len, C, H, W)
        features = self.resnet(frames)                    # (batch * seq_len, feature_dim)
        features = features.reshape(batch, seq_len, -1)   # (batch, seq_len, feature_dim)
        rnn_out, _ = self.rnn(features)                   # (batch, seq_len, 2 * hidden_size)
        # Use the last time step for classification.
        return self.fc(rnn_out[:, -1, :])
# --------------------------
# Load Model Weights and Class Mapping
# --------------------------
# Load class mapping from JSON file (index -> class name)
with open("class_mapping.json", "r", encoding="utf-8") as f:
    class_mapping = json.load(f)
num_classes = len(class_mapping)

# Build the ResNet50 backbone without fetching the ImageNet checkpoint: the
# strict load_state_dict() call below requires (and overwrites) every backbone
# parameter, so a pretrained download would only slow down start-up.
resnet = models.resnet50(weights=None)
num_features = resnet.fc.in_features  # Capture in_features before replacing fc
resnet.fc = nn.Identity()             # Remove the classification head

# Initialize the BirdCallRNN model and load trained weights
model = BirdCallRNN(resnet, num_classes, num_features)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.load_state_dict(torch.load("model_weights.pth", map_location=device))
model.eval()
# --------------------------
# Inference Function
# --------------------------
def predict_bird(mp3_file):
    """Predict the bird species for an uploaded audio file.

    The audio is converted to a log-mel spectrogram, cut into windows of 500
    spectrogram frames (a trailing partial window is dropped unless the clip
    is shorter than one window), and each window is resized to 224x224 and
    replicated to three channels before the sequence is fed to ``model``.

    Args:
        mp3_file: Path of the audio file to classify.

    Returns:
        The class name looked up in ``class_mapping``, or ``"Unknown"`` when
        the predicted index has no entry.
    """
    window = 500  # spectrogram frames per segment

    waveform, sample_rate = librosa.load(mp3_file, sr=None)
    mel = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128, fmax=8000)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    full_windows = mel_db.shape[1] // window
    if full_windows:
        chunks = [
            mel_db[:, start:start + window]
            for start in range(0, full_windows * window, window)
        ]
    else:
        # Clip shorter than one window: classify it as a single segment.
        chunks = [mel_db]

    def to_image_tensor(chunk):
        # Resize to the backbone's input size and copy the single channel 3x.
        resized = cv2.resize(chunk, (224, 224), interpolation=cv2.INTER_CUBIC)
        three_channel = np.repeat(resized[:, :, np.newaxis], 3, axis=-1)
        return torch.tensor(three_channel, dtype=torch.float32).permute(2, 0, 1)

    frames = [to_image_tensor(chunk) for chunk in chunks]
    # (1, seq_len, 3, 224, 224)
    sequence = torch.stack(frames, dim=0).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(sequence)
        best = torch.argmax(logits, dim=1).cpu().numpy()[0]

    return class_mapping.get(str(best), "Unknown")
# --------------------------
# Create Gradio Interface
# --------------------------
# NOTE(review): `source="upload"` is the Gradio 3.x spelling of this option;
# confirm it against the Gradio version pinned for this Space.
iface = gr.Interface(
    predict_bird,
    gr.Audio(source="upload", type="filepath"),
    "text",
    title="BirdCall Classification",
    description="Upload an MP3 file of a bird call to classify the bird species.",
)

# Only start the web server when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()

+import gradio as gr
 import torch
+from torch import nn
 import cv2
+import numpy as np
 import json
+from torchvision import models
+import librosa
 class BirdCallRNN(nn.Module):
+    def __init__(self, resnet, num_features, num_classes):
+        super(BirdCallRNN, self).__init__()
         self.resnet = resnet
+        self.rnn = nn.LSTM(input_size=num_features, hidden_size=256, num_layers=2, batch_first=True, bidirectional=True)
+        self.fc = nn.Linear(512, num_classes)
     def forward(self, x):
         batch, seq_len, C, H, W = x.size()
+        x = x.view(batch * seq_len, C, H, W)
+        features = self.resnet(x)
+        features = features.view(batch, seq_len, -1)
+        rnn_out, _ = self.rnn(features)
+        output = self.fc(rnn_out[:, -1, :])
         return output
def mp3_to_mel_spectrogram(mp3_file, target_shape=(128, 500), resize_shape=(224, 224), pad_value=0.0):
    """Load an audio file and return a fixed-size log-mel spectrogram image.

    The spectrogram is cropped or right-padded along the time axis to
    ``target_shape[1]`` frames and then resized with bicubic interpolation.

    Args:
        mp3_file: Path (or anything ``librosa.load`` accepts) of the audio.
        target_shape: ``(n_mels, time_steps)``. Only ``time_steps`` is used;
            the number of mel bands is fixed at 128 below.
        resize_shape: ``dsize`` handed to ``cv2.resize``.
        pad_value: Value used to right-pad clips shorter than the target
            length. The default (0.0) keeps the historical behaviour.
            NOTE(review): with ``ref=np.max`` the dB scale peaks at 0, so a
            0.0 pad is "full volume" rather than silence; confirm what the
            training pipeline used before changing the default.

    Returns:
        2-D ``numpy`` array holding the resized log-mel spectrogram.
    """
    y, sr = librosa.load(mp3_file, sr=None)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    log_S = librosa.power_to_db(S, ref=np.max)

    target_time_steps = target_shape[1]
    # Crop first (a no-op for short clips), then pad whatever is still missing.
    log_S_resized = log_S[:, :target_time_steps]
    missing = target_time_steps - log_S_resized.shape[1]
    if missing > 0:
        log_S_resized = np.pad(
            log_S_resized,
            ((0, 0), (0, missing)),
            mode='constant',
            constant_values=pad_value,
        )

    return cv2.resize(log_S_resized, resize_shape, interpolation=cv2.INTER_CUBIC)
def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
    """Predict the bird species for one audio file.

    The audio is converted to a log-mel spectrogram and split into
    consecutive windows of ``segment_length`` frames (a trailing partial
    window is dropped; a clip shorter than one window is used whole). Each
    window is resized to 224x224, replicated to three channels, and the
    resulting sequence is classified by ``model``.

    Args:
        model: Module accepting a ``(1, seq_len, 3, 224, 224)`` tensor and
            returning ``(1, num_classes)`` scores.
        mp3_file: Path (or anything ``librosa.load`` accepts) of the audio.
        segment_length: Number of spectrogram frames per segment.
        device: Device the input sequence is moved to before inference.

    Returns:
        The predicted class name read from ``class_names.json``.
    """
    model.eval()

    y, sr = librosa.load(mp3_file, sr=None)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    log_S = librosa.power_to_db(S, ref=np.max)

    num_segments = log_S.shape[1] // segment_length
    if num_segments == 0:
        segments = [log_S]
    else:
        segments = [
            log_S[:, i * segment_length:(i + 1) * segment_length]
            for i in range(num_segments)
        ]

    segment_tensors = []
    for seg in segments:
        seg_resized = cv2.resize(seg, (224, 224), interpolation=cv2.INTER_CUBIC)
        seg_rgb = np.repeat(seg_resized[:, :, np.newaxis], 3, axis=-1)
        segment_tensors.append(torch.from_numpy(seg_rgb).permute(2, 0, 1).float())
    sequence = torch.stack(segment_tensors, dim=0).unsqueeze(0).to(device)

    # Inference only: without no_grad() autograd keeps every backbone
    # activation for every segment alive, which wastes memory on long clips.
    with torch.no_grad():
        output = model(sequence)
    pred = int(torch.argmax(output, dim=1)[0])

    with open('class_names.json', 'r', encoding='utf-8') as f:
        class_names = json.load(f)
    # Accept both a JSON list of names and an {"index": "name"} mapping;
    # JSON object keys are always strings, so an int lookup would fail there.
    if isinstance(class_names, dict):
        return class_names.get(str(pred), "Unknown")
    return class_names[pred]
# Build the ResNet50 backbone without fetching the ImageNet checkpoint: the
# strict load_state_dict() call below requires (and overwrites) every backbone
# parameter, so a pretrained download would only slow down start-up.
resnet = models.resnet50(weights=None)
num_features = resnet.fc.in_features  # read before the head is replaced
resnet.fc = nn.Identity()             # backbone now returns pooled features

# The number of output classes is the number of entries in class_names.json.
with open('class_names.json', 'r', encoding='utf-8') as f:
    class_names = json.load(f)
num_classes = len(class_names)

model = BirdCallRNN(resnet, num_features, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.load_state_dict(torch.load('birdcall_model.pth', map_location=device))
model.eval()
def predict_bird(file_path):
    """Gradio callback: classify the uploaded file and return the species name.

    Args:
        file_path: Value delivered by the ``gr.File`` input. Depending on the
            Gradio version this is a path string or a temporary-file wrapper
            whose ``.name`` is the path; ``None`` when nothing was uploaded.

    Returns:
        The predicted class name, or a short message when no file was given.
    """
    if file_path is None:
        return "No file uploaded."
    # Normalise to a filesystem path; plain strings have no `.name`.
    path = getattr(file_path, "name", file_path)
    return infer_birdcall(model, path, segment_length=500, device=str(device))
# Single-file upload in, predicted species name out.
interface = gr.Interface(
    fn=predict_bird,
    inputs=gr.File(label="Upload MP3 file", file_types=['.mp3']),
    outputs=gr.Textbox(label="Predicted Bird Species"),
)

# Start the server only when run as a script so importing this module
# (for tests or reuse of the model code) does not launch the web app.
if __name__ == "__main__":
    interface.launch()