Spaces:

fevot
/

iti110

Sleeping

App Files Files Community

fevot committed on Feb 26, 2025

Commit

fcf045e

verified ·

1 Parent(s): f604fd4

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -93

app.py CHANGED Viewed

@@ -6,9 +6,7 @@ import numpy as np
 import json
 from torchvision import models
 import librosa
-import matplotlib.pyplot as plt
-import io
-from PIL import Image
 # Define the BirdCallRNN model
 class BirdCallRNN(nn.Module):
@@ -27,7 +25,7 @@ class BirdCallRNN(nn.Module):
         output = self.fc(rnn_out[:, -1, :])  # Note: We'll use this for single-segment sequences
         return output
-# Function to convert MP3 to mel spectrogram
 def mp3_to_mel_spectrogram(mp3_file, target_shape=(128, 500), resize_shape=(224, 224)):
     y, sr = librosa.load(mp3_file, sr=None)
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
@@ -44,26 +42,11 @@ def mp3_to_mel_spectrogram(mp3_file, target_shape=(128, 500), resize_shape=(224,
     log_S_resized = cv2.resize(log_S_resized, resize_shape, interpolation=cv2.INTER_CUBIC)
     return log_S_resized
-# Generate mel spectrogram image for display
-def generate_mel_spectrogram_plot(log_S):
-    plt.figure(figsize=(10, 4))
-    plt.imshow(log_S, aspect='auto', origin='lower', cmap='viridis')
-    plt.colorbar(format='%+2.0f dB')
-    plt.title('Mel Spectrogram')
-    plt.tight_layout()
-    # Save plot to a bytes buffer
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    plt.close()
-    buf.seek(0)
-    return Image.open(buf)
 # Load class mapping globally
 with open('class_mapping.json', 'r') as f:
     class_names = json.load(f)
-# Revised inference function to predict per segment with confidence scores
 def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
     model.eval()
     # Load audio and compute mel spectrogram
@@ -71,39 +54,24 @@ def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     log_S = librosa.power_to_db(S, ref=np.max)
     # Segment the spectrogram
-    num_segments = max(1, log_S.shape[1] // segment_length)
-    segments = [log_S[:, i * segment_length:min((i + 1) * segment_length, log_S.shape[1])] for i in range(num_segments)]
     predictions = []
-    confidence_scores = []
-    spectrogram_images = []
     # Process each segment individually
     for seg in segments:
-        # Generate spectrogram image first
-        spec_img = generate_mel_spectrogram_plot(seg)
-        spectrogram_images.append(spec_img)
-        # Prepare for model input
         seg_resized = cv2.resize(seg, (224, 224), interpolation=cv2.INTER_CUBIC)
         seg_rgb = np.repeat(seg_resized[:, :, np.newaxis], 3, axis=-1)
         # Create a tensor with batch size 1 and sequence length 1
         seg_tensor = torch.from_numpy(seg_rgb).permute(2, 0, 1).float().unsqueeze(0).unsqueeze(0).to(device)  # Shape: (1, 1, 3, 224, 224)
-        with torch.no_grad():
-            output = model(seg_tensor)
-        # Get prediction
-        probabilities = torch.nn.functional.softmax(output, dim=1)
-        confidence, pred_idx = torch.max(probabilities, dim=1)
-        pred = pred_idx.cpu().numpy()[0]
-        conf = confidence.cpu().numpy()[0]
         predicted_bird = class_names[str(pred)]  # Convert pred to string to match JSON keys
         predictions.append(predicted_bird)
-        confidence_scores.append(conf)
-    return predictions, confidence_scores, spectrogram_images
 # Initialize the model
 resnet = models.resnet50(weights='IMAGENET1K_V2')
@@ -117,75 +85,54 @@ model.load_state_dict(torch.load('model_weights.pth', map_location=device))
 model.eval()
 # Prediction function for Gradio
-def predict_bird(file_path):
-    if file_path is None:
-        return "Please upload an audio file", [], None, None, None
-    predictions, confidence_scores, spectrograms = infer_birdcall(model, file_path, segment_length=500, device=str(device))
-    # Format the predictions with numbering and confidence
-    formatted_predictions = [f"{i+1}. {bird} (Confidence: {conf:.2%})" for i, (bird, conf) in enumerate(zip(predictions, confidence_scores))]
-    prediction_text = "\n".join(formatted_predictions)
-    # Load the three static images
-    bird_species_img = "1.jpeg"
-    bird_description_img = "2.jpeg"
-    bird_origins_img = "3.jpeg"
-    return prediction_text, spectrograms, bird_species_img, bird_description_img, bird_origins_img
-# Create Gradio blocks interface
-with gr.Blocks() as interface:
-    gr.Markdown("# Bird Call Recognition")
     with gr.Row():
         with gr.Column():
-            # File upload - fixed parameter issue
-            input_audio = gr.Audio(
-                type="filepath",
-                label="Upload MP3 file"
-            )
-            # Submit button
-            submit_btn = gr.Button("Identify Bird Species")
-    # Results section
     with gr.Row():
-        prediction_output = gr.Textbox(label="Identified Bird Species")
-    # Spectrograms gallery - removed style method
     with gr.Row():
-        spectrogram_gallery = gr.Gallery(
-            label="Mel Spectrograms by Segment",
-            show_label=True,
-            # Removed style() method that was causing errors
-            # Instead using direct parameters if available
-            grid=[2, 2],
-            height=400
-        )
-    # Bird information images
     with gr.Row():
-        bird_species_image = gr.Image(label="Bird Species")
     with gr.Row():
-        bird_description_image = gr.Image(label="Bird Description")
     with gr.Row():
-        bird_origins_image = gr.Image(label="Bird Origins")
-    # Set up the submission event
     submit_btn.click(
         fn=predict_bird,
-        inputs=input_audio,
-        outputs=[
-            prediction_output,
-            spectrogram_gallery,
-            bird_species_image,
-            bird_description_image,
-            bird_origins_image
-        ]
     )
-# Launch the interface
-interface.launch()

 import json
 from torchvision import models
 import librosa
+import os
 # Define the BirdCallRNN model
 class BirdCallRNN(nn.Module):
         output = self.fc(rnn_out[:, -1, :])  # Note: We'll use this for single-segment sequences
         return output
+# Function to convert MP3 to mel spectrogram (unchanged)
 def mp3_to_mel_spectrogram(mp3_file, target_shape=(128, 500), resize_shape=(224, 224)):
     y, sr = librosa.load(mp3_file, sr=None)
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     log_S_resized = cv2.resize(log_S_resized, resize_shape, interpolation=cv2.INTER_CUBIC)
     return log_S_resized
 # Load class mapping globally
 with open('class_mapping.json', 'r') as f:
     class_names = json.load(f)
+# Revised inference function to predict per segment
 def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
     model.eval()
     # Load audio and compute mel spectrogram
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     log_S = librosa.power_to_db(S, ref=np.max)
     # Segment the spectrogram
+    num_segments = log_S.shape[1] // segment_length
+    if num_segments == 0:
+        segments = [log_S]
+    else:
+        segments = [log_S[:, i * segment_length:(i + 1) * segment_length] for i in range(num_segments)]
     predictions = []
     # Process each segment individually
     for seg in segments:
         seg_resized = cv2.resize(seg, (224, 224), interpolation=cv2.INTER_CUBIC)
         seg_rgb = np.repeat(seg_resized[:, :, np.newaxis], 3, axis=-1)
         # Create a tensor with batch size 1 and sequence length 1
         seg_tensor = torch.from_numpy(seg_rgb).permute(2, 0, 1).float().unsqueeze(0).unsqueeze(0).to(device)  # Shape: (1, 1, 3, 224, 224)
+        output = model(seg_tensor)
+        pred = torch.max(output, dim=1)[1].cpu().numpy()[0]
         predicted_bird = class_names[str(pred)]  # Convert pred to string to match JSON keys
         predictions.append(predicted_bird)
+    return predictions
 # Initialize the model
 resnet = models.resnet50(weights='IMAGENET1K_V2')
 model.eval()
 # Prediction function for Gradio
+def predict_bird(audio_file):
+    if audio_file is None:
+        return "Please upload an MP3 file."
+    predictions = infer_birdcall(model, audio_file, segment_length=500, device=str(device))
+    # Format the predictions with numbering
+    if not predictions:
+        return "No birds identified."
+    numbered_predictions = [f"{i+1}. {bird}" for i, bird in enumerate(predictions)]
+    return "\n".join(numbered_predictions)
+# Create Gradio Blocks for more complex layout
+with gr.Blocks() as demo:
+    gr.Markdown("# Bird Call Identification")
     with gr.Row():
         with gr.Column():
+            audio_input = gr.Audio(type="filepath", label="Upload Bird Call Audio")
     with gr.Row():
+        submit_btn = gr.Button("Identify Birds")
     with gr.Row():
+        output_text = gr.Textbox(label="Predicted Bird Species")
+    # Bird Species Image
     with gr.Row():
+        gr.Markdown("## Bird Species")
+        species_image = gr.Image("1.jpeg", label="")
+    # Bird Description Image
     with gr.Row():
+        gr.Markdown("## Bird Description")
+        description_image = gr.Image("2.jpeg", label="")
+    # Bird Origins Image
     with gr.Row():
+        gr.Markdown("## Bird Origins")
+        origins_image = gr.Image("3.jpeg", label="")
+    # Set up the prediction event
     submit_btn.click(
         fn=predict_bird,
+        inputs=audio_input,
+        outputs=output_text
     )
+# Launch the app
+demo.launch()