Spaces:

Calotriton
/

RibbID

Sleeping

App Files Community

Calotriton committed on Jul 1, 2025

Commit

08175df

verified ·

1 Parent(s): 3d23449

Update app.py

Browse files

Files changed (1) hide show

app.py +158 -76

app.py CHANGED Viewed

@@ -1,98 +1,180 @@
-import gradio as gr
-import torch
-import torchaudio
-import numpy as np
-import pickle
 import json
-from sklearn.isotonic import IsotonicRegression
-from model import EfficientNetSE, load_and_normalize, bandpass, segment, extract_log_mel
-# -------------------- Load resources --------------------
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Model
-model = EfficientNetSE()
-model.load_state_dict(torch.load("cnn_final.pth", map_location=DEVICE))
-model.eval()
-model.to(DEVICE)
-# Label encoder, thresholds, calibrators
-with open("label_encoder_and_thresholds.pkl", "rb") as f:
-    data = pickle.load(f)
-classes = data["classes"]
-thresholds = data["thresholds"]
-adj_thresholds = data["adj_thresholds"]
-calibrators = data["calibrators"]
-# -------------------- Inference --------------------
-def predict(audio_path, override_max=1.0):
-    # Load and preprocess
-    y = load_and_normalize(audio_path)
-    y = bandpass(y, sr=32000)
-    segments = segment(y, sr=32000)
-    if len(segments) == 0:
-        return "⚠️ No usable segments found in the audio file."
-    segment_preds = []
     with torch.no_grad():
-        for seg in segments:
             mel = extract_log_mel(seg)
-            inp = torch.tensor(mel[None, None], dtype=torch.float32).to(DEVICE)
             out = model(inp)
-            prob = torch.sigmoid(out).cpu().numpy()[0]
-            segment_preds.append(prob)
-    segment_preds = np.array(segment_preds)
-    agg = np.percentile(segment_preds, 90, axis=0)
     calibrated = np.array([
         calibrators[i].transform([agg[i]])[0]
-        for i in range(len(classes))
     ])
-    final = {}
-    for i, sp in enumerate(classes):
-        threshold = min(adj_thresholds[i], override_max)
-        detected = calibrated[i] > threshold
-        if detected:
-            final[sp] = f"{calibrated[i]:.2f}"
-    if not final:
-        return "🔍 No species confidently detected."
-    result_md = "### ✅ Detected species:\n"
-    for sp, prob in final.items():
-        result_md += f"- **{sp}**: {prob}\n"
-    return result_md
-# -------------------- Interface --------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# 🐸 RibbID – European Frog Call Identifier")
     gr.Markdown(
-        "Upload a recording of frog calls and RibbID will identify the species present.\n\n"
-        "**Detection strictness** controls how confident the model must be to report a detection:\n"
-        "- Lower = more sensitive (can include false positives)\n"
-        "- Higher = more conservative (only very confident predictions shown)"
     )
     with gr.Row():
-        audio_input = gr.Audio(type="filepath", label="Upload your audio (WAV/MP3)")
-        slider = gr.Slider(minimum=0.5, maximum=1.0, value=1.0, step=0.01, label="Detection strictness")
-    status = gr.Markdown("")  # Spinner text
     output = gr.Markdown()
-    def wrapped_predict(audio_path, slider_value):
         status.update("⏳ Processing...")
-        result = predict(audio_path, override_max=slider_value)
-        status.update("")  # clear
-        return result
-    submit_btn = gr.Button("Submit")
-    submit_btn.click(fn=wrapped_predict, inputs=[audio_input, slider], outputs=[output])
-# -------------------- Launch --------------------
 if __name__ == "__main__":
-    demo.launch()

+import os
 import json
+import pickle
+import numpy as np
+import torch
+import torch.nn as nn
+import librosa
+import scipy.signal as sps
+import gradio as gr
+from sklearn.preprocessing import LabelEncoder
+# ----------------------------
+# 1) Global parameters & paths
+# ----------------------------
+# Audio front-end settings; they are the default arguments of the
+# preprocessing helpers defined further down.
+SR          = 22050  # sample rate requested from librosa.load; also sizes the windows
+DURATION    = 4.0  # analysis window length in seconds (multiplied by SR in segment)
+HOP         = 512  # default hop_length for the mel spectrogram
+FMIN, FMAX  = 150, 4500  # band edges used by both the band-pass filter and the mel filterbank
+# Files read once at import time.
+MODEL_PATH  = "cnn_final.pth"  # state dict loaded into EfficientNetSE
+DATA_PKL    = "label_encoder_and_thresholds.pkl"  # pickle with "classes", "thresholds", "adj_thresholds"
+CAL_PATH    = "calibrators.pkl"  # pickle holding one calibrator per class (indexed by class position)
+DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, otherwise CPU
+# ----------------------------
+# 2) Model definition
+# ----------------------------
+class SEBlock(nn.Module):
+    """Channel gate that rescales every channel of its input by a learned weight.
+
+    The gate is computed by ``self.fc``: global average pooling to 1x1, a 1x1
+    convolution down to ``channels // red`` channels, ReLU, a 1x1 convolution
+    back up to ``channels``, and a sigmoid.
+
+    Args:
+        channels: number of channels of the incoming feature map.
+        red: reduction factor for the bottleneck (``channels // red``).
+    """
+    # NOTE(review): ``channels // red`` is 0 when channels < red; the only
+    # visible caller uses channels=1280, so this does not occur there.
+    # NOTE(review): the attribute name ``fc`` and the layer order inside the
+    # Sequential determine the parameter names in the state dict; presumably
+    # they must stay as-is for the saved checkpoint to load — confirm.
+    def __init__(self, channels, red=16):
+        super().__init__()
+        self.fc = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(channels, channels//red, 1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(channels//red, channels, 1),
+            nn.Sigmoid()
+        )
+    # Multiply the input by the per-channel gate produced by self.fc.
+    def forward(self, x): return x * self.fc(x)
+class EfficientNetSE(nn.Module):
+    """Classifier head on top of a feature backbone, with an SE gate in between.
+
+    Args:
+        bbone: module exposing a ``features`` sub-module; its output is fed to
+            ``SEBlock(1280)``, so it is expected to have 1280 channels.
+        num_classes: size of the final linear layer's output.
+        drop: dropout probability applied before the linear layer.
+    """
+    def __init__(self, bbone, num_classes, drop=0.3):
+        super().__init__()
+        self.backbone   = bbone
+        self.se         = SEBlock(1280)
+        self.pool       = nn.AdaptiveAvgPool2d(1)
+        self.classifier = nn.Sequential(
+            nn.Dropout(drop),
+            nn.Linear(1280, num_classes)
+        )
+    def forward(self, x):
+        """Return the raw classifier outputs (no sigmoid is applied here)."""
+        # Only the backbone's ``features`` stage is used; any head the
+        # backbone carries itself is bypassed.
+        x = self.backbone.features(x)
+        x = self.se(x)
+        # Pool to 1x1 and drop the spatial dims before the linear layer.
+        x = self.pool(x).flatten(1)
+        return self.classifier(x)
+# ----------------------------
+# 3) Audio preprocessing
+# ----------------------------
+def load_and_normalize(path, sr=SR, target_dBFS=-20.0):
+    y, _   = librosa.load(path, sr=sr)
+    y      = y - np.mean(y)
+    rms    = np.sqrt(np.mean(y**2)) + 1e-9
+    scalar = (10**(target_dBFS/20)) / rms
+    return y * scalar
+def bandpass(y, sr=SR, low=FMIN, high=FMAX, order=6):
+    nyq = 0.5*sr
+    b,a = sps.butter(order, [low/nyq, high/nyq], btype='band')
+    return sps.filtfilt(b,a,y)
+def segment(y, sr=SR, win=DURATION, hop=1.0):
+    w = int(win*sr); h = int(hop*sr)
+    if len(y) < w:
+        y = np.pad(y, (0, w - len(y)))
+        return [y]
+    return [y[i:i+w] for i in range(0, len(y)-w+1, h)]
+def extract_log_mel(y, sr=SR, n_mels=128, hop_length=HOP, fmin=FMIN, fmax=FMAX):
+    mel = librosa.feature.melspectrogram(
+        y=y, sr=sr, n_mels=n_mels,
+        hop_length=hop_length, fmin=fmin, fmax=fmax, power=1.0
+    )
+    return librosa.pcen(mel * (2**31))
+def predict_segments(fp):
+    """Run the model on every window of an audio file.
+
+    The file is loaded and level-normalised, band-pass filtered and split
+    into windows; each window is converted to features, passed through the
+    module-level ``model`` and squashed with a sigmoid.
+
+    Args:
+        fp: path of the audio file.
+
+    Returns:
+        The per-window sigmoid outputs stacked with ``np.vstack`` (one row
+        per window).
+    """
+    y    = load_and_normalize(fp)
+    y    = bandpass(y)
+    segs = segment(y)
+    all_p = []
+    # Inference only: gradients are not tracked.
     with torch.no_grad():
+        for seg in segs:
             mel = extract_log_mel(seg)
+            # Two leading singleton axes are added before the features
+            # are moved to DEVICE.
+            inp = torch.tensor(mel[None,None], dtype=torch.float32).to(DEVICE)
             out = model(inp)
+            # Index [0] takes the single item of the batch.
+            all_p.append(torch.sigmoid(out).cpu().numpy()[0])
+    return np.vstack(all_p)
+# ----------------------------
+# 4) Load artifacts
+# ----------------------------
+# Class names and per-class thresholds saved alongside the model.
+# NOTE(review): pickle.load executes arbitrary code from the file; this is
+# only safe while the .pkl files are trusted artifacts shipped with the app.
+with open(DATA_PKL, "rb") as f:
+    data = pickle.load(f)
+classes        = data["classes"]
+# NOTE(review): orig_thresholds is not referenced anywhere else in the
+# visible code; only adj_thresholds is used by infer.
+orig_thresholds = np.array(data["thresholds"])
+adj_thresholds = np.array(data["adj_thresholds"])
+# Rebuild encoder
+# classes_ is assigned directly instead of calling fit().
+le = LabelEncoder()
+le.classes_ = np.array(classes, dtype=object)
+# Calibrators
+# Indexed by class position in infer; each entry must offer .transform().
+with open(CAL_PATH, "rb") as f:
+    calibrators = pickle.load(f)
+# Load backbone & model
+# NOTE(review): pretrained=True fetches ImageNet weights that are then
+# overwritten by load_state_dict below (strict loading by default) — the
+# download looks unnecessary; confirm before changing.
+backbone = torch.hub.load('pytorch/vision:v0.14.0','efficientnet_b0',pretrained=True)
+# Replace the first convolution so the network accepts 1-channel input.
+backbone.features[0][0] = nn.Conv2d(1,32,3,2,1,bias=False)
+model = EfficientNetSE(backbone, num_classes=len(le.classes_)).to(DEVICE)
+model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
+# Switch to evaluation mode (disables dropout, fixes batch-norm statistics).
+model.eval()
+# ----------------------------
+# 5) Inference logic
+# ----------------------------
+def infer(audio_path, sensitivity):
+    # segments → probabilities
+    seg_probs = predict_segments(audio_path)
+    agg       = np.percentile(seg_probs, 90, axis=0)
+    # calibrate
     calibrated = np.array([
         calibrators[i].transform([agg[i]])[0]
+        for i in range(len(le.classes_))
     ])
+    # adjust thresholds
+    thresholds = adj_thresholds * sensitivity
+    preds = calibrated > thresholds
+    # build results
+    results = [(le.classes_[i].replace("_"," "), round(float(calibrated[i]),3))
+               for i, flag in enumerate(preds) if flag]
+    if not results:
+        return "🔍 **No species confidently detected.**\nTry reducing the strictness."
+    # sort and format Markdown
+    results.sort(key=lambda x: -x[1])
+    md = "### ✅ Detected species:\n"
+    for sp, p in results:
+        md += f"- **{sp}** — probability: {p}\n"
+    return md
+# ----------------------------
+# 6) Gradio Blocks interface
+# ----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# 🐸 RibbID – Amphibian Call Identifier\n")
     gr.Markdown(
+        "**Detection strictness** controls how conservative the model is:\n\n"
+        "- Lower values (0.5) = more sensitive (may include false positives).\n"
+        "- Higher values (1.0) = only very confident detections."
     )
     with gr.Row():
+        audio = gr.Audio(type="filepath", label="Upload audio file (.wav/.mp3)")
+        slider = gr.Slider(0.5, 1.0, value=1.0, step=0.05,
+                           label="Detection strictness")
+    status = gr.Markdown("")  # spinner placeholder
     output = gr.Markdown()
+    def wrapped(audio_path, strictness):
         status.update("⏳ Processing...")
+        res = infer(audio_path, strictness)
+        status.update("")  # clear spinner
+        return res
+    btn = gr.Button("Submit")
+    btn.click(fn=wrapped, inputs=[audio, slider], outputs=[output])
+# launch without share link
 if __name__ == "__main__":
+    demo.launch(share=False)