# RibbID — app.py (Hugging Face Space entry point)
import os
import json
import pickle
import numpy as np
import torch
import torch.nn as nn
import librosa
import scipy.signal as sps
import gradio as gr
from sklearn.preprocessing import LabelEncoder
# ----------------------------
# 1) Global parameters & paths
# ----------------------------
SR = 22050                 # sample rate (Hz) used for loading and all DSP
DURATION = 4.0             # analysis window length in seconds (see segment())
HOP = 512                  # mel-spectrogram hop length in samples (see extract_log_mel())
FMIN, FMAX = 150, 4500     # band-pass / mel filterbank frequency range (Hz)
MODEL_PATH = "CNN_final.pth"                    # trained CNN weights
DATA_PKL = "label_encoder_and_thresholds.pkl"   # class names + decision thresholds
CAL_PATH = "calibrators.pkl"                    # per-class probability calibrators
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ----------------------------
# 2) Model definition
# ----------------------------
class SEBlock(nn.Module):
    """Squeeze-and-Excitation block: rescales each channel by a learned gate.

    Args:
        channels: number of input (and output) feature channels.
        red: reduction factor for the bottleneck 1x1 convolutions.
    """

    def __init__(self, channels, red=16):
        super().__init__()
        bottleneck = channels // red
        # Squeeze (global average pool) then excite (two 1x1 convs + sigmoid).
        self.fc = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, bottleneck, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(bottleneck, channels, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        # Gates lie in (0, 1) and are broadcast over the spatial dimensions.
        gates = self.fc(x)
        return x * gates
class EfficientNetSE(nn.Module):
    """EfficientNet backbone followed by an SE block and a linear classifier.

    Args:
        bbone: backbone module exposing a ``features`` sub-module; assumed to
            emit 1280 feature channels (EfficientNet-B0) — confirmed by the
            hard-coded 1280 below.
        num_classes: size of the output logit vector.
        drop: dropout probability applied before the final linear layer.
    """

    def __init__(self, bbone, num_classes, drop=0.3):
        super().__init__()
        self.backbone = bbone
        self.se = SEBlock(1280)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(drop),
            nn.Linear(1280, num_classes),
        )

    def forward(self, x):
        feats = self.backbone.features(x)
        feats = self.se(feats)
        pooled = self.pool(feats).flatten(1)
        return self.classifier(pooled)
# ----------------------------
# 3) Audio preprocessing
# ----------------------------
def load_and_normalize(path, sr=SR, target_dBFS=-20.0):
    """Load audio at a fixed sample rate, remove DC offset, and RMS-normalize.

    Args:
        path: path to the audio file (any format librosa can read).
        sr: target sample rate for resampling on load.
        target_dBFS: desired RMS level in dB relative to full scale.

    Returns:
        1-D float array whose RMS matches ``target_dBFS``.
    """
    samples, _ = librosa.load(path, sr=sr)
    samples = samples - samples.mean()
    # Epsilon keeps the gain finite on perfectly silent input.
    rms = np.sqrt(np.mean(samples ** 2)) + 1e-9
    gain = 10 ** (target_dBFS / 20) / rms
    return samples * gain
def bandpass(y, sr=SR, low=FMIN, high=FMAX, order=6):
    """Apply a zero-phase Butterworth band-pass filter.

    Args:
        y: input signal (1-D array).
        sr: sample rate in Hz.
        low: lower pass-band edge in Hz.
        high: upper pass-band edge in Hz.
        order: Butterworth filter order.

    Returns:
        Filtered signal, same length as ``y``; filtfilt runs the filter
        forward and backward so no phase shift is introduced.
    """
    nyquist = sr / 2.0
    b, a = sps.butter(order, [low / nyquist, high / nyquist], btype='band')
    return sps.filtfilt(b, a, y)
def segment(y, sr=SR, win=DURATION, hop=1.0):
    """Slice a signal into fixed-length, overlapping analysis windows.

    Args:
        y: input signal (1-D array).
        sr: sample rate in Hz.
        win: window length in seconds.
        hop: hop between consecutive window starts in seconds.

    Returns:
        List of windows, each exactly ``int(win * sr)`` samples long. A
        signal shorter than one window is zero-padded and returned as a
        single window.
    """
    win_len = int(win * sr)
    hop_len = int(hop * sr)
    if len(y) < win_len:
        return [np.pad(y, (0, win_len - len(y)))]
    starts = range(0, len(y) - win_len + 1, hop_len)
    return [y[s:s + win_len] for s in starts]
def extract_log_mel(y, sr=SR, n_mels=128, hop_length=HOP, fmin=FMIN, fmax=FMAX):
    """Compute a PCEN-compressed mel spectrogram of a signal.

    NOTE(review): despite the name, this applies librosa's PCEN
    (per-channel energy normalization), not a plain log transform. The name
    is kept because other code in this file calls it.

    Args:
        y: input signal window.
        sr: sample rate in Hz.
        n_mels: number of mel bands.
        hop_length: hop length in samples.
        fmin: lowest filterbank frequency in Hz.
        fmax: highest filterbank frequency in Hz.

    Returns:
        2-D array of shape (n_mels, frames) with PCEN-normalized energies.
    """
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels,
        hop_length=hop_length, fmin=fmin, fmax=fmax,
        power=1.0,  # magnitude (not power) spectrogram, as PCEN expects
    )
    # Scale into the int32-like range assumed by librosa.pcen's default
    # parameters (its docs recommend 2**31 for float input).
    return librosa.pcen(mel * (2 ** 31))
def predict_segments(fp):
    """Run the CNN on every window of an audio file.

    Args:
        fp: path to the audio file.

    Returns:
        Array of shape (num_segments, num_classes) holding per-segment
        sigmoid probabilities from the module-level ``model``.
    """
    audio = bandpass(load_and_normalize(fp))
    probs = []
    with torch.no_grad():
        for window in segment(audio):
            features = extract_log_mel(window)
            # Add batch and channel axes: (1, 1, n_mels, frames).
            batch = torch.tensor(features[None, None], dtype=torch.float32).to(DEVICE)
            logits = model(batch)
            probs.append(torch.sigmoid(logits).cpu().numpy()[0])
    return np.vstack(probs)
# ----------------------------
# 4) Load artifacts
# ----------------------------
# Label classes and per-class decision thresholds produced at training time.
# NOTE(review): pickle.load executes arbitrary code — only load artifacts
# bundled with this Space, never user-supplied files.
with open(DATA_PKL, "rb") as f:
    data = pickle.load(f)
classes = data["classes"]
orig_thresholds = np.array(data["thresholds"])      # raw training thresholds (unused below)
adj_thresholds = np.array(data["adj_thresholds"])   # deployment-adjusted thresholds used by infer()
# Rebuild encoder so class indices map back to species names.
le = LabelEncoder()
le.classes_ = np.array(classes, dtype=object)
# Calibrators: one per class, each exposing .transform() (presumably
# isotonic/Platt calibration — TODO confirm against training code).
with open(CAL_PATH, "rb") as f:
    calibrators = pickle.load(f)
# Load backbone & model: EfficientNet-B0 via torch.hub, with the stem conv
# patched from 3 RGB channels to 1 channel (mono spectrogram input).
backbone = torch.hub.load('pytorch/vision:v0.14.0','efficientnet_b0',pretrained=True)
backbone.features[0][0] = nn.Conv2d(1,32,3,2,1,bias=False)
model = EfficientNetSE(backbone, num_classes=len(le.classes_)).to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.eval()  # inference mode: disables dropout in the classifier head
# ----------------------------
# 5) Inference logic
# ----------------------------
def infer(audio_path, sensitivity):
    """Identify species in an audio file and format the result as Markdown.

    Args:
        audio_path: path to the uploaded or recorded audio file.
        sensitivity: strictness multiplier applied to the per-class
            thresholds (higher = fewer, more confident detections).

    Returns:
        Markdown string listing detected species with calibrated
        probabilities, or a "nothing detected" message.
    """
    # Aggregate per-segment probabilities with the 90th percentile, so a few
    # strong windows can carry a detection.
    seg_probs = predict_segments(audio_path)
    agg = np.percentile(seg_probs, 90, axis=0)
    # Map raw aggregated scores through the per-class calibrators.
    calibrated = np.array(
        [calibrators[idx].transform([agg[idx]])[0] for idx in range(len(le.classes_))]
    )
    # Scale thresholds by the user-chosen strictness and keep classes above.
    detected = calibrated > adj_thresholds * sensitivity
    hits = [
        (le.classes_[idx].replace("_", " "), round(float(calibrated[idx]), 3))
        for idx, flagged in enumerate(detected)
        if flagged
    ]
    if not hits:
        return "πŸ” **No species confidently detected.**\nTry reducing the strictness."
    # Highest-probability species first, rendered as an italicized list.
    hits.sort(key=lambda item: -item[1])
    body = "".join(f"- *{species}* β€” probability: {prob}\n" for species, prob in hits)
    return "### βœ… Detected species:\n" + body
# ----------------------------
# 6) Gradio Blocks interface
# ----------------------------
# Build the Gradio UI. Fixes the user-facing typo "ther calls" -> "their calls".
with gr.Blocks() as demo:
    gr.Markdown("# 🐸 RibbID – Amphibian species acoustic identifier\n")
    # Intro sentence about native species
    gr.Markdown(
        "This CNN model detects the native frog and toad species of Catalonia (Northern Spain) through their calls."
    )
    gr.Markdown(
        "To start, upload an audio file or record a new one. Next, select the detection strictness in the slider, and click submit. Results might take time.\n"
        "\n"
        "**Detection strictness** controls how conservative the model is:\n"
        "- Lower values (0.5) = more sensitive (may include false positives).\n"
        "- Higher values (1.0) = only very confident detections."
    )
    with gr.Row():
        audio = gr.Audio(type="filepath", label="Upload audio file (.wav/.mp3) or record live")
        slider = gr.Slider(0.5, 1.0, value=1.0, step=0.05,
                           label="Detection strictness")
    output = gr.Markdown()
    btn = gr.Button("Submit")
    # Wire the button to inference; show_progress gives feedback while the
    # DSP pipeline and model forward passes run.
    btn.click(
        fn=infer,
        inputs=[audio, slider],
        outputs=[output],
        show_progress=True
    )

if __name__ == "__main__":
    demo.launch(share=False)