|
|
|
|
|
"""
|
|
|
Created on Mon Jun 30 17:06:08 2025
|
|
|
|
|
|
@author: User
|
|
|
"""
|
|
|
|
|
|
import torch
|
|
|
import torch.nn as nn
|
|
|
import numpy as np
|
|
|
import librosa
|
|
|
import joblib
|
|
|
import pickle
|
|
|
from pathlib import Path
|
|
|
from sklearn.isotonic import IsotonicRegression
|
|
|
import argparse
|
|
|
|
|
|
|
|
|
# --- Audio / feature-extraction configuration ---

SR = 22050  # sample rate (Hz) applied when loading audio

DURATION = 4.0  # analysis window length in seconds

SAMPLES = int(SR * DURATION)  # samples per analysis window

BANDS = 128  # number of mel bands in the spectrogram

HOP = 512  # STFT hop length in samples

FMIN, FMAX = 150, 4500  # band-pass / mel frequency range (Hz)

# Inference device: GPU when available, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
|
|
|
|
class SEBlock(nn.Module):
    """Squeeze-and-Excitation gate.

    Produces one weight in (0, 1) per channel via global average pooling
    followed by a bottleneck MLP, and rescales the input feature map by it.
    """

    def __init__(self, channels, red=16):
        super().__init__()
        bottleneck = channels // red
        # squeeze (global pool) -> reduce -> expand -> sigmoid gate
        self.fc = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, bottleneck, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(bottleneck, channels, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        gate = self.fc(x)
        # Broadcasts the (B, C, 1, 1) gate across spatial dimensions.
        return x * gate
|
|
|
|
|
|
class EfficientNetSE(nn.Module):
    """EfficientNet backbone with SE channel attention and a linear classifier head.

    Args:
        backbone: a model exposing a ``.features`` feature extractor
            (e.g. torchvision ``efficientnet_b0``).
        num_classes: number of output logits.
        drop: dropout probability before the final linear layer.
        feat_dim: channel count of the backbone's final feature map.
            Defaults to 1280, which matches ``efficientnet_b0``; pass a
            different value to reuse this wrapper with other backbones.
            (Backward compatible: previous behavior hard-coded 1280.)
    """

    def __init__(self, backbone, num_classes, drop=0.3, feat_dim=1280):
        super().__init__()
        self.backbone = backbone
        # Channel-wise attention over the backbone's last feature map.
        self.se = SEBlock(feat_dim)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(drop),
            nn.Linear(feat_dim, num_classes)
        )

    def forward(self, x):
        """Return raw (un-sigmoided) logits of shape (batch, num_classes)."""
        x = self.backbone.features(x)    # (B, feat_dim, H', W')
        x = self.se(x)                   # SE re-weighting, same shape
        x = self.pool(x).flatten(1)      # global average pool -> (B, feat_dim)
        return self.classifier(x)
|
|
|
|
|
|
|
|
|
def load_and_normalize(path, sr=SR, target_dBFS=-20.0):
    """Load an audio file at *sr*, remove the DC offset, and scale it so the
    RMS level matches *target_dBFS* (decibels relative to full scale)."""
    signal, _ = librosa.load(path, sr=sr)
    signal = signal - np.mean(signal)  # remove DC component
    # Epsilon keeps the division safe on silent (all-zero) clips.
    rms = np.sqrt(np.mean(signal ** 2)) + 1e-9
    gain = (10 ** (target_dBFS / 20)) / rms
    return signal * gain
|
|
|
|
|
|
def bandpass(y, sr=SR, low=FMIN, high=FMAX, order=6):
    """Apply a zero-phase Butterworth band-pass filter between *low* and *high* Hz."""
    from scipy.signal import butter, filtfilt

    nyquist = sr / 2.0
    critical = (low / nyquist, high / nyquist)
    b, a = butter(order, critical, btype='band')
    # filtfilt runs the filter forward and backward, so no phase shift is introduced.
    return filtfilt(b, a, y)
|
|
|
|
|
|
def segment(y, sr=SR, win=DURATION, hop=1.0):
    """Slice *y* into overlapping windows of *win* seconds, one every *hop* seconds.

    A clip shorter than one window is zero-padded at the end and returned
    as a single segment.
    """
    win_len = int(win * sr)
    hop_len = int(hop * sr)
    if len(y) < win_len:
        padded = np.pad(y, (0, win_len - len(y)))
        return [padded]
    starts = range(0, len(y) - win_len + 1, hop_len)
    return [y[s:s + win_len] for s in starts]
|
|
|
|
|
|
def extract_log_mel(y, sr=SR, n_mels=BANDS, hop_length=HOP, fmin=FMIN, fmax=FMAX):
    """Compute a PCEN-compressed mel spectrogram.

    NOTE: despite the name, this returns PCEN (per-channel energy
    normalization), not a log-mel spectrogram. The name is kept for
    compatibility with existing callers.
    """
    # power=1.0 -> magnitude (not power) mel spectrogram.
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, hop_length=hop_length,
        fmin=fmin, fmax=fmax, power=1.0)
    # Scale into an integer-like range before PCEN, as librosa's PCEN
    # docs recommend for float input.
    return librosa.pcen(mel * (2 ** 31))
|
|
|
|
|
|
|
|
|
def predict_segments(file_path, model):
    """Run *model* on every segment of an audio file.

    Returns an array of shape (n_segments, n_classes) with per-segment
    sigmoid probabilities.
    """
    audio = load_and_normalize(file_path)
    audio = bandpass(audio, SR)
    chunks = segment(audio, SR)

    model.eval()
    outputs = []
    with torch.no_grad():
        for chunk in chunks:
            features = extract_log_mel(chunk)
            # Add batch and channel axes: (1, 1, n_mels, frames).
            batch = torch.tensor(features[None, None], dtype=torch.float32).to(DEVICE)
            probs = torch.sigmoid(model(batch)).cpu().numpy()[0]
            outputs.append(probs)
    return np.array(outputs)
|
|
|
|
|
|
|
|
|
def predict_file_with_hybrid_strategy(file_path, model, thresholds, label_encoder, override_max=0.9):
    """Detect species in a file by combining two per-class criteria.

    A species is reported when either its mean probability across segments
    exceeds a relaxed (threshold - 0.15) cutoff, or any single segment's
    probability exceeds *override_max*.

    Returns (predicted_species, mean_probs, max_probs, per_segment_probs).
    """
    probs = predict_segments(file_path, model)
    mean_probs = probs.mean(axis=0)
    max_probs = probs.max(axis=0)
    # Relax each calibrated threshold for the mean-probability test.
    sensitive_thresh = [t - 0.15 for t in thresholds]

    preds = [
        sp
        for i, sp in enumerate(label_encoder.classes_)
        if mean_probs[i] > sensitive_thresh[i] or max_probs[i] > override_max
    ]
    return preds, mean_probs, max_probs, probs
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: classify a single audio file with a trained checkpoint.
    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file", type=str, help="Ruta al archivo de audio (.wav)")
    parser.add_argument("--model", default="CNN_final.pth", help="Ruta al modelo CNN .pth")
    parser.add_argument("--meta", default="label_encoder_and_thresholds.pkl", help="Pickle con encoder y thresholds")
    args = parser.parse_args()

    # Load the fitted label encoder and per-class decision thresholds.
    # NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
    with open(args.meta, "rb") as f:
        meta = pickle.load(f)

    label_encoder = meta["label_encoder"]
    thresholds = meta["thresholds"]

    # Build an EfficientNet-B0 backbone with its input stem replaced by a
    # 1-channel conv (spectrogram input is mono), then load trained weights.
    from torchvision import models
    backbone = models.efficientnet_b0(weights=None)
    backbone.features[0][0] = nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1, bias=False)
    model = EfficientNetSE(backbone, num_classes=len(label_encoder.classes_))
    model.load_state_dict(torch.load(args.model, map_location=DEVICE))
    model.to(DEVICE)

    # Segment-level inference aggregated with the hybrid thresholding strategy.
    file_path = args.audio_file
    preds, mean_probs, max_probs, probs_all = predict_file_with_hybrid_strategy(
        file_path, model, thresholds, label_encoder
    )

    # Report detected species and the per-class probability summary.
    print(f"\n Archivo: {file_path}")
    print(f"Especies detectadas: {', '.join(preds)}\n")

    print("📊 Probabilidades por especie:")
    for i, sp in enumerate(label_encoder.classes_):
        print(f"  {sp:<25} → mean: {mean_probs[i]:.2f}, max: {max_probs[i]:.2f}")
|
|
|
|