Inference failing
#1
by
adsoul
- opened
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import librosa
import numpy as np
class AudioClassifierCNN(nn.Module):
    """Small 2-D CNN that maps a single-channel spectrogram to two class logits.

    The network is three identical stages (3x3 convolution -> batch norm ->
    ReLU -> 2x2 max-pool) with 16, 32 and 64 output channels, followed by a
    64-unit hidden linear layer with dropout and a 2-unit output layer.

    ``fc1`` is fixed at 51200 input features, so the flattened output of the
    third stage must have exactly that size; for example an input of shape
    ``(N, 1, 128, 401)`` is pooled down to ``(N, 64, 16, 50)``.
    """

    def __init__(self):
        super().__init__()

        # Attribute names are kept stable: they are the state-dict keys and
        # the names a pickled instance of this class restores onto.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16,
                               kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=16)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32,
                               kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=32)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64,
                               kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=64)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(in_features=51200, out_features=64)
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=64, out_features=2)

    def forward(self, x):
        """Return raw (pre-softmax) logits of shape ``(batch, 2)`` for ``x``."""
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        for conv, norm, pool in stages:
            x = pool(F.relu(norm(conv(x))))

        # Collapse channel and spatial dimensions into one feature vector
        # per batch element before the fully connected head.
        flat = x.view(x.size(0), -1)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return self.fc2(hidden)
def load_model(model_path: str) -> nn.Module:
    """Load a model checkpoint onto the CPU and switch it to eval mode.

    The checkpoint may contain either a fully pickled ``nn.Module`` or a
    state dict. A state dict is loaded into a freshly constructed
    ``AudioClassifierCNN``.

    Args:
        model_path: Path to the file written by ``torch.save``.

    Returns:
        The loaded module, in evaluation mode.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
        TypeError: If the checkpoint is neither a module nor a dict.
        RuntimeError: If a state dict does not match ``AudioClassifierCNN``
            (raised by ``load_state_dict``).
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found: {model_path}")

    # SECURITY: weights_only=False unpickles arbitrary Python objects and can
    # execute code embedded in the file. Only load checkpoints from a trusted
    # source.
    checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)

    if isinstance(checkpoint, nn.Module):
        model = checkpoint
    elif isinstance(checkpoint, dict):
        # A bare state dict has no .eval(); rebuild the network around it
        # instead of failing with an opaque AttributeError.
        model = AudioClassifierCNN()
        model.load_state_dict(checkpoint)
    else:
        raise TypeError(
            "Expected a pickled nn.Module or a state dict in "
            f"{model_path}, got {type(checkpoint).__name__}"
        )

    # Eval mode disables dropout and makes batch norm use running statistics.
    model.eval()
    return model
def preprocess_audio_librosa(audio_path: str) -> torch.Tensor:
    """Turn an audio file into a normalised log-mel spectrogram tensor.

    The audio is loaded as mono at 16 kHz, truncated or zero-padded to
    exactly 4 seconds, converted to a 128-band mel power spectrogram
    (``n_fft=1024``, ``hop_length=160``), expressed in dB relative to its
    peak, and min-max scaled to ``[0, 1]``.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        A float32 tensor with two leading singleton dimensions (batch and
        channel) in front of the 2-D spectrogram, i.e.
        ``(1, 1, 128, frames)``. ``frames`` is expected to be 401 with
        librosa's default centred framing (TODO confirm).
    """
    target_sr = 16000
    duration_sec = 4
    num_samples = duration_sec * target_sr

    signal, _ = librosa.load(audio_path, sr=target_sr, mono=True)

    # Force a fixed length so the spectrogram always has the same width.
    signal = signal[:num_samples]
    missing = num_samples - signal.shape[0]
    if missing > 0:
        signal = np.pad(signal, (0, missing), 'constant')

    power_mel_spec = librosa.feature.melspectrogram(
        y=signal,
        sr=target_sr,
        n_fft=1024,
        hop_length=160,
        n_mels=128,
        fmin=0,
        fmax=target_sr / 2,
    )
    spec_db = librosa.power_to_db(power_mel_spec, ref=np.max).astype('float32')

    # NOTE(review): per-file min-max scaling must match whatever the model
    # was trained with - confirm against the training pipeline.
    spec_min = spec_db.min()
    spec_range = spec_db.max() - spec_min
    if spec_range > 0:
        spec_normalized = (spec_db - spec_min) / spec_range
    else:
        # Silent or constant audio gives a flat spectrogram; dividing by a
        # zero range would fill the tensor with NaN and poison the model
        # output, so return an all-zero spectrogram instead.
        spec_normalized = np.zeros_like(spec_db)

    # Add batch and channel dimensions in front of the 2-D spectrogram.
    spec_tensor = torch.from_numpy(spec_normalized).float().unsqueeze(0).unsqueeze(0)
    return spec_tensor
def classify_voicemail(model_path: str, audio_path: str) -> dict:
    """Classify one recording as a live call or a voicemail.

    Args:
        model_path: Checkpoint path passed to ``load_model``.
        audio_path: Audio file passed to ``preprocess_audio_librosa``.

    Returns:
        A dict with the keys ``"file"`` (base name of ``audio_path``),
        ``"prediction"`` (``"Live Call"`` or ``"Voicemail"``) and
        ``"confidence"`` (softmax probability of the predicted class,
        formatted as a percentage with two decimals).
    """
    class_names = {0: "Live Call", 1: "Voicemail"}

    network = load_model(model_path)
    features = preprocess_audio_librosa(audio_path)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = network(features)
        class_probs = F.softmax(logits, dim=1)
        top_prob, top_index = class_probs.max(dim=1)

    result = {}
    result["file"] = os.path.basename(audio_path)
    result["prediction"] = class_names[top_index.item()]
    result["confidence"] = f"{top_prob.item():.2%}"
    return result
Here is the code. I assume the spectrogram is normalized between 0 and 1? I even tried -1 to 1 and the original (unnormalized) spectrogram, but the model always predicts the same index for every recording I tried (4 live calls vs. 3 voicemails): the result is voicemail for all of them.
Is something else done for the audio preprocessing?