Inference failing

by adsoul - opened Jul 31, 2025

Jul 31, 2025

import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import librosa
import numpy as np

class AudioClassifierCNN(nn.Module):
    """Small 2-D CNN that maps a single-channel input to two class logits.

    Three identical stages (3x3 convolution with padding 1, batch norm,
    ReLU, 2x2 max-pool) widen the channels 1 -> 16 -> 32 -> 64 while
    halving both spatial dimensions each time. The flattened result is fed
    to a 51200 -> 64 -> 2 fully connected head, so the input size must
    leave exactly 51200 features per sample after the third pool (for
    example a 128 x 401 input, which becomes 64 x 16 x 50).
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 1 -> 16 channels.
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(2)

        # Stage 2: 16 -> 32 channels.
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2)

        # Stage 3: 32 -> 64 channels.
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.pool3 = nn.MaxPool2d(2)

        # Classifier head on the flattened feature map.
        self.fc1 = nn.Linear(51200, 64)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, x):
        """Return raw (pre-softmax) logits, one row of 2 values per sample."""
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        for conv, norm, pool in stages:
            x = pool(F.relu(norm(conv(x))))

        # Collapse channels and spatial dims, keeping the batch dimension.
        flat = x.view(x.size(0), -1)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

def load_model(model_path: str) -> nn.Module:
    """Load a fully pickled model from disk onto the CPU in eval mode.

    Args:
        model_path: Path of the file previously written with ``torch.save``.

    Returns:
        The deserialized module after ``eval()`` has been called on it.

    Raises:
        FileNotFoundError: If nothing exists at ``model_path``.
    """
    if os.path.exists(model_path):
        # weights_only=False lets torch.load unpickle arbitrary objects, so
        # this must only be pointed at files from a trusted source.
        net = torch.load(model_path, map_location='cpu', weights_only=False)
        net.eval()
        return net

    raise FileNotFoundError(f"Model file not found: {model_path}")

def preprocess_audio_librosa(audio_path: str) -> torch.Tensor:
    """Turn an audio file into a min-max-scaled log-mel spectrogram tensor.

    The clip is loaded as 16 kHz mono, truncated or zero-padded to exactly
    4 seconds, converted to a 128-band mel power spectrogram (n_fft=1024,
    hop_length=160), expressed in dB relative to its own peak, and finally
    rescaled so its values span [0, 1].

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        A float32 tensor holding the 2-D spectrogram behind two leading
        singleton dimensions (added for batch and channel).
    """
    target_sr = 16000
    duration_sec = 4

    signal, sr = librosa.load(audio_path, sr=target_sr, mono=True)

    # Force a fixed length so every clip produces the same number of frames.
    num_samples = duration_sec * target_sr
    if signal.shape[0] > num_samples:
        signal = signal[:num_samples]
    else:
        signal = np.pad(signal, (0, num_samples - signal.shape[0]), 'constant')

    power_mel_spec = librosa.feature.melspectrogram(
        y=signal,
        sr=target_sr,
        n_fft=1024,
        hop_length=160,
        n_mels=128,
        fmin=0,
        fmax=target_sr/2
    )

    spec_db = librosa.power_to_db(power_mel_spec, ref=np.max).astype('float32')

    # NOTE(review): this per-clip min-max scaling has to match whatever the
    # model saw during training -- confirm against the training pipeline.
    lowest = spec_db.min()
    value_range = spec_db.max() - lowest
    if value_range > 0:
        spec_normalized = (spec_db - lowest) / value_range
    else:
        # A constant spectrogram (e.g. a silent or empty clip) has no dynamic
        # range; dividing by zero would fill the tensor with NaN and make
        # the downstream prediction meaningless, so emit zeros instead.
        spec_normalized = np.zeros_like(spec_db)

    spec_tensor = torch.from_numpy(spec_normalized).float().unsqueeze(0).unsqueeze(0)

    return spec_tensor

def classify_voicemail(model_path: str, audio_path: str) -> dict:
    """Classify one recording as a live call or a voicemail.

    Loads the model from ``model_path``, preprocesses ``audio_path`` into a
    spectrogram tensor, and picks the class with the highest softmax
    probability.

    Args:
        model_path: Path of the saved model file.
        audio_path: Path of the audio recording to classify.

    Returns:
        A dict with the audio file's base name (``"file"``), the predicted
        label (``"prediction"``) and the winning probability formatted as a
        percentage string with two decimals (``"confidence"``).
    """
    # Output index -> human-readable label.
    class_names = dict(enumerate(("Live Call", "Voicemail")))

    net = load_model(model_path)
    features = preprocess_audio_librosa(audio_path)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        probs = F.softmax(net(features), dim=1)
    top_prob, top_idx = probs.max(dim=1)

    result = {"file": os.path.basename(audio_path)}
    result["prediction"] = class_names[top_idx.item()]
    result["confidence"] = f"{top_prob.item():.2%}"
    return result

Here is the code. I assume the spectrogram should be normalized to the range 0 to 1? I also tried -1 to 1 and the unnormalized spectrogram, but the model always returns the same index for every recording I tried (4 live calls and 3 voicemails): the result is "Voicemail" for all of them.

Is something else done for the audio preprocessing?

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

· Sign up or log in to comment