Inference failing

#1
by adsoul - opened
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import librosa
import numpy as np

class AudioClassifierCNN(nn.Module):
    """Small 2-D CNN mapping a single-channel spectrogram to two class logits.

    Three identical stages (3x3 convolution -> batch norm -> ReLU -> 2x2
    max-pool) widen the channels 1 -> 16 -> 32 -> 64 while halving both
    spatial dimensions each time. A two-layer fully connected head with
    dropout in between produces the final two logits.

    ``fc1`` is fixed at 51200 input features, so the input's spatial size
    must satisfy ``64 * (H // 8) * (W // 8) == 51200`` -- for example
    ``H=128, W=401``, which pools down to 16 x 50.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is: they determine
        # the state_dict keys and the order in which random init is drawn.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=16)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=32)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=64)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=51200, out_features=64)
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=64, out_features=2)

    def forward(self, x):
        """Return raw (un-normalised) logits of shape ``(batch, 2)``."""
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        for conv, norm, pool in stages:
            x = pool(F.relu(norm(conv(x))))

        # Collapse channels and spatial dims into one feature vector per sample.
        flat = x.view(x.shape[0], -1)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

def load_model(model_path: str) -> nn.Module:
    """Load a trained classifier from ``model_path`` onto the CPU in eval mode.

    Two checkpoint layouts are accepted:

    * a fully pickled ``nn.Module`` (what ``torch.save(model, path)`` writes);
    * a ``state_dict`` mapping, optionally wrapped under a ``"state_dict"``
      or ``"model_state_dict"`` key, which is loaded into a fresh
      ``AudioClassifierCNN``.

    Args:
        model_path: Path to the checkpoint file.

    Returns:
        The model, switched to evaluation mode (dropout off, batch norm
        using its running statistics).

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
        TypeError: If the checkpoint holds neither a module nor a mapping.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found: {model_path}")

    # SECURITY: weights_only=False unpickles arbitrary Python objects and can
    # execute code embedded in the file. It is required for fully pickled
    # modules, so only load checkpoints that come from a trusted source.
    checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)

    if isinstance(checkpoint, nn.Module):
        model = checkpoint
    elif isinstance(checkpoint, dict):
        # A bare state_dict has no .eval(); previously this surfaced as an
        # opaque AttributeError. Rebuild the architecture and load into it.
        state = checkpoint
        for wrapper_key in ("state_dict", "model_state_dict"):
            if isinstance(checkpoint.get(wrapper_key), dict):
                state = checkpoint[wrapper_key]
                break
        model = AudioClassifierCNN()
        model.load_state_dict(state)
    else:
        raise TypeError(
            f"Unsupported checkpoint content in {model_path}: "
            f"expected nn.Module or state_dict, got {type(checkpoint).__name__}"
        )

    model.eval()
    return model

def preprocess_audio_librosa(audio_path: str) -> torch.Tensor:
    """Convert an audio file into a min-max normalised log-mel spectrogram.

    Pipeline:
      1. Load the file as 16 kHz mono.
      2. Truncate, or right-pad with zeros, to exactly 4 seconds.
      3. Compute a 128-band mel power spectrogram (``n_fft=1024``,
         ``hop_length=160``, 0 Hz to Nyquist).
      4. Convert to decibels relative to the spectrogram's peak.
      5. Rescale linearly to the range [0, 1].

    Args:
        audio_path: Path to the audio file to load.

    Returns:
        A float32 tensor of shape ``(1, 1, 128, frames)`` (batch and channel
        axes prepended). With librosa's default centred framing, 64000
        samples at hop 160 should give 401 frames -- confirm against the
        installed librosa version.
    """
    target_sr = 16000
    duration_sec = 4

    signal, _ = librosa.load(audio_path, sr=target_sr, mono=True)

    # Force a fixed length so the spectrogram width is always the same.
    num_samples = duration_sec * target_sr
    if signal.shape[0] > num_samples:
        signal = signal[:num_samples]
    else:
        signal = np.pad(signal, (0, num_samples - signal.shape[0]), 'constant')

    power_mel_spec = librosa.feature.melspectrogram(
        y=signal,
        sr=target_sr,
        n_fft=1024,
        hop_length=160,
        n_mels=128,
        fmin=0,
        fmax=target_sr/2
    )

    spec_db = librosa.power_to_db(power_mel_spec, ref=np.max).astype('float32')

    db_min = spec_db.min()
    db_range = spec_db.max() - db_min
    if db_range > 0:
        spec_normalized = (spec_db - db_min) / db_range
    else:
        # A constant spectrogram (e.g. a silent recording) has zero dynamic
        # range; dividing by it would fill the tensor with NaN. Use zeros.
        spec_normalized = np.zeros_like(spec_db)

    spec_tensor = torch.from_numpy(spec_normalized).float().unsqueeze(0).unsqueeze(0)

    return spec_tensor

def classify_voicemail(model_path: str, audio_path: str) -> dict:
    """Classify one recording as a live call or a voicemail.

    Loads the model from ``model_path``, preprocesses ``audio_path`` into a
    spectrogram tensor, and runs a single forward pass without gradients.

    Args:
        model_path: Path to the saved model checkpoint.
        audio_path: Path to the audio file to classify.

    Returns:
        A dict with three keys: ``"file"`` (base name of ``audio_path``),
        ``"prediction"`` (``"Live Call"`` for class 0, ``"Voicemail"`` for
        class 1) and ``"confidence"`` (the winning softmax probability
        formatted as a percentage with two decimals).
    """
    labels = {0: "Live Call", 1: "Voicemail"}

    classifier = load_model(model_path)
    features = preprocess_audio_librosa(audio_path)

    with torch.no_grad():
        logits = classifier(features)
        class_probs = torch.softmax(logits, dim=1)
        # Highest probability and its class index for the single batch item.
        top_prob, top_class = class_probs.max(dim=1)

    result = {}
    result["file"] = os.path.basename(audio_path)
    result["prediction"] = labels[top_class.item()]
    result["confidence"] = f"{top_prob.item():.2%}"
    return result

Here is the code. I assume the spectrogram should be normalized to the range 0 to 1? I also tried -1 to 1, as well as the original (unnormalized) spectrogram, but the model always predicts the same index for every recording I tried (4 live calls and 3 voicemails): the result is "Voicemail" for all of them.

Is something else done for the audio preprocessing?

Sign up or log in to comment