import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


class AudioDataset(Dataset):
    """Dataset yielding (MFCC tensor, label) pairs from audio file paths.

    NOTE(review): not used anywhere in this inference script — presumably a
    leftover from training; kept so any external importer keeps working.
    """

    def __init__(self, audio_paths, labels):
        self.audio_paths = audio_paths
        self.labels = labels

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        # Load audio resampled to 16 kHz mono.
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        # 40 MFCC coefficients -> array of shape (40, n_frames).
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        # Pad or truncate the time axis to exactly 100 frames so every
        # sample has the fixed (40, 100) shape the CNN expects.
        if mfccs.shape[1] > 100:
            mfccs = mfccs[:, :100]
        else:
            mfccs = np.pad(
                mfccs, ((0, 0), (0, 100 - mfccs.shape[1])), mode='constant'
            )
        # Add a channel dimension -> (1, 40, 100).
        mfccs_tensor = torch.FloatTensor(mfccs).unsqueeze(0)
        # Scalar Long tensor, as expected by CrossEntropyLoss-style training.
        label = torch.LongTensor([self.labels[idx]])[0]
        return mfccs_tensor, label


class AudioClassifier(nn.Module):
    """Small 3-layer CNN over (1, 40, 100) MFCC "images".

    Each 3x3 conv (padding 1) preserves spatial size; each 2x2 max-pool
    halves it: (40, 100) -> (20, 50) -> (10, 25) -> (5, 12), which is why
    the flattened feature size is 128 * 5 * 12.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 5 * 12, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        # Flatten per sample instead of view(-1, 128*5*12): a shape mismatch
        # then raises immediately instead of silently corrupting the batch
        # dimension.
        x = x.flatten(start_dim=1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)  # no-op in eval() mode
        x = self.fc2(x)
        return x


# Load the trained 2-class model on CPU and switch to inference mode
# (disables dropout).
# NOTE(review): torch.load uses pickle under the hood — only load trusted
# checkpoints; consider weights_only=True on torch >= 1.13.
model = AudioClassifier(2)
model.load_state_dict(
    torch.load('audio_classifier_model_2.pth', map_location=torch.device('cpu'))
)
model.eval()
# Class names saved by the training pipeline (LabelEncoder.classes_).
# NOTE(review): allow_pickle=True deserializes arbitrary objects — the file
# must be trusted.
k = np.load(file="label_encoder_classes.npy", allow_pickle=True)
classes = k
print(k)


def classify_audio(audio_file):
    """Classify one audio file and return its human-readable class name.

    Preprocessing mirrors AudioDataset.__getitem__: 16 kHz mono load,
    40 MFCCs, time axis padded/truncated to 100 frames.

    Args:
        audio_file: path to the audio file (Gradio passes a filepath).

    Returns:
        The predicted class name from ``classes``.
    """
    audio, sr = librosa.load(audio_file, sr=16000, mono=True)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    if mfccs.shape[1] > 100:
        mfccs = mfccs[:, :100]
    else:
        mfccs = np.pad(
            mfccs, ((0, 0), (0, 100 - mfccs.shape[1])), mode='constant'
        )
    # (40, 100) -> (1, 1, 40, 100): add channel then batch dimension.
    mfccs_tensor = torch.FloatTensor(mfccs).unsqueeze(0).unsqueeze(0)
    # Inference only — no gradient tracking needed.
    with torch.no_grad():
        outputs = model(mfccs_tensor)
        _, predicted = torch.max(outputs, 1)
    predicted_label = predicted.item()
    return classes[predicted_label]


# NOTE(review): mid-file import kept where the original had it; ideally it
# would live with the other imports at the top of the file.
import gradio as gr

iface = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="💓 Heartbeat",
    # Reconstructed as implicit string concatenation with explicit "\n"
    # escapes: the original literals contained raw line breaks, which is
    # invalid Python syntax.
    description=(
        "Upload an audio file of a baby's heartbeat to check for abnormalities.\n"
        "Trained on: Bhaskaran, A., & Arora, M. (2022). Indian Institute of "
        "Science Fetal Heart Sound Database (IIScFHSDB) (version 1.0). "
        "PhysioNet. https://doi.org/10.13026/9vvw-cx05.\n"
        "Original publication: Amrutha, B; Sidhesh Kumar, J; George, S. & "
        "Arora, M. Heart rate estimation and validation algorithm for fetal "
        "phonocardiography. Physiological Measurement, 2022."
    ),
)

# Launch the app
iface.launch()