# Listing metadata: file size 4,620 bytes; id cc1ffa0 (line-number gutter removed).
import os
import torch
import numpy as np
import librosa
from torch import nn
import torch.nn.functional as F
# Feature extraction: MFCC + pitch
def _zscore(values):
    """Return *values* shifted to zero mean and scaled to unit std (global stats)."""
    spread = np.std(values)
    if spread == 0:
        # Constant input (e.g. digital silence): dividing by zero would turn
        # every feature into NaN, so leave the centred values unscaled.
        spread = 1.0
    return (values - np.mean(values)) / spread


def extract_mfcc_and_pitch(audio_path, sr=16000, n_mfcc=40):
    """Extract a stacked MFCC-plus-pitch feature matrix from an audio file.

    Args:
        audio_path: Path of the audio file, handed to ``librosa.load``.
        sr: Sampling rate the audio is loaded at.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        A NumPy array with ``n_mfcc + 1`` rows: the globally z-scored MFCC
        matrix with one extra row holding the z-scored YIN pitch track.
    """
    # Load the audio file at the requested sampling rate.
    audio, sr = librosa.load(audio_path, sr=sr)

    # MFCCs, normalised with the mean/std of the whole matrix.
    mfcc = _zscore(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc))

    # Pitch track via the YIN estimator.
    # NOTE(review): ``sr`` is not forwarded to librosa.yin, so it falls back to
    # its own default sampling rate instead of the rate used for loading.
    # Left as-is because a saved model was presumably trained on features
    # computed exactly this way -- confirm before changing.
    pitch = librosa.yin(
        audio,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C6'),
    )

    # Replace NaNs by the mean of the valid frames; if no frame is valid there
    # is no mean to use, so fall back to 0 instead of propagating NaN.
    fill_value = 0.0 if np.isnan(pitch).all() else np.nanmean(pitch)
    pitch = np.nan_to_num(pitch, nan=fill_value)

    # Normalise the pitch and make it a single-row 2D array so it can be
    # stacked underneath the MFCC rows.
    pitch = _zscore(pitch).reshape(1, -1)

    # Combine MFCC and pitch into one feature matrix.
    combined_features = np.vstack([mfcc, pitch])
    return combined_features
# X-vector style network
class XVectorNet(nn.Module):
    """Convolutional frame-level encoder, statistics pooling, and a two-way classifier head.

    The input is fed to ``nn.Conv1d`` layers, so it is expected as
    ``(batch, input_dim, frames)``. The output has two logits per item.
    """

    def __init__(self, input_dim=41, dropout_rate=0.45):  # 40 MFCC rows + 1 pitch row
        super().__init__()
        hidden = 512
        pooled_channels = 1500

        # Frame-level layers (length-preserving 1-D convolutions).
        self.layer1 = nn.Conv1d(input_dim, hidden, kernel_size=5, padding=2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Conv1d(hidden, hidden, kernel_size=3, padding=1)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer3 = nn.Conv1d(hidden, hidden, kernel_size=3, padding=1)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.layer4 = nn.Conv1d(hidden, hidden, kernel_size=1)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.layer5 = nn.Conv1d(hidden, pooled_channels, kernel_size=1)

        # Pooling over the frame axis.
        self.stats_pooling = StatsPooling()

        # Segment-level layers; the first one takes 3000 inputs
        # (twice the 1500 channels produced by layer5).
        self.layer6 = nn.Linear(2 * pooled_channels, hidden)
        self.dropout6 = nn.Dropout(dropout_rate)
        self.layer7 = nn.Linear(hidden, hidden)
        self.dropout7 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(hidden, 2)  # binary classification

    def forward(self, x):
        """Run the network on ``x`` and return the raw class logits."""
        frame_stages = (
            (self.layer1, self.dropout1),
            (self.layer2, self.dropout2),
            (self.layer3, self.dropout3),
            (self.layer4, self.dropout4),
        )
        for conv, drop in frame_stages:
            x = drop(F.relu(conv(x)))

        # Last frame-level layer has no dropout before pooling.
        x = self.stats_pooling(F.relu(self.layer5(x)))

        segment_stages = (
            (self.layer6, self.dropout6),
            (self.layer7, self.dropout7),
        )
        for dense, drop in segment_stages:
            x = drop(F.relu(dense(x)))

        return self.output(x)
class StatsPooling(nn.Module):
    """Pool a 3-D tensor over its last axis into concatenated mean and std.

    For an input with ``C`` entries on axis 1, the result has ``2 * C``
    entries on axis 1: the per-channel means followed by the per-channel
    standard deviations, both taken over axis 2.
    """

    def forward(self, x):
        """Return ``cat(mean, std)`` of ``x`` computed along ``dim=2``."""
        mean = torch.mean(x, dim=2)
        if x.size(2) > 1:
            std = torch.std(x, dim=2)
        else:
            # torch.std uses the unbiased estimator, which is NaN for a single
            # sample; a lone frame has no spread, so report zero instead of
            # letting NaN propagate into the classifier.
            std = torch.zeros_like(mean)
        return torch.cat((mean, std), dim=1)
# Model loading
def load_model(model_path, input_dim=41, dropout_rate=0.45):
    """Build an ``XVectorNet`` and load saved weights into it.

    Args:
        model_path: Path of the saved ``state_dict`` file.
        input_dim: Number of input feature rows the network expects.
        dropout_rate: Dropout probability used when constructing the network.

    Returns:
        The network with weights loaded, switched to evaluation mode. It is
        left on the CPU; callers move it to another device if needed.
    """
    model = XVectorNet(input_dim=input_dim, dropout_rate=dropout_rate)
    # map_location="cpu" lets a checkpoint saved from GPU tensors load on a
    # machine without CUDA. The freshly built model lives on the CPU anyway.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    return model
# Single-file inference
def inference(model, audio_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify one audio file.

    Args:
        model: Network called on the feature tensor.
        audio_path: Path of the audio file to classify.
        device: Device the feature tensor is moved to before the forward pass.

    Returns:
        A ``(predicted_class, probability)`` tuple, where ``probability`` is
        the softmax probability of class index 1.
    """
    # Features for the file, with a leading batch axis added.
    feature_matrix = extract_mfcc_and_pitch(audio_path)
    batch = torch.FloatTensor(feature_matrix)[None, ...].to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(batch)
        class_probs = F.softmax(logits, dim=1)

    winner = class_probs.argmax(dim=1).item()
    positive_prob = class_probs[:, 1].item()
    return winner, positive_prob
# Batch inference over a folder
def main_inference(model_path, audio_folder):
    """Run inference on every ``.wav`` file in a folder and print the results.

    Args:
        model_path: Path of the saved model weights.
        audio_folder: Directory that is scanned (non-recursively) for
            ``.wav`` files.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the model and move it to the selected device.
    model = load_model(model_path).to(device)

    # Collect the .wav files. os.listdir returns entries in arbitrary order,
    # so sort for reproducible output. Match the extension case-insensitively
    # and skip directories that merely end in ".wav".
    wav_files = sorted(
        name
        for name in os.listdir(audio_folder)
        if name.lower().endswith('.wav')
        and os.path.isfile(os.path.join(audio_folder, name))
    )

    # Run inference on each file.
    for wav_file in wav_files:
        audio_path = os.path.join(audio_folder, wav_file)
        predicted_class, probability = inference(model, audio_path, device)
        print(f"File: {wav_file}, Predicted Class: {predicted_class}, Probability: {probability:.4f}")
if __name__ == "__main__":
    # Path to the saved model weights.
    model_path = 'output/best_overall_model.pth'
    # Path to the folder containing the .wav files to run inference on.
    audio_folder = '/path/to/folder/test'
    # Run inference (the stray trailing "|" after this call was a syntax error).
    main_inference(model_path, audio_folder)