MR-AI-007 committed on
Commit
cc1ffa0
·
verified ·
1 Parent(s): edf4493

to check the model results

Browse files
Files changed (1) hide show
  1. inference.py +137 -0
inference.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import librosa
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+
8
# Feature extraction: MFCC + pitch
def extract_mfcc_and_pitch(audio_path, sr=16000, n_mfcc=40):
    """Extract combined MFCC and pitch features from an audio file.

    Args:
        audio_path: path to an audio file readable by librosa.
        sr: target sampling rate used when loading the audio.
        n_mfcc: number of MFCC coefficients to extract.

    Returns:
        np.ndarray of shape (n_mfcc + 1, n_frames): globally normalized
        MFCCs stacked on top of one normalized pitch row.
    """
    # Load audio file at the requested sampling rate.
    audio, sr = librosa.load(audio_path, sr=sr)

    # Extract MFCCs.
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Global mean/std normalization; epsilon guards against division by
    # zero on constant (e.g. all-silent) input.
    eps = 1e-8
    mfcc = (mfcc - np.mean(mfcc)) / (np.std(mfcc) + eps)

    # Extract pitch with the YIN method over the vocal range C2..C6.
    pitch = librosa.yin(audio, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C6'))

    # Replace NaNs (unvoiced frames) with the mean of the voiced frames;
    # fall back to 0.0 when every frame is NaN (np.nanmean would be NaN).
    if np.all(np.isnan(pitch)):
        fill_value = 0.0
    else:
        fill_value = np.nanmean(pitch)
    pitch = np.nan_to_num(pitch, nan=fill_value)

    # Normalize pitch the same way as the MFCCs.
    pitch = (pitch - np.mean(pitch)) / (np.std(pitch) + eps)

    # Make pitch 2-D so it can be stacked with the MFCC matrix.
    pitch = pitch.reshape(1, -1)

    # MFCC and YIN framing can disagree by a frame or two depending on
    # padding; trim both to the shorter length so vstack cannot fail.
    n_frames = min(mfcc.shape[1], pitch.shape[1])
    combined_features = np.vstack([mfcc[:, :n_frames], pitch[:, :n_frames]])

    return combined_features
36
+
37
# X-Vector architecture
class XVectorNet(nn.Module):
    """TDNN (x-vector style) classifier over frame-level features.

    Expects input of shape (batch, input_dim, n_frames) and produces
    (batch, 2) logits for binary classification. The default
    input_dim=41 corresponds to 40 MFCC rows plus one pitch row.
    """

    def __init__(self, input_dim=41, dropout_rate=0.45):  # +1 input dim for the pitch row
        super(XVectorNet, self).__init__()

        # Frame-level (time-delay) convolutional layers.
        self.layer1 = nn.Conv1d(input_dim, 512, 5, padding=2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer3 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.layer4 = nn.Conv1d(512, 512, 1)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.layer5 = nn.Conv1d(512, 1500, 1)

        # Statistics pooling collapses the time axis to mean+std.
        self.stats_pooling = StatsPooling()

        # Segment-level fully connected layers (3000 = 2 * 1500 stats).
        self.layer6 = nn.Linear(3000, 512)
        self.dropout6 = nn.Dropout(dropout_rate)
        self.layer7 = nn.Linear(512, 512)
        self.dropout7 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(512, 2)  # binary classification head

    def forward(self, x):
        # Frame-level stages: conv -> ReLU -> dropout, four times.
        frame_stages = (
            (self.layer1, self.dropout1),
            (self.layer2, self.dropout2),
            (self.layer3, self.dropout3),
            (self.layer4, self.dropout4),
        )
        for conv, drop in frame_stages:
            x = drop(F.relu(conv(x)))
        x = F.relu(self.layer5(x))

        # Pool statistics over time: (B, 1500, T) -> (B, 3000).
        x = self.stats_pooling(x)

        # Segment-level stages, then the classification head.
        x = self.dropout6(F.relu(self.layer6(x)))
        x = self.dropout7(F.relu(self.layer7(x)))
        return self.output(x)
83
+
84
class StatsPooling(nn.Module):
    """Statistics pooling: concatenate per-channel mean and std over time.

    Maps (batch, channels, frames) -> (batch, 2 * channels).
    """

    def forward(self, x):
        # Reduce the time axis (dim=2) to two summary statistics.
        stats = (torch.mean(x, dim=2), torch.std(x, dim=2))
        return torch.cat(stats, dim=1)
89
+
90
# Load a trained XVectorNet from a saved state_dict.
def load_model(model_path, input_dim=41, dropout_rate=0.45, map_location='cpu'):
    """Build an XVectorNet and load weights from *model_path*.

    Args:
        model_path: path to a state_dict saved with torch.save().
        input_dim: feature dimension the model was trained with.
        dropout_rate: dropout probability (inactive in eval mode).
        map_location: device to map stored tensors onto. Defaults to
            'cpu' so GPU-trained checkpoints load on CPU-only machines;
            callers move the model afterwards with .to(device).

    Returns:
        The model in eval() mode.
    """
    model = XVectorNet(input_dim=input_dim, dropout_rate=dropout_rate)
    # Without map_location, loading a CUDA-saved checkpoint on a
    # CPU-only machine raises a deserialization error.
    state_dict = torch.load(model_path, map_location=map_location)
    model.load_state_dict(state_dict)
    model.eval()
    return model
96
+
97
# Run the model on a single audio file.
def inference(model, audio_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify one audio file.

    Returns a tuple (predicted_class, probability_of_class_1).
    """
    # Extract features from the audio file.
    feats = extract_mfcc_and_pitch(audio_path)

    # Convert to a tensor and add a batch dimension.
    batch = torch.FloatTensor(feats).unsqueeze(0).to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(batch)
        probabilities = F.softmax(logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()

    return predicted_class, probabilities[:, 1].item()
112
+
113
# Batch inference over every .wav file in a folder.
def main_inference(model_path, audio_folder):
    """Load the model once, then classify each WAV file in *audio_folder*.

    Args:
        model_path: path to the saved state_dict checkpoint.
        audio_folder: directory containing .wav files.

    Prints one line per file with the predicted class and P(class 1).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the model and move it to the compute device.
    model = load_model(model_path).to(device)

    # Match the extension case-insensitively (accepts .wav and .WAV) and
    # sort for deterministic output order — os.listdir order is arbitrary.
    wav_files = sorted(f for f in os.listdir(audio_folder) if f.lower().endswith('.wav'))

    # Run inference for each file.
    for wav_file in wav_files:
        audio_path = os.path.join(audio_folder, wav_file)
        predicted_class, probability = inference(model, audio_path, device)
        print(f"File: {wav_file}, Predicted Class: {predicted_class}, Probability: {probability:.4f}")
128
+
129
if __name__ == "__main__":
    # Saved model checkpoint to evaluate.
    model_path = 'output/best_overall_model.pth'

    # Folder containing the .wav files to classify.
    audio_folder = '/path/to/folder/test'

    main_inference(model_path, audio_folder)