MR-AI-007
/

speech-verification

+import os
+import torch
+import numpy as np
+import librosa
+from torch import nn
+import torch.nn.functional as F
# MFCC + pitch feature extraction
def _standardize(values):
    """Shift ``values`` to zero mean and, when possible, scale to unit variance.

    A constant array has zero standard deviation; dividing by it would fill
    the result with NaN, so in that case only the mean is removed (giving an
    all-zero array).
    """
    centered = values - np.mean(values)
    spread = np.std(values)
    if spread > 0:
        return centered / spread
    return centered


def extract_mfcc_and_pitch(audio_path, sr=16000, n_mfcc=40):
    """Extract normalised MFCC and pitch features from an audio file.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        sr: Sampling rate the audio is resampled to on load.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        A 2-D NumPy array in which the ``n_mfcc`` MFCC rows are stacked on
        top of a single pitch row; columns are analysis frames.
    """
    # Load (and resample) the audio
    audio, sr = librosa.load(audio_path, sr=sr)

    # MFCCs, normalised with one global mean/std over all coefficients
    mfcc = _standardize(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc))

    # Pitch track estimated with the YIN method.
    # NOTE(review): ``sr`` is not forwarded to ``librosa.yin``, so it uses its
    # own default sampling rate rather than the rate the audio was loaded at.
    # Left unchanged here because the saved checkpoint presumably expects
    # features computed this way - confirm against the training pipeline.
    pitch = librosa.yin(audio, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C6'))
    pitch = np.nan_to_num(pitch, nan=np.nanmean(pitch))  # replace NaN frames with the mean pitch

    # Normalise; a flat pitch track (e.g. silence or a one-frame clip) would
    # otherwise divide by zero and turn the whole row into NaN.
    pitch = _standardize(pitch)

    # Make pitch a single-row 2-D array so it can be stacked under the MFCCs
    pitch = pitch.reshape(1, -1)

    return np.vstack([mfcc, pitch])
# X-vector style network
class XVectorNet(nn.Module):
    """Convolutional x-vector style classifier with two output logits.

    The input is a batch of feature sequences laid out as
    ``(batch, input_dim, frames)``. Five 1-D convolutions work per frame,
    statistics pooling collapses the frame axis into a mean/std vector, and
    three linear layers map that vector to two class logits.
    """

    def __init__(self, input_dim=41, dropout_rate=0.45):  # 41 = 40 MFCCs + 1 pitch row
        super().__init__()

        # Frame-level stack
        self.layer1 = nn.Conv1d(input_dim, 512, kernel_size=5, padding=2)
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.layer2 = nn.Conv1d(512, 512, kernel_size=3, padding=1)
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.layer3 = nn.Conv1d(512, 512, kernel_size=3, padding=1)
        self.dropout3 = nn.Dropout(p=dropout_rate)
        self.layer4 = nn.Conv1d(512, 512, kernel_size=1)
        self.dropout4 = nn.Dropout(p=dropout_rate)
        self.layer5 = nn.Conv1d(512, 1500, kernel_size=1)

        # Pooling over frames (mean and std concatenated -> 2 * 1500 values)
        self.stats_pooling = StatsPooling()

        # Segment-level stack
        self.layer6 = nn.Linear(3000, 512)
        self.dropout6 = nn.Dropout(p=dropout_rate)
        self.layer7 = nn.Linear(512, 512)
        self.dropout7 = nn.Dropout(p=dropout_rate)
        self.output = nn.Linear(512, 2)  # two-class logits

    def forward(self, x):
        """Return the two class logits for each sequence in ``x``."""
        frame_blocks = (
            (self.layer1, self.dropout1),
            (self.layer2, self.dropout2),
            (self.layer3, self.dropout3),
            (self.layer4, self.dropout4),
        )
        for conv, drop in frame_blocks:
            x = drop(F.relu(conv(x)))

        # The last frame-level layer has no dropout before pooling
        x = self.stats_pooling(F.relu(self.layer5(x)))

        segment_blocks = (
            (self.layer6, self.dropout6),
            (self.layer7, self.dropout7),
        )
        for dense, drop in segment_blocks:
            x = drop(F.relu(dense(x)))

        return self.output(x)
class StatsPooling(nn.Module):
    """Pool a ``(batch, channels, frames)`` tensor over its frame axis.

    The per-channel mean and standard deviation across ``dim=2`` are
    concatenated along ``dim=1``, giving ``(batch, 2 * channels)``.
    """

    def forward(self, x):
        mean = torch.mean(x, dim=2)
        if x.size(2) > 1:
            std = torch.std(x, dim=2)
        else:
            # torch.std uses the unbiased estimator, which is NaN for a single
            # frame; a lone frame has no spread, so report zero instead of
            # letting NaN propagate into the logits.
            std = torch.zeros_like(mean)
        return torch.cat((mean, std), dim=1)
# Model loading
def load_model(model_path, input_dim=41, dropout_rate=0.45):
    """Build an ``XVectorNet`` and load saved weights into it.

    Args:
        model_path: Path to a checkpoint containing the model's state dict.
        input_dim: Number of input feature channels of the network.
        dropout_rate: Dropout probability used when constructing the network.

    Returns:
        The model on the CPU, switched to evaluation mode. Callers move it to
        another device with ``.to(device)`` if needed.
    """
    model = XVectorNet(input_dim=input_dim, dropout_rate=dropout_rate)
    # Map every tensor to the CPU so that a checkpoint saved from a GPU run
    # can still be loaded on a machine without CUDA.
    # NOTE(review): torch.load unpickles the file - only load checkpoints from
    # a trusted source (consider weights_only=True where the installed torch
    # version supports it).
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    return model
# Single-file inference
def inference(model, audio_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify one audio file.

    Args:
        model: Network that maps a ``(1, channels, frames)`` float tensor to
            two logits. It is expected to already be in evaluation mode.
        audio_path: Path of the audio file to classify.
        device: Device on which the forward pass runs. The model is moved
            there as well, so a model still on the CPU works with the
            default ``'cuda'`` device.

    Returns:
        ``(predicted_class, probability)`` where ``predicted_class`` is the
        index of the larger softmax output and ``probability`` is the softmax
        output for class 1.
    """
    # Extract features from the audio file
    features = extract_mfcc_and_pitch(audio_path)

    # Keep model and input on the same device; without this a freshly loaded
    # (CPU) model combined with the default CUDA device raises a mismatch error.
    model = model.to(device)

    # Convert to a float32 tensor and add the batch dimension
    features_tensor = torch.as_tensor(features, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(features_tensor)
        probabilities = F.softmax(output, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
    return predicted_class, probabilities[:, 1].item()
# Batch inference over a folder
def main_inference(model_path, audio_folder):
    """Run inference on every WAV file in ``audio_folder`` and print results.

    Args:
        model_path: Path to the saved model checkpoint.
        audio_folder: Directory whose ``.wav`` files (extension matched
            case-insensitively) are classified, in sorted filename order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the model and place it on the selected device
    model = load_model(model_path).to(device)

    # os.listdir returns entries in arbitrary order; sort so the report is
    # reproducible, and accept upper-case extensions such as ".WAV".
    wav_files = sorted(
        name for name in os.listdir(audio_folder) if name.lower().endswith('.wav')
    )

    for wav_file in wav_files:
        audio_path = os.path.join(audio_folder, wav_file)
        predicted_class, probability = inference(model, audio_path, device)
        print(f"File: {wav_file}, Predicted Class: {predicted_class}, Probability: {probability:.4f}")
if __name__ == "__main__":
    # Folder containing the .wav files to run inference on
    audio_folder = '/path/to/folder/test'
    # Location of the saved model checkpoint
    model_path = 'output/best_overall_model.pth'
    # Classify every file in the folder
    main_inference(model_path, audio_folder)