import librosa
import torch
import torch.nn as nn
from transformers import WhisperProcessor, WhisperModel


class WhisperClassifier(nn.Module):
    """Binary classifier head on top of a pretrained Whisper encoder.

    The encoder's last hidden states are averaged over axis 1, projected to a
    single value by a linear layer, and squashed with a sigmoid, so the module
    emits one score in (0, 1) per input example.

    Args:
        model_name: Identifier handed to ``WhisperModel.from_pretrained``.
    """

    def __init__(self, model_name: str = "openai/whisper-small"):
        super().__init__()
        # Only ``self.whisper.encoder`` is used in ``forward``; the whole
        # model is still registered, so all of its parameters appear in this
        # module's state dict.
        self.whisper = WhisperModel.from_pretrained(model_name)
        # Single output unit sized from the Whisper hidden width.
        self.fc = nn.Linear(self.whisper.config.d_model, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        """Score a batch of Whisper input features.

        Args:
            input_features: Tensor passed unchanged to ``self.whisper.encoder``.

        Returns:
            Sigmoid scores with the size-1 output axis of the linear head
            squeezed away (one value per example).
        """
        encoded = self.whisper.encoder(input_features).last_hidden_state
        pooled = encoded.mean(dim=1)  # collapse axis 1 by averaging
        logits = self.fc(pooled)
        return self.sigmoid(logits).squeeze(1)


def predict(model, audio_path, processor, device=None):
    """Classify a single audio file as 0 or 1.

    The audio is loaded at 16 kHz, converted to input features by
    ``processor``, and scored by ``model`` with gradients disabled.

    Args:
        model: Module that returns a single score for the extracted features.
            It is switched to eval mode and left that way.
        audio_path: Path of the audio file handed to ``librosa.load``.
        processor: Callable feature extractor; its result must expose
            ``input_features``.
        device: Device the features are moved to before scoring. When
            ``None`` (the default) the device of the model's first parameter
            is used, falling back to CPU for a parameterless model. This
            keeps the features on the same device as a model that was
            placed on CUDA.

    Returns:
        ``1`` when the model's score is strictly greater than 0.5, else ``0``.
    """
    sample_rate = 16000

    if device is None:
        # Follow the model so features and weights always share a device.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    audio, _ = librosa.load(audio_path, sr=sample_rate)
    input_features = processor(
        audio, return_tensors="pt", sampling_rate=sample_rate
    ).input_features
    input_features = input_features.to(device)

    model.eval()
    with torch.no_grad():
        output = model(input_features)

    # ``.item()`` requires a single-element tensor, i.e. exactly one clip.
    return 1 if output.item() > 0.5 else 0


def load_model(model_path):
    """Create a ``WhisperClassifier`` and load saved weights into it.

    The model is placed on CUDA when ``torch.cuda.is_available()`` is true,
    otherwise on the CPU. Its train/eval mode is not changed here.

    Args:
        model_path: Location of a saved state dict readable by ``torch.load``.

    Returns:
        A ``(model, device)`` tuple: the classifier with the loaded weights
        and the ``torch.device`` it lives on.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = WhisperClassifier().to(device)

    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code on PyTorch versions where ``weights_only`` defaults to
    # False. Only load checkpoints from a trusted source, or pass
    # ``weights_only=True`` once the minimum supported torch allows it.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    return model, device