import torch
import torch.nn as nn
import librosa
from transformers import WhisperProcessor, WhisperModel

# Single source of truth for the sample rate: the same value is handed to the
# audio loader and to the processor so the two can never disagree.
TARGET_SAMPLE_RATE = 16000


class WhisperClassifier(nn.Module):
    """Binary classifier built on top of a pretrained Whisper encoder.

    The encoder's last hidden state is averaged over dimension 1, passed
    through a single linear layer with one output unit, and squashed with a
    sigmoid.

    Args:
        model_name: Identifier or path handed to
            ``WhisperModel.from_pretrained`` to load the backbone.
    """

    def __init__(self, model_name="openai/whisper-small"):
        super().__init__()
        self.whisper = WhisperModel.from_pretrained(model_name)
        # The head's input width follows the backbone's hidden size, so any
        # Whisper variant can be plugged in without touching this line.
        self.fc = nn.Linear(self.whisper.config.d_model, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_features):
        """Return the sigmoid score for each item in ``input_features``.

        Args:
            input_features: Tensor fed directly to the Whisper encoder
                (presumably the ``input_features`` produced by a
                ``WhisperProcessor`` -- confirm against callers).

        Returns:
            Tensor of sigmoid outputs with dimension 1 squeezed away
            (assumed shape ``(batch,)`` -- TODO confirm).
        """
        # Only the encoder half of Whisper is used; the decoder is never run.
        hidden_states = self.whisper.encoder(input_features).last_hidden_state
        # Mean over dim 1 (assumed to be the time/frame axis) collapses the
        # sequence into one vector per item.
        pooled_output = hidden_states.mean(dim=1)
        logits = self.fc(pooled_output)
        return self.sigmoid(logits).squeeze(1)


def predict(model, audio_path, processor, device='cpu', threshold=0.5):
    """Classify a single audio file as 0 or 1.

    Args:
        model: Callable model returning a single-element tensor for one
            clip (e.g. a ``WhisperClassifier``).
        audio_path: Path of the audio file, loaded with ``librosa.load``.
        processor: Callable that turns the waveform into an object exposing
            ``.input_features`` (e.g. a ``WhisperProcessor``).
        device: Device the input features are moved to before inference.
            It must match the device ``model`` lives on; this function does
            not move the model.
        threshold: Score strictly above which the prediction is 1.
            Defaults to 0.5.

    Returns:
        ``1`` if the model output is greater than ``threshold``, else ``0``.
    """
    # librosa resamples to the requested rate; the returned rate is therefore
    # always TARGET_SAMPLE_RATE and is not needed afterwards.
    audio, _ = librosa.load(audio_path, sr=TARGET_SAMPLE_RATE)
    input_features = processor(
        audio,
        return_tensors="pt",
        sampling_rate=TARGET_SAMPLE_RATE,
    ).input_features
    input_features = input_features.to(device)

    # Switch to inference mode (affects layers such as dropout) and skip
    # gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        output = model(input_features)

    # .item() requires a single-element tensor, i.e. exactly one clip.
    return 1 if output.item() > threshold else 0


def load_model(model_path, model_name="openai/whisper-small"):
    """Build a ``WhisperClassifier`` and load saved weights into it.

    Args:
        model_path: Path to a state dict saved with ``torch.save``.
        model_name: Backbone handed to ``WhisperClassifier``. It must match
            the backbone the checkpoint was trained with, otherwise
            ``load_state_dict`` will reject the weights.

    Returns:
        Tuple ``(model, device)`` where ``device`` is CUDA when available
        and CPU otherwise. The model is not switched to eval mode here.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = WhisperClassifier(model_name).to(device)
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from trusted sources, or pass weights_only=True on torch versions that
    # support it.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    return model, device