"""Single-file audio classification inference with a fine-tuned Wav2Vec2 model.

The processor and model are loaded once at import time; call
``predict_audio(path)`` per audio file to get ``(label, confidence_pct)``.
"""

import librosa
import numpy as np
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# =====================
# CONFIG
# =====================
MODEL_DIR = "exported_audio_model"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SR = 16000              # sample rate the Wav2Vec2 feature extractor expects
MAX_SAMPLES = 8 * SR    # default truncation length: 8 seconds of audio

# =====================
# LOAD MODEL + PROCESSOR (ONCE)
# =====================
processor = Wav2Vec2Processor.from_pretrained(MODEL_DIR)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(DEVICE)
model.eval()  # disable dropout etc.; we only run inference


# =====================
# PREDICT FUNCTION
# =====================
def predict_audio(wav_path, max_samples=MAX_SAMPLES):
    """Classify one audio file and return the top label with its confidence.

    Parameters
    ----------
    wav_path : str or path-like
        Path to an audio file readable by ``librosa.load``.
    max_samples : int, optional
        Maximum number of samples kept from the start of the clip
        (default ``MAX_SAMPLES`` = 8 s at 16 kHz). Longer audio is
        truncated; shorter audio is left as-is (the processor pads).

    Returns
    -------
    tuple[str, float]
        Predicted class label (from ``model.config.id2label``) and the
        softmax confidence as a percentage, rounded to two decimals.
    """
    # librosa resamples to SR and downmixes to mono; the returned rate
    # is therefore always SR, so we discard it.
    audio, _ = librosa.load(wav_path, sr=SR, mono=True)

    # Truncate overly long clips to bound inference cost/memory.
    if len(audio) > max_samples:
        audio = audio[:max_samples]

    # The processor normalizes and pads, and builds the attention mask.
    inputs = processor(
        audio,
        sampling_rate=SR,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True,
    )

    input_values = inputs.input_values.to(DEVICE)
    attention_mask = inputs.attention_mask.to(DEVICE)

    # Inference only: no autograd graph needed.
    with torch.no_grad():
        outputs = model(
            input_values=input_values,
            attention_mask=attention_mask,
        )

    # Batch of 1 -> take row 0; softmax over the class (last) axis.
    probs = torch.softmax(outputs.logits, dim=-1)[0]
    pred_id = torch.argmax(probs).item()
    label = model.config.id2label[pred_id]
    confidence = probs[pred_id].item() * 100

    return label, round(confidence, 2)