Spaces:
Sleeping
Sleeping
| import torch | |
| import librosa | |
| import numpy as np | |
| from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification | |
# =====================
# CONFIG
# =====================
# Directory holding the exported model weights and processor files.
MODEL_DIR = "exported_audio_model"
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Sample rate, in Hz, used for loading audio and passed to the processor.
SR = 16000
# Longest clip kept per prediction, in seconds.
MAX_DURATION_S = 8
# The same limit expressed in samples at SR.
MAX_SAMPLES = MAX_DURATION_S * SR
# =====================
# LOAD MODEL + PROCESSOR (ONCE)
# =====================
# Both objects are created at import time so every prediction call shares them.
processor = Wav2Vec2Processor.from_pretrained(MODEL_DIR)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR)
# Move the weights to the selected device, then switch to evaluation mode.
model = model.to(DEVICE).eval()
# =====================
# PREDICT FUNCTION
# =====================
def predict_audio(wav_path):
    """Classify one audio file and return its predicted label and confidence.

    The file is decoded by ``librosa.load`` as mono audio at ``SR`` Hz, cut
    to at most ``MAX_SAMPLES`` samples, and run through the module-level
    ``processor`` and ``model`` on ``DEVICE``.

    Args:
        wav_path: Audio source handed directly to ``librosa.load``.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is the entry of
        ``model.config.id2label`` for the highest-probability class, and
        ``confidence`` is that class's softmax probability expressed as a
        percentage rounded to two decimals.

    Raises:
        ValueError: If the decoded audio contains no samples.
    """
    # The returned sample rate is not needed: resampling to SR was requested.
    audio, _ = librosa.load(wav_path, sr=SR, mono=True)

    # Fail early with a clear message instead of an opaque error further down.
    if len(audio) == 0:
        raise ValueError(f"No audio samples could be decoded from {wav_path!r}")

    # Keep only the leading MAX_SAMPLES samples; shorter clips are unchanged.
    audio = audio[:MAX_SAMPLES]

    # Processor handles padding and builds the attention mask.
    inputs = processor(
        audio,
        sampling_rate=SR,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True,
    )
    input_values = inputs.input_values.to(DEVICE)
    attention_mask = inputs.attention_mask.to(DEVICE)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(
            input_values=input_values,
            attention_mask=attention_mask,
        )

    # Single-item batch: take row 0 of the per-class probabilities.
    probs = torch.softmax(outputs.logits, dim=1)[0]
    pred_id = torch.argmax(probs).item()
    label = model.config.id2label[pred_id]
    confidence = probs[pred_id].item() * 100
    return label, round(confidence, 2)