"""Smoke test: run a pretrained WavLM model on a single audio file.

Loads the checkpoint named by ``MODEL_ID``, feeds it one audio file and
prints the shape of the final hidden-state tensor. No device is selected
explicitly, so the model and inputs stay wherever PyTorch places them by
default.
"""

import sys

import librosa
import torch
from transformers import AutoFeatureExtractor, AutoModel

MODEL_ID = "microsoft/wavlm-base"


def load_audio(path: str, target_sr: int = 16000):
    """Load an audio file as a mono waveform at ``target_sr``.

    Args:
        path: Path of the audio file to read.
        target_sr: Sampling rate requested from ``librosa.load``.

    Returns:
        A ``(audio, sr)`` tuple exactly as returned by ``librosa.load``
        when called with ``sr=target_sr`` and ``mono=True``.
    """
    audio, sr = librosa.load(path, sr=target_sr, mono=True)
    return audio, sr


def main(audio_path: str = "sample.wav"):
    """Run the model once on ``audio_path`` and print the embedding shape.

    Args:
        audio_path: Audio file to process. Defaults to ``"sample.wav"``,
            the path that used to be hard-coded.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    print("Loading model:", MODEL_ID)
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
    model = AutoModel.from_pretrained(MODEL_ID)
    model.eval()  # inference only

    audio, sr = load_audio(audio_path)
    if len(audio) == 0:
        # Fail early with a readable message instead of an opaque error
        # from somewhere inside the model's forward pass.
        raise ValueError(f"No audio samples could be read from {audio_path!r}")
    print("Audio length (sec):", round(len(audio) / sr, 2))

    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")

    # Gradients are not needed for a forward-only check.
    with torch.no_grad():
        out = model(**inputs)

    x = out.last_hidden_state  # [batch, frames, hidden]
    print("OK ✅ WavLM ran on CPU")
    print("Embedding tensor shape:", tuple(x.shape))


if __name__ == "__main__":
    # An optional first command-line argument overrides the default audio
    # file; with no arguments this is the same as calling main().
    main(*sys.argv[1:2])