| ``` | |
# Turn an audio file into a sequence of discrete units: run the HuBERT
# checkpoint on the waveform, take the hidden states at index 9, and assign
# every frame to its nearest k-means centroid.
from huggingface_hub import hf_hub_download
import joblib
from transformers import Wav2Vec2Processor, HubertModel
from torchaudio import load
from torchaudio.functional import resample
import torch

model_name = "Ansu/mHubert-basque-k1000-L9"
target_sample_rate = 16000  # sampling rate declared to the processor below

# hf_hub_download returns the local path of the fetched file; load from that
# path instead of re-typing it, so the two can never drift apart.
kmeans_path = hf_hub_download(
    repo_id=model_name,
    filename="kmeans/basque_hubert_k1000_L9.pt",
    local_dir="./",
)
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load k-means files from a repository you trust.
kmeans = joblib.load(kmeans_path)

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = HubertModel.from_pretrained(model_name)
model.eval()

# torchaudio's load returns (waveform, sample_rate); the waveform is
# channel-first, i.e. shaped (channels, frames).
audio, sample_rate = load("path/to/audio")
# Downmix to a single 1-D signal. For mono input this equals squeeze(0); for
# multi-channel input it avoids feeding each channel as a separate batch item.
audio = audio.mean(dim=0)
# The processor is told the audio is 16 kHz, so make that true.
if sample_rate != target_sample_rate:
    audio = resample(audio, sample_rate, target_sample_rate)

inputs = processor(
    audio.numpy(),
    sampling_rate=target_sample_rate,
    return_tensors="pt",
    padding=True,
)
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)

# hidden_states[0] is the embedding output, so index 9 is the output of the
# 9th transformer layer. squeeze(0) drops the batch axis of the single
# utterance, leaving one feature row per frame.
features = out.hidden_states[9].squeeze(0).numpy()
# One cluster index per frame.
units = kmeans.predict(features)
| ``` |