# Ansu's picture
# Update README.md
# 2b7c9a4 verified

import joblib
import torch
from huggingface_hub import hf_hub_download
from torchaudio import load
from torchaudio.functional import resample
from transformers import Wav2Vec2Processor, HubertModel

# Extract discrete units from an audio file: run it through the HuBERT
# checkpoint, take the hidden states of one layer, and assign each frame to
# its nearest k-means centroid.

# Hub repository that hosts both the HuBERT checkpoint and the k-means codebook.
model_name = "Ansu/mHubert-basque-k1000-L9"
KMEANS_FILENAME = "kmeans/basque_hubert_k1000_L9.pt"
# Sample rate declared to the processor below; input audio is resampled to it.
TARGET_SAMPLE_RATE = 16000
# Index into `out.hidden_states`. In transformers, entry 0 is the embedding
# output, so index 9 is the output of the 9th encoder layer. The "L9" in the
# codebook name suggests this is the layer the k-means was fitted on -- TODO confirm.
FEATURE_LAYER = 9

# hf_hub_download returns the local path of the fetched file; use it directly
# instead of re-typing a path relative to the current working directory.
kmeans_path = hf_hub_download(repo_id=model_name, filename=KMEANS_FILENAME, local_dir="./")

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load codebooks from repositories you trust.
kmeans = joblib.load(kmeans_path)

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = HubertModel.from_pretrained(model_name)
model.eval()

# Keep the file's native sample rate instead of discarding it, so a mismatch
# with TARGET_SAMPLE_RATE can be corrected below.
audio, sample_rate = load("path/to/audio")

# torchaudio.load returns (channels, frames) by default. Averaging over the
# channel axis yields a 1-D waveform for mono and stereo alike; a plain
# squeeze(0) would leave stereo input 2-D and it would be treated as a batch.
audio = audio.mean(dim=0)

if sample_rate != TARGET_SAMPLE_RATE:
    audio = resample(audio, orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)

# The feature extractor documents NumPy arrays / lists of floats as input.
inputs = processor(
    audio.numpy(),
    sampling_rate=TARGET_SAMPLE_RATE,
    return_tensors="pt",
    padding=True,
)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)

# Drop the size-1 batch axis so k-means receives a 2-D (frames, dim) array.
features = out.hidden_states[FEATURE_LAYER].squeeze(0).numpy()

# One cluster index per frame.
units = kmeans.predict(features)