from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

# Location of the Hugging Face datasets cache, relative to the working directory.
cache_dir = "./../../../cache"

# No split is requested here, so the result is indexed by split name
# (the tokenisation loop walks over ``dataset.keys()``).
dataset = load_dataset(
    "openslr/librispeech_asr",
    cache_dir=cache_dir,
    trust_remote_code=True,
)
from repcodec.RepCodec import RepCodec
import torch
import yaml

# Build the RepCodec model from the keyword arguments stored in its YAML config.
config = "./../repcodec/configs/repcodec_dim1024.yaml"
with open(config) as config_file:
    conf = yaml.load(config_file, Loader=yaml.FullLoader)
model = RepCodec(**conf)

# The codec weights live under ["model"]["repcodec"] inside the checkpoint.
checkpoint = torch.load(
    "./../../models/data2vec_large_l18.pkl",
    map_location="cuda:0",
)
model.load_state_dict(checkpoint["model"]["repcodec"])
# Drop the reference so the full checkpoint is not kept alive for the whole run.
del checkpoint

# NOTE(review): presumably prepares the quantizer codebook after the weights
# are loaded -- confirm against the RepCodec implementation.
model.quantizer.initial()
model.eval()
model.to("cuda:0")
from data2vec_feature_reader import Data2vecFeatureReader

# Feature extractor wrapping the pretrained data2vec checkpoint.  The loop
# below uses ``reader.max_chunk`` as the number of waveform samples fed to the
# model per forward pass.
reader = Data2vecFeatureReader(
    "./../../models/vox_pretrained.pt",
    18,  # presumably the layer index to read features from -- TODO confirm
    device="cuda:0",
    max_chunk=1600000,
)
import os

import torch.nn.functional as F
import numpy as np

# Directory that receives one ``<split>.npz`` archive per dataset split.
TOKEN_DIR = "./tkns"


def _tokenize_waveform(waveform):
    """Turn one raw waveform into its sequence of RepCodec token ids.

    Uses the module-level ``reader`` (data2vec feature extractor) and
    ``model`` (RepCodec) objects.

    Args:
        waveform: NumPy array of audio samples, as found in
            ``sample["audio"]["array"]``; it is flattened to a batch of one.

    Returns:
        NumPy array holding the codebook indices of the first (only) batch
        entry.
    """
    with torch.no_grad():
        wav = torch.from_numpy(waveform).float().to(reader.device)
        if reader.task.cfg.normalize:
            # Normalise over the whole utterance before it is reshaped.
            wav = F.layer_norm(wav, wav.shape)
        wav = wav.view(1, -1)

        # Long utterances are fed through the feature model in slices of at
        # most ``reader.max_chunk`` samples and stitched back together.
        chunk_feats = []
        for start in range(0, wav.size(1), reader.max_chunk):
            res = reader.model.extract_features(
                source=wav[:, start:start + reader.max_chunk],
                padding_mask=None,
                mask=False,
                layer=reader.layer,
            )
            chunk_feats.append(res["x"])

        # Join the chunks along dim 1 and swap the last two axes
        # (presumably (batch, time, dim) -> (batch, dim, time) -- confirm).
        features = torch.cat(chunk_feats, 1).permute(0, 2, 1)
        hidden = model.encoder(features)
        latent = model.projector(hidden)
        _, token_ids = model.quantizer.codebook.forward_index(
            latent.transpose(2, 1)
        )
    return token_ids.detach().cpu().data.numpy()[0]


# np.savez does not create missing parent directories; make sure the output
# directory exists *before* spending time tokenising a whole split.
os.makedirs(TOKEN_DIR, exist_ok=True)

for split, split_data in dataset.items():
    tokens = [
        _tokenize_waveform(sample["audio"]["array"])
        for sample in tqdm(split_data)
    ]
    # Positional arrays are stored as arr_0, arr_1, ... in dataset order.
    np.savez(os.path.join(TOKEN_DIR, f"{split}.npz"), *tokens)