| from transformers import HubertModel |
| import torch.nn as nn |
| import torch |
| import torch.nn.functional as F |
| import librosa |
|
|
|
|
class HubertModelWithFinalProj(HubertModel):
    """``HubertModel`` subclass that additionally owns a ``final_proj`` layer.

    The extra layer is a linear map from ``config.hidden_size`` to
    ``config.classifier_proj_size``. It is only registered here; the forward
    pass is inherited unchanged from ``HubertModel``.
    """

    def __init__(self, config):
        """Build the base model, then attach the projection layer.

        Args:
            config: Configuration object forwarded to ``HubertModel``. It
                must expose ``hidden_size`` and ``classifier_proj_size``.
        """
        super().__init__(config)

        in_features = config.hidden_size
        out_features = config.classifier_proj_size
        # NOTE(review): final_proj is not called anywhere in the visible
        # code; presumably it exists so that checkpoints which contain
        # these weights load without missing/unexpected keys -- confirm.
        self.final_proj = nn.Linear(in_features, out_features)
|
|
|
|
def get_content_model(config='lengyue233/content-vec-best'):
    """Load a pretrained ``HubertModelWithFinalProj`` in evaluation mode.

    Args:
        config: Value handed straight to ``from_pretrained``. Despite the
            parameter name, the default is a model identifier string
            (``'lengyue233/content-vec-best'``).

    Returns:
        The loaded model after ``eval()`` has been applied to it.
    """
    content_model = HubertModelWithFinalProj.from_pretrained(config)
    # nn.Module.eval() switches the module to eval mode and returns it.
    return content_model.eval()
|
|
|
|
@torch.no_grad()
def get_content(model, wav_16k_tensor, device='cuda'):
    """Run ``model`` on a zero-padded waveform and return its hidden states.

    The input is moved to ``device`` and padded with zeros on both ends of
    its last dimension by ``(400 - 320) // 2`` = 40 samples before being
    passed to ``model``. Gradient tracking is disabled for the whole call.
    The model itself is NOT moved; the caller must place it on ``device``.

    Args:
        model: Callable whose result supports ``['last_hidden_state']``.
        wav_16k_tensor: Input tensor. Presumably 16 kHz audio shaped
            (batch, samples), judging by the name -- confirm with callers.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        The ``'last_hidden_state'`` entry of the model output.
    """
    # NOTE(review): 400 and 320 look like the encoder's receptive field and
    # hop size in samples, so this padding would yield one frame per 320
    # input samples -- confirm against the model's feature extractor.
    window, hop = 400, 320
    margin = (window - hop) // 2

    padded = F.pad(wav_16k_tensor.to(device), (margin, margin))
    hidden_states = model(padded)['last_hidden_state']
    return hidden_states
|
|
|
|
if __name__ == '__main__':
    # Smoke test: extract content features from the start of 'test.wav'
    # and print them.

    # Use the GPU when one is present, otherwise fall back to the CPU so
    # the script does not crash on machines without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = get_content_model().to(device)

    # librosa resamples to 16 kHz on load; the returned rate is not needed.
    audio, _ = librosa.load('test.wav', sr=16000)
    audio = audio[:100 * 320]  # keep at most the first 100 * 320 samples

    # Convert the NumPy array directly and add a leading batch dimension.
    # torch.tensor([ndarray]) goes through a Python list of arrays, which
    # PyTorch warns is extremely slow; the resulting shape/dtype are equal.
    audio = torch.from_numpy(audio).unsqueeze(0)

    content = get_content(model, audio, device)
    print(content)