|
|
from transformers import HubertModel |
|
|
import torch.nn as nn |
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
import torchaudio |
|
|
import librosa |
|
|
|
|
|
|
|
|
class HubertModelWithFinalProj(HubertModel):
    """``HubertModel`` subclass that additionally owns a ``final_proj`` layer.

    The layer is only created here; this class does not override
    ``forward``, so nothing visible in this class applies the projection.
    """

    def __init__(self, config):
        """Initialise the base model and attach the projection layer.

        Args:
            config: Model configuration object. It must expose
                ``hidden_size`` and ``classifier_proj_size``.
        """
        super().__init__(config)

        in_features = config.hidden_size
        out_features = config.classifier_proj_size
        # NOTE(review): presumably kept so that pretrained checkpoints which
        # contain ``final_proj`` weights load cleanly -- confirm.
        self.final_proj = nn.Linear(in_features, out_features)
|
|
|
|
|
|
|
|
class VoiceConversionExtractor(nn.Module):
    """Feature extractor built on a pretrained ``HubertModelWithFinalProj``.

    Input audio is resampled to ``target_sr`` when its sample rate differs,
    padded symmetrically along the last dimension, and passed through the
    encoder. The encoder's ``last_hidden_state`` is returned.
    """

    def __init__(self, config, sr):
        """Load the encoder and, if required, create a resampler.

        Args:
            config: Value forwarded unchanged to
                ``HubertModelWithFinalProj.from_pretrained``.
            sr: Sample rate of the audio that will be given to ``forward``.
        """
        super().__init__()

        self.sr = sr
        self.target_sr = 16000

        # ``eval()`` returns the module itself, so loading and switching to
        # evaluation mode can be chained into a single expression.
        self.encoder = HubertModelWithFinalProj.from_pretrained(config).eval()

        # The resampler attribute exists only when the rates differ;
        # ``forward`` performs the same comparison before using it.
        if self.sr != self.target_sr:
            self.resampler = torchaudio.transforms.Resample(
                orig_freq=self.sr,
                new_freq=self.target_sr,
            )

    def forward(self, audio):
        """Return the encoder's last hidden state for ``audio``.

        Args:
            audio: Waveform tensor sampled at ``self.sr``.
                Presumably shaped ``(batch, samples)`` -- TODO confirm.

        Returns:
            The ``'last_hidden_state'`` entry of the encoder output.
        """
        needs_resampling = self.sr != self.target_sr
        waveform = self.resampler(audio) if needs_resampling else audio

        # (400 - 320) // 2 == 40 samples added to each side of the last
        # dimension. NOTE(review): 400 / 320 look like the encoder's receptive
        # field and hop size in samples -- confirm.
        side_pad = (400 - 320) // 2
        padded = F.pad(waveform, (side_pad, side_pad))

        encoder_output = self.encoder(padded)
        return encoder_output['last_hidden_state']
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Smoke test: load a clip, run it through the extractor and print the
    # shape of the resulting features.
    input_sr = 24000  # single source of truth for the model and the loader

    model = VoiceConversionExtractor('lengyue233/content-vec-best', input_sr)

    audio, _ = librosa.load('test.wav', sr=input_sr)
    # Keep at most 100 * 320 * 1.5 = 48000 samples.
    # NOTE(review): 1.5 matches 24000 / 16000, so this is presumably sized to
    # yield about 100 encoder frames after resampling -- confirm.
    audio = audio[:round(100 * 320 * 1.5)]

    # Convert the NumPy array directly and add a leading batch dimension.
    # ``torch.tensor([ndarray])`` goes through a Python list of arrays, which
    # is a slow path that PyTorch warns about; the result here has the same
    # shape and dtype.
    audio = torch.from_numpy(audio).unsqueeze(0)

    with torch.no_grad():
        content = model(audio)

    print(content.shape)