# Eripsa's picture
# add phd_model deps
# c27990f
"""
This is an example of how to load the model from Hugging Face and use it to
- Recognize IPA phones
- Extract CNN features
- Extract Transformer Encoder features
"""
from decoder.ctc_decoder import decode_lattice
from phonetics.ipa import symbol_to_descriptor, to_symbol
from model.wav2vec2 import Wav2Vec2
from torchinfo import summary
import torch
import numpy as np
def main():
    """Run the demo: recognize phones and extract features with Wav2Vec2.

    Loads the "pklumpp/Wav2Vec2_CommonPhone" checkpoint, prints a model
    summary, feeds a standardized random signal of 16,000 samples through
    the network, decodes the CTC output of the first batch element, and
    prints the descriptor for the IPA symbol "œ".
    """
    # Prefer the GPU when one is available; otherwise stay on the CPU.
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"

    # Fetch the pretrained weights from the Hugging Face hub and switch the
    # model to evaluation mode on the chosen device.
    model = Wav2Vec2.from_pretrained("pklumpp/Wav2Vec2_CommonPhone")
    model.to(device)
    model.eval()

    # One batch entry holding 16,000 audio samples; the same shape is used
    # for the printed model summary and for the dummy input below.
    batch_shape = (1, 16_000)
    summary(model, input_size=batch_shape, depth=8, device=device)

    # Random stand-in signal (replace with real audio for actual predictions).
    waveform = np.random.rand(*batch_shape)

    # IMPORTANT: the input audio must always be standardized. The small
    # epsilon guards against division by zero.
    waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-9)

    # Convert to a float32 tensor on the target device and run the model
    # without tracking gradients.
    model_input = torch.tensor(waveform, dtype=torch.float32, device=device)
    with torch.no_grad():
        y_pred, enc_features, cnn_features = model(model_input)

    # Decode the CTC output of the first sample in the batch. The decoder
    # works on NumPy arrays, so each tensor is moved to the CPU first.
    first_lattice = y_pred[0].cpu().numpy()
    first_enc = enc_features[0].cpu().numpy()
    first_cnn = cnn_features[0].cpu().numpy()
    phone_sequence, enc_feats, cnn_feats, probs = decode_lattice(
        lattice=first_lattice,
        enc_feats=first_enc,
        cnn_feats=first_cnn,
    )

    # phone_sequence holds indices at this point; map them to IPA symbols.
    symbol_sequence = list(map(to_symbol, phone_sequence))

    # Look up the descriptor of a single IPA symbol; the original example
    # states that [œ] yields "front open-mid rounded vowel".
    print(symbol_to_descriptor("œ"))


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()