Spaces:
Sleeping
Sleeping
| """ | |
| This is an example how to load the model from Huggingface and use it to | |
| - Recognize IPA phones | |
| - Extract CNN features | |
| - Extract Transformer Encoder features | |
| """ | |
from decoder.ctc_decoder import decode_lattice
from phonetics.ipa import symbol_to_descriptor, to_symbol
from model.wav2vec2 import Wav2Vec2
from torchinfo import summary
import torch
import numpy as np
def main():
    """Run the Wav2Vec2_CommonPhone example end to end.

    Loads the pretrained model from the Huggingface hub, prints a model
    summary, feeds one second of (random) standardized audio through the
    network, CTC-decodes the output lattice into IPA phone indices, and
    prints the recognized IPA symbol sequence plus one descriptor example.
    """
    # Pick GPU when available, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load model from Huggingface hub and switch to inference mode.
    wav2vec2 = Wav2Vec2.from_pretrained("pklumpp/Wav2Vec2_CommonPhone")
    wav2vec2.to(device)
    wav2vec2.eval()

    # Print model summary for batch_size 1 and a single second of audio samples
    # (the model expects 16 kHz audio, hence 16_000 samples per second).
    summary(wav2vec2, input_size=(1, 16_000), depth=8, device=device)

    # Create new random audio (you can load your own audio here to get actual predictions)
    rand_audio = np.random.rand(1, 16_000)

    # IMPORTANT: Always standardize input audio (zero mean, unit variance).
    # The epsilon guards against division by zero for constant signals.
    mean = rand_audio.mean()
    std = rand_audio.std()
    rand_audio = (rand_audio - mean) / (std + 1e-9)

    # Create torch tensor, move to device and feed the model.
    rand_audio = torch.tensor(
        rand_audio,
        dtype=torch.float,
        device=device,
    )
    with torch.no_grad():
        y_pred, enc_features, cnn_features = wav2vec2(rand_audio)

    # Decode CTC output for the first sample in the batch.
    phone_sequence, enc_feats, cnn_feats, probs = decode_lattice(
        lattice=y_pred[0].cpu().numpy(),
        enc_feats=enc_features[0].cpu().numpy(),
        cnn_feats=cnn_features[0].cpu().numpy(),
    )

    # phone_sequence contains indices right now. Convert to actual IPA symbols.
    # Fix: the example previously computed this sequence but never showed it.
    symbol_sequence = [to_symbol(i) for i in phone_sequence]
    print(symbol_sequence)

    # Example to convert [œ] to the descriptor "front open-mid rounded vowel"
    print(symbol_to_descriptor("œ"))


if __name__ == "__main__":
    main()