"""Transcribe a wav file with Wav2Vec2 + an n-gram language-model decoder."""
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

# Load the processor and the model from the SAME checkpoint: the LM decoder's
# alphabet must match the CTC head's vocabulary, so mixing the "-with-lm"
# processor with a different acoustic model would mis-decode the logits.
processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
model.eval()  # inference only: disable dropout

# Load the audio data (use your own wav file here!).
# Wav2Vec2 was trained on 16 kHz audio, so resample to that rate.
input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)

# Tokenize / extract features. Passing sampling_rate lets the feature
# extractor verify the audio matches the rate the model expects.
input_values = processor(
    input_audio, sampling_rate=sr, return_tensors="pt", padding="longest"
).input_values

# Retrieve logits without building an autograd graph (saves memory and
# removes the need for .detach() before converting to numpy).
with torch.no_grad():
    logits = model(input_values).logits

# Decode with the n-gram language model (beam-search over the CTC logits).
transcription = processor.batch_decode(logits.numpy()).text

# Print the output.
print(transcription)