| |
| import nltk |
| import librosa |
| import torch |
| import gradio as gr |
| from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC |
| nltk.download("punkt") |
|
|
|
|
| model_name = "kalmuraee/tokens" |
| tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name) |
| model = Wav2Vec2ForCTC.from_pretrained(model_name) |
|
|
def load_data(input_file):
    """Load an audio file and return a mono waveform at 16 kHz.

    Args:
        input_file: Path to the audio file to transcribe.

    Returns:
        1-D numpy array of float samples at 16 kHz, the rate the
        Wav2Vec2 model expects.
    """
    # Ask librosa to resample to 16 kHz at load time; this replaces the
    # old separate librosa.resample(speech, sample_rate, 16000) call,
    # whose positional signature was removed in librosa >= 0.10.
    # mono=True downmixes multi-channel audio for us.
    speech, sample_rate = librosa.load(input_file, sr=16000, mono=True)

    # Defensive: if a multi-channel array still arrives, downmix by
    # averaging channels (librosa is channel-first, so axis=0); the old
    # code indexed axis 1 and summed, which assumed the wrong layout and
    # could clip.
    if speech.ndim > 1:
        speech = speech.mean(axis=0)

    return speech
|
|
|
|
def correct_casing(input_sentence):
    """Upper-case the first character of every sentence in the input.

    Args:
        input_sentence: Text to sentence-case (typically the lowercased
            ASR output).

    Returns:
        The text with each sentence's first character upper-cased,
        sentences re-joined by single spaces.
    """
    sentences = nltk.sent_tokenize(input_sentence)
    # s[:1] (not s[0]) is safe for empty strings; the previous
    # s.replace(s[0], s[0].capitalize(), 1) raised IndexError on an
    # empty sentence and was a roundabout way to upper-case one char.
    return ' '.join(s[:1].upper() + s[1:] for s in sentences)
| |
| |
def asr_transcript(input_file):
    """Transcribe a speech recording to sentence-cased text.

    Args:
        input_file: Path to the audio file (as provided by the Gradio
            microphone widget).

    Returns:
        The transcription as a sentence-cased string.
    """
    speech = load_data(input_file)

    # Convert the raw waveform into a model-ready tensor batch.
    input_values = tokenizer(speech, return_tensors="pt").input_values

    # Inference only: no_grad skips building the autograd graph, cutting
    # memory use per request without changing the logits.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: pick the most likely token id per frame, then
    # let the tokenizer collapse repeats/blanks into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])

    # Normalize to lower case, then restore sentence capitalization.
    return correct_casing(transcription.lower())
| |
| |
| |
| |
# Wire the transcription function into a simple Gradio web UI and start it.
app = gr.Interface(
    fn=asr_transcript,
    inputs=gr.inputs.Audio(
        source="microphone", type="filepath", optional=True, label="Speaker"
    ),
    outputs=gr.outputs.Textbox(label="Output Text"),
    title="ASR using Wav2Vec 2.0",
    description="This application displays transcribed text for given audio input",
    examples=[["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]],
    theme="grass",
)
app.launch()
|
|