Spaces:
Sleeping
Sleeping
| #Importing all the necessary packages | |
| import nltk | |
| import soundfile | |
| import librosa | |
| import torch | |
| import gradio as gr | |
| from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC | |
| import os | |
# Sentence-tokenizer data needed by nltk.sent_tokenize (used in correct_casing).
# Recent NLTK releases look up "punkt_tab" instead of the older pickled "punkt"
# models, so both are fetched to work with either generation of the library.
nltk.download("punkt")
nltk.download("punkt_tab")

# Hugging Face access token read from the environment; None when the variable
# is not set.
hf_token = os.environ.get("25-hf-secrete-key")

# Loading the pre-trained model and its processor.
model_name = "moro23/wav2vec-large-xls-r-300-ha-colab_4"
# NOTE: `tokenizer` is actually a Wav2Vec2Processor; the name is kept because
# the rest of the file refers to it.
# `token=` is the current keyword for the deprecated `use_auth_token=`.
tokenizer = Wav2Vec2Processor.from_pretrained(model_name, token=hf_token)
model = Wav2Vec2ForCTC.from_pretrained(model_name, token=hf_token)
def load_data(input_file, target_sr=16000):
    """Load an audio file as a mono waveform sampled at ``target_sr`` Hz.

    Args:
        input_file: Path (or other source accepted by ``librosa.load``) of the
            audio to read.
        target_sr: Sampling rate of the returned waveform. Defaults to 16 kHz.

    Returns:
        The 1-D sample array produced by ``librosa.load``.
    """
    # Without an explicit `sr`, librosa.load resamples to its own default rate
    # first, which then required a second resample down to 16 kHz. Requesting
    # the target rate (and mono down-mixing) up front does it in a single pass.
    speech, _ = librosa.load(input_file, sr=target_sr, mono=True)
    return speech
def correct_casing(input_sentence):
    """Capitalise the first character of every sentence in ``input_sentence``.

    The text is split with ``nltk.sent_tokenize``; the resulting sentences are
    re-joined with single spaces.
    """
    capitalised = []
    for sentence in nltk.sent_tokenize(input_sentence):
        first_char, remainder = sentence[0], sentence[1:]
        capitalised.append(first_char.capitalize() + remainder)
    return " ".join(capitalised)
def asr_transcript(input_file):
    """Transcribe an audio file with the loaded Wav2Vec2 CTC model.

    Args:
        input_file: Path of the audio file to transcribe. May be ``None`` or
            empty when the caller has no audio to offer.

    Returns:
        The lower-cased transcription with the first letter of each sentence
        capitalised, or an empty string when no input file was given.
    """
    # Nothing recorded or uploaded: return an empty transcript instead of
    # failing inside the audio loader.
    if not input_file:
        return ""

    speech = load_data(input_file)

    # Turn the raw waveform into model input tensors.
    inputs = tokenizer(speech, return_tensors="pt", sampling_rate=16000, padding=True)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy CTC decoding: most likely token per frame for the single item in
    # the batch, then collapse the ids into text.
    predicted_ids = torch.argmax(logits, dim=-1)[0]
    transcription = tokenizer.decode(predicted_ids)

    # Correcting the letter casing.
    return correct_casing(transcription.lower())
################### Gradio Web APP ################################
title = "Hausa Automatic Speech Recognition"
examples = [["Sample/sample1.mp3"], ["Sample/sample2.mp3"], ["Sample/sample3.mp3"]]
Input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Please Record Your Voice")
Output = gr.Textbox(label="Hausa Script")
description = "This application transcribes spoken Hausa. Record your voice or upload an audio file. Click the 'Flag' button to help us improve the model!"

# `examples` was defined but never handed to the interface. Offer only the
# clips that are actually present on disk so a missing sample cannot break
# start-up; pass None when none of them exist.
available_examples = [row for row in examples if os.path.isfile(row[0])]

demo = gr.Interface(
    fn=asr_transcript,
    inputs=Input,
    outputs=Output,
    title=title,
    description=description,
    examples=available_examples or None,
    # Do not run the model on every example while the app is starting.
    cache_examples=False,
    allow_flagging="manual",
    flagging_options=["incorrect", "worst", "ambiguous"],
    # NOTE(review): "flaffed_data" looks like a typo for "flagged_data"; left
    # unchanged so any previously flagged records stay in the same directory.
    flagging_dir="flaffed_data",
)
demo.launch(share=True)