| # Arabic syllable recognition with tashkeel (diacritics). | |
| This is a fine-tuned wav2vec2 model that recognizes Arabic syllables from speech. | |
| The model was trained on a Modern Standard Arabic dataset.\ | |
| A 5-gram language model is available with the model. | |
| To try it out: | |
| ``` | |
| !pip install datasets transformers | |
| !pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode | |
| ``` | |
| ``` | |
| from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC | |
| from transformers import Wav2Vec2ProcessorWithLM | |
| processor = Wav2Vec2ProcessorWithLM.from_pretrained('IbrahimSalah/Syllables_final_Large') | |
| model = Wav2Vec2ForCTC.from_pretrained("IbrahimSalah/Syllables_final_Large") | |
| ``` | |
| ``` | |
| import pandas as pd | |
| dftest = pd.DataFrame(columns=['audio']) | |
| import datasets | |
| from datasets import Dataset | |
| path ='/content/908-33.wav' | |
| dftest['audio']=[path] ## audio path | |
| dataset = Dataset.from_pandas(dftest) | |
| ``` | |
| ``` | |
| import torch | |
| import torchaudio | |
| def speech_file_to_array_fn(batch): | |
| speech_array, sampling_rate = torchaudio.load(batch["audio"]) | |
| print(sampling_rate) | |
| resampler = torchaudio.transforms.Resample(sampling_rate, 16_000) # The original data was with 48,000 sampling rate. You can change it according to your input. | |
| batch["audio"] = resampler(speech_array).squeeze().numpy() | |
| return batch | |
| ``` | |
| ``` | |
| import numpy as np | |
| from datasets import load_dataset | |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
| test_dataset = dataset.map(speech_file_to_array_fn) | |
| inputs = processor(test_dataset["audio"], sampling_rate=16_000, return_tensors="pt", padding=True) | |
| with torch.no_grad(): | |
| logits = model(inputs.input_values).logits | |
| print(logits.numpy().shape) | |
| transcription = processor.batch_decode(logits.numpy()).text | |
| print("Prediction:",transcription[0]) | |
| ``` | |