Commit b466f87
Parent(s): 1e2b39a
Update README.md

README.md (CHANGED)
@@ -32,45 +32,34 @@ model-index:
 
 # Sharif-wav2vec2
 
-Prior to the usage, you may need to install the below dependencies:
+This is a fine-tuned version of Sharif Wav2vec2 for Farsi. Before using it, you may need to install the dependencies below:
 
 ```shell
 pip -q install pyctcdecode
 python -m pip -q install pypi-kenlm
 ```
 
-
+For testing, you can use the hosted inference API on Hugging Face (examples from Common Voice are provided there); transcribing a given audio clip may take a while. Alternatively, you can run the code below locally:
+
 ```python
 import tensorflow
 import torchaudio
 import torch
-import librosa
 import numpy as np
-from transformers import AutoProcessor, AutoModelForCTC
-
-processor = AutoProcessor.from_pretrained("SLPL/Sharif-wav2vec2")
-model = AutoModelForCTC.from_pretrained("SLPL/Sharif-wav2vec2")
-
-
 
-
-speech_array, sampling_rate = torchaudio.load("test.wav")
+speech_array, sampling_rate = torchaudio.load("wav2vec2-test.wav")
 speech_array = speech_array.squeeze().numpy()
-speech_array = librosa.resample(
-    np.asarray(speech_array),
-    sampling_rate,
-    processor.feature_extractor.sampling_rate)
-
 
 features = processor(
     speech_array,
     sampling_rate=processor.feature_extractor.sampling_rate,
     return_tensors="pt",
     padding=True)
-
-attention_mask = features.attention_mask
+
 with torch.no_grad():
-    logits = model(features.input_values, attention_mask=attention_mask).logits
+    logits = model(
+        features.input_values,
+        attention_mask=features.attention_mask).logits
 prediction = processor.batch_decode(logits.numpy()).text
 
 print(prediction[0])
```
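
As committed, the local-run snippet references `processor` and `model` without defining them: this revision removes the lines that loaded both. A minimal sketch of the missing setup, taken from the removed lines above (same checkpoint, same `transformers` Auto classes):

```python
# Setup removed by this commit but still required by the snippet above.
# AutoProcessor is expected to resolve to a processor with LM-boosted CTC
# decoding here (hence the pyctcdecode and pypi-kenlm dependencies), whose
# batch_decode accepts raw logits and returns an object with a .text field.
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("SLPL/Sharif-wav2vec2")
model = AutoModelForCTC.from_pretrained("SLPL/Sharif-wav2vec2")
```

Without these lines (or equivalents), `features = processor(...)` in the snippet raises a NameError.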
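
The commit also drops the librosa resampling step, so the snippet now assumes `wav2vec2-test.wav` is already at the rate the feature extractor expects (16 kHz for wav2vec 2.0 checkpoints). If that assumption may not hold for your audio, one possible guard using torchaudio's own resampler and the `processor` from the setup above; the guard itself is not part of the README:

```python
import torchaudio

# Hypothetical guard, not in the README: resample only when the file's
# rate differs from the rate the feature extractor expects.
speech_array, sampling_rate = torchaudio.load("wav2vec2-test.wav")
target_rate = processor.feature_extractor.sampling_rate
if sampling_rate != target_rate:
    resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
    speech_array = resampler(speech_array)
speech_array = speech_array.squeeze().numpy()
```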