This is canary-1b-flash fine-tuned for the Persian language on the ASR task only.
For how to perform inference, see https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Canary_Multitask_Speech_Model.ipynb
Inference
Install packages
# Notebook setup cell: prepares the environment and installs NeMo with its ASR extras.
# Upgrade pip first.
!python -m pip install --upgrade pip
# Install the libsndfile1 and ffmpeg system packages (audio libraries).
!apt-get update && apt-get install -y libsndfile1 ffmpeg
# Install Cython and packaging ahead of the NeMo install.
!pip install Cython packaging
# Delete any existing blinker files from the Python 3.10 package directories, then
# reinstall blinker with pip.
# NOTE(review): presumably works around pip being unable to uninstall a
# system-managed blinker — confirm this is still needed on your runtime.
!rm -rf /usr/lib/python3.10/site-packages/blinker*
!rm -rf /usr/local/lib/python3.10/dist-packages/blinker*
!pip install --ignore-installed blinker
!pip install --upgrade --force-reinstall blinker
# Install NeMo (with the `asr` extra) from the main branch of the GitHub repository.
!pip install 'nemo_toolkit[asr] @ git+https://github.com/NVIDIA/NeMo.git@main'
Download .nemo model from HF
import os

from huggingface_hub import hf_hub_download

# Download the fine-tuned .nemo checkpoint from the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable rather than
# being hard-coded in the notebook, so the secret never ends up in shared code.
# When HF_TOKEN is unset, token=None lets huggingface_hub fall back to the
# locally cached login (if any).
file_path = hf_hub_download(
    repo_id="SadeghK/persian-nemo-stt-canary-1b-flash-fa-model",
    filename="canary-1b-flash-fa/checkpoints/canary-1b-flash-fa.nemo",
    token=os.environ.get("HF_TOKEN"),
)
# file_path is the local path of the downloaded checkpoint.
print(f"Downloaded to: {file_path}")
Import model
import torch
from nemo.collections.asr.models import EncDecMultiTaskModel

# Choose where the checkpoint is loaded: 'cuda' when CUDA is available, otherwise 'cpu'.
if torch.cuda.is_available():
    map_location = 'cuda'
else:
    map_location = 'cpu'

# Restore the model from the downloaded .nemo file and put it in evaluation mode.
canary_model = EncDecMultiTaskModel.restore_from(file_path, map_location=map_location)
canary_model.eval()
Change strategy to beam
# Take the model's current decoding config and set the beam size to 1.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
# Apply the edited config to the model, then print the model's decoding config to check it.
canary_model.change_decoding_strategy(decode_cfg)
print(canary_model.cfg.decoding)
Change other beam parameters
# Start from the model's current decoding config and edit it in place.
decode_cfg = canary_model.cfg.decoding

# Decoding strategy and the options under `beam`.
decode_cfg.strategy = "beam"
decode_cfg.beam.beam_size = 16
decode_cfg.beam.len_pen = 1.0
decode_cfg.beam.max_generation_delta = 25

# Top-level decoding flags.
decode_cfg.preserve_alignments = True
decode_cfg.compute_hypothesis_token_set = True

# Apply all of the settings above to the model in one call.
canary_model.change_decoding_strategy(decode_cfg)
Transcribe
# Path of the audio file to transcribe.
AUDIO_FILE_PATH = "/content/drive/MyDrive/cNotebooks/perSTT/nvidia-nemo-asr/janathan-morgh-e-daryayi/03/chunk_007.wav"

# Transcribe the file with source and target language both set to 'fa',
# and timestamps requested.
transcript = canary_model.transcribe(
    audio=[AUDIO_FILE_PATH],
    batch_size=16,
    source_lang="fa",
    target_lang="fa",
    pnc="False",  # passed as the string 'False', not the boolean False
    timestamps=True,
)
print(transcript)
The result is not as good as I expected: text='فلجر بود که داشتنِ آهسته و عمودی شانزده ای را تمرین می کرد و شماریِ هر نوشته را بلند می زد', text='فلجر بود که داشت غلتیدنِ آهسته و عمودی شانزده نقطهای را تمرین می کرد و شمارۀ هر نوشته را بلند فریاد می زد',
- Downloads last month
- 2
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support