openslr/librispeech_asr
Viewer • Updated • 585k • 81.1k • 228
Fine-tuned version of mms-trilingual-dv-ar-en with improved:
from transformers import AutoProcessor, Wav2Vec2ForCTC
import torch
# Load the processor and the CTC model from the same checkpoint so the
# preprocessing and the output vocabulary match the model weights.
processor = AutoProcessor.from_pretrained("Serialtechlab/mms-trilingual-dv-ar-en-v2")
model = Wav2Vec2ForCTC.from_pretrained("Serialtechlab/mms-trilingual-dv-ar-en-v2")

# Process audio (16kHz)
# NOTE: `audio_array` is not defined in this snippet; the caller must supply
# it. Presumably a 1-D float waveform already resampled to 16 kHz -- confirm
# against your audio loading code.
inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")

# Inference only: skip gradient tracking during the forward pass.
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy decoding: take the highest-scoring token id at each output position,
# then let the processor turn the id sequences back into text.
predicted_ids = torch.argmax(logits, dim=-1)
# A single input was passed, so the first decoded string is the transcription.
transcription = processor.batch_decode(predicted_ids)[0]