"""Transcribe a single audio file with a wav2vec2 CTC model.

The script loads the model named by ``MODEL_ID``, runs it on ``AUDIO_PATH``,
and writes the greedy-decoded transcription to ``OUT_TXT``.
"""

import os

import librosa
import torch
from transformers import AutoModelForCTC, AutoProcessor

# Arabic wav2vec2 CTC model (CPU friendly but heavy)
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
AUDIO_PATH = "sample_trim.wav"
OUT_TXT = os.path.join("output", "asr_raw.txt")

# Sampling rate the audio is resampled to before being handed to the processor.
_TARGET_SR = 16000


def main() -> None:
    """Run speech recognition on ``AUDIO_PATH`` and save the raw text.

    Steps: create the output directory, load the processor and model, load the
    audio as 16 kHz mono, run a single forward pass without gradients, decode
    the per-frame argmax ids, then write the text to ``OUT_TXT`` and print it.

    Raises:
        FileNotFoundError: if ``AUDIO_PATH`` does not exist.
        ValueError: if the audio file contains no samples.
    """
    # Fail before the (slow) model load when the input is simply missing.
    if not os.path.isfile(AUDIO_PATH):
        raise FileNotFoundError(f"Audio file not found: {AUDIO_PATH}")

    # Derive the directory from OUT_TXT so the two can never drift apart.
    os.makedirs(os.path.dirname(OUT_TXT) or ".", exist_ok=True)

    print("Loading:", MODEL_ID)
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForCTC.from_pretrained(MODEL_ID)
    model.eval()

    audio, sr = librosa.load(AUDIO_PATH, sr=_TARGET_SR, mono=True)
    if len(audio) == 0:
        raise ValueError(f"Audio file contains no samples: {AUDIO_PATH}")
    print("Audio sec:", round(len(audio) / sr, 2))

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: take the highest-scoring token id at every frame.
    pred_ids = torch.argmax(logits, dim=-1)
    text = processor.batch_decode(pred_ids)[0].strip()

    # Save to file for downstream steps
    with open(OUT_TXT, "w", encoding="utf-8") as f:
        f.write(text + "\n")

    print("\n--- RAW TRANSCRIPTION ---")
    print(text)
    print(f"\nOK ✅ wrote {OUT_TXT}")


if __name__ == "__main__":
    main()