"""Transcribe a single audio file with a fine-tuned Wav2Vec2 (XLS-R) Khmer CTC model.

Loads the processor and model from a local checkpoint, reads one WAV file at
16 kHz, runs greedy CTC decoding, and prints the transcription.
"""

import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# ----------------------------
# CONFIG
# ----------------------------
CHECKPOINT_PATH = "./wav2vec2-xlsr-khmer-300m/checkpoint-1800"
# BUG FIX: the original used "data\wavs\00000.wav" — in a non-raw string
# "\0" is a NUL byte and "\w" is an invalid escape, so the path was corrupted.
# Forward slashes work on Windows, macOS, and Linux.
AUDIO_PATH = "data/wavs/00000.wav"  # <-- change this to your test file


def main() -> None:
    """Run end-to-end inference: load model, load audio, decode, print."""
    # ----------------------------
    # LOAD MODEL AND PROCESSOR
    # ----------------------------
    print("Loading model and processor...")
    processor = Wav2Vec2Processor.from_pretrained(CHECKPOINT_PATH)
    model = Wav2Vec2ForCTC.from_pretrained(CHECKPOINT_PATH)
    model.eval()  # disable dropout etc. for deterministic inference

    # ----------------------------
    # LOAD AUDIO
    # ----------------------------
    print("Loading audio:", AUDIO_PATH)
    # Wav2Vec2 expects 16 kHz mono input; librosa resamples as needed.
    speech, sr = librosa.load(AUDIO_PATH, sr=16000)

    inputs = processor(
        speech,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )

    # Greedy CTC decoding: argmax over the vocabulary at each frame,
    # then collapse repeats/blanks via the processor's batch_decode.
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    print("\n===============================")
    print("🔊 Transcription Result:")
    print(transcription)
    print("===============================")


if __name__ == "__main__":
    main()