from transformers.models.wav2vec2 import Wav2Vec2Processor, Wav2Vec2ForCTC  # pip install transformers
import torch
import torchaudio

# Sampling rate (Hz) the audio is converted to and that is declared to the
# processor below; kept in one place so the two can never disagree.
TARGET_SAMPLING_RATE = 16000

model_name = "sarahai/uzbek-stt-3"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def load_and_preprocess_audio(file_path, target_sampling_rate=TARGET_SAMPLING_RATE):
    """Load an audio file and return it as a mono 1-D NumPy waveform.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sampling_rate: Rate in Hz to resample to when the file's own
            rate differs. Defaults to ``TARGET_SAMPLING_RATE`` (16000).

    Returns:
        A 1-D ``numpy.ndarray`` holding the (possibly resampled) waveform.
    """
    speech_array, sampling_rate = torchaudio.load(file_path)

    # torchaudio.load yields a (channels, frames) tensor. Average the channels
    # so stereo/multi-channel files become a single waveform instead of being
    # passed on as a 2-D array.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)

    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sampling_rate, new_freq=target_sampling_rate
        )
        speech_array = resampler(speech_array)

    # Drop only the (now size-1) channel axis; a bare squeeze() would also
    # collapse a one-sample waveform to a 0-d array.
    if speech_array.dim() > 1:
        speech_array = speech_array.squeeze(0)
    return speech_array.numpy()


def replace_unk(transcription):
    """Return *transcription* with every ``[UNK]`` token replaced by U+02BC.

    The previous replacement text was the two Greek letters produced when the
    UTF-8 bytes of U+02BC (MODIFIER LETTER APOSTROPHE) are mis-decoded as
    ISO-8859-7; the escape below restores the intended single character and
    keeps it safe from further encoding damage.
    """
    return transcription.replace("[UNK]", "\u02bc")


audio_file = "/content/audio_2024-08-13_15-20-53.ogg"
speech_array = load_and_preprocess_audio(audio_file)

input_values = processor(
    speech_array, sampling_rate=TARGET_SAMPLING_RATE, return_tensors="pt"
).input_values.to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy decoding: take the highest-scoring token id at each position.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

transcription_text = replace_unk(transcription[0])
print("Transcription:", transcription_text)