| import torch |
| import torchaudio |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration |
|
|
| |
# Run inference on CPU; whisper-small fits comfortably in system RAM.
device = torch.device("cpu")

# FIX: the original called torch.cuda.empty_cache() unconditionally even
# though everything below is pinned to CPU — only touch the CUDA allocator
# when a CUDA device actually exists.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the processor (feature extractor + tokenizer) and model once at module
# import, so repeated transcribe_audio() calls reuse the same instances.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
|
|
def transcribe_audio(audio_path):
    """Transcribe a short (<= 30 s) audio file to Uzbek text with Whisper.

    Parameters
    ----------
    audio_path : str
        Path to an audio file in any format torchaudio can load.

    Returns
    -------
    str
        The decoded transcription, stripped of surrounding whitespace.
    """
    # Load the audio and resample to the 16 kHz rate Whisper expects.
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # Downmix multi-channel audio to mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The processor call only computes log-mel input features; it ignores a
    # `language` kwarg, so language selection must happen in generate() below.
    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    # BUG FIX: the original passed language="uz" to the processor, where it
    # was silently discarded and Whisper auto-detected the language instead.
    # Forcing the decoder prompt here actually pins it to Uzbek transcription.
    with torch.no_grad():
        predicted_ids = model.generate(
            input_features, language="uz", task="transcribe"
        )

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription.strip()
|
|
| |
if __name__ == "__main__":
    # Demo entry point: transcribe one short sample clip and print the result.
    sample_clip = "some_audio_max_30_sec.wav"
    print("Transcribing on CPU, please wait...")
    result = transcribe_audio(sample_clip)
    print(f"Transcription:\n{result}")
|
|