| import copy |
| import time |
|
|
| import gradio as gr |
| import numpy as np |
| import torch |
| import torchaudio |
| from loguru import logger |
| from transformers import pipeline |
|
|
|
|
|
|
| transcriber = None |
|
|
|
|
| def load_stt(): |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| transcriber = pipeline( |
| "automatic-speech-recognition", model="openai/whisper-base.en", device=device |
| ) |
| return transcriber |
|
|
|
|
|
|
| def save_audio_as_wav(data, sample_rate, file_path): |
| |
| data = torch.tensor(data).reshape(1, -1) |
| torchaudio.save( |
| file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding="PCM_S" |
| ) |
|
|
|
|
| def transcribe_audio(audio): |
| global transcriber |
| if transcriber is None: |
| transcriber = load_stt() |
|
|
| sample_rate, data = audio |
| try: |
| data = data.astype(np.float32) |
| data /= np.max(np.abs(data)) |
| text = transcriber({"sampling_rate": sample_rate, "raw": data})["text"] |
| gr.Info(f"Transcribed text is: {text}\nProcessing the input...") |
|
|
| except Exception as e: |
| logger.error(f"Error: {e}") |
| raise Exception("Error transcribing audio.") |
| return text |
|
|
|
|
| def save_and_transcribe_audio(audio, save=True): |
| sample_rate, data = audio |
| |
| filename = f"recordings/audio{time.time()}.wav" |
| if save: |
| save_audio_as_wav(data, sample_rate, filename) |
| return transcribe_audio(audio) |
|
|