import whisper
import spaces

# Cache loaded Whisper models keyed by size so repeated requests reuse the
# in-memory model instead of reloading weights from disk each time.
_model_cache = {}


def list_available_whisper_models():
    """Return the list of Whisper model sizes this app supports."""
    return ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]


@spaces.GPU
def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
    """
    Transcribe the given audio file using OpenAI Whisper and return the
    result dictionary, including per-word timestamps.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_size: Whisper model size to use — one of the values returned
            by ``list_available_whisper_models()``. Falsy values fall back
            to "medium.en".

    Returns:
        Whisper result dict with "text" and "segments"; each segment may
        include a "words" list carrying word-level timestamps.
    """
    model_size = model_size or "medium.en"

    # Load the model lazily and cache it for subsequent requests.
    if model_size not in _model_cache:
        print(f"Loading Whisper model: {model_size}...")
        _model_cache[model_size] = whisper.load_model(model_size)
    model = _model_cache[model_size]

    # English-only checkpoints (".en" suffix) only accept English, so pin
    # language="en" for them. For multilingual checkpoints ("large",
    # "large-v2", ...) pass None so Whisper auto-detects the spoken
    # language — previously "en" was forced for every model size, which
    # mistranscribed non-English audio on multilingual checkpoints.
    language = "en" if model_size.endswith(".en") else None

    # Perform transcription with word-level timestamps enabled.
    result = model.transcribe(
        audio_path,
        word_timestamps=True,
        verbose=False,
        task="transcribe",
        language=language,
    )
    return result