"""Gradio app: transcribe an audio file with WhisperX and list word-level timestamps."""

import logging
from functools import lru_cache

import gradio as gr
import torch
import whisperx

logger = logging.getLogger(__name__)

# Device setup: use the GPU with half precision when available, otherwise
# fall back to int8 on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"

# Load WhisperX model once at import time so every request reuses it.
model = whisperx.load_model("base", device, compute_type=compute_type)


def _normalize_language(language_code):
    """Return a bare lowercase language code, or None to request auto-detection.

    The UI label suggests region-qualified tags such as ``hi-IN``; Whisper's
    language table uses bare codes (``hi``), so only the primary subtag is
    kept. Blank or missing input yields ``None``.
    """
    if language_code is None:
        return None
    tag = str(language_code).strip().lower().replace("_", "-")
    primary = tag.partition("-")[0]
    return primary or None


@lru_cache(maxsize=2)
def _load_align_model(language):
    """Load (and cache) the alignment model and metadata for *language*.

    The cache is kept small on purpose: each entry holds a model on
    ``device``, so only the most recently used languages stay resident.
    """
    return whisperx.load_align_model(language_code=language, device=device)


def _format_word(word):
    """Format one aligned word entry as ``[start - end] text``.

    Words the aligner could not place carry no ``start``/``end`` keys; they
    are rendered with ``?`` placeholders instead of raising ``KeyError``.
    """
    text = word["word"]
    start = word.get("start")
    end = word.get("end")
    if start is None or end is None:
        return f"[? - ?] {text}"
    return f"[{round(start, 2)} - {round(end, 2)}] {text}"


def transcribe(audio_file, language_code):
    """Transcribe *audio_file* and return one ``[start - end] word`` line per word.

    Args:
        audio_file: Path to the uploaded audio file, or ``None`` when nothing
            was uploaded.
        language_code: Language hint typed by the user. Region suffixes are
            dropped; an empty value lets the model detect the language.

    Returns:
        The newline-joined word timestamps, ``"Please upload audio"`` when no
        file was given, or ``"Error: <message>"`` if any step fails.
    """
    if audio_file is None:
        return "Please upload audio"

    try:
        # Load audio directly from filepath
        audio = whisperx.load_audio(audio_file)

        # Transcribe
        result = model.transcribe(
            audio,
            language=_normalize_language(language_code),
        )

        # Alignment model for the language the transcription reports
        # (cached across requests).
        model_a, metadata = _load_align_model(result["language"])

        # Align words
        aligned_result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False,
        )

        # Format output; segments without a "words" entry contribute nothing.
        output_lines = []
        for seg in aligned_result["segments"]:
            output_lines.extend(_format_word(word) for word in seg.get("words", ()))
        return "\n".join(output_lines)
    except Exception as e:
        # UI boundary: keep the traceback in the logs and show the user a
        # short message instead of crashing the request.
        logger.exception("Transcription failed for %r", audio_file)
        return f"Error: {str(e)}"


# Gradio UI
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Textbox(label="Language Code (en, hi, hi-IN, etc.)", value="en"),
    ],
    outputs=gr.Textbox(label="Word-level Transcription"),
    title="WhisperX Word-level Transcription",
    description="Upload audio + language code → get word timestamps",
)

demo.launch()