import gradio as gr
import whisperx
import torch
# Choose where inference runs and the matching compute type:
# "float16" when a CUDA device is available, "int8" otherwise.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"

# Build the WhisperX "base" model a single time at import, so it is not
# reloaded for every transcription request.
model = whisperx.load_model("base", device, compute_type=compute_type)
def _normalize_language(language_code):
    """Reduce a user-typed language tag to the bare code given to WhisperX.

    Surrounding whitespace is stripped, the text is lower-cased and any
    region suffix is dropped, so "hi-IN" or "en_US" become "hi" / "en".
    Blank or missing input yields ``None`` so the transcription call is
    never handed an empty language string (with ``None`` the model is left
    to detect the language itself).
    """
    cleaned = (language_code or "").strip().lower().replace("_", "-")
    primary = cleaned.split("-", 1)[0]
    return primary or None


def _format_timestamp(value):
    """Render a timestamp in seconds rounded to 2 decimals, or "?" if absent."""
    return "?" if value is None else str(round(value, 2))


def _format_words(segments):
    """Return one "[start - end] word" line per word found in *segments*."""
    lines = []
    for segment in segments:
        for word in segment.get("words") or ():
            # Words the aligner could not place (e.g. digits or symbols
            # outside the alignment model's vocabulary) come back without
            # "start"/"end" keys; keep the word and mark the timing unknown
            # instead of failing the whole request with a KeyError.
            start = _format_timestamp(word.get("start"))
            end = _format_timestamp(word.get("end"))
            lines.append(f"[{start} - {end}] {word['word']}")
    return "\n".join(lines)


def transcribe(audio_file, language_code):
    """Transcribe an audio file and return word-level timestamps as text.

    Args:
        audio_file: Path of the uploaded audio file; ``None`` or an empty
            value means nothing was uploaded.
        language_code: Language tag typed by the user (e.g. "en", "hi",
            "hi-IN"). Region suffixes are ignored; a blank value lets the
            model detect the language.

    Returns:
        A newline-separated string with one "[start - end] word" line per
        word, "Please upload audio" when no file was given, or
        "Error: <message>" if any processing step raised.
    """
    if not audio_file:
        return "Please upload audio"
    try:
        # Load audio directly from filepath
        audio = whisperx.load_audio(audio_file)

        # Transcribe with the module-level model
        result = model.transcribe(
            audio,
            language=_normalize_language(language_code),
        )

        # Load the alignment model for the language reported by transcription
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"],
            device=device,
        )

        # Align words to the audio to obtain per-word timings
        aligned_result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False,
        )

        return _format_words(aligned_result["segments"])
    except Exception as e:
        # UI boundary: show the failure in the output box rather than
        # letting the request crash.
        return f"Error: {str(e)}"
# Gradio UI: an audio upload and a language-code textbox are passed to
# `transcribe`; the string it returns is shown in a single output textbox.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        # type="filepath" makes Gradio pass the upload as a path on disk.
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Textbox(label="Language Code (en, hi, hi-IN, etc.)", value="en"),
    ],
    outputs=gr.Textbox(label="Word-level Transcription"),
    title="WhisperX Word-level Transcription",
    description="Upload audio + language code → get word timestamps",
)

# Start serving the interface when the script is executed.
demo.launch()