# app.py — WhisperX word-level transcription demo (Hugging Face Space "hello", author don0726)
import gradio as gr
import whisperx
import torch
# Device setup
# Prefer GPU with fp16 compute when CUDA is available; otherwise fall back
# to CPU with int8 quantization to keep inference memory/latency reasonable.
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"
# Load WhisperX model once
# Loaded at import time so every Gradio request reuses the same "base"
# Whisper checkpoint instead of reloading it per call.
model = whisperx.load_model("base", device, compute_type=compute_type)
def transcribe(audio_file, language_code):
    """Transcribe an uploaded audio file and return word-level timestamps.

    Args:
        audio_file: Filesystem path to the uploaded audio (Gradio
            ``type="filepath"``), or ``None`` when nothing was uploaded.
        language_code: Language code such as ``"en"`` or ``"hi"``. Region
            variants like ``"hi-IN"`` (advertised in the UI label) are
            tolerated: only the base code is passed to Whisper, which
            expects bare ISO-639-1 codes.

    Returns:
        str: One ``[start - end] word`` line per aligned word, joined by
        newlines; or a human-readable message on missing input / failure.
    """
    if audio_file is None:
        return "Please upload audio"
    # Whisper rejects region-qualified codes; strip "-IN" etc. An empty
    # box falls through to None, letting Whisper auto-detect the language.
    lang = language_code.split("-")[0].strip().lower() if language_code else None
    try:
        # Load audio directly from the uploaded filepath.
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(audio, language=lang)
        # Alignment model for the (possibly auto-detected) language.
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"],
            device=device,
        )
        # Align segments to get per-word timestamps.
        aligned_result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False,
        )
        # Format output. WhisperX omits start/end for some tokens
        # (e.g. numerals, punctuation); skip those instead of raising
        # KeyError and losing the whole transcription in the except below.
        output_lines = []
        for seg in aligned_result["segments"]:
            for word in seg.get("words", []):
                if "start" not in word or "end" not in word:
                    continue
                start = round(word["start"], 2)
                end = round(word["end"], 2)
                output_lines.append(f"[{start} - {end}] {word['word']}")
        return "\n".join(output_lines)
    except Exception as e:
        # Surface the failure to the UI textbox instead of crashing the app.
        return f"Error: {str(e)}"
# Gradio UI: wire the transcribe function to an audio upload, a language
# code textbox, and a plain-text output box.
audio_input = gr.Audio(type="filepath", label="Upload Audio")
language_input = gr.Textbox(
    label="Language Code (en, hi, hi-IN, etc.)",
    value="en",
)
transcript_output = gr.Textbox(label="Word-level Transcription")

demo = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, language_input],
    outputs=transcript_output,
    title="WhisperX Word-level Transcription",
    description="Upload audio + language code → get word timestamps",
)
demo.launch()