| import gradio as gr |
| import whisperx |
| import torch |
|
|
| |
# Device / precision selection: prefer the GPU when one is present.
# float16 halves memory on CUDA; int8 keeps CPU inference tractable.
if torch.cuda.is_available():
    device = "cuda"
    compute_type = "float16"
else:
    device = "cpu"
    compute_type = "int8"

# Loaded once at import time so every request reuses the same ASR model.
model = whisperx.load_model("base", device, compute_type=compute_type)
|
|
# Alignment models are large and slow to load; cache one per language so
# repeated requests do not hit the disk every time.
_align_cache = {}


def _get_align_model(language_code):
    """Return a cached ``(model, metadata)`` alignment pair for *language_code*."""
    if language_code not in _align_cache:
        _align_cache[language_code] = whisperx.load_align_model(
            language_code=language_code,
            device=device,
        )
    return _align_cache[language_code]


def _format_word_lines(segments):
    """Flatten aligned segments into ``[start - end] word`` lines.

    WhisperX omits the ``start``/``end`` keys on words it could not align
    (e.g. numerals, punctuation-only tokens); those words are emitted
    without a timestamp instead of raising ``KeyError``.
    """
    lines = []
    for seg in segments:
        for word in seg.get("words", []):
            text = word.get("word", "")
            if "start" in word and "end" in word:
                start = round(word["start"], 2)
                end = round(word["end"], 2)
                lines.append(f"[{start} - {end}] {text}")
            else:
                # Keep unaligned words so the transcript stays complete.
                lines.append(text)
    return lines


def transcribe(audio_file, language_code):
    """Transcribe *audio_file* and return word-level timestamps as text.

    Args:
        audio_file: Filesystem path to the uploaded audio (Gradio
            ``type="filepath"``), or ``None`` when nothing was uploaded.
        language_code: ISO 639-1 language hint passed to Whisper
            (e.g. ``"en"``, ``"hi"``).

    Returns:
        One ``[start - end] word`` line per word, joined with newlines,
        or a human-readable error message (this handler is the top-level
        boundary for the Gradio UI, so it reports rather than raises).
    """
    if audio_file is None:
        return "Please upload audio"

    try:
        audio = whisperx.load_audio(audio_file)

        result = model.transcribe(audio, language=language_code)

        # Align with the language Whisper actually detected/used, which
        # may differ from the user-supplied hint.
        model_a, metadata = _get_align_model(result["language"])

        aligned_result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False,
        )

        return "\n".join(_format_word_lines(aligned_result["segments"]))

    except Exception as e:
        # Surface failures in the output textbox instead of a stack trace.
        return f"Error: {str(e)}"
|
|
|
|
| |
def _build_demo():
    """Construct the Gradio interface for word-level transcription."""
    return gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio"),
            # Whisper expects bare ISO 639-1 codes ("en", "hi"); locale
            # tags like "hi-IN" fail the alignment-model lookup, so the
            # label no longer suggests them.
            gr.Textbox(label="Language Code (en, hi, fr, etc.)", value="en"),
        ],
        outputs=gr.Textbox(label="Word-level Transcription"),
        title="WhisperX Word-level Transcription",
        description="Upload audio + language code → get word timestamps",
    )


demo = _build_demo()

# Launch only when executed as a script so the module stays importable
# (tests, embedding in a larger app) without starting a web server.
if __name__ == "__main__":
    demo.launch()