don0726 committed on
Commit
7e4df1a
·
verified ·
1 Parent(s): 6ba0047

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import whisperx
3
+ import torch
4
+ import tempfile
5
+ import os
6
+
7
# Pick the inference device and a matching compute precision once, at import
# time, then load the WhisperX model a single time; transcribe() reuses this
# module-level `model` on every call instead of reloading it.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"

model = whisperx.load_model("base", device, compute_type=compute_type)
12
+
13
# Most recently used alignment model, keyed by language code. At most one
# entry is kept, so memory use never exceeds a single alignment model while
# back-to-back requests in the same language skip the reload.
_align_cache = {}


def _get_align_model(language):
    """Return ``(align_model, metadata)`` for *language*, reusing the last load."""
    if language not in _align_cache:
        loaded = whisperx.load_align_model(language_code=language, device=device)
        _align_cache.clear()
        _align_cache[language] = loaded
    return _align_cache[language]


def _format_words(segments):
    """Render aligned segments as one ``[start - end] word`` line per word."""
    lines = []
    for seg in segments:
        for word in seg.get("words", ()):
            start, end = word.get("start"), word.get("end")
            if start is None or end is None:
                # WhisperX can omit timings for words it could not align
                # (e.g. numerals); keep the word instead of raising KeyError.
                lines.append(f"[? - ?] {word['word']}")
            else:
                lines.append(f"[{round(start, 2)} - {round(end, 2)}] {word['word']}")
    return "\n".join(lines)


def transcribe(audio_file, language_code):
    """Transcribe audio and return word-level timestamps as text.

    Args:
        audio_file: Path to an audio file (what ``gr.Audio(type="filepath")``
            supplies), raw audio bytes, or ``None`` when nothing was uploaded.
        language_code: Language code such as ``"en"``. Surrounding whitespace
            and case are normalised; a blank value is forwarded as ``None``
            (presumably letting WhisperX detect the language -- confirm
            against the installed version).

    Returns:
        A newline-joined string with one ``[start - end] word`` line per
        word, ``"Please upload audio"`` when *audio_file* is ``None``, or an
        explanatory message when no alignment model could be loaded.
    """
    if audio_file is None:
        return "Please upload audio"

    language = (language_code or "").strip().lower() or None

    # Raw bytes are spooled to a temporary file because whisperx.load_audio
    # is given a file name; a path is used directly. The temporary file is
    # removed as soon as decoding finishes, even if decoding fails.
    temp_path = None
    try:
        if isinstance(audio_file, (bytes, bytearray)):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                temp_path = tmp.name
                tmp.write(audio_file)
            audio_path = temp_path
        else:
            audio_path = os.fspath(audio_file)
        audio = whisperx.load_audio(audio_path)
    finally:
        if temp_path is not None:
            os.remove(temp_path)

    # Transcribe
    result = model.transcribe(audio, language=language)

    # Align for word timestamps, using the language the model reports.
    detected = result["language"]
    try:
        model_a, metadata = _get_align_model(detected)
    except ValueError as exc:
        # NOTE(review): load_align_model appears to raise ValueError when it
        # has no alignment model for the language -- confirm for this version.
        return f"Could not load an alignment model for language '{detected}': {exc}"

    aligned_result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )

    return _format_words(aligned_result["segments"])


# Gradio UI
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        # "filepath" hands transcribe() a path on disk; gr.Audio does not
        # offer a "binary" type.
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Textbox(label="Language Code (en, hi, etc.)", value="en"),
    ],
    outputs=gr.Textbox(label="Word-level Transcription"),
    title="WhisperX Word-level Transcription",
    description="Upload audio and get word-level timestamps",
)

# Launch only when run as a script, so importing this module (e.g. to reuse
# `demo` or `transcribe`) does not start a server.
if __name__ == "__main__":
    demo.launch()