don0726 committed on
Commit
751f552
·
verified ·
1 Parent(s): 7f6e67c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -38
app.py CHANGED
@@ -1,68 +1,71 @@
1
  import gradio as gr
2
  import whisperx
3
  import torch
4
- import tempfile
5
- import os
6
 
7
- # Load model once (important for speed)
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
9
  compute_type = "float16" if device == "cuda" else "int8"
10
 
11
- model = whisperx.load_model("small", device, compute_type=compute_type)
 
12
 
13
  def transcribe(audio_file, language_code):
14
  if audio_file is None:
15
  return "Please upload audio"
16
 
17
- # Save temp file
18
- temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
19
- temp_audio.write(audio_file)
20
- temp_audio.close()
21
 
22
- # Load audio
23
- audio = whisperx.load_audio(temp_audio.name)
 
 
 
 
24
 
25
- # Transcribe
26
- result = model.transcribe(audio, language=language_code)
 
 
 
27
 
28
- # Align model for word timestamps
29
- model_a, metadata = whisperx.load_align_model(
30
- language_code=result["language"], device=device
31
- )
 
 
 
 
 
32
 
33
- aligned_result = whisperx.align(
34
- result["segments"],
35
- model_a,
36
- metadata,
37
- audio,
38
- device,
39
- return_char_alignments=False
40
- )
 
41
 
42
- # Format output
43
- output = []
44
- for seg in aligned_result["segments"]:
45
- for word in seg["words"]:
46
- start = round(word["start"], 2)
47
- end = round(word["end"], 2)
48
- text = word["word"]
49
- output.append(f"[{start} - {end}] {text}")
50
 
51
- os.remove(temp_audio.name)
52
-
53
- return "\n".join(output)
54
 
55
 
56
  # Gradio UI
57
  demo = gr.Interface(
58
  fn=transcribe,
59
  inputs=[
60
- gr.Audio(type="binary", label="Upload Audio"),
61
- gr.Textbox(label="Language Code (en, hi, etc.)", value="en"),
62
  ],
63
  outputs=gr.Textbox(label="Word-level Transcription"),
64
  title="WhisperX Word-level Transcription",
65
- description="Upload audio and get word-level timestamps"
66
  )
67
 
68
  demo.launch()
 
1
  import gradio as gr
2
  import whisperx
3
  import torch
 
 
4
 
5
+ # Device setup
6
  device = "cuda" if torch.cuda.is_available() else "cpu"
7
  compute_type = "float16" if device == "cuda" else "int8"
8
 
9
+ # Load WhisperX model once
10
+ model = whisperx.load_model("base", device, compute_type=compute_type)
11
 
12
  def transcribe(audio_file, language_code):
13
  if audio_file is None:
14
  return "Please upload audio"
15
 
16
+ try:
17
+ # Load audio directly from filepath
18
+ audio = whisperx.load_audio(audio_file)
 
19
 
20
+ # Transcribe (disable VAD for stability)
21
+ result = model.transcribe(
22
+ audio,
23
+ language=language_code,
24
+ vad_filter=False
25
+ )
26
 
27
+ # Load alignment model
28
+ model_a, metadata = whisperx.load_align_model(
29
+ language_code=result["language"],
30
+ device=device
31
+ )
32
 
33
+ # Align words
34
+ aligned_result = whisperx.align(
35
+ result["segments"],
36
+ model_a,
37
+ metadata,
38
+ audio,
39
+ device,
40
+ return_char_alignments=False
41
+ )
42
 
43
+ # Format output
44
+ output_lines = []
45
+ for seg in aligned_result["segments"]:
46
+ if "words" in seg:
47
+ for word in seg["words"]:
48
+ start = round(word["start"], 2)
49
+ end = round(word["end"], 2)
50
+ text = word["word"]
51
+ output_lines.append(f"[{start} - {end}] {text}")
52
 
53
+ return "\n".join(output_lines)
 
 
 
 
 
 
 
54
 
55
+ except Exception as e:
56
+ return f"Error: {str(e)}"
 
57
 
58
 
59
  # Gradio UI
60
  demo = gr.Interface(
61
  fn=transcribe,
62
  inputs=[
63
+ gr.Audio(type="filepath", label="Upload Audio"),
64
+ gr.Textbox(label="Language Code (en, hi, hi-IN, etc.)", value="en"),
65
  ],
66
  outputs=gr.Textbox(label="Word-level Transcription"),
67
  title="WhisperX Word-level Transcription",
68
+ description="Upload audio + language code → get word timestamps"
69
  )
70
 
71
  demo.launch()