Rafii committed on
Commit
137c8da
·
verified ·
1 Parent(s): 6e7be52

Replace WhisperX with faster-whisper

Browse files
Files changed (1) hide show
  1. app.py +21 -79
app.py CHANGED
@@ -1,90 +1,33 @@
1
  import gradio as gr
2
- import whisperx
3
- import os
4
- import tempfile
5
- from dotenv import load_dotenv
6
-
7
- # Load environment variables
8
- load_dotenv()
9
- hf_token = os.getenv("hf_token")
10
 
11
  # Model config
12
  device = "cpu"
13
- batch_size = 16
14
  compute_type = "int8"
15
-
16
  # Load main model
17
- model = whisperx.load_model("large-v3", device, compute_type=compute_type)
18
 
19
  title = "🎙️ Multilingual Audio Processor"
20
- description = "Upload an audio file and select whether to transcribe, align words, or identify speakers (Powered by WhisperX)."
21
-
22
- def clean_alignment(result):
23
- cleaned_segments = []
24
- for seg in result.get("segments", []):
25
- cleaned_words = []
26
- for word in seg.get("words", []):
27
- cleaned_words.append({
28
- "word": word["word"],
29
- "start": float(word["start"]),
30
- "end": float(word["end"]),
31
- "score": float(word["score"])
32
- })
33
- cleaned_segments.append({
34
- "text": seg["text"],
35
- "start": float(seg["start"]),
36
- "end": float(seg["end"]),
37
- "words": cleaned_words
38
- })
39
- return {"segments": cleaned_segments}
40
-
41
- def process_audio(audio_path, transcribe=True, align=False, diarize=False):
42
- transcript_output = ""
43
- align_output = {}
44
- diarize_output = ""
45
-
46
- audio = whisperx.load_audio(audio_path)
47
- result = None
48
-
49
- # Step 1: Transcribe
50
- # if transcribe:
51
- result = model.transcribe(audio, batch_size=batch_size)
52
- transcript_output = " ".join(seg["text"] for seg in result["segments"])
53
-
54
- # Step 2: Align
55
- if align and result:
56
- model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
57
- result = whisperx.align(result["segments"], model_a, metadata, audio, device)
58
- align_output = clean_alignment(result)
59
-
60
- # Step 3: Diarization
61
- if diarize and result:
62
- diarize_model = whisperx.diarize.DiarizationPipeline(
63
- use_auth_token=hf_token,
64
- device=device
65
- )
66
- diarize_segments = diarize_model(audio)
67
- result = whisperx.assign_word_speakers(diarize_segments, result)
68
- diarize_output = [
69
- {
70
- "start": float(seg["start"]),
71
- "end": float(seg["end"]),
72
- "speaker": seg.get("speaker", "SPEAKER_00"),
73
- "text": seg["text"]
74
- } for seg in result["segments"]
75
- ]
76
-
77
- return transcript_output , align_output or {}, diarize_output or "No diarization."
78
-
79
- with gr.Blocks(title=title, theme=gr.themes.Default(), analytics_enabled=True) as demo:
80
  gr.Markdown(f"<h1 style='text-align: center;font-size: 40px;'>{title}</h1>")
81
  gr.Markdown(f"<p style='text-align: center; font-size: 16px;'>{description}</p>")
 
82
  with gr.Row():
83
  with gr.Column(scale=1):
84
  audio_input = gr.Audio(type="filepath", label="Upload Audio")
85
- transcribe_checkbox = gr.Markdown("✅ Transcription will always be performed.")
86
- align_checkbox = gr.Checkbox(label="Align")
87
- diarize_checkbox = gr.Checkbox(label="Diarize")
88
  gr.Markdown("### <span style='font-size: 18px;'>🎧 Try Sample Audio</span>")
89
  gr.Examples(
90
  examples=[[f"test_audios/{audio_file}"] for audio_file in os.listdir("test_audios") if audio_file.endswith(('.mp3', '.wav'))],
@@ -93,15 +36,14 @@ with gr.Blocks(title=title, theme=gr.themes.Default(), analytics_enabled=True) a
93
  )
94
  with gr.Column(scale=2):
95
  transcript_output = gr.Textbox(label="📄 Transcript", lines=10, interactive=False)
96
- alignment_output = gr.JSON(label="🧭 Word Alignment")
97
- diarization_output = gr.JSON(label="🗣️ Speaker Diarization")
98
- with gr.Row():
99
  process_button = gr.Button("Process")
100
 
 
101
  process_button.click(
102
  fn=process_audio,
103
- inputs=[audio_input, transcribe_checkbox, align_checkbox, diarize_checkbox],
104
- outputs=[transcript_output, alignment_output, diarization_output]
105
  )
106
 
107
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ from faster_whisper import WhisperModel
+ import os
 
 
 
 
 
 
 
3
 
4
  # Model config
5
  device = "cpu"
 
6
  compute_type = "int8"
 
7
  # Load main model
8
+ model = WhisperModel("large-v3", device=device, compute_type=compute_type)
9
 
10
  title = "🎙️ Multilingual Audio Processor"
11
+ description = "Upload an audio file to transcribe (Powered by faster-whisper)."
12
+
13
+ def process_audio(audio_path):
14
+
15
+ # Transcribe using faster-whisper
16
+ segments, info = model.transcribe(audio_path)
17
+
18
+ # Extract text from segments
19
+ transcript_output = " ".join([seg.text for seg in segments])
20
+ return transcript_output
21
+
22
+
23
+
24
+ with gr.Blocks(title=title, theme=gr.themes.Default(), analytics_enabled=True) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  gr.Markdown(f"<h1 style='text-align: center;font-size: 40px;'>{title}</h1>")
26
  gr.Markdown(f"<p style='text-align: center; font-size: 16px;'>{description}</p>")
27
+
28
  with gr.Row():
29
  with gr.Column(scale=1):
30
  audio_input = gr.Audio(type="filepath", label="Upload Audio")
 
 
 
31
  gr.Markdown("### <span style='font-size: 18px;'>🎧 Try Sample Audio</span>")
32
  gr.Examples(
33
  examples=[[f"test_audios/{audio_file}"] for audio_file in os.listdir("test_audios") if audio_file.endswith(('.mp3', '.wav'))],
 
36
  )
37
  with gr.Column(scale=2):
38
  transcript_output = gr.Textbox(label="📄 Transcript", lines=10, interactive=False)
39
+ with gr.Row():
 
 
40
  process_button = gr.Button("Process")
41
 
42
+ process_button.click(
43
  process_button.click(
44
  fn=process_audio,
45
+ inputs=[audio_input],
46
+ outputs=[transcript_output]
47
  )
48
 
49
  if __name__ == "__main__":