palli23 committed on
Commit
e4aa950
·
verified ·
1 Parent(s): 2b95269

Update app.py

Browse files

Whisperx + diarization test

Files changed (1) hide show
  1. app.py +86 -56
app.py CHANGED
@@ -1,70 +1,100 @@
1
- # app.py — Íslenskt ASR 3 mínútur (public, no login, with contact)
 
 
2
  import os
3
  os.environ["OMP_NUM_THREADS"] = "1"
4
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
5
 
6
  import gradio as gr
7
  import spaces
 
8
  from transformers import pipeline
 
9
 
10
# ------------------------------------------------------------------
# ASR model: built a single time at import and shared by all requests
# ------------------------------------------------------------------
MODEL_NAME = "palli23/whisper-small-sam_spjall"


@spaces.GPU(duration=180)
def get_pipe():
    """Build the speech-recognition pipeline for ``MODEL_NAME``.

    The pipeline is created with float16 weights on device index 0, and
    the Hugging Face token is read from the ``HF_TOKEN`` environment
    variable (``None`` when it is unset).

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    hf_token = os.getenv("HF_TOKEN")
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        torch_dtype="float16",
        device=0,
        token=hf_token,
    )
    return asr_pipeline


# Module-level instance used by the transcription handler.
pipe = get_pipe()
 
 
 
26
 
27
- # ——————————————————————————————
28
- # Transcription function
29
- # ——————————————————————————————
30
def transcribe_3min(audio_path):
    """Transcribe an uploaded audio file with the module-level ``pipe``.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (``None`` or ``""``) means nothing was uploaded.

    Returns:
        The ``"text"`` entry of the pipeline result, or an Icelandic
        prompt asking the user to upload a file when no path was given.
    """
    if audio_path:
        # Chunked decoding settings passed straight through to the pipeline.
        decode_options = {
            "chunk_length_s": 30,
            "stride_length_s": (6, 0),
            "batch_size": 8,
            "return_timestamps": False,
        }
        return pipe(audio_path, **decode_options)["text"]

    return "Hladdu upp hljóðskrá"
42
 
43
- # ——————————————————————————————
44
- # UI — only added your email, nothing else changed
45
- # ——————————————————————————————
46
# No `theme=` argument is passed to Blocks: supplying one raised an error here.
with gr.Blocks() as demo:
    # Page header: title, model blurb and contact address.
    for heading in (
        "# Íslenskt ASR – 3 mínútur",
        "**Whisper small· mjög lágur WER á prófunarupptökum · allt að 5 mín hljóð**",
        "**Hafa samband:** pallinr1@protonmail.com",
    ):
        gr.Markdown(heading)

    audio_in = gr.Audio(
        label="Hladdu upp .mp3 / .wav (max 5 mín)",
        type="filepath",
    )
    btn = gr.Button("Transcribe", variant="primary", size="lg")
    output = gr.Textbox(label="Útskrift", lines=30)

    # The button sends the uploaded file path to the transcription handler.
    btn.click(fn=transcribe_3min, inputs=audio_in, outputs=output)

# Public launch: no authentication, share link requested, all interfaces.
launch_options = {
    "auth": None,
    "share": True,
    "server_name": "0.0.0.0",
    "server_port": 7860,
    "show_error": True,
    "quiet": False,
}
demo.launch(**launch_options)
 
1
+ # app.py — Whisper-small + WhisperX Diarization + Timestamps
2
+ # Public, no login, contact email
3
+
4
  import os
5
  os.environ["OMP_NUM_THREADS"] = "1"
 
6
 
7
  import gradio as gr
8
  import spaces
9
+ import whisperx
10
  from transformers import pipeline
11
+ import torch
12
 
13
+ # Keep Space awake
14
+ import threading, time, requests
15
def keep_awake():
    """Ping the Space's own URL every 45 minutes to keep it awake.

    The host name is read from the ``SPACE_HOST`` environment variable.
    When it is unset there is nothing to ping and the function returns
    immediately. Otherwise it loops forever and is meant to be run as
    the target of a daemon thread.
    """
    host = os.getenv("SPACE_HOST")
    if not host:
        # Without a host the URL would be the meaningless "https://None".
        return

    url = f"https://{host}"
    while True:
        time.sleep(45 * 60)
        try:
            # A timeout keeps a stalled connection from blocking the loop forever.
            requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Best effort: a failed ping must not end the keep-alive loop,
            # but it should at least be visible in the logs.
            print(f"keep_awake: ping to {url} failed: {exc}")


threading.Thread(target=keep_awake, daemon=True).start()
22
 
23
# Load the fine-tuned Whisper-small checkpoint.
# NOTE(review): nothing in the visible file calls `asr`; transcription goes
# through the WhisperX model instead. Confirm whether this pipeline is still
# needed, since loading it costs start-up time and memory.
_asr_on_gpu = torch.cuda.is_available()
asr = pipeline(
    "automatic-speech-recognition",
    model="palli23/whisper-small-sam_spjall",
    # Half precision and device index 0 only make sense when CUDA is present;
    # otherwise run on CPU (-1) in float32 so start-up does not crash.
    torch_dtype="float16" if _asr_on_gpu else "float32",
    device=0 if _asr_on_gpu else -1,
    chunk_length_s=30,
    batch_size=8,
)
32
 
33
# WhisperX runtime settings.
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16
# float16 inference requires a GPU; on the CPU fallback use int8 so that
# loading the model does not fail.
compute_type = "float16" if device == "cuda" else "int8"

# Load the WhisperX "base" transcription model.
model = whisperx.load_model("base", device, compute_type=compute_type)
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
# Load the speaker-diarization model.
# The constructor takes only the model name, auth token and device.
# Speaker-count bounds (min_speakers / max_speakers) are accepted by the
# pipeline *call*, not by the constructor, so they are not passed here.
# The token comes from the HF_TOKEN environment variable when set; otherwise
# `True` falls back to the locally cached Hugging Face login.
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=os.getenv("HF_TOKEN") or True,
    device=device,
)
48
 
49
# Speaker-count bounds handed to the diarization call.
_MIN_SPEAKERS = 2
_MAX_SPEAKERS = 5

# Alignment models keyed by language code, so each one is loaded at most once
# per process instead of once per request.
_ALIGN_MODELS = {}


def _get_align_model(language_code):
    """Return the cached ``(model, metadata)`` alignment pair for a language."""
    if language_code not in _ALIGN_MODELS:
        _ALIGN_MODELS[language_code] = whisperx.load_align_model(
            language_code=language_code, device=device
        )
    return _ALIGN_MODELS[language_code]


def _format_segments(segments, with_speaker):
    """Render each segment as a ``start – end: text`` line, optionally speaker-tagged."""
    lines = []
    for segment in segments:
        line = f"{segment['start']:.1f}s – {segment['end']:.1f}s: {segment['text']}"
        if with_speaker:
            line = f"[{segment.get('speaker', 'Unknown')}] {line}"
        lines.append(line)
    return "\n".join(lines)


def transcribe_with_whisperx(audio_path, use_diarization=False):
    """Transcribe an audio file with WhisperX and return timestamped text.

    Args:
        audio_path: Path of the audio file. A falsy value (``None`` or
            ``""``) means nothing was uploaded.
        use_diarization: When true, run speaker diarization and prefix
            every line with the speaker label assigned to its segment
            (``"Unknown"`` when a segment has none).

    Returns:
        One line per segment in the form ``"<start>s – <end>s: <text>"``
        (with a leading ``"[<speaker>] "`` when diarization is enabled),
        joined by newlines; or an Icelandic prompt asking for an upload
        when ``audio_path`` is empty.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    audio = whisperx.load_audio(audio_path)
    result = model.transcribe(audio, batch_size=batch_size)

    try:
        align_model, align_metadata = _get_align_model(result["language"])
    except ValueError:
        # NOTE(review): load_align_model is expected to raise ValueError when
        # it has no default alignment model for the detected language --
        # confirm this covers Icelandic. In that case keep the segment-level
        # timestamps from the transcription instead of failing the request.
        pass
    else:
        result = whisperx.align(
            result["segments"],
            align_model,
            align_metadata,
            audio,
            device,
            return_char_alignments=False,
        )

    if not use_diarization:
        return _format_segments(result["segments"], with_speaker=False)

    # Speaker-count bounds are arguments of the diarization call itself.
    diarize_segments = diarize_model(
        audio, min_speakers=_MIN_SPEAKERS, max_speakers=_MAX_SPEAKERS
    )
    result = whisperx.assign_word_speakers(diarize_segments, result)
    return _format_segments(result["segments"], with_speaker=True)
86
 
87
# Gradio front end: one audio upload, a diarization toggle and a text output.
with gr.Blocks(title="Íslensk talgreining + WhisperX") as demo:
    gr.Markdown("# Íslensk talgreining + WhisperX")
    gr.Markdown("**Whisper-small + diarization + timestamps • pallinr1@protonmail.com**")

    audio_input = gr.Audio(
        label="Hladdu upp hljóð (max 15 mín)",
        type="filepath",
    )
    diarize_toggle = gr.Checkbox(
        value=True,
        label="Virkja diarization (speakers + timestamps)",
    )

    run_button = gr.Button("Transcribe", variant="primary")
    transcript_box = gr.Textbox(label="Útskrift", lines=25)

    # The handler receives the file path and the checkbox state, in that order.
    run_button.click(
        fn=transcribe_with_whisperx,
        inputs=[audio_input, diarize_toggle],
        outputs=transcript_box,
    )

# Public launch: no authentication, share link requested.
demo.launch(share=True, auth=None)