palli23 committed on
Commit
c675e00
·
verified ·
1 Parent(s): 97c7e9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -86
app.py CHANGED
@@ -1,6 +1,3 @@
1
- # app.py — Íslensk talgreining with WhisperX Diarization & Timestamps
2
- # Public, no login, pallinr1@protonmail.com
3
-
4
  import os
5
  os.environ["OMP_NUM_THREADS"] = "1"
6
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
@@ -8,98 +5,106 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
8
  import gradio as gr
9
  import spaces
10
  import whisperx
11
- import torch
12
- from transformers import pipeline
13
-
14
- # Keep Space awake
15
- import threading, time, requests
16
- def keep_awake():
17
- while True:
18
- time.sleep(45*60)
19
- try:
20
- requests.get(f"https://{os.getenv('SPACE_HOST')}")
21
- except: pass
22
- threading.Thread(target=keep_awake, daemon=True).start()
23
-
24
- # Load your Whisper-small
25
- asr = pipeline(
26
- "automatic-speech-recognition",
27
- model="palli23/whisper-small-sam_spjall",
28
- torch_dtype="float16",
29
- device=0,
30
- chunk_length_s=30,
31
- batch_size=8,
32
- )
33
 
34
- # WhisperX setup (diarization + timestamps)
35
- device = "cuda" if torch.cuda.is_available() else "cpu"
36
- batch_size = 16
37
- compute_type = "float16"
 
38
 
39
- # Load WhisperX model (your Whisper-small)
40
- model = whisperx.load_model("palli23/whisper-small-sam_spjall", device, compute_type=compute_type)
 
 
 
 
41
 
42
- # Load diarization model (pyannote internal to WhisperX)
43
- diarize_model = whisperx.DiarizationPipeline(
44
- use_auth_token=True,
45
- device=device,
46
- min_speakers=1,
47
- max_speakers=6,
48
- )
49
 
50
- # Load alignment model (for timestamps)
51
- align_model, metadata = whisperx.load_align_model(language_code="is", device=device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- def transcribe_with_whisperx(audio_path, use_diarization=True):
54
  if not audio_path:
55
  return "Hladdu upp hljóðskrá"
56
-
57
  # Load audio
58
  audio = whisperx.load_audio(audio_path)
59
-
60
- # Transcribe with your model
61
- result = model.transcribe(audio, batch_size=batch_size)
62
-
63
- # Align for word-level timestamps
64
- result = whisperx.align(
65
- result["segments"], align_model, metadata, audio, device, return_char_alignments=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  )
67
-
68
- if not use_diarization:
69
- # Return with timestamps (no speakers)
70
- lines = []
71
- for segment in result["segments"]:
72
- start = segment["start"]
73
- end = segment["end"]
74
- text = segment["text"]
75
- lines.append(f"{start:.1f}s – {end:.1f}s: {text}")
76
- return "\n".join(lines)
77
-
78
- # Diarization
79
- diarize_segments = diarize_model(audio)
80
- result = whisperx.assign_word_speakers(diarize_segments, result)
81
-
82
- # Return with speakers + timestamps
83
- lines = []
84
- for segment in result["segments"]:
85
- speaker = segment.get("speaker", "Unknown")
86
- start = segment["start"]
87
- end = segment["end"]
88
- text = segment["text"]
89
- lines.append(f"[{speaker}] {start:.1f}s – {end:.1f}s: {text}")
90
- return "\n".join(lines)
91
-
92
- # UI — public, no login, your email
93
- with gr.Blocks(title="Íslensk talgreining + WhisperX") as demo:
94
- gr.Markdown("# Íslensk talgreining + WhisperX")
95
- gr.Markdown("**Whisper-small + diarization + timestamps • pallinr1@protonmail.com**")
96
-
97
- audio = gr.Audio(type="filepath", label="Hladdu upp hljóð (max 15 mín)")
98
- diarize = gr.Checkbox(label="Virkja diarization + timestamps", value=True)
99
-
100
  btn = gr.Button("Transcribe", variant="primary")
101
- out = gr.Textbox(lines=25, label="Útskrift", show_copy_button=True)
102
-
103
- btn.click(transcribe_with_whisperx, inputs=[audio, diarize], outputs=out)
104
 
105
- demo.launch(auth=None, share=True)
 
 
 
 
 
 
 
 
 
1
  import os
2
  os.environ["OMP_NUM_THREADS"] = "1"
3
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
 
5
  import gradio as gr
6
  import spaces
7
  import whisperx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ # -----------------------------
10
+ # MODEL SETTINGS
11
+ # -----------------------------
12
+ MODEL_NAME = "palli23/whisper-small-sam_spjall"
13
+ HF_TOKEN = os.getenv("HF_TOKEN")
14
 
15
+ # -----------------------------
16
+ # LOAD MODELS ONCE (GPU)
17
+ # -----------------------------
18
+ @spaces.GPU(duration=180)
19
+ def load_all_models():
20
+ device = "cuda"
21
 
22
+ # 1. Whisper-small model
23
+ asr_model = whisperx.load_model(
24
+ MODEL_NAME,
25
+ device=device,
26
+ compute_type="float16"
27
+ )
 
28
 
29
+ # 2. Alignment model
30
+ align_model, metadata = whisperx.load_align_model(
31
+ language_code="is",
32
+ device=device
33
+ )
34
+
35
+ # 3. Diarization model (pyannote)
36
+ diar_model = whisperx.DiarizationPipeline(
37
+ model_name="pyannote/speaker-diarization-3.1",
38
+ device=device,
39
+ use_auth_token=HF_TOKEN
40
+ )
41
+
42
+ return asr_model, align_model, metadata, diar_model
43
+
44
+ asr_model, align_model, align_metadata, diar_model = load_all_models()
45
+
46
+
47
+ # -----------------------------
48
+ # TRANSCRIPTION + DIARIZATION
49
+ # -----------------------------
50
+ def transcribe_is_with_diar(audio_path):
51
 
 
52
  if not audio_path:
53
  return "Hladdu upp hljóðskrá"
54
+
55
  # Load audio
56
  audio = whisperx.load_audio(audio_path)
57
+
58
+ # --- 1. ASR with Whisper-small
59
+ asr_result = asr_model.transcribe(
60
+ audio,
61
+ batch_size=8
62
+ )
63
+
64
+ # --- 2. Alignment (word timestamps)
65
+ aligned = whisperx.align(
66
+ asr_result["segments"],
67
+ align_model,
68
+ align_metadata,
69
+ audio,
70
+ device="cuda"
71
+ )
72
+
73
+ # --- 3. Diarization
74
+ diarization = diar_model(audio)
75
+
76
+ # --- 4. Merge diarization + words
77
+ final = whisperx.assign_word_speakers(diarization, aligned)
78
+
79
+ # Format output text
80
+ output_lines = []
81
+ for seg in final["segments"]:
82
+ speaker = seg.get("speaker", "SPEAKER_00")
83
+ text = seg.get("text", "")
84
+ output_lines.append(f"[{speaker}] {text}")
85
+
86
+ return "\n".join(output_lines)
87
+
88
+
89
+ # -----------------------------
90
+ # BUILD GRADIO UI
91
+ # -----------------------------
92
+ with gr.Blocks() as demo:
93
+ gr.Markdown("# 🇮🇸 Íslenskt ASR + Raddgreining (Diarization)")
94
+ gr.Markdown("**Whisper-small + WhisperX** — Hljóð allt að 5 mínútur")
95
+
96
+ audio_in = gr.Audio(
97
+ type="filepath",
98
+ label="Hladdu upp hljóði (.mp3 / .wav)"
99
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  btn = gr.Button("Transcribe", variant="primary")
101
+ output = gr.Textbox(lines=30, label="Útskrift með raddgreiningu")
102
+
103
+ btn.click(fn=transcribe_is_with_diar, inputs=audio_in, outputs=output)
104
 
105
+ demo.launch(
106
+ auth=None,
107
+ share=True,
108
+ server_name="0.0.0.0",
109
+ server_port=7860
110
+ )