palli23 committed on
Commit
d89e139
·
1 Parent(s): 48ac366
Files changed (2) hide show
  1. app.py +38 -29
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,31 +1,29 @@
1
- # app.py – ZeroGPU SAFE 3 mín hljóð án "GPU task aborted"
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
- import numpy as np
7
- import librosa
8
 
9
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
10
 
11
- @spaces.GPU(duration=60) # MEST 60 sek ZeroGPU leyfir
12
- def transcribe_safe(audio_path):
 
13
  if not audio_path:
14
  return "Hladdu upp hljóðskrá"
15
 
16
- # Hlaða hljóð og klippa í 20 sek chunkar (mjög öruggt)
17
- audio, sr = librosa.load(audio_path, sr=16000)
18
- chunk_len = 16000 * 20 # 20 sek
19
- stride = 16000 * 2 # 2 sek overlap
20
- chunks = []
21
- for i in range(0, len(audio), chunk_len - stride):
22
- chunk = audio[i:i + chunk_len]
23
- if len(chunk) < 16000: # undir 1 sek → hætta
24
- break
25
- chunks.append(chunk)
26
 
27
- # Hlaða ASR á GPU (cached)
28
- pipe = pipeline(
 
 
29
  "automatic-speech-recognition",
30
  model=MODEL_NAME,
31
  device=0,
@@ -33,21 +31,32 @@ def transcribe_safe(audio_path):
33
  )
34
 
35
  full_text = ""
36
- for idx, chunk in enumerate(chunks):
37
- result = pipe(chunk, batch_size=8)
38
- full_text += result["text"] + " "
 
 
 
 
 
 
 
 
 
 
39
 
40
- return full_text.strip() or "Ekkert heyrt"
41
 
42
- # Gradio – fallegt og tilbúið fyrir 3 mín
43
- with gr.Blocks(title="Íslenskt ASR 3 mín ZeroGPU") as demo:
44
- gr.Markdown("# Íslenskt ASR 3 mín hljóð")
45
- gr.Markdown("**~4 % WER · 25–45 sek · ZeroGPU (PRO)**")
 
46
 
47
- audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
48
- btn = gr.Button("Transcribe (2545 sek)", variant="primary", size="lg")
49
- out = gr.Textbox(lines=30, label="Útskrift")
50
 
51
- btn.click(transcribe_safe, inputs=audio, outputs=out)
52
 
53
  demo.launch(auth=("beta", "beta2025"))
 
1
+ # app.py – Whisper-small + Mælendagreining (pyannote 3.1) ZeroGPU
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
+ from pyannote.audio import Pipeline
7
+ import tempfile
8
 
9
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
10
 
11
+ # Mælendagreining BESTA módel 2025 (þú hefur samþykkt license)
12
@spaces.GPU(duration=120)  # request a 120 s ZeroGPU slot per call
def transcribe_with_diarization(audio_path):
    """Transcribe an audio file and tag every passage with its speaker.

    The file is first split into speaker turns by the pyannote diarization
    pipeline; each turn is then cut out of the in-memory waveform and passed
    to the Whisper ASR pipeline.

    Args:
        audio_path: Path to the uploaded audio file. Falsy (``None`` or an
            empty string) when nothing was uploaded.

    Returns:
        A string with one line per speaker turn, each formatted as
        ``[MÆLENDI <speaker>] <text>`` and terminated by a newline. When no
        file was given, or nothing was transcribed, an Icelandic placeholder
        message is returned instead.

    Raises:
        RuntimeError: If the diarization pipeline could not be loaded
            (``Pipeline.from_pretrained`` returned ``None``).
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    # Function-scope imports keep this block self-contained: the file's
    # top-level imports bring in neither torch nor pyannote's Audio helper.
    import torch
    from pyannote.audio import Audio

    # 1. Speaker diarization (pyannote).
    diarization = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.getenv("HF_TOKEN"),
    )
    if diarization is None:
        # from_pretrained reports a missing/unauthorised gated model by
        # returning None; fail with a readable message instead of an
        # AttributeError on the next line.
        raise RuntimeError(
            "Could not load pyannote/speaker-diarization-3.1; "
            "check HF_TOKEN and that the model licence has been accepted."
        )
    # Pipeline.to() requires a torch.device instance, not a plain string.
    diarization.to(torch.device("cuda"))

    dia_result = diarization(audio_path)

    # 2. Whisper-small on every speaker turn.
    # NOTE(review): the diff view this was recovered from hides one more
    # argument to pipeline() (new-file line 30, after device=0). Restore it
    # from app.py before merging -- it is not reproduced here.
    asr = pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        device=0,
    )

    # Decode the file once (mono, 16 kHz) and slice turns out of memory.
    # This replaces the former per-turn temp files, which leaked whenever
    # transcription raised and relied on a crop/export API that the
    # diarization result does not provide.
    waveform, sample_rate = Audio(sample_rate=16000, mono="downmix")(audio_path)
    samples = waveform[0]
    total_samples = samples.shape[0]

    lines = []
    for turn, _, speaker in dia_result.itertracks(yield_label=True):
        # Clamp to the file bounds; turn edges can overshoot by rounding.
        first = max(0, int(turn.start * sample_rate))
        last = min(total_samples, int(turn.end * sample_rate))
        if last <= first:
            continue  # nothing to transcribe in an empty slice

        clip = samples[first:last].numpy()
        text = asr({"raw": clip, "sampling_rate": sample_rate})["text"].strip()
        if not text:
            continue  # omit turns where the recogniser heard nothing

        lines.append(f"[MÆLENDI {speaker}] {text}\n")

    return "".join(lines) or "Ekkert heyrt"
49
 
50
# Gradio interface: upload one audio file, press the button, and read the
# speaker-labelled transcript in the text box.
with gr.Blocks(title="Íslenskt ASR + Mælendagreining") as demo:
    gr.Markdown("# Íslenskt ASR + Mælendagreining")
    gr.Markdown("**Whisper-small + pyannote 3.1 · ~4 % WER + 95 % DIAR**")
    gr.Markdown("Fullkominn podcast-transcript með réttum mælendum")

    audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 5 mín)")
    # Label fix: the expected run time is a range ("40–90 sek"); the en dash
    # had been lost, leaving the misleading "4090 sek".
    btn = gr.Button("Transcribe með mælendum (40–90 sek)", variant="primary", size="lg")
    out = gr.Textbox(lines=35, label="Útskrift með mælendum")

    btn.click(transcribe_with_diarization, inputs=audio, outputs=out)

# Login credentials can be overridden through the environment so real
# secrets need not live in the repository; the defaults preserve the
# previous hard-coded beta login.
demo.launch(
    auth=(
        os.getenv("APP_USERNAME", "beta"),
        os.getenv("APP_PASSWORD", "beta2025"),
    )
)
requirements.txt CHANGED
@@ -2,5 +2,6 @@ gradio
2
  transformers
3
  torch
4
  spaces
 
5
  librosa
6
  soundfile
 
2
  transformers
3
  torch
4
  spaces
5
+ pyannote.audio
6
  librosa
7
  soundfile