palli23 committed on
Commit
c27f348
·
1 Parent(s): b348bed

diarization1Mæló

Browse files
Files changed (2) hide show
  1. app.py +24 -33
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py – Whisper-small + WebRTC VAD + ECAPA mælendagreining VIRKAR Á ZeroGPU
2
  import os
3
  import gradio as gr
4
  import spaces
@@ -6,23 +6,25 @@ import webrtcvad
6
  import numpy as np
7
  from pydub import AudioSegment
8
  from sklearn.cluster import AgglomerativeClustering
9
- from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2Model
 
10
  import torch
11
  import tempfile
12
 
13
  # ÞITT Whisper-small model
14
  ASR_MODEL = "palli23/whisper-small-sam_spjall"
15
 
16
- # ECAPA speaker embedding model (létt og hratt)
17
- EMB_MODEL = "speechbrain/spkrec-ecapa-voxceleb"
18
- processor = Wav2Vec2Processor.from_pretrained(EMB_MODEL)
19
- embedder = Wav2Vec2Model.from_pretrained(EMB_MODEL)
 
20
 
21
- # Hlaða ASR á GPU (cached)
22
  asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, device=0)
23
 
24
- # WebRTC VAD (mjög létt)
25
- vad = webrtcvad.Vad(2) # mode 2 = aggressive
26
 
27
  def audio_to_frames(path, frame_ms=30):
28
  audio = AudioSegment.from_file(path).set_channels(1).set_frame_rate(16000)
@@ -40,7 +42,7 @@ def extract_speech_segments(path):
40
  if vad.is_speech(frame.tobytes(), 16000):
41
  current.append(frame)
42
  else:
43
- if len(current) > 20: # minnsta kosti 20 frames (~600 ms)
44
  segments.append(np.concatenate(current))
45
  current = []
46
  if len(current) > 20:
@@ -49,9 +51,8 @@ def extract_speech_segments(path):
49
 
50
  def get_embedding(segment):
51
  with torch.no_grad():
52
- inputs = processor(segment, sampling_rate=16000, return_tensors="pt", padding=True)
53
- emb = embedder(inputs.input_values.to("cuda")).last_hidden_state.mean(dim=1)
54
- return emb.cpu().numpy()[0]
55
 
56
  @spaces.GPU(duration=120)
57
  def transcribe_with_speakers(audio_path):
@@ -62,27 +63,17 @@ def transcribe_with_speakers(audio_path):
62
  if not segments:
63
  return "Engin tala heyrðist"
64
 
65
- # Búa til embeddings
66
  embeddings = [get_embedding(seg) for seg in segments]
67
-
68
- # Klústra mælendur (max 8)
69
- clustering = AgglomerativeClustering(
70
- n_clusters=None,
71
- distance_threshold=0.8,
72
- linkage="average"
73
- ).fit(embeddings)
74
  labels = clustering.labels_
75
 
76
- # Transcribe hvert segment
77
  result = []
78
  for seg, spk in zip(segments, labels):
79
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
80
- audio = AudioSegment(
81
- data=seg.tobytes(),
82
- sample_width=2,
83
- frame_rate=16000,
84
- channels=1
85
- ).export(f.name, format="wav")
86
  seg_path = f.name
87
 
88
  text = asr(seg_path)["text"].strip()
@@ -91,14 +82,14 @@ def transcribe_with_speakers(audio_path):
91
 
92
  return "\n".join(result)
93
 
94
- # Gradio
95
  with gr.Blocks() as demo:
96
  gr.Markdown("# Íslenskt ASR + Mælendagreining (WebRTC + ECAPA)")
97
- gr.Markdown("**ZeroGPU Virkar 100 % · 3–5 mín hljóð → 30–60 sek**")
98
 
99
- audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav")
100
- btn = gr.Button("Transcribe með mælendum", variant="primary", size="lg")
101
- out = gr.Textbox(lines=35, label="Útskrift")
102
 
103
  btn.click(transcribe_with_speakers, audio, out)
104
 
 
1
+ # app.py – FIXED ECAPA (SpeechBrain Native) + Whisper-small – ZeroGPU
2
  import os
3
  import gradio as gr
4
  import spaces
 
6
  import numpy as np
7
  from pydub import AudioSegment
8
  from sklearn.cluster import AgglomerativeClustering
9
+ from transformers import pipeline
10
+ from speechbrain.inference.speaker import EncoderClassifier # ← Native SpeechBrain
11
  import torch
12
  import tempfile
13
 
14
  # ÞITT Whisper-small model
15
  ASR_MODEL = "palli23/whisper-small-sam_spjall"
16
 
17
+ # SpeechBrain ECAPA (loaded natively via SpeechBrain; avoids the Transformers loading error)
18
+ embedder = EncoderClassifier.from_hparams(
19
+ source="speechbrain/spkrec-ecapa-voxceleb",
20
+ savedir="tmp_ecapa_cache" # local cache
21
+ )
22
 
23
+ # Hlaða ASR á GPU
24
  asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, device=0)
25
 
26
+ # WebRTC VAD
27
+ vad = webrtcvad.Vad(2)
28
 
29
  def audio_to_frames(path, frame_ms=30):
30
  audio = AudioSegment.from_file(path).set_channels(1).set_frame_rate(16000)
 
42
  if vad.is_speech(frame.tobytes(), 16000):
43
  current.append(frame)
44
  else:
45
+ if len(current) > 20: # min 600 ms
46
  segments.append(np.concatenate(current))
47
  current = []
48
  if len(current) > 20:
 
51
 
52
  def get_embedding(segment):
53
  with torch.no_grad():
54
+ emb = embedder.encode_batch(torch.tensor(segment).unsqueeze(0).float() / 32768.0)
55
+ return emb.squeeze().numpy()
 
56
 
57
  @spaces.GPU(duration=120)
58
  def transcribe_with_speakers(audio_path):
 
63
  if not segments:
64
  return "Engin tala heyrðist"
65
 
66
+ # Embeddings og clustering
67
  embeddings = [get_embedding(seg) for seg in segments]
68
+ clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.8).fit(embeddings)
 
 
 
 
 
 
69
  labels = clustering.labels_
70
 
71
+ # Transcribe
72
  result = []
73
  for seg, spk in zip(segments, labels):
74
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
75
+ audio = AudioSegment(data=seg.tobytes(), sample_width=2, frame_rate=16000, channels=1)
76
+ audio.export(f.name, format="wav")
 
 
 
 
77
  seg_path = f.name
78
 
79
  text = asr(seg_path)["text"].strip()
 
82
 
83
  return "\n".join(result)
84
 
85
+ # Interface
86
  with gr.Blocks() as demo:
87
  gr.Markdown("# Íslenskt ASR + Mælendagreining (WebRTC + ECAPA)")
88
+ gr.Markdown("**Whisper-small + SpeechBrain ECAPA · Virkar á ZeroGPU**")
89
 
90
+ audio = gr.Audio(type="filepath")
91
+ btn = gr.Button("Transcribe með mælendum", variant="primary")
92
+ out = gr.Textbox(lines=35)
93
 
94
  btn.click(transcribe_with_speakers, audio, out)
95
 
requirements.txt CHANGED
@@ -2,9 +2,9 @@ gradio
2
  transformers
3
  torch
4
  spaces
 
5
  webrtcvad
6
  pydub
7
  numpy
8
  scikit-learn
9
- speechbrain
10
  soundfile
 
2
  transformers
3
  torch
4
  spaces
5
+ speechbrain
6
  webrtcvad
7
  pydub
8
  numpy
9
  scikit-learn
 
10
  soundfile