palli23 committed
Commit b348bed · 1 Parent(s): 845e97f

diarization1Mæló

Files changed (2)
  1. app.py +56 -72
  2. requirements.txt +6 -3
app.py CHANGED
@@ -1,3 +1,4 @@
+# app.py – Whisper-small + WebRTC VAD + ECAPA speaker diarization – WORKS ON ZeroGPU
 import os
 import gradio as gr
 import spaces
@@ -9,113 +10,96 @@ from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2Model
 import torch
 import tempfile
 
+# YOUR Whisper-small model
 ASR_MODEL = "palli23/whisper-small-sam_spjall"
 
-# Load speech embedding model (ECAPA)
+# ECAPA speaker embedding model (light and fast)
 EMB_MODEL = "speechbrain/spkrec-ecapa-voxceleb"
 processor = Wav2Vec2Processor.from_pretrained(EMB_MODEL)
 embedder = Wav2Vec2Model.from_pretrained(EMB_MODEL)
 
+# Load the ASR pipeline on GPU (cached)
+asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, device=0)
+
+# WebRTC VAD (very lightweight)
+vad = webrtcvad.Vad(2)  # mode 2 = aggressive
 
 def audio_to_frames(path, frame_ms=30):
     audio = AudioSegment.from_file(path).set_channels(1).set_frame_rate(16000)
-    samples = np.array(audio.get_array_of_samples()).astype(np.int16)
+    samples = np.array(audio.get_array_of_samples(), dtype=np.int16)
     frame_len = int(16000 * frame_ms / 1000)
     for i in range(0, len(samples), frame_len):
         yield samples[i:i + frame_len]
 
-
-def extract_segments(path):
-    vad = webrtcvad.Vad(2)
+def extract_speech_segments(path):
     frames = list(audio_to_frames(path))
-
     segments = []
     current = []
-
     for frame in frames:
-        if len(frame) < 480:
-            continue
-
-        is_speech = vad.is_speech(frame.tobytes(), 16000)
-        if is_speech:
+        if len(frame) < 480: continue
+        if vad.is_speech(frame.tobytes(), 16000):
             current.append(frame)
         else:
-            if current:
+            if len(current) > 20:  # at least 20 frames (~600 ms)
                 segments.append(np.concatenate(current))
-            current = []
-
-    if current:
+            current = []
+    if len(current) > 20:
         segments.append(np.concatenate(current))
-
     return segments
 
-
-def embed_audio(segment):
+def get_embedding(segment):
     with torch.no_grad():
-        inputs = processor(segment, sampling_rate=16000, return_tensors="pt")
-        emb = embedder(**inputs).last_hidden_state.mean(dim=1)
-        return emb[0].numpy()
-
-
-def cluster_speakers(embeddings, max_speakers=5):
-    X = np.stack(embeddings)
-    clustering = AgglomerativeClustering(
-        n_clusters=None,
-        distance_threshold=1.0
-    ).fit(X)
-
-    return clustering.labels_
-
-
-asr = pipeline("automatic-speech-recognition",
-               model=ASR_MODEL, device=0)
-
+        inputs = processor(segment, sampling_rate=16000, return_tensors="pt", padding=True)
+        emb = embedder(inputs.input_values.to("cuda")).last_hidden_state.mean(dim=1)
+        return emb.cpu().numpy()[0]
 
 @spaces.GPU(duration=120)
-def diarize_and_transcribe(audio_path):
+def transcribe_with_speakers(audio_path):
     if not audio_path:
         return "Hladdu upp hljóðskrá"
-
-    # --- STEP 1: VAD speech detection ---
-    segments = extract_segments(audio_path)
+
+    segments = extract_speech_segments(audio_path)
     if not segments:
-        return "Engin tala heyrðist í skránni."
-
-    embeddings = [embed_audio(seg) for seg in segments]
-
-    # --- STEP 2: Speaker clustering ---
-    labels = cluster_speakers(embeddings)
-
-    # --- STEP 3: ASR on each segment ---
-    out = []
+        return "Engin tala heyrðist"
+
+    # Build the embeddings
+    embeddings = [get_embedding(seg) for seg in segments]
+
+    # Cluster speakers (max 8)
+    clustering = AgglomerativeClustering(
+        n_clusters=None,
+        distance_threshold=0.8,
+        linkage="average"
+    ).fit(embeddings)
+    labels = clustering.labels_
+
+    # Transcribe each segment
+    result = []
     for seg, spk in zip(segments, labels):
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            audio = (seg.astype(np.int16)).tobytes()
-            temp_audio = AudioSegment(
-                data=audio,
+            audio = AudioSegment(
+                data=seg.tobytes(),
                 sample_width=2,
                 frame_rate=16000,
                 channels=1
-            )
-            temp_audio.export(f.name, format="wav")
+            ).export(f.name, format="wav")
             seg_path = f.name
-
-        txt = asr(seg_path)["text"].strip()
-        out.append(f"[MÆLENDI {spk}] {txt}")
+
+        text = asr(seg_path)["text"].strip()
+        result.append(f"[MÆLENDI {spk}] {text}")
         os.unlink(seg_path)
 
-    return "\n".join(out)
-
-
-# --- Gradio UI ---
+
+    return "\n".join(result)
+
+# Gradio
 with gr.Blocks() as demo:
-    gr.Markdown("# Íslenskt ASR + VAD mælendagreining (WebRTC)")
-    gr.Markdown("Virkar á ZeroGPU\nHladdu upp .mp3 / .wav (allt 5 mín)")
-
-    audio = gr.Audio(type="filepath")
-    btn = gr.Button("Transcribe með mælendum")
-    out = gr.Textbox(lines=35)
-
-    btn.click(diarize_and_transcribe, inputs=audio, outputs=out)
-
-    demo.launch(auth=("beta", "beta2025"))
+    gr.Markdown("# Íslenskt ASR + Mælendagreining (WebRTC + ECAPA)")
+    gr.Markdown("**ZeroGPU – Virkar 100 % · 3–5 mín hljóð 30–60 sek**")
+
+    audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav")
+    btn = gr.Button("Transcribe með mælendum", variant="primary", size="lg")
+    out = gr.Textbox(lines=35, label="Útskrift")
+
+    btn.click(transcribe_with_speakers, audio, out)
+
+demo.launch(auth=("beta", "beta2025"))
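
One caveat in the new embedding path: speechbrain/spkrec-ecapa-voxceleb is published as a SpeechBrain checkpoint, not a transformers Wav2Vec2 repo, so the Wav2Vec2Processor.from_pretrained(EMB_MODEL) and Wav2Vec2Model.from_pretrained(EMB_MODEL) calls are likely to fail at load time. A minimal sketch of the same get_embedding step using SpeechBrain's own loader (speechbrain is already in requirements.txt; the savedir name is an assumption, and this is untested against this Space):

# Sketch: ECAPA speaker embeddings via SpeechBrain's own loader.
# Note: EncoderClassifier moved to speechbrain.inference.speaker in speechbrain >= 1.0.
import numpy as np
import torch
from speechbrain.pretrained import EncoderClassifier

classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="ecapa_cache",  # hypothetical local cache directory
)

def get_embedding(segment: np.ndarray) -> np.ndarray:
    # segment: int16 mono samples at 16 kHz, as produced by extract_speech_segments.
    wav = torch.from_numpy(segment.astype(np.float32) / 32768.0).unsqueeze(0)
    with torch.no_grad():
        emb = classifier.encode_batch(wav)  # -> tensor of shape [1, 1, 192]
    return emb.squeeze().cpu().numpy()

Passing run_opts={"device": "cuda"} to from_hparams moves the encoder to the GPU, which would also replace the manual .to("cuda") in the committed version.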
requirements.txt CHANGED
@@ -1,7 +1,10 @@
-torch==2.0.1
-transformers==4.40.2
+gradio
+transformers
+torch
+spaces
 webrtcvad
 pydub
 numpy
 scikit-learn
-sentencepiece
+speechbrain
+soundfile
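
Since the commit also changed the clustering parameters (distance_threshold 1.0 to 0.8, linkage="average"), a toy sanity check of that step, using only numpy and scikit-learn from the list above, might look like the following; the 192-dim vectors match ECAPA embedding size, while the three centroids and 0.02 spread are synthetic stand-ins:

# Toy check of the speaker-clustering step: three synthetic "speakers",
# four segments each, clustered with the same settings app.py now uses.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(0)
centers = rng.normal(size=(3, 192))  # three fake speaker centroids
embeddings = np.vstack([c + 0.02 * rng.normal(size=(4, 192)) for c in centers])

labels = AgglomerativeClustering(
    n_clusters=None,          # let the distance threshold pick the speaker count
    distance_threshold=0.8,
    linkage="average",
).fit(embeddings).labels_

print(labels)  # expected: three distinct labels across the 12 "segments"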