aidn committed on
Commit
a5cc652
·
verified ·
1 Parent(s): e7cb287

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -17
app.py CHANGED
@@ -1,8 +1,5 @@
1
  import os
2
- import tempfile
3
-
4
  import numpy as np
5
- import soundfile as sf
6
  import torch
7
  import spaces
8
  import gradio as gr
@@ -53,7 +50,7 @@ def get_diar(device: str):
53
 
54
  _diar_pipe = PyannotePipeline.from_pretrained(
55
  "pyannote/speaker-diarization-3.1",
56
- use_auth_token=HF_TOKEN,
57
  )
58
  if device == "cuda":
59
  _diar_pipe = _diar_pipe.to(torch.device("cuda"))
@@ -110,11 +107,18 @@ def transcribe_audio(audio_16k: np.ndarray, processor, model, device: str, dtype
110
  language="de", # ggf. auf "en" ändern oder weglassen für Auto-Detect
111
  )
112
 
113
- # Dekodieren mit Timestamps
114
- result = processor.batch_decode(predicted_ids, decode_with_timestamps=True)[0]
 
 
 
 
115
 
116
- # Timestamps aus dem Ergebnis extrahieren (Format: <|0.00|> Text <|1.50|>)
117
  import re
 
 
 
118
  ts_pattern = re.compile(r"<\|([\d.]+)\|>")
119
  tokens = ts_pattern.split(result)
120
 
@@ -187,15 +191,13 @@ def run_pipeline(audio_array: np.ndarray, sample_rate: int, model_key: str, use_
187
  if not use_diar:
188
  return raw_transcript, ""
189
 
190
- # 3. Diarisierung (pyannote braucht Datei)
191
- tmp_path = None
192
  try:
193
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
194
- tmp_path = f.name
195
- sf.write(tmp_path, audio_array, sample_rate)
196
 
197
  diar = get_diar(device)
198
- diarization = diar(tmp_path)
199
  segments = merge_with_speakers(chunks, diarization)
200
  labeled = format_diarized(segments)
201
  return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
@@ -204,9 +206,6 @@ def run_pipeline(audio_array: np.ndarray, sample_rate: int, model_key: str, use_
204
  return raw_transcript, f"⚠️ {e}"
205
  except Exception as e:
206
  return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
207
- finally:
208
- if tmp_path and os.path.exists(tmp_path):
209
- os.unlink(tmp_path)
210
 
211
 
212
  # ── Gradio-Handler ────────────────────────────────────────────────────────────
@@ -242,7 +241,7 @@ TOKEN_WARNING = (
242
  with gr.Blocks(title="🎙️ YAPPER · ZeroGPU Edition") as demo:
243
  gr.Markdown("# 🎙️ YAPPER · ZeroGPU Edition")
244
  gr.Markdown(
245
- "## Transkription & Speaker-Diarisierung für Teams Meetings. \n"
246
  "Lade eine Datei hoch oder nimm direkt über das Mikrofon auf."
247
  )
248
 
 
1
  import os
 
 
2
  import numpy as np
 
3
  import torch
4
  import spaces
5
  import gradio as gr
 
50
 
51
  _diar_pipe = PyannotePipeline.from_pretrained(
52
  "pyannote/speaker-diarization-3.1",
53
+ token=HF_TOKEN, # ← use_auth_token wurde entfernt
54
  )
55
  if device == "cuda":
56
  _diar_pipe = _diar_pipe.to(torch.device("cuda"))
 
107
  language="de", # ggf. auf "en" ändern oder weglassen für Auto-Detect
108
  )
109
 
110
+ # Dekodieren mit Timestamps, Special Tokens filtern
111
+ result = processor.batch_decode(
112
+ predicted_ids,
113
+ decode_with_timestamps=True,
114
+ skip_special_tokens=False, # brauchen Timestamp-Tokens
115
+ )[0]
116
 
117
+ # Nicht-Timestamp Special Tokens entfernen (<|startoftranscript|> etc.)
118
  import re
119
+ result = re.sub(r"<\|(?![\d.]+\|)[^>]+\|>", "", result).strip()
120
+
121
+ # Timestamps extrahieren (Format: <|0.00|> Text <|1.50|>)
122
  ts_pattern = re.compile(r"<\|([\d.]+)\|>")
123
  tokens = ts_pattern.split(result)
124
 
 
191
  if not use_diar:
192
  return raw_transcript, ""
193
 
194
+ # 3. Diarisierung: Tensor-Dict → kein torchcodec nötig
 
195
  try:
196
+ waveform = torch.tensor(audio_array).unsqueeze(0).float()
197
+ diar_input = {"waveform": waveform, "sample_rate": sample_rate}
 
198
 
199
  diar = get_diar(device)
200
+ diarization = diar(diar_input)
201
  segments = merge_with_speakers(chunks, diarization)
202
  labeled = format_diarized(segments)
203
  return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
 
206
  return raw_transcript, f"⚠️ {e}"
207
  except Exception as e:
208
  return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
 
 
 
209
 
210
 
211
  # ── Gradio-Handler ────────────────────────────────────────────────────────────
 
241
  with gr.Blocks(title="🎙️ YAPPER · ZeroGPU Edition") as demo:
242
  gr.Markdown("# 🎙️ YAPPER · ZeroGPU Edition")
243
  gr.Markdown(
244
+ "## Transkription & Speaker-Segmentierung für Teams Meetings. \n"
245
  "Lade eine Datei hoch oder nimm direkt über das Mikrofon auf."
246
  )
247