Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
-
import tempfile
|
| 3 |
-
|
| 4 |
import numpy as np
|
| 5 |
-
import soundfile as sf
|
| 6 |
import torch
|
| 7 |
import spaces
|
| 8 |
import gradio as gr
|
|
@@ -53,7 +50,7 @@ def get_diar(device: str):
|
|
| 53 |
|
| 54 |
_diar_pipe = PyannotePipeline.from_pretrained(
|
| 55 |
"pyannote/speaker-diarization-3.1",
|
| 56 |
-
|
| 57 |
)
|
| 58 |
if device == "cuda":
|
| 59 |
_diar_pipe = _diar_pipe.to(torch.device("cuda"))
|
|
@@ -110,11 +107,18 @@ def transcribe_audio(audio_16k: np.ndarray, processor, model, device: str, dtype
|
|
| 110 |
language="de", # ggf. auf "en" ändern oder weglassen für Auto-Detect
|
| 111 |
)
|
| 112 |
|
| 113 |
-
# Dekodieren mit Timestamps
|
| 114 |
-
result = processor.batch_decode(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
#
|
| 117 |
import re
|
|
|
|
|
|
|
|
|
|
| 118 |
ts_pattern = re.compile(r"<\|([\d.]+)\|>")
|
| 119 |
tokens = ts_pattern.split(result)
|
| 120 |
|
|
@@ -187,15 +191,13 @@ def run_pipeline(audio_array: np.ndarray, sample_rate: int, model_key: str, use_
|
|
| 187 |
if not use_diar:
|
| 188 |
return raw_transcript, ""
|
| 189 |
|
| 190 |
-
# 3. Diarisierung
|
| 191 |
-
tmp_path = None
|
| 192 |
try:
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
sf.write(tmp_path, audio_array, sample_rate)
|
| 196 |
|
| 197 |
diar = get_diar(device)
|
| 198 |
-
diarization = diar(
|
| 199 |
segments = merge_with_speakers(chunks, diarization)
|
| 200 |
labeled = format_diarized(segments)
|
| 201 |
return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
|
|
@@ -204,9 +206,6 @@ def run_pipeline(audio_array: np.ndarray, sample_rate: int, model_key: str, use_
|
|
| 204 |
return raw_transcript, f"β οΈ {e}"
|
| 205 |
except Exception as e:
|
| 206 |
return raw_transcript, f"β οΈ Diarisierung fehlgeschlagen: {e}"
|
| 207 |
-
finally:
|
| 208 |
-
if tmp_path and os.path.exists(tmp_path):
|
| 209 |
-
os.unlink(tmp_path)
|
| 210 |
|
| 211 |
|
| 212 |
# ── Gradio-Handler ────────────────────────────────────────────────────────────
|
|
@@ -242,7 +241,7 @@ TOKEN_WARNING = (
|
|
| 242 |
with gr.Blocks(title="ποΈ YAPPER Β· ZeroGPU Edition") as demo:
|
| 243 |
gr.Markdown("# ποΈ YAPPER Β· ZeroGPU Edition")
|
| 244 |
gr.Markdown(
|
| 245 |
-
"## Transkription & Speaker-
|
| 246 |
"Lade eine Datei hoch oder nimm direkt ΓΌber das Mikrofon auf."
|
| 247 |
)
|
| 248 |
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
|
|
|
| 3 |
import torch
|
| 4 |
import spaces
|
| 5 |
import gradio as gr
|
|
|
|
| 50 |
|
| 51 |
_diar_pipe = PyannotePipeline.from_pretrained(
|
| 52 |
"pyannote/speaker-diarization-3.1",
|
| 53 |
+
token=HF_TOKEN, # ← use_auth_token wurde entfernt
|
| 54 |
)
|
| 55 |
if device == "cuda":
|
| 56 |
_diar_pipe = _diar_pipe.to(torch.device("cuda"))
|
|
|
|
| 107 |
language="de", # ggf. auf "en" ändern oder weglassen für Auto-Detect
|
| 108 |
)
|
| 109 |
|
| 110 |
+
# Dekodieren mit Timestamps, Special Tokens filtern
|
| 111 |
+
result = processor.batch_decode(
|
| 112 |
+
predicted_ids,
|
| 113 |
+
decode_with_timestamps=True,
|
| 114 |
+
skip_special_tokens=False, # brauchen Timestamp-Tokens
|
| 115 |
+
)[0]
|
| 116 |
|
| 117 |
+
# Nicht-Timestamp Special Tokens entfernen (<|startoftranscript|> etc.)
|
| 118 |
import re
|
| 119 |
+
result = re.sub(r"<\|(?![\d.]+\|)[^>]+\|>", "", result).strip()
|
| 120 |
+
|
| 121 |
+
# Timestamps extrahieren (Format: <|0.00|> Text <|1.50|>)
|
| 122 |
ts_pattern = re.compile(r"<\|([\d.]+)\|>")
|
| 123 |
tokens = ts_pattern.split(result)
|
| 124 |
|
|
|
|
| 191 |
if not use_diar:
|
| 192 |
return raw_transcript, ""
|
| 193 |
|
| 194 |
+
# 3. Diarisierung: Tensor-Dict → kein torchcodec nötig
|
|
|
|
| 195 |
try:
|
| 196 |
+
waveform = torch.tensor(audio_array).unsqueeze(0).float()
|
| 197 |
+
diar_input = {"waveform": waveform, "sample_rate": sample_rate}
|
|
|
|
| 198 |
|
| 199 |
diar = get_diar(device)
|
| 200 |
+
diarization = diar(diar_input)
|
| 201 |
segments = merge_with_speakers(chunks, diarization)
|
| 202 |
labeled = format_diarized(segments)
|
| 203 |
return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
|
|
|
|
| 206 |
return raw_transcript, f"β οΈ {e}"
|
| 207 |
except Exception as e:
|
| 208 |
return raw_transcript, f"β οΈ Diarisierung fehlgeschlagen: {e}"
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
|
| 211 |
# ── Gradio-Handler ────────────────────────────────────────────────────────────
|
|
|
|
| 241 |
with gr.Blocks(title="ποΈ YAPPER Β· ZeroGPU Edition") as demo:
|
| 242 |
gr.Markdown("# ποΈ YAPPER Β· ZeroGPU Edition")
|
| 243 |
gr.Markdown(
|
| 244 |
+
"## Transkription & Speaker-Segmentierung fΓΌr Teams Meetings. \n"
|
| 245 |
"Lade eine Datei hoch oder nimm direkt ΓΌber das Mikrofon auf."
|
| 246 |
)
|
| 247 |
|