aidn committed on
Commit
dd3c0dd
·
verified ·
1 Parent(s): ed8b8a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -41
app.py CHANGED
@@ -4,7 +4,7 @@ import tempfile
4
  import numpy as np
5
  import soundfile as sf
6
  import torch
7
- import spaces # ← ZeroGPU: muss importiert werden
8
  import gradio as gr
9
  from transformers import pipeline as hf_pipeline
10
 
@@ -14,28 +14,32 @@ HF_TOKEN = os.environ.get("HF_TOKEN", "")
14
  ASR_MODELS = {
15
  "whisper-small (gut, schnell)": "openai/whisper-small",
16
  "whisper-large-v3 (beste QualitΓ€t)": "openai/whisper-large-v3",
17
- "distil-whisper-large-v3 (empfohlen: QualitΓ€t+Speed)": "distil-whisper/distil-large-v3",
18
  }
19
 
20
  _asr_cache: dict = {}
21
  _diar_pipe = None
22
 
 
23
 
24
- def get_asr(model_key: str):
 
 
 
25
  model_id = ASR_MODELS[model_key]
26
  if model_id not in _asr_cache:
27
  _asr_cache[model_id] = hf_pipeline(
28
  "automatic-speech-recognition",
29
  model=model_id,
30
- device="cuda", # ← ZeroGPU: cuda statt cpu
31
- torch_dtype=torch.float16, # ← ZeroGPU: float16 statt float32
32
- chunk_length_s=30,
33
  return_timestamps=True,
34
  )
35
  return _asr_cache[model_id]
36
 
37
 
38
- def get_diar():
39
  global _diar_pipe
40
  if _diar_pipe is None:
41
  if not HF_TOKEN:
@@ -48,11 +52,26 @@ def get_diar():
48
  _diar_pipe = PyannotePipeline.from_pretrained(
49
  "pyannote/speaker-diarization-3.1",
50
  use_auth_token=HF_TOKEN,
51
- ).to(torch.device("cuda")) # ← ZeroGPU: auf GPU verschieben
 
 
52
  return _diar_pipe
53
 
54
 
55
- # ── Hilfsfunktionen ────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
58
  merged = []
@@ -94,60 +113,76 @@ def format_diarized(segments: list[tuple]) -> str:
94
  return "\n\n".join(lines)
95
 
96
 
97
- # ── Haupt-Pipeline (mit @spaces.GPU dekoriert) ────────────────────────────────
98
- # duration=300 = max. 5 Minuten GPU-Zeit pro Call.
99
- # Passe den Wert an deine lΓ€ngsten Meetings an (300s reicht fΓΌr ~30 min Audio).
 
 
 
 
 
 
 
 
100
 
101
- @spaces.GPU(duration=300) # ← ZeroGPU: Pflicht-Decorator
102
- def run_pipeline(tmp_path: str, model_key: str, use_diar: bool):
103
- """LΓ€uft komplett auf der GPU. Wird von transcribe() aufgerufen."""
104
- asr = get_asr(model_key)
105
- result = asr(tmp_path)
 
106
  raw_transcript = result["text"].strip()
107
- chunks = result.get("chunks", [])
108
 
109
  if not use_diar:
110
  return raw_transcript, ""
111
 
 
 
112
  try:
113
- diar = get_diar()
 
 
 
 
114
  diarization = diar(tmp_path)
115
- segments = merge_with_speakers(chunks, diarization)
116
- labeled = format_diarized(segments)
117
  return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
 
118
  except EnvironmentError as e:
119
  return raw_transcript, f"⚠️ {e}"
120
  except Exception as e:
121
  return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
 
 
 
122
 
123
 
 
 
124
  def transcribe(audio, model_key: str, use_diar: bool):
125
- """UI-Handler: Audio vorbereiten, GPU-Funktion aufrufen."""
126
  if audio is None:
127
  yield "⚠️ Kein Audio eingegeben.", ""
128
  return
129
 
130
  sample_rate, audio_data = audio
131
 
 
132
  if audio_data.ndim > 1:
133
  audio_data = audio_data.mean(axis=1)
134
  audio_data = audio_data.astype(np.float32)
 
 
135
  if audio_data.max() > 1.0:
136
  audio_data /= 32768.0
137
 
138
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
139
- tmp_path = f.name
140
- sf.write(tmp_path, audio_data, sample_rate)
141
-
142
- yield "⏳ GPU wird angefordert und Pipeline gestartet...", ""
143
- try:
144
- transcript, labeled = run_pipeline(tmp_path, model_key, use_diar)
145
- yield transcript, labeled
146
- finally:
147
- os.unlink(tmp_path)
148
 
149
 
150
- # ── UI ─────────────────────────────────────────────────────────────────────────
151
 
152
  TOKEN_WARNING = (
153
  "> ⚠️ **Kein `HF_TOKEN` gefunden.** \n"
@@ -157,12 +192,11 @@ TOKEN_WARNING = (
157
  "[hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
158
  )
159
 
160
- with gr.Blocks(title="Meeting Transcriber (ZeroGPU)") as demo:
161
  gr.Markdown("# πŸŽ™οΈ YAPPER Β· ZeroGPU Edition")
162
  gr.Markdown(
163
- "## Dein Teams Meeting Begleiter des Vertrauens. \n"
164
- "Lade eine Audiodatei hoch **oder** nimm direkt ΓΌber das Mikrofon auf. \n"
165
- "LΓ€uft auf NVIDIA H200 via ZeroGPU."
166
  )
167
 
168
  if not HF_TOKEN:
@@ -177,7 +211,7 @@ with gr.Blocks(title="Meeting Transcriber (ZeroGPU)") as demo:
177
  )
178
  model_dd = gr.Dropdown(
179
  choices=list(ASR_MODELS.keys()),
180
- value="distil-whisper-large-v3 (empfohlen: QualitΓ€t+Speed)",
181
  label="Transkriptionsmodell",
182
  )
183
  diar_cb = gr.Checkbox(
@@ -201,9 +235,8 @@ with gr.Blocks(title="Meeting Transcriber (ZeroGPU)") as demo:
201
  gr.Markdown(
202
  "---\n"
203
  "**Hinweise:** \n"
204
- "β€’ ZeroGPU-Quota: PRO-User haben 1.500 Sek/Tag (~50 kurze Meetings). \n"
205
- "β€’ Max. 5 Minuten GPU-Zeit pro Transkription (`duration=300`). \n"
206
- "β€’ FΓΌr pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben."
207
  )
208
 
209
  run_btn.click(
 
4
  import numpy as np
5
  import soundfile as sf
6
  import torch
7
+ import spaces
8
  import gradio as gr
9
  from transformers import pipeline as hf_pipeline
10
 
 
14
# Maps the label shown in the model dropdown to the Hugging Face model id.
ASR_MODELS = {
    "whisper-small (gut, schnell)": "openai/whisper-small",
    "whisper-large-v3 (beste Qualität)": "openai/whisper-large-v3",
    "distil-whisper-large-v3 (empfohlen)": "distil-whisper/distil-large-v3",
}

# Loaded ASR pipelines, filled lazily by get_asr().
_asr_cache: dict = {}
# Diarization pipeline, created on first use by get_diar().
_diar_pipe = None

WHISPER_SR = 16_000  # Whisper always expects 16 kHz input
24
 
25
+
26
+ # ── Model Loading ──────────────────────────────────────────────────────────────
27
+
28
def get_asr(model_key: str, device: str, dtype: torch.dtype):
    """Return a cached speech-recognition pipeline for the selected model.

    Args:
        model_key: Dropdown label of the model; must be a key of ``ASR_MODELS``.
        device: Device string forwarded to the pipeline (``"cuda"`` or ``"cpu"``).
        dtype: Torch dtype forwarded to the pipeline.

    Returns:
        The ``automatic-speech-recognition`` pipeline, created with
        ``return_timestamps=True``.

    Raises:
        KeyError: If ``model_key`` is not present in ``ASR_MODELS``.
    """
    model_id = ASR_MODELS[model_key]
    # The cache key includes device and dtype: a pipeline built for one
    # device/precision must not be handed out for a different one.
    cache_key = (model_id, device, dtype)
    if cache_key not in _asr_cache:
        _asr_cache[cache_key] = hf_pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=device,
            dtype=dtype,
            # chunk_length_s is intentionally omitted - the caller passes an
            # array, not a file path.
            return_timestamps=True,
        )
    return _asr_cache[cache_key]
40
 
41
 
42
+ def get_diar(device: str):
43
  global _diar_pipe
44
  if _diar_pipe is None:
45
  if not HF_TOKEN:
 
52
  _diar_pipe = PyannotePipeline.from_pretrained(
53
  "pyannote/speaker-diarization-3.1",
54
  use_auth_token=HF_TOKEN,
55
+ )
56
+ if device == "cuda":
57
+ _diar_pipe = _diar_pipe.to(torch.device("cuda"))
58
  return _diar_pipe
59
 
60
 
61
+ # ── Hilfsfunktionen ───────────────────────────────────────────────────────────
62
+
63
def resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample a 1-D signal by linear interpolation (no librosa dependency).

    No low-pass filter is applied, so this is a simple interpolation rather
    than a band-limited resampler.

    Args:
        audio: 1-D array of samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        ``audio`` itself when both rates are equal; otherwise a new
        ``float32`` array of length ``len(audio) * target_sr // orig_sr``
        (empty when that length is zero).

    Raises:
        ValueError: If either sample rate is not positive.
    """
    if orig_sr <= 0 or target_sr <= 0:
        raise ValueError(
            f"Sample rates must be positive, got {orig_sr} and {target_sr}"
        )
    if orig_sr == target_sr:
        return audio

    n_in = len(audio)
    # Integer arithmetic avoids losing a sample to float round-off, which
    # int(len(audio) * (target_sr / orig_sr)) can do.
    n_out = n_in * int(target_sr) // int(orig_sr)
    if n_out == 0:
        # Covers empty input and clips too short to yield a single output
        # sample; np.interp would raise on an empty sample grid.
        return np.zeros(0, dtype=np.float32)

    positions = np.linspace(0, n_in - 1, n_out)
    return np.interp(positions, np.arange(n_in), audio).astype(np.float32)
74
+
75
 
76
  def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
77
  merged = []
 
113
  return "\n\n".join(lines)
114
 
115
 
116
+ # ── Haupt-Pipeline ─────────────────────────────────────────────────────────────
117
+
118
@spaces.GPU(duration=300)
def run_pipeline(
    audio_array: np.ndarray,
    sample_rate: int,
    model_key: str,
    use_diar: bool,
):
    """Transcribe audio and, if requested, label the transcript by speaker.

    Args:
        audio_array: Audio samples at ``sample_rate``.
        sample_rate: Sample rate of ``audio_array`` in Hz.
        model_key: Key into ``ASR_MODELS`` selecting the ASR model.
        use_diar: Whether to run speaker diarization after transcription.

    Returns:
        A ``(transcript, labeled)`` tuple. ``labeled`` is ``""`` when
        diarization is off, the speaker-labelled text when it succeeds, or
        a warning message when diarization fails.
    """
    if torch.cuda.is_available():
        device, dtype = "cuda", torch.float16
    else:
        device, dtype = "cpu", torch.float32

    # ASR: hand the array over directly -> no torchcodec / FFmpeg needed.
    samples_16k = resample(audio_array, sample_rate, WHISPER_SR)
    payload = {"array": samples_16k, "sampling_rate": WHISPER_SR}

    recognizer = get_asr(model_key, device, dtype)
    asr_out = recognizer(payload)
    raw_transcript = asr_out["text"].strip()
    chunks = asr_out.get("chunks", [])

    if not use_diar:
        return raw_transcript, ""

    # Diarization: pyannote needs a file on disk.
    wav_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            wav_path = handle.name
            sf.write(wav_path, audio_array, sample_rate)

        diarizer = get_diar(device)
        speaker_turns = diarizer(wav_path)
        labeled = format_diarized(merge_with_speakers(chunks, speaker_turns))
        if labeled:
            return raw_transcript, labeled
        return raw_transcript, "(Keine Sprecher erkannt.)"

    except EnvironmentError as err:
        return raw_transcript, f"⚠️ {err}"
    except Exception as err:
        return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {err}"
    finally:
        # The temp file was created with delete=False, so remove it here.
        if wav_path:
            if os.path.exists(wav_path):
                os.unlink(wav_path)
160
 
161
 
162
+ # ── Gradio-Handler ────────────────────────────────────────────────────────────
163
+
164
def transcribe(audio, model_key: str, use_diar: bool):
    """UI handler: prepare the audio, then run the pipeline.

    This is a generator. It first yields a status message and then the final
    result, so the UI can show progress.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` pair (presumably from
            a Gradio audio component - confirm against the UI wiring).
        model_key: Key into ``ASR_MODELS`` selecting the ASR model.
        use_diar: Whether speaker diarization should be run.

    Yields:
        ``(transcript, labeled)`` tuples of strings.
    """
    if audio is None:
        yield "⚠️ Kein Audio eingegeben.", ""
        return

    sample_rate, audio_data = audio
    audio_data = np.asarray(audio_data)

    # A zero-length recording would make the peak computation below raise.
    if audio_data.size == 0:
        yield "⚠️ Kein Audio eingegeben.", ""
        return

    source_dtype = audio_data.dtype

    # Force mono by averaging the channels.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    audio_data = audio_data.astype(np.float32)

    # Normalise to roughly [-1, 1].
    if np.issubdtype(source_dtype, np.signedinteger):
        # Scale by the integer type's full range (32768 for int16), so int32
        # input is handled correctly as well.
        audio_data /= float(np.iinfo(source_dtype).max) + 1.0
    elif np.abs(audio_data).max() > 1.0:
        # Non-integer data beyond [-1, 1] is assumed to be 16-bit PCM scale;
        # the absolute peak is used so negative peaks are detected too.
        audio_data /= 32768.0

    yield "⏳ GPU wird angefordert, Pipeline startet...", ""
    transcript, labeled = run_pipeline(audio_data, sample_rate, model_key, use_diar)
    yield transcript, labeled
 
 
 
 
 
 
 
183
 
184
 
185
+ # ── UI ────────────────────────────────────────────────────────────────────────
186
 
187
  TOKEN_WARNING = (
188
  "> ⚠️ **Kein `HF_TOKEN` gefunden.** \n"
 
192
  "[hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
193
  )
194
 
195
+ with gr.Blocks(title="πŸŽ™οΈ YAPPER Β· ZeroGPU Edition") as demo:
196
  gr.Markdown("# πŸŽ™οΈ YAPPER Β· ZeroGPU Edition")
197
  gr.Markdown(
198
+ "## Transkription & Speaker-Diarisierung fΓΌr Teams Meetings. \n"
199
+ "Lade eine Datei hoch oder nimm direkt ΓΌber das Mikrofon auf."
 
200
  )
201
 
202
  if not HF_TOKEN:
 
211
  )
212
  model_dd = gr.Dropdown(
213
  choices=list(ASR_MODELS.keys()),
214
+ value="distil-whisper-large-v3 (empfohlen)",
215
  label="Transkriptionsmodell",
216
  )
217
  diar_cb = gr.Checkbox(
 
235
  gr.Markdown(
236
  "---\n"
237
  "**Hinweise:** \n"
238
+ "β€’ FΓΌr pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben. \n"
239
+ "β€’ ZeroGPU-Quota: 1.500 Sek/Tag fΓΌr PRO-User (reicht fΓΌr ~50 kurze Meetings)."
 
240
  )
241
 
242
  run_btn.click(