aidn committed on
Commit
4f54665
·
verified ·
1 Parent(s): bd0276b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -53
app.py CHANGED
@@ -4,6 +4,7 @@ import tempfile
4
  import numpy as np
5
  import soundfile as sf
6
  import torch
 
7
  import gradio as gr
8
  from transformers import pipeline as hf_pipeline
9
 
@@ -11,13 +12,11 @@ from transformers import pipeline as hf_pipeline
11
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
12
 
13
  ASR_MODELS = {
14
- "whisper-tiny (schnellste, geringste Qualität)": "openai/whisper-tiny",
15
- "whisper-base (schnell, gut für kurze Aufnahmen)": "openai/whisper-base",
16
- "whisper-small (empfohlen für CPU)": "openai/whisper-small",
17
- "distil-whisper-large-v3 (langsam, beste Qualität)": "distil-whisper/distil-large-v3",
18
  }
19
 
20
- # ── Lazy Model Loading ─────────────────────────────────────────────────────────
21
  _asr_cache: dict = {}
22
  _diar_pipe = None
23
 
@@ -28,8 +27,8 @@ def get_asr(model_key: str):
28
  _asr_cache[model_id] = hf_pipeline(
29
  "automatic-speech-recognition",
30
  model=model_id,
31
- device="cpu",
32
- torch_dtype=torch.float32,
33
  chunk_length_s=30,
34
  return_timestamps=True,
35
  )
@@ -49,21 +48,20 @@ def get_diar():
49
  _diar_pipe = PyannotePipeline.from_pretrained(
50
  "pyannote/speaker-diarization-3.1",
51
  use_auth_token=HF_TOKEN,
52
- )
53
  return _diar_pipe
54
 
55
 
56
  # ── Hilfsfunktionen ────────────────────────────────────────────────────────────
57
 
58
  def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
59
- """Ordnet jedem ASR-Chunk den dominanten Sprecher zu."""
60
  merged = []
61
  for chunk in chunks:
62
  ts = chunk.get("timestamp", (None, None))
63
  start, end = ts if ts else (None, None)
64
  if start is None:
65
  continue
66
- end = end or (start + 1.0) # Fallback falls letzter Chunk kein End-Timestamp hat
67
 
68
  best_speaker, best_overlap = "Unbekannt", 0.0
69
  for turn, _, speaker in diarization.itertracks(yield_label=True):
@@ -77,10 +75,8 @@ def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
77
 
78
 
79
  def format_diarized(segments: list[tuple]) -> str:
80
- """Gruppiert aufeinanderfolgende Chunks desselben Sprechers."""
81
  if not segments:
82
  return ""
83
-
84
  lines = []
85
  cur_speaker, cur_start, cur_texts = None, 0.0, []
86
 
@@ -98,22 +94,44 @@ def format_diarized(segments: list[tuple]) -> str:
98
  return "\n\n".join(lines)
99
 
100
 
101
- # ── Haupt-Pipeline ─────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  def transcribe(audio, model_key: str, use_diar: bool):
104
- """Generator-Funktion: liefert Zwischenergebnisse live an die UI."""
105
  if audio is None:
106
  yield "⚠️ Kein Audio eingegeben.", ""
107
  return
108
 
109
  sample_rate, audio_data = audio
110
 
111
- # Mono erzwingen
112
  if audio_data.ndim > 1:
113
  audio_data = audio_data.mean(axis=1)
114
  audio_data = audio_data.astype(np.float32)
115
-
116
- # Normalisieren (16-bit PCM → float)
117
  if audio_data.max() > 1.0:
118
  audio_data /= 32768.0
119
 
@@ -121,34 +139,10 @@ def transcribe(audio, model_key: str, use_diar: bool):
121
  tmp_path = f.name
122
  sf.write(tmp_path, audio_data, sample_rate)
123
 
 
124
  try:
125
- # ── Schritt 1: Transkription ──
126
- yield "⏳ Lade ASR-Modell und transkribiere...", ""
127
-
128
- asr = get_asr(model_key)
129
- result = asr(tmp_path)
130
- raw_transcript = result["text"].strip()
131
- chunks = result.get("chunks", [])
132
-
133
- if not use_diar:
134
- yield raw_transcript, ""
135
- return
136
-
137
- # ── Schritt 2: Diarisierung ──
138
- yield raw_transcript, "⏳ Diarisierung läuft (auf CPU kann das einige Minuten dauern)..."
139
-
140
- try:
141
- diar = get_diar()
142
- diarization = diar(tmp_path)
143
- segments = merge_with_speakers(chunks, diarization)
144
- labeled = format_diarized(segments)
145
- yield raw_transcript, labeled or "(Keine Sprecher erkannt.)"
146
-
147
- except EnvironmentError as e:
148
- yield raw_transcript, f"⚠️ {e}"
149
- except Exception as e:
150
- yield raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
151
-
152
  finally:
153
  os.unlink(tmp_path)
154
 
@@ -159,14 +153,15 @@ TOKEN_WARNING = (
159
  "> ⚠️ **Kein `HF_TOKEN` gefunden.** \n"
160
  "> Diarisierung (pyannote) ist deaktiviert. \n"
161
  "> Füge das Token unter **Settings → Variables and secrets** als `HF_TOKEN` hinzu \n"
162
- "> und akzeptiere die Lizenzbedingungen auf [hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
 
163
  )
164
 
165
- with gr.Blocks(title="Meeting Transcriber") as demo:
166
- gr.Markdown("# 🎙️ Meeting Transcriber")
167
  gr.Markdown(
168
  "Lade eine Audiodatei hoch **oder** nimm direkt über das Mikrofon auf. \n"
169
- "Das Audio wird transkribiert und optional nach Sprechern getrennt."
170
  )
171
 
172
  if not HF_TOKEN:
@@ -181,12 +176,12 @@ with gr.Blocks(title="Meeting Transcriber") as demo:
181
  )
182
  model_dd = gr.Dropdown(
183
  choices=list(ASR_MODELS.keys()),
184
- value="whisper-small (empfohlen für CPU)",
185
  label="Transkriptionsmodell",
186
  )
187
  diar_cb = gr.Checkbox(
188
  value=bool(HF_TOKEN),
189
- label="Speaker-Diarisierung aktivieren (pyannote, braucht HF_TOKEN)",
190
  interactive=bool(HF_TOKEN),
191
  )
192
  run_btn = gr.Button("▶ Transkribieren", variant="primary")
@@ -207,9 +202,9 @@ with gr.Blocks(title="Meeting Transcriber") as demo:
207
  gr.Markdown(
208
  "---\n"
209
  "**Hinweise:** \n"
210
- "• Auf Free CPU dauert Whisper-small ~1–2× Echtzeit, Diarisierung ~2–5× Echtzeit. \n"
211
- "• Für pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben. \n"
212
- "• Das erste Laden der Modelle dauert länger (Download-Cache)."
213
  )
214
 
215
  run_btn.click(
 
4
  import numpy as np
5
  import soundfile as sf
6
  import torch
7
+ import spaces # ← ZeroGPU: muss importiert werden
8
  import gradio as gr
9
  from transformers import pipeline as hf_pipeline
10
 
 
12
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
13
 
14
  ASR_MODELS = {
15
+ "whisper-small (gut, schnell)": "openai/whisper-small",
16
+ "whisper-large-v3 (beste Qualität)": "openai/whisper-large-v3",
17
+ "distil-whisper-large-v3 (empfohlen: Qualität+Speed)": "distil-whisper/distil-large-v3",
 
18
  }
19
 
 
20
  _asr_cache: dict = {}
21
  _diar_pipe = None
22
 
 
27
  _asr_cache[model_id] = hf_pipeline(
28
  "automatic-speech-recognition",
29
  model=model_id,
30
+ device="cuda", # ← ZeroGPU: cuda statt cpu
31
+ torch_dtype=torch.float16, # ← ZeroGPU: float16 statt float32
32
  chunk_length_s=30,
33
  return_timestamps=True,
34
  )
 
48
  _diar_pipe = PyannotePipeline.from_pretrained(
49
  "pyannote/speaker-diarization-3.1",
50
  use_auth_token=HF_TOKEN,
51
+ ).to(torch.device("cuda")) # ← ZeroGPU: auf GPU verschieben
52
  return _diar_pipe
53
 
54
 
55
  # ── Hilfsfunktionen ────────────────────────────────────────────────────────────
56
 
57
  def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
 
58
  merged = []
59
  for chunk in chunks:
60
  ts = chunk.get("timestamp", (None, None))
61
  start, end = ts if ts else (None, None)
62
  if start is None:
63
  continue
64
+ end = end or (start + 1.0)
65
 
66
  best_speaker, best_overlap = "Unbekannt", 0.0
67
  for turn, _, speaker in diarization.itertracks(yield_label=True):
 
75
 
76
 
77
  def format_diarized(segments: list[tuple]) -> str:
 
78
  if not segments:
79
  return ""
 
80
  lines = []
81
  cur_speaker, cur_start, cur_texts = None, 0.0, []
82
 
 
94
  return "\n\n".join(lines)
95
 
96
 
97
+ # ── Haupt-Pipeline (mit @spaces.GPU dekoriert) ────────────────────────────────
98
+ # duration=300 = max. 5 Minuten GPU-Zeit pro Call.
99
+ # Passe den Wert an deine längsten Meetings an (300s reicht für ~30 min Audio).
100
+
101
+ @spaces.GPU(duration=300) # ← ZeroGPU: Pflicht-Decorator
102
+ def run_pipeline(tmp_path: str, model_key: str, use_diar: bool):
103
+ """Läuft komplett auf der GPU. Wird von transcribe() aufgerufen."""
104
+ asr = get_asr(model_key)
105
+ result = asr(tmp_path)
106
+ raw_transcript = result["text"].strip()
107
+ chunks = result.get("chunks", [])
108
+
109
+ if not use_diar:
110
+ return raw_transcript, ""
111
+
112
+ try:
113
+ diar = get_diar()
114
+ diarization = diar(tmp_path)
115
+ segments = merge_with_speakers(chunks, diarization)
116
+ labeled = format_diarized(segments)
117
+ return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
118
+ except EnvironmentError as e:
119
+ return raw_transcript, f"⚠️ {e}"
120
+ except Exception as e:
121
+ return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
122
+
123
 
124
  def transcribe(audio, model_key: str, use_diar: bool):
125
+ """UI-Handler: Audio vorbereiten, GPU-Funktion aufrufen."""
126
  if audio is None:
127
  yield "⚠️ Kein Audio eingegeben.", ""
128
  return
129
 
130
  sample_rate, audio_data = audio
131
 
 
132
  if audio_data.ndim > 1:
133
  audio_data = audio_data.mean(axis=1)
134
  audio_data = audio_data.astype(np.float32)
 
 
135
  if audio_data.max() > 1.0:
136
  audio_data /= 32768.0
137
 
 
139
  tmp_path = f.name
140
  sf.write(tmp_path, audio_data, sample_rate)
141
 
142
+ yield "⏳ GPU wird angefordert und Pipeline gestartet...", ""
143
  try:
144
+ transcript, labeled = run_pipeline(tmp_path, model_key, use_diar)
145
+ yield transcript, labeled
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  finally:
147
  os.unlink(tmp_path)
148
 
 
153
  "> ⚠️ **Kein `HF_TOKEN` gefunden.** \n"
154
  "> Diarisierung (pyannote) ist deaktiviert. \n"
155
  "> Füge das Token unter **Settings → Variables and secrets** als `HF_TOKEN` hinzu \n"
156
+ "> und akzeptiere die Lizenzbedingungen auf "
157
+ "[hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
158
  )
159
 
160
+ with gr.Blocks(title="Meeting Transcriber (ZeroGPU)") as demo:
161
+ gr.Markdown("# 🎙️ Meeting Transcriber · ZeroGPU Edition")
162
  gr.Markdown(
163
  "Lade eine Audiodatei hoch **oder** nimm direkt über das Mikrofon auf. \n"
164
+ "Läuft auf NVIDIA H200 via ZeroGPU – deutlich schneller als CPU."
165
  )
166
 
167
  if not HF_TOKEN:
 
176
  )
177
  model_dd = gr.Dropdown(
178
  choices=list(ASR_MODELS.keys()),
179
+ value="distil-whisper-large-v3 (empfohlen: Qualität+Speed)",
180
  label="Transkriptionsmodell",
181
  )
182
  diar_cb = gr.Checkbox(
183
  value=bool(HF_TOKEN),
184
+ label="Speaker-Diarisierung (pyannote) – braucht HF_TOKEN",
185
  interactive=bool(HF_TOKEN),
186
  )
187
  run_btn = gr.Button("▶ Transkribieren", variant="primary")
 
202
  gr.Markdown(
203
  "---\n"
204
  "**Hinweise:** \n"
205
+ "• ZeroGPU-Quota: PRO-User haben 1.500 Sek/Tag (~50 kurze Meetings). \n"
206
+ "• Max. 5 Minuten GPU-Zeit pro Transkription (`duration=300`). \n"
207
+ "• Für pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben."
208
  )
209
 
210
  run_btn.click(