benhadjermed committed on
Commit
90b0434
·
verified ·
1 Parent(s): 8cb52fb

Migrate to faster-whisper with INT8 quantization for ~4x speedup

Browse files
Files changed (3) hide show
  1. Dockerfile +4 -4
  2. main.py +158 -119
  3. requirements.txt +1 -3
Dockerfile CHANGED
@@ -1,7 +1,8 @@
1
  # ── Tahkik Inference Space ──────────────────────────────────────────────────
2
- # CPU image. To enable GPU (T4/L4/A100), change the base image to:
 
3
  # FROM nvidia/cuda:12.1-runtime-ubuntu22.04
4
- # and replace the pip torch line with the CUDA-specific wheel URL.
5
  # ---------------------------------------------------------------------------
6
 
7
  FROM python:3.10-slim
@@ -20,9 +21,8 @@ COPY --chown=user . .
20
 
21
  # Redirect all model/cache downloads to /tmp (only writable path in Spaces).
22
  ENV HF_HOME=/tmp/huggingface_cache
23
- ENV TORCH_HOME=/tmp/torch_cache
24
- ENV TRANSFORMERS_VERBOSITY=error
25
  ENV HF_HUB_DISABLE_PROGRESS_BARS=1
 
26
 
27
  USER user
28
 
 
1
  # ── Tahkik Inference Space ──────────────────────────────────────────────────
2
+ # Uses faster-whisper (CTranslate2 INT8) for ~4x faster inference vs PyTorch.
3
+ # To enable GPU (T4/L4/A100), change the base image to:
4
  # FROM nvidia/cuda:12.1-runtime-ubuntu22.04
5
+ # and set compute_type="float16" in main.py.
6
  # ---------------------------------------------------------------------------
7
 
8
  FROM python:3.10-slim
 
21
 
22
  # Redirect all model/cache downloads to /tmp (only writable path in Spaces).
23
  ENV HF_HOME=/tmp/huggingface_cache
 
 
24
  ENV HF_HUB_DISABLE_PROGRESS_BARS=1
25
+ ENV CT2_VERBOSE=0
26
 
27
  USER user
28
 
main.py CHANGED
@@ -2,37 +2,34 @@
2
  """
3
  Tahkik Inference Server — Hugging Face Space entry point.
4
 
5
- Loads the Whisper model ONCE at startup, then serves:
 
6
  - POST /evaluate — batch transcription (upload a full audio file)
7
  - WS /ws/stream — real-time streaming transcription (send PCM chunks)
8
  """
9
 
10
  import asyncio
11
  import json
 
12
  import os
13
- import sys
14
- import struct
15
  import time
16
  import tempfile
17
 
18
  # Redirect model caches to /tmp (only writable dir in HF Spaces)
19
  os.environ.setdefault("HF_HOME", "/tmp/huggingface_cache")
20
- os.environ.setdefault("TORCH_HOME", "/tmp/torch_cache")
21
- os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
22
  os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
 
23
 
24
  import numpy as np
25
  from fastapi import FastAPI, File, UploadFile, HTTPException, WebSocket, WebSocketDisconnect
26
  from fastapi.responses import JSONResponse
27
- import torch
28
- import torch.nn.functional as F
29
- from transformers import WhisperForConditionalGeneration, WhisperProcessor
30
 
31
  # ---------------------------------------------------------------------------
32
  # Constants
33
  # ---------------------------------------------------------------------------
34
 
35
- TAHKIK_MODEL = "benhadjermed/tahkik-basic-warsh"
36
  SAMPLE_RATE = 16000
37
  CHUNK_LENGTH_S = 30
38
  OVERLAP_S = 1
@@ -41,39 +38,26 @@ OVERLAP_S = 1
41
  MIN_AUDIO_FOR_INFERENCE_S = 1.0
42
  MIN_SAMPLES_FOR_INFERENCE = int(MIN_AUDIO_FOR_INFERENCE_S * SAMPLE_RATE)
43
 
 
 
 
 
44
  ALLOWED_EXTS = {".wav", ".m4a", ".mp3", ".flac", ".ogg"}
45
 
46
  # ---------------------------------------------------------------------------
47
  # Model loading (happens once at module import / server startup)
48
  # ---------------------------------------------------------------------------
49
 
50
- print("[inference] importing torch / transformers...", flush=True)
51
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
52
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
53
- print(f"[inference] device: {device}", flush=True)
54
-
55
- print("[inference] loading processor (openai/whisper-base)...", flush=True)
56
- processor = WhisperProcessor.from_pretrained(
57
- "openai/whisper-base", language="Arabic", task="transcribe"
58
  )
59
-
60
- print(f"[inference] loading model ({TAHKIK_MODEL})...", flush=True)
61
- model = WhisperForConditionalGeneration.from_pretrained(
62
- TAHKIK_MODEL, torch_dtype=torch_dtype
63
- ).to(device)
64
-
65
- # Patch missing generation config fields that some fine-tuned checkpoints omit.
66
- if not hasattr(model.generation_config, "lang_to_id") or model.generation_config.lang_to_id is None:
67
- print("[inference] patching generation config from base model...", flush=True)
68
- _base = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
69
- model.generation_config.lang_to_id = _base.generation_config.lang_to_id
70
- model.generation_config.id_to_lang = {v: k for k, v in _base.generation_config.lang_to_id.items()}
71
- model.generation_config.task_to_id = _base.generation_config.task_to_id
72
- del _base
73
-
74
  print("[inference] model ready", flush=True)
75
 
76
- # Global inference lock — one inference at a time to avoid GPU OOM.
77
  _inference_lock = asyncio.Lock()
78
 
79
  # ---------------------------------------------------------------------------
@@ -139,6 +123,7 @@ async def stream_transcribe(ws: WebSocket):
139
 
140
  # Accumulate raw PCM bytes from the client.
141
  audio_buffer = bytearray()
 
142
  last_inference_len = 0 # track buffer size at last inference to avoid redundant runs
143
 
144
  async def _run_partial(pcm_data: bytes):
@@ -147,9 +132,15 @@ async def stream_transcribe(ws: WebSocket):
147
  text = await asyncio.get_event_loop().run_in_executor(
148
  None, _transcribe_pcm_buffer, pcm_data
149
  )
150
- await ws.send_json({"type": "partial", "text": text})
 
 
 
 
151
  except Exception as e:
152
- print(f"[ws] partial inference error: {e}", flush=True)
 
 
153
 
154
  try:
155
  while True:
@@ -163,20 +154,50 @@ async def stream_transcribe(ws: WebSocket):
163
  buffer_samples = len(audio_buffer) // 2 # 16-bit = 2 bytes/sample
164
  new_samples = buffer_samples - (last_inference_len // 2)
165
 
166
- if buffer_samples >= MIN_SAMPLES_FOR_INFERENCE and new_samples >= (SAMPLE_RATE // 2):
167
- # Run partial inference ONLY if the CPU is free.
168
- # This prevents thousands of requests from queuing and timing out the final run.
169
- if not _inference_lock.locked():
170
- last_inference_len = len(audio_buffer)
171
- # We must run this in the background, otherwise we block ws.receive()
172
- asyncio.create_task(_run_partial(bytes(audio_buffer)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  # --- Text frame: control message ------------------------------
175
  elif "text" in message and message["text"] is not None:
176
  try:
177
  msg = json.loads(message["text"])
178
  except json.JSONDecodeError:
179
- await ws.send_json({"type": "error", "message": "invalid JSON"})
 
 
 
180
  continue
181
 
182
  if msg.get("type") == "stop":
@@ -184,12 +205,15 @@ async def stream_transcribe(ws: WebSocket):
184
 
185
  buffer_samples = len(audio_buffer) // 2
186
  if buffer_samples < MIN_SAMPLES_FOR_INFERENCE:
187
- await ws.send_json({
188
- "type": "final",
189
- "text": "",
190
- "confidence": 0.0,
191
- "processing_time_ms": 0,
192
- })
 
 
 
193
  else:
194
  t_start = time.time()
195
  async with _inference_lock:
@@ -198,22 +222,28 @@ async def stream_transcribe(ws: WebSocket):
198
  )
199
  elapsed = int((time.time() - t_start) * 1000)
200
 
201
- await ws.send_json({
202
- "type": "final",
203
- "text": text,
204
- "confidence": confidence,
205
- "processing_time_ms": elapsed,
206
- })
 
 
 
 
207
 
208
  # Reset for potential next session on the same connection.
209
  audio_buffer = bytearray()
 
210
  last_inference_len = 0
211
  break # Close after final result.
212
 
213
  except WebSocketDisconnect:
214
  print("[ws] client disconnected", flush=True)
215
  except Exception as exc:
216
- print(f"[ws] error: {exc}", flush=True)
 
217
  try:
218
  await ws.send_json({"type": "error", "message": str(exc)})
219
  except Exception:
@@ -236,8 +266,32 @@ def _pcm_bytes_to_float32(pcm_bytes: bytes) -> np.ndarray:
236
  return int16_array.astype(np.float32) / 32768.0
237
 
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  def _transcribe_pcm_buffer(pcm_bytes: bytes) -> str:
240
- """Run Whisper inference on raw PCM buffer, return text only."""
241
  audio_array = _pcm_bytes_to_float32(pcm_bytes)
242
 
243
  # Limit to last 30 seconds (Whisper's context window).
@@ -245,52 +299,41 @@ def _transcribe_pcm_buffer(pcm_bytes: bytes) -> str:
245
  if len(audio_array) > max_samples:
246
  audio_array = audio_array[-max_samples:]
247
 
248
- inputs = processor(
249
- audio_array, sampling_rate=SAMPLE_RATE, return_tensors="pt"
250
- ).input_features.to(device, dtype=torch_dtype)
251
-
252
- with torch.no_grad():
253
- outputs = model.generate(
254
- inputs,
255
- language="ar",
256
- task="transcribe",
257
- )
258
-
259
- text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
260
- return text
261
 
262
 
263
  def _transcribe_pcm_buffer_with_confidence(pcm_bytes: bytes) -> tuple:
264
- """Run Whisper inference on raw PCM buffer, return (text, confidence)."""
265
  audio_array = _pcm_bytes_to_float32(pcm_bytes)
266
-
267
  chunks = _split_audio(audio_array)
268
  all_texts = []
269
  all_scores = []
270
 
271
  for chunk in chunks:
272
- inputs = processor(
273
- chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt"
274
- ).input_features.to(device, dtype=torch_dtype)
275
-
276
- with torch.no_grad():
277
- outputs = model.generate(
278
- inputs,
279
- language="ar",
280
- task="transcribe",
281
- return_dict_in_generate=True,
282
- output_scores=True,
283
- )
284
-
285
- text = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0].strip()
286
- all_texts.append(text)
287
-
288
- if outputs.scores:
289
- token_probs = [F.softmax(s, dim=-1).max(dim=-1).values for s in outputs.scores]
290
- chunk_score = float(sum(p.mean().item() for p in token_probs) / len(token_probs))
291
  else:
292
- chunk_score = 1.0
293
- all_scores.append(chunk_score)
294
 
295
  transcription = " ".join(all_texts)
296
  confidence = round(sum(all_scores) / len(all_scores), 4) if all_scores else 0.0
@@ -316,39 +359,35 @@ def _split_audio(audio_array, sr=SAMPLE_RATE, chunk_s=CHUNK_LENGTH_S, overlap_s=
316
  def _transcribe_file(audio_path: str) -> dict:
317
  import librosa
318
 
319
- t_start = time.time()
320
  audio_array, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
321
 
322
- chunks = _split_audio(audio_array)
323
- all_texts = []
324
  all_scores = []
325
 
326
  for chunk in chunks:
327
- inputs = processor(
328
- chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt"
329
- ).input_features.to(device, dtype=torch_dtype)
330
-
331
- with torch.no_grad():
332
- outputs = model.generate(
333
- inputs,
334
- language="ar",
335
- task="transcribe",
336
- return_dict_in_generate=True,
337
- output_scores=True,
338
- )
339
-
340
- text = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0].strip()
341
- all_texts.append(text)
342
-
343
- if outputs.scores:
344
- token_probs = [F.softmax(s, dim=-1).max(dim=-1).values for s in outputs.scores]
345
- chunk_score = float(sum(p.mean().item() for p in token_probs) / len(token_probs))
346
  else:
347
- chunk_score = 1.0
348
- all_scores.append(chunk_score)
349
 
350
  return {
351
- "transcription": " ".join(all_texts),
352
- "confidence_score": round(sum(all_scores) / len(all_scores), 4) if all_scores else 0.0,
353
  "processing_time_ms": int((time.time() - t_start) * 1000),
354
  }
 
2
  """
3
  Tahkik Inference Server — Hugging Face Space entry point.
4
 
5
+ Loads the Whisper model ONCE at startup via faster-whisper (CTranslate2),
6
+ then serves:
7
  - POST /evaluate — batch transcription (upload a full audio file)
8
  - WS /ws/stream — real-time streaming transcription (send PCM chunks)
9
  """
10
 
11
  import asyncio
12
  import json
13
+ import math
14
  import os
 
 
15
  import time
16
  import tempfile
17
 
18
  # Redirect model caches to /tmp (only writable dir in HF Spaces)
19
  os.environ.setdefault("HF_HOME", "/tmp/huggingface_cache")
 
 
20
  os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
21
+ os.environ.setdefault("CT2_VERBOSE", "0")
22
 
23
  import numpy as np
24
  from fastapi import FastAPI, File, UploadFile, HTTPException, WebSocket, WebSocketDisconnect
25
  from fastapi.responses import JSONResponse
26
+ from faster_whisper import WhisperModel
 
 
27
 
28
  # ---------------------------------------------------------------------------
29
  # Constants
30
  # ---------------------------------------------------------------------------
31
 
32
+ TAHKIK_MODEL = "benhadjermed/tahkik-small-warsh-ct2"
33
  SAMPLE_RATE = 16000
34
  CHUNK_LENGTH_S = 30
35
  OVERLAP_S = 1
 
38
  MIN_AUDIO_FOR_INFERENCE_S = 1.0
39
  MIN_SAMPLES_FOR_INFERENCE = int(MIN_AUDIO_FOR_INFERENCE_S * SAMPLE_RATE)
40
 
41
+ SILENCE_THRESHOLD = 0.02 # RMS threshold for silence
42
+ SILENCE_DURATION_S = 0.8 # seconds of trailing silence to trigger finalization
43
+ SILENCE_SAMPLES = int(SILENCE_DURATION_S * SAMPLE_RATE)
44
+
45
  ALLOWED_EXTS = {".wav", ".m4a", ".mp3", ".flac", ".ogg"}
46
 
47
  # ---------------------------------------------------------------------------
48
  # Model loading (happens once at module import / server startup)
49
  # ---------------------------------------------------------------------------
50
 
51
+ print("[inference] loading faster-whisper model...", flush=True)
52
+ model = WhisperModel(
53
+ TAHKIK_MODEL,
54
+ device="cpu",
55
+ compute_type="int8",
56
+ download_root="/tmp/huggingface_cache",
 
 
57
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  print("[inference] model ready", flush=True)
59
 
60
+ # Global inference lock — one inference at a time to avoid resource contention.
61
  _inference_lock = asyncio.Lock()
62
 
63
  # ---------------------------------------------------------------------------
 
123
 
124
  # Accumulate raw PCM bytes from the client.
125
  audio_buffer = bytearray()
126
+ session_text = ""
127
  last_inference_len = 0 # track buffer size at last inference to avoid redundant runs
128
 
129
  async def _run_partial(pcm_data: bytes):
 
132
  text = await asyncio.get_event_loop().run_in_executor(
133
  None, _transcribe_pcm_buffer, pcm_data
134
  )
135
+ full_text = (session_text + " " + text).strip()
136
+ try:
137
+ await ws.send_json({"type": "partial", "text": full_text})
138
+ except Exception:
139
+ pass # Connection likely closed
140
  except Exception as e:
141
+ import traceback
142
+ err_msg = traceback.format_exc()
143
+ print(f"[ws] partial inference error:\n{err_msg}", flush=True)
144
 
145
  try:
146
  while True:
 
154
  buffer_samples = len(audio_buffer) // 2 # 16-bit = 2 bytes/sample
155
  new_samples = buffer_samples - (last_inference_len // 2)
156
 
157
+ if buffer_samples >= MIN_SAMPLES_FOR_INFERENCE:
158
+ if _has_trailing_silence(bytes(audio_buffer), SILENCE_THRESHOLD, SILENCE_SAMPLES):
159
+ print(f"[ws] auto-finalizing chunk due to silence", flush=True)
160
+ async with _inference_lock:
161
+ chunk_text = await asyncio.get_event_loop().run_in_executor(
162
+ None, _transcribe_pcm_buffer, bytes(audio_buffer)
163
+ )
164
+ session_text = (session_text + " " + chunk_text).strip()
165
+ try:
166
+ await ws.send_json({"type": "partial", "text": session_text})
167
+ except RuntimeError:
168
+ # Client closed connection while we were running inference
169
+ break
170
+
171
+ audio_buffer = bytearray()
172
+ last_inference_len = 0
173
+ continue
174
+
175
+ # Prevent OOM if mic is left on but user is entirely silent for 10s
176
+ if buffer_samples > SAMPLE_RATE * 10:
177
+ audio_array = _pcm_bytes_to_float32(bytes(audio_buffer))
178
+ if np.sqrt(np.mean(audio_array ** 2)) < SILENCE_THRESHOLD * 2:
179
+ print("[ws] buffer full of purely silence, dropping...", flush=True)
180
+ audio_buffer = bytearray()
181
+ last_inference_len = 0
182
+ continue
183
+
184
+ if new_samples >= (SAMPLE_RATE // 2):
185
+ # Run partial inference ONLY if the lock is free.
186
+ # This prevents thousands of requests from queuing and timing out the final run.
187
+ if not _inference_lock.locked():
188
+ last_inference_len = len(audio_buffer)
189
+ # Run in background so ws.receive() is not blocked.
190
+ asyncio.create_task(_run_partial(bytes(audio_buffer)))
191
 
192
  # --- Text frame: control message ------------------------------
193
  elif "text" in message and message["text"] is not None:
194
  try:
195
  msg = json.loads(message["text"])
196
  except json.JSONDecodeError:
197
+ try:
198
+ await ws.send_json({"type": "error", "message": "invalid JSON"})
199
+ except RuntimeError:
200
+ pass
201
  continue
202
 
203
  if msg.get("type") == "stop":
 
205
 
206
  buffer_samples = len(audio_buffer) // 2
207
  if buffer_samples < MIN_SAMPLES_FOR_INFERENCE:
208
+ try:
209
+ await ws.send_json({
210
+ "type": "final",
211
+ "text": session_text,
212
+ "confidence": 1.0,
213
+ "processing_time_ms": 0,
214
+ })
215
+ except RuntimeError:
216
+ pass
217
  else:
218
  t_start = time.time()
219
  async with _inference_lock:
 
222
  )
223
  elapsed = int((time.time() - t_start) * 1000)
224
 
225
+ final_text = (session_text + " " + text).strip()
226
+ try:
227
+ await ws.send_json({
228
+ "type": "final",
229
+ "text": final_text,
230
+ "confidence": confidence,
231
+ "processing_time_ms": elapsed,
232
+ })
233
+ except RuntimeError:
234
+ pass
235
 
236
  # Reset for potential next session on the same connection.
237
  audio_buffer = bytearray()
238
+ session_text = ""
239
  last_inference_len = 0
240
  break # Close after final result.
241
 
242
  except WebSocketDisconnect:
243
  print("[ws] client disconnected", flush=True)
244
  except Exception as exc:
245
+ import traceback
246
+ print(f"[ws] error:\n{traceback.format_exc()}", flush=True)
247
  try:
248
  await ws.send_json({"type": "error", "message": str(exc)})
249
  except Exception:
 
266
  return int16_array.astype(np.float32) / 32768.0
267
 
268
 
269
+ def _has_trailing_silence(pcm_bytes: bytes, threshold: float, duration_samples: int) -> bool:
270
+ """Check if the buffer ends with `duration_samples` samples whose RMS is below `threshold`, AND had speech before them."""
271
+ if len(pcm_bytes) < duration_samples * 2:
272
+ return False
273
+
274
+ audio_array = _pcm_bytes_to_float32(pcm_bytes)
275
+ trailing = audio_array[-duration_samples:]
276
+ rms = np.sqrt(np.mean(trailing ** 2))
277
+
278
+ if rms < threshold:
279
+ # Require some actual speech before the trailing silence to count as "trailing silence"
280
+ leading = audio_array[:-duration_samples]
281
+ if len(leading) > 0:
282
+ leading_rms = np.sqrt(np.mean(leading ** 2))
283
+ if leading_rms > threshold * 1.5:
284
+ return True
285
+ return False
286
+
287
+
288
+ def _logprob_to_confidence(avg_logprob: float) -> float:
289
+ """Convert faster-whisper's avg_logprob to a 0-1 confidence score via exp()."""
290
+ return math.exp(max(avg_logprob, -5.0)) # floor at -5.0 so confidence never drops below exp(-5) ~= 0.0067 (exp(-inf) would be 0)
291
+
292
+
293
  def _transcribe_pcm_buffer(pcm_bytes: bytes) -> str:
294
+ """Run faster-whisper inference on raw PCM buffer, return text only."""
295
  audio_array = _pcm_bytes_to_float32(pcm_bytes)
296
 
297
  # Limit to last 30 seconds (Whisper's context window).
 
299
  if len(audio_array) > max_samples:
300
  audio_array = audio_array[-max_samples:]
301
 
302
+ segments, _ = model.transcribe(
303
+ audio_array,
304
+ language="ar",
305
+ task="transcribe",
306
+ vad_filter=False,
307
+ )
308
+ return " ".join(seg.text.strip() for seg in segments)
 
 
 
 
 
 
309
 
310
 
311
  def _transcribe_pcm_buffer_with_confidence(pcm_bytes: bytes) -> tuple:
312
+ """Run faster-whisper inference on raw PCM buffer, return (text, confidence)."""
313
  audio_array = _pcm_bytes_to_float32(pcm_bytes)
 
314
  chunks = _split_audio(audio_array)
315
  all_texts = []
316
  all_scores = []
317
 
318
  for chunk in chunks:
319
+ segments, _ = model.transcribe(
320
+ chunk,
321
+ language="ar",
322
+ task="transcribe",
323
+ vad_filter=False,
324
+ )
325
+ chunk_texts = []
326
+ chunk_logprobs = []
327
+ for seg in segments:
328
+ chunk_texts.append(seg.text.strip())
329
+ chunk_logprobs.append(seg.avg_logprob)
330
+
331
+ all_texts.append(" ".join(chunk_texts))
332
+ if chunk_logprobs:
333
+ avg = sum(chunk_logprobs) / len(chunk_logprobs)
334
+ all_scores.append(_logprob_to_confidence(avg))
 
 
 
335
  else:
336
+ all_scores.append(1.0)
 
337
 
338
  transcription = " ".join(all_texts)
339
  confidence = round(sum(all_scores) / len(all_scores), 4) if all_scores else 0.0
 
359
  def _transcribe_file(audio_path: str) -> dict:
360
  import librosa
361
 
362
+ t_start = time.time()
363
  audio_array, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
364
 
365
+ chunks = _split_audio(audio_array)
366
+ all_texts = []
367
  all_scores = []
368
 
369
  for chunk in chunks:
370
+ segments, _ = model.transcribe(
371
+ chunk,
372
+ language="ar",
373
+ task="transcribe",
374
+ vad_filter=False,
375
+ )
376
+ chunk_texts = []
377
+ chunk_logprobs = []
378
+ for seg in segments:
379
+ chunk_texts.append(seg.text.strip())
380
+ chunk_logprobs.append(seg.avg_logprob)
381
+
382
+ all_texts.append(" ".join(chunk_texts))
383
+ if chunk_logprobs:
384
+ avg = sum(chunk_logprobs) / len(chunk_logprobs)
385
+ all_scores.append(_logprob_to_confidence(avg))
 
 
 
386
  else:
387
+ all_scores.append(1.0)
 
388
 
389
  return {
390
+ "transcription": " ".join(all_texts),
391
+ "confidence_score": round(sum(all_scores) / len(all_scores), 4) if all_scores else 0.0,
392
  "processing_time_ms": int((time.time() - t_start) * 1000),
393
  }
requirements.txt CHANGED
@@ -1,9 +1,7 @@
1
  fastapi
2
  uvicorn[standard]
3
- torch
4
- transformers
5
  librosa
6
  soundfile
7
- accelerate
8
  python-multipart
9
  numpy
 
1
  fastapi
2
  uvicorn[standard]
3
+ faster-whisper
 
4
  librosa
5
  soundfile
 
6
  python-multipart
7
  numpy