Spaces:

benhadjermed
/

tahkik-basic-warsh

Sleeping

App Files Files Community

benhadjermed committed on Apr 11

Commit

3f4cf11

verified ·

1 Parent(s): 0785bb2

feat: migrate to streaming transcriptions via WebSockets

Browse files

Files changed (4) hide show

Dockerfile +31 -0
README.md +48 -8
main.py +346 -0
requirements.txt +9 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+# ── Tahkik Inference Space ──────────────────────────────────────────────────
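+# CPU image. To enable GPU (T4/L4/A100), change the base image to a CUDA
+# runtime image, for example:
+#   FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
+# (install Python and pip in that image first), and install torch from the
+# matching CUDA wheel index instead of the plain `torch` entry in
+# requirements.txt.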
+# ---------------------------------------------------------------------------
+FROM python:3.10-slim
+# HF Spaces requires a non-root user with UID 1000.
+RUN useradd -m -u 1000 user
+WORKDIR /home/user/app
+# Install dependencies as root (before switching user).
+# requirements.txt is copied on its own so the dependency layer is only
+# rebuilt when the requirements change, not on every application code edit.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code.
+COPY --chown=user . .
+# Redirect all model/cache downloads to /tmp (only writable path in Spaces).
+# main.py sets the same four variables with os.environ.setdefault, so the
+# values given here take precedence inside the container.
+ENV HF_HOME=/tmp/huggingface_cache
+ENV TORCH_HOME=/tmp/torch_cache
+ENV TRANSFORMERS_VERBOSITY=error
+ENV HF_HUB_DISABLE_PROGRESS_BARS=1
+# Run the server as the unprivileged user created above.
+USER user
+# Must match `app_port` in the README.md front matter.
+EXPOSE 7860
+# Serve the FastAPI instance named `app` defined in main.py.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,52 @@
 ---
 title: Tahkik Basic Warsh
-emoji: 📊
-colorFrom: blue
-colorTo: gray
-sdk: gradio
-sdk_version: 6.12.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Tahkik Basic Warsh
+emoji: 📖
+colorFrom: green
+colorTo: blue
+sdk: docker
+app_port: 7860
 ---
+# Tahkik Inference API
+FastAPI inference server for the `benhadjermed/tahkik-basic-warsh` Whisper model.
+Accepts Arabic Quranic audio and returns a transcription with a confidence score.
+## Endpoints
+| Method | Path         | Description                                     |
+|--------|--------------|-------------------------------------------------|
+| GET    | `/health`    | Liveness check                                  |
+| POST   | `/evaluate`  | Transcribe an audio file                        |
+| WS     | `/ws/stream` | Real-time streaming transcription (PCM chunks)  |
+## POST /evaluate
+**Request** — `multipart/form-data`
+| Field   | Type | Required | Notes                                      |
+|---------|------|----------|--------------------------------------------|
+| `audio` | file | yes      | `.wav`, `.mp3`, `.m4a`, `.flac`, or `.ogg` |
+**Response** — `application/json`
+```json
+{
+  "transcription": "الحمد لله رب العالمين",
+  "confidence_score": 0.9423,
+  "processing_time_ms": 1350
+}
+```
+**Error** — non-200 status
+```json
+{
+  "detail": "unsupported audio format: .xyz"
+}
+```
+## Environment / Secrets
+| Name       | Where to set      | Purpose                                        |
+|------------|-------------------|------------------------------------------------|
+| `HF_TOKEN` | Space secret      | Required if `tahkik-basic-warsh` is private    |

main.py ADDED Viewed

	@@ -0,0 +1,346 @@

+#!/usr/bin/env python3
+"""
+Tahkik Inference Server — Hugging Face Space entry point.
+Loads the Whisper model ONCE at startup, then serves:
+  - POST /evaluate   — batch transcription (upload a full audio file)
+  - WS   /ws/stream  — real-time streaming transcription (send PCM chunks)
+"""
+import asyncio
+import json
+import os
+import sys
+import struct
+import time
+import tempfile
+# Redirect model caches to /tmp (only writable dir in HF Spaces)
+os.environ.setdefault("HF_HOME", "/tmp/huggingface_cache")
+os.environ.setdefault("TORCH_HOME", "/tmp/torch_cache")
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+import numpy as np
+from fastapi import FastAPI, File, UploadFile, HTTPException, WebSocket, WebSocketDisconnect
+from fastapi.responses import JSONResponse
+import torch
+import torch.nn.functional as F
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# Hugging Face repo id of the fine-tuned Whisper checkpoint loaded at startup.
+TAHKIK_MODEL   = "benhadjermed/tahkik-basic-warsh"
+# Sample rate in Hz used for every audio array handed to the processor.
+SAMPLE_RATE    = 16000
+# Length in seconds of each chunk fed to the model; also caps streaming partials.
+CHUNK_LENGTH_S = 30
+# Seconds shared between consecutive chunks when long audio is split.
+OVERLAP_S      = 1
+# Minimum seconds of audio before running partial inference (reduces hallucinations)
+# The same threshold gates the "stop" request: shorter recordings get an empty final.
+MIN_AUDIO_FOR_INFERENCE_S = 1.5
+MIN_SAMPLES_FOR_INFERENCE = int(MIN_AUDIO_FOR_INFERENCE_S * SAMPLE_RATE)
+# File extensions accepted by POST /evaluate (compared after lower-casing).
+ALLOWED_EXTS = {".wav", ".m4a", ".mp3", ".flac", ".ogg"}
+# ---------------------------------------------------------------------------
+# Model loading (happens once at module import / server startup)
+# ---------------------------------------------------------------------------
+# NOTE: torch and transformers are already imported by the time this line runs;
+# the message is only a startup progress marker.
+print("[inference] importing torch / transformers...", flush=True)
+# Use the first CUDA device with half precision when available, otherwise
+# fall back to CPU with full precision.
+device      = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+print(f"[inference] device: {device}", flush=True)
+print("[inference] loading processor (openai/whisper-base)...", flush=True)
+# The processor (feature extractor + tokenizer) comes from the base Whisper
+# repo, not from TAHKIK_MODEL.
+# NOTE(review): presumably the fine-tuned repo shares the base tokenizer; confirm.
+processor = WhisperProcessor.from_pretrained(
+    "openai/whisper-base", language="Arabic", task="transcribe"
+)
+print(f"[inference] loading model ({TAHKIK_MODEL})...", flush=True)
+model = WhisperForConditionalGeneration.from_pretrained(
+    TAHKIK_MODEL, torch_dtype=torch_dtype
+).to(device)
+# Patch missing generation config fields that some fine-tuned checkpoints omit.
+# The language/task lookup tables are copied from the base model's generation
+# config; the base model is loaded only for that purpose and released afterwards.
+if not hasattr(model.generation_config, "lang_to_id") or model.generation_config.lang_to_id is None:
+    print("[inference] patching generation config from base model...", flush=True)
+    _base = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+    model.generation_config.lang_to_id = _base.generation_config.lang_to_id
+    # Reverse mapping (id -> language token) derived from lang_to_id.
+    model.generation_config.id_to_lang = {v: k for k, v in _base.generation_config.lang_to_id.items()}
+    model.generation_config.task_to_id = _base.generation_config.task_to_id
+    del _base
+print("[inference] model ready", flush=True)
+# Global inference lock — one inference at a time to avoid GPU OOM.
+# Request handlers are expected to hold it for the duration of a model call.
+_inference_lock = asyncio.Lock()
+# ---------------------------------------------------------------------------
+# FastAPI app
+# ---------------------------------------------------------------------------
+# ASGI application object; uvicorn is pointed at it as "main:app".
+app = FastAPI(title="Tahkik Inference API")
+@app.get("/health")
+def health():
+    """Liveness probe: answer with a constant payload while the server is up."""
+    payload = dict(status="ok")
+    return payload
+# ---------------------------------------------------------------------------
+# POST /evaluate — batch transcription (backward compatible)
+# ---------------------------------------------------------------------------
+@app.post("/evaluate")
+async def evaluate(audio: UploadFile = File(...)):
+    """Transcribe one uploaded audio file and return the result as JSON.
+    The upload is written to a temporary file under /tmp, transcribed with
+    _transcribe_file, and the temporary file is always removed afterwards.
+    Args:
+        audio: multipart upload; its extension must be in ALLOWED_EXTS
+            (a missing filename or extension is treated as ".wav").
+    Returns:
+        JSONResponse wrapping the dict produced by _transcribe_file.
+    Raises:
+        HTTPException: 400 for an unsupported extension, 500 when writing the
+            temporary file or running the transcription fails.
+    """
+    filename = audio.filename or "recording.wav"
+    ext = os.path.splitext(filename)[1].lower() or ".wav"
+    if ext not in ALLOWED_EXTS:
+        raise HTTPException(status_code=400, detail=f"unsupported audio format: {ext}")
+    data = await audio.read()
+    tmp_path = None
+    try:
+        # delete=False because the file is reopened by path for decoding;
+        # the path is recorded before writing so cleanup also covers a failed write.
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False, dir="/tmp") as f:
+            tmp_path = f.name
+            f.write(data)
+        # Model inference is blocking: run it in the default executor so the
+        # event loop keeps serving other connections, and hold the global lock
+        # so it never overlaps with a streaming inference.
+        async with _inference_lock:
+            result = await asyncio.get_running_loop().run_in_executor(
+                None, _transcribe_file, tmp_path
+            )
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    finally:
+        if tmp_path is not None:
+            os.unlink(tmp_path)
+    return JSONResponse(result)
+# ---------------------------------------------------------------------------
+# WS /ws/stream — real-time streaming transcription
+# ---------------------------------------------------------------------------
+@app.websocket("/ws/stream")
+async def stream_transcribe(ws: WebSocket):
+    """
+    Real-time streaming transcription over WebSocket.
+    Protocol:
+      Client → Server:
+        - Binary frames: raw PCM 16-bit signed LE, 16 kHz, mono
+        - Text frame: JSON {"type": "stop"} to signal end of recording
+      Server → Client:
+        - Text frames: JSON messages
+          {"type": "partial", "text": "..."} — intermediate transcription
+          {"type": "final",   "text": "...", "confidence": 0.94, "processing_time_ms": 1234}
+          {"type": "error",   "message": "..."}
+    Behaviour:
+      - A partial is sent once at least MIN_SAMPLES_FOR_INFERENCE samples are
+        buffered and at least half a second of new audio has arrived since
+        the previous partial.
+      - On "stop" a final message is sent and the server closes the connection;
+        a buffer shorter than MIN_SAMPLES_FOR_INFERENCE yields an empty final.
+      - JSON text frames with any other "type" are ignored; text that is not a
+        JSON object is answered with an error message and the stream continues.
+    """
+    await ws.accept()
+    print("[ws] client connected", flush=True)
+    loop = asyncio.get_running_loop()
+    # Accumulate raw PCM bytes from the client.
+    audio_buffer = bytearray()
+    last_inference_len = 0  # buffer size (bytes) at the last partial inference
+    try:
+        while True:
+            message = await ws.receive()
+            # ws.receive() returns the raw ASGI message, so a client disconnect
+            # arrives as a message rather than as WebSocketDisconnect. Stop here
+            # instead of calling receive() again, which would raise.
+            if message.get("type") == "websocket.disconnect":
+                print("[ws] client disconnected", flush=True)
+                break
+            # --- Binary frame: audio chunk --------------------------------
+            chunk = message.get("bytes")
+            if chunk is not None:
+                audio_buffer.extend(chunk)
+                # Only run inference if we have enough new audio.
+                buffer_samples = len(audio_buffer) // 2  # 16-bit = 2 bytes/sample
+                new_samples = buffer_samples - (last_inference_len // 2)
+                if buffer_samples >= MIN_SAMPLES_FOR_INFERENCE and new_samples >= (SAMPLE_RATE // 2):
+                    # Run partial inference on a snapshot of the accumulated buffer.
+                    async with _inference_lock:
+                        text = await loop.run_in_executor(
+                            None, _transcribe_pcm_buffer, bytes(audio_buffer)
+                        )
+                    last_inference_len = len(audio_buffer)
+                    await ws.send_json({"type": "partial", "text": text})
+                continue
+            # --- Text frame: control message ------------------------------
+            raw_text = message.get("text")
+            if raw_text is None:
+                continue
+            try:
+                msg = json.loads(raw_text)
+            except json.JSONDecodeError:
+                await ws.send_json({"type": "error", "message": "invalid JSON"})
+                continue
+            # Valid JSON that is not an object (e.g. a list or number) has no
+            # "type" field; report it instead of failing on msg.get().
+            if not isinstance(msg, dict):
+                await ws.send_json({"type": "error", "message": "expected a JSON object"})
+                continue
+            if msg.get("type") != "stop":
+                continue
+            print(f"[ws] stop received, buffer size: {len(audio_buffer)} bytes", flush=True)
+            if len(audio_buffer) // 2 < MIN_SAMPLES_FOR_INFERENCE:
+                # Too little audio to transcribe reliably: reply with an empty final.
+                await ws.send_json({
+                    "type": "final",
+                    "text": "",
+                    "confidence": 0.0,
+                    "processing_time_ms": 0,
+                })
+            else:
+                # Timing includes any wait for the inference lock.
+                t_start = time.time()
+                async with _inference_lock:
+                    text, confidence = await loop.run_in_executor(
+                        None, _transcribe_pcm_buffer_with_confidence, bytes(audio_buffer)
+                    )
+                elapsed = int((time.time() - t_start) * 1000)
+                await ws.send_json({
+                    "type": "final",
+                    "text": text,
+                    "confidence": confidence,
+                    "processing_time_ms": elapsed,
+                })
+            break  # Close after final result.
+    except WebSocketDisconnect:
+        print("[ws] client disconnected", flush=True)
+    except Exception as exc:
+        print(f"[ws] error: {exc}", flush=True)
+        # Best effort: the socket may already be unusable at this point.
+        try:
+            await ws.send_json({"type": "error", "message": str(exc)})
+        except Exception:
+            pass
+    finally:
+        # Best effort: closing an already-closed socket raises, which is fine.
+        try:
+            await ws.close()
+        except Exception:
+            pass
+        print("[ws] connection closed", flush=True)
+# ---------------------------------------------------------------------------
+# Inference helpers
+# ---------------------------------------------------------------------------
+def _pcm_bytes_to_float32(pcm_bytes: bytes) -> np.ndarray:
+    """Convert raw PCM 16-bit signed little-endian bytes to float32 samples.
+    Args:
+        pcm_bytes: concatenated 2-byte samples. A trailing odd byte (half of a
+            sample, e.g. when a frame boundary splits one) is ignored.
+    Returns:
+        1-D float32 array with values in [-1.0, 1.0); empty for empty input.
+    """
+    # Drop an incomplete trailing sample; np.frombuffer rejects buffers whose
+    # size is not a multiple of the element size.
+    usable = len(pcm_bytes) - (len(pcm_bytes) % 2)
+    # "<i2" pins little-endian int16 regardless of the host byte order.
+    int16_array = np.frombuffer(memoryview(pcm_bytes)[:usable], dtype="<i2")
+    return int16_array.astype(np.float32) / 32768.0
+def _transcribe_pcm_buffer(pcm_bytes: bytes) -> str:
+    """Transcribe the tail of a raw PCM buffer and return only the text.
+    Used for streaming partials: a single model call over at most the last
+    CHUNK_LENGTH_S seconds of audio, with no confidence computation.
+    """
+    samples = _pcm_bytes_to_float32(pcm_bytes)
+    # Keep only the most recent CHUNK_LENGTH_S seconds (Whisper's context
+    # window); a negative slice longer than the array returns the whole array.
+    window = CHUNK_LENGTH_S * SAMPLE_RATE
+    samples = samples[-window:]
+    features = processor(samples, sampling_rate=SAMPLE_RATE, return_tensors="pt").input_features
+    features = features.to(device, dtype=torch_dtype)
+    with torch.no_grad():
+        token_ids = model.generate(features, language="ar", task="transcribe")
+    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
+    return decoded[0].strip()
+def _transcribe_pcm_buffer_with_confidence(pcm_bytes: bytes) -> tuple:
+    """Transcribe a whole raw PCM buffer and return (text, confidence).
+    The audio is split with _split_audio and each chunk is decoded on its own.
+    Chunk texts are joined with single spaces. A chunk's score is the mean,
+    over generation steps, of the highest softmax probability at that step
+    (1.0 when the model returns no step scores); the confidence is the mean of
+    the chunk scores rounded to 4 decimals, or 0.0 when there are no chunks.
+    """
+    samples = _pcm_bytes_to_float32(pcm_bytes)
+    texts = []
+    scores = []
+    for segment in _split_audio(samples):
+        features = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt").input_features
+        features = features.to(device, dtype=torch_dtype)
+        with torch.no_grad():
+            generated = model.generate(
+                features,
+                language="ar",
+                task="transcribe",
+                return_dict_in_generate=True,
+                output_scores=True,
+            )
+        decoded = processor.batch_decode(generated.sequences, skip_special_tokens=True)
+        texts.append(decoded[0].strip())
+        step_logits = generated.scores
+        if not step_logits:
+            scores.append(1.0)
+            continue
+        # One entry per generation step: the top probability at that step.
+        top_probs = [
+            F.softmax(logits, dim=-1).max(dim=-1).values.mean().item()
+            for logits in step_logits
+        ]
+        scores.append(float(sum(top_probs) / len(top_probs)))
+    joined = " ".join(texts)
+    if not scores:
+        return joined, 0.0
+    return joined, round(sum(scores) / len(scores), 4)
+def _split_audio(audio_array, sr=SAMPLE_RATE, chunk_s=CHUNK_LENGTH_S, overlap_s=OVERLAP_S):
+    """Split a 1-D sample array into overlapping chunks.
+    Chunks are chunk_s seconds long and start every (chunk_s - overlap_s)
+    seconds. If fewer than 2 seconds of audio would remain after the next
+    step, that tail is folded into the current chunk instead of becoming a
+    tiny chunk of its own, so the last chunk can be longer than chunk_s
+    (by less than 2 - overlap_s seconds).
+    NOTE(review): with the defaults the last chunk can reach ~31 s; confirm the
+    feature extractor does not cut it back to 30 s and drop the end.
+    Args:
+        audio_array: sliceable sequence of samples.
+        sr: sample rate in Hz.
+        chunk_s: chunk length in seconds.
+        overlap_s: overlap between consecutive chunks in seconds.
+    Returns:
+        List of slices of audio_array; empty when audio_array is empty.
+    Raises:
+        ValueError: if the chunk length or the step between chunk starts is
+            not positive (previously this looped forever).
+    """
+    chunk_len = int(chunk_s * sr)
+    step_len  = int((chunk_s - overlap_s) * sr)
+    if chunk_len <= 0 or step_len <= 0:
+        raise ValueError(
+            f"chunk_s ({chunk_s}) must be positive and greater than overlap_s ({overlap_s})"
+        )
+    total    = len(audio_array)
+    min_tail = 2 * sr
+    chunks = []
+    start  = 0
+    while start < total:
+        next_start = start + step_len
+        remaining  = total - next_start
+        if 0 < remaining < min_tail:
+            # Short tail: extend this chunk to the end and stop.
+            chunks.append(audio_array[start:])
+            break
+        # Slicing clamps at the end of the array, so the last chunk may be short.
+        chunks.append(audio_array[start:start + chunk_len])
+        start = next_start
+    return chunks
+def _transcribe_file(audio_path: str) -> dict:
+    """Transcribe an audio file from disk, chunk by chunk.
+    The file is loaded with librosa at SAMPLE_RATE, split with _split_audio,
+    and every chunk is decoded separately.
+    Returns:
+        dict with "transcription" (chunk texts joined by spaces),
+        "confidence_score" (mean of per-chunk scores rounded to 4 decimals,
+        0.0 when there are no chunks) and "processing_time_ms" (wall time of
+        this call, including loading the file).
+    """
+    import librosa
+    started = time.time()
+    waveform, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
+    texts = []
+    scores = []
+    for segment in _split_audio(waveform):
+        features = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt").input_features
+        features = features.to(device, dtype=torch_dtype)
+        with torch.no_grad():
+            generated = model.generate(
+                features,
+                language="ar",
+                task="transcribe",
+                return_dict_in_generate=True,
+                output_scores=True,
+            )
+        decoded = processor.batch_decode(generated.sequences, skip_special_tokens=True)
+        texts.append(decoded[0].strip())
+        step_logits = generated.scores
+        if not step_logits:
+            # No per-step scores returned: treat the chunk as fully confident.
+            scores.append(1.0)
+            continue
+        # Per generation step, the highest softmax probability.
+        top_probs = [
+            F.softmax(logits, dim=-1).max(dim=-1).values.mean().item()
+            for logits in step_logits
+        ]
+        scores.append(float(sum(top_probs) / len(top_probs)))
+    confidence = round(sum(scores) / len(scores), 4) if scores else 0.0
+    return {
+        "transcription":     " ".join(texts),
+        "confidence_score":  confidence,
+        "processing_time_ms": int((time.time() - started) * 1000),
+    }

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi
+uvicorn[standard]
+torch
+transformers
+librosa
+soundfile
+accelerate
+python-multipart
+numpy