ConvxO2 committed on
Commit
f54b658
·
1 Parent(s): 7ff62ef

Fix portability, model cache handling, and deploy token safety

Browse files
Files changed (3) hide show
  1. app/main.py +63 -52
  2. deploy_hf.py +60 -0
  3. models/embedder.py +35 -28
app/main.py CHANGED
@@ -1,9 +1,5 @@
1
- """
2
- Speaker Diarization API - FastAPI Application
3
- """
4
 
5
- import io
6
- import time
7
  import asyncio
8
  import tempfile
9
  import traceback
@@ -19,12 +15,9 @@ from fastapi import (
19
  from fastapi.middleware.cors import CORSMiddleware
20
  from fastapi.staticfiles import StaticFiles
21
  from fastapi.responses import HTMLResponse
22
- from pydantic import BaseModel, Field
23
  from loguru import logger
24
 
25
- # ---------------------------------------------------------------------------
26
- # Schemas
27
- # ---------------------------------------------------------------------------
28
 
29
  class SegmentOut(BaseModel):
30
  start: float
@@ -49,13 +42,9 @@ class HealthResponse(BaseModel):
49
  version: str = "1.0.0"
50
 
51
 
52
- # ---------------------------------------------------------------------------
53
- # App
54
- # ---------------------------------------------------------------------------
55
-
56
  app = FastAPI(
57
  title="Speaker Diarization API",
58
- description="Who Spoke When Speaker diarization using ECAPA-TDNN + AHC Clustering",
59
  version="1.0.0",
60
  )
61
 
@@ -69,12 +58,16 @@ app.add_middleware(
69
 
70
  _pipeline = None
71
 
 
72
  def get_pipeline():
73
  global _pipeline
74
  if _pipeline is None:
75
  from app.pipeline import DiarizationPipeline
76
- import os
77
- cache_dir = os.getenv("CACHE_DIR", "/tmp/model_cache")
 
 
 
78
  _pipeline = DiarizationPipeline(
79
  device="auto",
80
  use_pyannote_vad=True,
@@ -85,10 +78,6 @@ def get_pipeline():
85
  return _pipeline
86
 
87
 
88
- # ---------------------------------------------------------------------------
89
- # Endpoints
90
- # ---------------------------------------------------------------------------
91
-
92
  @app.get("/health", response_model=HealthResponse, tags=["System"])
93
  async def health_check():
94
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -136,6 +125,7 @@ async def diarize_from_url(
136
  ):
137
  """Diarize audio from a URL."""
138
  import httpx
 
139
  try:
140
  async with httpx.AsyncClient(timeout=60.0) as client:
141
  resp = await client.get(audio_url)
@@ -169,6 +159,7 @@ async def stream_diarization(websocket: WebSocket):
169
  """Real-time streaming diarization via WebSocket."""
170
  await websocket.accept()
171
  import numpy as np
 
172
  audio_buffer = bytearray()
173
  sample_rate = 16000
174
  num_speakers = None
@@ -179,10 +170,12 @@ async def stream_diarization(websocket: WebSocket):
179
  sample_rate = config_msg.get("sample_rate", 16000)
180
  num_speakers = config_msg.get("num_speakers", None)
181
 
182
- await websocket.send_json({
183
- "type": "progress",
184
- "data": {"message": "Config received. Send audio chunks.", "chunks_received": 0},
185
- })
 
 
186
 
187
  while True:
188
  try:
@@ -194,12 +187,18 @@ async def stream_diarization(websocket: WebSocket):
194
  if "bytes" in msg:
195
  audio_buffer.extend(msg["bytes"])
196
  chunk_count += 1
197
- await websocket.send_json({
198
- "type": "progress",
199
- "data": {"message": f"Received chunk {chunk_count}", "chunks_received": chunk_count},
200
- })
 
 
 
 
 
201
  elif "text" in msg:
202
  import json
 
203
  data = json.loads(msg["text"])
204
  if data.get("type") == "eof":
205
  break
@@ -208,14 +207,17 @@ async def stream_diarization(websocket: WebSocket):
208
  await websocket.send_json({"type": "error", "data": {"message": "No audio received"}})
209
  return
210
 
211
- import torch
 
212
  audio_np = np.frombuffer(audio_buffer, dtype=np.float32).copy()
213
- audio_tensor = torch.from_numpy(audio_np)
214
 
215
- await websocket.send_json({
216
- "type": "progress",
217
- "data": {"message": "Running diarization pipeline..."},
218
- })
 
 
219
 
220
  loop = asyncio.get_event_loop()
221
  pipeline = get_pipeline()
@@ -227,15 +229,17 @@ async def stream_diarization(websocket: WebSocket):
227
  for seg in result.segments:
228
  await websocket.send_json({"type": "segment", "data": seg.to_dict()})
229
 
230
- await websocket.send_json({
231
- "type": "done",
232
- "data": {
233
- "num_speakers": result.num_speakers,
234
- "total_segments": len(result.segments),
235
- "audio_duration": result.audio_duration,
236
- "processing_time": result.processing_time,
237
- },
238
- })
 
 
239
 
240
  except WebSocketDisconnect:
241
  logger.info("WebSocket client disconnected")
@@ -249,26 +253,33 @@ async def stream_diarization(websocket: WebSocket):
249
 
250
  @app.get("/", response_class=HTMLResponse, include_in_schema=False)
251
  async def serve_ui():
252
- ui_path = Path("static/index.html")
253
  if ui_path.exists():
254
- return HTMLResponse(ui_path.read_text())
255
- return HTMLResponse("<h1>Speaker Diarization API</h1><p><a href='/docs'>API Docs</a></p>")
 
256
 
257
  @app.get("/debug", tags=["System"])
258
  async def debug():
259
- import speechbrain
260
- import os
261
  import inspect
 
262
  from speechbrain.inference.classifiers import EncoderClassifier
 
 
 
 
 
263
  sig = str(inspect.signature(EncoderClassifier.from_hparams))
264
  return {
265
  "speechbrain_version": speechbrain.__version__,
266
- "tmp_writable": os.access("/tmp", os.W_OK),
267
- "cache_exists": os.path.exists("/tmp/model_cache"),
 
 
268
  "from_hparams_signature": sig,
269
  }
270
 
271
 
272
- static_dir = Path("static")
273
  if static_dir.exists():
274
- app.mount("/static", StaticFiles(directory="static"), name="static")
 
1
+ """Speaker Diarization API - FastAPI Application."""
 
 
2
 
 
 
3
  import asyncio
4
  import tempfile
5
  import traceback
 
15
  from fastapi.middleware.cors import CORSMiddleware
16
  from fastapi.staticfiles import StaticFiles
17
  from fastapi.responses import HTMLResponse
18
+ from pydantic import BaseModel
19
  from loguru import logger
20
 
 
 
 
21
 
22
  class SegmentOut(BaseModel):
23
  start: float
 
42
  version: str = "1.0.0"
43
 
44
 
 
 
 
 
45
  app = FastAPI(
46
  title="Speaker Diarization API",
47
+ description="Who Spoke When - Speaker diarization using ECAPA-TDNN + AHC Clustering",
48
  version="1.0.0",
49
  )
50
 
 
58
 
59
  _pipeline = None
60
 
61
+
62
  def get_pipeline():
63
  global _pipeline
64
  if _pipeline is None:
65
  from app.pipeline import DiarizationPipeline
66
+
67
+ cache_dir = os.getenv(
68
+ "CACHE_DIR",
69
+ str(Path(tempfile.gettempdir()) / "model_cache"),
70
+ )
71
  _pipeline = DiarizationPipeline(
72
  device="auto",
73
  use_pyannote_vad=True,
 
78
  return _pipeline
79
 
80
 
 
 
 
 
81
  @app.get("/health", response_model=HealthResponse, tags=["System"])
82
  async def health_check():
83
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
125
  ):
126
  """Diarize audio from a URL."""
127
  import httpx
128
+
129
  try:
130
  async with httpx.AsyncClient(timeout=60.0) as client:
131
  resp = await client.get(audio_url)
 
159
  """Real-time streaming diarization via WebSocket."""
160
  await websocket.accept()
161
  import numpy as np
162
+
163
  audio_buffer = bytearray()
164
  sample_rate = 16000
165
  num_speakers = None
 
170
  sample_rate = config_msg.get("sample_rate", 16000)
171
  num_speakers = config_msg.get("num_speakers", None)
172
 
173
+ await websocket.send_json(
174
+ {
175
+ "type": "progress",
176
+ "data": {"message": "Config received. Send audio chunks.", "chunks_received": 0},
177
+ }
178
+ )
179
 
180
  while True:
181
  try:
 
187
  if "bytes" in msg:
188
  audio_buffer.extend(msg["bytes"])
189
  chunk_count += 1
190
+ await websocket.send_json(
191
+ {
192
+ "type": "progress",
193
+ "data": {
194
+ "message": f"Received chunk {chunk_count}",
195
+ "chunks_received": chunk_count,
196
+ },
197
+ }
198
+ )
199
  elif "text" in msg:
200
  import json
201
+
202
  data = json.loads(msg["text"])
203
  if data.get("type") == "eof":
204
  break
 
207
  await websocket.send_json({"type": "error", "data": {"message": "No audio received"}})
208
  return
209
 
210
+ import torch as torch_local
211
+
212
  audio_np = np.frombuffer(audio_buffer, dtype=np.float32).copy()
213
+ audio_tensor = torch_local.from_numpy(audio_np)
214
 
215
+ await websocket.send_json(
216
+ {
217
+ "type": "progress",
218
+ "data": {"message": "Running diarization pipeline..."},
219
+ }
220
+ )
221
 
222
  loop = asyncio.get_event_loop()
223
  pipeline = get_pipeline()
 
229
  for seg in result.segments:
230
  await websocket.send_json({"type": "segment", "data": seg.to_dict()})
231
 
232
+ await websocket.send_json(
233
+ {
234
+ "type": "done",
235
+ "data": {
236
+ "num_speakers": result.num_speakers,
237
+ "total_segments": len(result.segments),
238
+ "audio_duration": result.audio_duration,
239
+ "processing_time": result.processing_time,
240
+ },
241
+ }
242
+ )
243
 
244
  except WebSocketDisconnect:
245
  logger.info("WebSocket client disconnected")
 
253
 
254
@app.get("/", response_class=HTMLResponse, include_in_schema=False)
async def serve_ui():
    """Serve the bundled single-page UI, falling back to a minimal landing page.

    The UI is looked up relative to this file (``<repo>/static/index.html``) so
    the route works regardless of the process working directory.
    """
    index_file = Path(__file__).resolve().parent.parent / "static" / "index.html"
    if not index_file.exists():
        return HTMLResponse("<h1>Speaker Diarization API</h1><p><a href='/docs'>API Docs</a></p>")
    return HTMLResponse(index_file.read_text(encoding="utf-8"))
260
+
261
 
262
@app.get("/debug", tags=["System"])
async def debug():
    """Report runtime diagnostics: speechbrain version, temp/cache paths and writability."""
    import inspect
    import speechbrain
    from speechbrain.inference.classifiers import EncoderClassifier

    tmp_root = tempfile.gettempdir()
    # Same default as get_pipeline(): CACHE_DIR env var, else <tmp>/model_cache.
    cache_dir = os.getenv("CACHE_DIR", str(Path(tmp_root) / "model_cache"))
    signature_text = str(inspect.signature(EncoderClassifier.from_hparams))
    return {
        "speechbrain_version": speechbrain.__version__,
        "temp_dir": tmp_root,
        "temp_writable": os.access(tmp_root, os.W_OK),
        "cache_dir": cache_dir,
        "cache_exists": os.path.exists(cache_dir),
        "from_hparams_signature": signature_text,
    }
281
 
282
 
283
# Mount the on-disk static asset directory only when it ships with the app;
# resolved relative to this file so it is independent of the working directory.
static_dir = Path(__file__).resolve().parent.parent / "static"
if static_dir.exists():
    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
deploy_hf.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Deploy this project to a Hugging Face Space."""
3
+
4
+ import os
5
+ import subprocess
6
+ import sys
7
+
8
+ from huggingface_hub import HfApi
9
+
10
+
11
def require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Aborts the program with :class:`SystemExit` when the variable is unset
    or empty — both count as "missing" for deployment credentials.
    """
    if value := os.getenv(name):
        return value
    raise SystemExit(f"Missing required environment variable: {name}")
16
+
17
+
18
def main() -> None:
    """Create (or reuse) the target Hugging Face Space and push the current branch to it.

    Required env: ``HF_TOKEN``.
    Optional env: ``HF_SPACE_NAME`` (default ``who-spoke-when``), ``HF_USERNAME``
    (defaults to the token's identity), ``HF_FORCE_PUSH`` (truthy enables ``--force``).

    Raises:
        SystemExit: when the token is missing or the Space cannot be created.
        subprocess.CalledProcessError: when a git command fails (``check=True``).
    """
    token = require_env("HF_TOKEN")
    space_name = os.getenv("HF_SPACE_NAME", "who-spoke-when")

    api = HfApi(token=token)

    # Resolve the owning account; fall back to whoever the token belongs to.
    username = os.getenv("HF_USERNAME")
    if not username:
        whoami = api.whoami(token=token)
        username = whoami["name"]

    space_id = f"{username}/{space_name}"

    try:
        api.create_repo(
            repo_id=space_id,
            repo_type="space",
            space_sdk="docker",
            private=False,
            token=token,
            exist_ok=True,  # idempotent: re-running the deploy reuses the Space
        )
        print(f"Space ready: {space_id}")
    except Exception as exc:
        raise SystemExit(f"Failed to create or fetch space '{space_id}': {exc}") from exc

    # SECURITY: the token rides in the remote URL only for the duration of the
    # push. Previously the remote (and thus the plaintext token) was left
    # behind in .git/config; the finally block below scrubs it.
    remote_url = f"https://{username}:{token}@huggingface.co/spaces/{space_id}"
    subprocess.run(["git", "remote", "remove", "huggingface"], check=False, capture_output=True)
    subprocess.run(["git", "remote", "add", "huggingface", remote_url], check=True)

    try:
        push_cmd = ["git", "push", "huggingface", "main"]
        if os.getenv("HF_FORCE_PUSH", "false").lower() in {"1", "true", "yes"}:
            push_cmd.append("--force")

        subprocess.run(push_cmd, check=True)
        print(f"Pushed to https://huggingface.co/spaces/{space_id}")
    finally:
        # Remove the token-bearing remote whether or not the push succeeded,
        # so the credential never persists on disk.
        subprocess.run(["git", "remote", "remove", "huggingface"], check=False, capture_output=True)
54
+
55
+
56
if __name__ == "__main__":
    # Propagate git's exit code to the shell instead of dumping a traceback
    # when a push/remote command fails.
    try:
        main()
    except subprocess.CalledProcessError as exc:
        sys.exit(exc.returncode)
models/embedder.py CHANGED
@@ -1,14 +1,16 @@
1
- """
2
  Speaker Embedding Extraction using ECAPA-TDNN architecture via SpeechBrain.
3
  Handles audio preprocessing, feature extraction, and L2-normalized embeddings.
4
  """
5
 
6
- import os
7
- import torch
8
- import torchaudio
9
- import numpy as np
10
  from pathlib import Path
11
  from typing import Union, List, Tuple
 
 
 
 
12
  from loguru import logger
13
 
14
 
@@ -22,7 +24,7 @@ class EcapaTDNNEmbedder:
22
  SAMPLE_RATE = 16000
23
  EMBEDDING_DIM = 192
24
 
25
- def __init__(self, device: str = "auto", cache_dir: str = "/tmp/model_cache"):
26
  self.device = self._resolve_device(device)
27
  self.cache_dir = Path(cache_dir)
28
  self.cache_dir.mkdir(parents=True, exist_ok=True)
@@ -39,41 +41,46 @@ class EcapaTDNNEmbedder:
39
  return
40
 
41
  try:
42
- import shutil
43
  import speechbrain.utils.fetching as _fetching
44
  from speechbrain.utils.fetching import LocalStrategy
 
45
 
46
  def _patched_link(src, dst, local_strategy):
47
- from pathlib import Path as _Path
48
- dst = _Path(dst)
49
- src = _Path(src)
50
- dst.parent.mkdir(parents=True, exist_ok=True)
51
- if dst.exists() or dst.is_symlink():
52
- dst.unlink()
53
- shutil.copy2(str(src), str(dst))
54
 
55
  _fetching.link_with_strategy = _patched_link
56
 
57
- from speechbrain.inference.classifiers import EncoderClassifier
 
 
 
58
 
59
  logger.info(f"Loading ECAPA-TDNN from {self.MODEL_SOURCE}...")
 
60
 
61
- savedir = "/tmp/model_cache/ecapa_tdnn"
62
- os.makedirs(savedir, exist_ok=True)
63
- logger.info(f"Savedir: {savedir}, exists: {os.path.exists(savedir)}")
 
 
64
 
65
- self._model = EncoderClassifier.from_hparams(
66
- source=self.MODEL_SOURCE,
67
- savedir=savedir,
68
- run_opts={"device": self.device},
69
- huggingface_cache_dir="/tmp/hf_cache",
70
- local_strategy=LocalStrategy.COPY,
71
- )
72
  self._model.eval()
73
  logger.success("ECAPA-TDNN model loaded successfully.")
74
- except ImportError:
75
- raise ImportError("SpeechBrain not installed.")
76
-
77
  def preprocess_audio(
78
  self, audio: Union[np.ndarray, torch.Tensor], sample_rate: int
79
  ) -> torch.Tensor:
 
1
+ """
2
  Speaker Embedding Extraction using ECAPA-TDNN architecture via SpeechBrain.
3
  Handles audio preprocessing, feature extraction, and L2-normalized embeddings.
4
  """
5
 
6
+ import inspect
7
+ import shutil
 
 
8
  from pathlib import Path
9
  from typing import Union, List, Tuple
10
+
11
+ import numpy as np
12
+ import torch
13
+ import torchaudio
14
  from loguru import logger
15
 
16
 
 
24
  SAMPLE_RATE = 16000
25
  EMBEDDING_DIM = 192
26
 
27
+ def __init__(self, device: str = "auto", cache_dir: str = "./model_cache"):
28
  self.device = self._resolve_device(device)
29
  self.cache_dir = Path(cache_dir)
30
  self.cache_dir.mkdir(parents=True, exist_ok=True)
 
41
  return
42
 
43
  try:
 
44
  import speechbrain.utils.fetching as _fetching
45
  from speechbrain.utils.fetching import LocalStrategy
46
+ from speechbrain.inference.classifiers import EncoderClassifier
47
 
48
  def _patched_link(src, dst, local_strategy):
49
+ dst_path = Path(dst)
50
+ src_path = Path(src)
51
+ dst_path.parent.mkdir(parents=True, exist_ok=True)
52
+ if dst_path.exists() or dst_path.is_symlink():
53
+ dst_path.unlink()
54
+ shutil.copy2(str(src_path), str(dst_path))
 
55
 
56
  _fetching.link_with_strategy = _patched_link
57
 
58
+ savedir = self.cache_dir / "ecapa_tdnn"
59
+ hf_cache = self.cache_dir / "hf_cache"
60
+ savedir.mkdir(parents=True, exist_ok=True)
61
+ hf_cache.mkdir(parents=True, exist_ok=True)
62
 
63
  logger.info(f"Loading ECAPA-TDNN from {self.MODEL_SOURCE}...")
64
+ logger.info(f"Savedir: {savedir}, exists: {savedir.exists()}")
65
 
66
+ kwargs = {
67
+ "source": self.MODEL_SOURCE,
68
+ "savedir": str(savedir),
69
+ "run_opts": {"device": self.device},
70
+ }
71
 
72
+ sig = inspect.signature(EncoderClassifier.from_hparams)
73
+ if "huggingface_cache_dir" in sig.parameters:
74
+ kwargs["huggingface_cache_dir"] = str(hf_cache)
75
+ if "local_strategy" in sig.parameters:
76
+ kwargs["local_strategy"] = LocalStrategy.COPY
77
+
78
+ self._model = EncoderClassifier.from_hparams(**kwargs)
79
  self._model.eval()
80
  logger.success("ECAPA-TDNN model loaded successfully.")
81
+ except ImportError as exc:
82
+ raise ImportError("SpeechBrain not installed.") from exc
83
+
84
  def preprocess_audio(
85
  self, audio: Union[np.ndarray, torch.Tensor], sample_rate: int
86
  ) -> torch.Tensor: