prometheus

- Dockerfile +26 -0
- src/mentioned/app.py +54 -40

Dockerfile
ADDED
@@ -0,0 +1,26 @@
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
+
+# Stay in root to keep paths simple
+WORKDIR /
+
+# 1. Install dependencies (cached layer)
+# We need --extra train because Torch is required for the initial compilation
+COPY pyproject.toml uv.lock ./
+RUN uv sync --frozen --no-install-project --extra train
+
+# 2. Pre-bake NLTK data so it doesn't download on every request
+RUN uv run python -m nltk.downloader punkt punkt_tab
+
+# 3. Copy only the source code (excludes ONNX via .dockerignore)
+COPY src ./src
+COPY README.md ./
+
+# 4. Final project install
+RUN uv sync --frozen --extra train
+
+# 5. HF Space defaults
+ENV PORT=7860
+EXPOSE 7860
+
+# Run the app. The 'lifespan' in mentioned.app will handle the download/ONNX export.
+CMD ["uv", "run", "python", "-m", "uvicorn", "mentioned.app:app", "--host", "0.0.0.0", "--port", "7860"]
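
Step 3's "(excludes ONNX via .dockerignore)" assumes the repo ships a .dockerignore that keeps compiled engines out of the build context. A minimal sketch of what that file would contain (hypothetical; the engine/ name is taken from ENGINE_DIR in app.py below):

# .dockerignore (hypothetical sketch)
engine/          # compiled ONNX engine produced by the lifespan hook
model_v1_onnx/   # the pre-rename engine directory, if one is still lying around
*.onnx

Keeping these out of the context means a locally compiled model.onnx can never be baked into the image, so the lifespan hook's JIT export stays the single source of truth.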

src/mentioned/app.py
CHANGED
@@ -3,81 +3,95 @@ import gc
 import nltk
 from contextlib import asynccontextmanager
 from typing import List
-from nltk.tokenize import word_tokenize
 
 from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import AutoTokenizer
+from prometheus_fastapi_instrumentator import Instrumentator
+from prometheus_client import Histogram
+from nltk.tokenize import word_tokenize
 
-
-# Internal imports from your package
+# Internal package imports
 from mentioned.inference import (
     create_inference_model,
     compile_inference_model,
     ONNXMentionDetectorPipeline,
 )
 
-REPO_ID = "kadarakos/mention-detector-poc-dry-run"
-ONNX_DIR = "model_v1_onnx"
-MODEL_PATH = os.path.join(ONNX_DIR, "model.onnx")
-
-# We use a global dict to store the pipeline after the heavy startup
-state = {}
-
 
 def setup_nltk():
     resources = ["punkt", "punkt_tab"]
     for res in resources:
         try:
             nltk.data.find(f"tokenizers/{res}")
         except LookupError:
-            print(f"gettin' {res} for ya...")
             nltk.download(res)
 
 
+
+class TextRequest(BaseModel):
+    texts: List[str]
+
+
+MODEL_CONFIDENCE = Histogram(
+    "mention_detector_confidence",
+    "Distribution of prediction confidence scores",
+    buckets=[0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
+)
+MENTIONS_PER_DOC = Histogram(
+    "mention_detector_density",
+    "Number of mentions detected per document",
+    buckets=[0, 1, 2, 5, 10, 20, 50]
+)
+REPO_ID = os.getenv("REPO_ID", "kadarakos/mention-detector-poc-dry-run")
+ENGINE_DIR = "engine"
+MODEL_PATH = os.path.join(ENGINE_DIR, "model.onnx")
+
+state = {}
+setup_nltk()
 
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    """Handles the JIT compilation and RAM cleanup."""
     if not os.path.exists(MODEL_PATH):
         print(f"🏗️ Engine not found. Compiling from {REPO_ID}...")
+        # create_inference_model respects HF_TOKEN env var automatically
         torch_model = create_inference_model(REPO_ID, "model_v1")
-        compile_inference_model(torch_model, ONNX_DIR)
-        # Keep tokenizer, evict Torch
+        compile_inference_model(torch_model, ENGINE_DIR)
        tokenizer = torch_model.tokenizer
         del torch_model
         gc.collect()
         print("✅ Compilation complete.")
     else:
         print("🚀 Loading existing ONNX engine...")
-        tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)
+        tokenizer = AutoTokenizer.from_pretrained(ENGINE_DIR)
 
-    state["pipeline"] = ONNXMentionDetectorPipeline(
-        MODEL_PATH,
-        tokenizer,
-        # TODO Don't hardcode!
-        threshold=0.3,
-    )
+    state["pipeline"] = ONNXMentionDetectorPipeline(MODEL_PATH, tokenizer)
     yield
     state.clear()
 
 app = FastAPI(lifespan=lifespan)
-
-
-class TextRequest(BaseModel):
-    texts: List[str]
+Instrumentator().instrument(app).expose(app)
 
 
 @app.post("/predict")
 async def predict(request: TextRequest):
+    pipeline = state["pipeline"]
+    docs = [word_tokenize(t) for t in request.texts]
+    batch_results = pipeline.predict(docs)
+    for doc_mentions in batch_results:
+        MENTIONS_PER_DOC.observe(len(doc_mentions))
+        for m in doc_mentions:
+            MODEL_CONFIDENCE.observe(m["score"])
+
+    return {"results": batch_results}
+
+
+@app.get("/")
+def home():
+    return {
+        "message": "Mention Detector API",
+        "docs": "/docs",
+        "metrics": "/metrics",
+    }
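
For a quick smoke test of the new endpoints, a minimal client sketch. Assumptions not in the diff: the service listens on localhost:7860 (the Dockerfile default), requests is installed on the client side, and each mention dict carries a "score" key, as the predict handler implies.

import requests  # any HTTP client works; requests is assumed here

BASE = "http://localhost:7860"  # PORT set in the Dockerfile

# POST raw strings; the server word-tokenizes each text before inference
resp = requests.post(f"{BASE}/predict", json={"texts": ["Alice met Bob in Paris."]})
resp.raise_for_status()
for doc_mentions in resp.json()["results"]:
    print(f"{len(doc_mentions)} mentions")  # same count MENTIONS_PER_DOC observes
    for m in doc_mentions:
        print(m["score"])  # same value MODEL_CONFIDENCE observes

# The Instrumentator exposes /metrics; the custom histograms show up there
# alongside the default HTTP metrics
for line in requests.get(f"{BASE}/metrics").text.splitlines():
    if line.startswith("mention_detector_"):
        print(line)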