feat: add POST /predict endpoint with MiniLM embedding generation

Files changed (10) hide show

.gitignore +13 -0
.python-version +1 -0
app/__init__.py +0 -0
app/config.py +18 -0
app/main.py +58 -0
app/model.py +28 -0
app/schemas.py +20 -0
pyproject.toml +13 -0
tests/__init__.py +0 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,13 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+# AI agent files
+.claude

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

app/__init__.py ADDED Viewed

File without changes

app/config.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from pydantic_settings import BaseSettings
+import torch
+def _default_device() -> str:
+    """Return the preferred torch device string available on this host.
+
+    Preference order is CUDA, then Apple MPS, then CPU.
+    """
+    if torch.cuda.is_available():
+        return "cuda"
+    if torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+
+
+class Settings(BaseSettings):
+    """Application settings loaded from environment variables.
+
+    pydantic-settings reads each field from the environment automatically and
+    falls back to the defaults declared here. To use namespaced variables such
+    as INFERENCE_MODEL_NAME, set
+    model_config = SettingsConfigDict(env_prefix="INFERENCE_").
+    """
+
+    # HuggingFace model ID of the embedding model to load.
+    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
+    # Torch device string; the default is detected once, when this module is imported.
+    device: str = _default_device()
+    # Network bind address and port (presumably consumed by the server launcher; confirm).
+    host: str = "0.0.0.0"
+    port: int = 8000
+
+
+# Shared settings instance, created at import time.
+settings = Settings()

app/main.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from contextlib import asynccontextmanager
+from fastapi import FastAPI, Request
+from .schemas import PredictRequest, PredictResponse
+from .model import load_model, predict
+from .config import settings
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Manage application startup and shutdown for FastAPI.
+
+    Everything before ``yield`` runs at startup: the model named in
+    ``settings`` is loaded onto the configured device and stored on
+    ``app.state.model``. Nothing follows ``yield``, so no cleanup is
+    performed at shutdown.
+    Docs: https://fastapi.tiangolo.com/advanced/events/#lifespan
+    """
+    app.state.model = load_model(settings.model_name, settings.device)
+    print(f"model loaded on {settings.device}")
+    yield
+# Application object; the lifespan hook loads the model at startup.
+app = FastAPI(
+    lifespan=lifespan,
+    title="Embedding Inference Server",
+    version="0.1.0",
+)
+@app.get("/health")
+async def health():
+    """Report that the server is up, with the configured model and device.
+
+    The model name and device are read from ``settings``; the model object
+    held on the application state is not inspected.
+    """
+    return dict(
+        status="ok",
+        model=settings.model_name,
+        device=settings.device,
+    )
+@app.post("/predict", response_model=PredictResponse)
+async def predict_endpoint(request: Request, body: PredictRequest):
+    """Generate embeddings for the texts in the request body.
+
+    Args:
+        request: Incoming request; used to reach the model stored on
+            ``app.state`` during startup.
+        body: Validated request payload carrying the texts to embed.
+    Returns:
+        Dict with the embeddings, the embedding dimension reported by the
+        model, and the number of input texts.
+    """
+    # Imported locally so this handler brings its own dependency into scope.
+    from fastapi.concurrency import run_in_threadpool
+
+    model = request.app.state.model
+    # Encoding is blocking compute; run it on a worker thread so the event
+    # loop stays free to serve other requests (e.g. /health) meanwhile.
+    embeddings = await run_in_threadpool(predict, model, body.texts)
+    return {
+        "embeddings": embeddings,
+        "dim": model.get_sentence_embedding_dimension(),
+        "num_texts": len(body.texts),
+    }

app/model.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from sentence_transformers import SentenceTransformer
+def load_model(model_name: str, device: str) -> SentenceTransformer:
+    """Construct a SentenceTransformer on the requested device.
+
+    Args:
+        model_name: HuggingFace model ID (e.g. "sentence-transformers/all-MiniLM-L6-v2")
+        device: torch device string ("cpu", "mps", "cuda")
+    Returns:
+        The constructed SentenceTransformer instance
+    """
+    return SentenceTransformer(model_name, device=device)
+def predict(model: SentenceTransformer, texts: list[str]) -> list[list[float]]:
+    """Embed a list of strings and return the vectors as plain Python lists.
+
+    Args:
+        model: Loaded SentenceTransformer model
+        texts: Strings to embed
+    Returns:
+        The result of ``model.encode(texts)`` converted with ``.tolist()``
+    """
+    vectors = model.encode(texts)
+    return vectors.tolist()

app/schemas.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from pydantic import BaseModel, Field
+class PredictRequest(BaseModel):
+    """Request body for the /predict endpoint.
+
+    The ``texts`` list must contain at least one string; an empty list is
+    rejected during validation.
+    """
+
+    # Strings to embed; required, with at least one item.
+    texts: list[str] = Field(..., min_length=1)
+class PredictResponse(BaseModel):
+    """Response body for the /predict endpoint.
+
+    Attributes:
+        embeddings: Embedding vectors, each a list of floats.
+        dim: Embedding dimension; may be None.
+        num_texts: Number of input texts.
+    """
+    embeddings: list[list[float]]
+    dim: int | None
+    num_texts: int

pyproject.toml ADDED Viewed

	@@ -0,0 +1,13 @@

+[project]
+name = "inference-server"
+version = "0.1.0"
+description = "FastAPI inference server that generates MiniLM sentence embeddings"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "fastapi>=0.133.1",
+    "pydantic-settings>=2.13.1",
+    "sentence-transformers>=5.2.3",
+    "torch>=2.10.0",
+    "uvicorn>=0.41.0",
+]

tests/__init__.py ADDED Viewed

File without changes

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff