Spaces:
Sleeping
Sleeping
File size: 1,684 Bytes
ddb9445 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import os
import asyncio
from typing import List
from fastapi import FastAPI
from pydantic import BaseModel
# Hugging Face model id for embeddings; override with the EMBED_MODEL env var.
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# ASGI application object served by uvicorn (see __main__ below).
app = FastAPI(title="Embedder Service", version="1.0.0")
class EmbedRequest(BaseModel):
    """Request body for POST /embed."""

    # Batch of input strings to encode into vectors.
    texts: List[str]
class EmbedResponse(BaseModel):
    """Response body for POST /embed."""

    # One embedding vector per input text, in the same order as the request.
    vectors: List[List[float]]
    # Model id that produced the vectors (echoes MODEL_NAME).
    model: str
# Lazily-initialized SentenceTransformer instance; None until first /embed call.
_model = None
# Guards model initialization so concurrent first requests load it only once.
_model_lock = asyncio.Lock()
_sequential_gate = asyncio.Semaphore(1)  # ensure one job at a time
def _lazy_load_model():
    """Populate the module-level ``_model`` on first use; no-op afterwards.

    Synchronous on purpose — callers decide how to schedule it (the /embed
    handler runs it off the event loop).
    """
    global _model
    if _model is not None:
        return
    # Imported here rather than at module top so the container starts fast
    # even when the model/library isn't needed yet.
    from sentence_transformers import SentenceTransformer

    _model = SentenceTransformer(MODEL_NAME)
@app.get("/health")
async def health():
    """Liveness probe: reports the configured model id and whether it is loaded."""
    payload = {
        "ok": True,
        "model": MODEL_NAME,
        "loaded": _model is not None,
    }
    return payload
@app.post("/embed", response_model=EmbedResponse)
async def embed(req: EmbedRequest):
    """Encode ``req.texts`` into normalized embedding vectors.

    Requests are processed strictly one at a time (semaphore of 1); the
    first request also triggers the lazy model load.

    Returns:
        EmbedResponse with one vector per input text, in request order.
    """
    # Fast path: nothing to encode, and no need to force a model load.
    if not req.texts:
        return EmbedResponse(vectors=[], model=MODEL_NAME)

    # Simple sequential queueing: only one request processes at a time.
    async with _sequential_gate:
        # Protect model initialization under a lock to avoid concurrent loads.
        # The load (potentially a slow download on first use) runs in a worker
        # thread so it does not stall the event loop for other endpoints
        # such as /health.
        async with _model_lock:
            await asyncio.to_thread(_lazy_load_model)
        # encode() is synchronous and CPU-bound; run it in a worker thread
        # instead of blocking the loop. (asyncio.to_thread supersedes the
        # deprecated get_event_loop()/run_in_executor pattern in coroutines.)
        vectors = await asyncio.to_thread(
            lambda: _model.encode(
                req.texts, show_progress_bar=False, normalize_embeddings=True
            ).tolist()
        )
        return EmbedResponse(vectors=vectors, model=MODEL_NAME)
if __name__ == "__main__":
    # Local/dev entry point; port is configurable via the PORT env var.
    import uvicorn

    listen_port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|