LiamKhoaLe commited on
Commit
ddb9445
·
0 Parent(s):

Init commit

Browse files
Files changed (4) hide show
  1. Dockerfile +28 -0
  2. README.md +57 -0
  3. app.py +60 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base keeps the image small while matching the pinned deps.
FROM python:3.11-slim

# Container-friendly defaults: no .pyc files, unbuffered stdout/stderr logs.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# build-essential/git: allow pip to build wheels from source if needed.
# libglib2.0-0/libgl1: native shared libs presumably needed by transitive
# deps of sentence-transformers — TODO confirm they are actually required.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git libglib2.0-0 libgl1 \
&& rm -rf /var/lib/apt/lists/*

# Run as a non-root user with uid 1000 (the convention HF Spaces expects).
RUN useradd -m -u 1000 user
USER user
# User-level pip installs land in ~/.local/bin; make them resolvable.
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app

# Copy and install requirements before the app code so this expensive layer
# is cached when only app.py changes.
COPY requirements.txt ./
RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt

COPY app.py ./

# Point model/cache downloads at the non-root user's writable home directory.
ENV HF_HOME="/home/user/.cache/huggingface"
ENV SENTENCE_TRANSFORMERS_HOME="/home/user/.cache/huggingface/sentence-transformers"

# 7860 is the port HF Spaces routes traffic to by default.
ENV PORT=7860
EXPOSE 7860

# Single worker: the app serializes requests itself (semaphore in app.py),
# so extra workers would only multiply the model's memory footprint.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Embedding
3
+ emoji: 🐠
4
+ colorFrom: purple
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: Simple API running sentence-transformers/all-MiniLM-L6-v2
9
+ ---
10
+
11
+ # Embedder Service (HuggingFace Space)
12
+
13
+ A lightweight microservice exposing sentence-transformers embeddings over HTTP.
14
+
15
+ - Model: `sentence-transformers/all-MiniLM-L6-v2`
16
+ - Sequential queueing: handles one request at a time to avoid resource spikes.
17
+
18
+ ## Endpoints
19
+
20
+ - `GET /health` → `{ ok: true, model: string, loaded: boolean }`
21
+ - `POST /embed`
22
+ - Request:
23
+
24
+ ```
25
+ {
26
+ "texts": ["hello world", "another document"]
27
+ }
28
+ ```
29
+
30
+ - Response:
31
+
32
+ ```
33
+ {
34
+ "vectors": [[0.01, -0.02, ...], [0.03, -0.01, ...]],
35
+ "model": "sentence-transformers/all-MiniLM-L6-v2"
36
+ }
37
+ ```
38
+
39
+ ## Deploy on HF Spaces
40
+
41
+ 1. Create a new Space (Docker type)
42
+ 2. Upload `app.py`, `Dockerfile`, `requirements.txt`
43
+ 3. Set Space hardware to CPU (Small is fine)
44
+ 4. Space will run on port 7860 by default
45
+
46
+ ## Example cURL
47
+
48
+ ```
49
+ curl -s -X POST https://binkhoale1812-embedding.hf.space/embed \
50
+ -H 'Content-Type: application/json' \
51
+ -d '{"texts": ["An embedding request", "Second input"]}' | jq .
52
+ ```
53
+
54
+ ## Notes
55
+
56
+ - The service lazily loads the model on first request.
57
+ - If concurrent clients hit it, requests are serialized by a semaphore to reduce memory and CPU spikes.
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import asyncio
from typing import List

from fastapi import FastAPI
from pydantic import BaseModel


# Embedding model id; overridable via the EMBED_MODEL env var for easy swaps.
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# ASGI application served by uvicorn (see Dockerfile CMD: "app:app").
app = FastAPI(title="Embedder Service", version="1.0.0")
12
+
13
+
14
class EmbedRequest(BaseModel):
    """Request body for POST /embed: the batch of texts to encode."""

    texts: List[str]
16
+
17
+
18
class EmbedResponse(BaseModel):
    """Response body for POST /embed: one embedding vector per input text."""

    vectors: List[List[float]]
    model: str
21
+
22
+
23
# Lazily-initialized SentenceTransformer instance; stays None until the first
# /embed request triggers _lazy_load_model().
_model = None
# Guards model initialization so overlapping first requests cannot load twice.
_model_lock = asyncio.Lock()
_sequential_gate = asyncio.Semaphore(1)  # ensure one job at a time
26
+
27
+
28
def _lazy_load_model():
    """Load the SentenceTransformer model on first use (no-op afterwards)."""
    global _model
    if _model is not None:
        return
    # Import here rather than at module top so container startup stays light.
    from sentence_transformers import SentenceTransformer

    _model = SentenceTransformer(MODEL_NAME)
34
+
35
+
36
@app.get("/health")
async def health():
    """Liveness probe: report the configured model and whether it is loaded."""
    model_loaded = _model is not None
    return {"ok": True, "model": MODEL_NAME, "loaded": model_loaded}
39
+
40
+
41
@app.post("/embed", response_model=EmbedResponse)
async def embed(req: EmbedRequest):
    """Encode a batch of texts into L2-normalized embedding vectors.

    Requests are processed strictly one at a time (semaphore) to avoid
    memory/CPU spikes on small Space hardware.

    Args:
        req: the batch of input texts.

    Returns:
        EmbedResponse with one vector per input text and the model id.
    """
    # Simple sequential queueing: only one request processes at a time.
    async with _sequential_gate:
        # Empty batch: same response shape, no need to touch the model.
        if not req.texts:
            return EmbedResponse(vectors=[], model=MODEL_NAME)
        # Protect model initialization under a lock to avoid concurrent loads.
        async with _model_lock:
            _lazy_load_model()
        # encode() is synchronous and CPU-bound; run it in a worker thread so
        # the event loop stays responsive. asyncio.to_thread replaces the
        # get_event_loop()/run_in_executor pattern (get_event_loop inside a
        # coroutine is deprecated since Python 3.10).
        vectors = await asyncio.to_thread(
            lambda: _model.encode(
                req.texts, show_progress_bar=False, normalize_embeddings=True
            ).tolist()
        )
        return EmbedResponse(vectors=vectors, model=MODEL_NAME)
53
+
54
+
55
if __name__ == "__main__":
    # Local development entry point; in the container uvicorn is launched by
    # the Dockerfile CMD instead.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
59
+
60
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi==0.114.2
2
+ uvicorn[standard]==0.30.6
3
+ sentence-transformers==3.1.1
4
+ torch==2.2.2
5
+ numpy==1.26.4
6
+