GitHub Action committed on
Commit
ca033f7
·
0 Parent(s):

Sync from GitHub (f4d2eca2c04b1321c9cce554b35a177152c54e31)

Browse files
Files changed (6) hide show
  1. .dockerignore +9 -0
  2. .gitignore +11 -0
  3. Dockerfile +21 -0
  4. README.md +12 -0
  5. main.py +205 -0
  6. requirements.txt +12 -0
.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ athena-vector-engine/
6
+ locustfile*.py
7
+ .git/
8
+ .gitignore
9
+ .DS_Store
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ athena-vector-engine/
7
+ locustfile*.py
8
+ .DS_Store
9
+ *.egg-info/
10
+ dist/
11
+ build/
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# 1. Start with a lightweight version of Python
FROM python:3.10-slim

# 2. Create an unprivileged user (UID 1000) that owns the app directory.
#    Hugging Face Docker Spaces run the container as UID 1000, so the
#    process needs a home directory it can write to (model downloads are
#    cached under $HOME); running as root leaves that user without one.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /app \
    && chown user:user /app
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# 3. Set the working directory inside the container
WORKDIR /app

# 4. Copy your requirements file into the container
COPY --chown=user:user requirements.txt .

# 5. Install the Python packages
# (We use --no-cache-dir to keep the container size small!)
# As a non-root user pip installs into ~/.local, which is on PATH above.
RUN pip install --no-cache-dir -r requirements.txt

# 6. Copy the rest of your ml_service code into the container
COPY --chown=user:user . .

# 7. Expose the port FastAPI will run on
EXPOSE 7860

# 8. The command to start your server
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Athena Vector Engine
3
+ emoji: 🏆
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
+ license: other
9
+ short_description: Dense + sparse embedding microservice for Athena
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ml_service/main.py
2
+
3
+ import logging
4
+ import time
5
+ import os
6
+ import torch
7
+ from typing import List
8
+ from contextlib import asynccontextmanager
9
+
10
+ from fastapi import FastAPI, HTTPException, Request
11
+ from pydantic import BaseModel, Field, constr
12
+ from sentence_transformers import SentenceTransformer
13
+ from fastembed import SparseTextEmbedding
14
+
15
+ # -----------------------------
16
+ # Configuration
17
+ # -----------------------------
18
+
19
+ MAX_TEXT_LENGTH = 5000
20
+ MAX_BATCH_SIZE = 32
21
+ DENSE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
22
+ SPARSE_MODEL_NAME = "prithivida/Splade_PP_en_v1"
23
+
24
+ # -----------------------------
25
+ # Structured Logging Setup
26
+ # -----------------------------
27
+
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
31
+ )
32
+
33
+ logger = logging.getLogger("athena.vector_engine")
34
+
35
+ # -----------------------------
36
+ # Lifespan Management
37
+ # -----------------------------
38
+
39
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load both embedding models before serving and release them on exit.

    Startup selects CUDA when ``torch.cuda.is_available()`` and CPU
    otherwise, builds the dense ``SentenceTransformer`` and the sparse
    ``SparseTextEmbedding``, runs one warm-up inference through each, and
    publishes ``dense_model``, ``sparse_model``, ``device`` and
    ``start_time`` on ``app.state``.

    Args:
        app: The FastAPI application whose ``state`` receives the models.

    Yields:
        None. Control returns to the server while the application runs.

    Raises:
        Exception: Any error raised while loading or warming up the models
            is logged with its traceback and re-raised unchanged.
    """
    logger.info("🧠 Booting Vector Engine...")
    boot_started = time.time()

    try:
        # Only the loading phase is guarded by the "startup failed" handler,
        # so errors raised later (while serving or shutting down) are not
        # mislabelled as startup failures.
        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info("Using device: %s", device)

            # Load dense model
            dense_model = SentenceTransformer(
                DENSE_MODEL_NAME,
                trust_remote_code=True,
                device=device,
            )

            # Load sparse model
            sparse_model = SparseTextEmbedding(model_name=SPARSE_MODEL_NAME)

            # Warmup (prevents cold-start latency spike)
            logger.info("🔥 Warming up models...")
            dense_model.encode("warmup", normalize_embeddings=True)
            # embed() is consumed through list() so the warm-up actually runs.
            list(sparse_model.embed(["warmup"]))

            # Attach to app state
            app.state.dense_model = dense_model
            app.state.sparse_model = sparse_model
            app.state.device = device
            app.state.start_time = time.time()

            duration = time.time() - boot_started
            logger.info("✅ Models loaded successfully in %.2fs", duration)
        except Exception:
            logger.exception("❌ Failed during startup")
            raise  # bare raise keeps the original traceback intact

        yield

    finally:
        logger.info("🛑 Shutting down Vector Engine...")
        # Remove only what startup published. Clearing ``app.state.__dict__``
        # would also discard the State object's own internal storage and
        # break any later attribute access on ``app.state``.
        for attr in ("dense_model", "sparse_model", "device", "start_time"):
            if hasattr(app.state, attr):
                delattr(app.state, attr)
84
+
85
+ # -----------------------------
86
+ # FastAPI App
87
+ # -----------------------------
88
+
89
# ASGI application object; the lifespan hook wires model loading and
# cleanup around the serving period.
app = FastAPI(
    lifespan=lifespan,
    title="Athena Vector Engine",
    version="2.0.0",
    description="Production-grade ML microservice for dense + sparse embeddings",
)
95
+
96
+ # -----------------------------
97
+ # Schemas
98
+ # -----------------------------
99
+
100
# Request body for POST /vectorize. Each text must hold between 1 and
# MAX_TEXT_LENGTH characters; the field itself is required.
class VectorRequest(BaseModel):
    texts: List[constr(max_length=MAX_TEXT_LENGTH, min_length=1)] = Field(
        default=...,
        description="List of input texts to embed",
    )
104
+
105
# One sparse embedding, stored as two parallel lists.
class SparseData(BaseModel):
    # Positions of the entries carried by this vector.
    indices: List[int]
    # Weight paired with the entry at the same position in ``indices``.
    values: List[float]
108
+
109
# Response body of POST /vectorize.
class VectorResponse(BaseModel):
    # Dense embeddings, each a plain list of floats.
    dense_vectors: List[List[float]]
    # Sparse embeddings, each an indices/values pair.
    sparse_vectors: List[SparseData]
112
+
113
+ # -----------------------------
114
+ # Embedding Endpoint
115
+ # -----------------------------
116
+
117
@app.post("/vectorize", response_model=VectorResponse)
def generate_vectors(req: VectorRequest, request: Request):
    """Embed a batch of texts with both the dense and the sparse model.

    Args:
        req: Validated request body carrying the texts to embed.
        request: Incoming request, used to reach the models stored on
            ``app.state``.

    Returns:
        A dict with ``dense_vectors`` (one list of floats per text) and
        ``sparse_vectors`` (one ``indices``/``values`` pair per text).
        Both lists are empty when ``req.texts`` is empty.

    Raises:
        HTTPException: 400 when more than ``MAX_BATCH_SIZE`` texts are
            sent; 500 when embedding fails for any reason.
    """
    batch_size = len(req.texts)

    if batch_size > MAX_BATCH_SIZE:
        raise HTTPException(
            status_code=400,
            detail=f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
        )

    # Nothing to embed: answer directly. An empty list would otherwise be
    # handed to encode() with batch_size=0, which is not a usable batch
    # size and would surface to the caller as a 500.
    if not req.texts:
        return {"dense_vectors": [], "sparse_vectors": []}

    dense_model = request.app.state.dense_model
    sparse_model = request.app.state.sparse_model

    try:
        start_time = time.perf_counter()

        # Prefix required for Nomic retrieval queries
        prefixed_texts = [f"search_query: {text}" for text in req.texts]

        # Dense embeddings: the whole request is encoded as a single batch.
        dense_results = dense_model.encode(
            prefixed_texts,
            normalize_embeddings=True,
            batch_size=batch_size,
        ).tolist()

        # Sparse embeddings: the sparse model receives the raw texts,
        # without the dense-model prefix.
        sparse_results = [
            {
                "indices": vec.indices.tolist(),
                "values": vec.values.tolist(),
            }
            for vec in sparse_model.embed(req.texts)
        ]

        duration = time.perf_counter() - start_time
        logger.info(
            "Vectorized batch_size=%d latency=%.4fs", batch_size, duration
        )

        return {
            "dense_vectors": dense_results,
            "sparse_vectors": sparse_results,
        }

    except Exception as exc:
        # The traceback goes to the log; the client only sees a generic 500.
        logger.exception("🔥 Vectorization failed")
        raise HTTPException(
            status_code=500,
            detail="Failed to generate embeddings",
        ) from exc
171
+
172
+ # -----------------------------
173
+ # Health Endpoints
174
+ # -----------------------------
175
+
176
# Liveness probe: answers GET and HEAD with a constant payload and reads
# no application state.
@app.api_route("/health/live", methods=["GET", "HEAD"])
async def liveness():
    payload = {"status": "alive"}
    return payload
179
+
180
# Readiness probe: reports whether both models are present on app.state.
@app.api_route("/health/ready", methods=["GET", "HEAD"])
async def readiness(request: Request):
    state = request.app.state
    models_loaded = all(
        hasattr(state, attr) for attr in ("dense_model", "sparse_model")
    )
    return {"ready": models_loaded}
187
+
188
+ # -----------------------------
189
+ # Metadata Endpoint
190
+ # -----------------------------
191
+
192
# Metadata endpoint: model names, embedding size, device, uptime and the
# request limits enforced by this service.
@app.get("/info")
async def model_info(request: Request):
    state = request.app.state
    encoder = state.dense_model
    active_device = state.device

    dimension = encoder.get_sentence_embedding_dimension()
    # Whole seconds elapsed since startup recorded ``start_time``.
    uptime = int(time.time() - state.start_time)

    return {
        "dense_model": DENSE_MODEL_NAME,
        "sparse_model": SPARSE_MODEL_NAME,
        "embedding_dimension": dimension,
        "device": active_device,
        "uptime_seconds": uptime,
        "max_batch_size": MAX_BATCH_SIZE,
        "max_text_length": MAX_TEXT_LENGTH,
    }
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cpu
2
+
3
+ # Core Web Framework
4
+ fastapi==0.135.1
5
+ uvicorn[standard]==0.41.0
6
+ pydantic==2.12.5
7
+
8
+ # Machine Learning & Embeddings
9
+ torch==2.10.0+cpu
10
+ sentence-transformers==5.2.3
11
+ fastembed==0.7.4
12
+ einops==0.8.2