Spaces:
Running
Running
fahmiaziz98
committed on
Commit
·
36e672d
1
Parent(s):
f435ac4
[UPDATE] Response query
Browse files
- src/api/routers/embedding.py +26 -15
- src/config/settings.py +2 -2
src/api/routers/embedding.py
CHANGED
|
@@ -229,43 +229,54 @@ async def create_query_embedding(
|
|
| 229 |
)
|
| 230 |
processing_time = time.time() - start_time
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
response = SparseEmbedResponse(
|
| 240 |
-
|
|
|
|
| 241 |
model_id=request.model_id,
|
| 242 |
processing_time=processing_time,
|
| 243 |
)
|
| 244 |
else:
|
| 245 |
-
# Dense
|
| 246 |
-
embeddings = model.
|
| 247 |
texts=request.texts, prompt=request.prompt, **kwargs
|
| 248 |
)
|
| 249 |
processing_time = time.time() - start_time
|
| 250 |
|
| 251 |
response = DenseEmbedResponse(
|
| 252 |
embeddings=embeddings,
|
| 253 |
-
dimension=len(embeddings[0]),
|
|
|
|
| 254 |
model_id=request.model_id,
|
| 255 |
processing_time=processing_time,
|
| 256 |
-
count=len(embeddings),
|
| 257 |
)
|
| 258 |
|
| 259 |
-
# Cache
|
| 260 |
-
if cache is not None:
|
|
|
|
| 261 |
cache.set(
|
| 262 |
-
texts=
|
| 263 |
model_id=request.model_id,
|
| 264 |
result=response,
|
| 265 |
prompt=request.prompt,
|
| 266 |
-
**
|
| 267 |
)
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
return response
|
| 270 |
|
| 271 |
except (ValidationError, ModelNotFoundError) as e:
|
|
|
|
| 229 |
)
|
| 230 |
processing_time = time.time() - start_time
|
| 231 |
|
| 232 |
+
# Convert to SparseEmbedding objects
|
| 233 |
+
sparse_embeddings = []
|
| 234 |
+
for idx, sparse_result in enumerate(sparse_results):
|
| 235 |
+
sparse_embeddings.append(
|
| 236 |
+
SparseEmbedding(
|
| 237 |
+
text=request.texts[idx],
|
| 238 |
+
indices=sparse_result["indices"],
|
| 239 |
+
values=sparse_result["values"],
|
| 240 |
+
)
|
| 241 |
+
)
|
| 242 |
|
| 243 |
response = SparseEmbedResponse(
|
| 244 |
+
embeddings=sparse_embeddings,
|
| 245 |
+
count=len(sparse_embeddings),
|
| 246 |
model_id=request.model_id,
|
| 247 |
processing_time=processing_time,
|
| 248 |
)
|
| 249 |
else:
|
| 250 |
+
# Dense batch embeddings
|
| 251 |
+
embeddings = model.embed_documents(
|
| 252 |
texts=request.texts, prompt=request.prompt, **kwargs
|
| 253 |
)
|
| 254 |
processing_time = time.time() - start_time
|
| 255 |
|
| 256 |
response = DenseEmbedResponse(
|
| 257 |
embeddings=embeddings,
|
| 258 |
+
dimension=len(embeddings[0]) if embeddings else 0,
|
| 259 |
+
count=len(embeddings),
|
| 260 |
model_id=request.model_id,
|
| 261 |
processing_time=processing_time,
|
|
|
|
| 262 |
)
|
| 263 |
|
| 264 |
+
# Cache small batches
|
| 265 |
+
if cache is not None and len(request.texts) <= 10:
|
| 266 |
+
cache_key = str(sorted(request.texts))
|
| 267 |
cache.set(
|
| 268 |
+
texts=cache_key,
|
| 269 |
model_id=request.model_id,
|
| 270 |
result=response,
|
| 271 |
prompt=request.prompt,
|
| 272 |
+
**kwargs,
|
| 273 |
)
|
| 274 |
|
| 275 |
+
logger.info(
|
| 276 |
+
f"Generated {len(request.texts)} embeddings "
|
| 277 |
+
f"in {processing_time:.3f}s ({len(request.texts) / processing_time:.1f} texts/s)"
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
return response
|
| 281 |
|
| 282 |
except (ValidationError, ModelNotFoundError) as e:
|
src/config/settings.py
CHANGED
|
@@ -36,12 +36,12 @@ class Settings(BaseSettings):
|
|
| 36 |
PRELOAD_MODELS: bool = True # Load all models at startup
|
| 37 |
|
| 38 |
# Request Limits
|
| 39 |
-
MAX_TEXT_LENGTH: int =
|
| 40 |
MAX_BATCH_SIZE: int = 100 # Maximum texts per batch request
|
| 41 |
REQUEST_TIMEOUT: int = 30 # Request timeout in seconds
|
| 42 |
|
| 43 |
# Cache Configuration
|
| 44 |
-
ENABLE_CACHE: bool =
|
| 45 |
CACHE_TTL: int = 3600 # Cache time-to-live in seconds
|
| 46 |
CACHE_MAX_SIZE: int = 1000 # Maximum cache entries
|
| 47 |
|
|
|
|
| 36 |
PRELOAD_MODELS: bool = True # Load all models at startup
|
| 37 |
|
| 38 |
# Request Limits
|
| 39 |
+
MAX_TEXT_LENGTH: int = 32000 # Maximum characters per text
|
| 40 |
MAX_BATCH_SIZE: int = 100 # Maximum texts per batch request
|
| 41 |
REQUEST_TIMEOUT: int = 30 # Request timeout in seconds
|
| 42 |
|
| 43 |
# Cache Configuration
|
| 44 |
+
ENABLE_CACHE: bool = True # Enable response caching (Phase 2)
|
| 45 |
CACHE_TTL: int = 3600 # Cache time-to-live in seconds
|
| 46 |
CACHE_MAX_SIZE: int = 1000 # Maximum cache entries
|
| 47 |
|