Spaces:

Baktabek
/

jina-embeddings-v3-api

Sleeping

App Files Files Community

Baktabek committed on Jan 19

Commit

b2adce0

verified ·

1 Parent(s): 1a98a8c

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -11

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 HuggingFace Space: Jina Embeddings v3 API
 Free embedding service for AI-RAG-Core project
 """
 from fastapi import FastAPI, HTTPException
@@ -9,25 +10,46 @@ from typing import List
 import torch
 from transformers import AutoModel
 import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-app = FastAPI(title="Jina Embeddings v3 API", version="1.0.0")
 # Set by load_model() at startup; stays None until loading has finished
 model = None
 # Startup hook (pre-change revision): loads the embedding model once and
 # stores it in the module-level `model` so request handlers can use it.
 @app.on_event("startup")
 async def load_model():
-    global model
     logger.info("Loading jina-embeddings-v3 model...")
     # trust_remote_code=True allows the model code shipped in the
     # 'jinaai/jina-embeddings-v3' repository to run; device_map="auto"
     # leaves device placement to the loader.
     model = AutoModel.from_pretrained(
         'jinaai/jina-embeddings-v3',
         trust_remote_code=True,
         device_map="auto"
     )
-    logger.info("Model loaded successfully!")
 class EmbeddingRequest(BaseModel):
@@ -47,13 +69,29 @@ async def create_embeddings(request: EmbeddingRequest):
     if model is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
-    try:
-        # Generate embeddings
-        embeddings = model.encode(
-            request.input,
-            task=request.task,
-            batch_size=32
         )
         # Convert to list format
         if isinstance(embeddings, torch.Tensor):
@@ -67,20 +105,44 @@ async def create_embeddings(request: EmbeddingRequest):
             for i, emb in enumerate(embeddings)
         ]
         return EmbeddingResponse(data=data)
     except Exception as e:
         logger.error(f"Embedding generation failed: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 # GET /health (pre-change revision): reports whether the model is loaded.
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
     return {
         # "status" is a constant; use "model_loaded" to tell whether the
         # startup hook has populated `model`.
         "status": "healthy",
         "model": "jina-embeddings-v3",
-        "model_loaded": model is not None
     }
@@ -89,9 +151,22 @@ async def root():
     """Root endpoint"""
     return {
         "service": "Jina Embeddings v3 API",
-        "version": "1.0.0",
         "endpoints": {
             "embeddings": "/embeddings (POST)",
             "health": "/health (GET)"
         }
     }

 """
 HuggingFace Space: Jina Embeddings v3 API
 Free embedding service for AI-RAG-Core project
+FIXED VERSION with memory management and batch limits
 """
 from fastapi import FastAPI, HTTPException
 import torch
 from transformers import AutoModel
 import logging
+import gc
+import asyncio
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+app = FastAPI(title="Jina Embeddings v3 API", version="1.0.1")
 # Set by load_model() at startup; both stay None until loading has finished
 model = None
+device = None
+# Configuration
+MAX_BATCH_SIZE = 50  # Max texts per request; larger requests get HTTP 400 (guards against OOM)
+MAX_TEXT_LENGTH = 8192  # Checked against len(text), i.e. characters; 8192 is Jina v3's max tokens
 # Startup hook: records the compute device and loads the embedding model
 # into the module-level `model` / `device` globals.
 @app.on_event("startup")
 async def load_model():
+    global model, device
     logger.info("Loading jina-embeddings-v3 model...")
+    # Detect device
+    if torch.cuda.is_available():
+        device = "cuda"
+        logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
+    else:
+        device = "cpu"
+        logger.info("Using CPU")
     # NOTE(review): `device` above reflects CUDA availability only; the
     # actual placement is chosen by device_map="auto" below — confirm the
     # two always agree.
     model = AutoModel.from_pretrained(
         'jinaai/jina-embeddings-v3',
         trust_remote_code=True,
         device_map="auto"
     )
+    # Set to eval mode to save memory
+    model.eval()
+    logger.info(f"Model loaded successfully on {device}!")
 class EmbeddingRequest(BaseModel):
     if model is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
+    # Validate batch size
+    if len(request.input) > MAX_BATCH_SIZE:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Batch size {len(request.input)} exceeds limit {MAX_BATCH_SIZE}"
         )
+    # Validate text length
+    for text in request.input:
+        if len(text) > MAX_TEXT_LENGTH:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Text length exceeds {MAX_TEXT_LENGTH} characters"
+            )
+    try:
+        # Generate embeddings with no_grad to save memory
+        with torch.no_grad():
+            embeddings = model.encode(
+                request.input,
+                task=request.task,
+                batch_size=16  # Process in smaller chunks
+            )
         # Convert to list format
         if isinstance(embeddings, torch.Tensor):
             for i, emb in enumerate(embeddings)
         ]
+        # CRITICAL: Clear GPU cache after each request
+        if device == "cuda":
+            torch.cuda.empty_cache()
+        # Force garbage collection
+        gc.collect()
         return EmbeddingResponse(data=data)
     except Exception as e:
         logger.error(f"Embedding generation failed: {e}")
+        # Clear cache on error
+        if device == "cuda":
+            torch.cuda.empty_cache()
+        gc.collect()
         raise HTTPException(status_code=500, detail=str(e))
 # GET /health: reports model/device state and, when CUDA is available,
 # the current GPU memory counters.
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
+    memory_info = {}
+    if torch.cuda.is_available():
         # Byte counts are divided by 1024**2 and formatted as strings.
+        memory_info = {
+            "gpu_memory_allocated": f"{torch.cuda.memory_allocated() / 1024**2:.2f} MB",
+            "gpu_memory_reserved": f"{torch.cuda.memory_reserved() / 1024**2:.2f} MB"
+        }
     return {
         # "status" is a constant; check "model_loaded" for readiness.
         "status": "healthy",
         "model": "jina-embeddings-v3",
+        "model_loaded": model is not None,
+        "device": device,
+        "max_batch_size": MAX_BATCH_SIZE,
         # GPU keys are merged at the top level; nothing is added without CUDA.
+        **memory_info
     }
     """Root endpoint"""
     return {
         "service": "Jina Embeddings v3 API",
+        "version": "1.0.1",
         "endpoints": {
             "embeddings": "/embeddings (POST)",
             "health": "/health (GET)"
+        },
+        "limits": {
+            "max_batch_size": MAX_BATCH_SIZE,
+            "max_text_length": MAX_TEXT_LENGTH
         }
     }
+@app.post("/clear_cache")
+async def clear_cache():
+    """Manually clear GPU cache (admin endpoint)"""
+    if device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+    return {"status": "cache cleared"}