Soumik Bose committed on
Commit
58f4a9c
·
1 Parent(s): 08a63bd
Files changed (3) hide show
  1. Dockerfile +9 -5
  2. main.py +84 -107
  3. model_service.py +34 -40
Dockerfile CHANGED
@@ -5,7 +5,6 @@ FROM python:3.11-slim
5
  ENV PYTHONDONTWRITEBYTECODE=1 \
6
  PYTHONUNBUFFERED=1 \
7
  PYTHONIOENCODING=UTF-8 \
8
- # Set HF_HOME to a writable directory
9
  HF_HOME=/app/cache \
10
  TRANSFORMERS_CACHE=/app/cache
11
 
@@ -20,10 +19,15 @@ WORKDIR /app
20
  COPY --chown=user:user requirements.txt .
21
  RUN pip install --no-cache-dir -r requirements.txt
22
 
23
- # --- LAYER 2: Model Download (Cached) ---
24
- # Instead of copying local files, we download the model during the build.
25
- # This layer will be CACHED and won't run again unless you change this line.
26
- RUN python3 -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='BAAI/bge-base-en-v1.5', local_dir='./models/bge-base-en-v1.5')"
 
 
 
 
 
27
 
28
  # --- LAYER 3: Application Code ---
29
  COPY --chown=user:user . .
 
5
  ENV PYTHONDONTWRITEBYTECODE=1 \
6
  PYTHONUNBUFFERED=1 \
7
  PYTHONIOENCODING=UTF-8 \
 
8
  HF_HOME=/app/cache \
9
  TRANSFORMERS_CACHE=/app/cache
10
 
 
19
  COPY --chown=user:user requirements.txt .
20
  RUN pip install --no-cache-dir -r requirements.txt
21
 
22
+ # --- LAYER 2: Download Models (Cached) ---
23
+ # We download models for 384, 768, and 1024 dimensions.
24
+ # 384 dim: BAAI/bge-small-en-v1.5
25
+ # 768 dim: BAAI/bge-base-en-v1.5
26
+ # 1024 dim: BAAI/bge-large-en-v1.5
27
+ RUN python3 -c "from huggingface_hub import snapshot_download; \
28
+ snapshot_download(repo_id='BAAI/bge-small-en-v1.5', local_dir='./models/bge-384'); \
29
+ snapshot_download(repo_id='BAAI/bge-base-en-v1.5', local_dir='./models/bge-768'); \
30
+ snapshot_download(repo_id='BAAI/bge-large-en-v1.5', local_dir='./models/bge-1024')"
31
 
32
  # --- LAYER 3: Application Code ---
33
  COPY --chown=user:user . .
main.py CHANGED
@@ -11,11 +11,11 @@ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from pydantic import BaseModel, Field
13
 
14
- # Ensure this module exists in your project
15
- from model_service import LocalEmbeddingService
16
 
17
  # ============================================================================
18
- # LOGGING CONFIGURATION
19
  # ============================================================================
20
  logging.basicConfig(
21
  level=logging.INFO,
@@ -24,76 +24,60 @@ logging.basicConfig(
24
  logger = logging.getLogger("EmbedAPI")
25
 
26
  # ============================================================================
27
- # CONFIGURATION & STATE
28
  # ============================================================================
29
- LOCAL_MODEL_PATH = os.getenv('MODEL_PATH', './models/bge-base-en-v1.5')
30
  AUTH_TOKEN = os.getenv('AUTH_TOKEN', None)
31
  ALLOWED_ORIGINS = os.getenv('ALLOWED_ORIGINS', '*').split(',')
32
 
33
- # Global resource container
34
  ml_context = {
35
  "service": None,
36
  "executor": None
37
  }
38
 
39
  # ============================================================================
40
- # LIFESPAN MANAGER (Replaces deprecated startup/shutdown events)
41
  # ============================================================================
42
  @asynccontextmanager
43
  async def lifespan(app: FastAPI):
44
- """
45
- Manages the application lifecycle.
46
- Initializes the model and thread pool on startup, and cleans them up on shutdown.
47
- """
48
- # --- Startup Phase ---
49
- logger.info("Initializing BGE Embedding Service...")
50
-
51
- # 1. Setup Thread Pool for CPU-bound inference
 
 
 
 
52
  try:
53
- cpu_count = multiprocessing.cpu_count()
54
- max_workers = cpu_count * 2
55
- executor = ThreadPoolExecutor(max_workers=max_workers)
56
- ml_context["executor"] = executor
57
- logger.info(f"Thread pool initialized with {max_workers} workers.")
58
- except Exception as e:
59
- logger.error(f"Failed to initialize thread pool: {e}")
60
- raise e
61
-
62
- # 2. Load ML Model
63
- try:
64
- logger.info(f"Loading model from: {LOCAL_MODEL_PATH}")
65
- service = LocalEmbeddingService(LOCAL_MODEL_PATH)
66
  ml_context["service"] = service
67
- logger.info(f"Model loaded successfully. Dimension: {service.embedding_dim}")
68
  except Exception as e:
69
- logger.critical(f"Critical error loading model: {e}", exc_info=True)
70
  raise e
71
 
72
- # 3. Log Auth Status
73
  if AUTH_TOKEN:
74
- logger.info("Authentication enabled (Bearer token required).")
75
- else:
76
- logger.warning("Authentication disabled (no AUTH_TOKEN set).")
77
-
78
- yield # Application runs here
79
-
80
- # --- Shutdown Phase ---
81
- logger.info("Shutting down service...")
82
  if ml_context["executor"]:
83
  ml_context["executor"].shutdown(wait=True)
84
  ml_context.clear()
85
- logger.info("Shutdown complete.")
86
 
87
  # ============================================================================
88
- # APP INITIALIZATION
89
  # ============================================================================
90
  app = FastAPI(
91
- title="BGE Embedding API",
92
- description="Production-grade embedding inference API.",
93
- version="2.0.0",
94
- lifespan=lifespan,
95
- docs_url="/docs",
96
- redoc_url="/redoc"
97
  )
98
 
99
  app.add_middleware(
@@ -104,88 +88,58 @@ app.add_middleware(
104
  allow_headers=["*"],
105
  )
106
 
107
- # ============================================================================
108
- # SECURITY
109
- # ============================================================================
110
  security = HTTPBearer(auto_error=False)
111
 
112
  async def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Security(security)):
113
- """Dependency to verify Bearer token if configured."""
114
  if not AUTH_TOKEN:
115
  return True
116
-
117
- if not credentials:
118
- raise HTTPException(
119
- status_code=401,
120
- detail="Authentication required",
121
- headers={"WWW-Authenticate": "Bearer"},
122
- )
123
-
124
- if credentials.credentials != AUTH_TOKEN:
125
- raise HTTPException(
126
- status_code=401,
127
- detail="Invalid authentication token",
128
- headers={"WWW-Authenticate": "Bearer"},
129
- )
130
  return True
131
 
132
  # ============================================================================
133
- # DATA MODELS (Pydantic V2)
134
  # ============================================================================
135
  class EmbedRequest(BaseModel):
136
- text: Union[str, List[str]] = Field(
137
- ...,
138
- description="Single text string or list of texts to embed"
139
- )
140
 
141
  model_config = {
142
  "json_schema_extra": {
143
  "example": {
144
- "text": ["First sentence to embed.", "Second sentence to embed."]
 
145
  }
146
  }
147
  }
148
 
149
  class EmbedResponse(BaseModel):
150
- embeddings: Union[List[float], List[List[float]]] = Field(..., description="Generated vector(s)")
151
- dimension: int = Field(..., description="Embedding dimension")
152
- count: int = Field(..., description="Number of texts processed")
 
 
 
153
 
154
  # ============================================================================
155
  # ENDPOINTS
156
  # ============================================================================
157
 
158
- @app.get("/")
159
- async def root():
160
- """API Metadata."""
161
- return {
162
- "service": "BGE Embedding API",
163
- "status": "running",
164
- "version": "2.0.0",
165
- "authentication": "enabled" if AUTH_TOKEN else "disabled"
166
- }
167
-
168
  @app.get("/health")
169
  async def health_check():
170
- """Liveness probe to ensure model is loaded."""
171
- if not ml_context["service"]:
172
  raise HTTPException(status_code=503, detail="Service not ready")
173
-
174
  return {
175
  "status": "healthy",
176
- "dimension": ml_context["service"].embedding_dim
177
  }
178
 
179
- @app.get("/ping")
180
- async def ping():
181
- """Simple keep-alive endpoint."""
182
- return {"status": "ok", "message": "pong"}
183
-
184
  @app.post("/embed", response_model=EmbedResponse, dependencies=[Depends(verify_token)])
185
  async def create_embeddings(request: EmbedRequest):
186
  """
187
- Generate embeddings.
188
- Runs inference in a separate thread pool to prevent blocking the async event loop.
189
  """
190
  service = ml_context.get("service")
191
  executor = ml_context.get("executor")
@@ -193,30 +147,53 @@ async def create_embeddings(request: EmbedRequest):
193
  if not service or not executor:
194
  raise HTTPException(status_code=503, detail="Service unavailable")
195
 
 
 
 
 
 
 
196
  try:
197
- # Determine if input is single string or list
198
- is_single = isinstance(request.text, str)
199
- count = 1 if is_single else len(request.text)
200
 
201
- # Execute blocking model code in the thread pool
202
  loop = asyncio.get_running_loop()
203
  embeddings = await loop.run_in_executor(
204
  executor,
205
  service.generate_embedding,
206
- request.text
 
207
  )
208
 
209
  return EmbedResponse(
210
  embeddings=embeddings,
211
- dimension=service.embedding_dim,
212
  count=count
213
  )
214
 
215
  except Exception as e:
216
- logger.error(f"Inference failed: {e}", exc_info=True)
217
- raise HTTPException(status_code=500, detail="Internal processing error")
218
 
219
- @app.post("/embeddings", response_model=EmbedResponse, dependencies=[Depends(verify_token)])
220
- async def create_embeddings_alias(request: EmbedRequest):
221
- """Alias for /embed endpoint."""
222
- return await create_embeddings(request)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from pydantic import BaseModel, Field
13
 
14
+ # Import the new MultiEmbeddingService
15
+ from model_service import MultiEmbeddingService
16
 
17
  # ============================================================================
18
+ # LOGGING
19
  # ============================================================================
20
  logging.basicConfig(
21
  level=logging.INFO,
 
24
  logger = logging.getLogger("EmbedAPI")
25
 
26
  # ============================================================================
27
+ # CONFIGURATION
28
  # ============================================================================
 
29
  AUTH_TOKEN = os.getenv('AUTH_TOKEN', None)
30
  ALLOWED_ORIGINS = os.getenv('ALLOWED_ORIGINS', '*').split(',')
31
 
32
+ # Global context container
33
  ml_context = {
34
  "service": None,
35
  "executor": None
36
  }
37
 
38
  # ============================================================================
39
+ # LIFESPAN MANAGER
40
  # ============================================================================
41
  @asynccontextmanager
42
  async def lifespan(app: FastAPI):
43
+ """Lifecycle manager: Loads models and thread pool."""
44
+ # --- Startup ---
45
+ logger.info("Initializing Multi-Dimensional Embedding Service...")
46
+
47
+ # 1. Thread Pool
48
+ cpu_count = multiprocessing.cpu_count()
49
+ max_workers = cpu_count * 2
50
+ executor = ThreadPoolExecutor(max_workers=max_workers)
51
+ ml_context["executor"] = executor
52
+ logger.info(f"Thread pool ready: {max_workers} workers")
53
+
54
+ # 2. Load Models
55
  try:
56
+ service = MultiEmbeddingService()
57
+ service.load_all_models() # Loads 384, 768, 1024 models
 
 
 
 
 
 
 
 
 
 
 
58
  ml_context["service"] = service
 
59
  except Exception as e:
60
+ logger.critical(f"Critical error loading models: {e}", exc_info=True)
61
  raise e
62
 
 
63
  if AUTH_TOKEN:
64
+ logger.info("🔒 Auth enabled.")
65
+
66
+ yield
67
+
68
+ # --- Shutdown ---
69
+ logger.info("Shutting down...")
 
 
70
  if ml_context["executor"]:
71
  ml_context["executor"].shutdown(wait=True)
72
  ml_context.clear()
 
73
 
74
  # ============================================================================
75
+ # APP SETUP
76
  # ============================================================================
77
  app = FastAPI(
78
+ title="Multi-Dim Embedding API",
79
+ version="3.0.0",
80
+ lifespan=lifespan
 
 
 
81
  )
82
 
83
  app.add_middleware(
 
88
  allow_headers=["*"],
89
  )
90
 
 
 
 
91
  security = HTTPBearer(auto_error=False)
92
 
93
  async def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Security(security)):
 
94
  if not AUTH_TOKEN:
95
  return True
96
+ if not credentials or credentials.credentials != AUTH_TOKEN:
97
+ raise HTTPException(status_code=401, detail="Invalid token")
 
 
 
 
 
 
 
 
 
 
 
 
98
  return True
99
 
100
  # ============================================================================
101
+ # MODELS
102
  # ============================================================================
103
  class EmbedRequest(BaseModel):
104
+ data: Union[str, List[str]] = Field(..., description="Text string or list of strings")
105
+ dimension: int = Field(768, description="Target dimension (384, 768, or 1024)")
 
 
106
 
107
  model_config = {
108
  "json_schema_extra": {
109
  "example": {
110
+ "data": ["Hello world", "Machine learning is great"],
111
+ "dimension": 768
112
  }
113
  }
114
  }
115
 
116
  class EmbedResponse(BaseModel):
117
+ embeddings: Union[List[float], List[List[float]]] = Field(...)
118
+ dimension: int
119
+ count: int
120
+
121
+ class DeEmbedRequest(BaseModel):
122
+ vector: List[float] = Field(..., description="The embedding vector to decode")
123
 
124
  # ============================================================================
125
  # ENDPOINTS
126
  # ============================================================================
127
 
 
 
 
 
 
 
 
 
 
 
128
  @app.get("/health")
129
  async def health_check():
130
+ service = ml_context.get("service")
131
+ if not service:
132
  raise HTTPException(status_code=503, detail="Service not ready")
 
133
  return {
134
  "status": "healthy",
135
+ "loaded_dimensions": list(service.models.keys())
136
  }
137
 
 
 
 
 
 
138
  @app.post("/embed", response_model=EmbedResponse, dependencies=[Depends(verify_token)])
139
  async def create_embeddings(request: EmbedRequest):
140
  """
141
+ Generate embeddings for specific dimensions.
142
+ Supported dimensions: 384, 768, 1024.
143
  """
144
  service = ml_context.get("service")
145
  executor = ml_context.get("executor")
 
147
  if not service or not executor:
148
  raise HTTPException(status_code=503, detail="Service unavailable")
149
 
150
+ if request.dimension not in service.models:
151
+ raise HTTPException(
152
+ status_code=400,
153
+ detail=f"Dimension {request.dimension} not supported. Use 384, 768, or 1024."
154
+ )
155
+
156
  try:
157
+ is_single = isinstance(request.data, str)
158
+ count = 1 if is_single else len(request.data)
 
159
 
 
160
  loop = asyncio.get_running_loop()
161
  embeddings = await loop.run_in_executor(
162
  executor,
163
  service.generate_embedding,
164
+ request.data,
165
+ request.dimension
166
  )
167
 
168
  return EmbedResponse(
169
  embeddings=embeddings,
170
+ dimension=request.dimension,
171
  count=count
172
  )
173
 
174
  except Exception as e:
175
+ logger.error(f"Inference error: {e}")
176
+ raise HTTPException(status_code=500, detail=str(e))
177
 
178
+ @app.post("/deembed", dependencies=[Depends(verify_token)])
179
+ async def de_embed_vector(request: DeEmbedRequest):
180
+ """
181
+ Experimental: Reverse vector to text.
182
+
183
+ NOTE: Mathematically, standard embedding models (BERT, BGE) are NOT reversible
184
+ because they are lossy compression algorithms.
185
+
186
+ To retrieve text from a vector, you must use a Vector Database (retrieval),
187
+ not a direct model inversion.
188
+ """
189
+ # In a real scenario, this would look like:
190
+ # result = vector_db.search(vector=request.vector, top_k=1)
191
+ # return {"text": result.text}
192
+
193
+ raise HTTPException(
194
+ status_code=501,
195
+ detail=(
196
+ "De-embedding (Vector-to-Text) is not possible with standalone embedding models. "
197
+ "This endpoint requires a connected Vector Database to perform a similarity search."
198
+ )
199
+ )
model_service.py CHANGED
@@ -1,47 +1,41 @@
1
- import os
2
- from typing import List, Union
3
  from sentence_transformers import SentenceTransformer
 
4
 
5
- class LocalEmbeddingService:
6
- """Service for generating embeddings using a locally stored model."""
7
-
8
- def __init__(self, model_folder: str):
9
- """
10
- Initialize the service by loading the model from a local path.
11
 
12
- Args:
13
- model_folder: Path to the folder containing the saved model
14
- """
15
- if not os.path.exists(model_folder):
16
- raise FileNotFoundError(
17
- f"Model folder not found at: {model_folder}. "
18
- "Please run download_model.py first."
19
- )
20
-
21
- print(f"Loading model from {model_folder}...")
22
- self.model = SentenceTransformer(model_folder)
23
- self.embedding_dim = self.model.get_sentence_embedding_dimension()
24
- print(f"✅ Model loaded successfully. Dimension: {self.embedding_dim}")
 
 
 
 
 
 
25
 
26
- def generate_embedding(self, text: Union[str, List[str]]) -> Union[List[float], List[List[float]]]:
27
- """
28
- Generate embeddings for the given text(s).
 
29
 
30
- Args:
31
- text: A single string or list of strings to embed
32
-
33
- Returns:
34
- A single embedding (list of floats) or list of embeddings
35
- """
36
- # Encode the text with normalization for cosine similarity
37
- embeddings = self.model.encode(
38
  text,
39
  normalize_embeddings=True,
40
- convert_to_tensor=False
41
- )
42
-
43
- # Convert to list for JSON serialization
44
- if isinstance(text, str):
45
- return embeddings.tolist()
46
-
47
- return embeddings.tolist()
 
1
+ import logging
 
2
  from sentence_transformers import SentenceTransformer
3
+ import torch
4
 
5
+ logger = logging.getLogger("EmbedService")
6
+
7
+ class MultiEmbeddingService:
8
+ def __init__(self):
9
+ self.models = {}
10
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
+ # Map dimensions to local folders (downloaded in Dockerfile)
13
+ self.model_map = {
14
+ 384: "./models/bge-384",
15
+ 768: "./models/bge-768",
16
+ 1024: "./models/bge-1024"
17
+ }
18
+
19
+ def load_all_models(self):
20
+ """Loads all defined models into memory."""
21
+ for dim, path in self.model_map.items():
22
+ try:
23
+ logger.info(f"Loading {dim}-dimension model from {path}...")
24
+ model = SentenceTransformer(path, device=self.device)
25
+ model.eval() # Set to evaluation mode
26
+ self.models[dim] = model
27
+ logger.info(f"✅ Loaded model for dimension {dim}")
28
+ except Exception as e:
29
+ logger.error(f"❌ Failed to load {dim}-dim model: {e}")
30
+ # We don't raise here, so partial failures don't crash the whole app
31
 
32
+ def generate_embedding(self, text: str | list[str], dimension: int):
33
+ """Generates embeddings using the specific model for the requested dimension."""
34
+ if dimension not in self.models:
35
+ raise ValueError(f"Dimension {dimension} not supported. Available: {list(self.models.keys())}")
36
 
37
+ return self.models[dimension].encode(
 
 
 
 
 
 
 
38
  text,
39
  normalize_embeddings=True,
40
+ convert_to_numpy=True
41
+ ).tolist()