Soumik Bose committed on
Commit
9ab4c8b
·
1 Parent(s): 0ba7ee8
Files changed (1) hide show
  1. main.py +114 -156
main.py CHANGED
@@ -1,13 +1,17 @@
1
- from fastapi import FastAPI, HTTPException, Security, Depends, Header
2
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from pydantic import BaseModel, Field
5
- from typing import List, Union, Optional
6
  import os
7
  import logging
8
  import asyncio
9
- from concurrent.futures import ThreadPoolExecutor
10
  import multiprocessing
 
 
 
 
 
 
 
 
 
 
11
  from model_service import LocalEmbeddingService
12
 
13
  # ============================================================================
@@ -15,39 +19,83 @@ from model_service import LocalEmbeddingService
15
  # ============================================================================
16
  logging.basicConfig(
17
  level=logging.INFO,
18
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
19
- handlers=[
20
- logging.StreamHandler()
21
- ]
22
  )
23
- logger = logging.getLogger(__name__)
24
 
25
  # ============================================================================
26
- # CONFIGURATION
27
  # ============================================================================
28
  LOCAL_MODEL_PATH = os.getenv('MODEL_PATH', './models/bge-base-en-v1.5')
29
- AUTH_TOKEN = os.getenv('AUTH_TOKEN', None) # Set via environment variable
30
  ALLOWED_ORIGINS = os.getenv('ALLOWED_ORIGINS', '*').split(',')
31
 
32
- # Detect CPU cores for optimal workers
33
- CPU_COUNT = multiprocessing.cpu_count()
34
- MAX_WORKERS = CPU_COUNT * 2 # 2x CPU cores for I/O-bound operations
35
- logger.info(f"Detected {CPU_COUNT} CPU cores. Using {MAX_WORKERS} max workers for thread pool.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  # ============================================================================
38
- # FASTAPI APP INITIALIZATION
39
  # ============================================================================
40
  app = FastAPI(
41
  title="BGE Embedding API",
42
- description="Production-grade embedding inference API using BAAI/bge-base-en-v1.5",
43
  version="2.0.0",
 
44
  docs_url="/docs",
45
  redoc_url="/redoc"
46
  )
47
 
48
- # ============================================================================
49
- # CORS MIDDLEWARE
50
- # ============================================================================
51
  app.add_middleware(
52
  CORSMiddleware,
53
  allow_origins=ALLOWED_ORIGINS,
@@ -55,7 +103,6 @@ app.add_middleware(
55
  allow_methods=["*"],
56
  allow_headers=["*"],
57
  )
58
- logger.info(f"CORS enabled for origins: {ALLOWED_ORIGINS}")
59
 
60
  # ============================================================================
61
  # SECURITY
@@ -63,101 +110,44 @@ logger.info(f"CORS enabled for origins: {ALLOWED_ORIGINS}")
63
  security = HTTPBearer(auto_error=False)
64
 
65
  async def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Security(security)):
66
- """Verify Bearer token if AUTH_TOKEN is set."""
67
- if AUTH_TOKEN is None:
68
- # No authentication required
69
  return True
70
-
71
- if credentials is None:
72
- logger.warning("Authentication required but no token provided")
73
  raise HTTPException(
74
  status_code=401,
75
  detail="Authentication required",
76
  headers={"WWW-Authenticate": "Bearer"},
77
  )
78
-
79
  if credentials.credentials != AUTH_TOKEN:
80
- logger.warning(f"Invalid token attempt: {credentials.credentials[:10]}...")
81
  raise HTTPException(
82
  status_code=401,
83
  detail="Invalid authentication token",
84
  headers={"WWW-Authenticate": "Bearer"},
85
  )
86
-
87
  return True
88
 
89
  # ============================================================================
90
- # GLOBAL STATE
91
- # ============================================================================
92
- service = None
93
- executor = None
94
-
95
- @app.on_event("startup")
96
- async def startup_event():
97
- """Load the model on startup and initialize thread pool."""
98
- global service, executor
99
-
100
- try:
101
- logger.info("=" * 60)
102
- logger.info("Starting BGE Embedding Service")
103
- logger.info("=" * 60)
104
-
105
- # Initialize thread pool executor for non-blocking operations
106
- executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
107
- logger.info(f"Thread pool executor initialized with {MAX_WORKERS} workers")
108
-
109
- # Load model
110
- logger.info(f"Loading model from: {LOCAL_MODEL_PATH}")
111
- service = LocalEmbeddingService(LOCAL_MODEL_PATH)
112
- logger.info(f"✅ Model loaded successfully! Dimension: {service.embedding_dim}")
113
-
114
- # Authentication status
115
- if AUTH_TOKEN:
116
- logger.info("🔒 Authentication enabled (Bearer token required)")
117
- else:
118
- logger.warning("⚠️ Authentication disabled (no AUTH_TOKEN set)")
119
-
120
- logger.info("=" * 60)
121
- logger.info("Service ready to accept requests")
122
- logger.info("=" * 60)
123
-
124
- except Exception as e:
125
- logger.error(f"❌ Failed to initialize service: {e}", exc_info=True)
126
- raise
127
-
128
- @app.on_event("shutdown")
129
- async def shutdown_event():
130
- """Cleanup on shutdown."""
131
- global executor
132
- logger.info("Shutting down service...")
133
-
134
- if executor:
135
- executor.shutdown(wait=True)
136
- logger.info("Thread pool executor shut down")
137
-
138
- logger.info("Service shutdown complete")
139
-
140
- # ============================================================================
141
- # REQUEST/RESPONSE MODELS
142
  # ============================================================================
143
  class EmbedRequest(BaseModel):
144
  text: Union[str, List[str]] = Field(
145
- ...,
146
  description="Single text string or list of texts to embed"
147
  )
148
-
149
- class Config:
150
- schema_extra = {
151
  "example": {
152
- "text": "Ginger was also a smart giraffe. She knew what was wrong."
153
  }
154
  }
 
155
 
156
  class EmbedResponse(BaseModel):
157
- embeddings: Union[List[float], List[List[float]]] = Field(
158
- ...,
159
- description="Generated embedding(s)"
160
- )
161
  dimension: int = Field(..., description="Embedding dimension")
162
  count: int = Field(..., description="Number of texts processed")
163
 
@@ -167,98 +157,66 @@ class EmbedResponse(BaseModel):
167
 
168
  @app.get("/")
169
  async def root():
170
- """API information."""
171
  return {
172
- "message": "BGE Embedding API - Production Ready",
173
- "model": "BAAI/bge-base-en-v1.5",
174
- "dimension": 768,
175
  "version": "2.0.0",
176
- "authentication": "enabled" if AUTH_TOKEN else "disabled",
177
- "endpoints": {
178
- "health": "/health",
179
- "ping": "/ping",
180
- "embed": "/embed",
181
- "embeddings": "/embeddings",
182
- "docs": "/docs"
183
- }
184
  }
185
 
186
  @app.get("/health")
187
  async def health_check():
188
- """Check if the service is healthy."""
189
- if service is None:
190
- logger.error("Health check failed: service not initialized")
191
- raise HTTPException(status_code=503, detail="Service not initialized")
192
 
193
  return {
194
  "status": "healthy",
195
- "model_dimension": service.embedding_dim,
196
- "model_path": LOCAL_MODEL_PATH,
197
- "max_workers": MAX_WORKERS,
198
- "cpu_count": CPU_COUNT
199
  }
200
 
201
  @app.get("/ping")
202
  async def ping():
203
- """Simple ping endpoint for keep-alive."""
204
  return {"status": "ok", "message": "pong"}
205
 
206
- @app.post("/embed", response_model=EmbedResponse)
207
- async def create_embeddings(
208
- request: EmbedRequest,
209
- authenticated: bool = Depends(verify_token)
210
- ):
211
  """
212
- Generate embeddings for the provided text(s) - Non-blocking operation.
213
-
214
- - **text**: Single string or list of strings to embed
215
-
216
- Returns normalized 768-dimensional embeddings suitable for cosine similarity.
217
-
218
- Requires Bearer token authentication if AUTH_TOKEN is set.
219
  """
220
- if service is None:
221
- logger.error("Embedding request failed: service not initialized")
222
- raise HTTPException(status_code=503, detail="Service not initialized")
223
-
 
 
224
  try:
225
- # Determine input type and count
226
  is_single = isinstance(request.text, str)
227
  count = 1 if is_single else len(request.text)
228
-
229
- logger.info(f"Processing embedding request for {count} text(s)")
230
-
231
- # Run embedding generation in thread pool (non-blocking)
232
- loop = asyncio.get_event_loop()
233
  embeddings = await loop.run_in_executor(
234
  executor,
235
  service.generate_embedding,
236
  request.text
237
  )
238
-
239
- logger.info(f"✅ Successfully generated {count} embedding(s)")
240
-
241
  return EmbedResponse(
242
  embeddings=embeddings,
243
  dimension=service.embedding_dim,
244
  count=count
245
  )
246
-
247
  except Exception as e:
248
- logger.error(f" Embedding generation failed: {e}", exc_info=True)
249
- raise HTTPException(
250
- status_code=500,
251
- detail=f"Embedding generation failed: {str(e)}"
252
- )
253
 
254
- @app.post("/embeddings", response_model=EmbedResponse)
255
- async def create_embeddings_batch(
256
- request: EmbedRequest,
257
- authenticated: bool = Depends(verify_token)
258
- ):
259
- """
260
- Alias for /embed endpoint - Non-blocking batch embedding generation.
261
-
262
- Requires Bearer token authentication if AUTH_TOKEN is set.
263
- """
264
- return await create_embeddings(request, authenticated)
 
 
 
 
 
 
1
import asyncio
import logging
import multiprocessing
import os
import secrets
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager
from typing import Any, List, Optional, Union

from fastapi import FastAPI, HTTPException, Security, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

# Ensure this module exists in your project
from model_service import LocalEmbeddingService
16
 
17
  # ============================================================================
 
19
  # ============================================================================
20
  logging.basicConfig(
21
  level=logging.INFO,
22
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 
 
 
23
  )
24
+ logger = logging.getLogger("EmbedAPI")
25
 
26
# ============================================================================
# CONFIGURATION & STATE
# ============================================================================
# Filesystem path of the local BGE model (override via MODEL_PATH env var).
LOCAL_MODEL_PATH = os.getenv('MODEL_PATH', './models/bge-base-en-v1.5')
# Optional shared secret; when unset, the API accepts unauthenticated requests.
AUTH_TOKEN = os.getenv('AUTH_TOKEN', None)
# Comma-separated list of CORS origins; defaults to "*" (all origins).
ALLOWED_ORIGINS = os.getenv('ALLOWED_ORIGINS', '*').split(',')

# Global resource container, populated by the lifespan manager:
# "service"  -> the loaded LocalEmbeddingService instance
# "executor" -> the ThreadPoolExecutor used for blocking inference calls
ml_context = {
    "service": None,
    "executor": None
}
38
+
39
+ # ============================================================================
40
+ # LIFESPAN MANAGER (Replaces deprecated startup/shutdown events)
41
+ # ============================================================================
42
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Manage the application lifecycle.

    Startup: creates the inference thread pool and loads the embedding
    model, storing both in ``ml_context``. Shutdown: drains the pool and
    resets the context values to None — NOT ``clear()`` — so readiness
    probes that read ``ml_context["service"]`` report 503 instead of
    raising KeyError after teardown.
    """
    # --- Startup Phase ---
    logger.info("Initializing BGE Embedding Service...")

    # 1. Thread pool that hosts the blocking model-inference calls.
    try:
        cpu_count = multiprocessing.cpu_count()
        max_workers = cpu_count * 2
        ml_context["executor"] = ThreadPoolExecutor(max_workers=max_workers)
        logger.info(f"Thread pool initialized with {max_workers} workers.")
    except Exception as e:
        logger.error(f"Failed to initialize thread pool: {e}")
        raise  # bare raise preserves the original traceback

    # 2. Load the ML model from disk.
    try:
        logger.info(f"Loading model from: {LOCAL_MODEL_PATH}")
        service = LocalEmbeddingService(LOCAL_MODEL_PATH)
        ml_context["service"] = service
        logger.info(f"Model loaded successfully. Dimension: {service.embedding_dim}")
    except Exception as e:
        logger.critical(f"Critical error loading model: {e}", exc_info=True)
        raise

    # 3. Surface the auth configuration in the logs.
    if AUTH_TOKEN:
        logger.info("Authentication enabled (Bearer token required).")
    else:
        logger.warning("Authentication disabled (no AUTH_TOKEN set).")

    yield  # Application serves requests here

    # --- Shutdown Phase ---
    logger.info("Shutting down service...")
    executor = ml_context.get("executor")
    if executor:
        executor.shutdown(wait=True)
    # Reset rather than clear: keeps the keys present so endpoint guards
    # degrade to "not ready" (503) instead of crashing with KeyError.
    ml_context["service"] = None
    ml_context["executor"] = None
    logger.info("Shutdown complete.")
86
 
87
  # ============================================================================
88
+ # APP INITIALIZATION
89
  # ============================================================================
90
# Application instance; the lifespan context manages model + thread pool.
app = FastAPI(
    title="BGE Embedding API",
    description="Production-grade embedding inference API.",
    version="2.0.0",
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc"
)
98
 
 
 
 
99
# CORS: origins come from the ALLOWED_ORIGINS env var ("*" by default).
# NOTE(review): one unchanged line of this call is hidden in this diff view
# (likely allow_credentials) — confirm against the full file.
app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    allow_methods=["*"],
    allow_headers=["*"],
)
 
106
 
107
  # ============================================================================
108
  # SECURITY
 
110
  security = HTTPBearer(auto_error=False)
111
 
112
async def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Security(security)):
    """
    FastAPI dependency enforcing Bearer-token auth when AUTH_TOKEN is set.

    Returns:
        True when auth is disabled or the presented token matches.

    Raises:
        HTTPException: 401 when a token is required but missing or invalid.
    """
    if not AUTH_TOKEN:
        # Auth is opt-in: no configured token means open access.
        return True

    if not credentials:
        raise HTTPException(
            status_code=401,
            detail="Authentication required",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # Constant-time comparison prevents timing side-channel attacks on the
    # secret token (plain `!=` short-circuits on the first differing byte).
    if not secrets.compare_digest(credentials.credentials, AUTH_TOKEN):
        raise HTTPException(
            status_code=401,
            detail="Invalid authentication token",
            headers={"WWW-Authenticate": "Bearer"},
        )

    return True
131
 
132
  # ============================================================================
133
+ # DATA MODELS (Pydantic V2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  # ============================================================================
135
class EmbedRequest(BaseModel):
    # Accepts either one string or a batch of strings to embed.
    text: Union[str, List[str]] = Field(
        ...,
        description="Single text string or list of texts to embed"
    )

    # Pydantic v2 config: example payload surfaced in the OpenAPI schema.
    model_config = {
        "json_schema_extra": {
            "example": {
                "text": ["First sentence to embed.", "Second sentence to embed."]
            }
        }
    }
148
 
149
class EmbedResponse(BaseModel):
    # One vector for single-string input, a list of vectors for batch input.
    embeddings: Union[List[float], List[List[float]]] = Field(..., description="Generated vector(s)")
    # Length of each embedding vector (model-dependent).
    dimension: int = Field(..., description="Embedding dimension")
    # Number of input texts processed in this request.
    count: int = Field(..., description="Number of texts processed")
153
 
 
157
 
158
@app.get("/")
async def root():
    """Service metadata for the API landing route."""
    auth_state = "disabled"
    if AUTH_TOKEN:
        auth_state = "enabled"
    info = {
        "service": "BGE Embedding API",
        "status": "running",
        "version": "2.0.0",
        "authentication": auth_state,
    }
    return info
167
 
168
@app.get("/health")
async def health_check():
    """
    Readiness probe: reports 503 until the embedding model is loaded.

    Uses dict.get so the probe degrades to a clean 503 (rather than a
    KeyError -> 500) if the lifespan context never ran or was torn down.
    """
    service = ml_context.get("service")
    if not service:
        raise HTTPException(status_code=503, detail="Service not ready")

    return {
        "status": "healthy",
        "dimension": service.embedding_dim
    }
178
 
179
@app.get("/ping")
async def ping():
    """Lightweight keep-alive check used by uptime monitors."""
    response = {"status": "ok", "message": "pong"}
    return response
183
 
184
@app.post("/embed", response_model=EmbedResponse, dependencies=[Depends(verify_token)])
async def create_embeddings(request: EmbedRequest):
    """
    Generate embeddings for a single string or a batch of strings.

    Inference runs in the shared thread pool so the async event loop is
    never blocked.

    Raises:
        HTTPException: 503 when the model/executor are not initialized;
            500 when inference itself fails.
    """
    service = ml_context.get("service")
    executor = ml_context.get("executor")

    if not service or not executor:
        raise HTTPException(status_code=503, detail="Service unavailable")

    # Single string vs. batch determines the reported count.
    is_single = isinstance(request.text, str)
    count = 1 if is_single else len(request.text)

    try:
        # Keep the try minimal: only the blocking model call can fail here.
        loop = asyncio.get_running_loop()
        embeddings = await loop.run_in_executor(
            executor,
            service.generate_embedding,
            request.text
        )
    except Exception as e:
        logger.error(f"Inference failed: {e}", exc_info=True)
        # Chain the cause so debuggers/log handlers can see the real error.
        raise HTTPException(status_code=500, detail="Internal processing error") from e

    return EmbedResponse(
        embeddings=embeddings,
        dimension=service.embedding_dim,
        count=count
    )
 
 
 
218
 
219
@app.post("/embeddings", response_model=EmbedResponse, dependencies=[Depends(verify_token)])
async def create_embeddings_alias(request: EmbedRequest):
    """Compatibility alias that delegates to the /embed handler."""
    result = await create_embeddings(request)
    return result