Patryk Studzinski committed on
Commit
b50a781
·
1 Parent(s): a7fd202

model-lazy-loading

Browse files
README.md CHANGED
@@ -11,9 +11,9 @@ This service provides an API for generating enhanced descriptions using multiple
11
  | Model | Size | Polish Support | Type |
12
  |-------|------|----------------|------|
13
  | Bielik-1.5B | 1.5B | Excellent | Local |
 
 
14
  | PLLuM-12B | 12B | Excellent | API |
15
- | Mistral-Small-3 | 24B | Good | API |
16
- | Gemma-2-9B | 9B | Medium | API |
17
 
18
  ## API Endpoints
19
 
@@ -25,6 +25,13 @@ This service provides an API for generating enhanced descriptions using multiple
25
  | `GET` | `/health` | API health check and model status |
26
  | `GET` | `/models` | List all available models |
27
 
 
 
 
 
 
 
 
28
  ### Generation
29
 
30
  | Method | Endpoint | Description |
@@ -34,18 +41,37 @@ This service provides an API for generating enhanced descriptions using multiple
34
 
35
  ---
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ## Endpoint Details
38
 
39
  ### `GET /health`
40
 
41
- Check API status and model initialization.
42
 
43
  **Response:**
44
  ```json
45
  {
46
  "status": "ok",
47
- "local_models_initialized": true,
48
- "available_models": 4
 
49
  }
50
  ```
51
 
@@ -53,7 +79,7 @@ Check API status and model initialization.
53
 
54
  ### `GET /models`
55
 
56
- List all available models with their details.
57
 
58
  **Response:**
59
  ```json
@@ -64,13 +90,55 @@ List all available models with their details.
64
  "type": "local",
65
  "polish_support": "excellent",
66
  "size": "1.5B",
67
- "initialized": true
 
 
 
 
 
 
 
 
 
 
68
  }
69
  ]
70
  ```
71
 
72
  ---
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  ### `POST /enhance-description`
75
 
76
  Generate enhanced description using a single model.
@@ -119,7 +187,7 @@ Compare outputs from multiple models for the same input.
119
  "features": ["nawigacja", "klimatyzacja"],
120
  "condition": "bardzo dobry"
121
  },
122
- "models": ["bielik-1.5b", "pllum-12b", "gemma-2-9b"]
123
  }
124
  ```
125
 
 
11
  | Model | Size | Polish Support | Type |
12
  |-------|------|----------------|------|
13
  | Bielik-1.5B | 1.5B | Excellent | Local |
14
+ | Qwen2.5-3B | 3B | Good | Local |
15
+ | Gemma-2-2B | 2B | Medium | Local |
16
  | PLLuM-12B | 12B | Excellent | API |
 
 
17
 
18
  ## API Endpoints
19
 
 
25
  | `GET` | `/health` | API health check and model status |
26
  | `GET` | `/models` | List all available models |
27
 
28
+ ### Model Management (Lazy Loading)
29
+
30
+ | Method | Endpoint | Description |
31
+ |--------|----------|-------------|
32
+ | `POST` | `/models/{name}/load` | Load a model into memory |
33
+ | `POST` | `/models/{name}/unload` | Unload a model from memory |
34
+
35
  ### Generation
36
 
37
  | Method | Endpoint | Description |
 
41
 
42
  ---
43
 
44
+ ## Lazy Loading
45
+
46
+ Models are **not loaded at startup** to conserve memory. Instead:
47
+ - Models are loaded **on first request** (lazy loading)
48
+ - Only **one local model** is loaded at a time
49
+ - Switching to a different local model **automatically unloads** the previous one
50
+ - API models (PLLuM) don't affect local model memory
51
+
52
+ ### Example: Load/Unload Flow
53
+ ```
54
+ 1. Request with bielik-1.5b → Loads Bielik (first use)
55
+ 2. Request with qwen2.5-3b → Unloads Bielik, loads Qwen
56
+ 3. Request with pllum-12b → Qwen stays loaded (API model doesn't affect local)
57
+ 4. POST /models/qwen2.5-3b/unload → Manually free memory
58
+ ```
59
+
60
+ ---
61
+
62
  ## Endpoint Details
63
 
64
  ### `GET /health`
65
 
66
+ Check API status and loaded models.
67
 
68
  **Response:**
69
  ```json
70
  {
71
  "status": "ok",
72
+ "available_models": 4,
73
+ "loaded_models": ["bielik-1.5b"],
74
+ "active_local_model": "bielik-1.5b"
75
  }
76
  ```
77
 
 
79
 
80
  ### `GET /models`
81
 
82
+ List all available models with their load status.
83
 
84
  **Response:**
85
  ```json
 
90
  "type": "local",
91
  "polish_support": "excellent",
92
  "size": "1.5B",
93
+ "loaded": true,
94
+ "active": true
95
+ },
96
+ {
97
+ "name": "qwen2.5-3b",
98
+ "model_id": "Qwen/Qwen2.5-3B-Instruct",
99
+ "type": "local",
100
+ "polish_support": "good",
101
+ "size": "3B",
102
+ "loaded": false,
103
+ "active": false
104
  }
105
  ]
106
  ```
107
 
108
  ---
109
 
110
+ ### `POST /models/{name}/load`
111
+
112
+ Explicitly load a model. For local models, unloads the previous one first.
113
+
114
+ **Response:**
115
+ ```json
116
+ {
117
+ "status": "loaded",
118
+ "model": {
119
+ "name": "bielik-1.5b",
120
+ "loaded": true,
121
+ "active": true
122
+ }
123
+ }
124
+ ```
125
+
126
+ ---
127
+
128
+ ### `POST /models/{name}/unload`
129
+
130
+ Explicitly unload a model to free memory.
131
+
132
+ **Response:**
133
+ ```json
134
+ {
135
+ "status": "unloaded",
136
+ "model": "bielik-1.5b"
137
+ }
138
+ ```
139
+
140
+ ---
141
+
142
  ### `POST /enhance-description`
143
 
144
  Generate enhanced description using a single model.
 
187
  "features": ["nawigacja", "klimatyzacja"],
188
  "condition": "bardzo dobry"
189
  },
190
+ "models": ["bielik-1.5b", "qwen2.5-3b", "gemma-2-2b", "pllum-12b"]
191
  }
192
  ```
193
 
app/main.py CHANGED
@@ -38,14 +38,12 @@ app.add_middleware(
38
 
39
  @app.on_event("startup")
40
  async def startup_event():
41
- """Initialize local models at startup."""
42
- print("Starting up and initializing local models...")
43
- try:
44
- await registry.initialize_local_models()
45
- print("Local models initialized successfully.")
46
- except Exception as e:
47
- print(f"Error during model initialization: {e}")
48
- raise
49
 
50
  # --- Helper function to load domain logic ---
51
  def get_domain_config(domain: str):
@@ -65,18 +63,49 @@ async def read_root():
65
  async def health_check():
66
  """Check API health and model status."""
67
  models = registry.list_models()
68
- local_initialized = any(m["initialized"] for m in models if m["type"] == "local")
 
69
  return {
70
  "status": "ok",
71
- "local_models_initialized": local_initialized,
72
  "available_models": len(models),
 
 
73
  }
74
 
75
  @app.get("/models", response_model=List[ModelInfo])
76
  async def list_models():
77
- """List all available models."""
78
  return registry.list_models()
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  @app.post("/enhance-description", response_model=EnhancedDescriptionResponse)
81
  async def enhance_description(
82
  domain: str = Body(..., embed=True),
 
38
 
39
  @app.on_event("startup")
40
  async def startup_event():
41
+ """
42
+ Startup event - models are loaded lazily on first request.
43
+ No models are pre-loaded to conserve memory.
44
+ """
45
+ print("Application started. Models will be loaded lazily on first request.")
46
+ print(f"Available models: {registry.get_available_model_names()}")
 
 
47
 
48
  # --- Helper function to load domain logic ---
49
  def get_domain_config(domain: str):
 
63
  async def health_check():
64
  """Check API health and model status."""
65
  models = registry.list_models()
66
+ loaded_models = registry.get_loaded_models()
67
+ active_model = registry.get_active_model()
68
  return {
69
  "status": "ok",
 
70
  "available_models": len(models),
71
+ "loaded_models": loaded_models,
72
+ "active_local_model": active_model,
73
  }
74
 
75
  @app.get("/models", response_model=List[ModelInfo])
76
  async def list_models():
77
+ """List all available models with their load status."""
78
  return registry.list_models()
79
 
80
+ @app.post("/models/{model_name}/load")
81
+ async def load_model(model_name: str):
82
+ """
83
+ Explicitly load a model into memory.
84
+ For local models: unloads any previously loaded local model first.
85
+ """
86
+ if model_name not in registry.get_available_model_names():
87
+ raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}")
88
+
89
+ try:
90
+ info = await registry.load_model(model_name)
91
+ return {"status": "loaded", "model": info}
92
+ except Exception as e:
93
+ raise HTTPException(status_code=500, detail=f"Failed to load model: {str(e)}")
94
+
95
+ @app.post("/models/{model_name}/unload")
96
+ async def unload_model(model_name: str):
97
+ """
98
+ Explicitly unload a model from memory to free resources.
99
+ """
100
+ if model_name not in registry.get_available_model_names():
101
+ raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}")
102
+
103
+ try:
104
+ result = await registry.unload_model(model_name)
105
+ return result
106
+ except Exception as e:
107
+ raise HTTPException(status_code=500, detail=f"Failed to unload model: {str(e)}")
108
+
109
  @app.post("/enhance-description", response_model=EnhancedDescriptionResponse)
110
  async def enhance_description(
111
  domain: str = Body(..., embed=True),
app/models/huggingface_local.py CHANGED
@@ -131,3 +131,19 @@ class HuggingFaceLocal(BaseLLM):
131
  "initialized": self._initialized,
132
  "device": self.device
133
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  "initialized": self._initialized,
132
  "device": self.device
133
  }
134
+
135
+ async def cleanup(self) -> None:
136
+ """Release model from memory."""
137
+ if self.pipeline is not None:
138
+ del self.pipeline
139
+ self.pipeline = None
140
+ if self.tokenizer is not None:
141
+ del self.tokenizer
142
+ self.tokenizer = None
143
+ self._initialized = False
144
+
145
+ # Force CUDA cache clear if available
146
+ if torch.cuda.is_available():
147
+ torch.cuda.empty_cache()
148
+
149
+ print(f"[{self.name}] Model unloaded from memory")
app/models/registry.py CHANGED
@@ -1,8 +1,10 @@
1
  """
2
  Model Registry - Central configuration and factory for all LLM models.
 
3
  """
4
 
5
  import os
 
6
  from typing import Dict, List, Any, Optional
7
 
8
  from app.models.base_llm import BaseLLM
@@ -10,7 +12,7 @@ from app.models.huggingface_local import HuggingFaceLocal
10
  from app.models.huggingface_inference_api import HuggingFaceInferenceAPI
11
 
12
 
13
- # Model configuration
14
  MODEL_CONFIG = {
15
  "bielik-1.5b": {
16
  "id": "speakleash/Bielik-1.5B-v3.0-Instruct",
@@ -18,24 +20,24 @@ MODEL_CONFIG = {
18
  "polish_support": "excellent",
19
  "size": "1.5B",
20
  },
 
 
 
 
 
 
 
 
 
 
 
 
21
  "pllum-12b": {
22
  "id": "CYFRAGOVPL/PLLuM-12B-instruct",
23
  "type": "inference_api",
24
  "polish_support": "excellent",
25
  "size": "12B",
26
  },
27
- "mistral-small-3": {
28
- "id": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
29
- "type": "inference_api",
30
- "polish_support": "good",
31
- "size": "24B",
32
- },
33
- "gemma-2-9b": {
34
- "id": "google/gemma-2-9b-it",
35
- "type": "inference_api",
36
- "polish_support": "medium",
37
- "size": "9B",
38
- },
39
  }
40
 
41
  # For local model override (when model is pre-downloaded in container)
@@ -45,12 +47,14 @@ LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH", "/app/pretrain_model")
45
  class ModelRegistry:
46
  """
47
  Central registry for managing all LLM models.
48
- Handles model instantiation, initialization, and access.
 
49
  """
50
 
51
  def __init__(self):
52
  self._models: Dict[str, BaseLLM] = {}
53
  self._config = MODEL_CONFIG.copy()
 
54
 
55
  def _create_model(self, name: str) -> BaseLLM:
56
  """Factory method to create model instance."""
@@ -80,44 +84,119 @@ class ModelRegistry:
80
  else:
81
  raise ValueError(f"Unknown model type: {model_type}")
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  async def get_model(self, name: str) -> BaseLLM:
84
- """Get or create and initialize a model."""
 
 
 
 
 
 
85
 
86
- if name not in self._models:
87
- model = self._create_model(name)
88
- await model.initialize()
89
- self._models[name] = model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  return self._models[name]
92
 
93
- async def initialize_model(self, name: str) -> None:
94
- """Pre-initialize a specific model."""
 
 
 
95
  await self.get_model(name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- async def initialize_local_models(self) -> None:
98
- """Initialize all local models at startup."""
99
- for name, config in self._config.items():
100
- if config["type"] == "local":
101
- await self.initialize_model(name)
 
 
 
 
 
 
 
 
 
 
102
 
103
  def list_models(self) -> List[Dict[str, Any]]:
104
  """List all available models with their info."""
105
- models = []
106
- for name, config in self._config.items():
107
- model_info = {
108
- "name": name,
109
- "model_id": config["id"],
110
- "type": config["type"],
111
- "polish_support": config["polish_support"],
112
- "size": config["size"],
113
- "initialized": name in self._models and self._models[name].is_initialized,
114
- }
115
- models.append(model_info)
116
- return models
117
 
118
  def get_available_model_names(self) -> List[str]:
119
  """Get list of available model names."""
120
  return list(self._config.keys())
 
 
 
 
 
 
 
 
121
 
122
 
123
  # Global registry instance
 
1
  """
2
  Model Registry - Central configuration and factory for all LLM models.
3
+ Supports lazy loading and on/off mechanism for memory management.
4
  """
5
 
6
  import os
7
+ import gc
8
  from typing import Dict, List, Any, Optional
9
 
10
  from app.models.base_llm import BaseLLM
 
12
  from app.models.huggingface_inference_api import HuggingFaceInferenceAPI
13
 
14
 
15
+ # Model configuration - 3 local + 1 API for Polish language comparison
16
  MODEL_CONFIG = {
17
  "bielik-1.5b": {
18
  "id": "speakleash/Bielik-1.5B-v3.0-Instruct",
 
20
  "polish_support": "excellent",
21
  "size": "1.5B",
22
  },
23
+ "qwen2.5-3b": {
24
+ "id": "Qwen/Qwen2.5-3B-Instruct",
25
+ "type": "local",
26
+ "polish_support": "good",
27
+ "size": "3B",
28
+ },
29
+ "gemma-2-2b": {
30
+ "id": "google/gemma-2-2b-it",
31
+ "type": "local",
32
+ "polish_support": "medium",
33
+ "size": "2B",
34
+ },
35
  "pllum-12b": {
36
  "id": "CYFRAGOVPL/PLLuM-12B-instruct",
37
  "type": "inference_api",
38
  "polish_support": "excellent",
39
  "size": "12B",
40
  },
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
 
43
  # For local model override (when model is pre-downloaded in container)
 
47
  class ModelRegistry:
48
  """
49
  Central registry for managing all LLM models.
50
+ Supports lazy loading (load on first request) and unloading for memory management.
51
+ Only one local model is loaded at a time to conserve memory.
52
  """
53
 
54
  def __init__(self):
55
  self._models: Dict[str, BaseLLM] = {}
56
  self._config = MODEL_CONFIG.copy()
57
+ self._active_local_model: Optional[str] = None
58
 
59
  def _create_model(self, name: str) -> BaseLLM:
60
  """Factory method to create model instance."""
 
84
  else:
85
  raise ValueError(f"Unknown model type: {model_type}")
86
 
87
+ async def _unload_model(self, name: str) -> None:
88
+ """Unload a model from memory."""
89
+ if name in self._models:
90
+ model = self._models[name]
91
+ # Call cleanup if available
92
+ if hasattr(model, 'cleanup'):
93
+ await model.cleanup()
94
+ del self._models[name]
95
+ gc.collect() # Force garbage collection
96
+ print(f"Model '{name}' unloaded from memory.")
97
+
98
+ async def _unload_all_local_models(self) -> None:
99
+ """Unload all local models to free memory."""
100
+ local_models = [
101
+ name for name, config in self._config.items()
102
+ if config["type"] == "local" and name in self._models
103
+ ]
104
+ for name in local_models:
105
+ await self._unload_model(name)
106
+ self._active_local_model = None
107
+
108
  async def get_model(self, name: str) -> BaseLLM:
109
+ """
110
+ Get a model (lazy loading).
111
+ For local models: unloads any previously loaded local model first.
112
+ For API models: always available without affecting local models.
113
+ """
114
+ if name not in self._config:
115
+ raise ValueError(f"Unknown model: {name}")
116
 
117
+ config = self._config[name]
118
+
119
+ # If it's a local model, ensure only one is loaded at a time
120
+ if config["type"] == "local":
121
+ # Unload current local model if different
122
+ if self._active_local_model and self._active_local_model != name:
123
+ print(f"Switching from '{self._active_local_model}' to '{name}'...")
124
+ await self._unload_model(self._active_local_model)
125
+
126
+ # Load the requested model if not already loaded
127
+ if name not in self._models:
128
+ print(f"Loading model '{name}'...")
129
+ model = self._create_model(name)
130
+ await model.initialize()
131
+ self._models[name] = model
132
+ self._active_local_model = name
133
+ print(f"Model '{name}' loaded successfully.")
134
+
135
+ # For API models, just create/return (no memory concern)
136
+ elif config["type"] == "inference_api":
137
+ if name not in self._models:
138
+ print(f"Initializing API model '{name}'...")
139
+ model = self._create_model(name)
140
+ await model.initialize()
141
+ self._models[name] = model
142
 
143
  return self._models[name]
144
 
145
+ async def load_model(self, name: str) -> Dict[str, Any]:
146
+ """
147
+ Explicitly load a model (unloads other local models first).
148
+ Returns model info.
149
+ """
150
  await self.get_model(name)
151
+ return self.get_model_info(name)
152
+
153
+ async def unload_model(self, name: str) -> Dict[str, str]:
154
+ """
155
+ Explicitly unload a model from memory.
156
+ """
157
+ if name not in self._config:
158
+ raise ValueError(f"Unknown model: {name}")
159
+
160
+ if name not in self._models:
161
+ return {"status": "not_loaded", "model": name}
162
+
163
+ await self._unload_model(name)
164
+ if self._active_local_model == name:
165
+ self._active_local_model = None
166
+
167
+ return {"status": "unloaded", "model": name}
168
 
169
+ def get_model_info(self, name: str) -> Dict[str, Any]:
170
+ """Get info about a specific model."""
171
+ if name not in self._config:
172
+ raise ValueError(f"Unknown model: {name}")
173
+
174
+ config = self._config[name]
175
+ return {
176
+ "name": name,
177
+ "model_id": config["id"],
178
+ "type": config["type"],
179
+ "polish_support": config["polish_support"],
180
+ "size": config["size"],
181
+ "loaded": name in self._models,
182
+ "active": name == self._active_local_model if config["type"] == "local" else None,
183
+ }
184
 
185
  def list_models(self) -> List[Dict[str, Any]]:
186
  """List all available models with their info."""
187
+ return [self.get_model_info(name) for name in self._config.keys()]
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  def get_available_model_names(self) -> List[str]:
190
  """Get list of available model names."""
191
  return list(self._config.keys())
192
+
193
+ def get_active_model(self) -> Optional[str]:
194
+ """Get the currently active (loaded) local model name."""
195
+ return self._active_local_model
196
+
197
+ def get_loaded_models(self) -> List[str]:
198
+ """Get list of currently loaded model names."""
199
+ return list(self._models.keys())
200
 
201
 
202
  # Global registry instance
app/schemas/schemas.py CHANGED
@@ -15,7 +15,8 @@ class ModelInfo(BaseModel):
15
  type: str
16
  polish_support: str
17
  size: str
18
- initialized: bool
 
19
 
20
 
21
  class CompareRequest(BaseModel):
 
15
  type: str
16
  polish_support: str
17
  size: str
18
+ loaded: bool
19
+ active: Optional[bool] = None # Only for local models
20
 
21
 
22
  class CompareRequest(BaseModel):