Spaces:
Sleeping
Sleeping
Patryk Studzinski committed on
Commit ·
371aac9
1
Parent(s): 9ecca89
refactor: enhance model unloading and memory management for improved GPU efficiency
Browse files- app/models/registry.py +21 -3
- app/models/transformers_model.py +24 -1
app/models/registry.py
CHANGED
|
@@ -74,11 +74,17 @@ class ModelRegistry:
|
|
| 74 |
async def get_model(self, name: str) -> BaseLLM:
|
| 75 |
config = self._config[name]
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
if name not in self._models:
|
| 78 |
model = self._create_model(name)
|
| 79 |
await model.initialize()
|
| 80 |
self._models[name] = model
|
| 81 |
|
|
|
|
| 82 |
return self._models[name]
|
| 83 |
|
| 84 |
async def _unload_model(self, name: str) -> None:
|
|
@@ -123,8 +129,20 @@ class ModelRegistry:
|
|
| 123 |
return self.get_model_info(name)
|
| 124 |
|
| 125 |
async def unload_model(self, name: str) -> Dict[str, str]:
|
| 126 |
-
"""Explicitly unload a model."""
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
registry = ModelRegistry()
|
|
|
|
| 74 |
async def get_model(self, name: str) -> BaseLLM:
    """Return the (possibly cached) model for *name*, loading it on first use.

    Only one local model is kept resident at a time: asking for a model
    other than the currently active one first unloads the active model so
    its GPU memory is released before the new one is created.
    """
    # Dict subscript raises KeyError for unknown model names up front.
    config = self._config[name]

    previous = self._active_local_model
    # Evict the resident model when the caller requests a different one.
    if previous and previous != name:
        print(f"Switching models: unloading '{previous}' to load '{name}'")
        await self._unload_model(previous)

    cached = self._models.get(name)
    if cached is None:
        cached = self._create_model(name)
        await cached.initialize()
        self._models[name] = cached

    self._active_local_model = name
    return cached
|
| 89 |
|
| 90 |
async def _unload_model(self, name: str) -> None:
|
|
|
|
| 129 |
return self.get_model_info(name)
|
| 130 |
|
| 131 |
async def unload_model(self, name: str) -> Dict[str, str]:
    """Release the named model and its memory, reporting the outcome.

    Returns a status dict: ``success`` when the model was resident and has
    been unloaded, ``error`` when no such model is currently loaded.
    """
    if name not in self._models:
        return {"status": "error", "message": f"Model '{name}' not loaded"}

    await self._unload_model(name)
    # Clear the active-model marker if we just removed the active model.
    if self._active_local_model == name:
        self._active_local_model = None
    return {"status": "success", "message": f"Model '{name}' unloaded"}
|
| 139 |
+
|
| 140 |
+
async def unload_all_models(self) -> Dict[str, str]:
    """Release every resident model, then report how many were unloaded."""
    # Snapshot the names up front — _unload_model presumably drops entries
    # from self._models, so iterating the dict directly would be unsafe.
    names = list(self._models)
    for name in names:
        await self._unload_model(name)
    self._active_local_model = None
    return {"status": "success", "message": f"Unloaded {len(names)} models"}
|
| 147 |
|
| 148 |
registry = ModelRegistry()
|
app/models/transformers_model.py
CHANGED
|
@@ -70,6 +70,18 @@ class TransformersModel(BaseLLM):
|
|
| 70 |
|
| 71 |
def _load_model(self) -> None:
|
| 72 |
"""Load model with optimal device configuration and quantization support."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
# Check GPU availability with detailed diagnostics
|
| 74 |
cuda_available = torch.cuda.is_available()
|
| 75 |
cuda_device_count = torch.cuda.device_count() if cuda_available else 0
|
|
@@ -322,6 +334,8 @@ class TransformersModel(BaseLLM):
|
|
| 322 |
|
| 323 |
async def cleanup(self) -> None:
|
| 324 |
"""Free memory."""
|
|
|
|
|
|
|
| 325 |
if self.model:
|
| 326 |
del self.model
|
| 327 |
self.model = None
|
|
@@ -330,8 +344,17 @@ class TransformersModel(BaseLLM):
|
|
| 330 |
self.tokenizer = None
|
| 331 |
self._initialized = False
|
| 332 |
|
|
|
|
|
|
|
|
|
|
| 333 |
# Clear CUDA cache if available
|
| 334 |
if torch.cuda.is_available():
|
| 335 |
torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
-
print(f"[{self.name}] Transformers Model unloaded")
|
|
|
|
| 70 |
|
| 71 |
def _load_model(self) -> None:
|
| 72 |
"""Load model with optimal device configuration and quantization support."""
|
| 73 |
+
import gc
|
| 74 |
+
|
| 75 |
+
# Set PyTorch environment variables for optimal memory management
|
| 76 |
+
if not os.getenv("PYTORCH_CUDA_ALLOC_CONF"):
|
| 77 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
| 78 |
+
print(f"[{self.name}] Set PYTORCH_CUDA_ALLOC_CONF to prevent GPU memory fragmentation")
|
| 79 |
+
|
| 80 |
+
# Force garbage collection before loading new model
|
| 81 |
+
gc.collect()
|
| 82 |
+
if torch.cuda.is_available():
|
| 83 |
+
torch.cuda.empty_cache()
|
| 84 |
+
|
| 85 |
# Check GPU availability with detailed diagnostics
|
| 86 |
cuda_available = torch.cuda.is_available()
|
| 87 |
cuda_device_count = torch.cuda.device_count() if cuda_available else 0
|
|
|
|
| 334 |
|
| 335 |
async def cleanup(self) -> None:
|
| 336 |
"""Free memory."""
|
| 337 |
+
import gc
|
| 338 |
+
|
| 339 |
if self.model:
|
| 340 |
del self.model
|
| 341 |
self.model = None
|
|
|
|
| 344 |
self.tokenizer = None
|
| 345 |
self._initialized = False
|
| 346 |
|
| 347 |
+
# Aggressive cleanup
|
| 348 |
+
gc.collect() # Force garbage collection
|
| 349 |
+
|
| 350 |
# Clear CUDA cache if available
|
| 351 |
if torch.cuda.is_available():
|
| 352 |
torch.cuda.empty_cache()
|
| 353 |
+
try:
|
| 354 |
+
# Empty reserved memory too (PyTorch 2.0+)
|
| 355 |
+
device_id = torch.cuda.current_device()
|
| 356 |
+
torch.cuda.reset_peak_memory_stats(device_id)
|
| 357 |
+
except:
|
| 358 |
+
pass
|
| 359 |
|
| 360 |
+
print(f"[{self.name}] Transformers Model unloaded and memory freed")
|