Spaces:
Sleeping
Sleeping
Patryk Studzinski committed on
Commit ·
7c2f84b
1
Parent(s): 812e56d
feat: enable GPU acceleration for Bielik GGUF models
Browse files- Add n_gpu_layers parameter to LlamaCppModel for full/partial GPU offloading
- Configure both bielik-1.5b and bielik-11b with n_gpu_layers=-1 (full GPU)
- Update requirements.txt to CUDA 12.1 torch wheels (cu121)
- Add GPU detection and reporting in startup and /health endpoint
- Llama-3.1-8b continues using Inference API (no changes)
- app/main.py +22 -0
- app/models/llama_cpp_model.py +7 -4
- app/models/registry.py +7 -3
- requirements.txt +2 -4
app/main.py
CHANGED
|
@@ -68,6 +68,16 @@ async def startup_event():
|
|
| 68 |
"""
|
| 69 |
print("Application started. Models will be loaded lazily on first request.")
|
| 70 |
print(f"Available models: {registry.get_available_model_names()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# --- Helper function to load domain logic ---
|
| 73 |
def get_domain_config(domain: str):
|
|
@@ -89,11 +99,23 @@ async def health_check():
|
|
| 89 |
models = registry.list_models()
|
| 90 |
loaded_models = registry.get_loaded_models()
|
| 91 |
active_model = registry.get_active_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
return {
|
| 93 |
"status": "ok",
|
| 94 |
"available_models": len(models),
|
| 95 |
"loaded_models": loaded_models,
|
| 96 |
"active_local_model": active_model,
|
|
|
|
|
|
|
| 97 |
}
|
| 98 |
|
| 99 |
@app.get("/models", response_model=List[ModelInfo])
|
|
|
|
| 68 |
"""
|
| 69 |
print("Application started. Models will be loaded lazily on first request.")
|
| 70 |
print(f"Available models: {registry.get_available_model_names()}")
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
import torch
|
| 74 |
+
gpu_available = torch.cuda.is_available()
|
| 75 |
+
gpu_name = torch.cuda.get_device_name(0) if gpu_available else "N/A"
|
| 76 |
+
print(f"GPU available: {gpu_available}, Device: {gpu_name}")
|
| 77 |
+
except ImportError:
|
| 78 |
+
print("PyTorch not available for GPU check")
|
| 79 |
+
except Exception as e:
|
| 80 |
+
print(f"GPU check failed: {e}")
|
| 81 |
|
| 82 |
# --- Helper function to load domain logic ---
|
| 83 |
def get_domain_config(domain: str):
|
|
|
|
| 99 |
models = registry.list_models()
|
| 100 |
loaded_models = registry.get_loaded_models()
|
| 101 |
active_model = registry.get_active_model()
|
| 102 |
+
|
| 103 |
+
gpu_available = False
|
| 104 |
+
gpu_name = "N/A"
|
| 105 |
+
try:
|
| 106 |
+
import torch
|
| 107 |
+
gpu_available = torch.cuda.is_available()
|
| 108 |
+
gpu_name = torch.cuda.get_device_name(0) if gpu_available else "N/A"
|
| 109 |
+
except:
|
| 110 |
+
pass
|
| 111 |
+
|
| 112 |
return {
|
| 113 |
"status": "ok",
|
| 114 |
"available_models": len(models),
|
| 115 |
"loaded_models": loaded_models,
|
| 116 |
"active_local_model": active_model,
|
| 117 |
+
"gpu_available": gpu_available,
|
| 118 |
+
"gpu_device": gpu_name,
|
| 119 |
}
|
| 120 |
|
| 121 |
@app.get("/models", response_model=List[ModelInfo])
|
app/models/llama_cpp_model.py
CHANGED
|
@@ -23,11 +23,12 @@ class LlamaCppModel(BaseLLM):
|
|
| 23 |
Provides significant speedups on CPU compared to Transformers.
|
| 24 |
"""
|
| 25 |
|
| 26 |
-
def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None):
|
| 27 |
super().__init__(name, model_id)
|
| 28 |
self.model_path = model_path
|
| 29 |
self.n_ctx = n_ctx
|
| 30 |
self.grammar_path = grammar_path
|
|
|
|
| 31 |
self.default_grammar = None # Will be loaded from file if provided
|
| 32 |
self.llm = None
|
| 33 |
self._response_cache = {}
|
|
@@ -49,7 +50,7 @@ class LlamaCppModel(BaseLLM):
|
|
| 49 |
try:
|
| 50 |
print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
|
| 51 |
print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
|
| 52 |
-
print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}")
|
| 53 |
|
| 54 |
# Load model in a thread to avoid blocking event loop
|
| 55 |
# Enable verbose to see llama.cpp errors
|
|
@@ -58,11 +59,12 @@ class LlamaCppModel(BaseLLM):
|
|
| 58 |
model_path=self.model_path,
|
| 59 |
n_ctx=self.n_ctx,
|
| 60 |
n_threads=os.cpu_count(), # Use all available cores
|
|
|
|
| 61 |
verbose=True # Enable verbose to see loading errors
|
| 62 |
)
|
| 63 |
|
| 64 |
self._initialized = True
|
| 65 |
-
print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
|
| 66 |
|
| 67 |
# Load grammar file if provided
|
| 68 |
if self.grammar_path:
|
|
@@ -165,7 +167,8 @@ class LlamaCppModel(BaseLLM):
|
|
| 165 |
"context_length": self.n_ctx,
|
| 166 |
"loaded": self._initialized,
|
| 167 |
"model_path": self.model_path,
|
| 168 |
-
"has_grammar": self.default_grammar is not None
|
|
|
|
| 169 |
}
|
| 170 |
|
| 171 |
async def cleanup(self) -> None:
|
|
|
|
| 23 |
Provides significant speedups on CPU compared to Transformers.
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None, n_gpu_layers: int = -1):
|
| 27 |
super().__init__(name, model_id)
|
| 28 |
self.model_path = model_path
|
| 29 |
self.n_ctx = n_ctx
|
| 30 |
self.grammar_path = grammar_path
|
| 31 |
+
self.n_gpu_layers = n_gpu_layers
|
| 32 |
self.default_grammar = None # Will be loaded from file if provided
|
| 33 |
self.llm = None
|
| 34 |
self._response_cache = {}
|
|
|
|
| 50 |
try:
|
| 51 |
print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
|
| 52 |
print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
|
| 53 |
+
print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}, n_gpu_layers={self.n_gpu_layers}")
|
| 54 |
|
| 55 |
# Load model in a thread to avoid blocking event loop
|
| 56 |
# Enable verbose to see llama.cpp errors
|
|
|
|
| 59 |
model_path=self.model_path,
|
| 60 |
n_ctx=self.n_ctx,
|
| 61 |
n_threads=os.cpu_count(), # Use all available cores
|
| 62 |
+
n_gpu_layers=self.n_gpu_layers, # GPU layer offloading
|
| 63 |
verbose=True # Enable verbose to see loading errors
|
| 64 |
)
|
| 65 |
|
| 66 |
self._initialized = True
|
| 67 |
+
print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx}, n_gpu_layers={self.n_gpu_layers})")
|
| 68 |
|
| 69 |
# Load grammar file if provided
|
| 70 |
if self.grammar_path:
|
|
|
|
| 167 |
"context_length": self.n_ctx,
|
| 168 |
"loaded": self._initialized,
|
| 169 |
"model_path": self.model_path,
|
| 170 |
+
"has_grammar": self.default_grammar is not None,
|
| 171 |
+
"gpu_layers": self.n_gpu_layers
|
| 172 |
}
|
| 173 |
|
| 174 |
async def cleanup(self) -> None:
|
app/models/registry.py
CHANGED
|
@@ -20,7 +20,8 @@ MODEL_CONFIG = {
|
|
| 20 |
"type": "gguf",
|
| 21 |
"size": "1.7 GB",
|
| 22 |
"polish_support": "excellent",
|
| 23 |
-
"grammar_file": "answers.gbnf"
|
|
|
|
| 24 |
},
|
| 25 |
"bielik-11b-gguf": {
|
| 26 |
"id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
|
|
@@ -29,7 +30,8 @@ MODEL_CONFIG = {
|
|
| 29 |
"type": "gguf",
|
| 30 |
"size": "7.2 GB",
|
| 31 |
"polish_support": "excellent",
|
| 32 |
-
"grammar_file": "answers.gbnf"
|
|
|
|
| 33 |
},
|
| 34 |
"llama-3.1-8b": {
|
| 35 |
"id": "meta-llama/Llama-3.1-8B-Instruct",
|
|
@@ -81,12 +83,14 @@ class ModelRegistry:
|
|
| 81 |
|
| 82 |
# Przekazanie gramatyki do modelu
|
| 83 |
grammar_path = config.get("grammar_file")
|
|
|
|
| 84 |
|
| 85 |
return LlamaCppModel(
|
| 86 |
name=name,
|
| 87 |
model_id=model_id,
|
| 88 |
model_path=full_path,
|
| 89 |
-
grammar_path=grammar_path
|
|
|
|
| 90 |
)
|
| 91 |
|
| 92 |
async def get_model(self, name: str) -> BaseLLM:
|
|
|
|
| 20 |
"type": "gguf",
|
| 21 |
"size": "1.7 GB",
|
| 22 |
"polish_support": "excellent",
|
| 23 |
+
"grammar_file": "answers.gbnf",
|
| 24 |
+
"n_gpu_layers": -1
|
| 25 |
},
|
| 26 |
"bielik-11b-gguf": {
|
| 27 |
"id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
|
|
|
|
| 30 |
"type": "gguf",
|
| 31 |
"size": "7.2 GB",
|
| 32 |
"polish_support": "excellent",
|
| 33 |
+
"grammar_file": "answers.gbnf",
|
| 34 |
+
"n_gpu_layers": -1
|
| 35 |
},
|
| 36 |
"llama-3.1-8b": {
|
| 37 |
"id": "meta-llama/Llama-3.1-8B-Instruct",
|
|
|
|
| 83 |
|
| 84 |
# Przekazanie gramatyki do modelu
|
| 85 |
grammar_path = config.get("grammar_file")
|
| 86 |
+
n_gpu_layers = config.get("n_gpu_layers", -1)
|
| 87 |
|
| 88 |
return LlamaCppModel(
|
| 89 |
name=name,
|
| 90 |
model_id=model_id,
|
| 91 |
model_path=full_path,
|
| 92 |
+
grammar_path=grammar_path,
|
| 93 |
+
n_gpu_layers=n_gpu_layers
|
| 94 |
)
|
| 95 |
|
| 96 |
async def get_model(self, name: str) -> BaseLLM:
|
requirements.txt
CHANGED
|
@@ -5,8 +5,6 @@ accelerate==0.25.0
|
|
| 5 |
huggingface_hub>=0.26.0
|
| 6 |
pydantic==2.5.0
|
| 7 |
importlib-metadata
|
| 8 |
-
--extra-index-url https://download.pytorch.org/whl/
|
| 9 |
torch>=2.1.0
|
| 10 |
-
|
| 11 |
-
# bitsandbytes is optional for 8-bit quantization (CPU optimization)
|
| 12 |
-
# bitsandbytes==0.49.0
|
|
|
|
| 5 |
huggingface_hub>=0.26.0
|
| 6 |
pydantic==2.5.0
|
| 7 |
importlib-metadata
|
| 8 |
+
--extra-index-url https://download.pytorch.org/whl/cu121
|
| 9 |
torch>=2.1.0
|
| 10 |
+
|
|
|
|
|
|