Spaces:
Sleeping
Sleeping
Patryk Studzinski committed on
Commit ·
7c2f84b
1
Parent(s): 812e56d
feat: enable GPU acceleration for Bielik GGUF models
Browse files- Add n_gpu_layers parameter to LlamaCppModel for full/partial GPU offloading
- Configure both bielik-1.5b and bielik-11b with n_gpu_layers=-1 (full GPU)
- Update requirements.txt to CUDA 12.1 torch wheels (cu121)
- Add GPU detection and reporting in startup and /health endpoint
- Llama-3.1-8b continues using Inference API (no changes)
- app/main.py +22 -0
- app/models/llama_cpp_model.py +7 -4
- app/models/registry.py +7 -3
- requirements.txt +2 -4
app/main.py
CHANGED
|
@@ -68,6 +68,16 @@ async def startup_event():
|
|
| 68 |
"""
|
| 69 |
print("Application started. Models will be loaded lazily on first request.")
|
| 70 |
print(f"Available models: {registry.get_available_model_names()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# --- Helper function to load domain logic ---
|
| 73 |
def get_domain_config(domain: str):
|
|
@@ -89,11 +99,23 @@ async def health_check():
|
|
| 89 |
models = registry.list_models()
|
| 90 |
loaded_models = registry.get_loaded_models()
|
| 91 |
active_model = registry.get_active_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
return {
|
| 93 |
"status": "ok",
|
| 94 |
"available_models": len(models),
|
| 95 |
"loaded_models": loaded_models,
|
| 96 |
"active_local_model": active_model,
|
|
|
|
|
|
|
| 97 |
}
|
| 98 |
|
| 99 |
@app.get("/models", response_model=List[ModelInfo])
|
|
|
|
| 68 |
"""
|
| 69 |
print("Application started. Models will be loaded lazily on first request.")
|
| 70 |
print(f"Available models: {registry.get_available_model_names()}")
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
import torch
|
| 74 |
+
gpu_available = torch.cuda.is_available()
|
| 75 |
+
gpu_name = torch.cuda.get_device_name(0) if gpu_available else "N/A"
|
| 76 |
+
print(f"GPU available: {gpu_available}, Device: {gpu_name}")
|
| 77 |
+
except ImportError:
|
| 78 |
+
print("PyTorch not available for GPU check")
|
| 79 |
+
except Exception as e:
|
| 80 |
+
print(f"GPU check failed: {e}")
|
| 81 |
|
| 82 |
# --- Helper function to load domain logic ---
|
| 83 |
def get_domain_config(domain: str):
|
|
|
|
| 99 |
models = registry.list_models()
|
| 100 |
loaded_models = registry.get_loaded_models()
|
| 101 |
active_model = registry.get_active_model()
|
| 102 |
+
|
| 103 |
+
gpu_available = False
|
| 104 |
+
gpu_name = "N/A"
|
| 105 |
+
try:
|
| 106 |
+
import torch
|
| 107 |
+
gpu_available = torch.cuda.is_available()
|
| 108 |
+
gpu_name = torch.cuda.get_device_name(0) if gpu_available else "N/A"
|
| 109 |
+
except:
|
| 110 |
+
pass
|
| 111 |
+
|
| 112 |
return {
|
| 113 |
"status": "ok",
|
| 114 |
"available_models": len(models),
|
| 115 |
"loaded_models": loaded_models,
|
| 116 |
"active_local_model": active_model,
|
| 117 |
+
"gpu_available": gpu_available,
|
| 118 |
+
"gpu_device": gpu_name,
|
| 119 |
}
|
| 120 |
|
| 121 |
@app.get("/models", response_model=List[ModelInfo])
|
app/models/llama_cpp_model.py
CHANGED
|
@@ -23,11 +23,12 @@ class LlamaCppModel(BaseLLM):
|
|
| 23 |
Provides significant speedups on CPU compared to Transformers.
|
| 24 |
"""
|
| 25 |
|
| 26 |
-
def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None):
|
| 27 |
super().__init__(name, model_id)
|
| 28 |
self.model_path = model_path
|
| 29 |
self.n_ctx = n_ctx
|
| 30 |
self.grammar_path = grammar_path
|
|
|
|
| 31 |
self.default_grammar = None # Will be loaded from file if provided
|
| 32 |
self.llm = None
|
| 33 |
self._response_cache = {}
|
|
@@ -49,7 +50,7 @@ class LlamaCppModel(BaseLLM):
|
|
| 49 |
try:
|
| 50 |
print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
|
| 51 |
print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
|
| 52 |
-
print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}")
|
| 53 |
|
| 54 |
# Load model in a thread to avoid blocking event loop
|
| 55 |
# Enable verbose to see llama.cpp errors
|
|
@@ -58,11 +59,12 @@ class LlamaCppModel(BaseLLM):
|
|
| 58 |
model_path=self.model_path,
|
| 59 |
n_ctx=self.n_ctx,
|
| 60 |
n_threads=os.cpu_count(), # Use all available cores
|
|
|
|
| 61 |
verbose=True # Enable verbose to see loading errors
|
| 62 |
)
|
| 63 |
|
| 64 |
self._initialized = True
|
| 65 |
-
print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
|
| 66 |
|
| 67 |
# Load grammar file if provided
|
| 68 |
if self.grammar_path:
|
|
@@ -165,7 +167,8 @@ class LlamaCppModel(BaseLLM):
|
|
| 165 |
"context_length": self.n_ctx,
|
| 166 |
"loaded": self._initialized,
|
| 167 |
"model_path": self.model_path,
|
| 168 |
-
"has_grammar": self.default_grammar is not None
|
|
|
|
| 169 |
}
|
| 170 |
|
| 171 |
async def cleanup(self) -> None:
|
|
|
|
| 23 |
Provides significant speedups on CPU compared to Transformers.
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None, n_gpu_layers: int = -1):
|
| 27 |
super().__init__(name, model_id)
|
| 28 |
self.model_path = model_path
|
| 29 |
self.n_ctx = n_ctx
|
| 30 |
self.grammar_path = grammar_path
|
| 31 |
+
self.n_gpu_layers = n_gpu_layers
|
| 32 |
self.default_grammar = None # Will be loaded from file if provided
|
| 33 |
self.llm = None
|
| 34 |
self._response_cache = {}
|
|
|
|
| 50 |
try:
|
| 51 |
print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
|
| 52 |
print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
|
| 53 |
+
print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}, n_gpu_layers={self.n_gpu_layers}")
|
| 54 |
|
| 55 |
# Load model in a thread to avoid blocking event loop
|
| 56 |
# Enable verbose to see llama.cpp errors
|
|
|
|
| 59 |
model_path=self.model_path,
|
| 60 |
n_ctx=self.n_ctx,
|
| 61 |
n_threads=os.cpu_count(), # Use all available cores
|
| 62 |
+
n_gpu_layers=self.n_gpu_layers, # GPU layer offloading
|
| 63 |
verbose=True # Enable verbose to see loading errors
|
| 64 |
)
|
| 65 |
|
| 66 |
self._initialized = True
|
| 67 |
+
print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx}, n_gpu_layers={self.n_gpu_layers})")
|
| 68 |
|
| 69 |
# Load grammar file if provided
|
| 70 |
if self.grammar_path:
|
|
|
|
| 167 |
"context_length": self.n_ctx,
|
| 168 |
"loaded": self._initialized,
|
| 169 |
"model_path": self.model_path,
|
| 170 |
+
"has_grammar": self.default_grammar is not None,
|
| 171 |
+
"gpu_layers": self.n_gpu_layers
|
| 172 |
}
|
| 173 |
|
| 174 |
async def cleanup(self) -> None:
|
app/models/registry.py
CHANGED
|
@@ -20,7 +20,8 @@ MODEL_CONFIG = {
|
|
| 20 |
"type": "gguf",
|
| 21 |
"size": "1.7 GB",
|
| 22 |
"polish_support": "excellent",
|
| 23 |
-
"grammar_file": "answers.gbnf"
|
|
|
|
| 24 |
},
|
| 25 |
"bielik-11b-gguf": {
|
| 26 |
"id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
|
|
@@ -29,7 +30,8 @@ MODEL_CONFIG = {
|
|
| 29 |
"type": "gguf",
|
| 30 |
"size": "7.2 GB",
|
| 31 |
"polish_support": "excellent",
|
| 32 |
-
"grammar_file": "answers.gbnf"
|
|
|
|
| 33 |
},
|
| 34 |
"llama-3.1-8b": {
|
| 35 |
"id": "meta-llama/Llama-3.1-8B-Instruct",
|
|
@@ -81,12 +83,14 @@ class ModelRegistry:
|
|
| 81 |
|
| 82 |
# Przekazanie gramatyki do modelu
|
| 83 |
grammar_path = config.get("grammar_file")
|
|
|
|
| 84 |
|
| 85 |
return LlamaCppModel(
|
| 86 |
name=name,
|
| 87 |
model_id=model_id,
|
| 88 |
model_path=full_path,
|
| 89 |
-
grammar_path=grammar_path
|
|
|
|
| 90 |
)
|
| 91 |
|
| 92 |
async def get_model(self, name: str) -> BaseLLM:
|
|
|
|
| 20 |
"type": "gguf",
|
| 21 |
"size": "1.7 GB",
|
| 22 |
"polish_support": "excellent",
|
| 23 |
+
"grammar_file": "answers.gbnf",
|
| 24 |
+
"n_gpu_layers": -1
|
| 25 |
},
|
| 26 |
"bielik-11b-gguf": {
|
| 27 |
"id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
|
|
|
|
| 30 |
"type": "gguf",
|
| 31 |
"size": "7.2 GB",
|
| 32 |
"polish_support": "excellent",
|
| 33 |
+
"grammar_file": "answers.gbnf",
|
| 34 |
+
"n_gpu_layers": -1
|
| 35 |
},
|
| 36 |
"llama-3.1-8b": {
|
| 37 |
"id": "meta-llama/Llama-3.1-8B-Instruct",
|
|
|
|
| 83 |
|
| 84 |
# Przekazanie gramatyki do modelu
|
| 85 |
grammar_path = config.get("grammar_file")
|
| 86 |
+
n_gpu_layers = config.get("n_gpu_layers", -1)
|
| 87 |
|
| 88 |
return LlamaCppModel(
|
| 89 |
name=name,
|
| 90 |
model_id=model_id,
|
| 91 |
model_path=full_path,
|
| 92 |
+
grammar_path=grammar_path,
|
| 93 |
+
n_gpu_layers=n_gpu_layers
|
| 94 |
)
|
| 95 |
|
| 96 |
async def get_model(self, name: str) -> BaseLLM:
|
requirements.txt
CHANGED
|
@@ -5,8 +5,6 @@ accelerate==0.25.0
|
|
| 5 |
huggingface_hub>=0.26.0
|
| 6 |
pydantic==2.5.0
|
| 7 |
importlib-metadata
|
| 8 |
-
--extra-index-url https://download.pytorch.org/whl/
|
| 9 |
torch>=2.1.0
|
| 10 |
-
|
| 11 |
-
# bitsandbytes is optional for 8-bit quantization (CPU optimization)
|
| 12 |
-
# bitsandbytes==0.49.0
|
|
|
|
| 5 |
huggingface_hub>=0.26.0
|
| 6 |
pydantic==2.5.0
|
| 7 |
importlib-metadata
|
| 8 |
+
--extra-index-url https://download.pytorch.org/whl/cu121
|
| 9 |
torch>=2.1.0
|
| 10 |
+
|
|
|
|
|
|