Patryk Studzinski committed on
Commit
7c2f84b
·
1 Parent(s): 812e56d

feat: enable GPU acceleration for Bielik GGUF models

Browse files

- Add n_gpu_layers parameter to LlamaCppModel for full/partial GPU offloading
- Configure both bielik-1.5b and bielik-11b with n_gpu_layers=-1 (full GPU)
- Update requirements.txt to CUDA 12.1 torch wheels (cu121)
- Add GPU detection and reporting in startup and /health endpoint
- Llama-3.1-8b continues using Inference API (no changes)

app/main.py CHANGED
@@ -68,6 +68,16 @@ async def startup_event():
68
  """
69
  print("Application started. Models will be loaded lazily on first request.")
70
  print(f"Available models: {registry.get_available_model_names()}")
 
 
 
 
 
 
 
 
 
 
71
 
72
  # --- Helper function to load domain logic ---
73
  def get_domain_config(domain: str):
@@ -89,11 +99,23 @@ async def health_check():
89
  models = registry.list_models()
90
  loaded_models = registry.get_loaded_models()
91
  active_model = registry.get_active_model()
 
 
 
 
 
 
 
 
 
 
92
  return {
93
  "status": "ok",
94
  "available_models": len(models),
95
  "loaded_models": loaded_models,
96
  "active_local_model": active_model,
 
 
97
  }
98
 
99
  @app.get("/models", response_model=List[ModelInfo])
 
68
  """
69
  print("Application started. Models will be loaded lazily on first request.")
70
  print(f"Available models: {registry.get_available_model_names()}")
71
+
72
+ try:
73
+ import torch
74
+ gpu_available = torch.cuda.is_available()
75
+ gpu_name = torch.cuda.get_device_name(0) if gpu_available else "N/A"
76
+ print(f"GPU available: {gpu_available}, Device: {gpu_name}")
77
+ except ImportError:
78
+ print("PyTorch not available for GPU check")
79
+ except Exception as e:
80
+ print(f"GPU check failed: {e}")
81
 
82
  # --- Helper function to load domain logic ---
83
  def get_domain_config(domain: str):
 
99
  models = registry.list_models()
100
  loaded_models = registry.get_loaded_models()
101
  active_model = registry.get_active_model()
102
+
103
+ gpu_available = False
104
+ gpu_name = "N/A"
105
+ try:
106
+ import torch
107
+ gpu_available = torch.cuda.is_available()
108
+ gpu_name = torch.cuda.get_device_name(0) if gpu_available else "N/A"
109
+ except:
110
+ pass
111
+
112
  return {
113
  "status": "ok",
114
  "available_models": len(models),
115
  "loaded_models": loaded_models,
116
  "active_local_model": active_model,
117
+ "gpu_available": gpu_available,
118
+ "gpu_device": gpu_name,
119
  }
120
 
121
  @app.get("/models", response_model=List[ModelInfo])
app/models/llama_cpp_model.py CHANGED
@@ -23,11 +23,12 @@ class LlamaCppModel(BaseLLM):
23
  Provides significant speedups on CPU compared to Transformers.
24
  """
25
 
26
- def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None):
27
  super().__init__(name, model_id)
28
  self.model_path = model_path
29
  self.n_ctx = n_ctx
30
  self.grammar_path = grammar_path
 
31
  self.default_grammar = None # Will be loaded from file if provided
32
  self.llm = None
33
  self._response_cache = {}
@@ -49,7 +50,7 @@ class LlamaCppModel(BaseLLM):
49
  try:
50
  print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
51
  print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
52
- print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}")
53
 
54
  # Load model in a thread to avoid blocking event loop
55
  # Enable verbose to see llama.cpp errors
@@ -58,11 +59,12 @@ class LlamaCppModel(BaseLLM):
58
  model_path=self.model_path,
59
  n_ctx=self.n_ctx,
60
  n_threads=os.cpu_count(), # Use all available cores
 
61
  verbose=True # Enable verbose to see loading errors
62
  )
63
 
64
  self._initialized = True
65
- print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
66
 
67
  # Load grammar file if provided
68
  if self.grammar_path:
@@ -165,7 +167,8 @@ class LlamaCppModel(BaseLLM):
165
  "context_length": self.n_ctx,
166
  "loaded": self._initialized,
167
  "model_path": self.model_path,
168
- "has_grammar": self.default_grammar is not None
 
169
  }
170
 
171
  async def cleanup(self) -> None:
 
23
  Provides significant speedups on CPU compared to Transformers.
24
  """
25
 
26
+ def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None, n_gpu_layers: int = -1):
27
  super().__init__(name, model_id)
28
  self.model_path = model_path
29
  self.n_ctx = n_ctx
30
  self.grammar_path = grammar_path
31
+ self.n_gpu_layers = n_gpu_layers
32
  self.default_grammar = None # Will be loaded from file if provided
33
  self.llm = None
34
  self._response_cache = {}
 
50
  try:
51
  print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
52
  print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
53
+ print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}, n_gpu_layers={self.n_gpu_layers}")
54
 
55
  # Load model in a thread to avoid blocking event loop
56
  # Enable verbose to see llama.cpp errors
 
59
  model_path=self.model_path,
60
  n_ctx=self.n_ctx,
61
  n_threads=os.cpu_count(), # Use all available cores
62
+ n_gpu_layers=self.n_gpu_layers, # GPU layer offloading
63
  verbose=True # Enable verbose to see loading errors
64
  )
65
 
66
  self._initialized = True
67
+ print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx}, n_gpu_layers={self.n_gpu_layers})")
68
 
69
  # Load grammar file if provided
70
  if self.grammar_path:
 
167
  "context_length": self.n_ctx,
168
  "loaded": self._initialized,
169
  "model_path": self.model_path,
170
+ "has_grammar": self.default_grammar is not None,
171
+ "gpu_layers": self.n_gpu_layers
172
  }
173
 
174
  async def cleanup(self) -> None:
app/models/registry.py CHANGED
@@ -20,7 +20,8 @@ MODEL_CONFIG = {
20
  "type": "gguf",
21
  "size": "1.7 GB",
22
  "polish_support": "excellent",
23
- "grammar_file": "answers.gbnf"
 
24
  },
25
  "bielik-11b-gguf": {
26
  "id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
@@ -29,7 +30,8 @@ MODEL_CONFIG = {
29
  "type": "gguf",
30
  "size": "7.2 GB",
31
  "polish_support": "excellent",
32
- "grammar_file": "answers.gbnf"
 
33
  },
34
  "llama-3.1-8b": {
35
  "id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -81,12 +83,14 @@ class ModelRegistry:
81
 
82
  # Przekazanie gramatyki do modelu
83
  grammar_path = config.get("grammar_file")
 
84
 
85
  return LlamaCppModel(
86
  name=name,
87
  model_id=model_id,
88
  model_path=full_path,
89
- grammar_path=grammar_path # Upewnij się, że klasa LlamaCppModel to obsługuje
 
90
  )
91
 
92
  async def get_model(self, name: str) -> BaseLLM:
 
20
  "type": "gguf",
21
  "size": "1.7 GB",
22
  "polish_support": "excellent",
23
+ "grammar_file": "answers.gbnf",
24
+ "n_gpu_layers": -1
25
  },
26
  "bielik-11b-gguf": {
27
  "id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
 
30
  "type": "gguf",
31
  "size": "7.2 GB",
32
  "polish_support": "excellent",
33
+ "grammar_file": "answers.gbnf",
34
+ "n_gpu_layers": -1
35
  },
36
  "llama-3.1-8b": {
37
  "id": "meta-llama/Llama-3.1-8B-Instruct",
 
83
 
84
  # Przekazanie gramatyki do modelu
85
  grammar_path = config.get("grammar_file")
86
+ n_gpu_layers = config.get("n_gpu_layers", -1)
87
 
88
  return LlamaCppModel(
89
  name=name,
90
  model_id=model_id,
91
  model_path=full_path,
92
+ grammar_path=grammar_path,
93
+ n_gpu_layers=n_gpu_layers
94
  )
95
 
96
  async def get_model(self, name: str) -> BaseLLM:
requirements.txt CHANGED
@@ -5,8 +5,6 @@ accelerate==0.25.0
5
  huggingface_hub>=0.26.0
6
  pydantic==2.5.0
7
  importlib-metadata
8
- --extra-index-url https://download.pytorch.org/whl/cpu
9
  torch>=2.1.0
10
- # llama-cpp-python is installed at runtime to avoid build issues in Spaces
11
- # bitsandbytes is optional for 8-bit quantization (CPU optimization)
12
- # bitsandbytes==0.49.0
 
5
  huggingface_hub>=0.26.0
6
  pydantic==2.5.0
7
  importlib-metadata
8
+ --extra-index-url https://download.pytorch.org/whl/cu121
9
  torch>=2.1.0
10
+