# bielik_app_service/app/models/llama_cpp_model.py
# Author: Patryk Studzinski
# Commit 7c2f84b — feat: enable GPU acceleration for Bielik GGUF models
"""
GGUF Model implementation using llama-cpp-python.
Highly optimized for CPU inference.
"""
import asyncio
import json
import os
import traceback
from typing import Any, Dict, List, Optional

from app.models.base_llm import BaseLLM
try:
    from llama_cpp import Llama, LlamaGrammar
    HAS_LLAMA_CPP = True
except ImportError:
    # Keep both names defined so module-level references never raise
    # NameError — previously only LlamaGrammar was nulled out, leaving
    # Llama undefined when llama-cpp-python is missing.
    HAS_LLAMA_CPP = False
    Llama = None
    LlamaGrammar = None
class LlamaCppModel(BaseLLM):
    """
    Wrapper for GGUF models using llama.cpp.

    Provides significant speedups on CPU compared to Transformers, with
    optional GPU layer offloading and GBNF-grammar-constrained decoding.
    """

    def __init__(
        self,
        name: str,
        model_id: str,
        model_path: Optional[str] = None,
        n_ctx: int = 4096,
        grammar_path: Optional[str] = None,
        n_gpu_layers: int = -1,
    ):
        """
        Args:
            name: Display name, used as a prefix in log messages.
            model_id: Identifier reported by get_info().
            model_path: Filesystem path to the .gguf model file.
            n_ctx: Context window size in tokens.
            grammar_path: Optional GBNF grammar file path, resolved relative
                to the sibling "logic" package during initialize().
            n_gpu_layers: Number of layers to offload to the GPU
                (-1 offloads all layers).

        Raises:
            ImportError: If llama-cpp-python is not installed.
        """
        super().__init__(name, model_id)
        self.model_path = model_path
        self.n_ctx = n_ctx
        self.grammar_path = grammar_path
        self.n_gpu_layers = n_gpu_layers
        self.default_grammar: Optional[str] = None  # GBNF text, loaded in initialize()
        self.llm = None
        # Small FIFO response cache. Dicts preserve insertion order, so
        # evicting the first key drops the oldest entry.
        self._response_cache: Dict[str, str] = {}
        self._max_cache_size = 100
        if not HAS_LLAMA_CPP:
            raise ImportError("llama-cpp-python is not installed. Cannot use GGUF models.")

    async def initialize(self) -> None:
        """Load the GGUF model (and optional grammar file) without blocking the event loop.

        Raises:
            FileNotFoundError: If model_path is unset or does not exist.
            RuntimeError: If llama.cpp fails to load the model.
        """
        if self._initialized:
            return
        if not self.model_path or not os.path.exists(self.model_path):
            # If exact path isn't provided, try to find it in the model directory
            # logic handled in registry usually, but safety check here
            raise FileNotFoundError(f"GGUF model file not found at: {self.model_path}")
        try:
            print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
            print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
            print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}, n_gpu_layers={self.n_gpu_layers}")
            # Load model in a worker thread to avoid blocking the event loop.
            self.llm = await asyncio.to_thread(
                Llama,
                model_path=self.model_path,
                n_ctx=self.n_ctx,
                n_threads=os.cpu_count(),  # Use all available cores
                n_gpu_layers=self.n_gpu_layers,  # GPU layer offloading
                verbose=True  # Enable verbose to see loading errors
            )
            self._initialized = True
            print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx}, n_gpu_layers={self.n_gpu_layers})")
            # Load grammar file if provided; used as the default constraint
            # by generate() when no explicit grammar is passed.
            if self.grammar_path:
                grammar_full_path = os.path.join(os.path.dirname(__file__), "..", "logic", self.grammar_path)
                if os.path.exists(grammar_full_path):
                    with open(grammar_full_path, 'r', encoding='utf-8') as f:
                        self.default_grammar = f.read()
                    print(f"[{self.name}] Loaded grammar from: {grammar_full_path}")
                else:
                    print(f"[{self.name}] Grammar file not found: {grammar_full_path}")
        except Exception as e:
            error_msg = str(e) if str(e) else repr(e)
            print(f"[{self.name}] Failed to load GGUF model: {error_msg}")
            print(f"[{self.name}] Full traceback:")
            traceback.print_exc()
            raise RuntimeError(f"Failed to load GGUF model: {error_msg}") from e

    async def generate(
        self,
        prompt: Optional[str] = None,
        chat_messages: Optional[List[Dict[str, str]]] = None,
        max_new_tokens: int = 150,
        temperature: float = 0.7,
        top_p: float = 0.9,
        grammar: Optional[str] = None,
        **kwargs
    ) -> str:
        """Generate text using llama.cpp.

        Args:
            prompt: Simple text prompt (converted to a single user message).
            chat_messages: List of chat messages with role/content; takes
                precedence over prompt when both are given.
            max_new_tokens: Maximum tokens to generate.
            temperature: Sampling temperature (lower = more deterministic).
            top_p: Nucleus sampling threshold.
            grammar: Optional GBNF grammar string to constrain output;
                falls back to the grammar loaded from grammar_path.

        Returns:
            The generated text, stripped of surrounding whitespace.

        Raises:
            RuntimeError: If the model has not been initialized.
            ValueError: If neither prompt nor chat_messages is given.
        """
        if not self._initialized or self.llm is None:
            raise RuntimeError(f"[{self.name}] Model not initialized")
        # Ensure we have a list of messages
        messages = chat_messages
        if not messages and prompt:
            messages = [{"role": "user", "content": prompt}]
        if not messages:
            raise ValueError("Either prompt or chat_messages required")
        # BUGFIX: apply the default grammar loaded during initialize();
        # previously it was loaded and reported by get_info() but never used.
        active_grammar = grammar if grammar is not None else self.default_grammar
        # BUGFIX: key the cache on the grammar *text*, not just its presence —
        # otherwise two different grammars with identical messages/params
        # would collide and return a stale cached response.
        cache_key = "_".join((
            json.dumps(messages),
            str(max_new_tokens),
            str(temperature),
            str(top_p),
            active_grammar or "",
        ))
        if cache_key in self._response_cache:
            return self._response_cache[cache_key]
        print(f"DEBUG: Generating with messages: {messages}", flush=True)
        if active_grammar:
            print(f"DEBUG: Using GBNF grammar constraint", flush=True)
        # Prepare grammar object if provided
        llama_grammar = None
        if active_grammar and LlamaGrammar:
            try:
                llama_grammar = LlamaGrammar.from_string(active_grammar)
            except Exception as e:
                # Best-effort: a malformed grammar degrades to unconstrained output.
                print(f"DEBUG: Failed to parse grammar: {e}", flush=True)
                llama_grammar = None
        # Generate using chat completion to leverage internal templates
        output = await asyncio.to_thread(
            self.llm.create_chat_completion,
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            grammar=llama_grammar,
        )
        print(f"DEBUG: Raw output object: {output}", flush=True)
        response_text = output['choices'][0]['message']['content'].strip()
        print(f"DEBUG: Extracted text: {response_text}", flush=True)
        # FIFO eviction before inserting the new entry.
        if len(self._response_cache) >= self._max_cache_size:
            del self._response_cache[next(iter(self._response_cache))]
        self._response_cache[cache_key] = response_text
        return response_text

    def get_info(self) -> Dict[str, Any]:
        """Return model information for the /models endpoint."""
        return {
            "name": self.name,
            "model_id": self.model_id,
            "type": "gguf",
            "backend": "llama.cpp",
            "context_length": self.n_ctx,
            "loaded": self._initialized,
            "model_path": self.model_path,
            "has_grammar": self.default_grammar is not None,
            "gpu_layers": self.n_gpu_layers
        }

    async def cleanup(self) -> None:
        """Release the loaded model and free its memory."""
        if self.llm:
            del self.llm
            self.llm = None
            self._initialized = False
            print(f"[{self.name}] GGUF Model unloaded")