# bielik_app_service/app/models/llama_cpp_model.py
# Author: Patryk Studzinski
# Commit 7c2f84b — feat: enable GPU acceleration for Bielik GGUF models
"""
GGUF Model implementation using llama-cpp-python.
Highly optimized for CPU inference.
"""
import asyncio
import json
import os
import traceback
from typing import Any, Dict, List, Optional

from app.models.base_llm import BaseLLM
try:
    from llama_cpp import Llama, LlamaGrammar
    HAS_LLAMA_CPP = True
except ImportError:
    # Keep both names defined so module-level references never raise
    # NameError — previously only LlamaGrammar was nulled out, leaving
    # Llama undefined when llama-cpp-python is missing.
    HAS_LLAMA_CPP = False
    Llama = None
    LlamaGrammar = None
class LlamaCppModel(BaseLLM):
    """
    Wrapper for GGUF models using llama.cpp.

    Provides significant speedups on CPU compared to Transformers, with
    optional GPU layer offloading and GBNF-grammar-constrained decoding.
    """

    def __init__(
        self,
        name: str,
        model_id: str,
        model_path: Optional[str] = None,
        n_ctx: int = 4096,
        grammar_path: Optional[str] = None,
        n_gpu_layers: int = -1,
    ):
        """
        Args:
            name: Display name, used as a prefix in log messages.
            model_id: Identifier reported by get_info().
            model_path: Filesystem path to the .gguf model file.
            n_ctx: Context window size in tokens.
            grammar_path: Optional GBNF grammar file path, resolved relative
                to the sibling "logic" package during initialize().
            n_gpu_layers: Number of layers to offload to the GPU
                (-1 offloads all layers).

        Raises:
            ImportError: If llama-cpp-python is not installed.
        """
        super().__init__(name, model_id)
        self.model_path = model_path
        self.n_ctx = n_ctx
        self.grammar_path = grammar_path
        self.n_gpu_layers = n_gpu_layers
        self.default_grammar: Optional[str] = None  # GBNF text, loaded in initialize()
        self.llm = None
        # Small FIFO response cache. Dicts preserve insertion order, so
        # evicting the first key drops the oldest entry.
        self._response_cache: Dict[str, str] = {}
        self._max_cache_size = 100
        if not HAS_LLAMA_CPP:
            raise ImportError("llama-cpp-python is not installed. Cannot use GGUF models.")

    async def initialize(self) -> None:
        """Load the GGUF model (and optional grammar file) without blocking the event loop.

        Raises:
            FileNotFoundError: If model_path is unset or does not exist.
            RuntimeError: If llama.cpp fails to load the model.
        """
        if self._initialized:
            return
        if not self.model_path or not os.path.exists(self.model_path):
            # If exact path isn't provided, try to find it in the model directory
            # logic handled in registry usually, but safety check here
            raise FileNotFoundError(f"GGUF model file not found at: {self.model_path}")
        try:
            print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
            print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
            print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}, n_gpu_layers={self.n_gpu_layers}")
            # Load model in a worker thread to avoid blocking the event loop.
            self.llm = await asyncio.to_thread(
                Llama,
                model_path=self.model_path,
                n_ctx=self.n_ctx,
                n_threads=os.cpu_count(),  # Use all available cores
                n_gpu_layers=self.n_gpu_layers,  # GPU layer offloading
                verbose=True  # Enable verbose to see loading errors
            )
            self._initialized = True
            print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx}, n_gpu_layers={self.n_gpu_layers})")
            # Load grammar file if provided; used as the default constraint
            # by generate() when no explicit grammar is passed.
            if self.grammar_path:
                grammar_full_path = os.path.join(os.path.dirname(__file__), "..", "logic", self.grammar_path)
                if os.path.exists(grammar_full_path):
                    with open(grammar_full_path, 'r', encoding='utf-8') as f:
                        self.default_grammar = f.read()
                    print(f"[{self.name}] Loaded grammar from: {grammar_full_path}")
                else:
                    print(f"[{self.name}] Grammar file not found: {grammar_full_path}")
        except Exception as e:
            error_msg = str(e) if str(e) else repr(e)
            print(f"[{self.name}] Failed to load GGUF model: {error_msg}")
            print(f"[{self.name}] Full traceback:")
            traceback.print_exc()
            raise RuntimeError(f"Failed to load GGUF model: {error_msg}") from e

    async def generate(
        self,
        prompt: Optional[str] = None,
        chat_messages: Optional[List[Dict[str, str]]] = None,
        max_new_tokens: int = 150,
        temperature: float = 0.7,
        top_p: float = 0.9,
        grammar: Optional[str] = None,
        **kwargs
    ) -> str:
        """Generate text using llama.cpp.

        Args:
            prompt: Simple text prompt (converted to a single user message).
            chat_messages: List of chat messages with role/content; takes
                precedence over prompt when both are given.
            max_new_tokens: Maximum tokens to generate.
            temperature: Sampling temperature (lower = more deterministic).
            top_p: Nucleus sampling threshold.
            grammar: Optional GBNF grammar string to constrain output;
                falls back to the grammar loaded from grammar_path.

        Returns:
            The generated text, stripped of surrounding whitespace.

        Raises:
            RuntimeError: If the model has not been initialized.
            ValueError: If neither prompt nor chat_messages is given.
        """
        if not self._initialized or self.llm is None:
            raise RuntimeError(f"[{self.name}] Model not initialized")
        # Ensure we have a list of messages
        messages = chat_messages
        if not messages and prompt:
            messages = [{"role": "user", "content": prompt}]
        if not messages:
            raise ValueError("Either prompt or chat_messages required")
        # BUGFIX: apply the default grammar loaded during initialize();
        # previously it was loaded and reported by get_info() but never used.
        active_grammar = grammar if grammar is not None else self.default_grammar
        # BUGFIX: key the cache on the grammar *text*, not just its presence —
        # otherwise two different grammars with identical messages/params
        # would collide and return a stale cached response.
        cache_key = "_".join((
            json.dumps(messages),
            str(max_new_tokens),
            str(temperature),
            str(top_p),
            active_grammar or "",
        ))
        if cache_key in self._response_cache:
            return self._response_cache[cache_key]
        print(f"DEBUG: Generating with messages: {messages}", flush=True)
        if active_grammar:
            print(f"DEBUG: Using GBNF grammar constraint", flush=True)
        # Prepare grammar object if provided
        llama_grammar = None
        if active_grammar and LlamaGrammar:
            try:
                llama_grammar = LlamaGrammar.from_string(active_grammar)
            except Exception as e:
                # Best-effort: a malformed grammar degrades to unconstrained output.
                print(f"DEBUG: Failed to parse grammar: {e}", flush=True)
                llama_grammar = None
        # Generate using chat completion to leverage internal templates
        output = await asyncio.to_thread(
            self.llm.create_chat_completion,
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            grammar=llama_grammar,
        )
        print(f"DEBUG: Raw output object: {output}", flush=True)
        response_text = output['choices'][0]['message']['content'].strip()
        print(f"DEBUG: Extracted text: {response_text}", flush=True)
        # FIFO eviction before inserting the new entry.
        if len(self._response_cache) >= self._max_cache_size:
            del self._response_cache[next(iter(self._response_cache))]
        self._response_cache[cache_key] = response_text
        return response_text

    def get_info(self) -> Dict[str, Any]:
        """Return model information for the /models endpoint."""
        return {
            "name": self.name,
            "model_id": self.model_id,
            "type": "gguf",
            "backend": "llama.cpp",
            "context_length": self.n_ctx,
            "loaded": self._initialized,
            "model_path": self.model_path,
            "has_grammar": self.default_grammar is not None,
            "gpu_layers": self.n_gpu_layers
        }

    async def cleanup(self) -> None:
        """Release the loaded model and free its memory."""
        if self.llm:
            del self.llm
            self.llm = None
            self._initialized = False
            print(f"[{self.name}] GGUF Model unloaded")