|
|
|
|
|
"""
|
|
|
GGUF LoRA Runtime for ContinuumAgent Project
|
|
|
Integrates LoRA patches with llama-cpp-python GGUF models
|
|
|
Modified for better CPU compatibility
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import json
|
|
|
import time
|
|
|
from typing import List, Dict, Any, Optional, Union
|
|
|
from llama_cpp import Llama
|
|
|
from runtime.lora_mux import LoraMux
|
|
|
|
|
|
class GGUFLoraRuntime:
    """Runtime for applying LoRA patches to GGUF models.

    Wraps a llama-cpp-python ``Llama`` instance plus a ``LoraMux`` adapter
    registry. Construction is best-effort: if the base model fails to load,
    ``self.model`` stays ``None`` and generation calls return an error result
    instead of raising.
    """

    def __init__(self,
                 model_path: str,
                 registry_dir: str = "models/registry",
                 n_gpu_layers: int = 0,
                 n_ctx: int = 1024,
                 verbose: bool = False):
        """
        Initialize the GGUF LoRA runtime

        Args:
            model_path: Path to GGUF model file
            registry_dir: Path to LoRA registry directory
            n_gpu_layers: Number of layers to offload to GPU (0 for CPU-only)
            n_ctx: Context size
            verbose: Enable verbose output
        """
        self.model_path = model_path
        self.registry_dir = registry_dir

        # The N_GPU_LAYERS environment variable, when set, overrides the
        # constructor argument (useful for deployment-time tuning).
        env_n_gpu_layers = os.environ.get("N_GPU_LAYERS")
        if env_n_gpu_layers is not None:
            self.n_gpu_layers = int(env_n_gpu_layers)
        else:
            self.n_gpu_layers = n_gpu_layers

        self.n_ctx = n_ctx
        self.verbose = verbose

        # Adapter registry multiplexer (resolves patch directories by date).
        self.lora_mux = LoraMux(registry_dir=registry_dir)

        # Patch directories recorded as loaded by load_adapters().
        self.loaded_adapters: List[str] = []

        # Base Llama model; stays None if loading fails.
        self.model = None

        # Best-effort load: swallow the error so the runtime object can still
        # be constructed; complete() reports the missing model explicitly.
        try:
            self._load_base_model()
        except Exception as e:
            print(f"Error loading base model: {e}")
            print("Continuing with model as None - this will cause failures later but allows initialization")

    def _load_base_model(self) -> None:
        """Load base GGUF model.

        Raises:
            Exception: re-raised from llama-cpp-python if loading fails.
        """
        print(f"Loading base GGUF model from {self.model_path}...")

        try:
            # seed is fixed for reproducibility; n_threads/n_batch are tuned
            # for CPU-only operation.
            self.model = Llama(
                model_path=self.model_path,
                n_gpu_layers=self.n_gpu_layers,
                n_ctx=self.n_ctx,
                verbose=self.verbose,
                seed=42,
                n_threads=4,
                n_batch=512
            )
            print("Base model loaded successfully")
        except Exception as e:
            print(f"Error loading base model: {e}")
            raise

    def load_adapters(self, date_str: Optional[str] = None) -> List[str]:
        """
        Load LoRA adapters for a specific date

        Args:
            date_str: Date string in YYYYMMDD format (defaults to today)

        Returns:
            List of loaded adapter paths
        """
        patch_paths = self.lora_mux.load_patches(date_str)

        if not patch_paths:
            print("No adapters available to load")
            return []

        # Reset on every call so the result reflects only this date's patches.
        self.loaded_adapters = []

        for patch_path in patch_paths:
            try:
                adapter_path = os.path.join(patch_path, "adapter_model.bin")

                # BUG FIX: previously every patch directory was recorded as
                # "loaded" without verifying that the adapter weights exist.
                if not os.path.exists(adapter_path):
                    print(f"Adapter weights missing at {adapter_path}, skipping")
                    continue

                # NOTE(review): the adapter is tracked here but not yet
                # applied to the llama.cpp model — wiring this into
                # llama-cpp-python's LoRA support (lora_path) is still TODO.
                print(f"Loaded adapter from {adapter_path}")
                self.loaded_adapters.append(patch_path)

            except Exception as e:
                print(f"Error loading adapter from {patch_path}: {e}")

        print(f"Loaded {len(self.loaded_adapters)} adapters")
        return self.loaded_adapters

    def complete(self,
                 prompt: str,
                 max_tokens: int = 256,
                 temperature: float = 0.7,
                 top_p: float = 0.95,
                 with_adapters: bool = True) -> Dict[str, Any]:
        """
        Generate completion with model

        Args:
            prompt: Input prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            with_adapters: Whether to use loaded adapters

        Returns:
            Completion result dict with keys "text", "elapsed_seconds",
            "with_adapters", and "adapters_used". Never raises: generation
            errors are reported in the "text" field.
        """
        # Degrade gracefully when the base model failed to load.
        if self.model is None:
            return {
                "text": "[Error: Model not loaded]",
                "elapsed_seconds": 0.0,
                "with_adapters": with_adapters,
                "adapters_used": []
            }

        # Lazily pull in the latest adapters on first adapter-enabled call.
        if with_adapters and not self.loaded_adapters:
            print("No adapters loaded, loading latest adapters...")
            self.load_adapters()

        start_time = time.time()

        try:
            completion = self.model.create_completion(
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                stop=["</s>"]
            )
            # Defensive extraction: tolerate an empty/odd completion payload.
            output_text = completion.get("choices", [{}])[0].get("text", "")
        except Exception as e:
            print(f"Error generating completion: {e}")
            output_text = f"[Error generating text: {str(e)}]"

        elapsed = time.time() - start_time

        result = {
            "text": output_text,
            "elapsed_seconds": elapsed,
            "with_adapters": with_adapters,
            "adapters_used": self.loaded_adapters if with_adapters else []
        }

        return result

    def generate(self,
                 prompt: str,
                 system_prompt: Optional[str] = None,
                 max_tokens: int = 256,
                 temperature: float = 0.7,
                 top_p: float = 0.95,
                 with_adapters: bool = True) -> Dict[str, Any]:
        """
        Generate response with Mistral chat format

        Args:
            prompt: User prompt
            system_prompt: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            with_adapters: Whether to use loaded adapters

        Returns:
            Generation result (same shape as complete())
        """
        # Wrap the prompt in Mistral [INST] markers before delegating.
        if system_prompt:
            formatted_prompt = f"<s>[INST] {system_prompt} [/INST]</s>[INST] {prompt} [/INST]"
        else:
            formatted_prompt = f"<s>[INST] {prompt} [/INST]"

        result = self.complete(
            prompt=formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            with_adapters=with_adapters
        )

        return result
|
|
|
|
|
|
|
|
|
def main():
    """Smoke-test the GGUF LoRA runtime against the first model found.

    Picks the first ``.gguf`` file in ``models/slow`` and runs a short
    completion, printing the result and timing.
    """
    model_dir = "models/slow"

    # BUG FIX: os.listdir raises FileNotFoundError on a missing directory;
    # report it and bail out instead of crashing.
    if not os.path.isdir(model_dir):
        print(f"Model directory {model_dir} does not exist")
        return

    model_files = [f for f in os.listdir(model_dir) if f.endswith(".gguf")]

    if not model_files:
        print(f"No GGUF models found in {model_dir}")
        return

    model_path = os.path.join(model_dir, model_files[0])
    print(f"Using model: {model_path}")

    # CPU-only configuration for the smoke test.
    runtime = GGUFLoraRuntime(
        model_path=model_path,
        n_gpu_layers=0,
        n_ctx=1024
    )

    print("Testing simple completion...")
    result = runtime.complete(
        prompt="Hello, world!",
        max_tokens=20
    )

    print(f"Completion: {result['text']}")
    print(f"Elapsed: {result['elapsed_seconds']:.2f}s")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
main() |