#!/usr/bin/env python
"""
GGUF LoRA Runtime for ContinuumAgent Project
Integrates LoRA patches with llama-cpp-python GGUF models
Modified for better CPU compatibility
"""
import os
import json
import time
from typing import List, Dict, Any, Optional, Union
from llama_cpp import Llama
from runtime.lora_mux import LoraMux
class GGUFLoraRuntime:
    """Runtime for applying LoRA patches to GGUF models.

    Wraps a llama-cpp-python ``Llama`` instance and a ``LoraMux`` adapter
    registry. Adapter application itself is currently a stub (see
    ``load_adapters``): llama-cpp-python does not support dynamic LoRA
    loading, so adapters are tracked but not actually applied.
    """

    def __init__(self,
                 model_path: str,
                 registry_dir: str = "models/registry",
                 n_gpu_layers: int = 0,  # Force CPU-only by default
                 n_ctx: int = 1024,      # Reduced context size for better memory usage
                 verbose: bool = False):
        """
        Initialize the GGUF LoRA runtime.

        Args:
            model_path: Path to GGUF model file
            registry_dir: Path to LoRA registry directory
            n_gpu_layers: Number of layers to offload to GPU (0 for CPU-only).
                Overridden by the N_GPU_LAYERS environment variable when set.
            n_ctx: Context size
            verbose: Enable verbose output
        """
        self.model_path = model_path
        self.registry_dir = registry_dir
        # The N_GPU_LAYERS environment variable overrides the constructor
        # argument. A malformed value falls back to the argument instead of
        # crashing construction.
        env_n_gpu_layers = os.environ.get("N_GPU_LAYERS")
        if env_n_gpu_layers is not None:
            try:
                self.n_gpu_layers = int(env_n_gpu_layers)
            except ValueError:
                print(f"Invalid N_GPU_LAYERS value {env_n_gpu_layers!r}; "
                      f"using {n_gpu_layers}")
                self.n_gpu_layers = n_gpu_layers
        else:
            self.n_gpu_layers = n_gpu_layers
        self.n_ctx = n_ctx
        self.verbose = verbose
        # Adapter registry multiplexer
        self.lora_mux = LoraMux(registry_dir=registry_dir)
        # Paths of adapter patches currently considered "loaded"
        self.loaded_adapters: List[str] = []
        # Underlying Llama model; stays None if loading fails
        self.model: Optional[Llama] = None
        # Load the base model eagerly, but tolerate failure so the object
        # can still be constructed (completions will return error payloads).
        try:
            self._load_base_model()
        except Exception as e:
            print(f"Error loading base model: {e}")
            print("Continuing with model as None - this will cause failures later but allows initialization")

    def _load_base_model(self) -> None:
        """Load the base GGUF model into ``self.model``.

        Raises:
            Exception: Re-raises whatever ``Llama`` construction raised.
        """
        print(f"Loading base GGUF model from {self.model_path}...")
        try:
            # Additional parameters for better CPU performance
            self.model = Llama(
                model_path=self.model_path,
                n_gpu_layers=self.n_gpu_layers,
                n_ctx=self.n_ctx,
                verbose=self.verbose,
                seed=42,       # Set seed for reproducibility
                n_threads=4,   # Use 4 threads for CPU
                n_batch=512    # Smaller batch size for CPU
            )
            print("Base model loaded successfully")
        except Exception as e:
            print(f"Error loading base model: {e}")
            raise

    def load_adapters(self, date_str: Optional[str] = None) -> List[str]:
        """
        Load LoRA adapters for a specific date.

        Args:
            date_str: Date string in YYYYMMDD format (defaults to today)

        Returns:
            List of loaded adapter patch directories
        """
        # Ask the registry which patches apply to this date
        patch_paths = self.lora_mux.load_patches(date_str)
        if not patch_paths:
            print("No adapters available to load")
            return []
        # Reset loaded adapters before (re)loading
        self.loaded_adapters = []
        for patch_path in patch_paths:
            try:
                adapter_path = os.path.join(patch_path, "adapter_model.bin")
                # NOTE: This is a hypothetical implementation, as llama-cpp-python
                # doesn't currently support dynamically loading LoRA adapters.
                # In a real implementation, we would need to use a custom build or extension.
                # self.model.load_adapter(adapter_path)
                print(f"Loaded adapter from {adapter_path}")
                self.loaded_adapters.append(patch_path)
            except Exception as e:
                print(f"Error loading adapter from {patch_path}: {e}")
        print(f"Loaded {len(self.loaded_adapters)} adapters")
        return self.loaded_adapters

    def complete(self,
                 prompt: str,
                 max_tokens: int = 256,
                 temperature: float = 0.7,
                 top_p: float = 0.95,
                 with_adapters: bool = True) -> Dict[str, Any]:
        """
        Generate a raw completion with the model.

        Args:
            prompt: Input prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            with_adapters: Whether to use loaded adapters

        Returns:
            Dict with keys ``text``, ``elapsed_seconds``, ``with_adapters``,
            ``adapters_used``. On failure ``text`` carries an error marker
            instead of raising.
        """
        # Guard: base model may have failed to load in __init__.
        if self.model is None:
            return {
                "text": "[Error: Model not loaded]",
                "elapsed_seconds": 0.0,
                "with_adapters": with_adapters,
                "adapters_used": []
            }
        # Lazily load adapters on first use
        if with_adapters and not self.loaded_adapters:
            print("No adapters loaded, loading latest adapters...")
            self.load_adapters()
        start_time = time.time()
        try:
            # NOTE: In a real implementation, this would need to configure
            # the model to use/not use adapters based on with_adapters.
            completion = self.model.create_completion(
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                # BUGFIX: was stop=[""] — an empty string is not the EOS
                # token. Mistral-family GGUF models end sequences with </s>.
                stop=["</s>"]
            )
            output_text = completion.get("choices", [{}])[0].get("text", "")
        except Exception as e:
            print(f"Error generating completion: {e}")
            output_text = f"[Error generating text: {str(e)}]"
        elapsed = time.time() - start_time
        return {
            "text": output_text,
            "elapsed_seconds": elapsed,
            "with_adapters": with_adapters,
            "adapters_used": self.loaded_adapters if with_adapters else []
        }

    def generate(self,
                 prompt: str,
                 system_prompt: Optional[str] = None,
                 max_tokens: int = 256,
                 temperature: float = 0.7,
                 top_p: float = 0.95,
                 with_adapters: bool = True) -> Dict[str, Any]:
        """
        Generate a response using the Mistral chat format.

        Args:
            prompt: User prompt
            system_prompt: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            with_adapters: Whether to use loaded adapters

        Returns:
            Generation result (same shape as :meth:`complete`)
        """
        # BUGFIX: Mistral's instruction template has no separate system turn;
        # the system prompt is folded into the first [INST] block. The old
        # format ([INST] sys [/INST][INST] user [/INST]) implied an empty
        # assistant reply between the two instructions.
        if system_prompt:
            formatted_prompt = f"[INST] {system_prompt}\n\n{prompt} [/INST]"
        else:
            formatted_prompt = f"[INST] {prompt} [/INST]"
        return self.complete(
            prompt=formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            with_adapters=with_adapters
        )
def main():
    """Smoke-test the GGUF LoRA runtime with a short completion."""
    # Pick the first GGUF file found in the slow-model directory.
    model_dir = "models/slow"
    candidates = [name for name in os.listdir(model_dir) if name.endswith(".gguf")]
    if not candidates:
        print(f"No GGUF models found in {model_dir}")
        return
    model_path = os.path.join(model_dir, candidates[0])
    print(f"Using model: {model_path}")
    # Build the runtime pinned to CPU with a reduced context window.
    runtime = GGUFLoraRuntime(
        model_path=model_path,
        n_gpu_layers=0,
        n_ctx=1024,
    )
    # Run one short completion and report timing.
    print("Testing simple completion...")
    result = runtime.complete(
        prompt="Hello, world!",
        max_tokens=20,
    )
    print(f"Completion: {result['text']}")
    print(f"Elapsed: {result['elapsed_seconds']:.2f}s")


if __name__ == "__main__":
    main()