#!/usr/bin/env python
"""

GGUF LoRA Runtime for ContinuumAgent Project

Integrates LoRA patches with llama-cpp-python GGUF models

Modified for better CPU compatibility

"""

import os
import time
from typing import List, Dict, Any, Optional
from llama_cpp import Llama
from runtime.lora_mux import LoraMux

class GGUFLoraRuntime:
    """Runtime for applying LoRA patches to GGUF models"""
    
    def __init__(self,
                 model_path: str,
                 registry_dir: str = "models/registry",
                 n_gpu_layers: int = 0,  # Force CPU-only by default
                 n_ctx: int = 1024,  # Reduced context size for better memory usage
                 verbose: bool = False):
        """

        Initialize the GGUF LoRA runtime

        

        Args:

            model_path: Path to GGUF model file

            registry_dir: Path to LoRA registry directory

            n_gpu_layers: Number of layers to offload to GPU (0 for CPU-only)

            n_ctx: Context size

            verbose: Enable verbose output

        """
        self.model_path = model_path
        self.registry_dir = registry_dir
        
        # Get n_gpu_layers from environment variable if set
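        # Example override (hypothetical command): `N_GPU_LAYERS=20 python app.py`
        # would offload 20 layers to the GPU regardless of the constructor argument.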
        env_n_gpu_layers = os.environ.get("N_GPU_LAYERS")
        if env_n_gpu_layers is not None:
            self.n_gpu_layers = int(env_n_gpu_layers)
        else:
            self.n_gpu_layers = n_gpu_layers
            
        self.n_ctx = n_ctx
        self.verbose = verbose
        
        # Initialize LoraMux
        self.lora_mux = LoraMux(registry_dir=registry_dir)
        
        # Loaded adapters
        self.loaded_adapters = []
        
        # Model instance
        self.model = None
        
        # Initialize model with no adapters
        try:
            self._load_base_model()
        except Exception as e:
            print(f"Error loading base model: {e}")
            print("Continuing with model as None - this will cause failures later but allows initialization")
    
    def _load_base_model(self) -> None:
        """Load base GGUF model"""
        print(f"Loading base GGUF model from {self.model_path}...")
        
        try:
            # Additional parameters for better CPU performance
            self.model = Llama(
                model_path=self.model_path,
                n_gpu_layers=self.n_gpu_layers,
                n_ctx=self.n_ctx,
                verbose=self.verbose,
                seed=42,  # Set seed for reproducibility
                n_threads=4,  # Use 4 threads for CPU
                n_batch=512  # Prompt-processing batch size
            )
            print("Base model loaded successfully")
        except Exception as e:
            print(f"Error loading base model: {e}")
            raise
    
    def load_adapters(self, date_str: Optional[str] = None) -> List[str]:
        """

        Load LoRA adapters for a specific date

        

        Args:

            date_str: Date string in YYYYMMDD format (defaults to today)

            

        Returns:

            List of loaded adapter paths

        """
        # Get patches for date
        patch_paths = self.lora_mux.load_patches(date_str)
        
        if not patch_paths:
            print("No adapters available to load")
            return []
        
        # Reset loaded adapters
        self.loaded_adapters = []
        
        for patch_path in patch_paths:
            try:
                # Load adapter
                adapter_path = os.path.join(patch_path, "adapter_model.bin")
                
                # NOTE: This is a hypothetical implementation, as llama-cpp-python
                # doesn't currently support dynamically loading LoRA adapters.
                # In a real implementation, we would need to use a custom build or extension.
                
                # self.model.load_adapter(adapter_path)
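                # A possible workaround (untested sketch): some llama-cpp-python
                # builds accept a LoRA adapter at model-construction time via the
                # `lora_path` argument, so a single adapter could be applied by
                # reloading the model. The adapter would need to be in a format
                # llama.cpp understands (not the raw PEFT adapter_model.bin):
                #
                #   self.model = Llama(
                #       model_path=self.model_path,
                #       lora_path=adapter_path,
                #       n_gpu_layers=self.n_gpu_layers,
                #       n_ctx=self.n_ctx,
                #   )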
                print(f"Loaded adapter from {adapter_path}")
                self.loaded_adapters.append(patch_path)
                
            except Exception as e:
                print(f"Error loading adapter from {patch_path}: {e}")
        
        print(f"Loaded {len(self.loaded_adapters)} adapters")
        return self.loaded_adapters
    
    def complete(self,
                 prompt: str,
                 max_tokens: int = 256,
                 temperature: float = 0.7,
                 top_p: float = 0.95,
                 with_adapters: bool = True) -> Dict[str, Any]:
        """

        Generate completion with model

        

        Args:

            prompt: Input prompt

            max_tokens: Maximum tokens to generate

            temperature: Sampling temperature

            top_p: Top-p sampling parameter

            with_adapters: Whether to use loaded adapters

            

        Returns:

            Completion result

        """
        # Check if model is loaded
        if self.model is None:
            return {
                "text": "[Error: Model not loaded]",
                "elapsed_seconds": 0.0,
                "with_adapters": with_adapters,
                "adapters_used": []
            }
            
        # Check if adapters are loaded
        if with_adapters and not self.loaded_adapters:
            print("No adapters loaded, loading latest adapters...")
            self.load_adapters()
        
        # Generate completion
        start_time = time.time()
        
        try:
            # NOTE: In a real implementation, this would need to configure
            # the model to use/not use adapters based on with_adapters.
            completion = self.model.create_completion(
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                stop=["</s>"]  # Stop at end of sequence token
            )
            
            output_text = completion.get("choices", [{}])[0].get("text", "")
        except Exception as e:
            print(f"Error generating completion: {e}")
            output_text = f"[Error generating text: {str(e)}]"
        
        elapsed = time.time() - start_time
        
        # Format result
        result = {
            "text": output_text,
            "elapsed_seconds": elapsed,
            "with_adapters": with_adapters,
            "adapters_used": self.loaded_adapters if with_adapters else []
        }
        
        return result
    
    def generate(self,
                 prompt: str,
                 system_prompt: Optional[str] = None,
                 max_tokens: int = 256,
                 temperature: float = 0.7,
                 top_p: float = 0.95,
                 with_adapters: bool = True) -> Dict[str, Any]:
        """

        Generate response with Mistral chat format

        

        Args:

            prompt: User prompt

            system_prompt: Optional system prompt

            max_tokens: Maximum tokens to generate

            temperature: Sampling temperature

            top_p: Top-p sampling parameter

            with_adapters: Whether to use loaded adapters

            

        Returns:

            Generation result

        """
        # Format prompt with the Mistral instruct template. Mistral has no
        # dedicated system role, so the system prompt is prepended to the
        # first user turn.
        if system_prompt:
            formatted_prompt = f"<s>[INST] {system_prompt}\n\n{prompt} [/INST]"
        else:
            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
        
        # Generate completion
        result = self.complete(
            prompt=formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            with_adapters=with_adapters
        )
        
        return result


def main():
    """Test GGUF LoRA runtime"""
    # Find model path
    model_dir = "models/slow"
    if not os.path.isdir(model_dir):
        print(f"Model directory not found: {model_dir}")
        return
    model_files = [f for f in os.listdir(model_dir) if f.endswith(".gguf")]
    
    if not model_files:
        print(f"No GGUF models found in {model_dir}")
        return
    
    model_path = os.path.join(model_dir, model_files[0])
    print(f"Using model: {model_path}")
    
    # Initialize runtime with forced CPU mode
    runtime = GGUFLoraRuntime(
        model_path=model_path,
        n_gpu_layers=0,  # CPU only
        n_ctx=1024       # Reduced context
    )
    
    # Test simple completion
    print("Testing simple completion...")
    result = runtime.complete(
        prompt="Hello, world!",
        max_tokens=20
    )
    
    print(f"Completion: {result['text']}")
    print(f"Elapsed: {result['elapsed_seconds']:.2f}s")

if __name__ == "__main__":
    main()