File size: 8,644 Bytes
eff7d5f
 
7e90504
d287c8b
7e90504
 
eff7d5f
d287c8b
eff7d5f
 
 
7e90504
eff7d5f
 
 
 
7e90504
eff7d5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e90504
d287c8b
7e90504
 
eff7d5f
d287c8b
 
7e90504
eff7d5f
7e90504
d287c8b
7e90504
 
eff7d5f
 
7e90504
 
d287c8b
7e90504
 
 
 
eff7d5f
 
7e90504
 
 
bdff161
7e90504
 
 
 
d287c8b
7e90504
 
d287c8b
7e90504
d287c8b
7e90504
 
d287c8b
7e90504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d287c8b
7e90504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d287c8b
7e90504
 
 
 
eff7d5f
 
7e90504
eff7d5f
7e90504
 
eff7d5f
7e90504
 
 
 
 
 
 
 
d287c8b
 
7e90504
 
 
 
 
d287c8b
7e90504
 
 
 
 
d287c8b
7e90504
 
 
 
 
 
 
 
 
 
eff7d5f
7e90504
 
d287c8b
7e90504
 
 
 
 
 
 
 
d287c8b
7e90504
d287c8b
7e90504
d287c8b
7e90504
 
d287c8b
7e90504
 
 
 
d287c8b
 
7e90504
 
 
 
 
eff7d5f
7e90504
 
 
 
d287c8b
eff7d5f
7e90504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eff7d5f
7e90504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eff7d5f
 
7e90504
 
 
 
eff7d5f
7e90504
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# model_manager.py
"""
Lazy-loading Llama-3.2-3B-Instruct with proper ZeroGPU context management.

KEY FIX: Each generate() call is wrapped with @spaces.GPU to ensure
the model is accessible during generation.
"""

import os
import torch
import logging
from typing import Optional, Iterator
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline as create_pipeline
)

# ZeroGPU support: the `spaces` package only exists on Hugging Face Spaces.
# Outside Spaces we install a stand-in whose GPU() factory returns an
# identity decorator, so `@spaces.GPU(duration=...)` is always harmless.
try:
    import spaces
    HF_SPACES_AVAILABLE = True
except ImportError:
    HF_SPACES_AVAILABLE = False

    class DummySpaces:
        """No-op replacement for the `spaces` module."""

        @staticmethod
        def GPU(duration=90):
            # Mirror spaces.GPU's decorator-factory shape; the wrapped
            # function is returned unchanged.
            def passthrough(fn):
                return fn
            return passthrough

    spaces = DummySpaces()

logger = logging.getLogger(__name__)

# Configuration
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")


class LazyLlamaModel:
    """
    Singleton lazy-loading wrapper around Llama-3.2-3B-Instruct.

    Model components are loaded lazily *inside* @spaces.GPU-decorated
    methods so the GPU context is held for the entire generation
    (the documented ZeroGPU pattern).
    """

    _instance = None      # the one shared instance
    _initialized = False  # guards __init__ against re-running on each call

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if not self._initialized:
            self.model_id = MODEL_ID
            self.token = HF_TOKEN

            # Deferred: created inside GPU-decorated calls, not here.
            self.tokenizer = None
            self.model = None
            self.pipeline = None

            LazyLlamaModel._initialized = True
            # FIX: was an f-string with no placeholders.
            logger.info("LazyLlamaModel initialized (model will load on first generate)")

    def _load_model_components(self):
        """
        Load tokenizer, quantized model, and pipeline.

        Called INSIDE @spaces.GPU decorated functions so the GPU context
        is maintained. Idempotent: returns early once everything exists.
        """
        # FIX: also check self.pipeline — previously a load that failed
        # after the model step would never create the pipeline on retry.
        if (self.model is not None
                and self.tokenizer is not None
                and self.pipeline is not None):
            return  # Already fully loaded in this context

        logger.info("=" * 60)
        logger.info("LOADING LLAMA-3.2-3B-INSTRUCT")
        logger.info("=" * 60)

        # Load tokenizer
        logger.info(f"Loading: {self.model_id}")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            token=self.token,
            trust_remote_code=True
        )
        logger.info(f"✓ Tokenizer loaded: {type(self.tokenizer).__name__}")

        # Configure 4-bit quantization
        logger.info("Config: 4-bit NF4 quantization")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

        # Load model with quantization
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=bnb_config,
            device_map="auto",
            token=self.token,
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )
        logger.info(f"✓ Model loaded: {type(self.model).__name__}")

        # Create pipeline
        self.pipeline = create_pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device_map="auto"
        )
        logger.info("✓ Pipeline created and verified: TextGenerationPipeline")

        logger.info("=" * 60)
        logger.info("✅ MODEL LOADED & CACHED")
        logger.info(f"  Model: {self.model_id}")
        logger.info(f"  Tokenizer: {type(self.tokenizer).__name__}")
        logger.info(f"  Pipeline: {type(self.pipeline).__name__}")
        logger.info(f"  Memory: ~1GB VRAM")
        logger.info(f"  Context: 128K tokens")
        logger.info("=" * 60)

    def _build_prompt(self, system_prompt: str, user_message: str) -> str:
        """Apply the model's chat template to a system+user message pair."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

    @spaces.GPU(duration=90)
    def generate(
        self,
        system_prompt: str,
        user_message: str,
        max_tokens: int = 500,
        temperature: float = 0.7
    ) -> str:
        """
        Generate a complete response.

        The @spaces.GPU decorator keeps the model in GPU context for the
        whole call. Raises RuntimeError if the pipeline failed to load.
        """
        self._load_model_components()

        if self.pipeline is None:
            raise RuntimeError(
                "Pipeline is None after loading. This may be a ZeroGPU context issue. "
                "Check that _load_model_components() completed successfully."
            )

        prompt = self._build_prompt(system_prompt, user_message)

        # FIX: only pass `temperature` when sampling — transformers warns
        # (and ignores it) when do_sample=False.
        do_sample = temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_tokens,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.eos_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "return_full_text": False,
        }
        if do_sample:
            gen_kwargs["temperature"] = temperature

        outputs = self.pipeline(prompt, **gen_kwargs)
        return outputs[0]['generated_text'].strip()

    @spaces.GPU(duration=90)
    def generate_streaming(
        self,
        system_prompt: str,
        user_message: str,
        max_tokens: int = 500,
        temperature: float = 0.7
    ) -> Iterator[str]:
        """
        Generate a response, yielding text chunks as they are produced.

        FIX: the previous implementation called model.generate() once per
        token with max_new_tokens=1, re-running the forward pass over the
        whole growing sequence each step (quadratic, no KV-cache reuse).
        This uses the canonical TextIteratorStreamer pattern: one
        generate() call in a background thread, tokens consumed here.
        """
        from threading import Thread
        from transformers import TextIteratorStreamer

        self._load_model_components()

        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "Model is None after loading. This may be a ZeroGPU context issue."
            )

        prompt = self._build_prompt(system_prompt, user_message)

        # Tokenize and move to the model's device
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # skip_prompt: only newly generated text is yielded;
        # skip_special_tokens: EOS etc. are not surfaced to the caller.
        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )

        do_sample = temperature > 0
        gen_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=do_sample,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
        )
        if do_sample:
            gen_kwargs["temperature"] = temperature

        # generate() blocks, so it runs in a worker thread while this
        # generator drains the streamer.
        worker = Thread(target=self.model.generate, kwargs=gen_kwargs, daemon=True)
        worker.start()
        try:
            for new_text in streamer:
                if new_text:
                    yield new_text
        finally:
            worker.join()


# Module-level singleton handle
_model_instance = None


def get_model() -> LazyLlamaModel:
    """Return the process-wide model instance, creating it on first call."""
    global _model_instance
    if _model_instance is not None:
        return _model_instance
    _model_instance = LazyLlamaModel()
    return _model_instance


# Backwards-compatibility aliases, bound locally (no self-import)
get_shared_llama = get_model
MistralSharedAgent = LazyLlamaModel
LlamaSharedAgent = LazyLlamaModel

# NOTE: do not re-import these names from model_manager within this module
# (e.g. `from model_manager import get_model ...`) — that creates a
# circular import.