jdesiree commited on
Commit
eff7d5f
·
verified ·
1 Parent(s): 038e223

Create model_manager.py

Browse files
Files changed (1) hide show
  1. model_manager.py +270 -0
model_manager.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model_manager.py
2
+ """
3
+ Lazy-loading model manager for Llama-3.2-3B-Instruct.
4
+ Model is loaded on first use and cached for subsequent calls.
5
+ """
6
+ import os
7
+ import torch
8
+ import logging
9
+ import threading
10
+ from typing import Optional, List
11
+ from transformers import (
12
+ AutoTokenizer,
13
+ AutoModelForCausalLM,
14
+ BitsAndBytesConfig,
15
+ TextIteratorStreamer,
16
+ pipeline
17
+ )
18
+
19
logger = logging.getLogger(__name__)

# ZeroGPU support: use the real `spaces` package when it is installed,
# otherwise install a no-op stand-in so @spaces.GPU stays usable.
try:
    import spaces
    HF_SPACES_AVAILABLE = True
    logger.info("✅ ZeroGPU (spaces) available")
except ImportError:
    HF_SPACES_AVAILABLE = False

    class DummySpaces:
        """No-op replacement: its GPU decorator returns the function unchanged."""

        @staticmethod
        def GPU(duration=90):
            # Accepts the same `duration` keyword as the real decorator,
            # but performs no GPU allocation.
            return lambda func: func

    spaces = DummySpaces()
    logger.warning("⚠️ ZeroGPU not available - running without GPU allocation")

# Configuration
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
LLAMA_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
40
+
41
+
42
class LazyLlamaModel:
    """
    Lazy-loading Llama-3.2-3B model with caching.
    Thread-safe singleton pattern - model loads on first use.

    The heavy weights are fetched on the first generate() /
    generate_streaming() call rather than at import time, then cached
    on the singleton for all subsequent calls.
    """

    _instance = None            # singleton storage
    _lock = threading.Lock()    # guards singleton construction

    def __new__(cls):
        # Double-checked locking: lock-free fast path once the instance
        # exists; the re-check under the lock stops two threads racing
        # to construct separate instances.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Initialize only once (__init__ runs on every instantiation of a singleton)."""
        if self._initialized:
            return

        self.model = None      # AutoModelForCausalLM; set by _load_model()
        self.tokenizer = None  # AutoTokenizer; set by _load_model()
        self.pipe = None       # text-generation pipeline; assigned LAST in
                               # _load_model(), so it is the "fully loaded" sentinel
        self._initialized = True
        logger.info("LazyLlamaModel created (model not loaded yet)")

    @spaces.GPU(duration=120)
    def _load_model(self):
        """
        Load tokenizer, model (4-bit NF4 quantized) and pipeline.

        Called automatically on first use; idempotent once fully loaded.
        On failure, any partially-created state is rolled back so a later
        call retries from scratch instead of seeing a half-loaded model.

        Raises:
            Exception: re-raises whatever the underlying load raised.
        """
        # Check the pipeline, not the model: pipe is assigned last, so a
        # non-None pipe guarantees every component finished loading.
        if self.pipe is not None:
            return  # Already loaded

        logger.info("="*60)
        logger.info("LOADING LLAMA-3.2-3B-INSTRUCT (First Use)")
        logger.info("="*60)

        try:
            # 4-bit quantization for memory efficiency
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )

            logger.info(f"Loading: {LLAMA_MODEL_ID}")
            logger.info("Config: 4-bit NF4 quantization")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                LLAMA_MODEL_ID,
                token=HF_TOKEN,
                trust_remote_code=True,
            )

            # Load model (torch_dtype sets the dtype of the non-quantized
            # modules; the quantized weights use the bnb compute dtype above)
            self.model = AutoModelForCausalLM.from_pretrained(
                LLAMA_MODEL_ID,
                quantization_config=quantization_config,
                device_map="auto",
                token=HF_TOKEN,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
            )

            # Create pipeline around the already-placed model
            self.pipe = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )

            logger.info("="*60)
            logger.info("✅ MODEL LOADED & CACHED")
            logger.info(f"   Model: {LLAMA_MODEL_ID}")
            logger.info(f"   Memory: ~1GB VRAM")
            logger.info(f"   Context: 128K tokens")
            logger.info("="*60)

        except Exception as e:
            # Roll back partial state (e.g. tokenizer loaded but model
            # failed) so the next call can retry cleanly.
            self.model = None
            self.tokenizer = None
            self.pipe = None
            logger.error(f"Failed to load model: {e}")
            raise

    def generate(
        self,
        system_prompt: str,
        user_message: str,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stop_sequences: Optional[List[str]] = None
    ) -> str:
        """
        Generate response. Automatically loads model on first call.

        Args:
            system_prompt: System instruction
            user_message: User query
            max_tokens: Max tokens to generate
            temperature: Sampling temperature
            stop_sequences: Optional stop sequences (not used with pipeline)

        Returns:
            Generated text, or "" if generation failed.
        """
        # Lazy load on first use (pipe is the fully-loaded sentinel; the
        # pipeline itself is what this method actually calls)
        if self.pipe is None:
            self._load_model()

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]

        try:
            outputs = self.pipe(
                messages,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                top_k=40,
                repetition_penalty=1.15,
            )

            # Chat-style pipeline returns the full conversation; the last
            # message is the assistant reply.
            result = outputs[0]["generated_text"][-1]["content"]
            return result.strip()

        except Exception as e:
            logger.error(f"Generation error: {e}")
            return ""

    def generate_streaming(
        self,
        system_prompt: str,
        user_message: str,
        max_tokens: int = 512,
        temperature: float = 0.7,
    ):
        """
        Generate response with streaming. Automatically loads model on first call.

        Yields:
            str: Generated text chunks
        """
        # Lazy load on first use (pipe is assigned last during loading,
        # so it is the reliable fully-loaded marker)
        if self.pipe is None:
            self._load_model()

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]

        try:
            # Apply chat template
            input_ids = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(self.model.device)

            streamer = TextIteratorStreamer(
                self.tokenizer,
                skip_prompt=True,
                skip_special_tokens=True
            )

            generation_kwargs = dict(
                input_ids=input_ids,
                streamer=streamer,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                top_k=40,
                repetition_penalty=1.15,
            )

            # Generate in separate thread; the streamer yields chunks as
            # the model produces them.
            thread = threading.Thread(target=self.model.generate, kwargs=generation_kwargs)
            thread.start()

            # Yield generated text
            for text in streamer:
                yield text

            # Reap the worker once the streamer is exhausted so generation
            # threads do not accumulate across calls (original left the
            # thread dangling).
            thread.join()

        except Exception as e:
            logger.error(f"Streaming error: {e}")
            yield ""

    def is_loaded(self) -> bool:
        """Check if model weights are loaded."""
        return self.model is not None

    def get_model_info(self) -> dict:
        """Get static model metadata plus current load state."""
        return {
            "model_id": LLAMA_MODEL_ID,
            "loaded": self.is_loaded(),
            "quantization": "4-bit NF4",
            "size_gb": 1.0,
            "context_length": 128000,
            "lazy_loading": True,
        }
250
+
251
+
252
# Global instance - model loads on first use
_model_instance = None


def get_model() -> LazyLlamaModel:
    """
    Get the lazy-loading model instance.
    Model will automatically load on first generate() call.
    """
    global _model_instance
    # Guard clause: hand back the cached singleton when it already exists.
    if _model_instance is not None:
        return _model_instance
    _model_instance = LazyLlamaModel()
    return _model_instance


# Backwards compatibility aliases
get_shared_llama = get_model
get_shared_qwen3 = get_model
get_shared_mistral = get_model
LlamaSharedAgent = LazyLlamaModel