"""Lazy loader for the llama-cpp-python model.

Importing this module never loads model weights; the model is created on the
first call to :func:`get_llm` (or on first use of the ``llm`` proxy).
"""

import functools
import logging
import os

import llama_cpp._internals

_logger = logging.getLogger(__name__)

# Best-effort workaround for a LlamaModel destructor problem when
# initialization fails: make sure the instance has a ``sampler`` attribute
# before the library's own ``close`` runs. If the internals have a different
# layout (attribute missing, class not patchable) the patch is skipped; the
# reason is logged at debug level so normal start-up stays quiet.
try:
    _original_close = llama_cpp._internals.LlamaModel.close
except AttributeError as exc:
    _logger.debug("Skipping LlamaModel.close patch: %s", exc)
else:

    @functools.wraps(_original_close)
    def _safe_close(self, *args, **kwargs):
        """Call the original ``close`` after ensuring ``self.sampler`` exists."""
        if not hasattr(self, 'sampler'):
            self.sampler = None
        return _original_close(self, *args, **kwargs)

    try:
        llama_cpp._internals.LlamaModel.close = _safe_close
    except (AttributeError, TypeError) as exc:
        _logger.debug("Could not install LlamaModel.close patch: %s", exc)

from llama_cpp import Llama
from config import MODEL_PATH

# Cached model instance, created on first successful get_llm() call.
_llm_instance = None
# Set once Llama() construction has raised; later calls fail fast.
_llama_failed = False


def _resolve_thread_count() -> int:
    """Return the thread count to pass to ``Llama``.

    Uses the ``LLAMA_THREADS`` environment variable when it is set; otherwise
    the CPU count clamped to the range 1..4 (2 is assumed when the CPU count
    is unknown).

    Raises:
        ValueError: If ``LLAMA_THREADS`` is set but is not an integer.
    """
    cpu_count = os.cpu_count()
    default_threads = max(1, min(4, cpu_count if cpu_count else 2))

    raw_value = os.getenv("LLAMA_THREADS")
    if raw_value is None:
        return default_threads
    try:
        return int(raw_value)
    except ValueError as err:
        raise ValueError(
            f"LLAMA_THREADS must be an integer, got {raw_value!r}"
        ) from err


def get_llm() -> Llama:
    """Lazy-load the LLM so the server starts even without a model file.

    Returns:
        The shared ``Llama`` instance, created on the first successful call.

    Raises:
        RuntimeError: If a previous attempt to construct ``Llama`` failed.
        FileNotFoundError: If ``MODEL_PATH`` does not exist.
        ValueError: If ``LLAMA_THREADS`` is set to a non-integer value.
        Exception: Whatever ``Llama(...)`` raises is re-raised unchanged.
    """
    global _llm_instance, _llama_failed

    if _llama_failed:
        raise RuntimeError("llama-cpp-python previously failed to initialize on this hardware.")

    if _llm_instance is not None:
        return _llm_instance

    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(
            f"\n\n Model file not found: {os.path.abspath(MODEL_PATH)}\n"
            f" Download a GGUF model and place it at: {MODEL_PATH}\n"
            f" Recommended: Gemma 3 1B Instruct (Q4_K_M)\n"
            f" URL: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF\n"
        )

    # Resolve configuration outside the try block below: a malformed
    # LLAMA_THREADS is a fixable config error and must not mark the backend
    # as permanently failed.
    threads = _resolve_thread_count()

    try:
        _llm_instance = Llama(
            model_path=MODEL_PATH,
            n_ctx=2048,
            n_threads=threads,
            verbose=False,
        )
    except Exception:
        # Remember the failure so later calls fail fast instead of retrying
        # the model construction.
        _llama_failed = True
        raise

    return _llm_instance


# Keep a module-level alias for backwards compatibility with existing imports
class _LazyLlm:
    """Proxy that loads the real model on first attribute access."""

    def __call__(self, *args, **kwargs):
        """Forward the call to the lazily loaded model."""
        return get_llm()(*args, **kwargs)

    def __getattr__(self, name):
        """Resolve attributes not found on the proxy against the real model."""
        return getattr(get_llm(), name)


llm = _LazyLlm()