ai-assistant-engine / llm /model_loader.py
khubchand's picture
Update system model to Gemma 3 1B Instruct and humanize responses
9eed65c
import os
import llama_cpp._internals
# Best-effort workaround for the LlamaModel destructor bug that shows up when
# initialization fails: the wrapper below makes sure a `sampler` attribute
# exists before the original close() runs. Any problem while installing the
# patch is ignored on purpose so that importing this module never fails here.
try:
    _original_close = llama_cpp._internals.LlamaModel.close

    def _safe_close(self, *args, **kwargs):
        """Give the instance a `sampler` attribute if it lacks one, then close it."""
        sampler_missing = not hasattr(self, "sampler")
        if sampler_missing:
            self.sampler = None
        return _original_close(self, *args, **kwargs)

    llama_cpp._internals.LlamaModel.close = _safe_close
except Exception:
    # Deliberately silent: the patch is optional hardening, not a requirement.
    pass
from llama_cpp import Llama
from config import MODEL_PATH
# Cached model instance shared by every caller; stays None until get_llm()
# has constructed the model successfully.
_llm_instance = None
# Flipped to True when a load attempt inside get_llm() raises, so that later
# calls fail fast with a RuntimeError instead of retrying the load.
_llama_failed = False
def _resolve_thread_count() -> int:
    """Return the number of inference threads to pass to ``Llama``.

    A non-blank ``LLAMA_THREADS`` environment variable takes precedence.
    Otherwise the CPU count is used, clamped to the range 1-4 (2 is assumed
    when the CPU count cannot be determined).

    Raises:
        ValueError: If ``LLAMA_THREADS`` is set but is not an integer.
    """
    cpu_count = os.cpu_count()
    default_threads = max(1, min(4, cpu_count if cpu_count else 2))

    raw = os.getenv("LLAMA_THREADS")
    # An exported-but-empty variable is treated the same as an unset one.
    if raw is None or not raw.strip():
        return default_threads
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"LLAMA_THREADS must be an integer, got {raw!r}"
        ) from None


def get_llm() -> Llama:
    """Lazy-load the LLM so the server starts even without a model file.

    The model is constructed on the first successful call; every later call
    returns the same cached instance.

    Returns:
        The shared ``Llama`` instance.

    Raises:
        RuntimeError: If an earlier call failed while constructing ``Llama``.
        FileNotFoundError: If ``MODEL_PATH`` does not exist. This is not
            sticky, so the call succeeds once the file is put in place.
        ValueError: If ``LLAMA_THREADS`` is set to a non-integer value. This
            is not sticky either; fixing the variable is enough.
        Exception: Anything raised by ``Llama(...)`` is re-raised unchanged
            after the loader has been marked as failed.
    """
    global _llm_instance, _llama_failed

    if _llama_failed:
        raise RuntimeError("llama-cpp-python previously failed to initialize on this hardware.")

    if _llm_instance is not None:
        return _llm_instance

    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(
            f"\n\n Model file not found: {os.path.abspath(MODEL_PATH)}\n"
            f" Download a GGUF model and place it at: {MODEL_PATH}\n"
            f" Recommended: Gemma 3 1B Instruct (Q4_K_M)\n"
            f" URL: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF\n"
        )

    # Resolve configuration *before* the guarded section: a malformed
    # LLAMA_THREADS value is a user error, not a hardware failure, and must
    # not permanently disable the loader for the rest of the process.
    threads = _resolve_thread_count()

    try:
        _llm_instance = Llama(
            model_path=MODEL_PATH,
            n_ctx=2048,
            n_threads=threads,
            verbose=False,
        )
    except Exception:
        # Remember the failure so later calls fail fast instead of retrying.
        _llama_failed = True
        raise
    return _llm_instance
# Keep a module-level alias for backwards compatibility with existing imports
class _LazyLlm:
    """Proxy that loads the real model on first call or attribute access.

    Calling the proxy, or reading an attribute it does not define itself, is
    forwarded to the object returned by ``get_llm()``.
    """

    def __call__(self, *args, **kwargs):
        """Forward the call to the real model, loading it if necessary."""
        return get_llm()(*args, **kwargs)

    def __getattr__(self, name):
        """Delegate attributes that are missing on the proxy to the real model.

        Raises:
            AttributeError: For dunder names that the proxy does not define.
        """
        # Introspection helpers (inspect.unwrap, copy, pickle, ...) probe for
        # dunder attributes with hasattr()/getattr(). Answer those without
        # touching the model: loading it here would defeat the lazy loading,
        # and a load failure would escape hasattr() as a non-AttributeError.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(get_llm(), name)


llm = _LazyLlm()