# src/pipeline/load_model.py

import logging
import os

logger = logging.getLogger(__name__)

GGUF_MODEL_PATH = r"MODELS\gguf\llama-3.2-1b-instruct.Q4_K_M.gguf"


def load_llm_model():
    try:
        from llama_cpp import Llama

        if not os.path.exists(GGUF_MODEL_PATH):
            raise FileNotFoundError(f"GGUF model not found at: {GGUF_MODEL_PATH}")

        logger.info("Loading GGUF model...")
        print(f"👉 Loading model from {GGUF_MODEL_PATH}")

        llm = Llama(
            model_path=GGUF_MODEL_PATH,
            n_ctx=2048,      # context window
            n_threads=4,     # CPU threads; adjust to your core count
            n_gpu_layers=0,  # 0 = CPU only; increase if you have a GPU
            verbose=False,
        )

        print("✅ Model fully loaded!")
        return llm, None  # no separate tokenizer needed
    except Exception:
        import traceback

        print("❌ ERROR LOADING MODEL:")
        traceback.print_exc()
        raise  # bare raise preserves the original traceback
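

# A minimal smoke-test sketch (an assumption, not part of the original module):
# it loads the model and runs a short completion. Llama objects from llama_cpp
# are callable and return an OpenAI-style completion dict; the prompt and
# max_tokens values below are illustrative only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    llm, _ = load_llm_model()
    result = llm("Q: What is the capital of France? A:", max_tokens=16)
    print(result["choices"][0]["text"])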