import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Quantized GGUF model location on the HuggingFace Hub.
REPO_ID = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
MODEL_FILENAME = "qwen2.5-0.5b-instruct-q4_k_m.gguf"

# NOTE: the download is performed at import time so that ``model_path`` is
# available as a module-level name to this module and to anything importing it.
print("[SYSTEM] Fetching quantized model files from HuggingFace Hub cluster...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
print(f"[SYSTEM] Model secured safely at: {model_path}")


def get_local_llm_instance(
    *,
    n_ctx: int = 2048,
    n_threads: int = 4,
    n_batch: int = 512,
    verbose: bool = False,
) -> Llama:
    """Build a ``Llama`` instance from the model file fetched at import time.

    Called with no arguments, this passes the same settings the function has
    always used. The defaults were chosen by the original author for a small
    CPU-only host (their notes mention the HuggingFace free tier and 15GB of
    RAM); override them via keyword arguments on other hardware.

    Args:
        n_ctx: Value forwarded to ``Llama`` as ``n_ctx`` (context size).
        n_threads: Value forwarded to ``Llama`` as ``n_threads``.
        n_batch: Value forwarded to ``Llama`` as ``n_batch``.
        verbose: Value forwarded to ``Llama`` as ``verbose``.

    Returns:
        The newly constructed ``Llama`` object.
    """
    print("[SYSTEM] Loading weights inside internal RAM parameters...")
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        verbose=verbose,
    )
    print("[SYSTEM] Model weights successfully attached!")
    return llm