"""Download a quantized Llama 3 GGUF model and expose a loader for it.

Importing this module triggers the download (or cache lookup) of the model
file through ``hf_hub_download`` and stores the resulting local path in the
module-level ``model_path`` variable.
"""

import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Quantized GGUF model location on the Hugging Face Hub
# (noted by the original author as a verified single-file repo).
REPO_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
MODEL_FILENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"

print("[SYSTEM] Fetching verified Meta-Llama-3-8B-Instruct GGUF from Hub...")
try:
    # Keep the try body limited to the one call that is expected to fail.
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
except Exception as download_err:
    # Top-level boundary: report the failure, then re-raise the same
    # exception unchanged so the import fails loudly.
    print(f"[CRITICAL DOWNLOAD ERROR] Failed to fetch target file: {download_err}")
    raise
else:
    print(f"[SYSTEM] Model secured safely at: {model_path}")


def get_local_llm_instance(
    *,
    n_ctx: int = 2048,
    n_threads: int = 4,
    n_batch: int = 512,
    verbose: bool = False,
) -> Llama:
    """Create a ``Llama`` instance for the model file downloaded at import.

    All parameters are keyword-only and are forwarded unchanged to the
    ``Llama`` constructor; calling with no arguments reproduces the
    original hard-coded configuration.

    Args:
        n_ctx: Context size passed to ``Llama``. The original author chose
            2048 to speed up processing on a 15GB RAM host.
        n_threads: Thread count passed to ``Llama``. The default of 4 was
            chosen by the original author for the HuggingFace free tier.
        n_batch: Batch size passed to ``Llama``.
        verbose: Verbosity flag passed to ``Llama``.

    Returns:
        The constructed ``Llama`` object.
    """
    print("[SYSTEM] Loading weights inside internal RAM parameters...")
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        verbose=verbose,
    )
    print("[SYSTEM] Model weights successfully attached!")
    return llm