| import os |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
|
|
| |
# Hugging Face Hub repository and file name of the GGUF weights to fetch.
REPO_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
MODEL_FILENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"


# The download runs at import time; the resulting `model_path` is read later
# by get_local_llm_instance().
print("[SYSTEM] Fetching verified Meta-Llama-3-8B-Instruct GGUF from Hub...")
try:
    # Keep the try body to the single call that can fail.
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
except Exception as download_err:
    # Top-level boundary: report the failure, then re-raise the same
    # exception untouched so callers see the original traceback.
    print(f"[CRITICAL DOWNLOAD ERROR] Failed to fetch target file: {download_err}")
    raise
else:
    print(f"[SYSTEM] Model secured safely at: {model_path}")
|
|
def get_local_llm_instance(
    *,
    n_ctx: int = 2048,
    n_threads: int = 4,
    n_batch: int = 512,
    verbose: bool = False,
) -> "Llama":
    """Create and return a ``Llama`` instance for the downloaded GGUF file.

    The weights are read from the module-level ``model_path``. Every call
    constructs a new ``Llama`` object; nothing is cached here.

    All arguments are keyword-only and are forwarded unchanged to the
    ``Llama`` constructor. The defaults reproduce the previously hard-coded
    configuration, so calling the function with no arguments behaves as
    before.

    Args:
        n_ctx: Context size passed to ``Llama``. The default of 2048 was
            chosen by the original author to speed up processing on a
            machine with 15GB RAM.
        n_threads: Thread count passed to ``Llama``. The default is a fixed
            4; it is not derived from the host's CPU count.
        n_batch: Batch size passed to ``Llama``.
        verbose: Verbosity flag passed to ``Llama``.

    Returns:
        The newly constructed ``Llama`` instance.
    """
    print("[SYSTEM] Loading weights inside internal RAM parameters...")
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        verbose=verbose,
    )
    print("[SYSTEM] Model weights successfully attached!")
    return llm