| import os |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
|
|
| |
# Hugging Face Hub coordinates of the quantized GGUF model file.
REPO_ID = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
MODEL_FILENAME = "qwen2.5-0.5b-instruct-q4_k_m.gguf"


# NOTE: this runs at import time. The value returned by hf_hub_download is
# kept in the module-level `model_path`, which the loader function reads.
print("[SYSTEM] Fetching quantized model files from HuggingFace Hub cluster...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=MODEL_FILENAME,
)
print(f"[SYSTEM] Model secured safely at: {model_path}")
|
|
def get_local_llm_instance(
    *,
    n_ctx: int = 2048,
    n_threads: int = 4,
    n_batch: int = 512,
    verbose: bool = False,
) -> "Llama":
    """Build and return a ``Llama`` instance for the downloaded model file.

    The weights are read from the module-level ``model_path``. Calling this
    function with no arguments uses exactly the configuration that used to be
    hard-coded here (2048 / 4 / 512 / quiet); the keyword arguments let a
    caller tune it for a different machine without editing this function.

    Args:
        n_ctx: Context size passed to ``Llama``. The 2048 default was picked
            by the original author to keep processing fast on a machine with
            roughly 15 GB of RAM.
        n_threads: Thread count passed to ``Llama``. This is a fixed default,
            not an auto-detected value; pass the count that suits the host.
        n_batch: Batch size passed to ``Llama`` (see the llama-cpp-python
            documentation for its exact meaning).
        verbose: Forwarded to ``Llama`` unchanged; ``False`` keeps the
            library's own output off.

    Returns:
        The constructed ``Llama`` object.
    """
    print("[SYSTEM] Loading weights inside internal RAM parameters...")
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        verbose=verbose,
    )
    print("[SYSTEM] Model weights successfully attached!")
    return llm