import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Hugging Face Hub coordinates of the quantized (Q4_K_M) GGUF model file.
REPO_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
MODEL_FILENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"

# Resolve the model file at import time so that `model_path` is available to
# the rest of this module. Any failure is reported and then propagated, so the
# module never finishes importing without a usable path.
print("[SYSTEM] Fetching verified Meta-Llama-3-8B-Instruct GGUF from Hub...")
try:
    # Keep the guarded region to the one call that can actually fail.
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
except Exception as download_err:
    print(f"[CRITICAL DOWNLOAD ERROR] Failed to fetch target file: {download_err}")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise
else:
    print(f"[SYSTEM] Model secured safely at: {model_path}")

def get_local_llm_instance(*, n_ctx=2048, n_threads=4, n_batch=512) -> "Llama":
    """
    Build and return a new ``Llama`` instance for the downloaded GGUF model.

    Every call constructs a fresh ``Llama`` object from the module-level
    ``model_path``; nothing is cached here, so callers that want a single
    shared model should keep the returned object themselves.

    The defaults reproduce the values this module was originally tuned with.
    Per the original notes they target a small CPU-only host (HuggingFace
    free tier, roughly 15GB RAM), where a 2048-token context keeps
    processing fast.

    Args:
        n_ctx: Context size passed to ``Llama`` (default 2048).
        n_threads: Number of CPU threads passed to ``Llama`` (default 4).
        n_batch: Batch size passed to ``Llama`` (default 512).

    Returns:
        The newly constructed ``Llama`` instance (created with
        ``verbose=False``).
    """
    print("[SYSTEM] Loading weights inside internal RAM parameters...")
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        verbose=False,
    )
    print("[SYSTEM] Model weights successfully attached!")
    return llm