import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Configuration
REPO_ID = "MaziyarPanahi/Llama-3-8B-Instruct-v0.3-GGUF"
FILENAME = "Llama-3-8B-Instruct-v0.3.Q4_K_M.gguf"
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models", FILENAME)


def get_model() -> Llama:
    """
    Downloads the model if not present, then loads it into memory.

    Returns:
        A ready-to-use ``Llama`` instance configured for CPU inference.
    """
    if not os.path.exists(MODEL_PATH):
        print(f"⬇️ Model not found. Downloading {FILENAME} from Hugging Face...")
        os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            local_dir=os.path.dirname(MODEL_PATH),
            # NOTE(review): deprecated (and ignored) in recent huggingface_hub
            # releases; kept for compatibility with older versions, where
            # False forces a real file copy instead of a symlink into the
            # shared HF cache.
            local_dir_use_symlinks=False,
        )
        print("✅ Download complete.")
    else:
        print(f"✅ Model found at {MODEL_PATH}")

    print("🚀 Loading Llama-3 into memory (CPU Mode)...")
    # Initialize Llama (Free Tier: 2 vCPU, 16GB RAM)
    # n_ctx=2048 (Context window)
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=2,  # Optimizing for HF Spaces Free Tier
        verbose=False,
    )
    return llm


# Global instance for re-use: loaded lazily on the first call to
# generate_response so importing this module stays cheap.
_llm_instance = None


def _build_prompt(prompt: str, system_prompt: str = "") -> str:
    """Wrap the system and user messages in the Llama-3 chat template."""
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )


def generate_response(prompt: str, system_prompt: str = "") -> str:
    """
    Generate a completion for *prompt*, lazily loading the model on first use.

    Args:
        prompt: The user message.
        system_prompt: Optional system instruction (empty by default).

    Returns:
        The raw assistant text produced by the model.
    """
    global _llm_instance
    if _llm_instance is None:
        _llm_instance = get_model()

    output = _llm_instance(
        _build_prompt(prompt, system_prompt),
        max_tokens=512,
        # Stop at the end-of-turn token so the model does not begin a new turn.
        stop=["<|eot_id|>"],
        echo=False,
    )
    return output['choices'][0]['text']