import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ----------------------------------------
# Global model cache
# ----------------------------------------
loaded_models = {}  # model_choice -> loaded Llama instance (kept for the process lifetime)
current_model_name = None

MODEL_CONFIGS = {
    "1B Model (Datangtang/GGUF1B)": {
        "repo_id": "Datangtang/GGUF1B",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf",
    },
    "3B Model (Datangtang/GGUF3B)": {
        "repo_id": "Datangtang/GGUF3B",
        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf",
    },
}


# ----------------------------------------
# Load model function
# ----------------------------------------
def load_model(model_choice):
    """Return the Llama instance for *model_choice*, downloading/loading it on first use.

    Loaded models are cached in ``loaded_models`` so switching back and forth
    between models does not re-download or re-load anything.
    """
    global loaded_models, current_model_name

    if model_choice in loaded_models:
        print(f"Reusing already loaded model: {model_choice}")
        current_model_name = model_choice
        return loaded_models[model_choice]

    print(f"Downloading model: {model_choice}")
    cfg = MODEL_CONFIGS[model_choice]
    model_path = hf_hub_download(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],
        local_dir="./model",
        # .get() instead of [] so a missing HF_TOKEN falls back to anonymous
        # download rather than crashing with KeyError (repos may be public).
        token=os.environ.get("HF_TOKEN"),
    )
    print(f"Model downloaded to: {model_path}")

    print("Loading GGUF model into memory...")
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,
        n_threads=6,
        n_batch=512,
        n_gpu_layers=0,  # CPU-only inference
        use_mmap=True,
        use_mlock=True,
        verbose=False,
    )

    loaded_models[model_choice] = llm
    current_model_name = model_choice
    print("Model loaded successfully!")
    return llm


# ----------------------------------------
# Chat function
# ----------------------------------------
def chat(message, history, model_choice):
    """Generate one assistant reply for *message* given the recent *history*.

    *history* may contain ChatInterface message dicts ({"role", "content"})
    or legacy (user, assistant) tuples; only the last 3 entries are used to
    keep the prompt within the small n_ctx.
    """
    llm = load_model(model_choice)

    # System prompt
    conversation = "System: You are a helpful assistant.\n"

    # Convert ChatInterface history into a plain-text prompt.
    for msg in history[-3:]:
        if isinstance(msg, dict):
            # ChatInterface format: {"role": "...", "content": "..."}
            role = msg.get("role")
            content = msg.get("content", "")
            if role == "user":
                conversation += f"User: {content}\n"
            elif role == "assistant":
                conversation += f"Assistant: {content}\n"
        elif isinstance(msg, (list, tuple)):
            # Safety: old tuple format
            human, assistant = msg
            conversation += f"User: {human}\n"
            if assistant:
                conversation += f"Assistant: {assistant}\n"

    # Add current message
    conversation += f"User: {message}\nAssistant:"

    # Generate model response
    response = llm(
        conversation,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "Assistant:"],
        echo=False,
    )
    return response["choices"][0]["text"].strip()


# ----------------------------------------
# Gradio UI
# ----------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
    gr.Markdown("Switch between **1B** and **3B** GGUF models in real-time.")

    model_choice = gr.Dropdown(
        label="Select Model",
        choices=list(MODEL_CONFIGS.keys()),
        value="1B Model (Datangtang/GGUF1B)",
    )
    # Visible target for the "switched model" notification below; the
    # original wired the change-callback to outputs=[] so its return value
    # was silently dropped.
    switch_status = gr.Markdown("")

    chat_iface = gr.ChatInterface(
        # additional_inputs passes the *live* dropdown selection into chat();
        # the previous lambda read model_choice.value, which is only the
        # initial value, so switching models in the UI had no effect.
        fn=chat,
        additional_inputs=[model_choice],
        # With additional_inputs, each example is [message, *additional].
        examples=[
            ["Explain deep learning in one paragraph.", "1B Model (Datangtang/GGUF1B)"],
            [
                "What is the difference between supervised and unsupervised learning?",
                "1B Model (Datangtang/GGUF1B)",
            ],
            ["Explain what a transformer model is.", "1B Model (Datangtang/GGUF1B)"],
        ],
        cache_examples=False,
    )

    model_choice.change(
        fn=lambda x: f"🔄 Switched to: {x}",
        inputs=[model_choice],
        outputs=[switch_status],
    )

if __name__ == "__main__":
    demo.launch()