import subprocess, sys

# Pin Gradio 4.x (the Chatbot below uses tuple-style history, the 4.x default)
subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio==4.44.0", "--force-reinstall", "--no-deps"])

import gradio as gr

try:
    from llama_cpp import Llama
    print("llama-cpp-python already installed.")
except ImportError:
    print("Installing llama-cpp-python (fast CPU wheel)...")
    try:
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--no-cache-dir",
            "https://github.com/yownas/llama-cpp-python-wheels/releases/download/v0.3.16/llama_cpp_python-0.3.16+cpuavx-cp310-cp310-linux_x86_64.whl"
        ])
        print("llama-cpp-python installed from wheel.")
    except Exception as e:
        print(f"Wheel install failed ({e}); falling back to PyPI build...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--no-cache-dir",
            "llama-cpp-python==0.3.16", "--force-reinstall"
        ])
    from llama_cpp import Llama  # import now that the package is installed

from huggingface_hub import hf_hub_download

# === Model: Llama 3.2 3B Instruct, Q4_K_M GGUF (hugging-quants repo) ===
MODEL_REPO = "hugging-quants/Llama-3.2-3B-Instruct-Q4_K_M-GGUF"
MODEL_FILE = "llama-3.2-3b-instruct-q4_k_m.gguf"

print("Downloading Llama 3.2 3B Instruct (Q4_K_M)...")
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="./models",
    local_dir_use_symlinks=False  # deprecated (a no-op on recent huggingface_hub)
)
print(f"Model downloaded: {model_path}")

print("Loading model into memory (20-40 sec)...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,      # context window in tokens
    n_threads=8,     # CPU threads
    n_batch=512,     # prompt-processing batch size
    n_gpu_layers=0,  # CPU only
    verbose=False
)
print("Model loaded, ready to chat!")

def chat(message, history):
    """Append one user turn, run the model, and return the updated history."""
    if not message.strip():
        return history, ""

    # Rebuild the whole conversation in chat-completion message format
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        stream=False
    )
    bot_response = response["choices"][0]["message"]["content"].strip()
    history.append((message, bot_response))
    return history, ""
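# --- Optional streaming variant (a sketch, not part of the original script) ---
# Gradio runs a generator event handler as a streaming response, and
# llama-cpp-python's create_chat_completion yields OpenAI-style chunks when
# stream=True. This assumes the same `llm`, tuple-style history, and sampling
# settings as `chat`; to try it, wire chat_stream into the click/submit events
# below in place of chat.
def chat_stream(message, history):
    if not message.strip():
        yield history, ""
        return
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    partial = ""
    history = history + [(message, partial)]
    for chunk in llm.create_chat_completion(
        messages=messages, max_tokens=512, temperature=0.7, top_p=0.9,
        stop=["<|eot_id|>", "<|end_of_text|>"], stream=True
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:  # first/last chunks may carry no content
            partial += delta["content"]
            history[-1] = (message, partial)
            yield history, ""
    yield history, ""  # finalize even if the stream produced no content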
# === CSS & UI ===
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&family=Source+Code+Pro:wght@400;600&display=swap');
body, .gradio-container { background: #0c0c0c !important; font-family: 'JetBrains Mono', monospace !important; }
.gradio-container { max-width: 1400px !important; border: 1px solid #00ff00 !important; box-shadow: 0 0 10px rgba(0,255,0,0.3) !important; }
*, h1, h2, h3, label, p { color: #00ff00 !important; }
.message { background: #1a1a1a !important; border-left: 3px solid #00ff00 !important; padding: 12px !important; }
.user { border-left-color: #00cc00 !important; }
input, textarea { background: #1a1a1a !important; border: 1px solid #00ff00 !important; color: #00ff00 !important; }
button { background: #1a1a1a !important; border: 1px solid #00ff00 !important; color: #00ff00 !important; }
button:hover { background: #00ff00 !important; color: #000 !important; }
.primary { background: #00ff00 !important; color: #000 !important; }
footer { display: none !important; }
"""

with gr.Blocks(theme=gr.themes.Base(primary_hue="green"), css=custom_css, title="$ LLAMA TERMINAL") as demo:
    gr.Markdown("# $ LLAMA TERMINAL\n```\n> System Online | Llama 3.2 3B Ready\n> Type your query below...\n```")
    chatbot = gr.Chatbot(height=600)
    with gr.Row():
        msg = gr.Textbox(placeholder="$ Enter command...", show_label=False, scale=8, container=False)
        submit = gr.Button("SEND", scale=1, variant="primary")
    gr.Examples(["What is the capital of France?", "Write a haiku about AI"], inputs=msg)
    gr.ClearButton([msg, chatbot], value="CLEAR")

    submit.click(chat, [msg, chatbot], [chatbot, msg])
    msg.submit(chat, [msg, chatbot], [chatbot, msg])

if __name__ == "__main__":
    demo.queue().launch(share=True, server_name="0.0.0.0", server_port=7860)
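# Sanity check without the UI (a sketch; run it instead of launch(), which
# blocks, to confirm the model loads and responds):
#
#   out = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Reply with one word: pong"}],
#       max_tokens=8,
#   )
#   print(out["choices"][0]["message"]["content"])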