import os
import time
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 🔧 CONFIGURATION
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"  # Hub repository to download from
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"  # file requested from that repository

# START_TIME is taken before the download step, so LOAD_TIME (computed below)
# covers both resolving the model file and loading it.
print("⏳ Starting Python Dev Assistant Space...")
START_TIME = time.time()

# 1️⃣ Resolve the model file to a local path through the Hugging Face Hub
#    helper (a cached copy is reused when one is available).
print(f"📦 Checking cache for {MODEL_FILE}...")
model_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)
print(f"✅ Model cached at: {model_path}")

# 2️⃣ Create the single module-level `llm` instance; this runs once, when the
#    module is imported, and the object is reused afterwards.
print("🧠 Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096, n_batch=512,
    n_threads=2,
    use_mlock=True, verbose=False,
)
LOAD_TIME = round(time.time() - START_TIME, 1)
print(f"🚀 Model loaded in {LOAD_TIME}s. Ready for prompts!")

# 3️⃣ Generation function (reuses `llm` every time)
def _extract_code(text: str) -> str:
    """Return the contents of the first Markdown code fence in *text*.

    Text that contains no fence is returned with surrounding whitespace
    stripped. An opening fence that is never closed (for example because
    generation hit the token limit) yields everything after that fence.
    """
    _preamble, fence, after = text.partition("```")
    if not fence:
        return text.strip()
    # An opening fence is normally followed by a language tag on its own line
    # ("```python\n"). Drop that line, unless the closing fence sits on the
    # same line (an inline block), in which case the line is the code itself.
    first_line, newline, remainder = after.partition("\n")
    if newline and "```" not in first_line:
        after = remainder
    return after.partition("```")[0].strip("\n")


def generate_python_code(user_prompt: str) -> str:
    """Generate Python code for *user_prompt* with the module-level `llm`.

    Args:
        user_prompt: Free-text description of the task, as typed by the user.

    Returns:
        The generated code without Markdown fences, or an empty string when
        the prompt is empty, whitespace-only or ``None`` (no inference is run
        in that case).
    """
    if not user_prompt or not user_prompt.strip():
        return ""

    inference_start = time.time()
    print(f"🔹 Processing prompt at {time.strftime('%H:%M:%S')}")

    messages = [
        {"role": "system", "content": "You are an expert Python developer. Write clean, PEP-8 compliant code with type hints. Output only code unless asked otherwise."},
        {"role": "user", "content": user_prompt},
    ]

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.2,
        top_p=0.9,
        repeat_penalty=1.1,
        # "```" must not be a stop string: generation would halt at the
        # OPENING fence of a fenced answer and return little or nothing.
        # Fences are removed from the finished reply instead.
        stop=["</s>"],
    )

    inference_time = round(time.time() - inference_start, 2)
    print(f"✅ Done in {inference_time}s")

    reply = output["choices"][0]["message"]["content"] or ""
    return _extract_code(reply)

# 4️⃣ Gradio UI
def _build_interface():
    """Assemble the Gradio interface that wraps `generate_python_code`."""
    prompt_box = gr.Textbox(lines=4, placeholder="Describe your Python task...")
    code_view = gr.Code(language="python")
    sample_prompts = [
        ["Write a Pydantic v2 model for a User with email validation"],
        ["Create an async retry wrapper for HTTP requests using aiohttp"],
    ]
    return gr.Interface(
        fn=generate_python_code,
        inputs=prompt_box,
        outputs=code_view,
        examples=sample_prompts,
        title="🐍 Python Dev Assistant",
        # The description is formatted once, using the startup time measured above.
        description=f"Loaded `{MODEL_FILE}` in {LOAD_TIME}s. Model stays in RAM between prompts.",
    )


# Kept as a module-level name, exactly as before.
demo = _build_interface()

# Launch the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()