import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch
import time
import psutil
import os

# --- FastAPI Imports ---
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

# CONFIGURATION
# Identifier and GGUF file name handed to AutoModelForCausalLM.from_pretrained
# in load_model().
MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
# Identifier handed to AutoTokenizer.from_pretrained; note it differs from
# MODEL_ID, i.e. the tokenizer is not loaded from the GGUF source.
TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Global variables for model and tokenizer
# Both start as None and are assigned by load_model(); the request handlers
# treat `model is None` as "not ready".
model = None
tokenizer = None
# Human-readable loading state; load_model() overwrites it with a success or
# error message, and it is shown in the UI metrics.
load_status = "🔄 Initializing..."

def load_model():
    """Load the tokenizer and model into the module-level globals.

    Assigns ``tokenizer`` first and ``model`` second, requesting float32
    weights mapped to the CPU. The outcome (success or the text of the
    raised exception) is stored in ``load_status`` and printed. Exceptions
    are not propagated to the caller.
    """
    global model, tokenizer, load_status

    def _publish(status):
        # Record the status for the UI first, then echo the stored value.
        global load_status
        load_status = status
        print(load_status)

    weight_options = {
        "gguf_file": GGUF_FILE,
        "torch_dtype": torch.float32,
        "device_map": "cpu",
    }

    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

        print(f"Loading GGUF weights from {MODEL_ID}...")
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **weight_options)

        # Reported inside the try so a failure here is handled like any
        # other loading error.
        _publish("✅ Model Loaded Successfully")
    except Exception as e:
        _publish(f"❌ Error: {str(e)}")

# Start loading in the background
# This runs when the module is executed/imported, so the rest of the file
# (UI and API setup) proceeds without waiting for load_model() to finish.
# daemon=True means this thread does not keep the interpreter alive at exit.
Thread(target=load_model, daemon=True).start()

def get_stats():
    """Return a one-line summary of system memory usage.

    The string reports the percentage of RAM in use followed by the used
    and total amounts, each divided by 1024**3 and shown with one decimal.
    """
    bytes_per_gb = 1024**3
    memory = psutil.virtual_memory()
    used_gb = memory.used / bytes_per_gb
    total_gb = memory.total / bytes_per_gb
    return f"RAM: {memory.percent}% | {used_gb:.1f}GB / {total_gb:.1f}GB"

# ─────────────────────────────────────────────────────────────
# GRADIO CHAT GENERATOR (For the UI)
# ─────────────────────────────────────────────────────────────
def chat(message, history):
    """Stream a single-turn model reply for the Gradio UI.

    Args:
        message: Text of the user's latest prompt.
        history: Earlier chat turns passed by the caller. It is not used
            when building the prompt, so each reply is generated without
            conversational context.

    Yields:
        ``(text, stats)`` tuples: the reply accumulated so far and a
        one-line metrics string (elapsed time, rate, RAM usage and load
        status). If the model is not loaded, a single notice paired with
        ``load_status`` is yielded instead.
    """
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return

    # Take the BOS text from the tokenizer and switch off automatic special
    # tokens so the sequence starts with exactly one real BOS token. A
    # hand-typed BOS literal that does not match the tokenizer's vocabulary
    # is encoded as ordinary text, and leaving automatic special tokens on
    # can prepend a second BOS in front of it.
    bos = tokenizer.bos_token or ""
    prompt = f"{bos}<｜User｜>{message}<｜Assistant｜><think>\n"
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to("cpu")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() blocks until completion, so it runs on a worker thread while
    # this generator consumes the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    start_time = time.time()
    generated_text = ""
    # Number of items received from the streamer; used as the token estimate
    # for the t/s figure.
    token_count = 0

    for new_text in streamer:
        generated_text += new_text
        token_count += 1
        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
        yield generated_text, stats

    # The stream has ended, so generation is finishing; reap the worker
    # thread instead of leaving it dangling.
    thread.join()

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard + API")

    # Layout: a wide conversation column next to a narrow metrics column.
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
        with gr.Column(scale=1):
            # Initial text is computed once, when the UI is built.
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        # Clear the textbox and append a new turn whose reply is still empty.
        pending_turn = [message, ""]
        return "", chat_history + [pending_turn]

    def stream_bot(chat_history):
        # The last row is the pending turn: cell 0 holds the prompt, cell 1
        # is filled in with the reply as it streams.
        current_turn = chat_history[-1]
        for content, stats in chat(current_turn[0], chat_history[:-1]):
            current_turn[1] = content
            yield chat_history, stats

    # On submit: first record the user's turn, then stream the reply into it
    # while refreshing the metrics panel.
    submitted = msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    submitted.then(stream_bot, inputs=chatbot, outputs=[chatbot, stats_box])

    # Reset the conversation by sending None to the chatbot component.
    clear.click(lambda: None, inputs=None, outputs=chatbot, queue=False)


# ─────────────────────────────────────────────────────────────
# FASTAPI APPLICATION (The Bridge API)
# ─────────────────────────────────────────────────────────────
# Application object that the POST /chat route is registered on; the Gradio
# UI is mounted onto it further down in this file.
app = FastAPI(title="DeepSeek API Bridge")

# Request body for POST /chat.
class ChatRequest(BaseModel):
    # Required instruction text.
    message: str
    # Optional text placed before `message` when the prompt is built;
    # defaults to an empty string.
    system: str = ""

@app.post("/chat")
def api_chat(req: ChatRequest):
    """
    This endpoint catches the JSON payload from the Perspective Engine
    and processes it through DeepSeek-R1 synchronously.

    Returns a JSON object with a single "response" field holding the
    generated text. Responds with HTTP 503 while the model is not loaded.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model is still loading into RAM.")

    # Combine the Engine's structured JSON prompt with the actual instruction
    combined_prompt = f"{req.system}\n\n{req.message}".strip()

    # Take the BOS text from the tokenizer and switch off automatic special
    # tokens so the sequence starts with exactly one real BOS token. A
    # hand-typed BOS literal that does not match the tokenizer's vocabulary
    # is encoded as ordinary text, and leaving automatic special tokens on
    # can prepend a second BOS in front of it.
    bos = tokenizer.bos_token or ""
    prompt = f"{bos}<｜User｜>{combined_prompt}<｜Assistant｜><think>\n"
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to("cpu")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the input prompt out of the generated tokens
    input_length = inputs.input_ids.shape[1]
    generated_tokens = outputs[0][input_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print("\n[API] Responded to Perspective Engine constraint query.")

    return {"response": response_text}

# Mount the Gradio UI onto the FastAPI app
# `app` is rebound to the value returned by mount_gradio_app, so the same
# object serves the UI at "/" alongside the /chat route registered earlier.
app = gr.mount_gradio_app(app, demo, path="/")

# Run the combined app with uvicorn only when this file is executed directly.
if __name__ == "__main__":
    print("\n🌐 Starting DeepSeek Server on port 7860...")
    print("   UI  available at: http://0.0.0.0:7860/")
    print("   API available at: http://0.0.0.0:7860/chat\n")
    # Bind to all interfaces on the same port the messages above announce.
    uvicorn.run(app, host="0.0.0.0", port=7860)