# model_final.py — Dual local-model chat UI.
# Routes math/code questions to Qwen2.5-Coder and everything else to Mistral,
# streaming tokens into a Gradio Chatbot (type="messages").
from ctransformers import AutoModelForCausalLM
from llama_cpp import Llama
import gradio as gr
import re
import threading

# ==============================
# LOAD MODELS — OPTIMAL SPEED
# ==============================
print("Loading Mistral...")
mistral_model = AutoModelForCausalLM.from_pretrained(
    r"C:\Users\ksrvisitor\Downloads\optimizationmodel\quant_model.gguf",
    model_type="mistral",
    threads=8,
    batch_size=512,
    context_length=8192,
    gpu_layers=0,
    temperature=0.7,
    top_p=0.9,
    top_k=30,
    repetition_penalty=1.1,
    max_new_tokens=1024,
)

print("Loading Qwen2.5-Coder...")
qwen_model = Llama(
    r"C:\Users\ksrvisitor\Downloads\qwen2.5-coder-7b-instruct-q4_k_m.gguf",
    n_ctx=8192,
    n_threads=4,      # Fastest on CPU
    n_batch=512,      # Fastest on CPU
    n_gpu_layers=0,   # Change to 35–99 if GPU
    use_mlock=True,
    verbose=False,
)

# Cooperative cancellation flag shared by both streaming generators and the UI.
stop_event = threading.Event()

# ==============================
# SMART DETECTION — MATH + CODE = ALWAYS QWEN
# ==============================
# Hoisted out of is_coding_or_math so they are built/compiled once, not per call.
_HAS_DIGIT = re.compile(r"\d")
_NUMBER_LIST = re.compile(r"\d+\s*,\s*\d+")  # e.g. "2, 6, 12, 20"

_MATH_TRIGGERS = (
    "next number", "series", "sequence", "pattern", "find the next",
    "what comes next", "solve", "calculate", "equation", "math",
    "mathematics", "integral", "derivative", "factorial", "prime",
    "geometry", "algebra", "probability", "statistics",
    "seconds", "minutes", "hours", "number", "triangular",
)
_CODE_TRIGGERS = (
    "code", "program", "write a", "implement", "function", "class",
    "python", "java", "c++", "javascript", "sql", "debug",
    "algorithm", "leetcode", "binary search",
)


def is_coding_or_math(text: str) -> bool:
    """Return True when *text* looks like a math or coding request (route to Qwen)."""
    text = text.lower()
    # Any math or code keyword → Qwen.
    if any(trigger in text for trigger in _MATH_TRIGGERS + _CODE_TRIGGERS):
        return True
    # Contains digits plus math symbols → Qwen.
    if _HAS_DIGIT.search(text) and any(op in text for op in "+-*/=^()[]{}"):
        return True
    # Contains comma-separated numbers (like 2, 6, 12, 20) → Qwen.
    if _NUMBER_LIST.search(text):
        return True
    return False


# ==============================
# STREAMING GENERATORS
# ==============================
def stream_mistral(prompt):
    """Stream a Mistral reply, yielding a growing messages-format chunk per token."""
    stop_event.clear()
    system_prompt = (
        "You are a helpful, concise assistant. "
        "Do NOT repeat the user's question. "
        "Answer directly and clearly."
    )
    # BUGFIX: the Mistral/Llama instruct template uses <<SYS>> ... <</SYS>> for the
    # system section. The previous "<>...<>" markers were not real tags, so they
    # leaked into the prompt text — a classic cause of the model echoing the input.
    formatted_prompt = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]"

    yield [{"role": "assistant", "content": "**[Mistral]**\n\n"}]

    output = ""
    # BUGFIX: stop=[""] (empty string) is a degenerate stop sequence; stop on the
    # end-of-sequence token instead.
    for token in mistral_model(
        formatted_prompt,
        stream=True,
        max_new_tokens=800,
        stop=["</s>"],
    ):
        if stop_event.is_set():
            break
        output += token
        yield [{"role": "assistant", "content": f"**[Mistral]**\n\n{output.strip()}"}]


def stream_qwen(prompt):
    """Stream a Qwen2.5-Coder reply (math/code) using the ChatML prompt format."""
    stop_event.clear()
    resp = ""
    yield [{"role": "assistant", "content": "**[Qwen2.5-Coder]**\n\n"}]

    formatted = (
        "<|im_start|>system\n"
        "You are a world-class math and coding assistant. "
        "ALWAYS respond with clean LaTeX. Use $...$ for inline and $$...$$ for display. "
        "Use \\boxed{} for final answers.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n" + prompt + "\n<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    for chunk in qwen_model(
        formatted,
        stream=True,
        max_tokens=800,
        temperature=0.1,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.05,
    ):
        if stop_event.is_set():
            break
        # Safe extraction — llama_cpp chunks may carry "text" or a chat "delta";
        # default to "" so a missing key never crashes the stream.
        choice = chunk["choices"][0]
        token = (
            choice.get("text")
            or choice.get("delta", {}).get("content", "")
            or ""
        )
        resp += token
        yield [{"role": "assistant", "content": f"**[Qwen2.5-Coder]**\n\n{resp}"}]


# ==============================
# MAIN CHAT — WORKS WITH MESSAGES FORMAT
# ==============================
def chat(message, history):
    """Route *message* to the right model and stream the growing history to Gradio.

    *history* may be messages-format dicts (Gradio type="messages") or legacy
    (user, assistant) tuples; both are normalized to role/content dicts.
    """
    stop_event.clear()

    messages = []
    for msg in history:
        if isinstance(msg, dict) and "role" in msg:
            messages.append(msg)
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # BUGFIX: a legacy entry is ONE (user, assistant) pair. The old code
            # iterated *inside* the pair ("for u, a in msg"), which tried to
            # unpack each string into two names and crashed/garbled the history.
            u, a = msg
            if u:
                messages.append({"role": "user", "content": u})
            if a:
                messages.append({"role": "assistant", "content": a})

    messages.append({"role": "user", "content": message})

    streamer = (
        stream_qwen(message) if is_coding_or_math(message) else stream_mistral(message)
    )

    partial = messages.copy()
    first = True
    for chunk in streamer:
        if stop_event.is_set():
            break
        if first:
            # First chunk appends the new assistant message ...
            partial.append(chunk[0])
            first = False
        else:
            # ... subsequent chunks replace it with the longer text.
            partial[-1] = chunk[0]
        yield partial


def stop():
    """Signal the active streaming generator to stop at its next token."""
    stop_event.set()


# ==============================
# UI
# ==============================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Dual Local AI — Clean Responses (No Echoing!)\n"
        "**Code/Math → Qwen2.5-Coder** | **Chat → Mistral**"
    )
    chatbot = gr.Chatbot(height=720, type="messages", show_copy_button=True)
    with gr.Row():
        txt = gr.Textbox(placeholder="Ask anything…", label="Message", lines=4, scale=8)
        send = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")

    # Clear the textbox only after the streamed reply finishes.
    send.click(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    txt.submit(chat, [txt, chatbot], chatbot).then(lambda: gr.update(value=""), outputs=txt)
    stop_btn.click(stop)

print("Launching FINAL version (no echoing, no crashes)...")
demo.launch(server_port=7860, inbrowser=True)