"""Hugging Face Space: professional coding assistant backed by Qwen3-4B Thinking.

Loads the tokenizer/model once at import time (standard Spaces pattern) and
exposes a single-question Gradio UI with deterministic (greedy) generation.
"""

import re
import time

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "rahul7star/Qwen3-4B-Thinking-2509-Genius-Coder-AI-Full"
MAX_INPUT_TOKENS = 4096   # truncation limit applied to the tokenized prompt
MAX_NEW_TOKENS = 4096     # hard upper bound on generated tokens

# ---------------- Model ----------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # fp16 on GPU, fp32 on CPU (fp16 is slow/unsupported on most CPUs)
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

# Important for Qwen: some checkpoints ship without a pad token, and
# generate() needs pad_token_id to be defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Model loaded successfully")

SYSTEM_PROMPT = """You are a professional AI Coding Assistant. Your responses must be:
- Clear and concise
- Well-structured with headings and bullet points
- Technically accurate
- Written in a formal, professional tone
- Focused on best practices and production-quality code
"""

# ---------------- Helper ----------------

# Compiled once; DOTALL so the reasoning block may span multiple lines.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", flags=re.DOTALL)


def strip_thinking(text: str) -> str:
    """Remove Qwen ``<think>...</think>`` reasoning blocks from *text*.

    BUG FIX: the previous pattern was ``r".*?"`` — the tag literals had been
    lost — which matches only empty strings, so nothing was ever stripped.
    """
    return _THINK_BLOCK_RE.sub("", text).strip()


# ---------------- Inference ----------------
@spaces.GPU()
def generate_answer(question, max_tokens):
    """Generate a deterministic answer to *question*.

    Parameters
    ----------
    question : str
        User question; blank input short-circuits with a prompt message.
    max_tokens : int | float
        Requested generation budget, clamped to ``MAX_NEW_TOKENS``.

    Returns
    -------
    str
        Model response with any <think> block removed, or an error message.
    """
    print("\n================ GENERATE ANSWER START ================")

    if not question or not question.strip():
        return "Please enter a valid question."

    try:
        start_time = time.time()

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question.strip()},
        ]

        # ✅ New Qwen chat template
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        ).to(model.device)

        input_token_count = inputs.input_ids.shape[-1]
        print(f"Input tokens: {input_token_count}")

        max_tokens = min(int(max_tokens), MAX_NEW_TOKENS)
        print(f"Final max_new_tokens: {max_tokens}")

        print("🚀 Starting generation...")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=False,  # Deterministic greedy decoding.
                # NOTE: temperature is intentionally not passed — sampling
                # parameters are ignored (and warned about) when
                # do_sample=False in recent transformers versions.
                repetition_penalty=1.05,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        print("✅ Generation finished")

        # Keep only the newly generated continuation, not the echoed prompt.
        generated_tokens = output[0][input_token_count:]
        response = tokenizer.decode(
            generated_tokens,
            skip_special_tokens=True,
        )

        # 🔥 Remove thinking block
        response = strip_thinking(response)

        print(f"Generated tokens: {generated_tokens.shape[-1]}")
        print(f"⏱ Total time: {time.time() - start_time:.2f} sec")
        print("================ GENERATE ANSWER END ==================\n")

        return response if response else "No output generated."

    except Exception as e:
        import traceback
        traceback.print_exc()
        # Best effort: free GPU memory after a failed generation so the
        # next request is not starved.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return f"Error occurred: {str(e)}"


# ---------------- UI ----------------
APP_CSS = """
.gradio-container { max-width: 900px !important; margin: auto; }
textarea { font-size: 14px !important; }
"""

# BUG FIX: theme/css were previously passed to demo.launch(), which accepts
# neither keyword (TypeError at startup) — they belong on gr.Blocks().
with gr.Blocks(theme=gr.themes.Soft(), css=APP_CSS) as demo:
    gr.Markdown(
        """
# 🤖 Professional Coding Assistant
**Powered by Qwen3-4B Thinking**

Optimized for:
- ⚡ Stable GPU inference
- 🧠 Deterministic responses
- 💻 Production-quality code
"""
    )

    question = gr.Textbox(
        label="Your Question",
        placeholder="Explain Quick Sort with complexity and a Python example",
        value="write a python code using pytorch for a simple neural network demo",
        lines=4,
    )

    answer = gr.Markdown(label="AI Response", elem_id="answer_box")

    max_tokens = gr.Slider(
        64, 4096, value=2048, step=32, label="Max New Tokens"
    )

    with gr.Row():
        submit = gr.Button("Generate Answer", variant="primary")
        copy_btn = gr.Button("📋 Copy Response")
        clear = gr.Button("Clear")

    submit.click(
        fn=generate_answer,
        inputs=[question, max_tokens],
        outputs=answer,
    )

    clear.click(
        fn=lambda: ("", ""),
        outputs=[question, answer],
    )

    # Pure client-side copy: no server round-trip needed.
    copy_btn.click(
        fn=None,
        js="""
        () => {
            const el = document.querySelector('#answer_box');
            navigator.clipboard.writeText(el.innerText);
        }
        """,
    )

demo.launch()