# app.py
import torch
import gradio as gr
from unsloth import FastLanguageModel

# ────────────────────────────────────────────────────────────────
#  Configuration - change here if needed
# ────────────────────────────────────────────────────────────────
MAX_NEW_TOKENS = 96
TEMPERATURE = 0.0          # 0.0 = greedy decoding = fastest
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

# ────────────────────────────────────────────────────────────────
print("Loading model with Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=2048,
    dtype=None,                # auto-detect (bf16 on GPU)
    load_in_4bit=True,
)

print("Loading LoRA adapters...")
model = FastLanguageModel.get_peft_model(
    model,
    r=64,                      # your original rank
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

print("Merging LoRA and preparing for inference...")
model = FastLanguageModel.for_inference(model)     # important! activates 2x faster kernels

# Optional - compile can give additional 20-60% speedup (PyTorch 2.0+)
if torch.cuda.is_available() and torch.__version__ >= "2.0":
    print("Compiling model...")
    model = torch.compile(model, mode="reduce-overhead")

print("Model ready!")

# ────────────────────────────────────────────────────────────────
def generate_sql(prompt: str):
    # Very clean chat template usage
    messages = [{"role": "user", "content": prompt}]
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=(TEMPERATURE > 0.01),
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Try to cut after assistant's answer
    if "<|assistant|>" in response:
        response = response.split("<|assistant|>", 1)[-1].strip()
    if "<|end|>" in response:
        response = response.split("<|end|>")[0].strip()

    return response


# ────────────────────────────────────────────────────────────────
demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(
        label="Ask SQL related question",
        placeholder="Show me all employees with salary > 50000...",
        lines=3,
    ),
    outputs=gr.Textbox(label="Generated SQL / Answer"),
    title="SQL Chat Assistant (Phi-3-mini fine-tuned)",
    description="Fast version using Unsloth",
    examples=[
        ["Find all duplicate emails in users table"],
        ["Get top 5 highest paid employees"],
        ["How many orders per customer last month?"],
    ],
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()