# app.py
# Minimal & stable version for free CPU Hugging Face Space – Phi-3-mini + LoRA

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ────────────────────────────────────────────────────────────────
# Config
# ────────────────────────────────────────────────────────────────

# Identifier of the base checkpoint; used for both the causal-LM weights and
# the tokenizer. (The "-bnb-4bit" suffix suggests a pre-quantized checkpoint —
# TODO confirm against the model card.)
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
# Identifier of the LoRA adapter that is loaded on top of the base model and
# then merged into it at start-up.
LORA_PATH  = "saadkhi/SQL_Chat_finetuned_model"

# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 180
# Sampling temperature; only meaningful when DO_SAMPLE is True.
TEMPERATURE    = 0.0
# False disables sampling in model.generate (deterministic decoding).
DO_SAMPLE      = False

# ────────────────────────────────────────────────────────────────
# Load model & tokenizer
# ────────────────────────────────────────────────────────────────

# Start-up sequence: quantized base model -> LoRA adapter -> merged model,
# then the matching tokenizer. Any failure is logged and re-raised so the
# process does not continue without a usable model.
print("Loading base model (CPU)...")
try:
    # 4-bit NF4 quantization settings with bfloat16 as the compute dtype.
    bnb_config = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )

    # Keyword arguments for the base-model load, kept together for readability.
    _base_load_kwargs = {
        "quantization_config": bnb_config,
        "device_map": "cpu",
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
    }
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **_base_load_kwargs)

    # Attach the fine-tuned adapter, then fold its weights into the base model
    # so inference runs on a plain (non-PEFT) model object.
    print("Loading LoRA...")
    model = PeftModel.from_pretrained(model, LORA_PATH)
    print("Merging LoRA weights...")
    model = model.merge_and_unload()

    # The tokenizer comes from the base checkpoint, not from the adapter repo.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    # Switch to evaluation mode for inference.
    model.eval()
except Exception as load_error:
    print(f"Model loading failed: {load_error}")
    raise
else:
    print("Model & tokenizer loaded successfully")

# ────────────────────────────────────────────────────────────────
# Inference function
# ────────────────────────────────────────────────────────────────

def _clean_response(text: str) -> str:
    """Strip leftover Phi-3 chat markers from freshly generated text.

    A leading ``<|assistant|>`` header is dropped, and the text is cut at the
    first marker that signals the end of the answer (end of turn or the start
    of another turn). Everything before that marker is kept.
    """
    text = text.strip()

    assistant_header = "<|assistant|>"
    if text.startswith(assistant_header):
        text = text[len(assistant_header):]

    # Keep the part BEFORE each stop marker: what follows is not the answer.
    for stop_marker in ("<|end|>", "<|user|>", "<|assistant|>"):
        text = text.split(stop_marker, 1)[0]

    return text.strip()


def generate_sql(question: str) -> str:
    """Generate a SQL answer for a natural-language question.

    Args:
        question: The user's question. ``None`` or whitespace-only input is
            rejected up front without running the model.

    Returns:
        The generated text with the prompt and chat markers removed,
        ``"(empty response)"`` if the model produced nothing, a short hint if
        the question is empty, or ``"Generation error: ..."`` if anything
        fails during tokenization or generation (the function never raises, so
        the UI always receives a string).
    """
    prompt = (question or "").strip()
    if not prompt:
        # Avoid spending a long CPU generation on an empty prompt.
        return "Please enter a question."

    try:
        messages = [{"role": "user", "content": prompt}]

        # return_dict=True yields both input_ids and attention_mask. The mask
        # matters because pad_token_id is set to the EOS id below, so it
        # cannot be inferred reliably from the ids alone.
        encoded = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        input_ids = encoded["input_ids"]
        prompt_length = input_ids.shape[-1]

        generation_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "do_sample": DO_SAMPLE,
            "use_cache": True,
            "pad_token_id": tokenizer.eos_token_id,
        }
        # Temperature is a sampling-only setting; passing it with sampling
        # disabled only produces a configuration warning.
        if DO_SAMPLE:
            generation_kwargs["temperature"] = TEMPERATURE

        with torch.inference_mode():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=encoded["attention_mask"],
                **generation_kwargs,
            )

        # generate() returns prompt + completion. Slice off the prompt tokens
        # instead of searching the decoded text for role markers, which
        # skip_special_tokens normally removes (that would leave the user's
        # question inside the answer).
        generated_ids = outputs[0][prompt_length:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True)

        return _clean_response(response) or "(empty response)"

    except Exception as e:
        return f"Generation error: {str(e)}"

# ────────────────────────────────────────────────────────────────
# Gradio UI
# ────────────────────────────────────────────────────────────────

# Input widget: multi-line box for the natural-language question.
_question_box = gr.Textbox(
    label="SQL question",
    placeholder="Find duplicate emails in users table",
    lines=3,
    max_lines=6,
)

# Output widget: read-out for the generated SQL.
_sql_box = gr.Textbox(label="Generated SQL", lines=8)

# Clickable sample questions shown under the form.
_example_questions = (
    "Find duplicate emails in users table",
    "Top 5 highest paid employees",
    "Count orders per customer last month",
    "Delete duplicate rows based on email",
)

# Two-line notice about CPU latency, joined with a newline.
_description = "\n".join(
    [
        "Free CPU version – first answer usually takes 60–180+ seconds.",
        "Later answers are faster (model stays in memory).",
    ]
)

# Single-function interface; examples are not pre-computed at start-up
# (cache_examples=False), so no model call happens before the first request.
demo = gr.Interface(
    fn=generate_sql,
    inputs=_question_box,
    outputs=_sql_box,
    title="SQL Chat – Phi-3-mini fine-tuned (CPU)",
    description=_description,
    examples=[[sample] for sample in _example_questions],
    cache_examples=False,
)

if __name__ == "__main__":
    print("Launching interface...")
    demo.launch(
        server_name="0.0.0.0",
        # NO fixed server_port → let Gradio pick free port automatically
        debug=False,
        quiet=False,
        show_error=True,
        # launch() must block here: with prevent_thread_lock=True it returns
        # immediately, the script reaches its end, and the server shuts down
        # together with the process.
        prevent_thread_lock=False,
    )