# CPU SAFE HuggingFace Space (2026 stable)
import warnings

warnings.filterwarnings("ignore")

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# reduce CPU overload on free tier
torch.set_num_threads(1)

# ─────────────────────────
# Config
# ─────────────────────────
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
MAX_NEW_TOKENS = 180

print("Loading model...")

# ─────────────────────────
# Load base model (CPU-only, fp32 — free-tier Spaces have no GPU)
# ─────────────────────────
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cpu",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("Loading LoRA...")
model = PeftModel.from_pretrained(model, LORA_PATH)

print("Merging LoRA...")
# Fold adapter weights into the base model so inference pays no PEFT
# dispatch overhead and the PeftModel wrapper can be dropped.
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model.eval()
print("Model ready")


# ─────────────────────────
# Inference
# ─────────────────────────
def generate_sql(question):
    """Generate SQL for a natural-language *question* with the merged model.

    Applies the Phi-3 chat template, runs greedy decoding, and returns
    only the newly generated continuation (the echoed prompt is sliced
    off before decoding), with any residual chat-template markers removed.

    Returns a short usage message for empty / whitespace-only input.
    """
    # Guard both None/empty and whitespace-only questions.
    if not question or not question.strip():
        return "Enter a SQL question."

    messages = [{"role": "user", "content": question}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            # Greedy decoding. NOTE: `temperature=0` was removed — it is
            # invalid (must be > 0) in recent transformers and is ignored
            # when do_sample=False anyway.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # BUG FIX: decode only the generated tokens; decoding output[0] in
    # full would echo the user's prompt back in the answer.
    new_tokens = output[0][input_ids.shape[-1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # clean any chat-template artifacts that survive decoding
    for marker in ["<|assistant|>", "<|user|>", "<|end|>"]:
        text = text.replace(marker, "")

    return text.strip()


# ─────────────────────────
# UI
# ─────────────────────────
demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(lines=3, label="SQL Question"),
    outputs=gr.Textbox(lines=8, label="Generated SQL"),
    title="SQL Chat – Phi-3 mini",
    description="Free CPU Space. First response may take ~90s",
    cache_examples=False,
)

demo.launch()