# SQL_chatbot_API / app.py — Hugging Face Space entry point
# Author: saadkhi · revision e95c2d3 (verified) · ~2.54 kB
# CPU SAFE HuggingFace Space (2026 stable)
import warnings
warnings.filterwarnings("ignore")
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
# Cap torch at a single thread: the free-tier Space shares a small vCPU,
# and oversubscribing it makes inference slower, not faster.
torch.set_num_threads(1)
# ─────────────────────────
# Config
# ─────────────────────────
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"   # base checkpoint on the Hub
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"  # fine-tuned LoRA adapter repo
MAX_NEW_TOKENS = 180                            # generation budget per request
print("Loading model...")
# ─────────────────────────
# Load base model
# ─────────────────────────
# float32 on CPU (no half-precision kernels there); low_cpu_mem_usage keeps
# peak RSS down while the weights stream in.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cpu",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
print("Loading LoRA...")
# Attach the fine-tuned adapter, then fold its weights into the base model
# so inference runs on a single plain model (no PEFT overhead per forward).
model = PeftModel.from_pretrained(model, LORA_PATH)
print("Merging LoRA...")
model = model.merge_and_unload()
# Tokenizer comes from the base model; the LoRA repo does not change it.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model.eval()  # disable dropout etc. for deterministic inference
print("Model ready")
# ─────────────────────────
# Inference
# ─────────────────────────
def generate_sql(question):
    """Generate SQL for a natural-language question via the merged model.

    Args:
        question: User's question as a string. ``None``, empty, or
            whitespace-only input short-circuits with a prompt message
            instead of running a costly CPU generation.

    Returns:
        str: The model's answer with chat-template artifacts stripped.
    """
    # Guard: gr.Textbox can yield "" or None; also reject whitespace-only.
    if not question or not question.strip():
        return "Enter a SQL question."
    messages = [{"role": "user", "content": question}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            # Greedy decoding. `temperature=0` was removed: it is ignored
            # when do_sample=False and only triggers transformers warnings.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    # BUG FIX: decode only the newly generated tokens. Decoding output[0]
    # in full echoed the chat-templated prompt (the user's question) back
    # into the answer.
    generated_tokens = output[0][input_ids.shape[-1]:]
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # Strip any Phi-3 chat markers the tokenizer left behind.
    for t in ["<|assistant|>", "<|user|>", "<|end|>"]:
        text = text.replace(t, "")
    return text.strip()
# ─────────────────────────
# UI
# ─────────────────────────
# Minimal Gradio frontend: one question box in, one SQL box out.
question_box = gr.Textbox(lines=3, label="SQL Question")
answer_box = gr.Textbox(lines=8, label="Generated SQL")
demo = gr.Interface(
    fn=generate_sql,
    inputs=question_box,
    outputs=answer_box,
    title="SQL Chat – Phi-3 mini",
    description="Free CPU Space. First response may take ~90s",
    cache_examples=False,
)
demo.launch()