Spaces:

saadkhi
/

SQL_chatbot_API

Sleeping

App Files Files Community

SQL_chatbot_API / app.py

saadkhi

Update app.py

bbdf923 verified about 1 month ago

raw

history blame

4.17 kB

	# app.py - Fixed for recent Gradio versions (no allow_flagging)

	import torch
	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
	from peft import PeftModel

	# ────────────────────────────────────────────────────────────────
	# Fastest practical configuration
	# ────────────────────────────────────────────────────────────────

	BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
	LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

	MAX_NEW_TOKENS = 180
	TEMPERATURE = 0.0 # greedy = fastest
	DO_SAMPLE = False

	# ────────────────────────────────────────────────────────────────
	# 4-bit quantization (very important for speed)
	# ────────────────────────────────────────────────────────────────

	bnb_config = BitsAndBytesConfig(
	load_in_4bit = True,
	bnb_4bit_quant_type = "nf4",
	bnb_4bit_use_double_quant = True,
	bnb_4bit_compute_dtype = torch.bfloat16
	)

	print("Loading quantized base model...")
	model = AutoModelForCausalLM.from_pretrained(
	BASE_MODEL,
	quantization_config = bnb_config,
	device_map = "auto",
	trust_remote_code = True,
	torch_dtype = torch.bfloat16
	)

	print("Loading LoRA adapters...")
	model = PeftModel.from_pretrained(model, LORA_PATH)

	# Merge LoRA into base model → much faster inference
	model = model.merge_and_unload()

	tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

	model.eval()
	print("Model ready!")

	# ────────────────────────────────────────────────────────────────
	def generate_sql(prompt: str):
	messages = [{"role": "user", "content": prompt}]

	inputs = tokenizer.apply_chat_template(
	messages,
	tokenize=True,
	add_generation_prompt=True,
	return_tensors="pt"
	).to(model.device)

	with torch.inference_mode():
	outputs = model.generate(
	input_ids = inputs,
	max_new_tokens = MAX_NEW_TOKENS,
	temperature = TEMPERATURE,
	do_sample = DO_SAMPLE,
	use_cache = True,
	pad_token_id = tokenizer.eos_token_id,
	)

	response = tokenizer.decode(outputs[0], skip_special_tokens=True)

	# Clean output
	if "<\|assistant\|>" in response:
	response = response.split("<\|assistant\|>", 1)[-1].strip()
	response = response.split("<\|end\|>")[0].strip() if "<\|end\|>" in response else response

	return response

	# ────────────────────────────────────────────────────────────────
	# Gradio interface - modern style (no allow_flagging)
	# ────────────────────────────────────────────────────────────────

	demo = gr.Interface(
	fn=generate_sql,
	inputs=gr.Textbox(
	label="Ask SQL related question",
	placeholder="Show me all employees with salary > 50000...",
	lines=3
	),
	outputs=gr.Textbox(label="Generated SQL / Answer"),
	title="SQL Chatbot - Optimized",
	description="Phi-3-mini 4bit + LoRA merged",
	examples=[
	["Find duplicate emails in users table"],
	["Top 5 highest paid employees"],
	["Count orders per customer last month"]
	],
	# flag button is disabled by default in newer versions → no need for allow_flagging
	)

	if __name__ == "__main__":
	demo.launch()