# app.py
# Minimal, stable version for a free-CPU Hugging Face Space – Phi-3-mini + LoRA
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# ────────────────────────────────────────────────────────────────
# Config
# ────────────────────────────────────────────────────────────────
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
MAX_NEW_TOKENS = 180
TEMPERATURE = 0.0  # ignored when DO_SAMPLE is False (greedy decoding)
DO_SAMPLE = False
# ────────────────────────────────────────────────────────────────
# Load model & tokenizer
# ────────────────────────────────────────────────────────────────
print("Loading base model (CPU)...")
try:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="cpu",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    print("Loading LoRA...")
    model = PeftModel.from_pretrained(model, LORA_PATH)
    print("Merging LoRA weights...")
    model = model.merge_and_unload()  # fold adapter weights into the base model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model.eval()
    print("Model & tokenizer loaded successfully")
except Exception as e:
    print(f"Model loading failed: {e}")
    raise
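# Note: bitsandbytes 4-bit kernels target CUDA GPUs, so the quantized load
# above may fail on a pure-CPU Space depending on the installed bitsandbytes
# version. A hedged fallback sketch (assumes the full-precision base
# "microsoft/Phi-3-mini-4k-instruct" fits in the Space's RAM and that the
# LoRA adapter applies cleanly to it); swap it into the try-block if needed:
#
#     model = AutoModelForCausalLM.from_pretrained(
#         "microsoft/Phi-3-mini-4k-instruct",
#         torch_dtype=torch.float32,
#         trust_remote_code=True,
#         low_cpu_mem_usage=True,
#     )
#     model = PeftModel.from_pretrained(model, LORA_PATH)
#     model = model.merge_and_unload()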
# ────────────────────────────────────────────────────────────────
# Inference function
# ────────────────────────────────────────────────────────────────
def generate_sql(question: str):
    try:
        messages = [{"role": "user", "content": question.strip()}]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        )
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                do_sample=DO_SAMPLE,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens so the prompt is not echoed back
        response = tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        )
        # Defensive cleanup of any Phi-3 chat markers that survive decoding
        for marker in ["<|assistant|>", "<|end|>", "<|user|>"]:
            if marker in response:
                response = response.split(marker, 1)[-1].strip()
        return response.strip() or "(empty response)"
    except Exception as e:
        return f"Generation error: {e}"
# ────────────────────────────────────────────────────────────────
# Gradio UI
# ────────────────────────────────────────────────────────────────
demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(
        label="SQL question",
        placeholder="Find duplicate emails in users table",
        lines=3,
        max_lines=6,
    ),
    outputs=gr.Textbox(
        label="Generated SQL",
        lines=8,
    ),
    title="SQL Chat – Phi-3-mini fine-tuned (CPU)",
    description=(
        "Free CPU version – first answer usually takes 60–180+ seconds.\n"
        "Later answers are faster (model stays in memory)."
    ),
    examples=[
        ["Find duplicate emails in users table"],
        ["Top 5 highest paid employees"],
        ["Count orders per customer last month"],
        ["Delete duplicate rows based on email"],
    ],
    cache_examples=False,
)
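# Optional tweak (not in the original script): on a shared CPU Space, Gradio's
# request queue keeps concurrent users from running generate() in parallel on
# the single model instance. queue(max_size=...) is a standard Gradio method:
# demo = demo.queue(max_size=8)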
if __name__ == "__main__":
    print("Launching interface...")
    demo.launch(
        server_name="0.0.0.0",
        # No fixed server_port → let Gradio pick a free port automatically
        debug=False,
        quiet=False,
        show_error=True,
        # prevent_thread_lock=True would make launch() return immediately and
        # let the script (and the server with it) exit, so keep it blocking
        prevent_thread_lock=False,
    )