# NOTE(review): the lines below were Hugging Face blob-view page residue
# (Space status, file size, commit hashes, line-number gutter) accidentally
# captured with the source; kept as a comment so the file parses.
# Spaces: Sleeping | File size: 3,667 Bytes
# import torch
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModel
# from transformers import BitsAndBytesConfig
# device = "cuda" if torch.cuda.is_available() else "cpu"
# base_model = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
# finetuned_model = "saadkhi/SQL_Chat_finetuned_model"
# tokenizer = AutoTokenizer.from_pretrained(base_model)
# bnb = BitsAndBytesConfig(load_in_4bit=True)
# model = AutoModelForCausalLM.from_pretrained(
# base_model,
# quantization_config=bnb,
# torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
# device_map="auto"
# )
# model = PeftModel.from_pretrained(model, finetuned_model).to(device)
# model.eval()
# def chat(prompt):
# inputs = tokenizer(prompt, return_tensors="pt").to(device)
# with torch.inference_mode():
# output = model.generate(
# **inputs,
# max_new_tokens=60,
# temperature=0.1,
# do_sample=False
# )
# return tokenizer.decode(output[0], skip_special_tokens=True)
# iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="SQL Chatbot")
# iface.launch()
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
# Best 4-bit config for speed + low memory:
#  - nf4 quant type with double quantization shaves extra VRAM
#  - bfloat16 compute dtype keeps matmuls fast on modern GPUs
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# Load base model + LoRA adapter once at startup so requests reuse them.
base_model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
lora_model_name = "saadkhi/SQL_Chat_finetuned_model"

print("Loading model (20-40s first time)...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",  # let accelerate place layers (GPU when available)
    trust_remote_code=True,
    # flash_attention_2 intentionally not requested - avoids install issues
)
# Attach the fine-tuned LoRA weights on top of the quantized base model.
model = PeftModel.from_pretrained(base_model, lora_model_name)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
model.eval()  # inference mode: disables dropout and similar training behavior
print("Model ready!")
def chat(message, history):
    """Generate a model reply and return the updated chat state.

    Parameters
    ----------
    message : str
        The user's latest question.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns in gr.Chatbot tuple format.

    Returns
    -------
    tuple[list[tuple[str, str]], str]
        The updated history and an empty string used to clear the textbox.
    """
    # Rebuild the full conversation so the model sees prior context.
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # inference_mode() skips autograd bookkeeping -> faster, less memory.
    with torch.inference_mode():
        outputs = model.generate(
            inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            use_cache=True,  # KV cache = faster sequential tokens
            eos_token_id=tokenizer.eos_token_id,
            # Explicit pad token avoids the "pad_token_id not set" warning
            # and undefined padding behavior during generation.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    history.append((message, response))
    return history, ""
# UI
with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# SQL Chat Assistant")
    gr.Markdown("Fine-tuned Phi-3 Mini for SQL. Fast responses (3-8s on GPU).")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(
        label="Your Question",
        placeholder="e.g., delete duplicate rows from users table based on email",
        lines=2,
    )
    clear = gr.Button("Clear")
    # chat() returns (history, "") so it updates the chatbot and clears the box.
    msg.submit(chat, [msg, chatbot], [chatbot, msg])
    # BUG FIX: the lambda returns two values (empty history, empty textbox),
    # so BOTH components must be declared as outputs; previously only
    # `chatbot` was listed, which mismatches the 2-tuple return.
    clear.click(lambda: ([], ""), None, [chatbot, msg])

demo.queue(max_size=30)  # queue requests so concurrent users are serialized
demo.launch()