Spaces:
Running
Running
| # import torch | |
| # import gradio as gr | |
| # from transformers import AutoTokenizer, AutoModelForCausalLM | |
| # from peft import PeftModel | |
| # from transformers import BitsAndBytesConfig | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # base_model = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit" | |
| # finetuned_model = "saadkhi/SQL_Chat_finetuned_model" | |
| # tokenizer = AutoTokenizer.from_pretrained(base_model) | |
| # bnb = BitsAndBytesConfig(load_in_4bit=True) | |
| # model = AutoModelForCausalLM.from_pretrained( | |
| # base_model, | |
| # quantization_config=bnb, | |
| # torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32, | |
| # device_map="auto" | |
| # ) | |
| # model = PeftModel.from_pretrained(model, finetuned_model).to(device) | |
| # model.eval() | |
| # def chat(prompt): | |
| # inputs = tokenizer(prompt, return_tensors="pt").to(device) | |
| # with torch.inference_mode(): | |
| # output = model.generate( | |
| # **inputs, | |
| # max_new_tokens=60, | |
| # temperature=0.1, | |
| # do_sample=False | |
| # ) | |
| # return tokenizer.decode(output[0], skip_special_tokens=True) | |
| # iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="SQL Chatbot") | |
| # iface.launch() | |
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| from peft import PeftModel | |
| import torch | |
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Base checkpoint and the LoRA adapter stacked on top of it.
# Both are loaded once, at module import, and shared by every request.
base_model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
lora_model_name = "saadkhi/SQL_Chat_finetuned_model"

print("Loading model (20–40s first time)...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",
    # NOTE(review): trust_remote_code runs Python shipped in the model repo;
    # confirm it is still required for this checkpoint.
    trust_remote_code=True,
    # flash_attention_2 is intentionally not requested — avoids install issues.
)
model = PeftModel.from_pretrained(base_model, lora_model_name)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
model.eval()  # inference only
print("Model ready!")
def chat(message, history):
    """Generate the assistant's reply to ``message`` and add it to the history.

    Args:
        message: Text the user just submitted.
        history: Earlier turns as ``(user, assistant)`` pairs. ``None`` is
            treated as an empty conversation.

    Returns:
        A ``(history, "")`` tuple: a new history list that includes the
        latest exchange, and an empty string for clearing the input box.
        When ``message`` is empty or whitespace-only, the history is
        returned unchanged and no generation is performed.
    """
    # Work on a copy so the caller's list is never mutated.
    history = list(history or [])

    # Nothing to answer: skip the expensive generation call entirely.
    if not message or not message.strip():
        return history, ""

    # Rebuild the full conversation in the role/content format expected by
    # the tokenizer's chat template.
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # return_dict=True yields input_ids together with the attention mask, so
    # generate() receives an explicit mask and the code does not depend on
    # apply_chat_template returning a bare tensor.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        repetition_penalty=1.1,
        use_cache=True,  # KV cache = faster sequential tokens
        eos_token_id=tokenizer.eos_token_id,
    )

    # The output sequence starts with the prompt tokens; decode only what
    # was newly generated.
    response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )
    history.append((message, response))
    return history, ""
# UI: chat window, question box and a clear button.
with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# SQL Chat Assistant")
    gr.Markdown("Fine-tuned Phi-3 Mini for SQL. Fast responses (3–8s on GPU).")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(
        label="Your Question",
        placeholder="e.g., delete duplicate rows from users table based on email",
        lines=2,
    )
    clear = gr.Button("Clear")

    # chat() returns (updated history, "") -> refresh the chat and empty the box.
    msg.submit(chat, [msg, chatbot], [chatbot, msg])
    # The callback returns two values, so both components must be listed as
    # outputs; with only `chatbot` listed, the whole tuple would be assigned
    # to the chatbot instead of resetting it.
    clear.click(lambda: ([], ""), None, [chatbot, msg])

# Queue incoming requests (at most 30 waiting) before starting the server.
demo.queue(max_size=30)
demo.launch()