Spaces:
Sleeping
Sleeping
| # import torch | |
| # import gradio as gr | |
| # from transformers import AutoTokenizer, AutoModelForCausalLM | |
| # from peft import PeftModel | |
| # from transformers import BitsAndBytesConfig | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # base_model = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit" | |
| # finetuned_model = "saadkhi/SQL_Chat_finetuned_model" | |
| # tokenizer = AutoTokenizer.from_pretrained(base_model) | |
| # bnb = BitsAndBytesConfig(load_in_4bit=True) | |
| # model = AutoModelForCausalLM.from_pretrained( | |
| # base_model, | |
| # quantization_config=bnb, | |
| # torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32, | |
| # device_map="auto" | |
| # ) | |
| # model = PeftModel.from_pretrained(model, finetuned_model).to(device) | |
| # model.eval() | |
| # def chat(prompt): | |
| # inputs = tokenizer(prompt, return_tensors="pt").to(device) | |
| # with torch.inference_mode(): | |
| # output = model.generate( | |
| # **inputs, | |
| # max_new_tokens=60, | |
| # temperature=0.1, | |
| # do_sample=False | |
| # ) | |
| # return tokenizer.decode(output[0], skip_special_tokens=True) | |
| # iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="SQL Chatbot") | |
| # iface.launch() | |
| import gradio as gr | |
| from unsloth import FastLanguageModel | |
| import torch | |
# Load the fine-tuned model once at startup so every request reuses it.
#
# The fine-tuned repo is passed straight to ``from_pretrained``: Unsloth
# resolves a saved LoRA adapter to its base model and attaches the trained
# weights in one call.
# NOTE(review): assumes "saadkhi/SQL_Chat_finetuned_model" contains a saved
# LoRA adapter (or merged model) trained on
# "unsloth/Phi-3-mini-4k-instruct-bnb-4bit" -- confirm against the repo.
#
# ``FastLanguageModel.get_peft_model`` must NOT be used for this: it attaches
# a brand-new, untrained adapter for training, and its second positional
# argument is the LoRA rank ``r``, not a repository id.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="saadkhi/SQL_Chat_finetuned_model",  # fine-tuned HF repo
    max_seq_length=4096,
    dtype=None,  # auto-detect (bfloat16 if supported)
    load_in_4bit=True,
)

# Switch the model to Unsloth's inference mode before serving requests.
FastLanguageModel.for_inference(model)
# Token budget. _MAX_CONTEXT_TOKENS mirrors the max_seq_length the model is
# loaded with; the prompt must leave room for _MAX_NEW_TOKENS of output.
_MAX_CONTEXT_TOKENS = 4096
_MAX_NEW_TOKENS = 256


def _build_messages(history, message):
    """Turn (user, bot) history pairs plus the new message into role/content dicts."""
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:  # a turn without a reply contributes only the user side
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    return messages


def chat(message, history):
    """Generate a reply to ``message`` given the previous conversation.

    Args:
        message: Text the user just submitted.
        history: Sequence of ``(user_msg, bot_msg)`` pairs from the Chatbot
            component, or ``None`` for an empty conversation. It is not
            modified.

    Returns:
        A ``(new_history, "")`` tuple: the conversation with the new
        exchange appended, and an empty string to clear the input box.
        A blank message returns the history unchanged without calling
        the model.
    """
    history = list(history or [])

    # Nothing to answer: skip generation instead of sending an empty turn.
    if not message or not message.strip():
        return history, ""

    # Drop the oldest exchanges until the prompt leaves room for the reply;
    # otherwise a long conversation overflows the model's context window.
    prompt_budget = _MAX_CONTEXT_TOKENS - _MAX_NEW_TOKENS
    first_kept = 0
    while True:
        inputs = tokenizer.apply_chat_template(
            _build_messages(history[first_kept:], message),
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        )
        # Stop once it fits, or when only the new message is left (an
        # oversized single message is passed through as before).
        if inputs.shape[-1] <= prompt_budget or first_kept >= len(history):
            break
        first_kept += 1
    inputs = inputs.to(model.device)

    output = model.generate(
        input_ids=inputs,
        max_new_tokens=_MAX_NEW_TOKENS,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        use_cache=True,
        repetition_penalty=1.1,
    )

    # The output starts with the prompt tokens; decode only what follows.
    response = tokenizer.decode(
        output[0][inputs.shape[-1]:], skip_special_tokens=True
    )

    # Return a new list rather than mutating the caller's history.
    return history + [(message, response)], ""
# Gradio chat interface: a Chatbot pane, a message box and a Clear button.
with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# SQL Chat Assistant")
    gr.Markdown("Ask any SQL-related question. Fast responses powered by fine-tuned Phi-3 Mini.")

    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(
        label="Your Message",
        placeholder="e.g., delete duplicate rows from users table",
        lines=2,
    )
    clear = gr.Button("Clear")

    # chat() returns (updated history, "") -> refresh the chat, clear the box.
    msg.submit(chat, [msg, chatbot], [chatbot, msg])

    # The callback returns two values, so it needs two output components.
    # With `chatbot` as the only output, the whole tuple ([], "") would be
    # assigned to the Chatbot as its value.
    clear.click(lambda: ([], ""), None, [chatbot, msg])

# Queue incoming requests, capping the queue at 20 pending events.
demo.queue(max_size=20)
demo.launch()