import gradio as gr
import torch
from unsloth import FastLanguageModel

# ── Global model (loaded once at startup) ───────────────────────────────
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",  # very fast pre-quantized base
    max_seq_length=2048,
    dtype=None,        # auto (bf16/float16)
    load_in_4bit=True,
)

# Load the LoRA adapter IN PLACE.
# BUGFIX: `load_adapter` returns None, so the original
# `for_inference(model.load_adapter(...))` handed None to for_inference.
# Load the adapter first, then enable the fast-inference wrapper.
model.load_adapter("saadkhi/SQL_Chat_finetuned_model")
model = FastLanguageModel.for_inference(model)
print("Model loaded successfully!")

# Resolved once; avoids re-querying CUDA availability on every request.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# ── Chat function ───────────────────────────────────────────────────────
def generate_response(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The latest user message (str).
        history: Prior turns as (user, assistant) pairs — the tuple-style
            history Gradio's ChatInterface passes by default.
            NOTE(review): if this app is upgraded to `type="messages"`,
            the unpacking below must change — confirm Gradio version.

    Returns:
        The model's reply text with the prompt stripped.
    """
    # Build a multi-turn messages list in chat-template format.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Use the proper chat template (very important for Phi-3).
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(DEVICE)

    # Greedy decoding: fastest and deterministic.
    # BUGFIX: removed `temperature=0.0` — it is ignored when do_sample=False
    # and recent transformers versions warn/raise on the combination.
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=180,  # increased but still reasonable
        do_sample=False,
        use_cache=True,
    )

    # BUGFIX: decode ONLY the newly generated tokens.  The old code decoded
    # the full sequence with skip_special_tokens=True and then split on
    # "<|assistant|>" — but that marker is itself a special token and was
    # stripped by the decode, so the split never fired and the entire prompt
    # leaked into the visible reply.
    new_tokens = outputs[0][inputs.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# ── Gradio UI ───────────────────────────────────────────────────────────
demo = gr.ChatInterface(
    fn=generate_response,
    title="SQL Chat Assistant (Fast Version)",
    description="Ask SQL related questions • Powered by Phi-3-mini + your fine-tune",
    examples=[
        "Write a query to find duplicate emails in users table",
        "How to delete rows with NULL values in column price?",
        "Select top 10 most expensive products",
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()