Spaces:
Sleeping
Sleeping
| # app.py - Optimized for Hugging Face Spaces (Unsloth = 2-4x faster) | |
| import torch | |
| import gradio as gr | |
| from unsloth import FastLanguageModel | |
# ──────────────────────────────────────────────────────────────
# Runtime configuration for the Space.
MAX_NEW_TOKENS = 180  # upper bound on tokens generated per request
TEMPERATURE = 0.0  # 0 => greedy decoding: fastest and deterministic
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"  # 4-bit base checkpoint
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"  # fine-tuned LoRA adapter repo
# ──────────────────────────────────────────────────────────────
# Load the fine-tuned model for inference.
#
# The adapter repo (LORA_PATH) is passed as `model_name`, so Unsloth reads the
# adapter config, loads the base checkpoint it names, and attaches the
# *trained* LoRA weights in a single call.
#
# The previous version loaded the base model and then called
# `FastLanguageModel.get_peft_model(...)`. That call attaches a brand-new,
# untrained adapter (it is the training-setup entry point) and LORA_PATH was
# never read, so the app was effectively serving the plain base model.
#
# NOTE(review): the base checkpoint actually loaded is the one recorded in the
# adapter's config -- confirm it corresponds to BASE_MODEL.
print("Loading base model with Unsloth (4-bit)...")
print("Applying your LoRA adapter...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=LORA_PATH,
    max_seq_length=2048,
    dtype=None,  # None lets Unsloth pick the dtype (bfloat16 on GPU)
    load_in_4bit=True,  # keep the base weights 4-bit quantized
)

# Switch the model to Unsloth's inference mode (faster generation kernels).
FastLanguageModel.for_inference(model)
print("Model ready! (very fast now)")
# ──────────────────────────────────────────────────────────────
def generate_sql(prompt: str) -> str:
    """Generate a SQL answer for a natural-language request.

    The prompt is wrapped as a single user turn with the tokenizer's chat
    template, the model generates up to ``MAX_NEW_TOKENS`` tokens, and only
    the newly generated tokens are decoded and returned.

    Args:
        prompt: The user's question in plain text.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when ``prompt`` is empty or whitespace-only.
    """
    # Nothing to answer: skip the (expensive) generation call entirely.
    if not prompt or not prompt.strip():
        return ""

    messages = [{"role": "user", "content": prompt}]
    # return_dict=True also yields the attention mask, which generate() needs
    # to be unambiguous because pad_token_id is set to the EOS token below.
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)  # follow the model's device instead of guessing it

    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "use_cache": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if TEMPERATURE > 0.01:
        generation_kwargs.update(do_sample=True, temperature=TEMPERATURE)
    else:
        # Greedy decoding: do not pass a temperature at all, since a
        # temperature combined with do_sample=False is an inconsistent config.
        generation_kwargs["do_sample"] = False

    outputs = model.generate(**encoded, **generation_kwargs)

    # generate() returns prompt + completion. Decode only the completion:
    # with skip_special_tokens=True the chat markers are removed from the
    # text, so splitting on "<|assistant|>" cannot be relied on to drop the
    # echoed prompt.
    prompt_length = encoded["input_ids"].shape[-1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )

    # Defensive: cut at the end-of-turn marker if it survived decoding.
    return response.split("<|end|>")[0].strip()
# ──────────────────────────────────────────────────────────────
# Gradio UI: one text box in (the question), one text box out (the SQL),
# with a few clickable example prompts.
demo = gr.Interface(
    fn=generate_sql,
    title="SQL Chatbot - Ultra Fast (Unsloth)",
    description="Phi-3-mini 4-bit + your LoRA",
    inputs=gr.Textbox(
        lines=3,
        label="Ask SQL question",
        placeholder="Delete duplicate rows from users table based on email",
    ),
    outputs=gr.Textbox(label="Generated SQL"),
    examples=[
        ["Find duplicate emails in users table"],
        ["Top 5 highest paid employees"],
        ["Count orders per customer last month"],
    ],
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()