Spaces:
Sleeping
Sleeping
| # app.py | |
| import torch | |
| import gradio as gr | |
| from unsloth import FastLanguageModel | |
# ────────────────────────────────────────────────────────────────
# Configuration - change here if needed
# ────────────────────────────────────────────────────────────────
# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS: int = 96
# Sampling temperature; generate_sql only samples when this exceeds 0.01,
# so 0.0 selects greedy decoding.
TEMPERATURE: float = 0.0
# Hub id of the 4-bit quantized base checkpoint.
BASE_MODEL: str = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
# Hub id of the fine-tuned SQL chat weights.
LORA_PATH: str = "saadkhi/SQL_Chat_finetuned_model"
# ────────────────────────────────────────────────────────────────
# ── Model loading ───────────────────────────────────────────────
# Load the fine-tuned checkpoint itself rather than the bare base model.
# Calling FastLanguageModel.get_peft_model() on the base model attaches
# brand-new, untrained LoRA adapters, so LORA_PATH was never actually used
# and the app answered with the un-tuned base model.
# NOTE(review): this assumes LORA_PATH is a repo Unsloth can load directly
# (a saved LoRA adapter or a merged model) and that, if it is an adapter,
# its config points at BASE_MODEL - confirm against the Hub repo.
print("Loading model with Unsloth...")
print("Loading LoRA adapters...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=LORA_PATH,
    max_seq_length=2048,
    dtype=None,  # let Unsloth pick the dtype
    load_in_4bit=True,
)

print("Preparing for inference...")
# Called for its side effect only: the return value is not relied on, so
# this works whether or not for_inference() returns the model.
FastLanguageModel.for_inference(model)

# Optional compilation. Feature-detect torch.compile instead of comparing
# version strings ("10.0" >= "2.0" is False lexicographically).
# NOTE(review): model.generate() may bypass the compiled wrapper's forward;
# measure whether this step gives any speedup before keeping it.
if torch.cuda.is_available() and hasattr(torch, "compile"):
    print("Compiling model...")
    model = torch.compile(model, mode="reduce-overhead")

print("Model ready!")
# ────────────────────────────────────────────────────────────────
def generate_sql(prompt: str) -> str:
    """Generate an answer to a SQL-related question with the loaded model.

    Args:
        prompt: The user's question in natural language.

    Returns:
        The decoded completion only (the prompt is not echoed back), or an
        empty string when ``prompt`` is empty or whitespace-only.
    """
    # Nothing to answer: skip the (expensive) model call entirely.
    if not prompt or not prompt.strip():
        return ""

    device = "cuda" if torch.cuda.is_available() else "cpu"
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True yields both input_ids and attention_mask. The mask
    # matters because pad_token_id is set to the EOS id below, so generate()
    # cannot infer padding on its own.
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "use_cache": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if TEMPERATURE > 0.01:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = TEMPERATURE
    else:
        # Greedy decoding: do not pass a temperature, which would be
        # ignored and only trigger a configuration warning.
        generation_kwargs["do_sample"] = False

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_kwargs,
    )

    # Decode only the newly generated tokens. Decoding the full sequence
    # with skip_special_tokens=True removes the chat markers, so splitting
    # on "<|assistant|>" never fires and the question gets echoed back.
    prompt_length = encoded["input_ids"].shape[-1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    # Defensive: drop anything after an end-of-turn marker that survived
    # decoding as plain text.
    return response.partition("<|end|>")[0].strip()
# ────────────────────────────────────────────────────────────────
def _build_demo():
    """Assemble the single-textbox Gradio interface around generate_sql."""
    question_box = gr.Textbox(
        label="Ask SQL related question",
        placeholder="Show me all employees with salary > 50000...",
        lines=3,
    )
    answer_box = gr.Textbox(label="Generated SQL / Answer")

    # One inner list per example row, matching the single input component.
    sample_questions = [
        ["Find all duplicate emails in users table"],
        ["Get top 5 highest paid employees"],
        ["How many orders per customer last month?"],
    ]

    return gr.Interface(
        fn=generate_sql,
        inputs=question_box,
        outputs=answer_box,
        title="SQL Chat Assistant (Phi-3-mini fine-tuned)",
        description="Fast version using Unsloth",
        examples=sample_questions,
        allow_flagging="never",
    )


# Kept as a module-level name so it stays importable as `demo`.
demo = _build_demo()

if __name__ == "__main__":
    demo.launch()