saadkhi committed
Commit c38fb83 · verified · 1 Parent(s): c2d5c36

Update app.py

Files changed (1)
  1. app.py +9 -11
app.py CHANGED
@@ -52,7 +52,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
 import torch
 
-# Quantization config for fast 4-bit loading
+# Best 4-bit config for speed + low memory
 quant_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -60,17 +60,17 @@ quant_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-# Load base model + your LoRA once at startup
+# Load base + your LoRA once
 base_model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
 lora_model_name = "saadkhi/SQL_Chat_finetuned_model"
 
-print("Loading model (20–40 seconds first time)...")
+print("Loading model (20–40s first time)...")
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_name,
     quantization_config=quant_config,
     device_map="auto",
     trust_remote_code=True,
-    attn_implementation="flash_attention_2",  # Fastest on T4/A10G
+    # Removed flash_attention_2 (avoids install issues)
 )
 
 model = PeftModel.from_pretrained(base_model, lora_model_name)
@@ -80,7 +80,7 @@ model.eval()
 print("Model ready!")
 
 def chat(message, history):
-    # Build full conversation history in Phi-3 format
+    # Full conversation history
     messages = []
     for user, assistant in history:
         messages.append({"role": "user", "content": user})
@@ -88,7 +88,6 @@ def chat(message, history):
         messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})
 
-    # Tokenize with chat template
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
@@ -96,7 +95,7 @@ def chat(message, history):
         return_tensors="pt"
     ).to(model.device)
 
-    # Generate with optimal settings
+    # Optimized generation
     outputs = model.generate(
         inputs,
         max_new_tokens=256,
@@ -104,20 +103,19 @@ def chat(message, history):
         do_sample=True,
         top_p=0.9,
         repetition_penalty=1.1,
-        use_cache=True,  # KV caching = much faster
+        use_cache=True,  # KV cache = faster sequential tokens
         eos_token_id=tokenizer.eos_token_id,
     )
 
-    # Decode only the new response
     response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
 
     history.append((message, response))
     return history, ""
 
-# Gradio interface
+# UI
 with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# SQL Chat Assistant")
-    gr.Markdown("Fine-tuned Phi-3 Mini for SQL queries. Responses in 26 seconds on GPU.")
+    gr.Markdown("Fine-tuned Phi-3 Mini for SQL. Fast responses (38s on GPU).")
 
     chatbot = gr.Chatbot(height=500)
     msg = gr.Textbox(label="Your Question", placeholder="e.g., delete duplicate rows from users table based on email", lines=2)
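The only change that affects behavior is removing attn_implementation="flash_attention_2" from from_pretrained: the Space now starts even where the flash-attn package is not installed, at the cost of slower attention on GPUs that do support it. A possible middle ground, shown here only as a hedged sketch (the model_kwargs dict and the import check are illustrative and not part of this commit), is to request flash attention only when flash_attn is importable:

import importlib.util

from transformers import AutoModelForCausalLM

# Reuses quant_config and base_model_name defined earlier in app.py.
model_kwargs = dict(
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)
if importlib.util.find_spec("flash_attn") is not None:
    # Illustrative check only: flash_attention_2 also needs an Ampere-or-newer
    # GPU, so an installed package is a heuristic, not a guarantee.
    model_kwargs["attn_implementation"] = "flash_attention_2"

base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_kwargs)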
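The hunks end at the msg textbox, so the event wiring and launch call live in the untouched part of app.py and are not shown here. For reference, a hypothetical wiring that matches the chat(message, history) -> (history, "") signature used above could look like this (component names follow the diff; the submit hookup itself is an assumption, not the author's code):

with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# SQL Chat Assistant")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(label="Your Question", lines=2)

    # chat returns (updated_history, ""), so outputs=[chatbot, msg] both
    # refreshes the chat window and clears the textbox after each question.
    msg.submit(chat, inputs=[msg, chatbot], outputs=[chatbot, msg])

demo.launch()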