Spaces:

saadkhi
/

SQL_chatbot_API

Sleeping

App Files Community

saadkhi committed on Dec 23, 2025

Commit

43c048b

verified ·

1 Parent(s): 5c843a5

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -28

app.py CHANGED Viewed

@@ -48,36 +48,47 @@
 import gradio as gr
-from unsloth import FastLanguageModel
 import torch
-# Load model once at startup — Unsloth makes it 2.5x faster
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
-    max_seq_length=4096,
-    dtype=None,  # Auto detect (bfloat16 if supported)
     load_in_4bit=True,
 )
-# Load your fine-tuned LoRA adapter
-model = FastLanguageModel.get_peft_model(
-    model,
-    "saadkhi/SQL_Chat_finetuned_model",  # Your HF repo
 )
-# Enable fast inference mode (critical for speed!)
-FastLanguageModel.for_inference(model)
 def chat(message, history):
-    # Build proper Phi-3 chat format
     messages = []
-    for user_msg, bot_msg in history:
-        messages.append({"role": "user", "content": user_msg})
-        if bot_msg:
-            messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": message})
-    # Apply chat template and tokenize
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
@@ -85,33 +96,35 @@ def chat(message, history):
         return_tensors="pt"
     ).to(model.device)
-    # Generate fast
-    output = model.generate(
-        input_ids=inputs,
         max_new_tokens=256,
         temperature=0.7,
         do_sample=True,
         top_p=0.9,
-        use_cache=True,
         repetition_penalty=1.1,
     )
-    # Decode only the new part
-    response = tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True)
     history.append((message, response))
     return history, ""
-# Clean Gradio Chat Interface
 with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# SQL Chat Assistant")
-    gr.Markdown("Ask any SQL-related question. Fast responses powered by fine-tuned Phi-3 Mini.")
     chatbot = gr.Chatbot(height=500)
-    msg = gr.Textbox(label="Your Message", placeholder="e.g., delete duplicate rows from users table", lines=2)
     clear = gr.Button("Clear")
     msg.submit(chat, [msg, chatbot], [chatbot, msg])
     clear.click(lambda: ([], ""), None, chatbot)
-demo.queue(max_size=20)  # Handle multiple users smoothly
 demo.launch()

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel
 import torch
+# 4-bit NF4 quantization settings (bfloat16 compute, double quantization) used for the base-model load below
+quant_config = BitsAndBytesConfig(
     load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
 )
+# Load the base model and attach the fine-tuned LoRA adapter once, at import time, so every chat() call reuses them
+base_model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
+lora_model_name = "saadkhi/SQL_Chat_finetuned_model"
+print("Loading model (20–40 seconds first time)...")
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    quantization_config=quant_config,
+    device_map="auto",
+    trust_remote_code=True,
+    attn_implementation="flash_attention_2",  # NOTE(review): FlashAttention-2 needs an Ampere-or-newer GPU (A10G is fine); it is not supported on T4
 )
+model = PeftModel.from_pretrained(base_model, lora_model_name)
+tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
+model.eval()
+print("Model ready!")
 def chat(message, history):  # history is iterated as (user, assistant) pairs and extended with a new pair
+    # Rebuild the conversation as role/content dicts; the chat template below applies the model's prompt format
     messages = []
+    for user, assistant in history:
+        messages.append({"role": "user", "content": user})
+        if assistant:
+            messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})
+    # Render the messages through the tokenizer's chat template into a tensor on the model's device
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         return_tensors="pt"
     ).to(model.device)
+    # Sample up to 256 new tokens with the decoding settings below
+    outputs = model.generate(
+        inputs,
         max_new_tokens=256,
         temperature=0.7,
         do_sample=True,
         top_p=0.9,
         repetition_penalty=1.1,
+        use_cache=True,  # KV caching avoids recomputing earlier tokens at each step
+        eos_token_id=tokenizer.eos_token_id,
     )
+    # Drop the prompt tokens so only the newly generated text is decoded
+    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
     history.append((message, response))
     return history, ""  # updated transcript for the Chatbot, plus "" to empty the textbox
+# Gradio interface: chat transcript, question box, and a Clear button
 with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# SQL Chat Assistant")
+    gr.Markdown("Fine-tuned Phi-3 Mini for SQL queries. Responses in 2–6 seconds on GPU.")
     chatbot = gr.Chatbot(height=500)
+    msg = gr.Textbox(label="Your Question", placeholder="e.g., delete duplicate rows from users table based on email", lines=2)
     clear = gr.Button("Clear")
+    # Submitting the textbox runs chat(); it returns the new transcript and "" to empty the box
     msg.submit(chat, [msg, chatbot], [chatbot, msg])
+    # The lambda returns two values, so both components must be listed as outputs.
+    # With only `chatbot` listed, Gradio hands the whole tuple to the Chatbot and
+    # the Clear button fails instead of resetting the conversation.
+    clear.click(lambda: ([], ""), None, [chatbot, msg])
+demo.queue(max_size=30)  # cap the number of requests waiting in the queue
 demo.launch()