saadkhi committed
Commit a2f39c6 · verified
1 Parent(s): 32343cc

Update app.py

Files changed (1)
  1. app.py +59 -33
app.py CHANGED
@@ -1,93 +1,119 @@
-# app.py - ZeroGPU safe version (no .to("cuda") outside decorated fn + no caching)
-
+# app.py
 import torch
 import gradio as gr
-import spaces  # Correct import
+import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 
 # ────────────────────────────────────────────────────────────────
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
-LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
+LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 
 MAX_NEW_TOKENS = 180
-TEMPERATURE = 0.0
-DO_SAMPLE = False
-
-print("Loading quantized base model on CPU (GPU only in @spaces.GPU)...")
+TEMPERATURE = 0.0
+DO_SAMPLE = False
+
+print("Loading quantized base model on CPU...")
+print("(GPU will be used only during inference if available)")
+
+# 4-bit quantization config
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
 )
 
+# Load base model → always on CPU first
 model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     quantization_config=bnb_config,
-    device_map="cpu",  # ← Force CPU at load time (required for ZeroGPU)
-    trust_remote_code=True
+    device_map="cpu",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
 )
 
-print("Loading LoRA...")
+print("Loading LoRA adapters...")
 model = PeftModel.from_pretrained(model, LORA_PATH)
-model = model.merge_and_unload()  # Merge for speed
 
+# Merge for faster inference (recommended)
+print("Merging LoRA into base model...")
+model = model.merge_and_unload()
+
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-model.eval()
+tokenizer.pad_token = tokenizer.eos_token
 
+model.eval()
 # ────────────────────────────────────────────────────────────────
-@spaces.GPU(duration=60)  # 60s max is safe & gives good queue priority
+
+@spaces.GPU(duration=60, max_requests=20)  # safe values for ZeroGPU
 def generate_sql(prompt: str):
-    messages = [{"role": "user", "content": prompt}]
-
-    # Tokenize on CPU first
+    # Prepare chat format
+    messages = [
+        {"role": "user", "content": prompt}
+    ]
+
+    # Tokenize on CPU (safe everywhere)
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt"
     )
-
-    # Move to CUDA ONLY inside here (GPU is now allocated)
-    inputs = inputs.to("cuda")
-
+
+    # Choose device dynamically - this is the ZeroGPU-safe way
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"→ Running inference on device: {device}")
+
+    inputs = inputs.to(device)
+
     with torch.inference_mode():
         outputs = model.generate(
             input_ids=inputs,
             max_new_tokens=MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             do_sample=DO_SAMPLE,
-            use_cache=True,
             pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
         )
 
+    # Decode and clean output
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    # Clean output
+
+    # Remove user's prompt + assistant tag if present
     if "<|assistant|>" in response:
         response = response.split("<|assistant|>", 1)[-1].strip()
+
+    # Cut at end token if it exists
     if "<|end|>" in response:
-        response = response.split("<|end|>")[0].strip()
-
-    return response
+        response = response.split("<|end|>", 1)[0].strip()
+
+    return response.strip()
 
 # ────────────────────────────────────────────────────────────────
 demo = gr.Interface(
     fn=generate_sql,
     inputs=gr.Textbox(
-        label="Ask SQL question",
+        label="Ask a question about SQL",
         placeholder="Delete duplicate rows from users table based on email",
-        lines=3
+        lines=3,
     ),
-    outputs=gr.Textbox(label="Generated SQL"),
-    title="SQL Chatbot (ZeroGPU Safe)",
-    description="Phi-3-mini 4bit + LoRA - GPU allocated only during generation",
+    outputs=gr.Textbox(label="Generated SQL Query"),
+    title="SQL Chatbot – Phi-3-mini + LoRA",
+    description=(
+        "Fine-tuned Phi-3-mini-4k-instruct (4bit) for generating SQL queries\n\n"
+        "Works on ZeroGPU and regular GPU hardware"
+    ),
     examples=[
         ["Find duplicate emails in users table"],
         ["Top 5 highest paid employees"],
-        ["Count orders per customer last month"]
+        ["Count orders per customer last month"],
+        ["Show all products that haven't been ordered in the last 6 months"],
+        ["Update all orders from 2024 to status 'completed'"],
     ],
-    cache_examples=False  # ← CRITICAL: Disable caching to avoid startup .to("cuda") call
+    cache_examples=False,
 )
 
 if __name__ == "__main__":
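
The pattern this commit lands on is the standard ZeroGPU one: do all heavy loading on CPU at import time, and touch CUDA only inside the @spaces.GPU-decorated function, where a GPU has actually been attached. A minimal self-contained sketch of that pattern, with a hypothetical double function standing in for generate_sql:

import torch
import spaces

# Created on CPU at import time - no CUDA calls are allowed out here.
weights = torch.ones(4)

@spaces.GPU(duration=60)  # a GPU is attached only while this function runs
def double(xs):
    # Inside the decorated function, torch.cuda.is_available() is True on
    # ZeroGPU; on CPU-only hardware this falls back cleanly.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    t = torch.tensor(xs, device=device)
    return (t * 2).cpu().tolist()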
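The cleanup at the end of generate_sql tracks Phi-3's chat markup, in which turns are delimited by <|user|>, <|assistant|>, and <|end|> tags. The same two splits, run on a hard-coded sample string invented for illustration rather than real model output:

# Standalone sketch of the output cleanup on a made-up decoded string.
sample = (
    "<|user|> Find duplicate emails in users table <|end|> "
    "<|assistant|> SELECT email FROM users GROUP BY email "
    "HAVING COUNT(*) > 1; <|end|>"
)
response = sample
if "<|assistant|>" in response:
    # Keep only what follows the assistant tag (drops the echoed prompt).
    response = response.split("<|assistant|>", 1)[-1].strip()
if "<|end|>" in response:
    # Truncate at the first end-of-turn marker.
    response = response.split("<|end|>", 1)[0].strip()
print(response)  # SELECT email FROM users GROUP BY email HAVING COUNT(*) > 1;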
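Since the app is a plain gr.Interface, the deployed Space can also be called programmatically with gradio_client; the Space id below is a placeholder, not something stated in this commit:

from gradio_client import Client

client = Client("saadkhi/SQL-Chatbot")  # hypothetical Space id - substitute the real one
sql = client.predict(
    "Find duplicate emails in users table",  # value for the prompt Textbox
    api_name="/predict",                     # default endpoint name for gr.Interface
)
print(sql)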