saadkhi committed on
Commit
32343cc
·
verified ·
1 Parent(s): 02976e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -12
app.py CHANGED
@@ -1,8 +1,8 @@
1
- # app.py - ZeroGPU compatible version (standard transformers + @spaces.GPU)
2
 
3
  import torch
4
  import gradio as gr
5
- import spaces # ← Correct import!
6
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
7
  from peft import PeftModel
8
 
@@ -14,7 +14,7 @@ MAX_NEW_TOKENS = 180
14
  TEMPERATURE = 0.0
15
  DO_SAMPLE = False
16
 
17
- print("Loading quantized base model (CPU first)...")
18
  bnb_config = BitsAndBytesConfig(
19
  load_in_4bit=True,
20
  bnb_4bit_quant_type="nf4",
@@ -24,29 +24,33 @@ bnb_config = BitsAndBytesConfig(
24
  model = AutoModelForCausalLM.from_pretrained(
25
  BASE_MODEL,
26
  quantization_config=bnb_config,
27
- device_map="auto",
28
  trust_remote_code=True
29
  )
30
 
31
  print("Loading LoRA...")
32
  model = PeftModel.from_pretrained(model, LORA_PATH)
33
- model = model.merge_and_unload() # Merge for faster inference
34
 
35
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
36
  model.eval()
37
 
38
  # ────────────────────────────────────────────────────────────────
39
- @spaces.GPU(duration=60) # ← Decorator! Requests GPU slice only here (60s max recommended)
40
  def generate_sql(prompt: str):
41
  messages = [{"role": "user", "content": prompt}]
42
 
 
43
  inputs = tokenizer.apply_chat_template(
44
  messages,
45
  tokenize=True,
46
  add_generation_prompt=True,
47
  return_tensors="pt"
48
- ).to("cuda") # ZeroGPU makes cuda available here
49
-
 
 
 
50
  with torch.inference_mode():
51
  outputs = model.generate(
52
  input_ids=inputs,
@@ -62,7 +66,8 @@ def generate_sql(prompt: str):
62
  # Clean output
63
  if "<|assistant|>" in response:
64
  response = response.split("<|assistant|>", 1)[-1].strip()
65
- response = response.split("<|end|>")[0].strip() if "<|end|>" in response else response
 
66
 
67
  return response
68
 
@@ -75,13 +80,14 @@ demo = gr.Interface(
75
  lines=3
76
  ),
77
  outputs=gr.Textbox(label="Generated SQL"),
78
- title="SQL Chatbot (ZeroGPU)",
79
- description="Phi-3-mini 4bit + LoRA - Free but limited daily GPU time",
80
  examples=[
81
  ["Find duplicate emails in users table"],
82
  ["Top 5 highest paid employees"],
83
  ["Count orders per customer last month"]
84
- ]
 
85
  )
86
 
87
  if __name__ == "__main__":
 
1
+ # app.py - ZeroGPU safe version (no .to("cuda") outside decorated fn + no caching)
2
 
3
  import torch
4
  import gradio as gr
5
+ import spaces # Correct import
6
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
7
  from peft import PeftModel
8
 
 
14
  TEMPERATURE = 0.0
15
  DO_SAMPLE = False
16
 
17
+ print("Loading quantized base model on CPU (GPU only in @spaces.GPU)...")
18
  bnb_config = BitsAndBytesConfig(
19
  load_in_4bit=True,
20
  bnb_4bit_quant_type="nf4",
 
24
  model = AutoModelForCausalLM.from_pretrained(
25
  BASE_MODEL,
26
  quantization_config=bnb_config,
27
+ device_map="cpu", # ← Force CPU at load time (required for ZeroGPU)
28
  trust_remote_code=True
29
  )
30
 
31
  print("Loading LoRA...")
32
  model = PeftModel.from_pretrained(model, LORA_PATH)
33
+ model = model.merge_and_unload() # Merge for speed
34
 
35
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
36
  model.eval()
37
 
38
  # ────────────────────────────────────────────────────────────────
39
+ @spaces.GPU(duration=60) # 60s max is safe & gives good queue priority
40
  def generate_sql(prompt: str):
41
  messages = [{"role": "user", "content": prompt}]
42
 
43
+ # Tokenize on CPU first
44
  inputs = tokenizer.apply_chat_template(
45
  messages,
46
  tokenize=True,
47
  add_generation_prompt=True,
48
  return_tensors="pt"
49
+ )
50
+
51
+ # Move to CUDA ONLY inside here (GPU is now allocated)
52
+ inputs = inputs.to("cuda")
53
+
54
  with torch.inference_mode():
55
  outputs = model.generate(
56
  input_ids=inputs,
 
66
  # Clean output
67
  if "<|assistant|>" in response:
68
  response = response.split("<|assistant|>", 1)[-1].strip()
69
+ if "<|end|>" in response:
70
+ response = response.split("<|end|>")[0].strip()
71
 
72
  return response
73
 
 
80
  lines=3
81
  ),
82
  outputs=gr.Textbox(label="Generated SQL"),
83
+ title="SQL Chatbot (ZeroGPU Safe)",
84
+ description="Phi-3-mini 4bit + LoRA - GPU allocated only during generation",
85
  examples=[
86
  ["Find duplicate emails in users table"],
87
  ["Top 5 highest paid employees"],
88
  ["Count orders per customer last month"]
89
+ ],
90
+ cache_examples=False # ← CRITICAL: Disable caching to avoid startup .to("cuda") call
91
  )
92
 
93
  if __name__ == "__main__":