saadkhi committed on
Commit
87ff5b4
·
verified ·
1 Parent(s): 52ae0ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -69
app.py CHANGED
@@ -1,71 +1,30 @@
1
import gradio as gr
import torch
from unsloth import FastLanguageModel

# ── Global model (loaded once at startup) ───────────────────────────────
print("Loading model...")

# Pre-quantized 4-bit base checkpoint; dtype=None lets Unsloth auto-select
# bf16 or fp16 for the hardware.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",  # very fast pre-quantized base
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Load the fine-tuned LoRA adapter on top of the base model.
# NOTE(review): in recent PEFT/transformers, load_adapter() returns None,
# so passing its result into for_inference() looks suspect — confirm this
# API chain actually works on the pinned library versions.
model = FastLanguageModel.for_inference(
    model.load_adapter("saadkhi/SQL_Chat_finetuned_model")
)

print("Model loaded successfully!")
21
-
22
- # ── Chat function ───────────────────────────────────────────────────────
23
def generate_response(message, history):
    """Produce one assistant reply for *message*, given the gradio chat *history*.

    Args:
        message: The latest user utterance.
        history: List of (user_text, assistant_text) pairs from prior turns.

    Returns:
        The assistant's reply text with the prompt portion stripped.
    """
    # Reconstruct the full multi-turn conversation as a messages list.
    conversation = []
    for user_turn, assistant_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    # Apply the model's chat template — required for Phi-3's prompt format.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    # Greedy decoding: fastest and fully deterministic.
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=180,
        temperature=0.0,
        do_sample=False,
        use_cache=True,
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decoded text still contains the prompt; keep only the text after
    # the final assistant marker.
    if "<|assistant|>" in decoded:
        decoded = decoded.split("<|assistant|>")[-1].strip()

    return decoded
55
-
56
-
57
# ── Gradio UI ───────────────────────────────────────────────────────────
demo = gr.ChatInterface(
    fn=generate_response,
    title="SQL Chat Assistant (Fast Version)",
    description="Ask SQL related questions • Powered by Phi-3-mini + your fine-tune",
    examples=[
        "Write a query to find duplicate emails in users table",
        "How to delete rows with NULL values in column price?",
        "Select top 10 most expensive products",
    ],
    cache_examples=False,  # examples are cheap prompts; no need to pre-run them
)

if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import gradio as gr
from peft import PeftModel
# Merged the two separate `from transformers import ...` lines into one.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# ── Global model setup (runs once at startup) ───────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"

base_model = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
finetuned_model = "saadkhi/SQL_Chat_finetuned_model"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# 4-bit quantization keeps the model small enough for one GPU / CPU RAM.
bnb = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    device_map="auto",  # transformers/accelerate places the weights itself
)

# Attach the fine-tuned LoRA adapter.
# Fix: removed the trailing `.to(device)` — a bitsandbytes-quantized model
# loaded with device_map="auto" is already placed, and transformers raises
# an error when a 4-bit model is moved with `.to()`.
model = PeftModel.from_pretrained(model, finetuned_model)
model.eval()
19
def chat(prompt):
    """Generate a reply for *prompt* and return only the newly generated text.

    Args:
        prompt: Raw user text (no chat template is applied here).

    Returns:
        The model's continuation, decoded without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=60,
            # Fix: dropped temperature=0.1 — it is ignored (and warned about)
            # when do_sample=False; greedy decoding is deterministic anyway.
            do_sample=False,
        )
    # Fix: decode only the generated continuation. Decoding output[0] from
    # position 0 would echo the user's prompt back in the reply.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
29
# ── Gradio UI ───────────────────────────────────────────────────────────
iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="SQL Chatbot")

# Guard the launch so importing this module does not start the server;
# matches the previous version's `if __name__ == "__main__"` convention.
# Running the script directly behaves exactly as before.
if __name__ == "__main__":
    iface.launch()