saadkhi committed on
Commit
87ff5b4
·
verified ·
1 Parent(s): 52ae0ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -69
app.py CHANGED
@@ -1,71 +1,30 @@
1
import gradio as gr
import torch
from unsloth import FastLanguageModel

# ── Global model (loaded once at startup) ───────────────────────────────
print("Loading model...")

# Pre-quantized 4-bit base checkpoint; dtype=None lets Unsloth auto-select
# bf16 or fp16 for the hardware.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",  # very fast pre-quantized base
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Load the fine-tuned LoRA adapter on top of the base model.
# NOTE(review): in recent PEFT/transformers, load_adapter() returns None,
# so passing its result into for_inference() looks suspect — confirm this
# API chain actually works on the pinned library versions.
model = FastLanguageModel.for_inference(
    model.load_adapter("saadkhi/SQL_Chat_finetuned_model")
)

print("Model loaded successfully!")
21
-
22
- # ── Chat function ───────────────────────────────────────────────────────
23
def generate_response(message, history):
    """Produce one assistant reply for *message*, given the gradio chat *history*.

    Args:
        message: The latest user utterance.
        history: List of (user_text, assistant_text) pairs from prior turns.

    Returns:
        The assistant's reply text with the prompt portion stripped.
    """
    # Reconstruct the full multi-turn conversation as a messages list.
    conversation = []
    for user_turn, assistant_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    # Apply the model's chat template — required for Phi-3's prompt format.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    # Greedy decoding: fastest and fully deterministic.
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=180,
        temperature=0.0,
        do_sample=False,
        use_cache=True,
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decoded text still contains the prompt; keep only the text after
    # the final assistant marker.
    if "<|assistant|>" in decoded:
        decoded = decoded.split("<|assistant|>")[-1].strip()

    return decoded
55
-
56
-
57
# ── Gradio UI ───────────────────────────────────────────────────────────
demo = gr.ChatInterface(
    fn=generate_response,
    title="SQL Chat Assistant (Fast Version)",
    description="Ask SQL related questions • Powered by Phi-3-mini + your fine-tune",
    examples=[
        "Write a query to find duplicate emails in users table",
        "How to delete rows with NULL values in column price?",
        "Select top 10 most expensive products",
    ],
    cache_examples=False,  # examples are cheap prompts; no need to pre-run them
)

if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import gradio as gr
from peft import PeftModel
# Merged the two separate `from transformers import ...` lines into one.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# ── Global model setup (runs once at startup) ───────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"

base_model = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
finetuned_model = "saadkhi/SQL_Chat_finetuned_model"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# 4-bit quantization keeps the model small enough for one GPU / CPU RAM.
bnb = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    device_map="auto",  # transformers/accelerate places the weights itself
)

# Attach the fine-tuned LoRA adapter.
# Fix: removed the trailing `.to(device)` — a bitsandbytes-quantized model
# loaded with device_map="auto" is already placed, and transformers raises
# an error when a 4-bit model is moved with `.to()`.
model = PeftModel.from_pretrained(model, finetuned_model)
model.eval()
19
def chat(prompt):
    """Generate a reply for *prompt* and return only the newly generated text.

    Args:
        prompt: Raw user text (no chat template is applied here).

    Returns:
        The model's continuation, decoded without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=60,
            # Fix: dropped temperature=0.1 — it is ignored (and warned about)
            # when do_sample=False; greedy decoding is deterministic anyway.
            do_sample=False,
        )
    # Fix: decode only the generated continuation. Decoding output[0] from
    # position 0 would echo the user's prompt back in the reply.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
29
# ── Gradio UI ───────────────────────────────────────────────────────────
iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="SQL Chatbot")

# Guard the launch so importing this module does not start the server;
# matches the previous version's `if __name__ == "__main__"` convention.
# Running the script directly behaves exactly as before.
if __name__ == "__main__":
    iface.launch()