saadkhi committed on
Commit
ed1eebe
·
verified ·
1 Parent(s): 525c420

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -116
app.py CHANGED
@@ -1,128 +1,40 @@
1
- # import torch
2
- # import gradio as gr
3
- # from transformers import AutoTokenizer, AutoModelForCausalLM
4
- # from peft import PeftModel
5
- # from transformers import BitsAndBytesConfig
6
-
7
- # device = "cuda" if torch.cuda.is_available() else "cpu"
8
-
9
- # base_model = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
10
- # finetuned_model = "saadkhi/SQL_Chat_finetuned_model"
11
-
12
- # tokenizer = AutoTokenizer.from_pretrained(base_model)
13
-
14
- # bnb = BitsAndBytesConfig(load_in_4bit=True)
15
-
16
- # model = AutoModelForCausalLM.from_pretrained(
17
- # base_model,
18
- # quantization_config=bnb,
19
- # torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
20
- # device_map="auto"
21
- # )
22
-
23
- # model = PeftModel.from_pretrained(model, finetuned_model).to(device)
24
- # model.eval()
25
-
26
- # def chat(prompt):
27
- # inputs = tokenizer(prompt, return_tensors="pt").to(device)
28
-
29
- # with torch.inference_mode():
30
- # output = model.generate(
31
- # **inputs,
32
- # max_new_tokens=60,
33
- # temperature=0.1,
34
- # do_sample=False
35
- # )
36
-
37
- # return tokenizer.decode(output[0], skip_special_tokens=True)
38
-
39
- # iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="SQL Chatbot")
40
- # iface.launch()
41
-
42
-
43
-
44
-
45
-
46
-
47
-
48
-
49
-
50
  import gradio as gr
51
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
52
  from peft import PeftModel
53
- import torch
54
 
55
- # Best 4-bit config for speed + low memory
56
- quant_config = BitsAndBytesConfig(
57
- load_in_4bit=True,
58
- bnb_4bit_quant_type="nf4",
59
- bnb_4bit_compute_dtype=torch.bfloat16,
60
- bnb_4bit_use_double_quant=True,
61
- )
62
-
63
- # Load base + your LoRA once
64
- base_model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
65
- lora_model_name = "saadkhi/SQL_Chat_finetuned_model"
66
 
67
- print("Loading model (20–40s first time)...")
68
- base_model = AutoModelForCausalLM.from_pretrained(
69
- base_model_name,
70
- quantization_config=quant_config,
71
- device_map="auto",
72
- trust_remote_code=True,
73
- # Removed flash_attention_2 — avoids install issues
74
- )
75
 
76
- model = PeftModel.from_pretrained(base_model, lora_model_name)
77
- tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
78
 
79
- model.eval()
80
- print("Model ready!")
81
 
82
- def chat(message, history):
83
- # Full conversation history
84
- messages = []
85
- for user, assistant in history:
86
- messages.append({"role": "user", "content": user})
87
- if assistant:
88
- messages.append({"role": "assistant", "content": assistant})
89
- messages.append({"role": "user", "content": message})
90
-
91
- inputs = tokenizer.apply_chat_template(
92
- messages,
93
- tokenize=True,
94
- add_generation_prompt=True,
95
- return_tensors="pt"
96
- ).to(model.device)
97
 
98
- # Optimized generation
99
- outputs = model.generate(
100
- inputs,
101
- max_new_tokens=256,
102
- temperature=0.7,
103
- do_sample=True,
104
- top_p=0.9,
105
- repetition_penalty=1.1,
106
- use_cache=True, # KV cache = faster sequential tokens
107
- eos_token_id=tokenizer.eos_token_id,
108
- )
109
 
110
- response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
111
-
112
- history.append((message, response))
113
- return history, ""
114
 
115
- # UI
116
- with gr.Blocks(title="SQL Chatbot", theme=gr.themes.Soft()) as demo:
117
- gr.Markdown("# SQL Chat Assistant")
118
- gr.Markdown("Fine-tuned Phi-3 Mini for SQL. Fast responses (3–8s on GPU).")
119
-
120
- chatbot = gr.Chatbot(height=500)
121
- msg = gr.Textbox(label="Your Question", placeholder="e.g., delete duplicate rows from users table based on email", lines=2)
122
- clear = gr.Button("Clear")
123
 
124
- msg.submit(chat, [msg, chatbot], [chatbot, msg])
125
- clear.click(lambda: ([], ""), None, chatbot)
126
 
127
- demo.queue(max_size=30)
128
- demo.launch()
 
1
"""SQL chat assistant: Phi-3 Mini (4-bit) + LoRA adapter served through Gradio."""

import torch
import gradio as gr
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Device used for the tokenized inputs; the model's own placement is decided
# by `device_map="auto"` at load time, not by this string.
device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
 
8
 
9
# Model identifiers: 4-bit quantized Phi-3 Mini base + the fine-tuned SQL LoRA.
base_model = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
finetuned_model = "saadkhi/SQL_Chat_finetuned_model"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the base model in 4-bit to keep GPU/CPU memory usage low.
bnb = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    device_map="auto",
)

# Attach the LoRA adapter weights on top of the quantized base model.
# NOTE: no `.to(device)` here — the model was loaded with `device_map="auto"`,
# and moving a bitsandbytes 4-bit model with `.to()` is unsupported (it can
# raise a ValueError in transformers' quantization integration).
model = PeftModel.from_pretrained(model, finetuned_model)
model.eval()
 
 
 
 
 
 
 
 
 
25
 
26
def chat(prompt):
    """Generate a reply for *prompt* with the fine-tuned SQL model.

    Returns the full decoded sequence (the prompt echo included), which is
    what the text-in/text-out Gradio interface below displays.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=60,
            # Greedy decoding: a `temperature` kwarg is ignored (and warned
            # about) when do_sample=False, so it is deliberately omitted.
            do_sample=False,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)
 
38
 
39
# Minimal text-in/text-out front end wired to chat(); launch() serves the app.
iface = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="SQL Chatbot",
)
iface.launch()