Spaces:
Sleeping
Sleeping
Refactor model loading and input handling in the chatbot application. Update model and tokenizer initialization, improve device management for inputs, and remove unused sliders from the Gradio interface.
Browse files
app.py
CHANGED
|
@@ -1,33 +1,33 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 4 |
-
from peft import PeftModel
|
| 5 |
|
| 6 |
|
| 7 |
-
# Load
|
| 8 |
print("Đang tải model...")
|
| 9 |
-
|
| 10 |
-
|
| 11 |
|
| 12 |
-
#
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
| 16 |
device_map="auto" if torch.cuda.is_available() else None,
|
|
|
|
| 17 |
)
|
| 18 |
|
| 19 |
-
# Load PEFT adapter
|
| 20 |
-
model = PeftModel.from_pretrained(base_model, adapter_repo)
|
| 21 |
-
|
| 22 |
-
# Load tokenizer
|
| 23 |
-
tokenizer = AutoTokenizer.from_pretrained(adapter_repo)
|
| 24 |
-
|
| 25 |
# Set padding token nếu chưa có
|
| 26 |
if tokenizer.pad_token is None:
|
| 27 |
tokenizer.pad_token = tokenizer.eos_token
|
| 28 |
|
| 29 |
model.eval()
|
| 30 |
-
print("Model đã sẵn sàng!")
|
| 31 |
|
| 32 |
|
| 33 |
def respond(
|
|
@@ -55,8 +55,16 @@ def respond(
|
|
| 55 |
|
| 56 |
# Tokenize
|
| 57 |
inputs = tokenizer(prompt, return_tensors="pt")
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# Generate với streaming token-by-token
|
| 62 |
input_length = inputs["input_ids"].shape[1]
|
|
@@ -115,28 +123,7 @@ chatbot = gr.ChatInterface(
|
|
| 115 |
value="You are a helpful coding assistant. Provide clear, concise, and accurate code solutions and explanations.",
|
| 116 |
label="System message",
|
| 117 |
lines=3,
|
| 118 |
-
)
|
| 119 |
-
gr.Slider(
|
| 120 |
-
minimum=1,
|
| 121 |
-
maximum=2048,
|
| 122 |
-
value=512,
|
| 123 |
-
step=1,
|
| 124 |
-
label="Max new tokens",
|
| 125 |
-
),
|
| 126 |
-
gr.Slider(
|
| 127 |
-
minimum=0.1,
|
| 128 |
-
maximum=2.0,
|
| 129 |
-
value=0.7,
|
| 130 |
-
step=0.1,
|
| 131 |
-
label="Temperature",
|
| 132 |
-
),
|
| 133 |
-
gr.Slider(
|
| 134 |
-
minimum=0.1,
|
| 135 |
-
maximum=1.0,
|
| 136 |
-
value=0.95,
|
| 137 |
-
step=0.05,
|
| 138 |
-
label="Top-p (nucleus sampling)",
|
| 139 |
-
),
|
| 140 |
],
|
| 141 |
)
|
| 142 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
+
# Load tokenizer và model
|
| 7 |
print("Đang tải model...")
|
| 8 |
+
model_name = "cochi1706/decoder"
|
| 9 |
+
subfolder = "qwen3-finetuned"
|
| 10 |
|
| 11 |
+
# Xác định device
|
| 12 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 13 |
+
|
| 14 |
+
# Load tokenizer
|
| 15 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder=subfolder)
|
| 16 |
+
|
| 17 |
+
# Load model
|
| 18 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 19 |
+
model_name,
|
| 20 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
| 21 |
device_map="auto" if torch.cuda.is_available() else None,
|
| 22 |
+
subfolder=subfolder,
|
| 23 |
)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# Set padding token nếu chưa có
|
| 26 |
if tokenizer.pad_token is None:
|
| 27 |
tokenizer.pad_token = tokenizer.eos_token
|
| 28 |
|
| 29 |
model.eval()
|
| 30 |
+
print(f"Model đã sẵn sàng! Device: {device}")
|
| 31 |
|
| 32 |
|
| 33 |
def respond(
|
|
|
|
| 55 |
|
| 56 |
# Tokenize
|
| 57 |
inputs = tokenizer(prompt, return_tensors="pt")
|
| 58 |
+
|
| 59 |
+
# Di chuyển inputs đến device của model
|
| 60 |
+
# Nếu model đã có device_map, lấy device từ model parameters
|
| 61 |
+
if hasattr(model, 'hf_device_map') and model.hf_device_map:
|
| 62 |
+
# Model đã được phân bổ trên nhiều device, sử dụng device của layer đầu tiên
|
| 63 |
+
first_param_device = next(model.parameters()).device
|
| 64 |
+
inputs = {k: v.to(first_param_device) for k, v in inputs.items()}
|
| 65 |
+
else:
|
| 66 |
+
# Model trên một device duy nhất
|
| 67 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 68 |
|
| 69 |
# Generate với streaming token-by-token
|
| 70 |
input_length = inputs["input_ids"].shape[1]
|
|
|
|
| 123 |
value="You are a helpful coding assistant. Provide clear, concise, and accurate code solutions and explanations.",
|
| 124 |
label="System message",
|
| 125 |
lines=3,
|
| 126 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
],
|
| 128 |
)
|
| 129 |
|