Spaces:

cochi1706
/

CodingAssistant

Sleeping

App Files Community

cochi1706 committed on Nov 15, 2025

Commit

f1fc130

1 Parent(s): 1e36468

Refactor model loading and input handling in chatbot application. Updated model and tokenizer initialization, improved device management for inputs, and removed unused sliders from the Gradio interface.

Browse files

Files changed (1) hide show

app.py +25 -38

app.py CHANGED Viewed

@@ -1,33 +1,33 @@
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-# Load model và tokenizer
 print("Đang tải model...")
-base_model_name = "Qwen/Qwen3-0.6B"
-adapter_repo = "cochi1706/coding-assistant"
-# Load base model
-base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_name,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
     device_map="auto" if torch.cuda.is_available() else None,
 )
-# Load PEFT adapter
-model = PeftModel.from_pretrained(base_model, adapter_repo)
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(adapter_repo)
 # Set padding token nếu chưa có
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 model.eval()
-print("Model đã sẵn sàng!")
 def respond(
@@ -55,8 +55,16 @@ def respond(
     # Tokenize
     inputs = tokenizer(prompt, return_tensors="pt")
-    if torch.cuda.is_available():
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
     # Generate với streaming token-by-token
     input_length = inputs["input_ids"].shape[1]
@@ -115,28 +123,7 @@ chatbot = gr.ChatInterface(
             value="You are a helpful coding assistant. Provide clear, concise, and accurate code solutions and explanations.",
             label="System message",
             lines=3,
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=2048,
-            value=512,
-            step=1,
-            label="Max new tokens",
-        ),
-        gr.Slider(
-            minimum=0.1,
-            maximum=2.0,
-            value=0.7,
-            step=0.1,
-            label="Temperature",
-        ),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
     ],
 )

 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+# Load tokenizer và model
 print("Đang tải model...")
+model_name = "cochi1706/decoder"
+subfolder = "qwen3-finetuned"
+# Xác định device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder=subfolder)
+# Load model
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
     device_map="auto" if torch.cuda.is_available() else None,
+    subfolder=subfolder,
 )
 # Set padding token nếu chưa có
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 model.eval()
+print(f"Model đã sẵn sàng! Device: {device}")
 def respond(
     # Tokenize
     inputs = tokenizer(prompt, return_tensors="pt")
+    # Di chuyển inputs đến device của model
+    # Nếu model đã có device_map, lấy device từ model parameters
+    if hasattr(model, 'hf_device_map') and model.hf_device_map:
+        # Model đã được phân bổ trên nhiều device, sử dụng device của layer đầu tiên
+        first_param_device = next(model.parameters()).device
+        inputs = {k: v.to(first_param_device) for k, v in inputs.items()}
+    else:
+        # Model trên một device duy nhất
+        inputs = {k: v.to(device) for k, v in inputs.items()}
     # Generate với streaming token-by-token
     input_length = inputs["input_ids"].shape[1]
             value="You are a helpful coding assistant. Provide clear, concise, and accurate code solutions and explanations.",
             label="System message",
             lines=3,
+        )
     ],
 )