Spaces:

cochi1706
/

CodingAssistant

Sleeping

App Files Files Community

cochi1706 committed on Nov 15, 2025

Commit

76de232

1 Parent(s): 376a746

Streamline model loading and response generation in the chatbot application by using a text-generation pipeline. Remove legacy loading methods and improve response handling for better performance and clarity.

Browse files

Files changed (1) hide show

app.py +33 -126

app.py CHANGED Viewed

@@ -1,87 +1,30 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
 # Load tokenizer và model
 print("Đang tải model...")
-import os
-base_model_name = "Qwen/Qwen3-0.6B"
-adapter_path_local = "./qwen3-finetuned"
-model_loaded = False
-# Ưu tiên 1: Thử load từ local path (nếu có)
-if os.path.exists(adapter_path_local) and os.path.exists(os.path.join(adapter_path_local, "adapter_config.json")):
-    try:
-        print(f"Đang load từ local path: {adapter_path_local}")
-        base_model = AutoModelForCausalLM.from_pretrained(
-            base_model_name,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-        )
-        model = PeftModel.from_pretrained(base_model, adapter_path_local)
-        tokenizer = AutoTokenizer.from_pretrained(adapter_path_local, local_files_only=True)
-        model_loaded = True
-        print("✓ Đã load model từ local path")
-    except Exception as e:
-        print(f"✗ Không thể load từ local: {e}")
-# Ưu tiên 2: Thử load từ HuggingFace như full model
-if not model_loaded:
-    try:
-        model_name = "cochi1706/decoder/qwen3-finetuned"
-        print(f"Đang thử load full model từ: {model_name}")
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-        )
-        model_loaded = True
-        print("✓ Đã load full model từ HuggingFace")
-    except Exception as e:
-        print(f"✗ Không thể load full model: {e}")
-# Ưu tiên 3: Load như PEFT adapter từ HuggingFace
-if not model_loaded:
-    try:
-        print("Đang load base model và PEFT adapter từ HuggingFace...")
-        base_model = AutoModelForCausalLM.from_pretrained(
-            base_model_name,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-        )
-        # Thử các adapter paths khác nhau
-        adapter_paths = [
-            "cochi1706/coding-assistant",
-            "cochi1706/decoder/qwen3-finetuned",
-        ]
-        for adapter_path in adapter_paths:
-            try:
-                print(f"  Thử adapter path: {adapter_path}")
-                model = PeftModel.from_pretrained(base_model, adapter_path)
-                tokenizer = AutoTokenizer.from_pretrained(adapter_path)
-                model_loaded = True
-                print(f"✓ Đã load PEFT adapter từ: {adapter_path}")
-                break
-            except Exception as e:
-                print(f"  ✗ Không thể load từ {adapter_path}: {e}")
-                continue
-    except Exception as e:
-        print(f"✗ Không thể load base model: {e}")
-if not model_loaded:
-    raise RuntimeError("Không thể load model từ bất kỳ nguồn nào. Vui lòng kiểm tra lại model path.")
-# Xác định device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Set padding token nếu chưa có
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 model.eval()
 print(f"Model đã sẵn sàng! Device: {device}")
@@ -95,7 +38,7 @@ def respond(
     top_p,
 ):
     """
-    Tạo phản hồi từ model coding assistant
     """
     # Chuẩn bị prompt với chat template
     messages = [{"role": "system", "content": system_message}]
@@ -109,61 +52,25 @@ def respond(
         add_generation_prompt=True
     )
-    # Tokenize
-    inputs = tokenizer(prompt, return_tensors="pt")
-    # Di chuyển inputs đến device của model
-    # Nếu model đã có device_map, lấy device từ model parameters
-    if hasattr(model, 'hf_device_map') and model.hf_device_map:
-        # Model đã được phân bổ trên nhiều device, sử dụng device của layer đầu tiên
-        first_param_device = next(model.parameters()).device
-        inputs = {k: v.to(first_param_device) for k, v in inputs.items()}
-    else:
-        # Model trên một device duy nhất
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-    # Generate với streaming token-by-token
-    input_length = inputs["input_ids"].shape[1]
-    response = ""
-    with torch.no_grad():
-        # Khởi tạo với input_ids
-        generated_ids = inputs["input_ids"].clone()
-        for _ in range(max_tokens):
-            # Forward pass
-            outputs = model(generated_ids)
-            logits = outputs.logits[:, -1, :]
-            # Apply temperature và top_p
-            if temperature != 1.0:
-                logits = logits / temperature
-            # Top-p sampling
-            if top_p < 1.0:
-                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
-                sorted_indices_to_remove = cumulative_probs > top_p
-                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-                sorted_indices_to_remove[..., 0] = 0
-                indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
-                logits[indices_to_remove] = float('-inf')
-            # Sample next token
-            probs = torch.softmax(logits, dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1)
-            # Kiểm tra EOS token
-            if next_token.item() == tokenizer.eos_token_id:
-                break
-            # Thêm token vào generated_ids
-            generated_ids = torch.cat([generated_ids, next_token], dim=1)
-            # Decode token mới và stream
-            new_text = tokenizer.decode([next_token.item()], skip_special_tokens=True)
-            response += new_text
-            yield response
 """
@@ -180,7 +87,7 @@ chatbot = gr.ChatInterface(
             label="System message",
             lines=3,
         )
-    ],
 )
 demo = chatbot

 import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 # Load tokenizer và model
 print("Đang tải model...")
+model_name = "cochi1706/codingassistant"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+# Xác định device cho pipeline (0 cho cuda, -1 cho cpu)
+device = 0 if torch.cuda.is_available() else -1
 # Set padding token nếu chưa có
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+# Tạo pipeline để sinh text
+text_generator = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device=device,
+    do_sample=True,
+)
 model.eval()
 print(f"Model đã sẵn sàng! Device: {device}")
     top_p,
 ):
     """
+    Tạo phản hồi từ model coding assistant sử dụng pipeline
     """
     # Chuẩn bị prompt với chat template
     messages = [{"role": "system", "content": system_message}]
         add_generation_prompt=True
     )
+    # Sử dụng pipeline để generate text
+    generated = text_generator(
+        prompt,
+        max_length=len(tokenizer.encode(prompt)) + max_tokens,
+        max_new_tokens=max_tokens,
+        num_return_sequences=1,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+    )
+    # Lấy câu trả lời từ kết quả
+    câu_trả_lời = generated[0]['generated_text']
+    # Loại bỏ prompt ban đầu để chỉ lấy phần response
+    if prompt in câu_trả_lời:
+        câu_trả_lời = câu_trả_lời.replace(prompt, "").strip()
+    return câu_trả_lời
 """
             label="System message",
             lines=3,
         )
+    ]
 )
 demo = chatbot