Spaces:

Snow2222
/

SSS-Distillation

Runtime error

App Files Community

Snow2222 committed on Feb 10, 2025

Commit

bb7b0e9

verified ·

1 Parent(s): 17a809a

Update train.py

Browse files

Files changed (1) hide show

train.py +17 -13

train.py CHANGED Viewed

@@ -15,6 +15,9 @@ if hf_token:
 else:
     raise ValueError("Hugging Face token 未设置")
 # 定义教师模型与学生模型
 teacher_model_name = "Qwen/Qwen1.5-7B-Chat"  # 教师模型（较大模型）
 student_model_name = "distilgpt2"            # ✅ 建议用 distilgpt2
@@ -24,7 +27,7 @@ teacher = AutoModelForCausalLM.from_pretrained(
     teacher_model_name,
     trust_remote_code=True,
     token=hf_token
-)
 teacher.eval()  # 固定教师模型，不训练
 # 加载学生模型及 Tokenizer
@@ -32,7 +35,7 @@ student = AutoModelForCausalLM.from_pretrained(
     student_model_name,
     trust_remote_code=True,
     token=hf_token
-)
 tokenizer = AutoTokenizer.from_pretrained(
     student_model_name,
     trust_remote_code=True,
@@ -60,19 +63,19 @@ def preprocess_data(example):
     labels = tokenizer(example["output"], truncation=True, padding="max_length", max_length=128)
     return {
-        "input_ids": inputs["input_ids"],
-        "attention_mask": inputs["attention_mask"],
-        "labels": labels["input_ids"]
     }
 # 预处理数据集
 dataset = dataset.map(preprocess_data, batched=True)
-# ✅ 修正 training_step() 参数问题
 class DistillationTrainer(Trainer):
     def __init__(self, teacher, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.teacher = teacher  # ✅ 传入教师模型
     def compute_loss(self, model, inputs, return_outputs=False):
         labels = inputs["input_ids"]
@@ -83,7 +86,8 @@ class DistillationTrainer(Trainer):
         # ✅ 使用教师模型生成软标签（冻结教师参数）
         with torch.no_grad():
-            outputs_teacher = self.teacher(**inputs)
             logits_teacher = outputs_teacher.logits
         temperature = 2.0
@@ -104,10 +108,10 @@ class DistillationTrainer(Trainer):
         return (loss, outputs_student) if return_outputs else loss
-    def training_step(self, model, inputs, *args, **kwargs):  # ✅ 修正：添加 *args, **kwargs 以兼容 Trainer
-        """✅ 关键修复点：覆盖 `training_step()`，防止 `num_items_in_batch` 传递"""
         model.train()
-        inputs = self._prepare_inputs(inputs)
         loss = self.compute_loss(model, inputs)  # ✅ 直接调用，不传递 `num_items_in_batch`
         return loss
@@ -123,7 +127,7 @@ training_args = TrainingArguments(
     save_strategy="epoch",
     remove_unused_columns=False,  # ✅ 关键设置，确保 Trainer 不删除未识别的列
     gradient_checkpointing=True,  # ✅ 允许梯度检查点，节省显存
-    fp16=True if torch.cuda.is_available() else False
 )
 # 初始化 Trainer
@@ -140,4 +144,4 @@ trainer.train()
 # 保存模型到 Hugging Face
 student.push_to_hub("Snow2222/fst-nnn", use_auth_token=hf_token)
-tokenizer.push_to_hub("Snow2222/fst-nnn", use_auth_token=hf_token)

 else:
     raise ValueError("Hugging Face token 未设置")
+# ✅ 确保所有设备一致
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # 定义教师模型与学生模型
 teacher_model_name = "Qwen/Qwen1.5-7B-Chat"  # 教师模型（较大模型）
 student_model_name = "distilgpt2"            # ✅ 建议用 distilgpt2
     teacher_model_name,
     trust_remote_code=True,
     token=hf_token
+).to(device)  # ✅ 强制放到 GPU 或 CPU
 teacher.eval()  # 固定教师模型，不训练
 # 加载学生模型及 Tokenizer
     student_model_name,
     trust_remote_code=True,
     token=hf_token
+).to(device)  # ✅ 也放到 GPU 或 CPU
 tokenizer = AutoTokenizer.from_pretrained(
     student_model_name,
     trust_remote_code=True,
     labels = tokenizer(example["output"], truncation=True, padding="max_length", max_length=128)
     return {
+        "input_ids": torch.tensor(inputs["input_ids"]).to(device),  # ✅ 强制放到 GPU 或 CPU
+        "attention_mask": torch.tensor(inputs["attention_mask"]).to(device),
+        "labels": torch.tensor(labels["input_ids"]).to(device)
     }
 # 预处理数据集
 dataset = dataset.map(preprocess_data, batched=True)
+# ✅ 修正 training_step() 设备不匹配问题
 class DistillationTrainer(Trainer):
     def __init__(self, teacher, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.teacher = teacher.to(device)  # ✅ 确保 teacher 在 GPU
     def compute_loss(self, model, inputs, return_outputs=False):
         labels = inputs["input_ids"]
         # ✅ 使用教师模型生成软标签（冻结教师参数）
         with torch.no_grad():
+            inputs_on_device = {k: v.to(device) for k, v in inputs.items()}  # ✅ 确保 inputs 在 GPU
+            outputs_teacher = self.teacher(**inputs_on_device)
             logits_teacher = outputs_teacher.logits
         temperature = 2.0
         return (loss, outputs_student) if return_outputs else loss
+    def training_step(self, model, inputs, *args, **kwargs):  # ✅ 兼容 Trainer 额外参数
+        """✅ 关键修复点：确保所有输入和模型都在 GPU"""
         model.train()
+        inputs = {k: v.to(device) for k, v in self._prepare_inputs(inputs).items()}  # ✅ 确保 inputs 在 GPU
         loss = self.compute_loss(model, inputs)  # ✅ 直接调用，不传递 `num_items_in_batch`
         return loss
     save_strategy="epoch",
     remove_unused_columns=False,  # ✅ 关键设置，确保 Trainer 不删除未识别的列
     gradient_checkpointing=True,  # ✅ 允许梯度检查点，节省显存
+    fp16=torch.cuda.is_available()  # ✅ 自动判断是否使用 FP16
 )
 # 初始化 Trainer
 # 保存模型到 Hugging Face
 student.push_to_hub("Snow2222/fst-nnn", use_auth_token=hf_token)
+tokenizer.push_to_hub("Snow2222/fst-nnn", use_auth_token=hf_token)