Set per_device_train_batch_size to 16 and initialize total_small_expert_loss as a torch tensor on the logits device
Browse files- myolmoe/modeling_myolmoe.py +1 -1
- scripts/train.py +1 -1
myolmoe/modeling_myolmoe.py
CHANGED
|
@@ -1065,7 +1065,7 @@ class MyOlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
|
|
| 1065 |
output = (aux_loss,) + output
|
| 1066 |
return (loss,) + output if loss is not None else output
|
| 1067 |
#
|
| 1068 |
-
total_small_expert_loss = 0
|
| 1069 |
for layer_output in outputs:
|
| 1070 |
if len(layer_output) > 1 and isinstance(layer_output[1], torch.Tensor):
|
| 1071 |
total_small_expert_loss += layer_output[1]
|
|
|
|
| 1065 |
output = (aux_loss,) + output
|
| 1066 |
return (loss,) + output if loss is not None else output
|
| 1067 |
#
|
| 1068 |
+
total_small_expert_loss = torch.tensor(0.0, device=logits.device)
|
| 1069 |
for layer_output in outputs:
|
| 1070 |
if len(layer_output) > 1 and isinstance(layer_output[1], torch.Tensor):
|
| 1071 |
total_small_expert_loss += layer_output[1]
|
scripts/train.py
CHANGED
|
@@ -72,7 +72,7 @@ def main():
|
|
| 72 |
# Training arguments
|
| 73 |
training_args = TrainingArguments(
|
| 74 |
output_dir="./output",
|
| 75 |
-
per_device_train_batch_size=
|
| 76 |
gradient_accumulation_steps=8,
|
| 77 |
learning_rate=1e-5,
|
| 78 |
num_train_epochs=1,
|
|
|
|
| 72 |
# Training arguments
|
| 73 |
training_args = TrainingArguments(
|
| 74 |
output_dir="./output",
|
| 75 |
+
per_device_train_batch_size=16,
|
| 76 |
gradient_accumulation_steps=8,
|
| 77 |
learning_rate=1e-5,
|
| 78 |
num_train_epochs=1,
|