Spaces:
Running
Running
Attempt to identify trainer bug: build the training arguments one by one with error checking
Browse files
model.py
CHANGED
|
@@ -129,45 +129,49 @@ class SmolLM3Model:
|
|
| 129 |
logger.info(f"Config attributes: {[attr for attr in dir(self.config) if not attr.startswith('_')]}")
|
| 130 |
|
| 131 |
# Merge config with kwargs
|
| 132 |
-
training_args = {
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
"
|
| 137 |
-
"
|
| 138 |
-
"
|
| 139 |
-
"
|
| 140 |
-
"
|
| 141 |
-
"
|
| 142 |
-
"
|
| 143 |
-
"
|
| 144 |
-
"
|
| 145 |
-
"
|
| 146 |
-
"
|
| 147 |
-
"
|
| 148 |
-
"
|
| 149 |
-
"
|
| 150 |
-
"
|
| 151 |
-
|
| 152 |
-
"
|
| 153 |
-
"
|
| 154 |
-
"
|
| 155 |
-
"
|
| 156 |
-
"
|
| 157 |
-
"
|
| 158 |
-
"
|
| 159 |
-
"
|
| 160 |
-
"
|
| 161 |
-
"
|
| 162 |
-
"
|
| 163 |
-
"
|
| 164 |
-
"
|
| 165 |
-
"
|
| 166 |
-
"
|
| 167 |
-
"
|
| 168 |
-
"
|
| 169 |
-
"
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
# Override with kwargs
|
| 173 |
training_args.update(kwargs)
|
|
|
|
| 129 |
logger.info(f"Config attributes: {[attr for attr in dir(self.config) if not attr.startswith('_')]}")
|
| 130 |
|
| 131 |
# Merge config with kwargs
|
| 132 |
+
training_args = {}
|
| 133 |
+
|
| 134 |
+
# Add arguments one by one with error checking
|
| 135 |
+
try:
|
| 136 |
+
training_args["output_dir"] = output_dir
|
| 137 |
+
training_args["per_device_train_batch_size"] = self.config.batch_size
|
| 138 |
+
training_args["per_device_eval_batch_size"] = self.config.batch_size
|
| 139 |
+
training_args["gradient_accumulation_steps"] = self.config.gradient_accumulation_steps
|
| 140 |
+
training_args["learning_rate"] = self.config.learning_rate
|
| 141 |
+
training_args["weight_decay"] = self.config.weight_decay
|
| 142 |
+
training_args["warmup_steps"] = self.config.warmup_steps
|
| 143 |
+
training_args["max_steps"] = self.config.max_iters
|
| 144 |
+
training_args["save_steps"] = self.config.save_steps
|
| 145 |
+
training_args["eval_steps"] = self.config.eval_steps
|
| 146 |
+
training_args["logging_steps"] = self.config.logging_steps
|
| 147 |
+
training_args["save_total_limit"] = self.config.save_total_limit
|
| 148 |
+
training_args["eval_strategy"] = self.config.eval_strategy
|
| 149 |
+
training_args["metric_for_best_model"] = self.config.metric_for_best_model
|
| 150 |
+
training_args["greater_is_better"] = self.config.greater_is_better
|
| 151 |
+
training_args["load_best_model_at_end"] = self.config.load_best_model_at_end
|
| 152 |
+
training_args["fp16"] = self.config.fp16
|
| 153 |
+
training_args["bf16"] = self.config.bf16
|
| 154 |
+
training_args["ddp_backend"] = self.config.ddp_backend if torch.cuda.device_count() > 1 else None
|
| 155 |
+
training_args["ddp_find_unused_parameters"] = self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False
|
| 156 |
+
training_args["report_to"] = None
|
| 157 |
+
training_args["remove_unused_columns"] = False
|
| 158 |
+
training_args["dataloader_pin_memory"] = False
|
| 159 |
+
training_args["group_by_length"] = True
|
| 160 |
+
training_args["length_column_name"] = "length"
|
| 161 |
+
training_args["ignore_data_skip"] = False
|
| 162 |
+
training_args["seed"] = 42
|
| 163 |
+
training_args["data_seed"] = 42
|
| 164 |
+
training_args["dataloader_num_workers"] = getattr(self.config, 'dataloader_num_workers', 4)
|
| 165 |
+
training_args["max_grad_norm"] = getattr(self.config, 'max_grad_norm', 1.0)
|
| 166 |
+
training_args["optim"] = self.config.optimizer
|
| 167 |
+
training_args["lr_scheduler_type"] = self.config.scheduler
|
| 168 |
+
training_args["warmup_ratio"] = 0.1
|
| 169 |
+
training_args["save_strategy"] = "steps"
|
| 170 |
+
training_args["logging_strategy"] = "steps"
|
| 171 |
+
training_args["prediction_loss_only"] = True
|
| 172 |
+
except Exception as e:
|
| 173 |
+
logger.error(f"Error creating training arguments: {e}")
|
| 174 |
+
raise
|
| 175 |
|
| 176 |
# Override with kwargs
|
| 177 |
training_args.update(kwargs)
|