Training in progress, step 200
Browse files- model.safetensors +1 -1
- training.log +183 -0
- training_args.bin +1 -1
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3087467144
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f62d86b7414cfbdcdfec64a511511d957ee11d8f641399bd3e6895bf071830a2
|
| 3 |
size 3087467144
|
training.log
CHANGED
|
@@ -155,3 +155,186 @@ weight_decay=0.0,
|
|
| 155 |
)
|
| 156 |
2025-03-31 14:28:09 - INFO - __main__ - *** Initializing model kwargs ***
|
| 157 |
2025-03-31 14:28:57 - INFO - __main__ - *** Train ***
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
)
|
| 156 |
2025-03-31 14:28:09 - INFO - __main__ - *** Initializing model kwargs ***
|
| 157 |
2025-03-31 14:28:57 - INFO - __main__ - *** Train ***
|
| 158 |
+
2025-03-31 14:58:16 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
|
| 159 |
+
2025-03-31 14:58:16 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
|
| 160 |
+
2025-03-31 14:58:16 - INFO - __main__ - Training parameters SFTConfig(
|
| 161 |
+
_n_gpu=1,
|
| 162 |
+
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
|
| 163 |
+
adafactor=False,
|
| 164 |
+
adam_beta1=0.9,
|
| 165 |
+
adam_beta2=0.999,
|
| 166 |
+
adam_epsilon=1e-08,
|
| 167 |
+
auto_find_batch_size=False,
|
| 168 |
+
average_tokens_across_devices=False,
|
| 169 |
+
batch_eval_metrics=False,
|
| 170 |
+
benchmarks=[],
|
| 171 |
+
bf16=True,
|
| 172 |
+
bf16_full_eval=False,
|
| 173 |
+
callbacks=[],
|
| 174 |
+
chars_per_token=<CHARS_PER_TOKEN>,
|
| 175 |
+
chat_template=None,
|
| 176 |
+
data_seed=None,
|
| 177 |
+
dataloader_drop_last=False,
|
| 178 |
+
dataloader_num_workers=0,
|
| 179 |
+
dataloader_persistent_workers=False,
|
| 180 |
+
dataloader_pin_memory=True,
|
| 181 |
+
dataloader_prefetch_factor=None,
|
| 182 |
+
dataset_batch_size=None,
|
| 183 |
+
dataset_kwargs=None,
|
| 184 |
+
dataset_num_proc=8,
|
| 185 |
+
dataset_text_field=text,
|
| 186 |
+
ddp_backend=None,
|
| 187 |
+
ddp_broadcast_buffers=None,
|
| 188 |
+
ddp_bucket_cap_mb=None,
|
| 189 |
+
ddp_find_unused_parameters=None,
|
| 190 |
+
ddp_timeout=1800,
|
| 191 |
+
debug=[],
|
| 192 |
+
deepspeed=None,
|
| 193 |
+
disable_tqdm=False,
|
| 194 |
+
dispatch_batches=None,
|
| 195 |
+
do_eval=False,
|
| 196 |
+
do_predict=False,
|
| 197 |
+
do_train=False,
|
| 198 |
+
eval_accumulation_steps=None,
|
| 199 |
+
eval_delay=0,
|
| 200 |
+
eval_do_concat_batches=True,
|
| 201 |
+
eval_on_start=False,
|
| 202 |
+
eval_packing=None,
|
| 203 |
+
eval_steps=None,
|
| 204 |
+
eval_strategy=IntervalStrategy.NO,
|
| 205 |
+
eval_use_gather_object=False,
|
| 206 |
+
evaluation_strategy=None,
|
| 207 |
+
fp16=False,
|
| 208 |
+
fp16_backend=auto,
|
| 209 |
+
fp16_full_eval=False,
|
| 210 |
+
fp16_opt_level=O1,
|
| 211 |
+
fsdp=[],
|
| 212 |
+
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
|
| 213 |
+
fsdp_min_num_params=0,
|
| 214 |
+
fsdp_transformer_layer_cls_to_wrap=None,
|
| 215 |
+
full_determinism=False,
|
| 216 |
+
gradient_accumulation_steps=1,
|
| 217 |
+
gradient_checkpointing=True,
|
| 218 |
+
gradient_checkpointing_kwargs={'use_reentrant': False},
|
| 219 |
+
greater_is_better=None,
|
| 220 |
+
group_by_length=False,
|
| 221 |
+
half_precision_backend=auto,
|
| 222 |
+
hub_always_push=False,
|
| 223 |
+
hub_model_id=Qwen2.5-1.5B-Open-R1-Distill,
|
| 224 |
+
hub_model_revision=main,
|
| 225 |
+
hub_private_repo=None,
|
| 226 |
+
hub_strategy=HubStrategy.EVERY_SAVE,
|
| 227 |
+
hub_token=<HUB_TOKEN>,
|
| 228 |
+
ignore_data_skip=False,
|
| 229 |
+
include_for_metrics=[],
|
| 230 |
+
include_inputs_for_metrics=False,
|
| 231 |
+
include_num_input_tokens_seen=False,
|
| 232 |
+
include_tokens_per_second=False,
|
| 233 |
+
jit_mode_eval=False,
|
| 234 |
+
label_names=None,
|
| 235 |
+
label_smoothing_factor=0.0,
|
| 236 |
+
learning_rate=5e-05,
|
| 237 |
+
length_column_name=length,
|
| 238 |
+
load_best_model_at_end=False,
|
| 239 |
+
local_rank=0,
|
| 240 |
+
log_level=info,
|
| 241 |
+
log_level_replica=warning,
|
| 242 |
+
log_on_each_node=True,
|
| 243 |
+
logging_dir=data/Qwen2.5-1.5B-Open-R1-Distill/runs/Mar31_14-58-15_w004.ib.bridges2.psc.edu,
|
| 244 |
+
logging_first_step=False,
|
| 245 |
+
logging_nan_inf_filter=True,
|
| 246 |
+
logging_steps=5,
|
| 247 |
+
logging_strategy=IntervalStrategy.STEPS,
|
| 248 |
+
lr_scheduler_kwargs={'min_lr_rate': 0.1},
|
| 249 |
+
lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
|
| 250 |
+
max_grad_norm=1.0,
|
| 251 |
+
max_length=16384,
|
| 252 |
+
max_seq_length=None,
|
| 253 |
+
max_steps=-1,
|
| 254 |
+
metric_for_best_model=None,
|
| 255 |
+
model_init_kwargs=None,
|
| 256 |
+
mp_parameters=,
|
| 257 |
+
neftune_noise_alpha=None,
|
| 258 |
+
no_cuda=False,
|
| 259 |
+
num_of_sequences=None,
|
| 260 |
+
num_train_epochs=1,
|
| 261 |
+
optim=OptimizerNames.ADAMW_TORCH,
|
| 262 |
+
optim_args=None,
|
| 263 |
+
optim_target_modules=None,
|
| 264 |
+
output_dir=data/Qwen2.5-1.5B-Open-R1-Distill,
|
| 265 |
+
overwrite_hub_revision=False,
|
| 266 |
+
overwrite_output_dir=True,
|
| 267 |
+
packing=True,
|
| 268 |
+
past_index=-1,
|
| 269 |
+
per_device_eval_batch_size=16,
|
| 270 |
+
per_device_train_batch_size=16,
|
| 271 |
+
prediction_loss_only=False,
|
| 272 |
+
push_to_hub=True,
|
| 273 |
+
push_to_hub_model_id=None,
|
| 274 |
+
push_to_hub_organization=None,
|
| 275 |
+
push_to_hub_revision=False,
|
| 276 |
+
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
| 277 |
+
ray_scope=last,
|
| 278 |
+
remove_unused_columns=True,
|
| 279 |
+
report_to=['wandb'],
|
| 280 |
+
restore_callback_states_from_checkpoint=False,
|
| 281 |
+
resume_from_checkpoint=None,
|
| 282 |
+
run_name=data/Qwen2.5-1.5B-Open-R1-Distill,
|
| 283 |
+
save_on_each_node=False,
|
| 284 |
+
save_only_model=False,
|
| 285 |
+
save_safetensors=True,
|
| 286 |
+
save_steps=100,
|
| 287 |
+
save_strategy=SaveStrategy.STEPS,
|
| 288 |
+
save_total_limit=1,
|
| 289 |
+
seed=42,
|
| 290 |
+
skip_memory_metrics=True,
|
| 291 |
+
split_batches=None,
|
| 292 |
+
system_prompt=None,
|
| 293 |
+
tf32=None,
|
| 294 |
+
torch_compile=False,
|
| 295 |
+
torch_compile_backend=None,
|
| 296 |
+
torch_compile_mode=None,
|
| 297 |
+
torch_empty_cache_steps=None,
|
| 298 |
+
torchdynamo=None,
|
| 299 |
+
tpu_metrics_debug=False,
|
| 300 |
+
tpu_num_cores=None,
|
| 301 |
+
use_cpu=False,
|
| 302 |
+
use_ipex=False,
|
| 303 |
+
use_legacy_prediction_loop=False,
|
| 304 |
+
use_liger=True,
|
| 305 |
+
use_liger_kernel=False,
|
| 306 |
+
use_mps_device=False,
|
| 307 |
+
wandb_entity=None,
|
| 308 |
+
wandb_project=None,
|
| 309 |
+
warmup_ratio=0.05,
|
| 310 |
+
warmup_steps=0,
|
| 311 |
+
weight_decay=0.0,
|
| 312 |
+
)
|
| 313 |
+
2025-03-31 14:58:16 - INFO - __main__ - Checkpoint detected, resuming training at last_checkpoint='data/Qwen2.5-1.5B-Open-R1-Distill/checkpoint-100'.
|
| 314 |
+
2025-03-31 14:58:17 - INFO - __main__ - *** Initializing model kwargs ***
|
| 315 |
+
2025-03-31 14:58:19 - INFO - __main__ - *** Train ***
|
| 316 |
+
2025-03-31 14:58:19 - INFO - __main__ - Qwen2ForCausalLM(
|
| 317 |
+
(model): Qwen2Model(
|
| 318 |
+
(embed_tokens): Embedding(151936, 1536)
|
| 319 |
+
(layers): ModuleList(
|
| 320 |
+
(0-27): 28 x Qwen2DecoderLayer(
|
| 321 |
+
(self_attn): Qwen2Attention(
|
| 322 |
+
(q_proj): Linear(in_features=1536, out_features=1536, bias=True)
|
| 323 |
+
(k_proj): Linear(in_features=1536, out_features=256, bias=True)
|
| 324 |
+
(v_proj): Linear(in_features=1536, out_features=256, bias=True)
|
| 325 |
+
(o_proj): Linear(in_features=1536, out_features=1536, bias=False)
|
| 326 |
+
)
|
| 327 |
+
(mlp): LigerSwiGLUMLP(
|
| 328 |
+
(gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
|
| 329 |
+
(up_proj): Linear(in_features=1536, out_features=8960, bias=False)
|
| 330 |
+
(down_proj): Linear(in_features=8960, out_features=1536, bias=False)
|
| 331 |
+
)
|
| 332 |
+
(input_layernorm): LigerRMSNorm((0,), eps=1e-06, offset=0.0, in_place=True)
|
| 333 |
+
(post_attention_layernorm): LigerRMSNorm((0,), eps=1e-06, offset=0.0, in_place=True)
|
| 334 |
+
)
|
| 335 |
+
)
|
| 336 |
+
(norm): LigerRMSNorm((0,), eps=1e-06, offset=0.0, in_place=True)
|
| 337 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
| 338 |
+
)
|
| 339 |
+
(lm_head): Linear(in_features=1536, out_features=151936, bias=False)
|
| 340 |
+
)
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 7544
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aafec81a45a6293fa443b76e9d98f6b47b6c8150ec70128f1283c4044f7f5862
|
| 3 |
size 7544
|