HectorHe commited on
Commit
75fa31a
·
verified ·
1 Parent(s): 458a4f5

Training in progress, step 300

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70dffe839853cec5082e64301f24216ac8bb9b7f8c220b1c978cc0fdfc94a98e
3
  size 4902968072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc94db856a2a2fa155be180932048c6b9b89e2843937717b3cb802377db53cce
3
  size 4902968072
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bbde849303d62152cee16bdd245027b059e913b48f6dd727400105961c0a439
3
  size 419430528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:323a107fc50e953dc081aaa22f9b4579e59937d54a42ee32f2d3253f9182b0eb
3
  size 419430528
training.log CHANGED
@@ -2338,3 +2338,237 @@ Memory reserved: 36896.0
2338
  )
2339
  (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
2340
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2338
  )
2339
  (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
2340
  )
2341
+ 2025-05-02 11:51:33 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
2342
+ 2025-05-02 11:51:33 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='lmms-lab/Math10K', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
2343
+ 2025-05-02 11:51:33 - INFO - __main__ - Training parameters EfficientDistillationConfig(
2344
+ _n_gpu=1,
2345
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
2346
+ adafactor=False,
2347
+ adam_beta1=0.9,
2348
+ adam_beta2=0.999,
2349
+ adam_epsilon=1e-08,
2350
+ auto_find_batch_size=False,
2351
+ average_tokens_across_devices=False,
2352
+ batch_eval_metrics=False,
2353
+ benchmarks=[],
2354
+ bf16=True,
2355
+ bf16_full_eval=False,
2356
+ callbacks=[],
2357
+ chars_per_token=<CHARS_PER_TOKEN>,
2358
+ chat_template=None,
2359
+ data_seed=None,
2360
+ dataloader_drop_last=False,
2361
+ dataloader_num_workers=0,
2362
+ dataloader_persistent_workers=False,
2363
+ dataloader_pin_memory=True,
2364
+ dataloader_prefetch_factor=None,
2365
+ dataset_batch_size=None,
2366
+ dataset_kwargs=None,
2367
+ dataset_num_proc=None,
2368
+ dataset_text_field=text,
2369
+ ddp_backend=None,
2370
+ ddp_broadcast_buffers=None,
2371
+ ddp_bucket_cap_mb=None,
2372
+ ddp_find_unused_parameters=None,
2373
+ ddp_timeout=1800000000,
2374
+ debug=[],
2375
+ deepspeed=None,
2376
+ disable_dropout=True,
2377
+ disable_tqdm=False,
2378
+ dispatch_batches=None,
2379
+ do_eval=True,
2380
+ do_predict=False,
2381
+ do_train=False,
2382
+ eval_accumulation_steps=None,
2383
+ eval_delay=0,
2384
+ eval_do_concat_batches=True,
2385
+ eval_on_start=False,
2386
+ eval_packing=None,
2387
+ eval_steps=None,
2388
+ eval_strategy=IntervalStrategy.NO,
2389
+ eval_use_gather_object=False,
2390
+ evaluation_strategy=None,
2391
+ fp16=False,
2392
+ fp16_backend=auto,
2393
+ fp16_full_eval=False,
2394
+ fp16_opt_level=O1,
2395
+ fsdp=[],
2396
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
2397
+ fsdp_min_num_params=0,
2398
+ fsdp_transformer_layer_cls_to_wrap=None,
2399
+ full_determinism=False,
2400
+ gradient_accumulation_steps=1,
2401
+ gradient_checkpointing=False,
2402
+ gradient_checkpointing_kwargs={'use_reentrant': False},
2403
+ greater_is_better=None,
2404
+ group_by_length=False,
2405
+ half_precision_backend=auto,
2406
+ hub_always_push=False,
2407
+ hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
2408
+ hub_model_revision=main,
2409
+ hub_private_repo=None,
2410
+ hub_strategy=HubStrategy.EVERY_SAVE,
2411
+ hub_token=<HUB_TOKEN>,
2412
+ ignore_data_skip=False,
2413
+ include_for_metrics=[],
2414
+ include_inputs_for_metrics=False,
2415
+ include_num_input_tokens_seen=False,
2416
+ include_tokens_per_second=False,
2417
+ jit_mode_eval=False,
2418
+ label_names=None,
2419
+ label_smoothing_factor=0.0,
2420
+ learning_rate=2e-05,
2421
+ length_column_name=length,
2422
+ lmbda=0.0,
2423
+ load_best_model_at_end=False,
2424
+ local_rank=0,
2425
+ log_level=info,
2426
+ log_level_replica=warning,
2427
+ log_on_each_node=True,
2428
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K/runs/May02_11-51-33_q-h100,
2429
+ logging_first_step=False,
2430
+ logging_nan_inf_filter=True,
2431
+ logging_steps=1,
2432
+ logging_strategy=IntervalStrategy.STEPS,
2433
+ loss_type=forward_kl,
2434
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
2435
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
2436
+ max_grad_norm=1.0,
2437
+ max_length=4096,
2438
+ max_new_tokens=1024,
2439
+ max_seq_length=None,
2440
+ max_steps=-1,
2441
+ metric_for_best_model=None,
2442
+ model_init_kwargs=None,
2443
+ mp_parameters=,
2444
+ neftune_noise_alpha=None,
2445
+ no_cuda=False,
2446
+ num_of_sequences=None,
2447
+ num_train_epochs=3,
2448
+ optim=OptimizerNames.ADAMW_TORCH,
2449
+ optim_args=None,
2450
+ optim_target_modules=None,
2451
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K,
2452
+ overwrite_hub_revision=False,
2453
+ overwrite_output_dir=True,
2454
+ packing=False,
2455
+ past_index=-1,
2456
+ per_device_eval_batch_size=16,
2457
+ per_device_train_batch_size=4,
2458
+ prediction_loss_only=False,
2459
+ push_to_hub=True,
2460
+ push_to_hub_model_id=None,
2461
+ push_to_hub_organization=None,
2462
+ push_to_hub_revision=False,
2463
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
2464
+ ray_scope=last,
2465
+ reduction=sum,
2466
+ remove_unused_columns=True,
2467
+ report_to=['wandb'],
2468
+ restore_callback_states_from_checkpoint=False,
2469
+ resume_from_checkpoint=/home/deepseek/hector/test/data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K/checkpoint-200,
2470
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K,
2471
+ save_on_each_node=False,
2472
+ save_only_model=False,
2473
+ save_safetensors=True,
2474
+ save_steps=60,
2475
+ save_strategy=SaveStrategy.STEPS,
2476
+ save_total_limit=2,
2477
+ seed=1234,
2478
+ skip_memory_metrics=True,
2479
+ split_batches=None,
2480
+ system_prompt=None,
2481
+ teacher_model_init_kwargs=None,
2482
+ teacher_model_name_or_path=None,
2483
+ temperature=0.9,
2484
+ tf32=None,
2485
+ torch_compile=False,
2486
+ torch_compile_backend=None,
2487
+ torch_compile_mode=None,
2488
+ torch_empty_cache_steps=None,
2489
+ torchdynamo=None,
2490
+ tpu_metrics_debug=False,
2491
+ tpu_num_cores=None,
2492
+ use_cpu=False,
2493
+ use_ipex=False,
2494
+ use_legacy_prediction_loop=False,
2495
+ use_liger=False,
2496
+ use_liger_kernel=False,
2497
+ use_mps_device=False,
2498
+ wandb_entity=None,
2499
+ wandb_project=None,
2500
+ warmup_ratio=0.1,
2501
+ warmup_steps=0,
2502
+ weight_decay=0.0,
2503
+ )
2504
+ 2025-05-02 11:51:37 - INFO - __main__ - *** Initializing model kwargs ***
2505
+ 2025-05-02 11:51:37 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K/top_6_experts_lmms-lab_Math10K.json: {'model.layers.1.mlp': [51, 61, 44, 45, 14, 22], 'model.layers.2.mlp': [27, 25, 18, 13, 3, 23], 'model.layers.3.mlp': [54, 25, 41, 23, 28, 57], 'model.layers.4.mlp': [37, 21, 33, 49, 11, 14], 'model.layers.5.mlp': [54, 47, 35, 20, 52, 9], 'model.layers.6.mlp': [22, 1, 13, 45, 42, 47], 'model.layers.7.mlp': [58, 43, 24, 18, 44, 62], 'model.layers.8.mlp': [47, 39, 56, 30, 54, 58], 'model.layers.9.mlp': [31, 13, 22, 24, 12, 32], 'model.layers.10.mlp': [47, 19, 42, 2, 13, 22], 'model.layers.11.mlp': [29, 11, 17, 10, 59, 22], 'model.layers.12.mlp': [5, 56, 3, 59, 4, 26], 'model.layers.13.mlp': [10, 42, 58, 14, 47, 17], 'model.layers.14.mlp': [51, 7, 27, 18, 31, 61], 'model.layers.15.mlp': [24, 55, 5, 17, 14, 41], 'model.layers.16.mlp': [61, 33, 63, 49, 19, 9], 'model.layers.17.mlp': [0, 26, 43, 32, 27, 29], 'model.layers.18.mlp': [5, 56, 42, 36, 2, 1], 'model.layers.19.mlp': [2, 23, 24, 36, 40, 0], 'model.layers.20.mlp': [1, 56, 38, 20, 48, 58], 'model.layers.21.mlp': [5, 13, 15, 28, 19, 10], 'model.layers.22.mlp': [58, 32, 31, 3, 45, 14], 'model.layers.23.mlp': [20, 0, 58, 45, 33, 42], 'model.layers.24.mlp': [62, 7, 42, 47, 10, 63], 'model.layers.25.mlp': [45, 48, 39, 11, 46, 38], 'model.layers.26.mlp': [46, 49, 6, 13, 11, 57]}
2506
+ 2025-05-02 11:51:37 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0
2507
+ Memory reserved: 0.0
2508
+ 2025-05-02 11:51:47 - INFO - __main__ - Model memory after loading model:Memory allocated: 0.0
2509
+ Memory reserved: 0.0
2510
+ 2025-05-02 11:51:47 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts...
2511
+ 2025-05-02 11:52:02 - INFO - __main__ - MoE layers replaced with Dense MLP layers
2512
+ 2025-05-02 11:52:02 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 0.0
2513
+ Memory reserved: 0.0
2514
+ 2025-05-02 11:52:02 - INFO - __main__ - Initializing EfficientDistillationTrainer...
2515
+ 2025-05-02 11:52:27 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 31126.0048828125
2516
+ Memory reserved: 36896.0
2517
+ 2025-05-02 11:52:27 - INFO - __main__ - *** Starting training ***
2518
+ 2025-05-02 11:52:27 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM(
2519
+ (model): DeepseekV2Model(
2520
+ (embed_tokens): Embedding(102400, 2048)
2521
+ (layers): ModuleList(
2522
+ (0): DeepseekV2DecoderLayer(
2523
+ (self_attn): DeepseekV2FlashAttention2(
2524
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
2525
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
2526
+ (kv_a_layernorm): DeepseekV2RMSNorm()
2527
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
2528
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
2529
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
2530
+ )
2531
+ (mlp): DeepseekV2MLP(
2532
+ (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
2533
+ (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
2534
+ (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
2535
+ (act_fn): SiLU()
2536
+ )
2537
+ (input_layernorm): DeepseekV2RMSNorm()
2538
+ (post_attention_layernorm): DeepseekV2RMSNorm()
2539
+ )
2540
+ (1-26): 26 x DeepseekV2DecoderLayer(
2541
+ (self_attn): DeepseekV2FlashAttention2(
2542
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
2543
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
2544
+ (kv_a_layernorm): DeepseekV2RMSNorm()
2545
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
2546
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
2547
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
2548
+ )
2549
+ (mlp): DeepseekV2MoE(
2550
+ (gate): MoEGate()
2551
+ (shared_experts): DeepseekV2MLP(
2552
+ (gate_proj): Linear(in_features=2048, out_features=2816, bias=False)
2553
+ (up_proj): Linear(in_features=2048, out_features=2816, bias=False)
2554
+ (down_proj): Linear(in_features=2816, out_features=2048, bias=False)
2555
+ (act_fn): SiLU()
2556
+ )
2557
+ (selected_experts): ModuleList(
2558
+ (0-5): 6 x DeepseekV2MLP(
2559
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
2560
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
2561
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
2562
+ (act_fn): SiLU()
2563
+ )
2564
+ )
2565
+ (experts): ModuleList()
2566
+ )
2567
+ (input_layernorm): DeepseekV2RMSNorm()
2568
+ (post_attention_layernorm): DeepseekV2RMSNorm()
2569
+ )
2570
+ )
2571
+ (norm): DeepseekV2RMSNorm()
2572
+ )
2573
+ (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
2574
+ )
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:704190d4216fcaaa319f4fdcbc24ff07ace3d70a8844da8e10f26947ad156c65
3
- size 7864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db10de6e81e3aca9df73ad17f5b1758259f629be3e058bac7fd10d2ed152ecb4
3
+ size 7992