jayzou3773 committed on
Commit
88fe374
·
verified ·
1 Parent(s): 8d1b4fb

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: deepseek-ai/DeepSeek-V2-Lite
3
+ datasets: RoxanneWsyw/ESFT-summary
4
+ library_name: transformers
5
+ tags:
6
+ - generated_from_trainer
7
+ - open-r1
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for deepseek-summary-sft
12
+
13
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) on the [RoxanneWsyw/ESFT-summary](https://huggingface.co/datasets/RoxanneWsyw/ESFT-summary) dataset.
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="None", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jayzxinkai-uc-san-diego/moe-honing/runs/h40v6ilw)
30
+
31
+
32
+ This model was trained with SFT.
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.16.0.dev0
37
+ - Transformers: 4.49.0
38
+ - Pytorch: 2.6.0
39
+ - Datasets: 4.8.3
40
+ - Tokenizers: 0.21.4
41
+
42
+ ## Citations
43
+
44
+
45
+
46
+ Cite TRL as:
47
+
48
+ ```bibtex
49
+ @misc{vonwerra2022trl,
50
+ title = {{TRL: Transformer Reinforcement Learning}},
51
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
52
+ year = 2020,
53
+ journal = {GitHub repository},
54
+ publisher = {GitHub},
55
+ howpublished = {\url{https://github.com/huggingface/trl}}
56
+ }
57
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 1.6538790970985021e+18,
3
+ "train_loss": 1.2438032255853926,
4
+ "train_runtime": 8525.9707,
5
+ "train_samples": 19587,
6
+ "train_samples_per_second": 2.297,
7
+ "train_steps_per_second": 0.144
8
+ }
config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
3
+ "architectures": [
4
+ "DeepseekV2ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "deepseek-ai/DeepSeek-V2-Lite--configuration_deepseek.DeepseekV2Config",
10
+ "AutoModel": "deepseek-ai/DeepSeek-V2-Lite--modeling_deepseek.DeepseekV2Model",
11
+ "AutoModelForCausalLM": "deepseek-ai/DeepSeek-V2-Lite--modeling_deepseek.DeepseekV2ForCausalLM"
12
+ },
13
+ "aux_loss_alpha": 0.001,
14
+ "bos_token_id": 100000,
15
+ "eos_token_id": 100001,
16
+ "ep_size": 1,
17
+ "first_k_dense_replace": 1,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 2048,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 10944,
22
+ "kv_lora_rank": 512,
23
+ "max_position_embeddings": 163840,
24
+ "model_type": "deepseek_v2",
25
+ "moe_intermediate_size": 1408,
26
+ "moe_layer_freq": 1,
27
+ "n_group": 1,
28
+ "n_routed_experts": 64,
29
+ "n_shared_experts": 2,
30
+ "norm_topk_prob": false,
31
+ "num_attention_heads": 16,
32
+ "num_experts_per_tok": 6,
33
+ "num_hidden_layers": 27,
34
+ "num_key_value_heads": 16,
35
+ "pretraining_tp": 1,
36
+ "q_lora_rank": null,
37
+ "qk_nope_head_dim": 128,
38
+ "qk_rope_head_dim": 64,
39
+ "rms_norm_eps": 1e-06,
40
+ "rope_scaling": {
41
+ "beta_fast": 32,
42
+ "beta_slow": 1,
43
+ "factor": 40,
44
+ "mscale": 0.707,
45
+ "mscale_all_dim": 0.707,
46
+ "original_max_position_embeddings": 4096,
47
+ "type": "yarn"
48
+ },
49
+ "rope_theta": 10000,
50
+ "routed_scaling_factor": 1.0,
51
+ "scoring_func": "softmax",
52
+ "seq_aux": true,
53
+ "tie_word_embeddings": false,
54
+ "topk_group": 1,
55
+ "topk_method": "greedy",
56
+ "torch_dtype": "bfloat16",
57
+ "transformers_version": "4.49.0",
58
+ "use_cache": true,
59
+ "v_head_dim": 128,
60
+ "vocab_size": 102400
61
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100000,
4
+ "do_sample": true,
5
+ "eos_token_id": 100001,
6
+ "temperature": 0.3,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.49.0"
9
+ }
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2457ec9edd72f8351039664d1f3272d618e274e32c3c573adec6003d96feed98
3
+ size 4994763632
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a19ebf28284f39d2c5f2768fa231635834e1662c859b8787a2aa67f17a6a193c
3
+ size 4995044944
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d3541a3afdf6e54c41d1e64c12cb9e278d6306b3ce2d7332ef30e6cbef9f1b4
3
+ size 4996085000
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eadfb0e42fcd21ed166f5f016e4ea87d71d3ae1398e1858592b39e1cc05b5914
3
+ size 4996085224
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d8ad850f9127800df6c2a1146898b17704a9fb17f3432b84e88a42cb91f533c
3
+ size 4996085224
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d36d1ff84020644b29166297da9f378e304fbb4541ca0f13ba9857a809ea61
3
+ size 4995045792
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75d551b9802edc5ab1cb0f8a3d1ce0dbece82366e4b1495ba175666a8f778586
3
+ size 1440515736
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|end▁of▁sentence|>"
17
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "100000": {
7
+ "content": "<|begin▁of▁sentence|>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "100001": {
15
+ "content": "<|end▁of▁sentence|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ }
22
+ },
23
+ "bos_token": "<|begin▁of▁sentence|>",
24
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
25
+ "clean_up_tokenization_spaces": false,
26
+ "eos_token": "<|end▁of▁sentence|>",
27
+ "extra_special_tokens": {},
28
+ "fast_tokenizer": true,
29
+ "legacy": true,
30
+ "model_max_length": 16384,
31
+ "pad_token": "<|end▁of▁sentence|>",
32
+ "sp_model_kwargs": {},
33
+ "tokenizer_class": "LlamaTokenizerFast",
34
+ "unk_token": null,
35
+ "use_default_system_prompt": false
36
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 1.6538790970985021e+18,
3
+ "train_loss": 1.2438032255853926,
4
+ "train_runtime": 8525.9707,
5
+ "train_samples": 19587,
6
+ "train_samples_per_second": 2.297,
7
+ "train_steps_per_second": 0.144
8
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training.log ADDED
@@ -0,0 +1,737 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-03-21 20:09:11 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-V2-Lite', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
2
+ 2026-03-21 20:09:11 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='RoxanneWsyw/ESFT-summary', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
3
+ 2026-03-21 20:09:11 - INFO - __main__ - Training parameters SFTConfig(
4
+ _n_gpu=1,
5
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
6
+ adafactor=False,
7
+ adam_beta1=0.9,
8
+ adam_beta2=0.999,
9
+ adam_epsilon=1e-08,
10
+ attn_kl_weight=1.0,
11
+ auto_find_batch_size=False,
12
+ average_tokens_across_devices=False,
13
+ batch_eval_metrics=False,
14
+ benchmarks=[],
15
+ bf16=True,
16
+ bf16_full_eval=False,
17
+ callbacks=[],
18
+ chars_per_token=<CHARS_PER_TOKEN>,
19
+ chat_template=None,
20
+ cluster_mode=hierarchical-dynamic,
21
+ cluster_num_groups=None,
22
+ cluster_prune_ratio=None,
23
+ cluster_prune_tau=1.0,
24
+ data_seed=None,
25
+ dataloader_drop_last=False,
26
+ dataloader_num_workers=0,
27
+ dataloader_persistent_workers=False,
28
+ dataloader_pin_memory=True,
29
+ dataloader_prefetch_factor=None,
30
+ dataset_batch_size=None,
31
+ dataset_kwargs=None,
32
+ dataset_num_proc=None,
33
+ dataset_text_field=text,
34
+ ddp_backend=None,
35
+ ddp_broadcast_buffers=None,
36
+ ddp_bucket_cap_mb=None,
37
+ ddp_find_unused_parameters=None,
38
+ ddp_timeout=1800000000,
39
+ debug=[],
40
+ deepspeed=None,
41
+ disable_teacher_dropout=True,
42
+ disable_tqdm=False,
43
+ dispatch_batches=None,
44
+ do_eval=True,
45
+ do_predict=False,
46
+ do_train=False,
47
+ entropy_slope_alpha=1.0,
48
+ entropy_slope_beta=1.0,
49
+ eval_accumulation_steps=None,
50
+ eval_delay=0,
51
+ eval_do_concat_batches=True,
52
+ eval_on_start=False,
53
+ eval_packing=None,
54
+ eval_steps=None,
55
+ eval_strategy=IntervalStrategy.NO,
56
+ eval_use_gather_object=False,
57
+ evaluation_strategy=None,
58
+ fp16=False,
59
+ fp16_backend=auto,
60
+ fp16_full_eval=False,
61
+ fp16_opt_level=O1,
62
+ fsdp=[],
63
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
64
+ fsdp_min_num_params=0,
65
+ fsdp_transformer_layer_cls_to_wrap=None,
66
+ full_determinism=False,
67
+ gradient_accumulation_steps=1,
68
+ gradient_checkpointing=True,
69
+ gradient_checkpointing_kwargs={'use_reentrant': False},
70
+ greater_is_better=None,
71
+ group_by_length=False,
72
+ half_precision_backend=auto,
73
+ hub_always_push=False,
74
+ hub_model_id=None,
75
+ hub_model_revision=main,
76
+ hub_private_repo=None,
77
+ hub_strategy=HubStrategy.EVERY_SAVE,
78
+ hub_token=<HUB_TOKEN>,
79
+ ignore_data_skip=False,
80
+ include_for_metrics=[],
81
+ include_inputs_for_metrics=False,
82
+ include_num_input_tokens_seen=False,
83
+ include_tokens_per_second=False,
84
+ jit_mode_eval=False,
85
+ label_names=None,
86
+ label_smoothing_factor=0.0,
87
+ last_entropy_weight=1.0,
88
+ layer_entropy_l1_layers=None,
89
+ layer_entropy_l1_weight=1.0,
90
+ learning_rate=1e-05,
91
+ length_column_name=length,
92
+ load_best_model_at_end=False,
93
+ local_rank=0,
94
+ log_level=info,
95
+ log_level_replica=warning,
96
+ log_on_each_node=True,
97
+ logging_dir=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft/runs/Mar21_20-09-09_orchard-community-1,
98
+ logging_first_step=False,
99
+ logging_nan_inf_filter=True,
100
+ logging_steps=1,
101
+ logging_strategy=IntervalStrategy.STEPS,
102
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
103
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
104
+ max_grad_norm=1.0,
105
+ max_length=4096,
106
+ max_seq_length=None,
107
+ max_steps=-1,
108
+ merging_metrics=None,
109
+ metric_for_best_model=None,
110
+ model_init_kwargs=None,
111
+ mp_parameters=,
112
+ neftune_noise_alpha=None,
113
+ no_cuda=False,
114
+ num_of_sequences=None,
115
+ num_train_epochs=1,
116
+ optim=OptimizerNames.ADAMW_TORCH,
117
+ optim_args=None,
118
+ optim_target_modules=None,
119
+ output_dir=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft,
120
+ overwrite_hub_revision=False,
121
+ overwrite_output_dir=True,
122
+ packing=False,
123
+ past_index=-1,
124
+ per_device_eval_batch_size=16,
125
+ per_device_train_batch_size=4,
126
+ prediction_loss_only=False,
127
+ push_to_hub=False,
128
+ push_to_hub_model_id=None,
129
+ push_to_hub_organization=None,
130
+ push_to_hub_revision=False,
131
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
132
+ ray_scope=last,
133
+ remove_unused_columns=True,
134
+ report_to=['wandb'],
135
+ restore_callback_states_from_checkpoint=False,
136
+ resume_from_checkpoint=None,
137
+ router_manual_mask=None,
138
+ router_prune_enable=True,
139
+ router_prune_expert_per_layer=None,
140
+ router_prune_interval=5,
141
+ router_prune_min_keep=1,
142
+ router_prune_start_step=None,
143
+ router_prune_step_size=32,
144
+ router_prune_use_plan=True,
145
+ run_name=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft,
146
+ save_on_each_node=False,
147
+ save_only_model=False,
148
+ save_safetensors=True,
149
+ save_steps=500,
150
+ save_strategy=SaveStrategy.NO,
151
+ save_total_limit=None,
152
+ seed=1234,
153
+ skip_memory_metrics=True,
154
+ split_batches=None,
155
+ system_prompt=None,
156
+ teacher_attn_implementation=None,
157
+ teacher_model_name_or_path=None,
158
+ teacher_model_revision=None,
159
+ teacher_torch_dtype=auto,
160
+ tf32=None,
161
+ torch_compile=False,
162
+ torch_compile_backend=None,
163
+ torch_compile_mode=None,
164
+ torch_empty_cache_steps=None,
165
+ torchdynamo=None,
166
+ tpu_metrics_debug=False,
167
+ tpu_num_cores=None,
168
+ use_cpu=False,
169
+ use_ipex=False,
170
+ use_legacy_prediction_loop=False,
171
+ use_liger=False,
172
+ use_liger_kernel=False,
173
+ use_mps_device=False,
174
+ wandb_entity=jayzxinkai-uc-san-diego,
175
+ wandb_project=moe-honing,
176
+ warmup_ratio=0.1,
177
+ warmup_steps=0,
178
+ weight_decay=0.0,
179
+ weight_feature_rank=None,
180
+ )
181
+ 2026-03-21 20:09:12 - INFO - datasets.builder - Found cached dataset esft-summary (/tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138)
182
+ 2026-03-21 20:09:12 - INFO - datasets.arrow_dataset - Caching processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-d8c6f402c91a2432.arrow
183
+ 2026-03-21 20:09:13 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-1975b68b541b14ab_*_of_00001.arrow
184
+ 2026-03-21 20:09:14 - INFO - __main__ - *** Initializing model kwargs ***
185
+ 2026-03-21 20:10:05 - INFO - datasets.arrow_dataset - Caching processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-6435616c9a34cd0e.arrow
186
+ 2026-03-21 20:10:06 - INFO - datasets.arrow_dataset - Caching processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-264b8deb588933bd.arrow
187
+ 2026-03-21 20:10:07 - INFO - datasets.arrow_dataset - Caching processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-ad7de21f8027b38f.arrow
188
+ 2026-03-21 20:10:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-545571e95ce7027c.arrow
189
+ 2026-03-21 20:10:47 - INFO - __main__ - *** Train ***
190
+ 2026-03-21 20:10:47 - INFO - __main__ - DeepseekV2ForCausalLM(
191
+ (model): DeepseekV2Model(
192
+ (embed_tokens): Embedding(102400, 2048)
193
+ (layers): ModuleList(
194
+ (0): DeepseekV2DecoderLayer(
195
+ (self_attn): DeepseekV2FlashAttention2(
196
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
197
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
198
+ (kv_a_layernorm): DeepseekV2RMSNorm()
199
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
200
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
201
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
202
+ )
203
+ (mlp): DeepseekV2MLP(
204
+ (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
205
+ (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
206
+ (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
207
+ (act_fn): SiLU()
208
+ )
209
+ (input_layernorm): DeepseekV2RMSNorm()
210
+ (post_attention_layernorm): DeepseekV2RMSNorm()
211
+ )
212
+ (1-26): 26 x DeepseekV2DecoderLayer(
213
+ (self_attn): DeepseekV2FlashAttention2(
214
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
215
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
216
+ (kv_a_layernorm): DeepseekV2RMSNorm()
217
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
218
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
219
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
220
+ )
221
+ (mlp): DeepseekV2MoE(
222
+ (experts): ModuleList(
223
+ (0-63): 64 x DeepseekV2MLP(
224
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
225
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
226
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
227
+ (act_fn): SiLU()
228
+ )
229
+ )
230
+ (gate): MoEGate()
231
+ (shared_experts): DeepseekV2MLP(
232
+ (gate_proj): Linear(in_features=2048, out_features=2816, bias=False)
233
+ (up_proj): Linear(in_features=2048, out_features=2816, bias=False)
234
+ (down_proj): Linear(in_features=2816, out_features=2048, bias=False)
235
+ (act_fn): SiLU()
236
+ )
237
+ )
238
+ (input_layernorm): DeepseekV2RMSNorm()
239
+ (post_attention_layernorm): DeepseekV2RMSNorm()
240
+ )
241
+ )
242
+ (norm): DeepseekV2RMSNorm()
243
+ )
244
+ (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
245
+ )
246
+ 2026-03-21 20:13:27 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-V2-Lite', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
247
+ 2026-03-21 20:13:27 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='RoxanneWsyw/ESFT-summary', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
248
+ 2026-03-21 20:13:27 - INFO - __main__ - Training parameters SFTConfig(
249
+ _n_gpu=1,
250
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
251
+ adafactor=False,
252
+ adam_beta1=0.9,
253
+ adam_beta2=0.999,
254
+ adam_epsilon=1e-08,
255
+ attn_kl_weight=1.0,
256
+ auto_find_batch_size=False,
257
+ average_tokens_across_devices=False,
258
+ batch_eval_metrics=False,
259
+ benchmarks=[],
260
+ bf16=True,
261
+ bf16_full_eval=False,
262
+ callbacks=[],
263
+ chars_per_token=<CHARS_PER_TOKEN>,
264
+ chat_template=None,
265
+ cluster_mode=hierarchical-dynamic,
266
+ cluster_num_groups=None,
267
+ cluster_prune_ratio=None,
268
+ cluster_prune_tau=1.0,
269
+ data_seed=None,
270
+ dataloader_drop_last=False,
271
+ dataloader_num_workers=0,
272
+ dataloader_persistent_workers=False,
273
+ dataloader_pin_memory=True,
274
+ dataloader_prefetch_factor=None,
275
+ dataset_batch_size=None,
276
+ dataset_kwargs=None,
277
+ dataset_num_proc=None,
278
+ dataset_text_field=text,
279
+ ddp_backend=None,
280
+ ddp_broadcast_buffers=None,
281
+ ddp_bucket_cap_mb=None,
282
+ ddp_find_unused_parameters=None,
283
+ ddp_timeout=1800000000,
284
+ debug=[],
285
+ deepspeed=None,
286
+ disable_teacher_dropout=True,
287
+ disable_tqdm=False,
288
+ dispatch_batches=None,
289
+ do_eval=True,
290
+ do_predict=False,
291
+ do_train=False,
292
+ entropy_slope_alpha=1.0,
293
+ entropy_slope_beta=1.0,
294
+ eval_accumulation_steps=None,
295
+ eval_delay=0,
296
+ eval_do_concat_batches=True,
297
+ eval_on_start=False,
298
+ eval_packing=None,
299
+ eval_steps=None,
300
+ eval_strategy=IntervalStrategy.NO,
301
+ eval_use_gather_object=False,
302
+ evaluation_strategy=None,
303
+ fp16=False,
304
+ fp16_backend=auto,
305
+ fp16_full_eval=False,
306
+ fp16_opt_level=O1,
307
+ fsdp=[],
308
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
309
+ fsdp_min_num_params=0,
310
+ fsdp_transformer_layer_cls_to_wrap=None,
311
+ full_determinism=False,
312
+ gradient_accumulation_steps=1,
313
+ gradient_checkpointing=True,
314
+ gradient_checkpointing_kwargs={'use_reentrant': False},
315
+ greater_is_better=None,
316
+ group_by_length=False,
317
+ half_precision_backend=auto,
318
+ hub_always_push=False,
319
+ hub_model_id=None,
320
+ hub_model_revision=main,
321
+ hub_private_repo=None,
322
+ hub_strategy=HubStrategy.EVERY_SAVE,
323
+ hub_token=<HUB_TOKEN>,
324
+ ignore_data_skip=False,
325
+ include_for_metrics=[],
326
+ include_inputs_for_metrics=False,
327
+ include_num_input_tokens_seen=False,
328
+ include_tokens_per_second=False,
329
+ jit_mode_eval=False,
330
+ label_names=None,
331
+ label_smoothing_factor=0.0,
332
+ last_entropy_weight=1.0,
333
+ layer_entropy_l1_layers=None,
334
+ layer_entropy_l1_weight=1.0,
335
+ learning_rate=1e-05,
336
+ length_column_name=length,
337
+ load_best_model_at_end=False,
338
+ local_rank=0,
339
+ log_level=info,
340
+ log_level_replica=warning,
341
+ log_on_each_node=True,
342
+ logging_dir=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft/runs/Mar21_20-13-25_orchard-community-1,
343
+ logging_first_step=False,
344
+ logging_nan_inf_filter=True,
345
+ logging_steps=1,
346
+ logging_strategy=IntervalStrategy.STEPS,
347
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
348
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
349
+ max_grad_norm=1.0,
350
+ max_length=4096,
351
+ max_seq_length=None,
352
+ max_steps=-1,
353
+ merging_metrics=None,
354
+ metric_for_best_model=None,
355
+ model_init_kwargs=None,
356
+ mp_parameters=,
357
+ neftune_noise_alpha=None,
358
+ no_cuda=False,
359
+ num_of_sequences=None,
360
+ num_train_epochs=1,
361
+ optim=OptimizerNames.ADAMW_TORCH,
362
+ optim_args=None,
363
+ optim_target_modules=None,
364
+ output_dir=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft,
365
+ overwrite_hub_revision=False,
366
+ overwrite_output_dir=True,
367
+ packing=False,
368
+ past_index=-1,
369
+ per_device_eval_batch_size=16,
370
+ per_device_train_batch_size=2,
371
+ prediction_loss_only=False,
372
+ push_to_hub=False,
373
+ push_to_hub_model_id=None,
374
+ push_to_hub_organization=None,
375
+ push_to_hub_revision=False,
376
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
377
+ ray_scope=last,
378
+ remove_unused_columns=True,
379
+ report_to=['wandb'],
380
+ restore_callback_states_from_checkpoint=False,
381
+ resume_from_checkpoint=None,
382
+ router_manual_mask=None,
383
+ router_prune_enable=True,
384
+ router_prune_expert_per_layer=None,
385
+ router_prune_interval=5,
386
+ router_prune_min_keep=1,
387
+ router_prune_start_step=None,
388
+ router_prune_step_size=32,
389
+ router_prune_use_plan=True,
390
+ run_name=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft,
391
+ save_on_each_node=False,
392
+ save_only_model=False,
393
+ save_safetensors=True,
394
+ save_steps=500,
395
+ save_strategy=SaveStrategy.NO,
396
+ save_total_limit=None,
397
+ seed=1234,
398
+ skip_memory_metrics=True,
399
+ split_batches=None,
400
+ system_prompt=None,
401
+ teacher_attn_implementation=None,
402
+ teacher_model_name_or_path=None,
403
+ teacher_model_revision=None,
404
+ teacher_torch_dtype=auto,
405
+ tf32=None,
406
+ torch_compile=False,
407
+ torch_compile_backend=None,
408
+ torch_compile_mode=None,
409
+ torch_empty_cache_steps=None,
410
+ torchdynamo=None,
411
+ tpu_metrics_debug=False,
412
+ tpu_num_cores=None,
413
+ use_cpu=False,
414
+ use_ipex=False,
415
+ use_legacy_prediction_loop=False,
416
+ use_liger=False,
417
+ use_liger_kernel=False,
418
+ use_mps_device=False,
419
+ wandb_entity=jayzxinkai-uc-san-diego,
420
+ wandb_project=moe-honing,
421
+ warmup_ratio=0.1,
422
+ warmup_steps=0,
423
+ weight_decay=0.0,
424
+ weight_feature_rank=None,
425
+ )
426
+ 2026-03-21 20:13:28 - INFO - datasets.builder - Found cached dataset esft-summary (/tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138)
427
+ 2026-03-21 20:13:28 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-d8c6f402c91a2432_*_of_00001.arrow
428
+ 2026-03-21 20:13:28 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-1975b68b541b14ab_*_of_00001.arrow
429
+ 2026-03-21 20:13:29 - INFO - __main__ - *** Initializing model kwargs ***
430
+ 2026-03-21 20:13:47 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-6435616c9a34cd0e_*_of_00001.arrow
431
+ 2026-03-21 20:13:47 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-264b8deb588933bd_*_of_00001.arrow
432
+ 2026-03-21 20:13:47 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-ad7de21f8027b38f_*_of_00001.arrow
433
+ 2026-03-21 20:13:47 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-545571e95ce7027c_*_of_00001.arrow
434
+ 2026-03-21 20:13:53 - INFO - __main__ - *** Train ***
435
+ 2026-03-21 20:13:53 - INFO - __main__ - DeepseekV2ForCausalLM(
436
+ (model): DeepseekV2Model(
437
+ (embed_tokens): Embedding(102400, 2048)
438
+ (layers): ModuleList(
439
+ (0): DeepseekV2DecoderLayer(
440
+ (self_attn): DeepseekV2FlashAttention2(
441
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
442
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
443
+ (kv_a_layernorm): DeepseekV2RMSNorm()
444
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
445
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
446
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
447
+ )
448
+ (mlp): DeepseekV2MLP(
449
+ (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
450
+ (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
451
+ (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
452
+ (act_fn): SiLU()
453
+ )
454
+ (input_layernorm): DeepseekV2RMSNorm()
455
+ (post_attention_layernorm): DeepseekV2RMSNorm()
456
+ )
457
+ (1-26): 26 x DeepseekV2DecoderLayer(
458
+ (self_attn): DeepseekV2FlashAttention2(
459
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
460
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
461
+ (kv_a_layernorm): DeepseekV2RMSNorm()
462
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
463
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
464
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
465
+ )
466
+ (mlp): DeepseekV2MoE(
467
+ (experts): ModuleList(
468
+ (0-63): 64 x DeepseekV2MLP(
469
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
470
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
471
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
472
+ (act_fn): SiLU()
473
+ )
474
+ )
475
+ (gate): MoEGate()
476
+ (shared_experts): DeepseekV2MLP(
477
+ (gate_proj): Linear(in_features=2048, out_features=2816, bias=False)
478
+ (up_proj): Linear(in_features=2048, out_features=2816, bias=False)
479
+ (down_proj): Linear(in_features=2816, out_features=2048, bias=False)
480
+ (act_fn): SiLU()
481
+ )
482
+ )
483
+ (input_layernorm): DeepseekV2RMSNorm()
484
+ (post_attention_layernorm): DeepseekV2RMSNorm()
485
+ )
486
+ )
487
+ (norm): DeepseekV2RMSNorm()
488
+ )
489
+ (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
490
+ )
491
+ 2026-03-21 20:15:46 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-V2-Lite', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
492
+ 2026-03-21 20:15:46 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='RoxanneWsyw/ESFT-summary', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
493
+ 2026-03-21 20:15:46 - INFO - __main__ - Training parameters SFTConfig(
494
+ _n_gpu=1,
495
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
496
+ adafactor=False,
497
+ adam_beta1=0.9,
498
+ adam_beta2=0.999,
499
+ adam_epsilon=1e-08,
500
+ attn_kl_weight=1.0,
501
+ auto_find_batch_size=False,
502
+ average_tokens_across_devices=False,
503
+ batch_eval_metrics=False,
504
+ benchmarks=[],
505
+ bf16=True,
506
+ bf16_full_eval=False,
507
+ callbacks=[],
508
+ chars_per_token=<CHARS_PER_TOKEN>,
509
+ chat_template=None,
510
+ cluster_mode=hierarchical-dynamic,
511
+ cluster_num_groups=None,
512
+ cluster_prune_ratio=None,
513
+ cluster_prune_tau=1.0,
514
+ data_seed=None,
515
+ dataloader_drop_last=False,
516
+ dataloader_num_workers=0,
517
+ dataloader_persistent_workers=False,
518
+ dataloader_pin_memory=True,
519
+ dataloader_prefetch_factor=None,
520
+ dataset_batch_size=None,
521
+ dataset_kwargs=None,
522
+ dataset_num_proc=None,
523
+ dataset_text_field=text,
524
+ ddp_backend=None,
525
+ ddp_broadcast_buffers=None,
526
+ ddp_bucket_cap_mb=None,
527
+ ddp_find_unused_parameters=None,
528
+ ddp_timeout=1800000000,
529
+ debug=[],
530
+ deepspeed=None,
531
+ disable_teacher_dropout=True,
532
+ disable_tqdm=False,
533
+ dispatch_batches=None,
534
+ do_eval=True,
535
+ do_predict=False,
536
+ do_train=False,
537
+ entropy_slope_alpha=1.0,
538
+ entropy_slope_beta=1.0,
539
+ eval_accumulation_steps=None,
540
+ eval_delay=0,
541
+ eval_do_concat_batches=True,
542
+ eval_on_start=False,
543
+ eval_packing=None,
544
+ eval_steps=None,
545
+ eval_strategy=IntervalStrategy.NO,
546
+ eval_use_gather_object=False,
547
+ evaluation_strategy=None,
548
+ fp16=False,
549
+ fp16_backend=auto,
550
+ fp16_full_eval=False,
551
+ fp16_opt_level=O1,
552
+ fsdp=[],
553
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
554
+ fsdp_min_num_params=0,
555
+ fsdp_transformer_layer_cls_to_wrap=None,
556
+ full_determinism=False,
557
+ gradient_accumulation_steps=1,
558
+ gradient_checkpointing=True,
559
+ gradient_checkpointing_kwargs={'use_reentrant': False},
560
+ greater_is_better=None,
561
+ group_by_length=False,
562
+ half_precision_backend=auto,
563
+ hub_always_push=False,
564
+ hub_model_id=None,
565
+ hub_model_revision=main,
566
+ hub_private_repo=None,
567
+ hub_strategy=HubStrategy.EVERY_SAVE,
568
+ hub_token=<HUB_TOKEN>,
569
+ ignore_data_skip=False,
570
+ include_for_metrics=[],
571
+ include_inputs_for_metrics=False,
572
+ include_num_input_tokens_seen=False,
573
+ include_tokens_per_second=False,
574
+ jit_mode_eval=False,
575
+ label_names=None,
576
+ label_smoothing_factor=0.0,
577
+ last_entropy_weight=1.0,
578
+ layer_entropy_l1_layers=None,
579
+ layer_entropy_l1_weight=1.0,
580
+ learning_rate=1e-05,
581
+ length_column_name=length,
582
+ load_best_model_at_end=False,
583
+ local_rank=0,
584
+ log_level=info,
585
+ log_level_replica=warning,
586
+ log_on_each_node=True,
587
+ logging_dir=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft/runs/Mar21_20-15-44_orchard-community-1,
588
+ logging_first_step=False,
589
+ logging_nan_inf_filter=True,
590
+ logging_steps=1,
591
+ logging_strategy=IntervalStrategy.STEPS,
592
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
593
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
594
+ max_grad_norm=1.0,
595
+ max_length=4096,
596
+ max_seq_length=None,
597
+ max_steps=-1,
598
+ merging_metrics=None,
599
+ metric_for_best_model=None,
600
+ model_init_kwargs=None,
601
+ mp_parameters=,
602
+ neftune_noise_alpha=None,
603
+ no_cuda=False,
604
+ num_of_sequences=None,
605
+ num_train_epochs=1,
606
+ optim=OptimizerNames.ADAMW_TORCH,
607
+ optim_args=None,
608
+ optim_target_modules=None,
609
+ output_dir=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft,
610
+ overwrite_hub_revision=False,
611
+ overwrite_output_dir=True,
612
+ packing=False,
613
+ past_index=-1,
614
+ per_device_eval_batch_size=16,
615
+ per_device_train_batch_size=2,
616
+ prediction_loss_only=False,
617
+ push_to_hub=False,
618
+ push_to_hub_model_id=None,
619
+ push_to_hub_organization=None,
620
+ push_to_hub_revision=False,
621
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
622
+ ray_scope=last,
623
+ remove_unused_columns=True,
624
+ report_to=['wandb'],
625
+ restore_callback_states_from_checkpoint=False,
626
+ resume_from_checkpoint=None,
627
+ router_manual_mask=None,
628
+ router_prune_enable=True,
629
+ router_prune_expert_per_layer=None,
630
+ router_prune_interval=5,
631
+ router_prune_min_keep=1,
632
+ router_prune_start_step=None,
633
+ router_prune_step_size=32,
634
+ router_prune_use_plan=True,
635
+ run_name=/project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft,
636
+ save_on_each_node=False,
637
+ save_only_model=False,
638
+ save_safetensors=True,
639
+ save_steps=500,
640
+ save_strategy=SaveStrategy.NO,
641
+ save_total_limit=None,
642
+ seed=1234,
643
+ skip_memory_metrics=True,
644
+ split_batches=None,
645
+ system_prompt=None,
646
+ teacher_attn_implementation=None,
647
+ teacher_model_name_or_path=None,
648
+ teacher_model_revision=None,
649
+ teacher_torch_dtype=auto,
650
+ tf32=None,
651
+ torch_compile=False,
652
+ torch_compile_backend=None,
653
+ torch_compile_mode=None,
654
+ torch_empty_cache_steps=None,
655
+ torchdynamo=None,
656
+ tpu_metrics_debug=False,
657
+ tpu_num_cores=None,
658
+ use_cpu=False,
659
+ use_ipex=False,
660
+ use_legacy_prediction_loop=False,
661
+ use_liger=False,
662
+ use_liger_kernel=False,
663
+ use_mps_device=False,
664
+ wandb_entity=jayzxinkai-uc-san-diego,
665
+ wandb_project=moe-honing,
666
+ warmup_ratio=0.1,
667
+ warmup_steps=0,
668
+ weight_decay=0.0,
669
+ weight_feature_rank=None,
670
+ )
671
+ 2026-03-21 20:15:47 - INFO - datasets.builder - Found cached dataset esft-summary (/tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138)
672
+ 2026-03-21 20:15:47 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-d8c6f402c91a2432_*_of_00001.arrow
673
+ 2026-03-21 20:15:47 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-1975b68b541b14ab_*_of_00001.arrow
674
+ 2026-03-21 20:15:48 - INFO - __main__ - *** Initializing model kwargs ***
675
+ 2026-03-21 20:16:06 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-6435616c9a34cd0e_*_of_00001.arrow
676
+ 2026-03-21 20:16:06 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-264b8deb588933bd_*_of_00001.arrow
677
+ 2026-03-21 20:16:06 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-ad7de21f8027b38f_*_of_00001.arrow
678
+ 2026-03-21 20:16:06 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /tmp/hf_cache/datasets/RoxanneWsyw___esft-summary/default/0.0.0/70d8f41993d8681cb5ccb26c656f4b9f5e0f8138/cache-545571e95ce7027c_*_of_00001.arrow
679
+ 2026-03-21 20:16:12 - INFO - __main__ - *** Train ***
680
+ 2026-03-21 20:16:12 - INFO - __main__ - DeepseekV2ForCausalLM(
681
+ (model): DeepseekV2Model(
682
+ (embed_tokens): Embedding(102400, 2048)
683
+ (layers): ModuleList(
684
+ (0): DeepseekV2DecoderLayer(
685
+ (self_attn): DeepseekV2FlashAttention2(
686
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
687
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
688
+ (kv_a_layernorm): DeepseekV2RMSNorm()
689
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
690
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
691
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
692
+ )
693
+ (mlp): DeepseekV2MLP(
694
+ (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
695
+ (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
696
+ (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
697
+ (act_fn): SiLU()
698
+ )
699
+ (input_layernorm): DeepseekV2RMSNorm()
700
+ (post_attention_layernorm): DeepseekV2RMSNorm()
701
+ )
702
+ (1-26): 26 x DeepseekV2DecoderLayer(
703
+ (self_attn): DeepseekV2FlashAttention2(
704
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
705
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
706
+ (kv_a_layernorm): DeepseekV2RMSNorm()
707
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
708
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
709
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
710
+ )
711
+ (mlp): DeepseekV2MoE(
712
+ (experts): ModuleList(
713
+ (0-63): 64 x DeepseekV2MLP(
714
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
715
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
716
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
717
+ (act_fn): SiLU()
718
+ )
719
+ )
720
+ (gate): MoEGate()
721
+ (shared_experts): DeepseekV2MLP(
722
+ (gate_proj): Linear(in_features=2048, out_features=2816, bias=False)
723
+ (up_proj): Linear(in_features=2048, out_features=2816, bias=False)
724
+ (down_proj): Linear(in_features=2816, out_features=2048, bias=False)
725
+ (act_fn): SiLU()
726
+ )
727
+ )
728
+ (input_layernorm): DeepseekV2RMSNorm()
729
+ (post_attention_layernorm): DeepseekV2RMSNorm()
730
+ )
731
+ )
732
+ (norm): DeepseekV2RMSNorm()
733
+ )
734
+ (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
735
+ )
736
+ 2026-03-21 22:39:38 - INFO - __main__ - *** Save model ***
737
+ 2026-03-21 22:41:57 - INFO - __main__ - Model saved to /project/flame/haozeh/llm-honing/sft_models/deepseek-summary-sft
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddef2d3e003e6daaba60e27c15bbfbcec4a6fad8cab4a14f3376255328e32b50
3
+ size 8504