AlbertTan committed
Commit 07ea58f · verified · 1 Parent(s): 22d130f

Upload 17 files

logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:257e9b813b798f69f816f7e3d0ff9221f8721b61f29883f912c5f60df9d32a4c
+ size 54578518
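
The checkpoint and TensorBoard event entries in this commit are Git LFS pointer files: the repository only stores the version/oid/size triplet above, and the binary payload is fetched separately (e.g. with `git lfs pull`). Below is a minimal sketch, assuming the checkpoint has already been materialized locally, for checking the downloaded file against the pointer's size and sha256:

```python
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream the file so a ~54 MB checkpoint never has to fit in memory at once."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Values copied from the LFS pointer above.
ckpt = Path("logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt")
expected_oid = "257e9b813b798f69f816f7e3d0ff9221f8721b61f29883f912c5f60df9d32a4c"
expected_size = 54578518

assert ckpt.stat().st_size == expected_size, "size mismatch: file may still be an LFS pointer"
assert sha256_of(ckpt) == expected_oid, "sha256 mismatch: download may be corrupted"
```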
logs/cot/qsa-gsm/llama-1b-cot/checkpoints/last.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:241221c54f52ee820c7f7778459de032ea02650e08fff49e309821394fb6ba66
+ size 54578518
logs/cot/qsa-gsm/llama-1b-cot/events.out.tfevents.1746637111.tj-3008206-g-1b-cot-bs256-copy-master-0.46.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fea2c0aab04baaa84c9d9e9b61ffc95086ebd5d56ac7fc18aa881934a78c1ba2
+ size 201157
logs/cot/qsa-gsm/llama-1b-cot/hparams.yaml ADDED
@@ -0,0 +1,94 @@
+ all_config:
+   trainer:
+     target: lightning.pytorch.trainer.Trainer
+     devices:
+     - 0
+     - 1
+     - 2
+     - 3
+     - 4
+     - 5
+     - 6
+     - 7
+     max_steps: -1
+     check_val_every_n_epoch: 1
+     log_every_n_steps: 10
+     num_sanity_val_steps: 2
+     gradient_clip_val: null
+     reload_dataloaders_every_n_epochs: 0
+     accumulate_grad_batches: 1
+     precision: bf16-mixed
+     use_distributed_sampler: true
+     strategy: auto
+     logger:
+       target: lightning.pytorch.loggers.TensorBoardLogger
+       save_dir: logs/cot
+       name: qsa-gsm
+       version: g-1b-cot-bs256
+     max_epochs: 50
+   callbacks:
+   - target: lightning.pytorch.callbacks.ModelCheckpoint
+     save_last: true
+     save_top_k: 3
+     mode: max
+     monitor: monitor
+     auto_insert_metric_name: false
+     filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
+     save_weights_only: true
+   seed: null
+   model:
+     target: src.models.cot.LitCoT
+     model_kwargs:
+       model_id: Llama-3.2-1B-Instruct
+       depth: 1
+       sft_method: cot
+       do_lora: true
+       lora_config:
+         r: 128
+         lora_alpha: 32
+       answer_generation_config:
+         max_new_tokens: 128
+         do_sample: true
+         top_p: 0.9
+         temperature: 1.0
+     training_kwargs:
+       optimizer:
+         target: torch.optim.AdamW
+         lr: 0.0001
+         weight_decay: 0.01
+       use_scheduler: false
+       scheduler:
+         target: constant_schedule_with_warmup
+         warmup_steps: 1000
+   dataloader:
+     batch_size: 32
+     val_batch_size: 32
+     num_workers: 32
+     pin_memory: true
+     persistent_workers: true
+   data_module:
+     target: src.datasets.qsa.QSADataModule
+     dataset_name: gsm
+     tiny_dataset: false
+     epoch_scaling: 1
+   args:
+     model: cot
+     dataset: qsa
+     trainer: default
+     devices: all
+     no_log: false
+     log_suffix: g-1b-cot-bs256
+     resume_ckpt_path: null
+     load_ckpt_path: null
+     workspace_path: /workspace/images-ks3-starfs-hd/workspace/wenhui
+     do_test: true
+     test_ckpt_path: ''
+     test_times: 1
+     seed: 0
+   unkown_args:
+     dataset_name: gsm
+     model_id: Llama-3.2-1B-Instruct
+     sft_method: cot
+     batch_size: '256'
+     precision: bf16-mixed
+     max_new_tokens: '128'
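
Throughout hparams.yaml, each component is described by a dotted `target:` import path plus its keyword arguments as sibling keys. The project's own loader is not part of this upload, so the helper below is only a hypothetical sketch of how such a block could be turned into an object; `instantiate` is an illustrative name, not the repository's API:

```python
import importlib
from typing import Any

def instantiate(cfg: dict, **overrides: Any) -> Any:
    """Import the class named by `target` and call it with the remaining keys."""
    module_path, _, class_name = cfg["target"].rpartition(".")
    cls = getattr(importlib.import_module(module_path), class_name)
    kwargs = {k: v for k, v in cfg.items() if k != "target"}
    kwargs.update(overrides)
    return cls(**kwargs)

# e.g. the logger block above:
# logger = instantiate({"target": "lightning.pytorch.loggers.TensorBoardLogger",
#                       "save_dir": "logs/cot", "name": "qsa-gsm",
#                       "version": "g-1b-cot-bs256"})
```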
logs/cot/qsa-gsm/llama-1b-cot/train.json ADDED
The diff for this file is too large to render. See raw diff
 
logs/cot/qsa-math/llama1b/checkpoints/epoch7__step1760__monitor0.170.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ccd12db32aff29ad5dce8ca9b6ebb65ac1a6a79909a180b7818cc5a44806d96
+ size 54583702
logs/cot/qsa-math/llama1b/events.out.tfevents.1746899762.tj-3008206-math-1b-cot-master-0.46.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b86d1b79e05922f704e9becc3dc749bd05c3fb538dcf48c147436985a78e60d4
+ size 77119
logs/cot/qsa-math/llama1b/events.out.tfevents.1746902039.tj-3008206-math-1b-cot-master-0.46.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9c4ec9e8af49ac8cd8d9ac6bc813a27a27ebd86056c5c1091b20a1247bb2e1a
+ size 16672
logs/cot/qsa-math/llama1b/hparams.yaml ADDED
@@ -0,0 +1,105 @@
+ all_config:
+   trainer:
+     target: lightning.pytorch.trainer.Trainer
+     devices:
+     - 0
+     - 1
+     - 2
+     - 3
+     - 4
+     - 5
+     - 6
+     - 7
+     max_steps: -1
+     check_val_every_n_epoch: 1
+     log_every_n_steps: 10
+     num_sanity_val_steps: 2
+     gradient_clip_val: null
+     reload_dataloaders_every_n_epochs: 0
+     accumulate_grad_batches: 1
+     precision: bf16-mixed
+     use_distributed_sampler: true
+     strategy: auto
+     logger:
+       target: lightning.pytorch.loggers.TensorBoardLogger
+       save_dir: logs/cot
+       name: qsa-math
+       version: llama1b
+     max_epochs: 25
+   callbacks:
+   - target: lightning.pytorch.callbacks.ModelCheckpoint
+     save_last: true
+     save_top_k: 3
+     mode: max
+     monitor: monitor
+     auto_insert_metric_name: false
+     filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
+     save_weights_only: true
+   seed: null
+   model:
+     target: src.models.cot.LitCoT
+     model_kwargs:
+       model_id: Llama-3.2-1B-Instruct
+       depth: 1
+       sft_method: cot
+       set_pad_as_last_token: false
+       do_lora: true
+       lora_config:
+         r: 128
+         lora_alpha: 32
+       answer_generation_config:
+         max_new_tokens: 1024
+         do_sample: true
+         top_p: 0.9
+         temperature: 1.0
+       do_rl: false
+       rl_config:
+         filter_dataset: false
+         exp_batch_size: 8
+         group_size: 8
+         punish_latent_length: true
+         clip_grad_norm: 1.0
+         clip_eps: 0.2
+         use_latent_loss: true
+         use_answer_loss: true
+         n_train_samples_per_epoch: 4096
+     training_kwargs:
+       optimizer:
+         target: torch.optim.AdamW
+         lr: 0.0001
+         weight_decay: 0.01
+       use_scheduler: false
+       scheduler:
+         target: constant_schedule_with_warmup
+         warmup_steps: 1000
+   dataloader:
+     batch_size: 4
+     val_batch_size: 32
+     num_workers: 32
+     pin_memory: true
+     persistent_workers: true
+   data_module:
+     target: src.datasets.qsa.QSADataModule
+     dataset_name: math
+     tiny_dataset: false
+     epoch_scaling: 1
+   args:
+     model: latent_cot
+     dataset: qsa
+     trainer: default
+     devices: all
+     no_log: false
+     log_suffix: llama1b
+     resume_ckpt_path: null
+     load_ckpt_path: null
+     workspace_path:
+     do_test: true
+     test_ckpt_path: ''
+     test_times: 1
+     seed: 0
+   unkown_args:
+     dataset_name: math
+     model_id: Llama-3.2-1B-Instruct
+     batch_size: '32'
+     sft_method: cot
+     max_new_tokens: '1024'
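
The `callbacks` block maps one-to-one onto Lightning's ModelCheckpoint API, and its `filename` template explains the checkpoint names in this upload (e.g. epoch7__step1760__monitor0.170.ckpt). A short sketch of the equivalent construction, assuming a plain `lightning` install:

```python
from lightning.pytorch.callbacks import ModelCheckpoint

# Mirrors the callbacks entry in hparams.yaml: keep the 3 best checkpoints by the
# "monitor" metric (higher is better), plus last.ckpt, saving weights only.
checkpoint_cb = ModelCheckpoint(
    save_last=True,
    save_top_k=3,
    mode="max",
    monitor="monitor",
    auto_insert_metric_name=False,
    filename="epoch{epoch}__step{step}__monitor{monitor:.3f}",
    save_weights_only=True,
)
# With epoch=7, step=1760, monitor=0.170 this template yields
# "epoch7__step1760__monitor0.170.ckpt", matching the checkpoint file above.
```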
logs/cot/qsa-math/llama1b/log.txt ADDED
@@ -0,0 +1,135 @@
+ 20250511-015608:
+ Start training with model:
+ LitLatentReasoning(
+   (llm): PeftModel(
+     (base_model): LoraModel(
+       (model): LlamaForCausalLM(
+         (model): LlamaModel(
+           (embed_tokens): Embedding(128257, 2048)
+           (layers): ModuleList(
+             (0-15): 16 x LlamaDecoderLayer(
+               (self_attn): LlamaSdpaAttention(
+                 (q_proj): lora.Linear(
+                   (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=2048, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=2048, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (k_proj): Linear(in_features=2048, out_features=512, bias=False)
+                 (v_proj): lora.Linear(
+                   (base_layer): Linear(in_features=2048, out_features=512, bias=False)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=2048, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=512, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                 (rotary_emb): LlamaRotaryEmbedding()
+               )
+               (mlp): LlamaMLP(
+                 (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
+                 (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
+                 (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
+                 (act_fn): SiLU()
+               )
+               (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
+               (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
+             )
+           )
+           (norm): LlamaRMSNorm((2048,), eps=1e-05)
+           (rotary_emb): LlamaRotaryEmbedding()
+         )
+         (lm_head): Linear(in_features=2048, out_features=128257, bias=False)
+       )
+     )
+   )
+   (embedding): Embedding(128257, 2048)
+ )
+ config:
+ {'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-015422_637954_math-1b-cot-bs64'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'Llama-3.2-1B-Instruct', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-1b-cot-bs64', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'Llama-3.2-1B-Instruct', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}
+ 20250511-023400:
+ Start testing with model:
+ LitLatentReasoning(
+   (llm): PeftModel(
+     (base_model): LoraModel(
+       (model): LlamaForCausalLM(
+         (model): LlamaModel(
+           (embed_tokens): Embedding(128257, 2048)
+           (layers): ModuleList(
+             (0-15): 16 x LlamaDecoderLayer(
+               (self_attn): LlamaSdpaAttention(
+                 (q_proj): lora.Linear(
+                   (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=2048, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=2048, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (k_proj): Linear(in_features=2048, out_features=512, bias=False)
+                 (v_proj): lora.Linear(
+                   (base_layer): Linear(in_features=2048, out_features=512, bias=False)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=2048, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=512, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                 (rotary_emb): LlamaRotaryEmbedding()
+               )
+               (mlp): LlamaMLP(
+                 (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
+                 (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
+                 (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
+                 (act_fn): SiLU()
+               )
+               (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
+               (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
+             )
+           )
+           (norm): LlamaRMSNorm((2048,), eps=1e-05)
+           (rotary_emb): LlamaRotaryEmbedding()
+         )
+         (lm_head): Linear(in_features=2048, out_features=128257, bias=False)
+       )
+     )
+   )
+   (embedding): Embedding(128257, 2048)
+ )
+ config:
+ {'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-015422_637954_math-1b-cot-bs64'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'Llama-3.2-1B-Instruct', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-1b-cot-bs64', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'Llama-3.2-1B-Instruct', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}.
+ 20250511-024009:
+ Test results: defaultdict(<class 'list'>, {'monitor': [0.1034201979637146], 'test/acc': [0.1034201979637146], 'test/n_latent_forward': [211.8072052001953], 'test/output_length': [226.03562927246094]})
+ Test statistics with 1 replications: {'monitor': (0.1034201979637146, 0.0), 'test/acc': (0.1034201979637146, 0.0), 'test/n_latent_forward': (211.8072052001953, 0.0), 'test/output_length': (226.03562927246094, 0.0)}
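
The final log entries aggregate the test metrics over `test_times` replications into (mean, std) pairs; with a single replication the std is 0.0, as shown above. A small sketch of that aggregation (the helper name is illustrative, not the project's code):

```python
from collections import defaultdict
from statistics import mean, pstdev

def summarize(results: dict) -> dict:
    """Reduce per-replication metric lists to (mean, population std) pairs."""
    return {k: (mean(v), pstdev(v)) for k, v in results.items()}

results = defaultdict(list)
results["monitor"].append(0.1034201979637146)   # from the single test run above
results["test/acc"].append(0.1034201979637146)
print(summarize(results))  # {'monitor': (0.1034..., 0.0), 'test/acc': (0.1034..., 0.0)}
```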
logs/cot/qsa-math/llama1b/train.json ADDED
The diff for this file is too large to render. See raw diff
 
logs/cot/qsa-math/qw1b/checkpoints/epoch9__step2080__monitor0.247.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:74d7eb5644e9adce03ddc0fbe41a3a180a65c4dd9510ec9d770b96b47445113f
+ size 69805714
logs/cot/qsa-math/qw1b/events.out.tfevents.1746903255.tj-3008206-math-qw1b-cot-bs32-master-0.46.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f7f40bd90676bc1f402258b3b2d741f03834fc5bd460dc6fac04526a013c229
+ size 74177
logs/cot/qsa-math/qw1b/events.out.tfevents.1746907319.tj-3008206-math-qw1b-cot-bs32-master-0.46.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27faf06bbc0591eae709d6ab140f04e04ba0d2882f9e1f2584789b1f99fee35a
+ size 16700
logs/cot/qsa-math/qw1b/hparams.yaml ADDED
@@ -0,0 +1,94 @@
+ all_config:
+   trainer:
+     target: lightning.pytorch.trainer.Trainer
+     devices:
+     - 0
+     - 1
+     - 2
+     - 3
+     - 4
+     - 5
+     - 6
+     - 7
+     max_steps: -1
+     check_val_every_n_epoch: 1
+     log_every_n_steps: 10
+     num_sanity_val_steps: 2
+     gradient_clip_val: null
+     reload_dataloaders_every_n_epochs: 0
+     accumulate_grad_batches: 1
+     precision: bf16-mixed
+     use_distributed_sampler: true
+     strategy: auto
+     logger:
+       target: lightning.pytorch.loggers.TensorBoardLogger
+       save_dir: logs/cot
+       name: qsa-math
+       version: qw1b
+     max_epochs: 25
+   callbacks:
+   - target: lightning.pytorch.callbacks.ModelCheckpoint
+     save_last: true
+     save_top_k: 3
+     mode: max
+     monitor: monitor
+     auto_insert_metric_name: false
+     filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
+     save_weights_only: true
+   seed: null
+   model:
+     target: src.models.latent_cot.LitLatentReasoning
+     model_kwargs:
+       model_id: DeepSeek-R1-Distill-Qwen-1.5B
+       depth: 1
+       sft_method: cot
+       set_pad_as_last_token: false
+       do_lora: true
+       lora_config:
+         r: 128
+         lora_alpha: 32
+       answer_generation_config:
+         max_new_tokens: 1024
+         do_sample: true
+         top_p: 0.9
+         temperature: 1.0
+     training_kwargs:
+       optimizer:
+         target: torch.optim.AdamW
+         lr: 0.0001
+         weight_decay: 0.01
+       use_scheduler: false
+       scheduler:
+         target: constant_schedule_with_warmup
+         warmup_steps: 1000
+   dataloader:
+     batch_size: 4
+     val_batch_size: 32
+     num_workers: 32
+     pin_memory: true
+     persistent_workers: true
+   data_module:
+     target: src.datasets.qsa.QSADataModule
+     dataset_name: math
+     tiny_dataset: false
+     epoch_scaling: 1
+   args:
+     model: cot
+     dataset: qsa
+     trainer: default
+     devices: all
+     no_log: false
+     log_suffix: qw1b
+     resume_ckpt_path: null
+     load_ckpt_path: null
+     workspace_path: /workspace/images-ks3-starfs-hd/workspace/wenhui
+     do_test: true
+     test_ckpt_path: ''
+     test_times: 1
+     seed: 0
+   unkown_args:
+     dataset_name: math
+     model_id: DeepSeek-R1-Distill-Qwen-1.5B
+     batch_size: '32'
+     sft_method: cot
+     max_new_tokens: '1024'
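
Across all three runs, `do_lora: true` with `r: 128` and `lora_alpha: 32` matches the adapters visible in the model dumps, where q_proj and v_proj are wrapped as lora.Linear. A hedged sketch of the equivalent PEFT setup; the target-module list is inferred from the printed architecture rather than read from the project's code:

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

# r / lora_alpha taken from lora_config above; q_proj and v_proj are the modules
# that appear as lora.Linear in the logged model summaries.
lora_cfg = LoraConfig(r=128, lora_alpha=32, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()
```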
logs/cot/qsa-math/qw1b/log.txt ADDED
@@ -0,0 +1,135 @@
+ 20250511-025424:
+ Start training with model:
+ LitLatentReasoning(
+   (llm): PeftModel(
+     (base_model): LoraModel(
+       (model): Qwen2ForCausalLM(
+         (model): Qwen2Model(
+           (embed_tokens): Embedding(151666, 1536)
+           (layers): ModuleList(
+             (0-27): 28 x Qwen2DecoderLayer(
+               (self_attn): Qwen2SdpaAttention(
+                 (q_proj): lora.Linear(
+                   (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=1536, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=1536, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (k_proj): Linear(in_features=1536, out_features=256, bias=True)
+                 (v_proj): lora.Linear(
+                   (base_layer): Linear(in_features=1536, out_features=256, bias=True)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=1536, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=256, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
+                 (rotary_emb): Qwen2RotaryEmbedding()
+               )
+               (mlp): Qwen2MLP(
+                 (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
+                 (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
+                 (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
+                 (act_fn): SiLU()
+               )
+               (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
+               (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
+             )
+           )
+           (norm): Qwen2RMSNorm((1536,), eps=1e-06)
+           (rotary_emb): Qwen2RotaryEmbedding()
+         )
+         (lm_head): Linear(in_features=1536, out_features=151666, bias=False)
+       )
+     )
+   )
+   (embedding): Embedding(151666, 1536)
+ )
+ config:
+ {'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-025110_355317_math-qw1b-cot-bs32'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-qw1b-cot-bs32', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}
+ 20250511-040159:
+ Start testing with model:
+ LitLatentReasoning(
+   (llm): PeftModel(
+     (base_model): LoraModel(
+       (model): Qwen2ForCausalLM(
+         (model): Qwen2Model(
+           (embed_tokens): Embedding(151666, 1536)
+           (layers): ModuleList(
+             (0-27): 28 x Qwen2DecoderLayer(
+               (self_attn): Qwen2SdpaAttention(
+                 (q_proj): lora.Linear(
+                   (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=1536, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=1536, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (k_proj): Linear(in_features=1536, out_features=256, bias=True)
+                 (v_proj): lora.Linear(
+                   (base_layer): Linear(in_features=1536, out_features=256, bias=True)
+                   (lora_dropout): ModuleDict(
+                     (default): Identity()
+                   )
+                   (lora_A): ModuleDict(
+                     (default): Linear(in_features=1536, out_features=128, bias=False)
+                   )
+                   (lora_B): ModuleDict(
+                     (default): Linear(in_features=128, out_features=256, bias=False)
+                   )
+                   (lora_embedding_A): ParameterDict()
+                   (lora_embedding_B): ParameterDict()
+                   (lora_magnitude_vector): ModuleDict()
+                 )
+                 (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
+                 (rotary_emb): Qwen2RotaryEmbedding()
+               )
+               (mlp): Qwen2MLP(
+                 (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
+                 (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
+                 (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
+                 (act_fn): SiLU()
+               )
+               (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
+               (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
+             )
+           )
+           (norm): Qwen2RMSNorm((1536,), eps=1e-06)
+           (rotary_emb): Qwen2RotaryEmbedding()
+         )
+         (lm_head): Linear(in_features=1536, out_features=151666, bias=False)
+       )
+     )
+   )
+   (embedding): Embedding(151666, 1536)
+ )
+ config:
+ {'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-025110_355317_math-qw1b-cot-bs32'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-qw1b-cot-bs32', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}.
+ 20250511-041020:
+ Test results: defaultdict(<class 'list'>, {'monitor': [0.2351384311914444], 'test/acc': [0.2351384311914444], 'test/n_latent_forward': [209.3857879638672], 'test/output_length': [223.18994140625]})
+ Test statistics with 1 replications: {'monitor': (0.2351384311914444, 0.0), 'test/acc': (0.2351384311914444, 0.0), 'test/n_latent_forward': (209.3857879638672, 0.0), 'test/output_length': (223.18994140625, 0.0)}
logs/cot/qsa-math/qw1b/train.json ADDED
The diff for this file is too large to render. See raw diff