Upload 17 files
- logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt +3 -0
- logs/cot/qsa-gsm/llama-1b-cot/checkpoints/last.ckpt +3 -0
- logs/cot/qsa-gsm/llama-1b-cot/events.out.tfevents.1746637111.tj-3008206-g-1b-cot-bs256-copy-master-0.46.0 +3 -0
- logs/cot/qsa-gsm/llama-1b-cot/hparams.yaml +94 -0
- logs/cot/qsa-gsm/llama-1b-cot/train.json +0 -0
- logs/cot/qsa-math/llama1b/checkpoints/epoch7__step1760__monitor0.170.ckpt +3 -0
- logs/cot/qsa-math/llama1b/events.out.tfevents.1746899762.tj-3008206-math-1b-cot-master-0.46.0 +3 -0
- logs/cot/qsa-math/llama1b/events.out.tfevents.1746902039.tj-3008206-math-1b-cot-master-0.46.1 +3 -0
- logs/cot/qsa-math/llama1b/hparams.yaml +105 -0
- logs/cot/qsa-math/llama1b/log.txt +135 -0
- logs/cot/qsa-math/llama1b/train.json +0 -0
- logs/cot/qsa-math/qw1b/checkpoints/epoch9__step2080__monitor0.247.ckpt +3 -0
- logs/cot/qsa-math/qw1b/events.out.tfevents.1746903255.tj-3008206-math-qw1b-cot-bs32-master-0.46.0 +3 -0
- logs/cot/qsa-math/qw1b/events.out.tfevents.1746907319.tj-3008206-math-qw1b-cot-bs32-master-0.46.1 +3 -0
- logs/cot/qsa-math/qw1b/hparams.yaml +94 -0
- logs/cot/qsa-math/qw1b/log.txt +135 -0
- logs/cot/qsa-math/qw1b/train.json +0 -0
logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:257e9b813b798f69f816f7e3d0ff9221f8721b61f29883f912c5f60df9d32a4c
size 54578518
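The checkpoint and event files in this upload are stored as Git LFS pointers, so each diff shows only the three-line pointer (version, oid, size) rather than the binary payload. As a minimal sketch, a pointer like the one above can be parsed into its fields like this (the helper name is illustrative, not part of the repo):

```python
from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = read_lfs_pointer("logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt")
print(ptr["oid"], int(ptr["size"]))  # prints the object id and the payload size in bytes
```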
logs/cot/qsa-gsm/llama-1b-cot/checkpoints/last.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:241221c54f52ee820c7f7778459de032ea02650e08fff49e309821394fb6ba66
size 54578518
logs/cot/qsa-gsm/llama-1b-cot/events.out.tfevents.1746637111.tj-3008206-g-1b-cot-bs256-copy-master-0.46.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:fea2c0aab04baaa84c9d9e9b61ffc95086ebd5d56ac7fc18aa881934a78c1ba2
size 201157
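The events.out.tfevents.* files are standard TensorBoard event logs written by the TensorBoardLogger configured in the hparams below. A minimal sketch of reading the logged scalars back once the file has been pulled from LFS; the tag name "monitor" is an assumption based on the metric monitored by the checkpoint callback:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Point the accumulator at the run directory (or at a single event file).
acc = EventAccumulator("logs/cot/qsa-gsm/llama-1b-cot")
acc.Reload()

print(acc.Tags()["scalars"])          # scalar tags actually present in the log
for event in acc.Scalars("monitor"):  # assumed tag name
    print(event.step, event.value)
```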
logs/cot/qsa-gsm/llama-1b-cot/hparams.yaml
ADDED
all_config:
  trainer:
    target: lightning.pytorch.trainer.Trainer
    devices:
    - 0
    - 1
    - 2
    - 3
    - 4
    - 5
    - 6
    - 7
    max_steps: -1
    check_val_every_n_epoch: 1
    log_every_n_steps: 10
    num_sanity_val_steps: 2
    gradient_clip_val: null
    reload_dataloaders_every_n_epochs: 0
    accumulate_grad_batches: 1
    precision: bf16-mixed
    use_distributed_sampler: true
    strategy: auto
    logger:
      target: lightning.pytorch.loggers.TensorBoardLogger
      save_dir: logs/cot
      name: qsa-gsm
      version: g-1b-cot-bs256
    max_epochs: 50
  callbacks:
  - target: lightning.pytorch.callbacks.ModelCheckpoint
    save_last: true
    save_top_k: 3
    mode: max
    monitor: monitor
    auto_insert_metric_name: false
    filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
    save_weights_only: true
  seed: null
  model:
    target: src.models.cot.LitCoT
    model_kwargs:
      model_id: Llama-3.2-1B-Instruct
      depth: 1
      sft_method: cot
      do_lora: true
      lora_config:
        r: 128
        lora_alpha: 32
      answer_generation_config:
        max_new_tokens: 128
        do_sample: true
        top_p: 0.9
        temperature: 1.0
    training_kwargs:
      optimizer:
        target: torch.optim.AdamW
        lr: 0.0001
        weight_decay: 0.01
      use_scheduler: false
      scheduler:
        target: constant_schedule_with_warmup
        warmup_steps: 1000
  dataloader:
    batch_size: 32
    val_batch_size: 32
    num_workers: 32
    pin_memory: true
    persistent_workers: true
  data_module:
    target: src.datasets.qsa.QSADataModule
    dataset_name: gsm
    tiny_dataset: false
    epoch_scaling: 1
  args:
    model: cot
    dataset: qsa
    trainer: default
    devices: all
    no_log: false
    log_suffix: g-1b-cot-bs256
    resume_ckpt_path: null
    load_ckpt_path: null
    workspace_path: /workspace/images-ks3-starfs-hd/workspace/wenhui
    do_test: true
    test_ckpt_path: ''
    test_times: 1
    seed: 0
  unkown_args:
    dataset_name: gsm
    model_id: Llama-3.2-1B-Instruct
    sft_method: cot
    batch_size: '256'
    precision: bf16-mixed
    max_new_tokens: '128'
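The hparams above are a dump of the run configuration; the `logger` and `callbacks` blocks map one-to-one onto Lightning objects, and the `filename` pattern is what yields checkpoint names such as `epoch7__step12056__monitor0.560.ckpt`. A minimal sketch of that mapping (how the repo actually resolves the `target:` strings is not shown here):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

# Values copied from the hparams.yaml above.
logger = TensorBoardLogger(save_dir="logs/cot", name="qsa-gsm", version="g-1b-cot-bs256")
checkpoint = ModelCheckpoint(
    save_last=True,
    save_top_k=3,
    mode="max",
    monitor="monitor",
    auto_insert_metric_name=False,
    filename="epoch{epoch}__step{step}__monitor{monitor:.3f}",
    save_weights_only=True,
)
trainer = Trainer(
    devices=[0, 1, 2, 3, 4, 5, 6, 7],
    max_epochs=50,
    precision="bf16-mixed",
    accumulate_grad_batches=1,
    log_every_n_steps=10,
    logger=logger,
    callbacks=[checkpoint],
)
```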
logs/cot/qsa-gsm/llama-1b-cot/train.json
ADDED
The diff for this file is too large to render. See raw diff.
logs/cot/qsa-math/llama1b/checkpoints/epoch7__step1760__monitor0.170.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:0ccd12db32aff29ad5dce8ca9b6ebb65ac1a6a79909a180b7818cc5a44806d96
size 54583702
logs/cot/qsa-math/llama1b/events.out.tfevents.1746899762.tj-3008206-math-1b-cot-master-0.46.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b86d1b79e05922f704e9becc3dc749bd05c3fb538dcf48c147436985a78e60d4
size 77119
logs/cot/qsa-math/llama1b/events.out.tfevents.1746902039.tj-3008206-math-1b-cot-master-0.46.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f9c4ec9e8af49ac8cd8d9ac6bc813a27a27ebd86056c5c1091b20a1247bb2e1a
size 16672
logs/cot/qsa-math/llama1b/hparams.yaml
ADDED
all_config:
  trainer:
    target: lightning.pytorch.trainer.Trainer
    devices:
    - 0
    - 1
    - 2
    - 3
    - 4
    - 5
    - 6
    - 7
    max_steps: -1
    check_val_every_n_epoch: 1
    log_every_n_steps: 10
    num_sanity_val_steps: 2
    gradient_clip_val: null
    reload_dataloaders_every_n_epochs: 0
    accumulate_grad_batches: 1
    precision: bf16-mixed
    use_distributed_sampler: true
    strategy: auto
    logger:
      target: lightning.pytorch.loggers.TensorBoardLogger
      save_dir: logs/cot
      name: qsa-math
      version: llama1b
    max_epochs: 25
  callbacks:
  - target: lightning.pytorch.callbacks.ModelCheckpoint
    save_last: true
    save_top_k: 3
    mode: max
    monitor: monitor
    auto_insert_metric_name: false
    filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
    save_weights_only: true
  seed: null
  model:
    target: src.models.cot.LitCoT
    model_kwargs:
      model_id: Llama-3.2-1B-Instruct
      depth: 1
      sft_method: cot
      set_pad_as_last_token: false
      do_lora: true
      lora_config:
        r: 128
        lora_alpha: 32
      answer_generation_config:
        max_new_tokens: 1024
        do_sample: true
        top_p: 0.9
        temperature: 1.0
      do_rl: false
      rl_config:
        filter_dataset: false
        exp_batch_size: 8
        group_size: 8
        punish_latent_length: true
        clip_grad_norm: 1.0
        clip_eps: 0.2
        use_latent_loss: true
        use_answer_loss: true
        n_train_samples_per_epoch: 4096
    training_kwargs:
      optimizer:
        target: torch.optim.AdamW
        lr: 0.0001
        weight_decay: 0.01
      use_scheduler: false
      scheduler:
        target: constant_schedule_with_warmup
        warmup_steps: 1000
  dataloader:
    batch_size: 4
    val_batch_size: 32
    num_workers: 32
    pin_memory: true
    persistent_workers: true
  data_module:
    target: src.datasets.qsa.QSADataModule
    dataset_name: math
    tiny_dataset: false
    epoch_scaling: 1
  args:
    model: latent_cot
    dataset: qsa
    trainer: default
    devices: all
    no_log: false
    log_suffix: llama1b
    resume_ckpt_path: null
    load_ckpt_path: null
    workspace_path:
    do_test: true
    test_ckpt_path: ''
    test_times: 1
    seed: 0
  unkown_args:
    dataset_name: math
    model_id: Llama-3.2-1B-Instruct
    batch_size: '32'
    sft_method: cot
    max_new_tokens: '1024'
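Relative to the GSM run, this config switches the dataset to math, raises max_new_tokens to 1024, and drops the per-device batch size from 32 to 4. The `training_kwargs` block is a plain AdamW setup with an optional warmup schedule that is disabled here (`use_scheduler: false`). A minimal sketch of what those values correspond to, assuming `constant_schedule_with_warmup` refers to the `transformers` helper of that name:

```python
import torch
from transformers import get_constant_schedule_with_warmup

def configure_optimizer(model, use_scheduler: bool = False):
    # lr / weight_decay taken from training_kwargs above.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    if not use_scheduler:
        return optimizer, None
    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=1000)
    return optimizer, scheduler
```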
logs/cot/qsa-math/llama1b/log.txt
ADDED
20250511-015608:
Start training with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(128257, 2048)
          (layers): ModuleList(
            (0-15): 16 x LlamaDecoderLayer(
              (self_attn): LlamaSdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=2048, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=2048, out_features=512, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=512, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=512, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (rotary_emb): LlamaRotaryEmbedding()
              )
              (mlp): LlamaMLP(
                (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
              (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
            )
          )
          (norm): LlamaRMSNorm((2048,), eps=1e-05)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (lm_head): Linear(in_features=2048, out_features=128257, bias=False)
      )
    )
  )
  (embedding): Embedding(128257, 2048)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-015422_637954_math-1b-cot-bs64'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'Llama-3.2-1B-Instruct', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-1b-cot-bs64', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'Llama-3.2-1B-Instruct', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}
20250511-023400:
Start testing with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(128257, 2048)
          (layers): ModuleList(
            (0-15): 16 x LlamaDecoderLayer(
              (self_attn): LlamaSdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=2048, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=2048, out_features=512, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=512, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=512, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (rotary_emb): LlamaRotaryEmbedding()
              )
              (mlp): LlamaMLP(
                (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
              (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
            )
          )
          (norm): LlamaRMSNorm((2048,), eps=1e-05)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (lm_head): Linear(in_features=2048, out_features=128257, bias=False)
      )
    )
  )
  (embedding): Embedding(128257, 2048)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-015422_637954_math-1b-cot-bs64'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'Llama-3.2-1B-Instruct', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-1b-cot-bs64', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'Llama-3.2-1B-Instruct', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}.
20250511-024009:
Test results: defaultdict(<class 'list'>, {'monitor': [0.1034201979637146], 'test/acc': [0.1034201979637146], 'test/n_latent_forward': [211.8072052001953], 'test/output_length': [226.03562927246094]})
Test statistics with 1 replications: {'monitor': (0.1034201979637146, 0.0), 'test/acc': (0.1034201979637146, 0.0), 'test/n_latent_forward': (211.8072052001953, 0.0), 'test/output_length': (226.03562927246094, 0.0)}
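The last two log lines report per-metric results collected over test replications and then summarize them as (mean, std) pairs; with `test_times: 1` the spread is trivially 0. A minimal sketch of that aggregation, assuming metrics are gathered into a `defaultdict(list)` as shown and that the population standard deviation is used:

```python
from collections import defaultdict
from statistics import mean, pstdev

results = defaultdict(list)
results["test/acc"].append(0.1034201979637146)  # single replication, as in the log

stats = {name: (mean(values), pstdev(values)) for name, values in results.items()}
print(stats)  # {'test/acc': (0.1034201979637146, 0.0)}
```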
logs/cot/qsa-math/llama1b/train.json
ADDED
The diff for this file is too large to render. See raw diff.
logs/cot/qsa-math/qw1b/checkpoints/epoch9__step2080__monitor0.247.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:74d7eb5644e9adce03ddc0fbe41a3a180a65c4dd9510ec9d770b96b47445113f
size 69805714
logs/cot/qsa-math/qw1b/events.out.tfevents.1746903255.tj-3008206-math-qw1b-cot-bs32-master-0.46.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7f7f40bd90676bc1f402258b3b2d741f03834fc5bd460dc6fac04526a013c229
size 74177
logs/cot/qsa-math/qw1b/events.out.tfevents.1746907319.tj-3008206-math-qw1b-cot-bs32-master-0.46.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:27faf06bbc0591eae709d6ab140f04e04ba0d2882f9e1f2584789b1f99fee35a
size 16700
logs/cot/qsa-math/qw1b/hparams.yaml
ADDED
all_config:
  trainer:
    target: lightning.pytorch.trainer.Trainer
    devices:
    - 0
    - 1
    - 2
    - 3
    - 4
    - 5
    - 6
    - 7
    max_steps: -1
    check_val_every_n_epoch: 1
    log_every_n_steps: 10
    num_sanity_val_steps: 2
    gradient_clip_val: null
    reload_dataloaders_every_n_epochs: 0
    accumulate_grad_batches: 1
    precision: bf16-mixed
    use_distributed_sampler: true
    strategy: auto
    logger:
      target: lightning.pytorch.loggers.TensorBoardLogger
      save_dir: logs/cot
      name: qsa-math
      version: qw1b
    max_epochs: 25
  callbacks:
  - target: lightning.pytorch.callbacks.ModelCheckpoint
    save_last: true
    save_top_k: 3
    mode: max
    monitor: monitor
    auto_insert_metric_name: false
    filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
    save_weights_only: true
  seed: null
  model:
    target: src.models.latent_cot.LitLatentReasoning
    model_kwargs:
      model_id: DeepSeek-R1-Distill-Qwen-1.5B
      depth: 1
      sft_method: cot
      set_pad_as_last_token: false
      do_lora: true
      lora_config:
        r: 128
        lora_alpha: 32
      answer_generation_config:
        max_new_tokens: 1024
        do_sample: true
        top_p: 0.9
        temperature: 1.0
    training_kwargs:
      optimizer:
        target: torch.optim.AdamW
        lr: 0.0001
        weight_decay: 0.01
      use_scheduler: false
      scheduler:
        target: constant_schedule_with_warmup
        warmup_steps: 1000
  dataloader:
    batch_size: 4
    val_batch_size: 32
    num_workers: 32
    pin_memory: true
    persistent_workers: true
  data_module:
    target: src.datasets.qsa.QSADataModule
    dataset_name: math
    tiny_dataset: false
    epoch_scaling: 1
  args:
    model: cot
    dataset: qsa
    trainer: default
    devices: all
    no_log: false
    log_suffix: qw1b
    resume_ckpt_path: null
    load_ckpt_path: null
    workspace_path: /workspace/images-ks3-starfs-hd/workspace/wenhui
    do_test: true
    test_ckpt_path: ''
    test_times: 1
    seed: 0
  unkown_args:
    dataset_name: math
    model_id: DeepSeek-R1-Distill-Qwen-1.5B
    batch_size: '32'
    sft_method: cot
    max_new_tokens: '1024'
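All three runs share the same adapter settings (r: 128, lora_alpha: 32), and the log below shows the resulting lora.Linear wrappers on the attention q_proj and v_proj layers. A minimal sketch of that setup with peft; the target modules and the exact hub path of the model_id are assumptions, not read from the config:

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("DeepSeek-R1-Distill-Qwen-1.5B")  # model_id from hparams
lora_config = LoraConfig(r=128, lora_alpha=32, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base, lora_config)  # wraps q_proj/v_proj with lora.Linear, as in log.txt
model.print_trainable_parameters()
```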
logs/cot/qsa-math/qw1b/log.txt
ADDED
20250511-025424:
Start training with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): Qwen2ForCausalLM(
        (model): Qwen2Model(
          (embed_tokens): Embedding(151666, 1536)
          (layers): ModuleList(
            (0-27): 28 x Qwen2DecoderLayer(
              (self_attn): Qwen2SdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=1536, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=1536, out_features=256, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=256, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=256, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
                (rotary_emb): Qwen2RotaryEmbedding()
              )
              (mlp): Qwen2MLP(
                (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
              (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
            )
          )
          (norm): Qwen2RMSNorm((1536,), eps=1e-06)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (lm_head): Linear(in_features=1536, out_features=151666, bias=False)
      )
    )
  )
  (embedding): Embedding(151666, 1536)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-025110_355317_math-qw1b-cot-bs32'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-qw1b-cot-bs32', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}
20250511-040159:
Start testing with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): Qwen2ForCausalLM(
        (model): Qwen2Model(
          (embed_tokens): Embedding(151666, 1536)
          (layers): ModuleList(
            (0-27): 28 x Qwen2DecoderLayer(
              (self_attn): Qwen2SdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=1536, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=1536, out_features=256, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=256, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=256, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
                (rotary_emb): Qwen2RotaryEmbedding()
              )
              (mlp): Qwen2MLP(
                (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
              (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
            )
          )
          (norm): Qwen2RMSNorm((1536,), eps=1e-06)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (lm_head): Linear(in_features=1536, out_features=151666, bias=False)
      )
    )
  )
  (embedding): Embedding(151666, 1536)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-025110_355317_math-qw1b-cot-bs32'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-qw1b-cot-bs32', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}.
20250511-041020:
Test results: defaultdict(<class 'list'>, {'monitor': [0.2351384311914444], 'test/acc': [0.2351384311914444], 'test/n_latent_forward': [209.3857879638672], 'test/output_length': [223.18994140625]})
Test statistics with 1 replications: {'monitor': (0.2351384311914444, 0.0), 'test/acc': (0.2351384311914444, 0.0), 'test/n_latent_forward': (209.3857879638672, 0.0), 'test/output_length': (223.18994140625, 0.0)}
logs/cot/qsa-math/qw1b/train.json
ADDED
The diff for this file is too large to render. See raw diff.