Upload 17 files
- logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt +3 -0
- logs/cot/qsa-gsm/llama-1b-cot/checkpoints/last.ckpt +3 -0
- logs/cot/qsa-gsm/llama-1b-cot/events.out.tfevents.1746637111.tj-3008206-g-1b-cot-bs256-copy-master-0.46.0 +3 -0
- logs/cot/qsa-gsm/llama-1b-cot/hparams.yaml +94 -0
- logs/cot/qsa-gsm/llama-1b-cot/train.json +0 -0
- logs/cot/qsa-math/llama1b/checkpoints/epoch7__step1760__monitor0.170.ckpt +3 -0
- logs/cot/qsa-math/llama1b/events.out.tfevents.1746899762.tj-3008206-math-1b-cot-master-0.46.0 +3 -0
- logs/cot/qsa-math/llama1b/events.out.tfevents.1746902039.tj-3008206-math-1b-cot-master-0.46.1 +3 -0
- logs/cot/qsa-math/llama1b/hparams.yaml +105 -0
- logs/cot/qsa-math/llama1b/log.txt +135 -0
- logs/cot/qsa-math/llama1b/train.json +0 -0
- logs/cot/qsa-math/qw1b/checkpoints/epoch9__step2080__monitor0.247.ckpt +3 -0
- logs/cot/qsa-math/qw1b/events.out.tfevents.1746903255.tj-3008206-math-qw1b-cot-bs32-master-0.46.0 +3 -0
- logs/cot/qsa-math/qw1b/events.out.tfevents.1746907319.tj-3008206-math-qw1b-cot-bs32-master-0.46.1 +3 -0
- logs/cot/qsa-math/qw1b/hparams.yaml +94 -0
- logs/cot/qsa-math/qw1b/log.txt +135 -0
- logs/cot/qsa-math/qw1b/train.json +0 -0
logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:257e9b813b798f69f816f7e3d0ff9221f8721b61f29883f912c5f60df9d32a4c
size 54578518
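The checkpoint and event files in this upload are stored as Git LFS pointers, so each diff shows only the three-line pointer (version, oid, size) rather than the binary payload. As a minimal sketch, a pointer like the one above can be parsed into its fields like this (the helper name is illustrative, not part of the repo):

```python
from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = read_lfs_pointer("logs/cot/qsa-gsm/llama-1b-cot/checkpoints/epoch7__step12056__monitor0.560.ckpt")
print(ptr["oid"], int(ptr["size"]))  # prints the object id and the payload size in bytes
```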
logs/cot/qsa-gsm/llama-1b-cot/checkpoints/last.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:241221c54f52ee820c7f7778459de032ea02650e08fff49e309821394fb6ba66
size 54578518
logs/cot/qsa-gsm/llama-1b-cot/events.out.tfevents.1746637111.tj-3008206-g-1b-cot-bs256-copy-master-0.46.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:fea2c0aab04baaa84c9d9e9b61ffc95086ebd5d56ac7fc18aa881934a78c1ba2
size 201157
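The events.out.tfevents.* files are standard TensorBoard event logs written by the TensorBoardLogger configured in the hparams below. A minimal sketch of reading the logged scalars back once the file has been pulled from LFS; the tag name "monitor" is an assumption based on the metric monitored by the checkpoint callback:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Point the accumulator at the run directory (or at a single event file).
acc = EventAccumulator("logs/cot/qsa-gsm/llama-1b-cot")
acc.Reload()

print(acc.Tags()["scalars"])          # scalar tags actually present in the log
for event in acc.Scalars("monitor"):  # assumed tag name
    print(event.step, event.value)
```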
logs/cot/qsa-gsm/llama-1b-cot/hparams.yaml
ADDED
all_config:
  trainer:
    target: lightning.pytorch.trainer.Trainer
    devices:
    - 0
    - 1
    - 2
    - 3
    - 4
    - 5
    - 6
    - 7
    max_steps: -1
    check_val_every_n_epoch: 1
    log_every_n_steps: 10
    num_sanity_val_steps: 2
    gradient_clip_val: null
    reload_dataloaders_every_n_epochs: 0
    accumulate_grad_batches: 1
    precision: bf16-mixed
    use_distributed_sampler: true
    strategy: auto
    logger:
      target: lightning.pytorch.loggers.TensorBoardLogger
      save_dir: logs/cot
      name: qsa-gsm
      version: g-1b-cot-bs256
    max_epochs: 50
  callbacks:
  - target: lightning.pytorch.callbacks.ModelCheckpoint
    save_last: true
    save_top_k: 3
    mode: max
    monitor: monitor
    auto_insert_metric_name: false
    filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
    save_weights_only: true
  seed: null
  model:
    target: src.models.cot.LitCoT
    model_kwargs:
      model_id: Llama-3.2-1B-Instruct
      depth: 1
      sft_method: cot
      do_lora: true
      lora_config:
        r: 128
        lora_alpha: 32
      answer_generation_config:
        max_new_tokens: 128
        do_sample: true
        top_p: 0.9
        temperature: 1.0
    training_kwargs:
      optimizer:
        target: torch.optim.AdamW
        lr: 0.0001
        weight_decay: 0.01
      use_scheduler: false
      scheduler:
        target: constant_schedule_with_warmup
        warmup_steps: 1000
  dataloader:
    batch_size: 32
    val_batch_size: 32
    num_workers: 32
    pin_memory: true
    persistent_workers: true
  data_module:
    target: src.datasets.qsa.QSADataModule
    dataset_name: gsm
    tiny_dataset: false
    epoch_scaling: 1
  args:
    model: cot
    dataset: qsa
    trainer: default
    devices: all
    no_log: false
    log_suffix: g-1b-cot-bs256
    resume_ckpt_path: null
    load_ckpt_path: null
    workspace_path: /workspace/images-ks3-starfs-hd/workspace/wenhui
    do_test: true
    test_ckpt_path: ''
    test_times: 1
    seed: 0
  unkown_args:
    dataset_name: gsm
    model_id: Llama-3.2-1B-Instruct
    sft_method: cot
    batch_size: '256'
    precision: bf16-mixed
    max_new_tokens: '128'
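The hparams above are a dump of the run configuration; the `logger` and `callbacks` blocks map one-to-one onto Lightning objects, and the `filename` pattern is what yields checkpoint names such as `epoch7__step12056__monitor0.560.ckpt`. A minimal sketch of that mapping (how the repo actually resolves the `target:` strings is not shown here):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

# Values copied from the hparams.yaml above.
logger = TensorBoardLogger(save_dir="logs/cot", name="qsa-gsm", version="g-1b-cot-bs256")
checkpoint = ModelCheckpoint(
    save_last=True,
    save_top_k=3,
    mode="max",
    monitor="monitor",
    auto_insert_metric_name=False,
    filename="epoch{epoch}__step{step}__monitor{monitor:.3f}",
    save_weights_only=True,
)
trainer = Trainer(
    devices=[0, 1, 2, 3, 4, 5, 6, 7],
    max_epochs=50,
    precision="bf16-mixed",
    accumulate_grad_batches=1,
    log_every_n_steps=10,
    logger=logger,
    callbacks=[checkpoint],
)
```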
logs/cot/qsa-gsm/llama-1b-cot/train.json
ADDED
The diff for this file is too large to render. See raw diff.
logs/cot/qsa-math/llama1b/checkpoints/epoch7__step1760__monitor0.170.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:0ccd12db32aff29ad5dce8ca9b6ebb65ac1a6a79909a180b7818cc5a44806d96
size 54583702
logs/cot/qsa-math/llama1b/events.out.tfevents.1746899762.tj-3008206-math-1b-cot-master-0.46.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b86d1b79e05922f704e9becc3dc749bd05c3fb538dcf48c147436985a78e60d4
size 77119
logs/cot/qsa-math/llama1b/events.out.tfevents.1746902039.tj-3008206-math-1b-cot-master-0.46.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f9c4ec9e8af49ac8cd8d9ac6bc813a27a27ebd86056c5c1091b20a1247bb2e1a
size 16672
logs/cot/qsa-math/llama1b/hparams.yaml
ADDED
all_config:
  trainer:
    target: lightning.pytorch.trainer.Trainer
    devices:
    - 0
    - 1
    - 2
    - 3
    - 4
    - 5
    - 6
    - 7
    max_steps: -1
    check_val_every_n_epoch: 1
    log_every_n_steps: 10
    num_sanity_val_steps: 2
    gradient_clip_val: null
    reload_dataloaders_every_n_epochs: 0
    accumulate_grad_batches: 1
    precision: bf16-mixed
    use_distributed_sampler: true
    strategy: auto
    logger:
      target: lightning.pytorch.loggers.TensorBoardLogger
      save_dir: logs/cot
      name: qsa-math
      version: llama1b
    max_epochs: 25
  callbacks:
  - target: lightning.pytorch.callbacks.ModelCheckpoint
    save_last: true
    save_top_k: 3
    mode: max
    monitor: monitor
    auto_insert_metric_name: false
    filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
    save_weights_only: true
  seed: null
  model:
    target: src.models.cot.LitCoT
    model_kwargs:
      model_id: Llama-3.2-1B-Instruct
      depth: 1
      sft_method: cot
      set_pad_as_last_token: false
      do_lora: true
      lora_config:
        r: 128
        lora_alpha: 32
      answer_generation_config:
        max_new_tokens: 1024
        do_sample: true
        top_p: 0.9
        temperature: 1.0
      do_rl: false
      rl_config:
        filter_dataset: false
        exp_batch_size: 8
        group_size: 8
        punish_latent_length: true
        clip_grad_norm: 1.0
        clip_eps: 0.2
        use_latent_loss: true
        use_answer_loss: true
        n_train_samples_per_epoch: 4096
    training_kwargs:
      optimizer:
        target: torch.optim.AdamW
        lr: 0.0001
        weight_decay: 0.01
      use_scheduler: false
      scheduler:
        target: constant_schedule_with_warmup
        warmup_steps: 1000
  dataloader:
    batch_size: 4
    val_batch_size: 32
    num_workers: 32
    pin_memory: true
    persistent_workers: true
  data_module:
    target: src.datasets.qsa.QSADataModule
    dataset_name: math
    tiny_dataset: false
    epoch_scaling: 1
  args:
    model: latent_cot
    dataset: qsa
    trainer: default
    devices: all
    no_log: false
    log_suffix: llama1b
    resume_ckpt_path: null
    load_ckpt_path: null
    workspace_path:
    do_test: true
    test_ckpt_path: ''
    test_times: 1
    seed: 0
  unkown_args:
    dataset_name: math
    model_id: Llama-3.2-1B-Instruct
    batch_size: '32'
    sft_method: cot
    max_new_tokens: '1024'
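Relative to the GSM run, this config switches the dataset to math, raises max_new_tokens to 1024, and drops the per-device batch size from 32 to 4. The `training_kwargs` block is a plain AdamW setup with an optional warmup schedule that is disabled here (`use_scheduler: false`). A minimal sketch of what those values correspond to, assuming `constant_schedule_with_warmup` refers to the `transformers` helper of that name:

```python
import torch
from transformers import get_constant_schedule_with_warmup

def configure_optimizer(model, use_scheduler: bool = False):
    # lr / weight_decay taken from training_kwargs above.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    if not use_scheduler:
        return optimizer, None
    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=1000)
    return optimizer, scheduler
```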
logs/cot/qsa-math/llama1b/log.txt
ADDED
20250511-015608:
Start training with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(128257, 2048)
          (layers): ModuleList(
            (0-15): 16 x LlamaDecoderLayer(
              (self_attn): LlamaSdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=2048, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=2048, out_features=512, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=512, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=512, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (rotary_emb): LlamaRotaryEmbedding()
              )
              (mlp): LlamaMLP(
                (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
              (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
            )
          )
          (norm): LlamaRMSNorm((2048,), eps=1e-05)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (lm_head): Linear(in_features=2048, out_features=128257, bias=False)
      )
    )
  )
  (embedding): Embedding(128257, 2048)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-015422_637954_math-1b-cot-bs64'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'Llama-3.2-1B-Instruct', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-1b-cot-bs64', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'Llama-3.2-1B-Instruct', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}
20250511-023400:
Start testing with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(128257, 2048)
          (layers): ModuleList(
            (0-15): 16 x LlamaDecoderLayer(
              (self_attn): LlamaSdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=2048, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=2048, out_features=512, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=512, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=512, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (rotary_emb): LlamaRotaryEmbedding()
              )
              (mlp): LlamaMLP(
                (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
                (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
              (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
            )
          )
          (norm): LlamaRMSNorm((2048,), eps=1e-05)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (lm_head): Linear(in_features=2048, out_features=128257, bias=False)
      )
    )
  )
  (embedding): Embedding(128257, 2048)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-015422_637954_math-1b-cot-bs64'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'Llama-3.2-1B-Instruct', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-1b-cot-bs64', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'Llama-3.2-1B-Instruct', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}.
20250511-024009:
Test results: defaultdict(<class 'list'>, {'monitor': [0.1034201979637146], 'test/acc': [0.1034201979637146], 'test/n_latent_forward': [211.8072052001953], 'test/output_length': [226.03562927246094]})
Test statistics with 1 replications: {'monitor': (0.1034201979637146, 0.0), 'test/acc': (0.1034201979637146, 0.0), 'test/n_latent_forward': (211.8072052001953, 0.0), 'test/output_length': (226.03562927246094, 0.0)}
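The last two log lines report per-metric results collected over test replications and then summarize them as (mean, std) pairs; with `test_times: 1` the spread is trivially 0. A minimal sketch of that aggregation, assuming metrics are gathered into a `defaultdict(list)` as shown and that the population standard deviation is used:

```python
from collections import defaultdict
from statistics import mean, pstdev

results = defaultdict(list)
results["test/acc"].append(0.1034201979637146)  # single replication, as in the log

stats = {name: (mean(values), pstdev(values)) for name, values in results.items()}
print(stats)  # {'test/acc': (0.1034201979637146, 0.0)}
```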
logs/cot/qsa-math/llama1b/train.json
ADDED
The diff for this file is too large to render. See raw diff.
logs/cot/qsa-math/qw1b/checkpoints/epoch9__step2080__monitor0.247.ckpt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:74d7eb5644e9adce03ddc0fbe41a3a180a65c4dd9510ec9d770b96b47445113f
size 69805714
logs/cot/qsa-math/qw1b/events.out.tfevents.1746903255.tj-3008206-math-qw1b-cot-bs32-master-0.46.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7f7f40bd90676bc1f402258b3b2d741f03834fc5bd460dc6fac04526a013c229
size 74177
logs/cot/qsa-math/qw1b/events.out.tfevents.1746907319.tj-3008206-math-qw1b-cot-bs32-master-0.46.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:27faf06bbc0591eae709d6ab140f04e04ba0d2882f9e1f2584789b1f99fee35a
size 16700
logs/cot/qsa-math/qw1b/hparams.yaml
ADDED
all_config:
  trainer:
    target: lightning.pytorch.trainer.Trainer
    devices:
    - 0
    - 1
    - 2
    - 3
    - 4
    - 5
    - 6
    - 7
    max_steps: -1
    check_val_every_n_epoch: 1
    log_every_n_steps: 10
    num_sanity_val_steps: 2
    gradient_clip_val: null
    reload_dataloaders_every_n_epochs: 0
    accumulate_grad_batches: 1
    precision: bf16-mixed
    use_distributed_sampler: true
    strategy: auto
    logger:
      target: lightning.pytorch.loggers.TensorBoardLogger
      save_dir: logs/cot
      name: qsa-math
      version: qw1b
    max_epochs: 25
  callbacks:
  - target: lightning.pytorch.callbacks.ModelCheckpoint
    save_last: true
    save_top_k: 3
    mode: max
    monitor: monitor
    auto_insert_metric_name: false
    filename: epoch{epoch}__step{step}__monitor{monitor:.3f}
    save_weights_only: true
  seed: null
  model:
    target: src.models.latent_cot.LitLatentReasoning
    model_kwargs:
      model_id: DeepSeek-R1-Distill-Qwen-1.5B
      depth: 1
      sft_method: cot
      set_pad_as_last_token: false
      do_lora: true
      lora_config:
        r: 128
        lora_alpha: 32
      answer_generation_config:
        max_new_tokens: 1024
        do_sample: true
        top_p: 0.9
        temperature: 1.0
    training_kwargs:
      optimizer:
        target: torch.optim.AdamW
        lr: 0.0001
        weight_decay: 0.01
      use_scheduler: false
      scheduler:
        target: constant_schedule_with_warmup
        warmup_steps: 1000
  dataloader:
    batch_size: 4
    val_batch_size: 32
    num_workers: 32
    pin_memory: true
    persistent_workers: true
  data_module:
    target: src.datasets.qsa.QSADataModule
    dataset_name: math
    tiny_dataset: false
    epoch_scaling: 1
  args:
    model: cot
    dataset: qsa
    trainer: default
    devices: all
    no_log: false
    log_suffix: qw1b
    resume_ckpt_path: null
    load_ckpt_path: null
    workspace_path: /workspace/images-ks3-starfs-hd/workspace/wenhui
    do_test: true
    test_ckpt_path: ''
    test_times: 1
    seed: 0
  unkown_args:
    dataset_name: math
    model_id: DeepSeek-R1-Distill-Qwen-1.5B
    batch_size: '32'
    sft_method: cot
    max_new_tokens: '1024'
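All three runs share the same adapter settings (r: 128, lora_alpha: 32), and the log below shows the resulting lora.Linear wrappers on the attention q_proj and v_proj layers. A minimal sketch of that setup with peft; the target modules and the exact hub path of the model_id are assumptions, not read from the config:

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("DeepSeek-R1-Distill-Qwen-1.5B")  # model_id from hparams
lora_config = LoraConfig(r=128, lora_alpha=32, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base, lora_config)  # wraps q_proj/v_proj with lora.Linear, as in log.txt
model.print_trainable_parameters()
```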
logs/cot/qsa-math/qw1b/log.txt
ADDED
20250511-025424:
Start training with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): Qwen2ForCausalLM(
        (model): Qwen2Model(
          (embed_tokens): Embedding(151666, 1536)
          (layers): ModuleList(
            (0-27): 28 x Qwen2DecoderLayer(
              (self_attn): Qwen2SdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=1536, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=1536, out_features=256, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=256, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=256, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
                (rotary_emb): Qwen2RotaryEmbedding()
              )
              (mlp): Qwen2MLP(
                (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
              (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
            )
          )
          (norm): Qwen2RMSNorm((1536,), eps=1e-06)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (lm_head): Linear(in_features=1536, out_features=151666, bias=False)
      )
    )
  )
  (embedding): Embedding(151666, 1536)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-025110_355317_math-qw1b-cot-bs32'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-qw1b-cot-bs32', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}
20250511-040159:
Start testing with model:
LitLatentReasoning(
  (llm): PeftModel(
    (base_model): LoraModel(
      (model): Qwen2ForCausalLM(
        (model): Qwen2Model(
          (embed_tokens): Embedding(151666, 1536)
          (layers): ModuleList(
            (0-27): 28 x Qwen2DecoderLayer(
              (self_attn): Qwen2SdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=1536, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (k_proj): Linear(in_features=1536, out_features=256, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=256, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=128, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=128, out_features=256, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): ModuleDict()
                )
                (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
                (rotary_emb): Qwen2RotaryEmbedding()
              )
              (mlp): Qwen2MLP(
                (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
                (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
                (act_fn): SiLU()
              )
              (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
              (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
            )
          )
          (norm): Qwen2RMSNorm((1536,), eps=1e-06)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (lm_head): Linear(in_features=1536, out_features=151666, bias=False)
      )
    )
  )
  (embedding): Embedding(151666, 1536)
)
config:
{'trainer': {'target': 'lightning.pytorch.trainer.Trainer', 'devices': [0, 1, 2, 3, 4, 5, 6, 7], 'max_steps': -1, 'check_val_every_n_epoch': 1, 'log_every_n_steps': 10, 'num_sanity_val_steps': 2, 'gradient_clip_val': None, 'reload_dataloaders_every_n_epochs': 0, 'accumulate_grad_batches': 1, 'precision': 'bf16-mixed', 'use_distributed_sampler': True, 'strategy': 'auto', 'logger': {'target': 'lightning.pytorch.loggers.TensorBoardLogger', 'save_dir': 'logs/latent_cot', 'name': 'qsa-math', 'version': '20250511-025110_355317_math-qw1b-cot-bs32'}, 'max_epochs': 25}, 'callbacks': [{'target': 'lightning.pytorch.callbacks.ModelCheckpoint', 'save_last': True, 'save_top_k': 3, 'mode': 'max', 'monitor': 'monitor', 'auto_insert_metric_name': False, 'filename': 'epoch{epoch}__step{step}__monitor{monitor:.3f}', 'save_weights_only': True}], 'seed': None, 'model': {'target': 'src.models.latent_cot.LitLatentReasoning', 'model_kwargs': {'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'depth': 1, 'sft_method': 'cot', 'set_pad_as_last_token': False, 'do_lora': True, 'lora_config': {'r': 128, 'lora_alpha': 32}, 'coconut_config': {'n_epochs_per_stage': 2, 'max_n_stage': 3, 'n_latents_per_step': 2, 'current_stage': 1}, 'codi_config': {'codi_proj': False, 'n_latents': 6, 'alpha': 1, 'beta': 1, 'gamma': 20}, 'latent_cot_config': {'ce_weight': 1, 'embed_modeling_weight': 1, 'embed_modeling_loss': 'mse', 'entropy_weight': 0, 'pred_embed_forward_weight': 0, 'max_compression_r': 4, 'pred_compressed_cot': True, 'replace_r_with_auto_prob': 0.0, 'sqrt_mean': True}, 'latent_policy_config': {'lp_determinisitc': False, 'lp_intermediate_size': 2048}, 'latent_generation_config': {'max_n_latent_forward': 64, 'latent_temperature': 1.0, 'compression_r': 4}, 'answer_generation_config': {'max_new_tokens': 1024, 'do_sample': True, 'top_p': 0.9, 'temperature': 1.0}, 'do_rl': False, 'rl_config': {'filter_dataset': False, 'exp_batch_size': 8, 'group_size': 8, 'punish_latent_length': True, 'clip_grad_norm': 1.0, 'clip_eps': 0.2, 'use_latent_loss': True, 'use_answer_loss': True, 'n_train_samples_per_epoch': 4096}}, 'training_kwargs': {'optimizer': {'target': 'torch.optim.AdamW', 'lr': 0.0001, 'weight_decay': 0.01}, 'use_scheduler': False, 'scheduler': {'target': 'constant_schedule_with_warmup', 'warmup_steps': 1000}}}, 'dataloader': {'batch_size': 4, 'val_batch_size': 32, 'num_workers': 32, 'pin_memory': True, 'persistent_workers': True}, 'data_module': {'target': 'src.datasets.qsa.QSADataModule', 'dataset_name': 'math', 'tiny_dataset': False, 'epoch_scaling': 1}, 'args': {'model': 'latent_cot', 'dataset': 'qsa', 'trainer': 'default', 'devices': 'all', 'no_log': False, 'log_suffix': 'math-qw1b-cot-bs32', 'resume_ckpt_path': None, 'load_ckpt_path': None, 'workspace_path': '/workspace/images-ks3-starfs-hd/workspace/wenhui', 'do_test': True, 'test_ckpt_path': '', 'test_times': 1, 'seed': 0}, 'unkown_args': {'dataset_name': 'math', 'model_id': 'DeepSeek-R1-Distill-Qwen-1.5B', 'batch_size': '32', 'sft_method': 'cot', 'max_new_tokens': '1024'}}.
20250511-041020:
Test results: defaultdict(<class 'list'>, {'monitor': [0.2351384311914444], 'test/acc': [0.2351384311914444], 'test/n_latent_forward': [209.3857879638672], 'test/output_length': [223.18994140625]})
Test statistics with 1 replications: {'monitor': (0.2351384311914444, 0.0), 'test/acc': (0.2351384311914444, 0.0), 'test/n_latent_forward': (209.3857879638672, 0.0), 'test/output_length': (223.18994140625, 0.0)}
logs/cot/qsa-math/qw1b/train.json
ADDED
The diff for this file is too large to render. See raw diff.