xiulinyang committed on
Commit
a20227b
·
verified ·
1 Parent(s): 3636cdc

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. all_results.json +1 -0
  2. config.json +31 -0
  3. generation_config.json +6 -0
  4. model.safetensors +3 -0
  5. training.log +111 -0
all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 46.59282867092637}
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 32768,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 32769,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 512,
14
+ "n_head": 8,
15
+ "n_inner": null,
16
+ "n_layer": 4,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.37.0",
29
+ "use_cache": true,
30
+ "vocab_size": 32770
31
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 32768,
4
+ "eos_token_id": 32769,
5
+ "transformers_version": "4.37.0"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beac15998d30bf6cfc27eca320c8e86b683d8eb49940723cba17bbd2b8701ce4
3
+ size 119657520
training.log ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 01/07/2026 19:45:41 - INFO - __main__ - Distributed environment: DistributedType.NO
2
+ Num processes: 1
3
+ Process index: 0
4
+ Local process index: 0
5
+ Device: cuda
6
+
7
+ Mixed precision type: no
8
+
9
+ 01/07/2026 19:45:41 - INFO - __main__ - Arguments:
10
+ 01/07/2026 19:45:41 - INFO - __main__ - Namespace(train_file='data/preprocessed/dependency/train.sequential=False.random=False.convert_method=exponential.jsonl', validation_file='data/preprocessed/dependency/val.sequential=False.random=False.convert_method=exponential.jsonl', model_name_or_path=None, per_device_train_batch_size=32, per_device_eval_batch_size=32, learning_rate=0.0001, weight_decay=0.0, num_train_epochs=10, max_train_steps=None, gradient_accumulation_steps=1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, num_warmup_steps=0, output_dir='experiments/42/', seed=42, block_size=512, preprocessing_num_workers=None, overwrite_cache=False, trust_remote_code=False, checkpointing_steps='epoch', resume_from_checkpoint=None, with_tracking=True, report_to='wandb', low_cpu_mem_usage=False, n_positions=1024, n_embd=512, n_layer=4, n_head=8, n_inner=None, activation_function='gelu_new', resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-05, initializer_range=0.02, attn_loss_weight=0.5, attn_loss_layers=[3], attn_loss_heads=[0], attn_loss_reduction='none')
11
+ 01/07/2026 19:45:41 - INFO - __main__ - Training new model from scratch
12
+ 01/07/2026 19:45:42 - INFO - __main__ - Sample 233478 of the training set: {'token_ids': [32768, 433, 365, 410, 362, 1798, 298, 515, 707, 311, 380, 5505, 3469, 269, 32769], 'attn_matrix': [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.33333333333333337, 0.33333333333333337, 0.33333333333333337, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.25, 0.25, 0.25, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.14884758120207756, 0.14884758120207756, 0.14884758120207756, 0.14884758120207756, 0.4046096751916897, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.1295625143296497, 0.1295625143296497, 0.1295625143296497, 0.1295625143296497, 0.35218742835175154, 0.1295625143296497, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.11470149963903493, 0.11470149963903493, 0.11470149963903493, 0.11470149963903493, 0.3117910021657905, 0.11470149963903493, 0.11470149963903493, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.062083505325827404, 0.062083505325827404, 0.062083505325827404, 0.062083505325827404, 0.168760464374237, 0.062083505325827404, 0.062083505325827404, 0.45873850367079866, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.053119101148811854, 0.053119101148811854, 0.053119101148811854, 0.053119101148811854, 0.14439268739689323, 0.053119101148811854, 0.053119101148811854, 0.3925000183133424, 0.14439268739689323, 0.0, 0.0, 0.0, 0.0, 0.0], [0.025699570132253498, 0.025699570132253498, 0.025699570132253498, 0.025699570132253498, 0.0698586744897135, 0.025699570132253498, 0.025699570132253498, 0.18989556542562366, 0.0698586744897135, 0.5161896648014286, 0.0, 0.0, 0.0, 0.0], [0.01695010243696609, 0.01695010243696609, 0.01695010243696609, 0.01695010243696609, 0.046075155444924305, 0.01695010243696609, 0.01695010243696609, 0.12524525778936357, 0.046075155444924305, 0.3404519083494957, 0.3404519083494957, 0.0, 0.0, 0.0], [0.012645065691194264, 0.012645065691194264, 
0.012645065691194264, 0.012645065691194264, 0.034372852288044285, 0.012645065691194264, 0.012645065691194264, 0.09343509976689769, 0.034372852288044285, 0.253982933836616, 0.253982933836616, 0.253982933836616, 0.0, 0.0], [0.09355455753516212, 0.09355455753516212, 0.09355455753516212, 0.09355455753516212, 0.25430765371735753, 0.09355455753516212, 0.09355455753516212, 0.09355455753516212, 0.034416798345076986, 0.034416798345076986, 0.0046578071521283775, 0.0046578071521283775, 0.012661232542097145, 0.0], [0.08555088256961882, 0.08555088256961882, 0.08555088256961882, 0.08555088256961882, 0.2325514094976286, 0.08555088256961882, 0.08555088256961882, 0.08555088256961882, 0.031472410871435055, 0.031472410871435055, 0.004259327639424712, 0.004259327639424712, 0.011578052923701556, 0.08555088256961882]], 'word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, True, False, False, False, 
False], [False, False, False, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, True]], 'input_ids': [32768, 433, 365, 410, 362, 1798, 298, 515, 707, 311, 380, 5505, 3469, 269, 32769], 'row_word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, True, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, True]], 'col_word_token_membership_mask': [[True, 
False, False, False, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, True, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, True]]}.
13
+ 01/07/2026 19:45:42 - INFO - __main__ - Sample 52451 of the training set: {'token_ids': [32768, 43, 1049, 79, 2282, 269, 32769], 'attn_matrix': [[1.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.5, 0.0, 0.0, 0.0], [0.2119415576170854, 0.2119415576170854, 0.5761168847658292, 0.0, 0.0], [0.1748777045271094, 0.1748777045271094, 0.47536688641867175, 0.1748777045271094, 0.0], [0.14884758120207756, 0.14884758120207756, 0.4046096751916897, 0.14884758120207756, 0.14884758120207756]], 'word_token_membership_mask': [[True, False, False, False, False, False, False], [False, True, False, False, False, False, False], [False, False, True, True, False, False, False], [False, False, False, False, True, False, False], [False, False, False, False, False, True, False], [False, False, False, False, False, False, True]], 'input_ids': [32768, 43, 1049, 79, 2282, 269, 32769], 'row_word_token_membership_mask': [[True, False, False, False, False, False], [False, True, True, False, False, False], [False, False, False, True, False, False], [False, False, False, False, True, False], [False, False, False, False, False, True]], 'col_word_token_membership_mask': [[True, False, False, False, False, False], [False, True, False, False, False, False], [False, False, True, True, False, False], [False, False, False, False, True, False], [False, False, False, False, False, True]]}.
14
+ 01/07/2026 19:45:42 - INFO - __main__ - Sample 576778 of the training set: {'token_ids': [32768, 987, 373, 1213, 285, 17832, 351, 375, 506, 269, 32769], 'attn_matrix': [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.2119415576170854, 0.2119415576170854, 0.5761168847658292, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.08259453944353538, 0.08259453944353538, 0.2245152356993061, 0.6102956854136232, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.05129153620151443, 0.05129153620151443, 0.13942485081032596, 0.3789960383933226, 0.3789960383933226, 0.0, 0.0, 0.0, 0.0, 0.0], [0.02526405296587557, 0.02526405296587557, 0.06867481609036641, 0.18667750465120986, 0.18667750465120986, 0.5074420686754628, 0.0, 0.0, 0.0, 0.0], [0.02128973783260619, 0.02128973783260619, 0.05787150748303046, 0.15731106717665336, 0.15731106717665336, 0.4276158153217971, 0.15731106717665336, 0.0, 0.0, 0.0], [0.00984551044474046, 0.00984551044474046, 0.026762872133841722, 0.0727490289987949, 0.0727490289987949, 0.19775236356546433, 0.0727490289987949, 0.5375466564148283, 0.0, 0.0], [0.15285952351652357, 0.15285952351652357, 0.41551526508187425, 0.15285952351652357, 0.02068728691052238, 0.05623387608899164, 0.02068728691052238, 0.02068728691052238, 0.007610427547996267, 0.0], [0.13259163011488337, 0.13259163011488337, 0.3604214187470507, 0.13259163011488337, 0.017944325816401926, 0.04877773479067388, 0.017944325816401926, 0.017944325816401926, 0.006601348553536226, 0.13259163011488337]], 'word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, 
False, False], [False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False, True]], 'input_ids': [32768, 987, 373, 1213, 285, 17832, 351, 375, 506, 269, 32769], 'row_word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, True]], 'col_word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, True]]}.
15
+ 01/07/2026 19:45:42 - INFO - __main__ - ***** Running training *****
16
+ 01/07/2026 19:45:42 - INFO - __main__ - Num examples = 1132837
17
+ 01/07/2026 19:45:42 - INFO - __main__ - Num Epochs = 10
18
+ 01/07/2026 19:45:42 - INFO - __main__ - Instantaneous batch size per device = 32
19
+ 01/07/2026 19:45:42 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 32
20
+ 01/07/2026 19:45:42 - INFO - __main__ - Gradient Accumulation steps = 1
21
+ 01/07/2026 19:45:42 - INFO - __main__ - Total optimization steps = 354020
22
+ 01/07/2026 20:02:04 - INFO - __main__ - epoch 0: perplexity: 60.982220329810666 eval_loss: 4.175161838531494 eval_nwp_loss: 4.11058235168457 eval_attn_loss: 0.12915922701358795
23
+ 01/07/2026 20:02:04 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_0
24
+ 01/07/2026 20:02:04 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
25
+ 01/07/2026 20:02:04 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_0/model.safetensors
26
+ 01/07/2026 20:02:04 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_0/optimizer.bin
27
+ 01/07/2026 20:02:04 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_0/scheduler.bin
28
+ 01/07/2026 20:02:04 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_0/sampler.bin
29
+ 01/07/2026 20:02:04 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_0/sampler_1.bin
30
+ 01/07/2026 20:02:04 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_0/random_states_0.pkl
31
+ 01/07/2026 20:16:55 - INFO - __main__ - epoch 1: perplexity: 54.10629752855644 eval_loss: 4.052062034606934 eval_nwp_loss: 3.990950584411621 eval_attn_loss: 0.12222273647785187
32
+ 01/07/2026 20:16:55 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_1
33
+ 01/07/2026 20:16:55 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
34
+ 01/07/2026 20:16:55 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_1/model.safetensors
35
+ 01/07/2026 20:16:55 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_1/optimizer.bin
36
+ 01/07/2026 20:16:55 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_1/scheduler.bin
37
+ 01/07/2026 20:16:55 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_1/sampler.bin
38
+ 01/07/2026 20:16:55 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_1/sampler_1.bin
39
+ 01/07/2026 20:16:55 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_1/random_states_0.pkl
40
+ 01/07/2026 20:34:11 - INFO - __main__ - epoch 2: perplexity: 51.096929297639356 eval_loss: 3.9933700561523438 eval_nwp_loss: 3.9337244033813477 eval_attn_loss: 0.119290791451931
41
+ 01/07/2026 20:34:11 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_2
42
+ 01/07/2026 20:34:11 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
43
+ 01/07/2026 20:34:11 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_2/model.safetensors
44
+ 01/07/2026 20:34:12 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_2/optimizer.bin
45
+ 01/07/2026 20:34:12 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_2/scheduler.bin
46
+ 01/07/2026 20:34:12 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_2/sampler.bin
47
+ 01/07/2026 20:34:12 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_2/sampler_1.bin
48
+ 01/07/2026 20:34:12 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_2/random_states_0.pkl
49
+ 01/07/2026 20:49:31 - INFO - __main__ - epoch 3: perplexity: 49.303613736377336 eval_loss: 3.95656156539917 eval_nwp_loss: 3.8979973793029785 eval_attn_loss: 0.11712862551212311
50
+ 01/07/2026 20:49:31 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_3
51
+ 01/07/2026 20:49:31 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
52
+ 01/07/2026 20:49:32 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_3/model.safetensors
53
+ 01/07/2026 20:49:32 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_3/optimizer.bin
54
+ 01/07/2026 20:49:32 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_3/scheduler.bin
55
+ 01/07/2026 20:49:32 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_3/sampler.bin
56
+ 01/07/2026 20:49:32 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_3/sampler_1.bin
57
+ 01/07/2026 20:49:32 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_3/random_states_0.pkl
58
+ 01/07/2026 21:04:38 - INFO - __main__ - epoch 4: perplexity: 48.16577995291985 eval_loss: 3.932523488998413 eval_nwp_loss: 3.8746488094329834 eval_attn_loss: 0.11575036495923996
59
+ 01/07/2026 21:04:38 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_4
60
+ 01/07/2026 21:04:38 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
61
+ 01/07/2026 21:04:38 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_4/model.safetensors
62
+ 01/07/2026 21:04:38 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_4/optimizer.bin
63
+ 01/07/2026 21:04:38 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_4/scheduler.bin
64
+ 01/07/2026 21:04:38 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_4/sampler.bin
65
+ 01/07/2026 21:04:38 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_4/sampler_1.bin
66
+ 01/07/2026 21:04:38 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_4/random_states_0.pkl
67
+ 01/07/2026 21:22:56 - INFO - __main__ - epoch 5: perplexity: 47.29216797480242 eval_loss: 3.9137611389160156 eval_nwp_loss: 3.856344699859619 eval_attn_loss: 0.11483234912157059
68
+ 01/07/2026 21:22:56 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_5
69
+ 01/07/2026 21:22:56 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
70
+ 01/07/2026 21:22:56 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_5/model.safetensors
71
+ 01/07/2026 21:22:56 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_5/optimizer.bin
72
+ 01/07/2026 21:22:56 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_5/scheduler.bin
73
+ 01/07/2026 21:22:56 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_5/sampler.bin
74
+ 01/07/2026 21:22:56 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_5/sampler_1.bin
75
+ 01/07/2026 21:22:56 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_5/random_states_0.pkl
76
+ 01/07/2026 21:40:35 - INFO - __main__ - epoch 6: perplexity: 47.01982156355468 eval_loss: 3.9077868461608887 eval_nwp_loss: 3.850569248199463 eval_attn_loss: 0.11443594098091125
77
+ 01/07/2026 21:40:35 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_6
78
+ 01/07/2026 21:40:35 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
79
+ 01/07/2026 21:40:36 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_6/model.safetensors
80
+ 01/07/2026 21:40:36 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_6/optimizer.bin
81
+ 01/07/2026 21:40:36 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_6/scheduler.bin
82
+ 01/07/2026 21:40:36 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_6/sampler.bin
83
+ 01/07/2026 21:40:36 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_6/sampler_1.bin
84
+ 01/07/2026 21:40:36 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_6/random_states_0.pkl
85
+ 01/07/2026 21:57:58 - INFO - __main__ - epoch 7: perplexity: 46.737497877562554 eval_loss: 3.9014720916748047 eval_nwp_loss: 3.8445467948913574 eval_attn_loss: 0.11385107785463333
86
+ 01/07/2026 21:57:58 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_7
87
+ 01/07/2026 21:57:58 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
88
+ 01/07/2026 21:57:58 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_7/model.safetensors
89
+ 01/07/2026 21:57:58 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_7/optimizer.bin
90
+ 01/07/2026 21:57:58 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_7/scheduler.bin
91
+ 01/07/2026 21:57:58 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_7/sampler.bin
92
+ 01/07/2026 21:57:58 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_7/sampler_1.bin
93
+ 01/07/2026 21:57:58 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_7/random_states_0.pkl
94
+ 01/07/2026 22:16:27 - INFO - __main__ - epoch 8: perplexity: 46.53700811484802 eval_loss: 3.8968794345855713 eval_nwp_loss: 3.840247869491577 eval_attn_loss: 0.11326242983341217
95
+ 01/07/2026 22:16:27 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_8
96
+ 01/07/2026 22:16:27 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
97
+ 01/07/2026 22:16:27 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_8/model.safetensors
98
+ 01/07/2026 22:16:27 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_8/optimizer.bin
99
+ 01/07/2026 22:16:27 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_8/scheduler.bin
100
+ 01/07/2026 22:16:27 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_8/sampler.bin
101
+ 01/07/2026 22:16:27 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_8/sampler_1.bin
102
+ 01/07/2026 22:16:27 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_8/random_states_0.pkl
103
+ 01/07/2026 22:35:11 - INFO - __main__ - epoch 9: perplexity: 46.59282867092637 eval_loss: 3.8980274200439453 eval_nwp_loss: 3.8414466381073 eval_attn_loss: 0.11316093802452087
104
+ 01/07/2026 22:35:11 - INFO - accelerate.accelerator - Saving current state to experiments/42/epoch_9
105
+ 01/07/2026 22:35:11 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
106
+ 01/07/2026 22:35:12 - INFO - accelerate.checkpointing - Model weights saved in experiments/42/epoch_9/model.safetensors
107
+ 01/07/2026 22:35:12 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/42/epoch_9/optimizer.bin
108
+ 01/07/2026 22:35:12 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/42/epoch_9/scheduler.bin
109
+ 01/07/2026 22:35:12 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/42/epoch_9/sampler.bin
110
+ 01/07/2026 22:35:12 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/42/epoch_9/sampler_1.bin
111
+ 01/07/2026 22:35:12 - INFO - accelerate.checkpointing - Random states saved in experiments/42/epoch_9/random_states_0.pkl