xiulinyang commited on
Commit
05faf0b
·
verified ·
1 Parent(s): 7ba33fe

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. all_results.json +1 -0
  2. config.json +31 -0
  3. generation_config.json +6 -0
  4. model.safetensors +3 -0
  5. training.log +119 -0
all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 46.75517415664002}
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 32768,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 32769,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 512,
14
+ "n_head": 8,
15
+ "n_inner": null,
16
+ "n_layer": 4,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.37.0",
29
+ "use_cache": true,
30
+ "vocab_size": 32770
31
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 32768,
4
+ "eos_token_id": 32769,
5
+ "transformers_version": "4.37.0"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb3e6da1653a70ee41206c0003cbf6f58d44519605248bbaf0b03d784d7e5932
3
+ size 119657520
training.log ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 01/07/2026 23:04:10 - INFO - __main__ - Distributed environment: DistributedType.NO
2
+ Num processes: 1
3
+ Process index: 0
4
+ Local process index: 0
5
+ Device: cuda
6
+
7
+ Mixed precision type: no
8
+
9
+ 01/07/2026 23:04:10 - INFO - __main__ - Arguments:
10
+ 01/07/2026 23:04:10 - INFO - __main__ - Namespace(train_file='data/preprocessed/dependency/train.sequential=False.random=False.convert_method=exponential.jsonl', validation_file='data/preprocessed/dependency/val.sequential=False.random=False.convert_method=exponential.jsonl', model_name_or_path=None, per_device_train_batch_size=32, per_device_eval_batch_size=32, learning_rate=0.0001, weight_decay=0.0, num_train_epochs=10, max_train_steps=None, gradient_accumulation_steps=1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, num_warmup_steps=0, output_dir='experiments/96/', seed=96, block_size=512, preprocessing_num_workers=None, overwrite_cache=False, trust_remote_code=False, checkpointing_steps='epoch', resume_from_checkpoint=None, with_tracking=True, report_to='wandb', low_cpu_mem_usage=False, n_positions=1024, n_embd=512, n_layer=4, n_head=8, n_inner=None, activation_function='gelu_new', resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-05, initializer_range=0.02, attn_loss_weight=0.5, attn_loss_layers=[3], attn_loss_heads=[0], attn_loss_reduction='none')
11
+ 01/07/2026 23:05:14 - INFO - __main__ - Training new model from scratch
12
+ 01/07/2026 23:12:01 - INFO - __main__ - Sample 777192 of the training set: {'token_ids': [32768, 6826, 17525, 1197, 269, 32769], 'attn_matrix': [[1.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.5, 0.0, 0.0, 0.0], [0.2119415576170854, 0.2119415576170854, 0.5761168847658292, 0.0, 0.0], [0.1748777045271094, 0.1748777045271094, 0.47536688641867175, 0.1748777045271094, 0.0], [0.14884758120207756, 0.14884758120207756, 0.4046096751916897, 0.14884758120207756, 0.14884758120207756]], 'word_token_membership_mask': [[True, False, False, False, False, False], [False, True, False, False, False, False], [False, False, True, False, False, False], [False, False, False, True, False, False], [False, False, False, False, True, False], [False, False, False, False, False, True]], 'input_ids': [32768, 6826, 17525, 1197, 269, 32769], 'row_word_token_membership_mask': [[True, False, False, False, False], [False, True, False, False, False], [False, False, True, False, False], [False, False, False, True, False], [False, False, False, False, True]], 'col_word_token_membership_mask': [[True, False, False, False, False], [False, True, False, False, False], [False, False, True, False, False], [False, False, False, True, False], [False, False, False, False, True]]}.
13
+ 01/07/2026 23:12:01 - INFO - __main__ - Sample 659895 of the training set: {'token_ids': [32768, 43, 322, 294, 329, 389, 269, 32769], 'attn_matrix': [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.5, 0.0, 0.0, 0.0, 0.0], [0.33333333333333337, 0.33333333333333337, 0.33333333333333337, 0.0, 0.0, 0.0], [0.25, 0.25, 0.25, 0.25, 0.0, 0.0], [0.14884758120207756, 0.14884758120207756, 0.14884758120207756, 0.14884758120207756, 0.4046096751916897, 0.0], [0.1295625143296497, 0.1295625143296497, 0.1295625143296497, 0.1295625143296497, 0.35218742835175154, 0.1295625143296497]], 'word_token_membership_mask': [[True, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False], [False, False, True, False, False, False, False, False], [False, False, False, True, True, False, False, False], [False, False, False, False, False, True, False, False], [False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, True]], 'input_ids': [32768, 43, 322, 294, 329, 389, 269, 32769], 'row_word_token_membership_mask': [[True, False, False, False, False, False, False], [False, True, False, False, False, False, False], [False, False, True, True, False, False, False], [False, False, False, False, True, False, False], [False, False, False, False, False, True, False], [False, False, False, False, False, False, True]], 'col_word_token_membership_mask': [[True, False, False, False, False, False, False], [False, True, False, False, False, False, False], [False, False, True, False, False, False, False], [False, False, False, True, True, False, False], [False, False, False, False, False, True, False], [False, False, False, False, False, False, True]]}.
14
+ 01/07/2026 23:12:01 - INFO - __main__ - Sample 830528 of the training set: {'token_ids': [32768, 7366, 558, 285, 11452, 307, 706, 325, 372, 28218, 9165, 373, 285, 6026, 612, 5837, 285, 322, 335, 20961, 19280, 440, 269, 32769], 'attn_matrix': [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.26894142136999516, 0.7310585786300049, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.15536240349696362, 0.42231879825151825, 0.42231879825151825, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.1092317725730359, 0.2969227424756547, 0.2969227424756547, 0.2969227424756547, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0604452338466834, 0.16430718078239712, 0.16430718078239712, 0.16430718078239712, 0.4466332238061253, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.041783385623932086, 0.1135790178730315, 0.1135790178730315, 0.1135790178730315, 0.3087397803784867, 0.3087397803784867, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.2772499979105108, 0.10199457429610227, 0.10199457429610227, 0.10199457429610227, 0.2772499979105108, 0.03752170699456926, 0.10199457429610227, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.2170679180771751, 0.0798548243984796, 0.0798548243984796, 0.0798548243984796, 0.2170679180771751, 0.029376948174556335, 0.0798548243984796, 0.2170679180771751, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.17835316735660656, 0.06561246353830512, 0.06561246353830512, 0.06561246353830512, 0.17835316735660656, 0.024137476420353323, 0.06561246353830512, 0.17835316735660656, 0.17835316735660656, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.1201181740414825, 0.04418900674091463, 0.04418900674091463, 0.04418900674091463, 0.1201181740414825, 0.016256227105768774, 0.04418900674091463, 0.1201181740414825, 0.1201181740414825, 0.32651504976464285, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.09055168583484556, 0.03331210358205499, 0.03331210358205499, 0.03331210358205499, 0.09055168583484556, 0.012254838050011593, 0.03331210358205499, 0.09055168583484556, 0.09055168583484556, 0.2461450021411931, 0.2461450021411931, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.05425208058874275, 0.01995822508937474, 0.01995822508937474, 0.01995822508937474, 0.05425208058874275, 0.007342220692653039, 0.01995822508937474, 0.05425208058874275, 0.05425208058874275, 0.1474724448204751, 0.1474724448204751, 0.4008716669539268, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.03872737372632366, 0.014247004604477548, 0.014247004604477548, 0.014247004604477548, 0.03872737372632366, 0.0052411800922621665, 0.014247004604477548, 0.03872737372632366, 0.03872737372632366, 0.10527191626420788, 0.10527191626420788, 0.2861587370280586, 0.2861587370280586, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.021783139087840513, 0.00801356903459457, 0.00801356903459457, 0.00801356903459457, 0.021783139087840513, 0.0029480272982354253, 0.00801356903459457, 0.021783139087840513, 0.021783139087840513, 0.05921271114927281, 0.05921271114927281, 0.16095683673086258, 0.16095683673086258, 0.4375260444517533, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.018763091269766444, 0.0069025555309704485, 0.0069025555309704485, 0.0069025555309704485, 0.018763091269766444, 0.002539308271388257, 0.0069025555309704485, 0.018763091269766444, 0.018763091269766444, 0.05100337004432468, 0.05100337004432468, 0.13864153398166018, 0.13864153398166018, 0.3768667624920342, 0.13864153398166018, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.016478488365126387, 0.00606209709111281, 0.00606209709111281, 0.00606209709111281, 0.016478488365126387, 0.0022301208902056066, 0.00606209709111281, 0.016478488365126387, 0.016478488365126387, 0.044793175483396855, 0.044793175483396855, 0.12176047495549487, 0.12176047495549487, 0.3309792864960644, 0.12176047495549487, 0.12176047495549487, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.012380724878527333, 0.004554614149610009, 0.004554614149610009, 0.004554614149610009, 0.012380724878527333, 0.0016755489081100741, 0.004554614149610009, 0.012380724878527333, 0.012380724878527333, 0.03365429946045167, 0.03365429946045167, 0.09148187067286483, 0.09148187067286483, 0.24867350668348898, 0.09148187067286483, 0.09148187067286483, 0.24867350668348898, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0073872230527096015, 0.0027176074884396047, 0.0027176074884396047, 0.0027176074884396047, 0.0073872230527096015, 0.0009997519241704885, 0.0027176074884396047, 0.0073872230527096015, 0.0073872230527096015, 0.020080554186954265, 0.020080554186954265, 0.05458460555178497, 0.05458460555178497, 0.14837634138502181, 0.05458460555178497, 0.05458460555178497, 0.14837634138502181, 0.4033287125601406, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0052640717649344165, 0.001936543779170441, 0.001936543779170441, 0.001936543779170441, 0.0052640717649344165, 0.000712414643285255, 0.001936543779170441, 0.0052640717649344165, 0.0052640717649344165, 0.01430923062232556, 0.01430923062232556, 0.038896521579897286, 0.038896521579897286, 0.1057317078008999, 0.038896521579897286, 0.038896521579897286, 0.1057317078008999, 0.28740858000712766, 0.28740858000712766, 0.0, 0.0, 0.0, 0.0], [0.0029552558822721424, 0.0010871778824888935, 0.0010871778824888935, 0.0010871778824888935, 0.0029552558822721424, 0.00039995039186396607, 0.0010871778824888935, 0.0029552558822721424, 0.0029552558822721424, 0.008033218363227068, 0.008033218363227068, 0.02183655150080365, 0.02183655150080365, 0.05935790114084465, 0.02183655150080365, 0.02183655150080365, 0.05935790114084465, 0.16135150404662646, 0.16135150404662646, 0.4385988614444809, 0.0, 0.0, 0.0], [0.003778565454608256, 0.0013900565478710022, 0.0013900565478710022, 0.0013900565478710022, 0.003778565454608256, 0.0005113732260274886, 0.0013900565478710022, 0.003778565454608256, 0.003778565454608256, 0.010271205812904713, 0.010271205812904713, 0.027920032117581796, 0.027920032117581796, 0.07589451595521551, 0.027920032117581796, 0.027920032117581796, 0.07589451595521551, 0.20630268360075749, 0.20630268360075749, 0.07589451595521551, 0.20630268360075749, 0.0, 0.0], [0.09306499868476079, 0.034236699708770815, 0.034236699708770815, 0.034236699708770815, 0.09306499868476079, 0.012594977956417088, 0.034236699708770815, 0.09306499868476079, 0.09306499868476079, 0.25297689479035024, 0.034236699708770815, 0.09306499868476079, 0.012594977956417088, 0.034236699708770815, 0.012594977956417088, 0.012594977956417088, 0.004633433452173354, 0.012594977956417088, 0.0017045449090906002, 0.000627067028607877, 0.0017045449090906002, 0.004633433452173354, 0.0], [0.08514132169335034, 0.03132174184514772, 0.03132174184514772, 0.03132174184514772, 0.08514132169335034, 0.011522624886509125, 0.03132174184514772, 0.08514132169335034, 0.08514132169335034, 0.2314381076100202, 0.03132174184514772, 0.08514132169335034, 0.011522624886509125, 0.03132174184514772, 0.011522624886509125, 0.011522624886509125, 0.004238936804077132, 0.011522624886509125, 0.0015594177026449548, 0.0005736777130018804, 0.0015594177026449548, 0.004238936804077132, 0.08514132169335034]], 'word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True]], 'input_ids': [32768, 7366, 558, 285, 11452, 307, 706, 325, 372, 28218, 9165, 373, 285, 6026, 612, 5837, 285, 322, 335, 20961, 19280, 440, 269, 32769], 'row_word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True]], 'col_word_token_membership_mask': [[True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True]]}.
15
+ 01/07/2026 23:12:01 - INFO - __main__ - ***** Running training *****
16
+ 01/07/2026 23:12:01 - INFO - __main__ - Num examples = 1132837
17
+ 01/07/2026 23:12:01 - INFO - __main__ - Num Epochs = 10
18
+ 01/07/2026 23:12:01 - INFO - __main__ - Instantaneous batch size per device = 32
19
+ 01/07/2026 23:12:01 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 32
20
+ 01/07/2026 23:12:01 - INFO - __main__ - Gradient Accumulation steps = 1
21
+ 01/07/2026 23:12:01 - INFO - __main__ - Total optimization steps = 354020
22
+ 01/07/2026 23:12:01 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_0_init
23
+ 01/07/2026 23:12:01 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
24
+ 01/07/2026 23:12:01 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_0_init/model.safetensors
25
+ 01/07/2026 23:12:01 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_0_init/optimizer.bin
26
+ 01/07/2026 23:12:01 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_0_init/scheduler.bin
27
+ 01/07/2026 23:12:01 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_0_init/sampler.bin
28
+ 01/07/2026 23:12:01 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_0_init/sampler_1.bin
29
+ 01/07/2026 23:12:01 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_0_init/random_states_0.pkl
30
+ 01/07/2026 23:26:50 - INFO - __main__ - epoch 0: perplexity: 61.24869546449607 eval_loss: 4.179543972015381 eval_nwp_loss: 4.11494255065918 eval_attn_loss: 0.12920285761356354
31
+ 01/07/2026 23:26:50 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_1
32
+ 01/07/2026 23:26:50 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
33
+ 01/07/2026 23:26:50 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_1/model.safetensors
34
+ 01/07/2026 23:26:50 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_1/optimizer.bin
35
+ 01/07/2026 23:26:50 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_1/scheduler.bin
36
+ 01/07/2026 23:26:50 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_1/sampler.bin
37
+ 01/07/2026 23:26:50 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_1/sampler_1.bin
38
+ 01/07/2026 23:26:50 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_1/random_states_0.pkl
39
+ 01/07/2026 23:44:26 - INFO - __main__ - epoch 1: perplexity: 54.29593580362274 eval_loss: 4.055508613586426 eval_nwp_loss: 3.9944493770599365 eval_attn_loss: 0.12211816757917404
40
+ 01/07/2026 23:44:26 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_2
41
+ 01/07/2026 23:44:26 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
42
+ 01/07/2026 23:44:26 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_2/model.safetensors
43
+ 01/07/2026 23:44:26 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_2/optimizer.bin
44
+ 01/07/2026 23:44:26 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_2/scheduler.bin
45
+ 01/07/2026 23:44:26 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_2/sampler.bin
46
+ 01/07/2026 23:44:26 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_2/sampler_1.bin
47
+ 01/07/2026 23:44:26 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_2/random_states_0.pkl
48
+ 01/08/2026 00:02:05 - INFO - __main__ - epoch 2: perplexity: 50.87950992936742 eval_loss: 3.9889845848083496 eval_nwp_loss: 3.929460287094116 eval_attn_loss: 0.11904826015233994
49
+ 01/08/2026 00:02:05 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_3
50
+ 01/08/2026 00:02:05 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
51
+ 01/08/2026 00:02:05 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_3/model.safetensors
52
+ 01/08/2026 00:02:06 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_3/optimizer.bin
53
+ 01/08/2026 00:02:06 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_3/scheduler.bin
54
+ 01/08/2026 00:02:06 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_3/sampler.bin
55
+ 01/08/2026 00:02:06 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_3/sampler_1.bin
56
+ 01/08/2026 00:02:06 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_3/random_states_0.pkl
57
+ 01/08/2026 00:20:23 - INFO - __main__ - epoch 3: perplexity: 49.35910454674509 eval_loss: 3.9575889110565186 eval_nwp_loss: 3.8991222381591797 eval_attn_loss: 0.11693348735570908
58
+ 01/08/2026 00:20:23 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_4
59
+ 01/08/2026 00:20:23 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
60
+ 01/08/2026 00:20:23 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_4/model.safetensors
61
+ 01/08/2026 00:20:23 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_4/optimizer.bin
62
+ 01/08/2026 00:20:23 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_4/scheduler.bin
63
+ 01/08/2026 00:20:23 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_4/sampler.bin
64
+ 01/08/2026 00:20:23 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_4/sampler_1.bin
65
+ 01/08/2026 00:20:23 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_4/random_states_0.pkl
66
+ 01/08/2026 00:38:26 - INFO - __main__ - epoch 4: perplexity: 48.175829165813475 eval_loss: 3.9326725006103516 eval_nwp_loss: 3.8748574256896973 eval_attn_loss: 0.11563039571046829
67
+ 01/08/2026 00:38:26 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_5
68
+ 01/08/2026 00:38:26 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
69
+ 01/08/2026 00:38:26 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_5/model.safetensors
70
+ 01/08/2026 00:38:26 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_5/optimizer.bin
71
+ 01/08/2026 00:38:26 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_5/scheduler.bin
72
+ 01/08/2026 00:38:26 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_5/sampler.bin
73
+ 01/08/2026 00:38:26 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_5/sampler_1.bin
74
+ 01/08/2026 00:38:26 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_5/random_states_0.pkl
75
+ 01/08/2026 00:56:19 - INFO - __main__ - epoch 5: perplexity: 47.385371384410426 eval_loss: 3.9157824516296387 eval_nwp_loss: 3.85831356048584 eval_attn_loss: 0.1149379163980484
76
+ 01/08/2026 00:56:19 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_6
77
+ 01/08/2026 00:56:19 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
78
+ 01/08/2026 00:56:19 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_6/model.safetensors
79
+ 01/08/2026 00:56:20 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_6/optimizer.bin
80
+ 01/08/2026 00:56:20 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_6/scheduler.bin
81
+ 01/08/2026 00:56:20 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_6/sampler.bin
82
+ 01/08/2026 00:56:20 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_6/sampler_1.bin
83
+ 01/08/2026 00:56:20 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_6/random_states_0.pkl
84
+ 01/08/2026 01:13:59 - INFO - __main__ - epoch 6: perplexity: 47.240746613031696 eval_loss: 3.912421703338623 eval_nwp_loss: 3.8552567958831787 eval_attn_loss: 0.11432892084121704
85
+ 01/08/2026 01:13:59 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_7
86
+ 01/08/2026 01:13:59 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
87
+ 01/08/2026 01:13:59 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_7/model.safetensors
88
+ 01/08/2026 01:13:59 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_7/optimizer.bin
89
+ 01/08/2026 01:13:59 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_7/scheduler.bin
90
+ 01/08/2026 01:13:59 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_7/sampler.bin
91
+ 01/08/2026 01:13:59 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_7/sampler_1.bin
92
+ 01/08/2026 01:13:59 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_7/random_states_0.pkl
93
+ 01/08/2026 01:32:37 - INFO - __main__ - epoch 7: perplexity: 46.918543944135685 eval_loss: 3.9052536487579346 eval_nwp_loss: 3.8484129905700684 eval_attn_loss: 0.11368121206760406
94
+ 01/08/2026 01:32:37 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_8
95
+ 01/08/2026 01:32:37 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
96
+ 01/08/2026 01:32:37 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_8/model.safetensors
97
+ 01/08/2026 01:32:37 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_8/optimizer.bin
98
+ 01/08/2026 01:32:37 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_8/scheduler.bin
99
+ 01/08/2026 01:32:37 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_8/sampler.bin
100
+ 01/08/2026 01:32:37 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_8/sampler_1.bin
101
+ 01/08/2026 01:32:37 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_8/random_states_0.pkl
102
+ 01/08/2026 01:51:08 - INFO - __main__ - epoch 8: perplexity: 46.69080938866687 eval_loss: 3.900165557861328 eval_nwp_loss: 3.8435473442077637 eval_attn_loss: 0.11323680728673935
103
+ 01/08/2026 01:51:08 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_9
104
+ 01/08/2026 01:51:08 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
105
+ 01/08/2026 01:51:08 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_9/model.safetensors
106
+ 01/08/2026 01:51:08 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_9/optimizer.bin
107
+ 01/08/2026 01:51:08 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_9/scheduler.bin
108
+ 01/08/2026 01:51:08 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_9/sampler.bin
109
+ 01/08/2026 01:51:08 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_9/sampler_1.bin
110
+ 01/08/2026 01:51:08 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_9/random_states_0.pkl
111
+ 01/08/2026 02:08:43 - INFO - __main__ - epoch 9: perplexity: 46.75517415664002 eval_loss: 3.901474714279175 eval_nwp_loss: 3.8449249267578125 eval_attn_loss: 0.11309906840324402
112
+ 01/08/2026 02:08:43 - INFO - accelerate.accelerator - Saving current state to experiments/96/epoch_10
113
+ 01/08/2026 02:08:43 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
114
+ 01/08/2026 02:08:43 - INFO - accelerate.checkpointing - Model weights saved in experiments/96/epoch_10/model.safetensors
115
+ 01/08/2026 02:08:43 - INFO - accelerate.checkpointing - Optimizer state saved in experiments/96/epoch_10/optimizer.bin
116
+ 01/08/2026 02:08:43 - INFO - accelerate.checkpointing - Scheduler state saved in experiments/96/epoch_10/scheduler.bin
117
+ 01/08/2026 02:08:43 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in experiments/96/epoch_10/sampler.bin
118
+ 01/08/2026 02:08:43 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in experiments/96/epoch_10/sampler_1.bin
119
+ 01/08/2026 02:08:43 - INFO - accelerate.checkpointing - Random states saved in experiments/96/epoch_10/random_states_0.pkl