ptsv/olmo-7b-lora_tinystories_baseline

Browse files

Files changed (5) hide show

README.md +61 -0
adapter_config.json +38 -0
adapter_model.safetensors +3 -0
trainer_state.json +1367 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: allenai/OLMo-7B-0724-hf
+tags:
+- generated_from_trainer
+model-index:
+- name: tinystories_baseline
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/ptsvil/tom-training/runs/84suv6on)
+# tinystories_baseline
+This model is a fine-tuned version of [allenai/OLMo-7B-0724-hf](https://huggingface.co/allenai/OLMo-7B-0724-hf) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- Loss: 2.2266
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.001
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 128
+- optimizer: paged_adamw_8bit with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 1
+- num_epochs: 3
+- mixed_precision_training: Native AMP
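+### Training results
+No intermediate evaluation results were logged during training: the evaluation interval (400 steps) exceeds the total number of training steps (375).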
+### Framework versions
+- PEFT 0.14.0
+- Transformers 4.47.0
+- Pytorch 2.4.1+cu121
+- Datasets 3.2.0
+- Tokenizers 0.21.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "allenai/OLMo-7B-0724-hf",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "lm_head",
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6779fb767ecd6aeedd8633755a220532b5fc1064f0ac22dde0c008ca28c9924c
+size 1151021048

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1367 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 400,
+  "global_step": 375,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5721463561058044,
+      "learning_rate": 0.000997326203208556,
+      "loss": 22.6723,
+      "step": 2
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.5221903324127197,
+      "learning_rate": 0.0009919786096256684,
+      "loss": 22.7502,
+      "step": 4
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 3.269012212753296,
+      "learning_rate": 0.0009866310160427808,
+      "loss": 22.2706,
+      "step": 6
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 4.567020416259766,
+      "learning_rate": 0.0009812834224598931,
+      "loss": 21.3625,
+      "step": 8
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 7.019204139709473,
+      "learning_rate": 0.0009759358288770054,
+      "loss": 20.7279,
+      "step": 10
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 8.498096466064453,
+      "learning_rate": 0.0009705882352941176,
+      "loss": 20.8221,
+      "step": 12
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 7.8151397705078125,
+      "learning_rate": 0.00096524064171123,
+      "loss": 20.4136,
+      "step": 14
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 8.028499603271484,
+      "learning_rate": 0.0009598930481283422,
+      "loss": 20.2719,
+      "step": 16
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 8.516434669494629,
+      "learning_rate": 0.0009545454545454546,
+      "loss": 20.1681,
+      "step": 18
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 8.52490520477295,
+      "learning_rate": 0.0009491978609625669,
+      "loss": 19.8895,
+      "step": 20
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 6.709629058837891,
+      "learning_rate": 0.0009438502673796791,
+      "loss": 19.93,
+      "step": 22
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 6.038687705993652,
+      "learning_rate": 0.0009385026737967914,
+      "loss": 19.6312,
+      "step": 24
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 5.785665512084961,
+      "learning_rate": 0.0009331550802139037,
+      "loss": 19.7683,
+      "step": 26
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 5.79067850112915,
+      "learning_rate": 0.0009278074866310161,
+      "loss": 19.6965,
+      "step": 28
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 5.166928291320801,
+      "learning_rate": 0.0009224598930481284,
+      "loss": 19.4005,
+      "step": 30
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 4.578023433685303,
+      "learning_rate": 0.0009171122994652407,
+      "loss": 19.3963,
+      "step": 32
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 4.7540693283081055,
+      "learning_rate": 0.0009117647058823529,
+      "loss": 19.4129,
+      "step": 34
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 5.394408226013184,
+      "learning_rate": 0.0009064171122994653,
+      "loss": 19.5821,
+      "step": 36
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 4.4902753829956055,
+      "learning_rate": 0.0009010695187165776,
+      "loss": 19.6562,
+      "step": 38
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 5.49019193649292,
+      "learning_rate": 0.0008957219251336899,
+      "loss": 19.3588,
+      "step": 40
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 4.184142589569092,
+      "learning_rate": 0.0008903743315508022,
+      "loss": 18.9032,
+      "step": 42
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 3.98618483543396,
+      "learning_rate": 0.0008850267379679144,
+      "loss": 19.1882,
+      "step": 44
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 4.851687908172607,
+      "learning_rate": 0.0008796791443850267,
+      "loss": 19.4565,
+      "step": 46
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 4.108444690704346,
+      "learning_rate": 0.0008743315508021391,
+      "loss": 19.6149,
+      "step": 48
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.7055838108062744,
+      "learning_rate": 0.0008689839572192514,
+      "loss": 18.9573,
+      "step": 50
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 4.930137634277344,
+      "learning_rate": 0.0008636363636363636,
+      "loss": 19.4389,
+      "step": 52
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 3.910098075866699,
+      "learning_rate": 0.000858288770053476,
+      "loss": 19.1465,
+      "step": 54
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 4.0127716064453125,
+      "learning_rate": 0.0008529411764705882,
+      "loss": 19.5038,
+      "step": 56
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 4.495028018951416,
+      "learning_rate": 0.0008475935828877005,
+      "loss": 19.3252,
+      "step": 58
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 3.7703821659088135,
+      "learning_rate": 0.0008422459893048129,
+      "loss": 19.0238,
+      "step": 60
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 3.6335291862487793,
+      "learning_rate": 0.0008368983957219252,
+      "loss": 19.1296,
+      "step": 62
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 3.819183588027954,
+      "learning_rate": 0.0008315508021390374,
+      "loss": 18.4946,
+      "step": 64
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 3.3171255588531494,
+      "learning_rate": 0.0008262032085561497,
+      "loss": 18.8054,
+      "step": 66
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 4.316566467285156,
+      "learning_rate": 0.000820855614973262,
+      "loss": 19.162,
+      "step": 68
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 3.39648175239563,
+      "learning_rate": 0.0008155080213903744,
+      "loss": 18.5671,
+      "step": 70
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 3.7200136184692383,
+      "learning_rate": 0.0008101604278074867,
+      "loss": 18.9179,
+      "step": 72
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 3.6730430126190186,
+      "learning_rate": 0.0008048128342245989,
+      "loss": 18.7162,
+      "step": 74
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 3.5580945014953613,
+      "learning_rate": 0.0007994652406417113,
+      "loss": 19.0574,
+      "step": 76
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 3.4793589115142822,
+      "learning_rate": 0.0007941176470588235,
+      "loss": 18.8649,
+      "step": 78
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 4.074679374694824,
+      "learning_rate": 0.0007887700534759359,
+      "loss": 18.5553,
+      "step": 80
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 3.315810441970825,
+      "learning_rate": 0.0007834224598930482,
+      "loss": 18.2136,
+      "step": 82
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 4.288172721862793,
+      "learning_rate": 0.0007780748663101605,
+      "loss": 18.6089,
+      "step": 84
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 3.5749149322509766,
+      "learning_rate": 0.0007727272727272727,
+      "loss": 18.8697,
+      "step": 86
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 3.608825206756592,
+      "learning_rate": 0.000767379679144385,
+      "loss": 18.4129,
+      "step": 88
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 3.5199592113494873,
+      "learning_rate": 0.0007620320855614974,
+      "loss": 18.1619,
+      "step": 90
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 3.5022549629211426,
+      "learning_rate": 0.0007566844919786096,
+      "loss": 18.7368,
+      "step": 92
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 3.6002230644226074,
+      "learning_rate": 0.000751336898395722,
+      "loss": 18.7792,
+      "step": 94
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 4.682362079620361,
+      "learning_rate": 0.0007459893048128342,
+      "loss": 18.5495,
+      "step": 96
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 3.6108767986297607,
+      "learning_rate": 0.0007406417112299465,
+      "loss": 18.7077,
+      "step": 98
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 3.4719815254211426,
+      "learning_rate": 0.0007352941176470589,
+      "loss": 18.3262,
+      "step": 100
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 4.4115986824035645,
+      "learning_rate": 0.0007299465240641712,
+      "loss": 18.3416,
+      "step": 102
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 3.324169158935547,
+      "learning_rate": 0.0007245989304812834,
+      "loss": 18.7297,
+      "step": 104
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 3.4287421703338623,
+      "learning_rate": 0.0007192513368983958,
+      "loss": 18.4499,
+      "step": 106
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 3.9451239109039307,
+      "learning_rate": 0.000713903743315508,
+      "loss": 18.2669,
+      "step": 108
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 3.5031988620758057,
+      "learning_rate": 0.0007085561497326202,
+      "loss": 18.8895,
+      "step": 110
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 3.5174903869628906,
+      "learning_rate": 0.0007032085561497327,
+      "loss": 18.2961,
+      "step": 112
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 4.080729961395264,
+      "learning_rate": 0.0006978609625668449,
+      "loss": 18.5613,
+      "step": 114
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 3.7523930072784424,
+      "learning_rate": 0.0006925133689839572,
+      "loss": 18.5538,
+      "step": 116
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 3.066669225692749,
+      "learning_rate": 0.0006871657754010695,
+      "loss": 18.6904,
+      "step": 118
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 4.274256706237793,
+      "learning_rate": 0.0006818181818181818,
+      "loss": 18.6147,
+      "step": 120
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 3.690139055252075,
+      "learning_rate": 0.0006764705882352942,
+      "loss": 18.1693,
+      "step": 122
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 3.6681807041168213,
+      "learning_rate": 0.0006711229946524065,
+      "loss": 18.2498,
+      "step": 124
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 3.5203354358673096,
+      "learning_rate": 0.0006657754010695187,
+      "loss": 18.4522,
+      "step": 126
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 4.650991439819336,
+      "learning_rate": 0.000660427807486631,
+      "loss": 18.2839,
+      "step": 128
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 3.7944228649139404,
+      "learning_rate": 0.0006550802139037433,
+      "loss": 18.051,
+      "step": 130
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 3.2437500953674316,
+      "learning_rate": 0.0006497326203208556,
+      "loss": 18.1842,
+      "step": 132
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 3.2863543033599854,
+      "learning_rate": 0.000644385026737968,
+      "loss": 18.2304,
+      "step": 134
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 3.553260326385498,
+      "learning_rate": 0.0006390374331550802,
+      "loss": 18.1385,
+      "step": 136
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 3.4277195930480957,
+      "learning_rate": 0.0006336898395721925,
+      "loss": 18.1337,
+      "step": 138
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 3.974073886871338,
+      "learning_rate": 0.0006283422459893048,
+      "loss": 18.0326,
+      "step": 140
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 3.3450510501861572,
+      "learning_rate": 0.0006229946524064172,
+      "loss": 18.2695,
+      "step": 142
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 3.2181997299194336,
+      "learning_rate": 0.0006176470588235294,
+      "loss": 18.0315,
+      "step": 144
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 3.8346364498138428,
+      "learning_rate": 0.0006122994652406418,
+      "loss": 18.4272,
+      "step": 146
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 3.2085418701171875,
+      "learning_rate": 0.000606951871657754,
+      "loss": 18.1768,
+      "step": 148
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 3.462108850479126,
+      "learning_rate": 0.0006016042780748662,
+      "loss": 18.1731,
+      "step": 150
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 3.444965362548828,
+      "learning_rate": 0.0005962566844919787,
+      "loss": 18.3599,
+      "step": 152
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 3.3701171875,
+      "learning_rate": 0.0005909090909090909,
+      "loss": 18.1495,
+      "step": 154
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 3.5145843029022217,
+      "learning_rate": 0.0005855614973262032,
+      "loss": 18.0835,
+      "step": 156
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 3.4785313606262207,
+      "learning_rate": 0.0005802139037433155,
+      "loss": 17.8138,
+      "step": 158
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 3.9735538959503174,
+      "learning_rate": 0.0005748663101604278,
+      "loss": 18.0071,
+      "step": 160
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 3.650447368621826,
+      "learning_rate": 0.00056951871657754,
+      "loss": 18.0124,
+      "step": 162
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 3.6459813117980957,
+      "learning_rate": 0.0005641711229946525,
+      "loss": 18.0059,
+      "step": 164
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 3.2154831886291504,
+      "learning_rate": 0.0005588235294117647,
+      "loss": 17.9694,
+      "step": 166
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 3.367403507232666,
+      "learning_rate": 0.0005534759358288771,
+      "loss": 17.6557,
+      "step": 168
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 3.9948298931121826,
+      "learning_rate": 0.0005481283422459893,
+      "loss": 18.1942,
+      "step": 170
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 3.3495073318481445,
+      "learning_rate": 0.0005427807486631015,
+      "loss": 18.2016,
+      "step": 172
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 3.373162269592285,
+      "learning_rate": 0.000537433155080214,
+      "loss": 18.0422,
+      "step": 174
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 4.063633441925049,
+      "learning_rate": 0.0005320855614973262,
+      "loss": 18.0809,
+      "step": 176
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 3.4912514686584473,
+      "learning_rate": 0.0005267379679144385,
+      "loss": 18.0674,
+      "step": 178
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 3.5900015830993652,
+      "learning_rate": 0.0005213903743315508,
+      "loss": 17.9285,
+      "step": 180
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 4.066802024841309,
+      "learning_rate": 0.0005160427807486631,
+      "loss": 18.1551,
+      "step": 182
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 3.9782357215881348,
+      "learning_rate": 0.0005106951871657754,
+      "loss": 18.0509,
+      "step": 184
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 3.314682960510254,
+      "learning_rate": 0.0005053475935828878,
+      "loss": 17.7608,
+      "step": 186
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 3.3548595905303955,
+      "learning_rate": 0.0005,
+      "loss": 17.8103,
+      "step": 188
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 3.3475797176361084,
+      "learning_rate": 0.0004946524064171123,
+      "loss": 17.9465,
+      "step": 190
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 3.4256432056427,
+      "learning_rate": 0.0004893048128342246,
+      "loss": 17.6619,
+      "step": 192
+    },
+    {
+      "epoch": 1.552,
+      "grad_norm": 3.390056848526001,
+      "learning_rate": 0.0004839572192513369,
+      "loss": 17.9681,
+      "step": 194
+    },
+    {
+      "epoch": 1.568,
+      "grad_norm": 3.4441208839416504,
+      "learning_rate": 0.00047860962566844924,
+      "loss": 17.9407,
+      "step": 196
+    },
+    {
+      "epoch": 1.584,
+      "grad_norm": 3.2374165058135986,
+      "learning_rate": 0.0004732620320855615,
+      "loss": 17.7235,
+      "step": 198
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 3.5628514289855957,
+      "learning_rate": 0.0004679144385026738,
+      "loss": 18.1743,
+      "step": 200
+    },
+    {
+      "epoch": 1.616,
+      "grad_norm": 3.41139554977417,
+      "learning_rate": 0.00046256684491978613,
+      "loss": 17.8456,
+      "step": 202
+    },
+    {
+      "epoch": 1.6320000000000001,
+      "grad_norm": 3.423110008239746,
+      "learning_rate": 0.0004572192513368984,
+      "loss": 17.6656,
+      "step": 204
+    },
+    {
+      "epoch": 1.6480000000000001,
+      "grad_norm": 3.3344337940216064,
+      "learning_rate": 0.00045187165775401067,
+      "loss": 17.962,
+      "step": 206
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 3.5036981105804443,
+      "learning_rate": 0.000446524064171123,
+      "loss": 18.0875,
+      "step": 208
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 3.4953839778900146,
+      "learning_rate": 0.0004411764705882353,
+      "loss": 17.3435,
+      "step": 210
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 3.6864068508148193,
+      "learning_rate": 0.0004358288770053476,
+      "loss": 17.9087,
+      "step": 212
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 3.4755449295043945,
+      "learning_rate": 0.0004304812834224599,
+      "loss": 17.5076,
+      "step": 214
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 3.8116891384124756,
+      "learning_rate": 0.0004251336898395722,
+      "loss": 17.9272,
+      "step": 216
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 3.18284010887146,
+      "learning_rate": 0.0004197860962566845,
+      "loss": 17.7148,
+      "step": 218
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 3.2884979248046875,
+      "learning_rate": 0.0004144385026737968,
+      "loss": 17.8813,
+      "step": 220
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 3.3735768795013428,
+      "learning_rate": 0.00040909090909090913,
+      "loss": 18.0372,
+      "step": 222
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 3.2611794471740723,
+      "learning_rate": 0.00040374331550802143,
+      "loss": 17.3771,
+      "step": 224
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 3.3338570594787598,
+      "learning_rate": 0.00039839572192513367,
+      "loss": 18.4657,
+      "step": 226
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 3.405127763748169,
+      "learning_rate": 0.000393048128342246,
+      "loss": 17.9076,
+      "step": 228
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 3.561793565750122,
+      "learning_rate": 0.0003877005347593583,
+      "loss": 17.8996,
+      "step": 230
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 3.5615479946136475,
+      "learning_rate": 0.00038235294117647055,
+      "loss": 17.6746,
+      "step": 232
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 3.4306275844573975,
+      "learning_rate": 0.0003770053475935829,
+      "loss": 17.7182,
+      "step": 234
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 3.5057003498077393,
+      "learning_rate": 0.0003716577540106952,
+      "loss": 17.8058,
+      "step": 236
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 3.3117101192474365,
+      "learning_rate": 0.0003663101604278075,
+      "loss": 17.8643,
+      "step": 238
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 3.6897945404052734,
+      "learning_rate": 0.0003609625668449198,
+      "loss": 17.8266,
+      "step": 240
+    },
+    {
+      "epoch": 1.936,
+      "grad_norm": 3.7577505111694336,
+      "learning_rate": 0.0003556149732620321,
+      "loss": 18.6381,
+      "step": 242
+    },
+    {
+      "epoch": 1.952,
+      "grad_norm": 3.2401480674743652,
+      "learning_rate": 0.0003502673796791444,
+      "loss": 17.6933,
+      "step": 244
+    },
+    {
+      "epoch": 1.968,
+      "grad_norm": 3.6619515419006348,
+      "learning_rate": 0.0003449197860962567,
+      "loss": 18.0547,
+      "step": 246
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 3.8387668132781982,
+      "learning_rate": 0.000339572192513369,
+      "loss": 17.7932,
+      "step": 248
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 3.390653371810913,
+      "learning_rate": 0.0003342245989304813,
+      "loss": 17.2655,
+      "step": 250
+    },
+    {
+      "epoch": 2.016,
+      "grad_norm": 3.40058970451355,
+      "learning_rate": 0.00032887700534759356,
+      "loss": 17.703,
+      "step": 252
+    },
+    {
+      "epoch": 2.032,
+      "grad_norm": 3.568702220916748,
+      "learning_rate": 0.0003235294117647059,
+      "loss": 17.2042,
+      "step": 254
+    },
+    {
+      "epoch": 2.048,
+      "grad_norm": 3.529431104660034,
+      "learning_rate": 0.0003181818181818182,
+      "loss": 17.5732,
+      "step": 256
+    },
+    {
+      "epoch": 2.064,
+      "grad_norm": 3.3919003009796143,
+      "learning_rate": 0.00031283422459893044,
+      "loss": 17.6191,
+      "step": 258
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 3.878042459487915,
+      "learning_rate": 0.0003074866310160428,
+      "loss": 17.4911,
+      "step": 260
+    },
+    {
+      "epoch": 2.096,
+      "grad_norm": 3.772318124771118,
+      "learning_rate": 0.0003021390374331551,
+      "loss": 17.7258,
+      "step": 262
+    },
+    {
+      "epoch": 2.112,
+      "grad_norm": 3.4453060626983643,
+      "learning_rate": 0.0002967914438502674,
+      "loss": 17.4906,
+      "step": 264
+    },
+    {
+      "epoch": 2.128,
+      "grad_norm": 3.4957454204559326,
+      "learning_rate": 0.0002914438502673797,
+      "loss": 17.5716,
+      "step": 266
+    },
+    {
+      "epoch": 2.144,
+      "grad_norm": 3.530831813812256,
+      "learning_rate": 0.000286096256684492,
+      "loss": 17.4089,
+      "step": 268
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 3.7524755001068115,
+      "learning_rate": 0.0002807486631016043,
+      "loss": 17.7712,
+      "step": 270
+    },
+    {
+      "epoch": 2.176,
+      "grad_norm": 3.297961711883545,
+      "learning_rate": 0.00027540106951871656,
+      "loss": 17.4408,
+      "step": 272
+    },
+    {
+      "epoch": 2.192,
+      "grad_norm": 3.3661088943481445,
+      "learning_rate": 0.0002700534759358289,
+      "loss": 17.6753,
+      "step": 274
+    },
+    {
+      "epoch": 2.208,
+      "grad_norm": 3.646210193634033,
+      "learning_rate": 0.0002647058823529412,
+      "loss": 17.7821,
+      "step": 276
+    },
+    {
+      "epoch": 2.224,
+      "grad_norm": 3.475140333175659,
+      "learning_rate": 0.00025935828877005345,
+      "loss": 17.6129,
+      "step": 278
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 3.4734578132629395,
+      "learning_rate": 0.0002540106951871658,
+      "loss": 17.6856,
+      "step": 280
+    },
+    {
+      "epoch": 2.2560000000000002,
+      "grad_norm": 3.491572380065918,
+      "learning_rate": 0.0002486631016042781,
+      "loss": 17.6071,
+      "step": 282
+    },
+    {
+      "epoch": 2.2720000000000002,
+      "grad_norm": 3.4102542400360107,
+      "learning_rate": 0.0002433155080213904,
+      "loss": 17.352,
+      "step": 284
+    },
+    {
+      "epoch": 2.288,
+      "grad_norm": 3.393477439880371,
+      "learning_rate": 0.00023796791443850268,
+      "loss": 17.2612,
+      "step": 286
+    },
+    {
+      "epoch": 2.304,
+      "grad_norm": 3.112462282180786,
+      "learning_rate": 0.000232620320855615,
+      "loss": 17.3272,
+      "step": 288
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 3.3398191928863525,
+      "learning_rate": 0.00022727272727272727,
+      "loss": 17.5815,
+      "step": 290
+    },
+    {
+      "epoch": 2.336,
+      "grad_norm": 3.5039889812469482,
+      "learning_rate": 0.00022192513368983957,
+      "loss": 17.7557,
+      "step": 292
+    },
+    {
+      "epoch": 2.352,
+      "grad_norm": 3.532892942428589,
+      "learning_rate": 0.0002165775401069519,
+      "loss": 18.0523,
+      "step": 294
+    },
+    {
+      "epoch": 2.368,
+      "grad_norm": 3.2969062328338623,
+      "learning_rate": 0.00021122994652406418,
+      "loss": 17.7496,
+      "step": 296
+    },
+    {
+      "epoch": 2.384,
+      "grad_norm": 3.262855291366577,
+      "learning_rate": 0.00020588235294117645,
+      "loss": 17.793,
+      "step": 298
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 3.459914445877075,
+      "learning_rate": 0.00020053475935828877,
+      "loss": 17.9245,
+      "step": 300
+    },
+    {
+      "epoch": 2.416,
+      "grad_norm": 3.6749696731567383,
+      "learning_rate": 0.00019518716577540107,
+      "loss": 17.7125,
+      "step": 302
+    },
+    {
+      "epoch": 2.432,
+      "grad_norm": 3.266754150390625,
+      "learning_rate": 0.0001898395721925134,
+      "loss": 17.5905,
+      "step": 304
+    },
+    {
+      "epoch": 2.448,
+      "grad_norm": 3.1848971843719482,
+      "learning_rate": 0.00018449197860962566,
+      "loss": 17.523,
+      "step": 306
+    },
+    {
+      "epoch": 2.464,
+      "grad_norm": 3.2962844371795654,
+      "learning_rate": 0.00017914438502673795,
+      "loss": 17.5297,
+      "step": 308
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 3.4688000679016113,
+      "learning_rate": 0.00017379679144385028,
+      "loss": 17.6315,
+      "step": 310
+    },
+    {
+      "epoch": 2.496,
+      "grad_norm": 3.4146833419799805,
+      "learning_rate": 0.00016844919786096257,
+      "loss": 17.5776,
+      "step": 312
+    },
+    {
+      "epoch": 2.512,
+      "grad_norm": 3.3122944831848145,
+      "learning_rate": 0.0001631016042780749,
+      "loss": 17.7264,
+      "step": 314
+    },
+    {
+      "epoch": 2.528,
+      "grad_norm": 3.2939462661743164,
+      "learning_rate": 0.00015775401069518716,
+      "loss": 17.48,
+      "step": 316
+    },
+    {
+      "epoch": 2.544,
+      "grad_norm": 3.8504631519317627,
+      "learning_rate": 0.00015240641711229946,
+      "loss": 17.3854,
+      "step": 318
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 4.062356948852539,
+      "learning_rate": 0.00014705882352941178,
+      "loss": 17.6811,
+      "step": 320
+    },
+    {
+      "epoch": 2.576,
+      "grad_norm": 3.741989850997925,
+      "learning_rate": 0.00014171122994652407,
+      "loss": 17.4078,
+      "step": 322
+    },
+    {
+      "epoch": 2.592,
+      "grad_norm": 3.7287967205047607,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 17.3517,
+      "step": 324
+    },
+    {
+      "epoch": 2.608,
+      "grad_norm": 3.6224465370178223,
+      "learning_rate": 0.00013101604278074866,
+      "loss": 17.254,
+      "step": 326
+    },
+    {
+      "epoch": 2.624,
+      "grad_norm": 3.5674147605895996,
+      "learning_rate": 0.00012566844919786096,
+      "loss": 17.869,
+      "step": 328
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 3.722736358642578,
+      "learning_rate": 0.00012032085561497325,
+      "loss": 17.7399,
+      "step": 330
+    },
+    {
+      "epoch": 2.656,
+      "grad_norm": 3.6463096141815186,
+      "learning_rate": 0.00011497326203208556,
+      "loss": 17.5016,
+      "step": 332
+    },
+    {
+      "epoch": 2.672,
+      "grad_norm": 3.5358524322509766,
+      "learning_rate": 0.00010962566844919786,
+      "loss": 17.0355,
+      "step": 334
+    },
+    {
+      "epoch": 2.6879999999999997,
+      "grad_norm": 3.5321309566497803,
+      "learning_rate": 0.00010427807486631017,
+      "loss": 17.5089,
+      "step": 336
+    },
+    {
+      "epoch": 2.7039999999999997,
+      "grad_norm": 3.4019291400909424,
+      "learning_rate": 9.893048128342247e-05,
+      "loss": 17.3768,
+      "step": 338
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 3.4486570358276367,
+      "learning_rate": 9.358288770053476e-05,
+      "loss": 17.488,
+      "step": 340
+    },
+    {
+      "epoch": 2.7359999999999998,
+      "grad_norm": 3.7740256786346436,
+      "learning_rate": 8.823529411764706e-05,
+      "loss": 17.5768,
+      "step": 342
+    },
+    {
+      "epoch": 2.752,
+      "grad_norm": 3.5659339427948,
+      "learning_rate": 8.288770053475936e-05,
+      "loss": 17.6865,
+      "step": 344
+    },
+    {
+      "epoch": 2.768,
+      "grad_norm": 3.3678972721099854,
+      "learning_rate": 7.754010695187167e-05,
+      "loss": 17.4687,
+      "step": 346
+    },
+    {
+      "epoch": 2.784,
+      "grad_norm": 3.585134506225586,
+      "learning_rate": 7.219251336898395e-05,
+      "loss": 17.536,
+      "step": 348
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 3.6471846103668213,
+      "learning_rate": 6.684491978609626e-05,
+      "loss": 17.6269,
+      "step": 350
+    },
+    {
+      "epoch": 2.816,
+      "grad_norm": 3.533790111541748,
+      "learning_rate": 6.149732620320857e-05,
+      "loss": 17.5771,
+      "step": 352
+    },
+    {
+      "epoch": 2.832,
+      "grad_norm": 3.7971367835998535,
+      "learning_rate": 5.614973262032086e-05,
+      "loss": 17.874,
+      "step": 354
+    },
+    {
+      "epoch": 2.848,
+      "grad_norm": 3.391874074935913,
+      "learning_rate": 5.080213903743316e-05,
+      "loss": 17.2528,
+      "step": 356
+    },
+    {
+      "epoch": 2.864,
+      "grad_norm": 3.069033145904541,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 17.6175,
+      "step": 358
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 3.780275821685791,
+      "learning_rate": 4.0106951871657754e-05,
+      "loss": 17.2663,
+      "step": 360
+    },
+    {
+      "epoch": 2.896,
+      "grad_norm": 3.3377978801727295,
+      "learning_rate": 3.4759358288770055e-05,
+      "loss": 17.3711,
+      "step": 362
+    },
+    {
+      "epoch": 2.912,
+      "grad_norm": 3.356203317642212,
+      "learning_rate": 2.9411764705882354e-05,
+      "loss": 17.6077,
+      "step": 364
+    },
+    {
+      "epoch": 2.928,
+      "grad_norm": 3.302241563796997,
+      "learning_rate": 2.4064171122994652e-05,
+      "loss": 17.4777,
+      "step": 366
+    },
+    {
+      "epoch": 2.944,
+      "grad_norm": 3.73811411857605,
+      "learning_rate": 1.871657754010695e-05,
+      "loss": 17.3149,
+      "step": 368
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 3.392902135848999,
+      "learning_rate": 1.336898395721925e-05,
+      "loss": 17.8118,
+      "step": 370
+    },
+    {
+      "epoch": 2.976,
+      "grad_norm": 3.8080010414123535,
+      "learning_rate": 8.021390374331552e-06,
+      "loss": 17.1875,
+      "step": 372
+    },
+    {
+      "epoch": 2.992,
+      "grad_norm": 3.5202646255493164,
+      "learning_rate": 2.67379679144385e-06,
+      "loss": 17.7556,
+      "step": 374
+    },
+    {
+      "epoch": 3.0,
+      "step": 375,
+      "total_flos": 2.6461914289864704e+17,
+      "train_loss": 18.264725362141927,
+      "train_runtime": 1944.3243,
+      "train_samples_per_second": 24.687,
+      "train_steps_per_second": 0.193
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 2.2290163040161133,
+      "eval_runtime": 83.3238,
+      "eval_samples_per_second": 24.003,
+      "eval_steps_per_second": 3.0,
+      "step": 375
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 2.226619243621826,
+      "eval_runtime": 83.9815,
+      "eval_samples_per_second": 23.815,
+      "eval_steps_per_second": 2.977,
+      "step": 375
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 375,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.6461914289864704e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02e08c5b7c7f54b1dd01657560b2b757c51c561ffc2a7ceb17a677da68fb107e
+size 5368