Model save
- README.md +69 -0
- all_results.json +9 -0
- train_results.json +9 -0
- trainer_state.json +624 -0
README.md ADDED
@@ -0,0 +1,69 @@
---
license: llama3
library_name: peft
tags:
- trl
- sft
- generated_from_trainer
base_model: meta-llama/Meta-Llama-3-8B
datasets:
- generator
model-index:
- name: downstream_0.1p_seed42_level2_syntax
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# downstream_0.1p_seed42_level2_syntax

This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
It achieves the following results on the evaluation set:
- Loss: 1.0375

## Model description

More information needed

## Intended uses & limitations

More information needed
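
As a minimal usage sketch (not part of the auto-generated card): assuming this repository holds only the PEFT/LoRA adapter weights produced by this run, the adapter can be loaded on top of the gated base model roughly as follows. The adapter repo path below is a placeholder; substitute the actual Hub id.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model (gated; requires accepting the llama3 license on the Hub).
base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Attach the adapter saved by this training run.
# "path/to/downstream_0.1p_seed42_level2_syntax" is a placeholder repo id.
model = PeftModel.from_pretrained(base, "path/to/downstream_0.1p_seed42_level2_syntax")

inputs = tokenizer("Hello, world!", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```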

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 0.0002
- train_batch_size: 8
- eval_batch_size: 1
- seed: 42
- distributed_type: multi-GPU
- num_devices: 4
- gradient_accumulation_steps: 4
- total_train_batch_size: 128
- total_eval_batch_size: 4
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_ratio: 0.1
- num_epochs: 1
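
The total train batch size follows from the per-device batch size times devices times accumulation: 8 × 4 × 4 = 128 (and 1 × 4 = 4 for eval). As a hedged sketch only, the list above corresponds roughly to the following `TrainingArguments` (argument names per transformers 4.43; `output_dir` is a placeholder):

```python
from transformers import TrainingArguments

# Sketch of the configuration listed above, not the exact launch script.
# Effective train batch = 8 per device x 4 GPUs x 4 accumulation steps = 128.
args = TrainingArguments(
    output_dir="downstream_0.1p_seed42_level2_syntax",  # placeholder
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    seed=42,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=1,
)
```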

### Training results

| Training Loss | Epoch | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 1.1119 | 0.9994 | 408 | 1.0375 |


### Framework versions

- PEFT 0.11.1
- Transformers 4.43.4
- Pytorch 2.3.1+cu121
- Datasets 2.19.1
- Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 0.9993876301285977,
  "total_flos": 1.2948113606049792e+16,
  "train_loss": 1.1371603935372596,
  "train_runtime": 15464.5644,
  "train_samples": 90000,
  "train_samples_per_second": 3.378,
  "train_steps_per_second": 0.026
}
train_results.json ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 0.9993876301285977,
  "total_flos": 1.2948113606049792e+16,
  "train_loss": 1.1371603935372596,
  "train_runtime": 15464.5644,
  "train_samples": 90000,
  "train_samples_per_second": 3.378,
  "train_steps_per_second": 0.026
}
trainer_state.json ADDED
@@ -0,0 +1,624 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9993876301285977,
  "eval_steps": 500,
  "global_step": 408,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.002449479485609308, "grad_norm": 1.0681437691408622, "learning_rate": 4.8780487804878055e-06, "loss": 1.4232, "step": 1},
    {"epoch": 0.01224739742804654, "grad_norm": 1.136306986221139, "learning_rate": 2.4390243902439026e-05, "loss": 1.3933, "step": 5},
    {"epoch": 0.02449479485609308, "grad_norm": 0.245338434807422, "learning_rate": 4.878048780487805e-05, "loss": 1.3421, "step": 10},
    {"epoch": 0.03674219228413962, "grad_norm": 0.2375193035213865, "learning_rate": 7.317073170731707e-05, "loss": 1.3138, "step": 15},
    {"epoch": 0.04898958971218616, "grad_norm": 0.18791567927506073, "learning_rate": 9.75609756097561e-05, "loss": 1.2873, "step": 20},
    {"epoch": 0.0612369871402327, "grad_norm": 0.12816603940375418, "learning_rate": 0.00012195121951219512, "loss": 1.2549, "step": 25},
    {"epoch": 0.07348438456827924, "grad_norm": 0.11662240238108945, "learning_rate": 0.00014634146341463414, "loss": 1.2099, "step": 30},
    {"epoch": 0.08573178199632578, "grad_norm": 0.08250874631468892, "learning_rate": 0.0001707317073170732, "loss": 1.1902, "step": 35},
    {"epoch": 0.09797917942437231, "grad_norm": 0.09447143113709522, "learning_rate": 0.0001951219512195122, "loss": 1.2005, "step": 40},
    {"epoch": 0.11022657685241886, "grad_norm": 0.0824021009170569, "learning_rate": 0.00019994138413588491, "loss": 1.1864, "step": 45},
    {"epoch": 0.1224739742804654, "grad_norm": 0.0894227247413217, "learning_rate": 0.0001997033749537941, "loss": 1.1574, "step": 50},
    {"epoch": 0.13472137170851195, "grad_norm": 0.07854947144546601, "learning_rate": 0.00019928274457498818, "loss": 1.1595, "step": 55},
    {"epoch": 0.14696876913655849, "grad_norm": 0.08179447168807087, "learning_rate": 0.00019868026344503306, "loss": 1.1668, "step": 60},
    {"epoch": 0.15921616656460502, "grad_norm": 0.09018877740754977, "learning_rate": 0.00019789703509552945, "loss": 1.1517, "step": 65},
    {"epoch": 0.17146356399265156, "grad_norm": 0.08070743453648781, "learning_rate": 0.00019693449412283435, "loss": 1.1557, "step": 70},
    {"epoch": 0.1837109614206981, "grad_norm": 0.17006096808234306, "learning_rate": 0.00019579440356038967, "loss": 1.1265, "step": 75},
    {"epoch": 0.19595835884874463, "grad_norm": 0.07490774235953891, "learning_rate": 0.00019447885164947088, "loss": 1.1411, "step": 80},
    {"epoch": 0.2082057562767912, "grad_norm": 0.07560268552998486, "learning_rate": 0.00019299024801426994, "loss": 1.1346, "step": 85},
    {"epoch": 0.22045315370483773, "grad_norm": 0.06989244321896809, "learning_rate": 0.00019133131924831917, "loss": 1.1373, "step": 90},
    {"epoch": 0.23270055113288426, "grad_norm": 0.07297326140007601, "learning_rate": 0.00018950510392033945, "loss": 1.1262, "step": 95},
    {"epoch": 0.2449479485609308, "grad_norm": 0.08107861966515256, "learning_rate": 0.00018751494700866087, "loss": 1.1266, "step": 100},
    {"epoch": 0.25719534598897736, "grad_norm": 0.07525334503932822, "learning_rate": 0.0001853644937744095, "loss": 1.1337, "step": 105},
    {"epoch": 0.2694427434170239, "grad_norm": 0.07418001338537485, "learning_rate": 0.00018305768308468293, "loss": 1.1527, "step": 110},
    {"epoch": 0.28169014084507044, "grad_norm": 0.07966858826560685, "learning_rate": 0.00018059874019794351, "loss": 1.1275, "step": 115},
    {"epoch": 0.29393753827311697, "grad_norm": 0.06884328126643421, "learning_rate": 0.00017799216902484466, "loss": 1.1142, "step": 120},
    {"epoch": 0.3061849357011635, "grad_norm": 0.07638833793093423, "learning_rate": 0.00017524274387866484, "loss": 1.1489, "step": 125},
    {"epoch": 0.31843233312921004, "grad_norm": 0.07163478075363215, "learning_rate": 0.00017235550073046028, "loss": 1.1334, "step": 130},
    {"epoch": 0.3306797305572566, "grad_norm": 0.07584970266147063, "learning_rate": 0.00016933572798495328, "loss": 1.1394, "step": 135},
    {"epoch": 0.3429271279853031, "grad_norm": 0.0882549132067985, "learning_rate": 0.00016618895679405165, "loss": 1.1266, "step": 140},
    {"epoch": 0.35517452541334965, "grad_norm": 0.0738337228599522, "learning_rate": 0.00016292095092574154, "loss": 1.1356, "step": 145},
    {"epoch": 0.3674219228413962, "grad_norm": 0.07323403324052054, "learning_rate": 0.00015953769620691022, "loss": 1.1448, "step": 150},
    {"epoch": 0.3796693202694427, "grad_norm": 0.07258910733356848, "learning_rate": 0.0001560453895594354, "loss": 1.1255, "step": 155},
    {"epoch": 0.39191671769748926, "grad_norm": 0.08483741713706569, "learning_rate": 0.00015245042764962417, "loss": 1.1203, "step": 160},
    {"epoch": 0.40416411512553585, "grad_norm": 0.07393069983884801, "learning_rate": 0.00014875939517179016, "loss": 1.1305, "step": 165},
    {"epoch": 0.4164115125535824, "grad_norm": 0.07536661950821844, "learning_rate": 0.00014497905278743083, "loss": 1.1142, "step": 170},
    {"epoch": 0.4286589099816289, "grad_norm": 0.0774588990644394, "learning_rate": 0.00014111632474209505, "loss": 1.1014, "step": 175},
    {"epoch": 0.44090630740967546, "grad_norm": 0.0723327812244184, "learning_rate": 0.0001371782861826226, "loss": 1.1215, "step": 180},
    {"epoch": 0.453153704837722, "grad_norm": 0.07454342646966894, "learning_rate": 0.00013317215019798638, "loss": 1.1276, "step": 185},
    {"epoch": 0.46540110226576853, "grad_norm": 0.07195661618627822, "learning_rate": 0.00012910525460747344, "loss": 1.1083, "step": 190},
    {"epoch": 0.47764849969381507, "grad_norm": 0.07092309315305423, "learning_rate": 0.00012498504852040434, "loss": 1.1373, "step": 195},
    {"epoch": 0.4898958971218616, "grad_norm": 0.07301281736550075, "learning_rate": 0.00012081907869200849, "loss": 1.1312, "step": 200},
    {"epoch": 0.5021432945499081, "grad_norm": 0.07484347637628397, "learning_rate": 0.00011661497570044738, "loss": 1.1208, "step": 205},
    {"epoch": 0.5143906919779547, "grad_norm": 0.0724091132876655, "learning_rate": 0.00011238043997030329, "loss": 1.1309, "step": 210},
    {"epoch": 0.5266380894060012, "grad_norm": 0.2342422867496652, "learning_rate": 0.00010812322766813461, "loss": 1.1138, "step": 215},
    {"epoch": 0.5388854868340478, "grad_norm": 0.07212287103404749, "learning_rate": 0.00010385113649593137, "loss": 1.1192, "step": 220},
    {"epoch": 0.5511328842620943, "grad_norm": 0.07073394667449048, "learning_rate": 9.957199140849278e-05, "loss": 1.109, "step": 225},
    {"epoch": 0.5633802816901409, "grad_norm": 0.06964674186116192, "learning_rate": 9.529363028088725e-05, "loss": 1.115, "step": 230},
    {"epoch": 0.5756276791181874, "grad_norm": 0.07117704481271163, "learning_rate": 9.102388955224703e-05, "loss": 1.1099, "step": 235},
    {"epoch": 0.5878750765462339, "grad_norm": 0.07216465095526178, "learning_rate": 8.677058987219295e-05, "loss": 1.113, "step": 240},
    {"epoch": 0.6001224739742804, "grad_norm": 0.0725366248294856, "learning_rate": 8.254152177618e-05, "loss": 1.1047, "step": 245},
    {"epoch": 0.612369871402327, "grad_norm": 0.07979788000565378, "learning_rate": 7.83444314160013e-05, "loss": 1.1275, "step": 250},
    {"epoch": 0.6246172688303735, "grad_norm": 0.07038014346187686, "learning_rate": 7.418700637158742e-05, "loss": 1.0942, "step": 255},
    {"epoch": 0.6368646662584201, "grad_norm": 0.07043699403227373, "learning_rate": 7.00768615700881e-05, "loss": 1.1188, "step": 260},
    {"epoch": 0.6491120636864667, "grad_norm": 0.07317860829882807, "learning_rate": 6.60215253380287e-05, "loss": 1.1228, "step": 265},
    {"epoch": 0.6613594611145132, "grad_norm": 0.0736268694865736, "learning_rate": 6.202842561208758e-05, "loss": 1.1004, "step": 270},
    {"epoch": 0.6736068585425597, "grad_norm": 0.0681966580195897, "learning_rate": 5.810487633375261e-05, "loss": 1.0964, "step": 275},
    {"epoch": 0.6858542559706062, "grad_norm": 0.06988692587157964, "learning_rate": 5.425806405277609e-05, "loss": 1.1123, "step": 280},
    {"epoch": 0.6981016533986528, "grad_norm": 0.06961689512931302, "learning_rate": 5.049503476396627e-05, "loss": 1.1254, "step": 285},
    {"epoch": 0.7103490508266993, "grad_norm": 0.06848007555420067, "learning_rate": 4.682268100142566e-05, "loss": 1.1064, "step": 290},
    {"epoch": 0.7225964482547459, "grad_norm": 0.06848238221490942, "learning_rate": 4.32477292138746e-05, "loss": 1.1078, "step": 295},
    {"epoch": 0.7348438456827924, "grad_norm": 0.06932096702672658, "learning_rate": 3.9776727444184744e-05, "loss": 1.1359, "step": 300},
    {"epoch": 0.747091243110839, "grad_norm": 0.06964742874998163, "learning_rate": 3.641603333568831e-05, "loss": 1.1071, "step": 305},
    {"epoch": 0.7593386405388854, "grad_norm": 0.07515967784857266, "learning_rate": 3.3171802487232086e-05, "loss": 1.114, "step": 310},
    {"epoch": 0.771586037966932, "grad_norm": 0.07140996525669459, "learning_rate": 3.0049977178305076e-05, "loss": 1.1179, "step": 315},
    {"epoch": 0.7838334353949785, "grad_norm": 0.06922024794802567, "learning_rate": 2.7056275484891304e-05, "loss": 1.0962, "step": 320},
    {"epoch": 0.7960808328230251, "grad_norm": 0.07028157088055875, "learning_rate": 2.419618080598417e-05, "loss": 1.1361, "step": 325},
    {"epoch": 0.8083282302510717, "grad_norm": 0.07083633675990936, "learning_rate": 2.1474931819945553e-05, "loss": 1.1025, "step": 330},
    {"epoch": 0.8205756276791182, "grad_norm": 0.07118501791774294, "learning_rate": 1.889751288910645e-05, "loss": 1.0959, "step": 335},
    {"epoch": 0.8328230251071648, "grad_norm": 0.0724941459460009, "learning_rate": 1.6468644930184095e-05, "loss": 1.0963, "step": 340},
    {"epoch": 0.8450704225352113, "grad_norm": 0.07065248333558355, "learning_rate": 1.4192776767238158e-05, "loss": 1.1097, "step": 345},
    {"epoch": 0.8573178199632578, "grad_norm": 0.06638354595318986, "learning_rate": 1.2074076983003958e-05, "loss": 1.1086, "step": 350},
    {"epoch": 0.8695652173913043, "grad_norm": 0.0678250769481932, "learning_rate": 1.0116426283528302e-05, "loss": 1.1164, "step": 355},
    {"epoch": 0.8818126148193509, "grad_norm": 0.06908465334552778, "learning_rate": 8.323410390093522e-06, "loss": 1.1219, "step": 360},
    {"epoch": 0.8940600122473974, "grad_norm": 0.07002593669930346, "learning_rate": 6.698313471448547e-06, "loss": 1.1057, "step": 365},
    {"epoch": 0.906307409675444, "grad_norm": 0.06951335625337747, "learning_rate": 5.244112128377476e-06, "loss": 1.1156, "step": 370},
    {"epoch": 0.9185548071034905, "grad_norm": 0.07086629076783696, "learning_rate": 3.963469941623288e-06, "loss": 1.0996, "step": 375},
    {"epoch": 0.9308022045315371, "grad_norm": 0.07007458613323735, "learning_rate": 2.858732593153246e-06, "loss": 1.1211, "step": 380},
    {"epoch": 0.9430496019595835, "grad_norm": 0.0665201643250434, "learning_rate": 1.9319235697021763e-06, "loss": 1.1165, "step": 385},
    {"epoch": 0.9552969993876301, "grad_norm": 0.06858672635827863, "learning_rate": 1.1847404564628185e-06, "loss": 1.0881, "step": 390},
    {"epoch": 0.9675443968156767, "grad_norm": 0.07063191807948407, "learning_rate": 6.185518277123214e-07, "loss": 1.1031, "step": 395},
    {"epoch": 0.9797917942437232, "grad_norm": 0.06843622739420911, "learning_rate": 2.343947400698432e-07, "loss": 1.1103, "step": 400},
    {"epoch": 0.9920391916717698, "grad_norm": 0.0681228499191145, "learning_rate": 3.2972832976918554e-08, "loss": 1.1119, "step": 405},
    {"epoch": 0.9993876301285977, "eval_loss": 1.037530541419983, "eval_runtime": 2.1401, "eval_samples_per_second": 3.271, "eval_steps_per_second": 0.935, "step": 408},
    {"epoch": 0.9993876301285977, "step": 408, "total_flos": 1.2948113606049792e+16, "train_loss": 1.1371603935372596, "train_runtime": 15464.5644, "train_samples_per_second": 3.378, "train_steps_per_second": 0.026}
  ],
  "logging_steps": 5,
  "max_steps": 408,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2948113606049792e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
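
As a small sketch (not part of the saved files), the `log_history` above can be read back to plot the training-loss curve; this assumes `trainer_state.json` has been downloaded locally and matplotlib is available.

```python
import json
import matplotlib.pyplot as plt

# Recover the loss curve from the Trainer's saved state.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the logging entries that carry a training loss.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses, label="train loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")
```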