Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

shared/checkpoints/latest/config.json +33 -0
shared/checkpoints/latest/generation_config.json +10 -0
shared/checkpoints/latest/model.safetensors +3 -0
shared/checkpoints/latest/optimizer.pt +3 -0
shared/checkpoints/latest/rng_state.pth +3 -0
shared/checkpoints/latest/scaler.pt +3 -0
shared/checkpoints/latest/scheduler.pt +3 -0
shared/checkpoints/latest/trainer_state.json +286 -0

shared/checkpoints/latest/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 1280,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "max_position_embeddings": 128,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 20,
+  "num_key_value_heads": 5,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 10000.0,
+    "rope_type": "default"
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 32000
+}

shared/checkpoints/latest/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.0.0",
+  "use_cache": false
+}

shared/checkpoints/latest/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4311470cef32c1db5bc50c12b71d7076de73af2c4e68bc64e1faa26a574b6be7
+size 2228454760

shared/checkpoints/latest/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51eab96e89995e62448c2bab1f28c327ff9df2e722666bf96aac0b55349e22b9
+size 373040459

shared/checkpoints/latest/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
+size 14645

shared/checkpoints/latest/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:850c3d909f8a0af6f9b431fac5a25833ab1658c39f899825e3b347b6af8a490b
+size 1383

shared/checkpoints/latest/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:226108bf7e5c19e39ac121293561fcf99628514e7bf5811de63e81d47d460150
+size 1465

shared/checkpoints/latest/trainer_state.json ADDED Viewed

	@@ -0,0 +1,286 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.53715507343917,
+  "eval_steps": 500,
+  "global_step": 700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011862835959221501,
+      "grad_norm": 1.0314325094223022,
+      "learning_rate": 5e-05,
+      "loss": 10.542851448059082,
+      "step": 1
+    },
+    {
+      "epoch": 0.23725671918443003,
+      "grad_norm": 1.70680832862854,
+      "learning_rate": 4.9992874484134653e-05,
+      "loss": 9.478467439350329,
+      "step": 20
+    },
+    {
+      "epoch": 0.47451343836886006,
+      "grad_norm": 0.7485072016716003,
+      "learning_rate": 4.996998267226905e-05,
+      "loss": 7.760343933105469,
+      "step": 40
+    },
+    {
+      "epoch": 0.7117701575532901,
+      "grad_norm": 0.5104997754096985,
+      "learning_rate": 4.993131928415602e-05,
+      "loss": 7.206849670410156,
+      "step": 60
+    },
+    {
+      "epoch": 0.9490268767377201,
+      "grad_norm": 0.46393775939941406,
+      "learning_rate": 4.9876908740420175e-05,
+      "loss": 7.096773529052735,
+      "step": 80
+    },
+    {
+      "epoch": 1.1779425393883225,
+      "grad_norm": 0.447471559047699,
+      "learning_rate": 4.980678540792715e-05,
+      "loss": 7.068167877197266,
+      "step": 100
+    },
+    {
+      "epoch": 1.4151992585727524,
+      "grad_norm": 0.5010190606117249,
+      "learning_rate": 4.972099357807671e-05,
+      "loss": 7.053585815429687,
+      "step": 120
+    },
+    {
+      "epoch": 1.6524559777571826,
+      "grad_norm": 0.7706360816955566,
+      "learning_rate": 4.961958743882742e-05,
+      "loss": 7.03430404663086,
+      "step": 140
+    },
+    {
+      "epoch": 1.8897126969416127,
+      "grad_norm": 0.5128363370895386,
+      "learning_rate": 4.950263104047031e-05,
+      "loss": 7.022041320800781,
+      "step": 160
+    },
+    {
+      "epoch": 2.118628359592215,
+      "grad_norm": 0.705746054649353,
+      "learning_rate": 4.937019825517333e-05,
+      "loss": 6.9862548828125,
+      "step": 180
+    },
+    {
+      "epoch": 2.355885078776645,
+      "grad_norm": 0.8337900042533875,
+      "learning_rate": 4.9222372730322176e-05,
+      "loss": 6.937237548828125,
+      "step": 200
+    },
+    {
+      "epoch": 1.5406769433320533,
+      "grad_norm": 0.5159856081008911,
+      "learning_rate": 4.568695539880615e-05,
+      "loss": 6.9568915367126465,
+      "step": 220
+    },
+    {
+      "epoch": 1.6811125130286906,
+      "grad_norm": 0.4523787200450897,
+      "learning_rate": 4.489238055764833e-05,
+      "loss": 6.900994873046875,
+      "step": 240
+    },
+    {
+      "epoch": 1.8215480827253279,
+      "grad_norm": 0.48097074031829834,
+      "learning_rate": 4.4038849773874356e-05,
+      "loss": 6.876528930664063,
+      "step": 260
+    },
+    {
+      "epoch": 1.961983652421965,
+      "grad_norm": 0.5671383738517761,
+      "learning_rate": 4.3128892695042654e-05,
+      "loss": 6.83782730102539,
+      "step": 280
+    },
+    {
+      "epoch": 2.0983048987876463,
+      "grad_norm": 0.6388454437255859,
+      "learning_rate": 4.2165206201859265e-05,
+      "loss": 6.788776397705078,
+      "step": 300
+    },
+    {
+      "epoch": 2.2387404684842833,
+      "grad_norm": 0.5785893201828003,
+      "learning_rate": 4.115064641531117e-05,
+      "loss": 6.7587730407714846,
+      "step": 320
+    },
+    {
+      "epoch": 2.3791760381809204,
+      "grad_norm": 0.4917908012866974,
+      "learning_rate": 4.008822023185218e-05,
+      "loss": 6.704537200927734,
+      "step": 340
+    },
+    {
+      "epoch": 2.519611607877558,
+      "grad_norm": 0.607780396938324,
+      "learning_rate": 3.898107641172868e-05,
+      "loss": 6.676227569580078,
+      "step": 360
+    },
+    {
+      "epoch": 2.660047177574195,
+      "grad_norm": 0.45474570989608765,
+      "learning_rate": 3.783249624685734e-05,
+      "loss": 6.64794692993164,
+      "step": 380
+    },
+    {
+      "epoch": 2.8004827472708325,
+      "grad_norm": 0.49765315651893616,
+      "learning_rate": 3.6645883835912714e-05,
+      "loss": 6.635832214355469,
+      "step": 400
+    },
+    {
+      "epoch": 2.9409183169674695,
+      "grad_norm": 0.5915655493736267,
+      "learning_rate": 3.542475599544699e-05,
+      "loss": 6.623738098144531,
+      "step": 420
+    },
+    {
+      "epoch": 3.0772395633331504,
+      "grad_norm": 0.6189742684364319,
+      "learning_rate": 3.417273183694259e-05,
+      "loss": 6.541598510742188,
+      "step": 440
+    },
+    {
+      "epoch": 3.217675133029788,
+      "grad_norm": 0.6115455627441406,
+      "learning_rate": 3.289352204068886e-05,
+      "loss": 6.4810935974121096,
+      "step": 460
+    },
+    {
+      "epoch": 3.358110702726425,
+      "grad_norm": 0.5563525557518005,
+      "learning_rate": 3.1590917858271966e-05,
+      "loss": 6.469013214111328,
+      "step": 480
+    },
+    {
+      "epoch": 3.498546272423062,
+      "grad_norm": 0.5592976212501526,
+      "learning_rate": 3.0268779876272162e-05,
+      "loss": 6.469371032714844,
+      "step": 500
+    },
+    {
+      "epoch": 3.6389818421196996,
+      "grad_norm": 0.6607327461242676,
+      "learning_rate": 2.893102657446976e-05,
+      "loss": 6.4586669921875,
+      "step": 520
+    },
+    {
+      "epoch": 3.7794174118163366,
+      "grad_norm": 0.5511732697486877,
+      "learning_rate": 2.7581622712470417e-05,
+      "loss": 6.438571929931641,
+      "step": 540
+    },
+    {
+      "epoch": 3.9198529815129737,
+      "grad_norm": 0.694491446018219,
+      "learning_rate": 2.6224567579168897e-05,
+      "loss": 6.430049133300781,
+      "step": 560
+    },
+    {
+      "epoch": 4.056174227878655,
+      "grad_norm": 0.5763441324234009,
+      "learning_rate": 2.4863883139876677e-05,
+      "loss": 6.385451889038086,
+      "step": 580
+    },
+    {
+      "epoch": 3.0304050041569344,
+      "grad_norm": 0.603993833065033,
+      "learning_rate": 2.35036021162426e-05,
+      "loss": 6.4556640625,
+      "step": 600
+    },
+    {
+      "epoch": 3.1317550180133815,
+      "grad_norm": 0.7310755252838135,
+      "learning_rate": 2.214775603429435e-05,
+      "loss": 6.486899566650391,
+      "step": 620
+    },
+    {
+      "epoch": 3.2331050318698287,
+      "grad_norm": 0.7162438631057739,
+      "learning_rate": 2.0800363276023586e-05,
+      "loss": 6.448640441894531,
+      "step": 640
+    },
+    {
+      "epoch": 3.334455045726276,
+      "grad_norm": 0.6547061800956726,
+      "learning_rate": 1.9465417169926507e-05,
+      "loss": 6.455082702636719,
+      "step": 660
+    },
+    {
+      "epoch": 3.435805059582723,
+      "grad_norm": 0.6867275834083557,
+      "learning_rate": 1.8146874155796643e-05,
+      "loss": 6.440997314453125,
+      "step": 680
+    },
+    {
+      "epoch": 3.53715507343917,
+      "grad_norm": 0.6481226682662964,
+      "learning_rate": 1.6848642058846426e-05,
+      "loss": 6.447025299072266,
+      "step": 700
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 1154,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 1000000000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.53570793013248e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}