jennhu/olmo-7b-lora_simplewiki_baseline_simplewiki_baseline

Browse files

Files changed (5) hide show

README.md +67 -0
adapter_config.json +38 -0
adapter_model.safetensors +3 -0
trainer_state.json +483 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: allenai/OLMo-7B-0724-hf
+tags:
+- generated_from_trainer
+model-index:
+- name: simplewiki_baseline
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# simplewiki_baseline
+This model is a fine-tuned version of [allenai/OLMo-7B-0724-hf](https://huggingface.co/allenai/OLMo-7B-0724-hf). The training dataset was not recorded in the metadata available to the Trainer.
+It achieves the following results on the evaluation set (last evaluation logged at step 111, after the final row of the training-results table below):
+- Loss: 2.3323
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.002
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 16
+- total_train_batch_size: 256
+- optimizer: paged_adamw_8bit with betas=(0.9, 0.999) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 1
+- num_epochs: 3
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 2.3175        | 0.5378 | 20   | 2.3252          |
+| 2.3075        | 1.0538 | 40   | 2.2998          |
+| 2.2741        | 1.5916 | 60   | 2.3083          |
+| 2.2257        | 2.1076 | 80   | 2.3041          |
+| 2.2605        | 2.6454 | 100  | 2.3115          |
+### Framework versions
+- PEFT 0.14.0
+- Transformers 4.49.0
+- Pytorch 2.6.0
+- Datasets 3.3.2
+- Tokenizers 0.21.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "allenai/OLMo-7B-0724-hf",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "lm_head",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "gate_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:403c2e359f3c18efc6d4531b1f787f1930cf4f25c9eaba8a101941af4c904ee9
+size 1477800176

trainer_state.json ADDED Viewed

	@@ -0,0 +1,483 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9411764705882355,
+  "eval_steps": 20,
+  "global_step": 111,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05378151260504202,
+      "grad_norm": 0.012224463745951653,
+      "learning_rate": 0.001981818181818182,
+      "loss": 2.4733,
+      "step": 2
+    },
+    {
+      "epoch": 0.10756302521008404,
+      "grad_norm": 0.04710804298520088,
+      "learning_rate": 0.0019454545454545456,
+      "loss": 2.4443,
+      "step": 4
+    },
+    {
+      "epoch": 0.16134453781512606,
+      "grad_norm": 0.09238269925117493,
+      "learning_rate": 0.0019090909090909091,
+      "loss": 2.3956,
+      "step": 6
+    },
+    {
+      "epoch": 0.21512605042016808,
+      "grad_norm": 0.17334744334220886,
+      "learning_rate": 0.0018727272727272729,
+      "loss": 2.4043,
+      "step": 8
+    },
+    {
+      "epoch": 0.2689075630252101,
+      "grad_norm": 0.11766365170478821,
+      "learning_rate": 0.0018363636363636364,
+      "loss": 2.385,
+      "step": 10
+    },
+    {
+      "epoch": 0.3226890756302521,
+      "grad_norm": 0.1385774314403534,
+      "learning_rate": 0.0018000000000000002,
+      "loss": 2.3933,
+      "step": 12
+    },
+    {
+      "epoch": 0.3764705882352941,
+      "grad_norm": 0.1210022047162056,
+      "learning_rate": 0.0017636363636363637,
+      "loss": 2.3733,
+      "step": 14
+    },
+    {
+      "epoch": 0.43025210084033616,
+      "grad_norm": 0.11510306596755981,
+      "learning_rate": 0.0017272727272727272,
+      "loss": 2.3297,
+      "step": 16
+    },
+    {
+      "epoch": 0.48403361344537815,
+      "grad_norm": 0.08369912207126617,
+      "learning_rate": 0.001690909090909091,
+      "loss": 2.351,
+      "step": 18
+    },
+    {
+      "epoch": 0.5378151260504201,
+      "grad_norm": 0.09298688918352127,
+      "learning_rate": 0.0016545454545454545,
+      "loss": 2.3175,
+      "step": 20
+    },
+    {
+      "epoch": 0.5378151260504201,
+      "eval_loss": 2.3251912593841553,
+      "eval_runtime": 84.2914,
+      "eval_samples_per_second": 14.118,
+      "eval_steps_per_second": 1.768,
+      "step": 20
+    },
+    {
+      "epoch": 0.5915966386554622,
+      "grad_norm": 0.10441266000270844,
+      "learning_rate": 0.0016181818181818183,
+      "loss": 2.3643,
+      "step": 22
+    },
+    {
+      "epoch": 0.6453781512605042,
+      "grad_norm": 0.09343012422323227,
+      "learning_rate": 0.0015818181818181818,
+      "loss": 2.3391,
+      "step": 24
+    },
+    {
+      "epoch": 0.6991596638655462,
+      "grad_norm": 0.09008985757827759,
+      "learning_rate": 0.0015454545454545454,
+      "loss": 2.2984,
+      "step": 26
+    },
+    {
+      "epoch": 0.7529411764705882,
+      "grad_norm": 0.08069847524166107,
+      "learning_rate": 0.0015090909090909091,
+      "loss": 2.3202,
+      "step": 28
+    },
+    {
+      "epoch": 0.8067226890756303,
+      "grad_norm": 0.08655106276273727,
+      "learning_rate": 0.0014727272727272727,
+      "loss": 2.3438,
+      "step": 30
+    },
+    {
+      "epoch": 0.8605042016806723,
+      "grad_norm": 0.08203998953104019,
+      "learning_rate": 0.0014363636363636362,
+      "loss": 2.2862,
+      "step": 32
+    },
+    {
+      "epoch": 0.9142857142857143,
+      "grad_norm": 0.10055471211671829,
+      "learning_rate": 0.0014,
+      "loss": 2.3448,
+      "step": 34
+    },
+    {
+      "epoch": 0.9680672268907563,
+      "grad_norm": 0.08500978350639343,
+      "learning_rate": 0.0013636363636363635,
+      "loss": 2.3221,
+      "step": 36
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.16502036154270172,
+      "learning_rate": 0.0013272727272727275,
+      "loss": 2.3438,
+      "step": 38
+    },
+    {
+      "epoch": 1.053781512605042,
+      "grad_norm": 0.08134379237890244,
+      "learning_rate": 0.001290909090909091,
+      "loss": 2.3075,
+      "step": 40
+    },
+    {
+      "epoch": 1.053781512605042,
+      "eval_loss": 2.299827814102173,
+      "eval_runtime": 84.1245,
+      "eval_samples_per_second": 14.146,
+      "eval_steps_per_second": 1.771,
+      "step": 40
+    },
+    {
+      "epoch": 1.107563025210084,
+      "grad_norm": 0.09189953655004501,
+      "learning_rate": 0.0012545454545454546,
+      "loss": 2.2457,
+      "step": 42
+    },
+    {
+      "epoch": 1.1613445378151261,
+      "grad_norm": 0.09041959792375565,
+      "learning_rate": 0.0012181818181818183,
+      "loss": 2.2977,
+      "step": 44
+    },
+    {
+      "epoch": 1.2151260504201682,
+      "grad_norm": 0.08456366509199142,
+      "learning_rate": 0.0011818181818181819,
+      "loss": 2.2843,
+      "step": 46
+    },
+    {
+      "epoch": 1.26890756302521,
+      "grad_norm": 0.08097781240940094,
+      "learning_rate": 0.0011454545454545454,
+      "loss": 2.2328,
+      "step": 48
+    },
+    {
+      "epoch": 1.322689075630252,
+      "grad_norm": 0.10243827849626541,
+      "learning_rate": 0.0011090909090909092,
+      "loss": 2.254,
+      "step": 50
+    },
+    {
+      "epoch": 1.3764705882352941,
+      "grad_norm": 0.09242815524339676,
+      "learning_rate": 0.0010727272727272727,
+      "loss": 2.3295,
+      "step": 52
+    },
+    {
+      "epoch": 1.4302521008403362,
+      "grad_norm": 0.09403648227453232,
+      "learning_rate": 0.0010363636363636365,
+      "loss": 2.2749,
+      "step": 54
+    },
+    {
+      "epoch": 1.4840336134453782,
+      "grad_norm": 0.09187959879636765,
+      "learning_rate": 0.001,
+      "loss": 2.2606,
+      "step": 56
+    },
+    {
+      "epoch": 1.53781512605042,
+      "grad_norm": 0.09116198122501373,
+      "learning_rate": 0.0009636363636363637,
+      "loss": 2.2676,
+      "step": 58
+    },
+    {
+      "epoch": 1.5915966386554623,
+      "grad_norm": 0.08270075172185898,
+      "learning_rate": 0.0009272727272727273,
+      "loss": 2.2741,
+      "step": 60
+    },
+    {
+      "epoch": 1.5915966386554623,
+      "eval_loss": 2.3082737922668457,
+      "eval_runtime": 84.2709,
+      "eval_samples_per_second": 14.121,
+      "eval_steps_per_second": 1.768,
+      "step": 60
+    },
+    {
+      "epoch": 1.6453781512605041,
+      "grad_norm": 0.09275200217962265,
+      "learning_rate": 0.0008909090909090909,
+      "loss": 2.2582,
+      "step": 62
+    },
+    {
+      "epoch": 1.6991596638655462,
+      "grad_norm": 0.09241969138383865,
+      "learning_rate": 0.0008545454545454545,
+      "loss": 2.2554,
+      "step": 64
+    },
+    {
+      "epoch": 1.7529411764705882,
+      "grad_norm": 0.08338718116283417,
+      "learning_rate": 0.0008181818181818183,
+      "loss": 2.2244,
+      "step": 66
+    },
+    {
+      "epoch": 1.8067226890756303,
+      "grad_norm": 0.09568168222904205,
+      "learning_rate": 0.0007818181818181819,
+      "loss": 2.2719,
+      "step": 68
+    },
+    {
+      "epoch": 1.8605042016806723,
+      "grad_norm": 0.0905410498380661,
+      "learning_rate": 0.0007454545454545455,
+      "loss": 2.2505,
+      "step": 70
+    },
+    {
+      "epoch": 1.9142857142857141,
+      "grad_norm": 0.08841802924871445,
+      "learning_rate": 0.0007090909090909091,
+      "loss": 2.3005,
+      "step": 72
+    },
+    {
+      "epoch": 1.9680672268907564,
+      "grad_norm": 0.09013470262289047,
+      "learning_rate": 0.0006727272727272728,
+      "loss": 2.2682,
+      "step": 74
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.19737772643566132,
+      "learning_rate": 0.0006363636363636364,
+      "loss": 2.3476,
+      "step": 76
+    },
+    {
+      "epoch": 2.053781512605042,
+      "grad_norm": 0.0839110016822815,
+      "learning_rate": 0.0006,
+      "loss": 2.2338,
+      "step": 78
+    },
+    {
+      "epoch": 2.107563025210084,
+      "grad_norm": 0.10582801699638367,
+      "learning_rate": 0.0005636363636363636,
+      "loss": 2.2257,
+      "step": 80
+    },
+    {
+      "epoch": 2.107563025210084,
+      "eval_loss": 2.304074764251709,
+      "eval_runtime": 84.3347,
+      "eval_samples_per_second": 14.11,
+      "eval_steps_per_second": 1.767,
+      "step": 80
+    },
+    {
+      "epoch": 2.161344537815126,
+      "grad_norm": 0.09145358949899673,
+      "learning_rate": 0.0005272727272727272,
+      "loss": 2.2488,
+      "step": 82
+    },
+    {
+      "epoch": 2.215126050420168,
+      "grad_norm": 0.08459240943193436,
+      "learning_rate": 0.0004909090909090909,
+      "loss": 2.2518,
+      "step": 84
+    },
+    {
+      "epoch": 2.26890756302521,
+      "grad_norm": 0.09590018540620804,
+      "learning_rate": 0.00045454545454545455,
+      "loss": 2.2324,
+      "step": 86
+    },
+    {
+      "epoch": 2.3226890756302523,
+      "grad_norm": 0.10032965242862701,
+      "learning_rate": 0.00041818181818181814,
+      "loss": 2.2099,
+      "step": 88
+    },
+    {
+      "epoch": 2.376470588235294,
+      "grad_norm": 0.09092257171869278,
+      "learning_rate": 0.00038181818181818184,
+      "loss": 2.2077,
+      "step": 90
+    },
+    {
+      "epoch": 2.4302521008403364,
+      "grad_norm": 0.10066290944814682,
+      "learning_rate": 0.00034545454545454544,
+      "loss": 2.2629,
+      "step": 92
+    },
+    {
+      "epoch": 2.484033613445378,
+      "grad_norm": 0.0973694771528244,
+      "learning_rate": 0.0003090909090909091,
+      "loss": 2.2292,
+      "step": 94
+    },
+    {
+      "epoch": 2.53781512605042,
+      "grad_norm": 0.09254106879234314,
+      "learning_rate": 0.00027272727272727274,
+      "loss": 2.1923,
+      "step": 96
+    },
+    {
+      "epoch": 2.5915966386554623,
+      "grad_norm": 0.10056042671203613,
+      "learning_rate": 0.00023636363636363636,
+      "loss": 2.2445,
+      "step": 98
+    },
+    {
+      "epoch": 2.645378151260504,
+      "grad_norm": 0.09601625055074692,
+      "learning_rate": 0.0002,
+      "loss": 2.2605,
+      "step": 100
+    },
+    {
+      "epoch": 2.645378151260504,
+      "eval_loss": 2.3115394115448,
+      "eval_runtime": 84.2807,
+      "eval_samples_per_second": 14.119,
+      "eval_steps_per_second": 1.768,
+      "step": 100
+    },
+    {
+      "epoch": 2.6991596638655464,
+      "grad_norm": 0.09498832374811172,
+      "learning_rate": 0.00016363636363636363,
+      "loss": 2.215,
+      "step": 102
+    },
+    {
+      "epoch": 2.7529411764705882,
+      "grad_norm": 0.09191343188285828,
+      "learning_rate": 0.00012727272727272725,
+      "loss": 2.2116,
+      "step": 104
+    },
+    {
+      "epoch": 2.80672268907563,
+      "grad_norm": 0.10717286169528961,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 2.2435,
+      "step": 106
+    },
+    {
+      "epoch": 2.8605042016806723,
+      "grad_norm": 0.09715902805328369,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 2.2196,
+      "step": 108
+    },
+    {
+      "epoch": 2.914285714285714,
+      "grad_norm": 0.10500436276197433,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 2.2351,
+      "step": 110
+    },
+    {
+      "epoch": 2.9411764705882355,
+      "step": 111,
+      "total_flos": 8.1776874848256e+17,
+      "train_loss": 2.2894913076280474,
+      "train_runtime": 2825.7191,
+      "train_samples_per_second": 10.107,
+      "train_steps_per_second": 0.039
+    },
+    {
+      "epoch": 2.9411764705882355,
+      "eval_loss": 2.311664342880249,
+      "eval_runtime": 84.4111,
+      "eval_samples_per_second": 14.098,
+      "eval_steps_per_second": 1.765,
+      "step": 111
+    },
+    {
+      "epoch": 2.9411764705882355,
+      "eval_loss": 2.3323311805725098,
+      "eval_runtime": 84.1513,
+      "eval_samples_per_second": 14.141,
+      "eval_steps_per_second": 1.771,
+      "step": 111
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 111,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.1776874848256e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c4a1052c16d91be09a7b1f09514179cb0cb614fe81af5279d9a8cd2d7333c145
+size 5496