yamraj047 committed
Commit 3a0baeb · verified · 1 Parent(s): 3467392

Upload 13 files

README.md CHANGED
@@ -1,7 +1,207 @@
1
  ---
2
- license: apache-2.0
3
  ---
4
- Epoch Training Loss Validation Loss
5
 
6
- 2 0.473800 0.684264
7
 
1
  ---
2
+ base_model: mistralai/Mistral-7B-v0.1
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:mistralai/Mistral-7B-v0.1
7
+ - lora
8
+ - transformers
9
  ---
 
10
 
11
+ # Model Card for Model ID
12
 
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
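The quick-start stub above can be filled in from the files in this commit: `adapter_config-6.json` records a LoRA adapter trained on top of `mistralai/Mistral-7B-v0.1` with task type `CAUSAL_LM`. A minimal loading sketch is shown below; the adapter repo id `yamraj047/nepal-legal-model` is an assumption based on the checkpoint path in `trainer_state-6.json`, and the dtype/device settings are illustrative only.

```python
# Minimal sketch, assuming the adapter is published under the hypothetical repo id below.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "mistralai/Mistral-7B-v0.1"      # from adapter_config-6.json
ADAPTER_REPO = "yamraj047/nepal-legal-model"  # hypothetical repo id (assumption)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token     # tokenizer_config.json sets pad_token to "</s>"

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,                # assumption: half precision to fit a single GPU
    device_map="auto",
)
model = PeftModel.from_pretrained(base, ADAPTER_REPO)
model.eval()

prompt = "Question: What is a writ petition?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```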
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.1
adapter_config-6.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "gate_proj",
30
+ "o_proj",
31
+ "k_proj",
32
+ "up_proj",
33
+ "down_proj",
34
+ "q_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
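For readers reconstructing the training setup, the adapter config above maps to a `peft.LoraConfig` with rank 16, alpha 32, dropout 0.05, and every attention and MLP projection of the Mistral block as a target module. The sketch below is inferred from the saved JSON, not taken from the original training script, and only shows the non-default fields.

```python
# Sketch of the LoraConfig implied by adapter_config-6.json (a reconstruction, not the author's script).
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,                  # LoRA rank
    lora_alpha=32,         # scaling factor (alpha / r = 2)
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[       # all attention and MLP projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Applied to a loaded base model before training, e.g.:
# peft_model = get_peft_model(base_model, lora_config)
```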
adapter_model-6.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:673704334c1db9c68a18eec7f2793e9e752da544e503bab1064bc32e438cb6c0
3
+ size 167832240
optimizer-5.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48ea4acc1d0f4f5c7ca8cf8743612846af1e30223211f696272c4abf42f486f7
3
+ size 85733607
rng_state-2.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89d152c2bbfea8b5176830b4c43f42d24c8f3e43e8c250a9e7ae7fd66305706a
3
+ size 14645
scaler-2.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00ddef88593cbb83cbfefa9bbe9b0fac17ba88550ad9bce7578ad68c0bea8ae1
3
+ size 1383
scheduler-4.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:badd1b4c74f1dfb5be5fef2e5d0412cbee5bd3dcd153599495e12ff16bc87721
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
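One detail worth flagging in the tokenizer files: there is no dedicated padding token; `pad_token` reuses the EOS token `</s>` and `add_eos_token` is false. A small sketch of loading this tokenizer and padding a batch follows; the repo id is again a hypothetical placeholder, and pointing at a local directory containing these files works the same way.

```python
# Sketch: load the tokenizer shipped in this commit and pad with the EOS token.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("yamraj047/nepal-legal-model")  # hypothetical repo id
assert tok.pad_token == tok.eos_token == "</s>"  # per tokenizer_config.json

tok.padding_side = "right"  # common choice for causal-LM training; padded positions are masked in the labels
batch = tok(
    ["Short example.", "A somewhat longer second example for padding."],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["attention_mask"].sum(dim=1))
```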
trainer_state-6.json ADDED
@@ -0,0 +1,1184 @@
1
+ {
2
+ "best_global_step": 1620,
3
+ "best_metric": 0.6842637062072754,
4
+ "best_model_checkpoint": "./nepal-legal-model/checkpoint-1620",
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1620,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.012345679012345678,
14
+ "grad_norm": 2.0793182849884033,
15
+ "learning_rate": 2.4657534246575342e-05,
16
+ "loss": 1.858,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.024691358024691357,
21
+ "grad_norm": 2.0381851196289062,
22
+ "learning_rate": 5.2054794520547945e-05,
23
+ "loss": 1.4272,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.037037037037037035,
28
+ "grad_norm": 1.6031200885772705,
29
+ "learning_rate": 7.945205479452055e-05,
30
+ "loss": 1.0783,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.04938271604938271,
35
+ "grad_norm": 1.6209287643432617,
36
+ "learning_rate": 0.00010684931506849317,
37
+ "loss": 1.0151,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.06172839506172839,
42
+ "grad_norm": 1.2847559452056885,
43
+ "learning_rate": 0.00013424657534246576,
44
+ "loss": 0.9561,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.07407407407407407,
49
+ "grad_norm": 1.2488341331481934,
50
+ "learning_rate": 0.00016164383561643837,
51
+ "loss": 0.9219,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.08641975308641975,
56
+ "grad_norm": 1.1916821002960205,
57
+ "learning_rate": 0.00018904109589041096,
58
+ "loss": 0.9161,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.09876543209876543,
63
+ "grad_norm": 1.1260769367218018,
64
+ "learning_rate": 0.0001999968022038833,
65
+ "loss": 0.944,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.1111111111111111,
70
+ "grad_norm": 1.1027578115463257,
71
+ "learning_rate": 0.0001999772608571399,
72
+ "loss": 0.8996,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.12345679012345678,
77
+ "grad_norm": 1.143027663230896,
78
+ "learning_rate": 0.0001999399581844347,
79
+ "loss": 0.8976,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.13580246913580246,
84
+ "grad_norm": 1.2351293563842773,
85
+ "learning_rate": 0.00019988490081272397,
86
+ "loss": 0.8754,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.14814814814814814,
91
+ "grad_norm": 1.026627779006958,
92
+ "learning_rate": 0.0001998120985231511,
93
+ "loss": 0.8798,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.16049382716049382,
98
+ "grad_norm": 1.2233778238296509,
99
+ "learning_rate": 0.00019972156424930896,
100
+ "loss": 0.8562,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.1728395061728395,
105
+ "grad_norm": 1.1058930158615112,
106
+ "learning_rate": 0.00019961331407494245,
107
+ "loss": 0.8666,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.18518518518518517,
112
+ "grad_norm": 1.1126072406768799,
113
+ "learning_rate": 0.00019948736723109082,
114
+ "loss": 0.8733,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.19753086419753085,
119
+ "grad_norm": 1.0302902460098267,
120
+ "learning_rate": 0.00019934374609267136,
121
+ "loss": 0.8287,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.20987654320987653,
126
+ "grad_norm": 1.0634909868240356,
127
+ "learning_rate": 0.00019918247617450454,
128
+ "loss": 0.834,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.2222222222222222,
133
+ "grad_norm": 1.0340452194213867,
134
+ "learning_rate": 0.00019900358612678099,
135
+ "loss": 0.8747,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.2345679012345679,
140
+ "grad_norm": 1.0361789464950562,
141
+ "learning_rate": 0.0001988071077299718,
142
+ "loss": 0.8597,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.24691358024691357,
147
+ "grad_norm": 1.0306812524795532,
148
+ "learning_rate": 0.00019859307588918258,
149
+ "loss": 0.8594,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.25925925925925924,
154
+ "grad_norm": 1.1273531913757324,
155
+ "learning_rate": 0.00019836152862795245,
156
+ "loss": 0.8533,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.2716049382716049,
161
+ "grad_norm": 1.0480186939239502,
162
+ "learning_rate": 0.0001981125070814991,
163
+ "loss": 0.8409,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.2839506172839506,
168
+ "grad_norm": 1.0523358583450317,
169
+ "learning_rate": 0.00019784605548941073,
170
+ "loss": 0.8555,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.2962962962962963,
175
+ "grad_norm": 1.0661187171936035,
176
+ "learning_rate": 0.00019756222118778706,
177
+ "loss": 0.8623,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.30864197530864196,
182
+ "grad_norm": 1.0556374788284302,
183
+ "learning_rate": 0.0001972610546008295,
184
+ "loss": 0.8104,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.32098765432098764,
189
+ "grad_norm": 1.0738286972045898,
190
+ "learning_rate": 0.00019694260923188356,
191
+ "loss": 0.805,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.3333333333333333,
196
+ "grad_norm": 1.0582962036132812,
197
+ "learning_rate": 0.00019660694165393334,
198
+ "loss": 0.8487,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.345679012345679,
203
+ "grad_norm": 0.99802565574646,
204
+ "learning_rate": 0.00019625411149955154,
205
+ "loss": 0.8211,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.35802469135802467,
210
+ "grad_norm": 1.1545002460479736,
211
+ "learning_rate": 0.00019588418145030518,
212
+ "loss": 0.7967,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.37037037037037035,
217
+ "grad_norm": 1.1147046089172363,
218
+ "learning_rate": 0.00019549721722562018,
219
+ "loss": 0.823,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.38271604938271603,
224
+ "grad_norm": 1.0882933139801025,
225
+ "learning_rate": 0.00019509328757110598,
226
+ "loss": 0.8206,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.3950617283950617,
231
+ "grad_norm": 1.0245946645736694,
232
+ "learning_rate": 0.0001946724642463427,
233
+ "loss": 0.771,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.4074074074074074,
238
+ "grad_norm": 1.1092945337295532,
239
+ "learning_rate": 0.00019423482201213276,
240
+ "loss": 0.791,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.41975308641975306,
245
+ "grad_norm": 0.9481214284896851,
246
+ "learning_rate": 0.0001937804386172193,
247
+ "loss": 0.8142,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.43209876543209874,
252
+ "grad_norm": 1.1183067560195923,
253
+ "learning_rate": 0.00019330939478447393,
254
+ "loss": 0.7952,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.4444444444444444,
259
+ "grad_norm": 1.094511866569519,
260
+ "learning_rate": 0.00019282177419655585,
261
+ "loss": 0.7853,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.4567901234567901,
266
+ "grad_norm": 0.9660793542861938,
267
+ "learning_rate": 0.00019231766348104556,
268
+ "loss": 0.7678,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.4691358024691358,
273
+ "grad_norm": 1.1652113199234009,
274
+ "learning_rate": 0.000191797152195055,
275
+ "loss": 0.7881,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.48148148148148145,
280
+ "grad_norm": 1.0458804368972778,
281
+ "learning_rate": 0.00019126033280931733,
282
+ "loss": 0.7882,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.49382716049382713,
287
+ "grad_norm": 1.1295260190963745,
288
+ "learning_rate": 0.00019070730069175936,
289
+ "loss": 0.8328,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.5061728395061729,
294
+ "grad_norm": 1.1698542833328247,
295
+ "learning_rate": 0.00019013815409055896,
296
+ "loss": 0.803,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.5185185185185185,
301
+ "grad_norm": 1.214735984802246,
302
+ "learning_rate": 0.0001895529941166909,
303
+ "loss": 0.797,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.5308641975308642,
308
+ "grad_norm": 1.0859205722808838,
309
+ "learning_rate": 0.00018895192472596426,
310
+ "loss": 0.7961,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.5432098765432098,
315
+ "grad_norm": 1.1454962491989136,
316
+ "learning_rate": 0.0001883350527005541,
317
+ "loss": 0.7848,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.5555555555555556,
322
+ "grad_norm": 1.109433889389038,
323
+ "learning_rate": 0.00018770248763003134,
324
+ "loss": 0.7801,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.5679012345679012,
329
+ "grad_norm": 1.0427923202514648,
330
+ "learning_rate": 0.00018705434189189376,
331
+ "loss": 0.7957,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.5802469135802469,
336
+ "grad_norm": 1.0453143119812012,
337
+ "learning_rate": 0.00018639073063160172,
338
+ "loss": 0.7812,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.5925925925925926,
343
+ "grad_norm": 1.0415431261062622,
344
+ "learning_rate": 0.00018571177174212214,
345
+ "loss": 0.7463,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.6049382716049383,
350
+ "grad_norm": 1.1567952632904053,
351
+ "learning_rate": 0.00018501758584298433,
352
+ "loss": 0.7643,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.6172839506172839,
357
+ "grad_norm": 1.0001814365386963,
358
+ "learning_rate": 0.00018430829625885165,
359
+ "loss": 0.7885,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.6296296296296297,
364
+ "grad_norm": 1.0246291160583496,
365
+ "learning_rate": 0.00018358402899761218,
366
+ "loss": 0.7723,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.6419753086419753,
371
+ "grad_norm": 1.0211082696914673,
372
+ "learning_rate": 0.00018284491272799327,
373
+ "loss": 0.7739,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.654320987654321,
378
+ "grad_norm": 1.1272550821304321,
379
+ "learning_rate": 0.00018209107875670277,
380
+ "loss": 0.7844,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.6666666666666666,
385
+ "grad_norm": 1.2581202983856201,
386
+ "learning_rate": 0.00018132266100510214,
387
+ "loss": 0.769,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.6790123456790124,
392
+ "grad_norm": 1.1308720111846924,
393
+ "learning_rate": 0.0001805397959854147,
394
+ "loss": 0.7587,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.691358024691358,
399
+ "grad_norm": 0.9830589890480042,
400
+ "learning_rate": 0.00017974262277647374,
401
+ "loss": 0.766,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.7037037037037037,
406
+ "grad_norm": 1.083027958869934,
407
+ "learning_rate": 0.00017893128299901472,
408
+ "loss": 0.7503,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.7160493827160493,
413
+ "grad_norm": 1.0589890480041504,
414
+ "learning_rate": 0.00017810592079051585,
415
+ "loss": 0.7865,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.7283950617283951,
420
+ "grad_norm": 1.1053671836853027,
421
+ "learning_rate": 0.00017726668277959136,
422
+ "loss": 0.7639,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.7407407407407407,
427
+ "grad_norm": 1.1152055263519287,
428
+ "learning_rate": 0.00017641371805994264,
429
+ "loss": 0.7614,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.7530864197530864,
434
+ "grad_norm": 1.1337790489196777,
435
+ "learning_rate": 0.00017554717816387107,
436
+ "loss": 0.761,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.7654320987654321,
441
+ "grad_norm": 1.1052172183990479,
442
+ "learning_rate": 0.00017466721703535764,
443
+ "loss": 0.7506,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.7777777777777778,
448
+ "grad_norm": 1.0141187906265259,
449
+ "learning_rate": 0.0001737739910027145,
450
+ "loss": 0.7529,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.7901234567901234,
455
+ "grad_norm": 1.1626172065734863,
456
+ "learning_rate": 0.00017286765875081244,
457
+ "loss": 0.7786,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.8024691358024691,
462
+ "grad_norm": 1.1584755182266235,
463
+ "learning_rate": 0.00017194838129289006,
464
+ "loss": 0.745,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.8148148148148148,
469
+ "grad_norm": 1.1189895868301392,
470
+ "learning_rate": 0.0001710163219419491,
471
+ "loss": 0.7436,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.8271604938271605,
476
+ "grad_norm": 1.096261739730835,
477
+ "learning_rate": 0.00017007164628174139,
478
+ "loss": 0.7656,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.8395061728395061,
483
+ "grad_norm": 1.0469647645950317,
484
+ "learning_rate": 0.00016911452213735223,
485
+ "loss": 0.752,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.8518518518518519,
490
+ "grad_norm": 1.1135762929916382,
491
+ "learning_rate": 0.00016814511954538558,
492
+ "loss": 0.7516,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.8641975308641975,
497
+ "grad_norm": 1.0560423135757446,
498
+ "learning_rate": 0.00016716361072375657,
499
+ "loss": 0.7412,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.8765432098765432,
504
+ "grad_norm": 1.190370798110962,
505
+ "learning_rate": 0.00016617017004109632,
506
+ "loss": 0.6996,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.8888888888888888,
511
+ "grad_norm": 0.9996367692947388,
512
+ "learning_rate": 0.0001651649739857746,
513
+ "loss": 0.7436,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.9012345679012346,
518
+ "grad_norm": 1.0223890542984009,
519
+ "learning_rate": 0.00016414820113454622,
520
+ "loss": 0.7632,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.9135802469135802,
525
+ "grad_norm": 1.1123933792114258,
526
+ "learning_rate": 0.0001631200321208261,
527
+ "loss": 0.7437,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.9259259259259259,
532
+ "grad_norm": 1.1075836420059204,
533
+ "learning_rate": 0.00016208064960259897,
534
+ "loss": 0.7722,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.9382716049382716,
539
+ "grad_norm": 1.114262342453003,
540
+ "learning_rate": 0.00016103023822996982,
541
+ "loss": 0.7664,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.9506172839506173,
546
+ "grad_norm": 1.1287472248077393,
547
+ "learning_rate": 0.00015996898461235977,
548
+ "loss": 0.7474,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.9629629629629629,
553
+ "grad_norm": 1.188179850578308,
554
+ "learning_rate": 0.00015889707728535462,
555
+ "loss": 0.7508,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.9753086419753086,
560
+ "grad_norm": 1.1187824010849,
561
+ "learning_rate": 0.0001578147066772104,
562
+ "loss": 0.7656,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.9876543209876543,
567
+ "grad_norm": 1.1188304424285889,
568
+ "learning_rate": 0.00015672206507502337,
569
+ "loss": 0.7268,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 1.0,
574
+ "grad_norm": 1.1175668239593506,
575
+ "learning_rate": 0.00015561934659056947,
576
+ "loss": 0.7362,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 1.0,
581
+ "eval_loss": 0.733393669128418,
582
+ "eval_runtime": 411.0606,
583
+ "eval_samples_per_second": 3.503,
584
+ "eval_steps_per_second": 0.876,
585
+ "step": 810
586
+ },
587
+ {
588
+ "epoch": 1.0123456790123457,
589
+ "grad_norm": 1.548414945602417,
590
+ "learning_rate": 0.0001545067471258196,
591
+ "loss": 0.5331,
592
+ "step": 820
593
+ },
594
+ {
595
+ "epoch": 1.0246913580246915,
596
+ "grad_norm": 1.2169567346572876,
597
+ "learning_rate": 0.00015338446433813693,
598
+ "loss": 0.5351,
599
+ "step": 830
600
+ },
601
+ {
602
+ "epoch": 1.037037037037037,
603
+ "grad_norm": 1.2411112785339355,
604
+ "learning_rate": 0.00015225269760516232,
605
+ "loss": 0.5072,
606
+ "step": 840
607
+ },
608
+ {
609
+ "epoch": 1.0493827160493827,
610
+ "grad_norm": 1.0267516374588013,
611
+ "learning_rate": 0.00015111164798939432,
612
+ "loss": 0.5127,
613
+ "step": 850
614
+ },
615
+ {
616
+ "epoch": 1.0617283950617284,
617
+ "grad_norm": 1.082014560699463,
618
+ "learning_rate": 0.00014996151820246935,
619
+ "loss": 0.507,
620
+ "step": 860
621
+ },
622
+ {
623
+ "epoch": 1.074074074074074,
624
+ "grad_norm": 1.1765062808990479,
625
+ "learning_rate": 0.00014880251256914963,
626
+ "loss": 0.517,
627
+ "step": 870
628
+ },
629
+ {
630
+ "epoch": 1.0864197530864197,
631
+ "grad_norm": 1.1010947227478027,
632
+ "learning_rate": 0.0001476348369910238,
633
+ "loss": 0.5151,
634
+ "step": 880
635
+ },
636
+ {
637
+ "epoch": 1.0987654320987654,
638
+ "grad_norm": 1.2586389780044556,
639
+ "learning_rate": 0.00014645869890992803,
640
+ "loss": 0.5277,
641
+ "step": 890
642
+ },
643
+ {
644
+ "epoch": 1.1111111111111112,
645
+ "grad_norm": 1.0955579280853271,
646
+ "learning_rate": 0.000145274307271093,
647
+ "loss": 0.502,
648
+ "step": 900
649
+ },
650
+ {
651
+ "epoch": 1.123456790123457,
652
+ "grad_norm": 1.1811714172363281,
653
+ "learning_rate": 0.0001440818724860241,
654
+ "loss": 0.5271,
655
+ "step": 910
656
+ },
657
+ {
658
+ "epoch": 1.1358024691358024,
659
+ "grad_norm": 1.2689094543457031,
660
+ "learning_rate": 0.00014288160639512105,
661
+ "loss": 0.5348,
662
+ "step": 920
663
+ },
664
+ {
665
+ "epoch": 1.1481481481481481,
666
+ "grad_norm": 1.3397020101547241,
667
+ "learning_rate": 0.0001416737222300438,
668
+ "loss": 0.5278,
669
+ "step": 930
670
+ },
671
+ {
672
+ "epoch": 1.1604938271604939,
673
+ "grad_norm": 1.1110204458236694,
674
+ "learning_rate": 0.00014045843457583085,
675
+ "loss": 0.5328,
676
+ "step": 940
677
+ },
678
+ {
679
+ "epoch": 1.1728395061728394,
680
+ "grad_norm": 1.1803778409957886,
681
+ "learning_rate": 0.0001392359593327778,
682
+ "loss": 0.5188,
683
+ "step": 950
684
+ },
685
+ {
686
+ "epoch": 1.1851851851851851,
687
+ "grad_norm": 1.1317882537841797,
688
+ "learning_rate": 0.00013800651367808158,
689
+ "loss": 0.5181,
690
+ "step": 960
691
+ },
692
+ {
693
+ "epoch": 1.1975308641975309,
694
+ "grad_norm": 1.2042200565338135,
695
+ "learning_rate": 0.00013677031602725822,
696
+ "loss": 0.5165,
697
+ "step": 970
698
+ },
699
+ {
700
+ "epoch": 1.2098765432098766,
701
+ "grad_norm": 1.357170820236206,
702
+ "learning_rate": 0.0001355275859953406,
703
+ "loss": 0.5201,
704
+ "step": 980
705
+ },
706
+ {
707
+ "epoch": 1.2222222222222223,
708
+ "grad_norm": 1.24747896194458,
709
+ "learning_rate": 0.00013427854435786303,
710
+ "loss": 0.5213,
711
+ "step": 990
712
+ },
713
+ {
714
+ "epoch": 1.2345679012345678,
715
+ "grad_norm": 1.2096565961837769,
716
+ "learning_rate": 0.00013302341301163953,
717
+ "loss": 0.5144,
718
+ "step": 1000
719
+ },
720
+ {
721
+ "epoch": 1.2469135802469136,
722
+ "grad_norm": 1.2155942916870117,
723
+ "learning_rate": 0.0001317624149353432,
724
+ "loss": 0.5203,
725
+ "step": 1010
726
+ },
727
+ {
728
+ "epoch": 1.2592592592592593,
729
+ "grad_norm": 1.2768797874450684,
730
+ "learning_rate": 0.00013049577414989317,
731
+ "loss": 0.5253,
732
+ "step": 1020
733
+ },
734
+ {
735
+ "epoch": 1.2716049382716048,
736
+ "grad_norm": 1.174811840057373,
737
+ "learning_rate": 0.0001292237156786565,
738
+ "loss": 0.5268,
739
+ "step": 1030
740
+ },
741
+ {
742
+ "epoch": 1.2839506172839505,
743
+ "grad_norm": 1.1357448101043701,
744
+ "learning_rate": 0.00012794646550747196,
745
+ "loss": 0.5317,
746
+ "step": 1040
747
+ },
748
+ {
749
+ "epoch": 1.2962962962962963,
750
+ "grad_norm": 1.138178825378418,
751
+ "learning_rate": 0.00012666425054450275,
752
+ "loss": 0.5052,
753
+ "step": 1050
754
+ },
755
+ {
756
+ "epoch": 1.308641975308642,
757
+ "grad_norm": 1.2756052017211914,
758
+ "learning_rate": 0.0001253772985799255,
759
+ "loss": 0.5297,
760
+ "step": 1060
761
+ },
762
+ {
763
+ "epoch": 1.3209876543209877,
764
+ "grad_norm": 1.306522250175476,
765
+ "learning_rate": 0.00012408583824546248,
766
+ "loss": 0.5199,
767
+ "step": 1070
768
+ },
769
+ {
770
+ "epoch": 1.3333333333333333,
771
+ "grad_norm": 1.2052675485610962,
772
+ "learning_rate": 0.00012279009897376444,
773
+ "loss": 0.5215,
774
+ "step": 1080
775
+ },
776
+ {
777
+ "epoch": 1.345679012345679,
778
+ "grad_norm": 1.2734299898147583,
779
+ "learning_rate": 0.00012149031095765087,
780
+ "loss": 0.5091,
781
+ "step": 1090
782
+ },
783
+ {
784
+ "epoch": 1.3580246913580247,
785
+ "grad_norm": 1.2760584354400635,
786
+ "learning_rate": 0.00012018670510921557,
787
+ "loss": 0.4978,
788
+ "step": 1100
789
+ },
790
+ {
791
+ "epoch": 1.3703703703703702,
792
+ "grad_norm": 1.286183476448059,
793
+ "learning_rate": 0.0001188795130188042,
794
+ "loss": 0.5433,
795
+ "step": 1110
796
+ },
797
+ {
798
+ "epoch": 1.382716049382716,
799
+ "grad_norm": 1.3962328433990479,
800
+ "learning_rate": 0.00011756896691387141,
801
+ "loss": 0.5576,
802
+ "step": 1120
803
+ },
804
+ {
805
+ "epoch": 1.3950617283950617,
806
+ "grad_norm": 1.234094262123108,
807
+ "learning_rate": 0.00011625529961772481,
808
+ "loss": 0.5274,
809
+ "step": 1130
810
+ },
811
+ {
812
+ "epoch": 1.4074074074074074,
813
+ "grad_norm": 1.2545151710510254,
814
+ "learning_rate": 0.00011493874450816302,
815
+ "loss": 0.5132,
816
+ "step": 1140
817
+ },
818
+ {
819
+ "epoch": 1.4197530864197532,
820
+ "grad_norm": 1.2998137474060059,
821
+ "learning_rate": 0.00011361953547601532,
822
+ "loss": 0.5102,
823
+ "step": 1150
824
+ },
825
+ {
826
+ "epoch": 1.4320987654320987,
827
+ "grad_norm": 1.1854768991470337,
828
+ "learning_rate": 0.00011229790688358994,
829
+ "loss": 0.5389,
830
+ "step": 1160
831
+ },
832
+ {
833
+ "epoch": 1.4444444444444444,
834
+ "grad_norm": 1.2062567472457886,
835
+ "learning_rate": 0.00011097409352303896,
836
+ "loss": 0.5038,
837
+ "step": 1170
838
+ },
839
+ {
840
+ "epoch": 1.4567901234567902,
841
+ "grad_norm": 1.1520670652389526,
842
+ "learning_rate": 0.00010964833057464645,
843
+ "loss": 0.5273,
844
+ "step": 1180
845
+ },
846
+ {
847
+ "epoch": 1.4691358024691357,
848
+ "grad_norm": 1.1943365335464478,
849
+ "learning_rate": 0.00010832085356504786,
850
+ "loss": 0.5149,
851
+ "step": 1190
852
+ },
853
+ {
854
+ "epoch": 1.4814814814814814,
855
+ "grad_norm": 1.239999771118164,
856
+ "learning_rate": 0.00010699189832538795,
857
+ "loss": 0.5234,
858
+ "step": 1200
859
+ },
860
+ {
861
+ "epoch": 1.4938271604938271,
862
+ "grad_norm": 1.1738687753677368,
863
+ "learning_rate": 0.00010566170094942438,
864
+ "loss": 0.5077,
865
+ "step": 1210
866
+ },
867
+ {
868
+ "epoch": 1.5061728395061729,
869
+ "grad_norm": 1.2650293111801147,
870
+ "learning_rate": 0.00010433049775158497,
871
+ "loss": 0.5139,
872
+ "step": 1220
873
+ },
874
+ {
875
+ "epoch": 1.5185185185185186,
876
+ "grad_norm": 1.3197226524353027,
877
+ "learning_rate": 0.00010299852522498535,
878
+ "loss": 0.4822,
879
+ "step": 1230
880
+ },
881
+ {
882
+ "epoch": 1.5308641975308643,
883
+ "grad_norm": 1.2888526916503906,
884
+ "learning_rate": 0.00010166601999941528,
885
+ "loss": 0.5005,
886
+ "step": 1240
887
+ },
888
+ {
889
+ "epoch": 1.5432098765432098,
890
+ "grad_norm": 1.2094467878341675,
891
+ "learning_rate": 0.00010033321879930044,
892
+ "loss": 0.489,
893
+ "step": 1250
894
+ },
895
+ {
896
+ "epoch": 1.5555555555555556,
897
+ "grad_norm": 1.161982536315918,
898
+ "learning_rate": 9.900035840164752e-05,
899
+ "loss": 0.5058,
900
+ "step": 1260
901
+ },
902
+ {
903
+ "epoch": 1.567901234567901,
904
+ "grad_norm": 1.1806972026824951,
905
+ "learning_rate": 9.766767559397977e-05,
906
+ "loss": 0.4991,
907
+ "step": 1270
908
+ },
909
+ {
910
+ "epoch": 1.5802469135802468,
911
+ "grad_norm": 1.2687534093856812,
912
+ "learning_rate": 9.633540713227095e-05,
913
+ "loss": 0.5262,
914
+ "step": 1280
915
+ },
916
+ {
917
+ "epoch": 1.5925925925925926,
918
+ "grad_norm": 1.2309614419937134,
919
+ "learning_rate": 9.500378969888479e-05,
920
+ "loss": 0.4909,
921
+ "step": 1290
922
+ },
923
+ {
924
+ "epoch": 1.6049382716049383,
925
+ "grad_norm": 1.2943015098571777,
926
+ "learning_rate": 9.367305986052747e-05,
927
+ "loss": 0.5267,
928
+ "step": 1300
929
+ },
930
+ {
931
+ "epoch": 1.617283950617284,
932
+ "grad_norm": 1.1998778581619263,
933
+ "learning_rate": 9.234345402622064e-05,
934
+ "loss": 0.5039,
935
+ "step": 1310
936
+ },
937
+ {
938
+ "epoch": 1.6296296296296298,
939
+ "grad_norm": 1.2614692449569702,
940
+ "learning_rate": 9.101520840530245e-05,
941
+ "loss": 0.509,
942
+ "step": 1320
943
+ },
944
+ {
945
+ "epoch": 1.6419753086419753,
946
+ "grad_norm": 1.2492973804473877,
947
+ "learning_rate": 8.968855896546429e-05,
948
+ "loss": 0.5279,
949
+ "step": 1330
950
+ },
951
+ {
952
+ "epoch": 1.654320987654321,
953
+ "grad_norm": 1.2979342937469482,
954
+ "learning_rate": 8.83637413908301e-05,
955
+ "loss": 0.4958,
956
+ "step": 1340
957
+ },
958
+ {
959
+ "epoch": 1.6666666666666665,
960
+ "grad_norm": 1.2049380540847778,
961
+ "learning_rate": 8.70409910400862e-05,
962
+ "loss": 0.5236,
963
+ "step": 1350
964
+ },
965
+ {
966
+ "epoch": 1.6790123456790123,
967
+ "grad_norm": 1.494537591934204,
968
+ "learning_rate": 8.572054290466911e-05,
969
+ "loss": 0.5099,
970
+ "step": 1360
971
+ },
972
+ {
973
+ "epoch": 1.691358024691358,
974
+ "grad_norm": 1.2835593223571777,
975
+ "learning_rate": 8.440263156701835e-05,
976
+ "loss": 0.5148,
977
+ "step": 1370
978
+ },
979
+ {
980
+ "epoch": 1.7037037037037037,
981
+ "grad_norm": 1.40305495262146,
982
+ "learning_rate": 8.308749115890212e-05,
983
+ "loss": 0.4957,
984
+ "step": 1380
985
+ },
986
+ {
987
+ "epoch": 1.7160493827160495,
988
+ "grad_norm": 1.2815101146697998,
989
+ "learning_rate": 8.177535531982266e-05,
990
+ "loss": 0.4919,
991
+ "step": 1390
992
+ },
993
+ {
994
+ "epoch": 1.7283950617283952,
995
+ "grad_norm": 1.3042811155319214,
996
+ "learning_rate": 8.046645715550971e-05,
997
+ "loss": 0.5041,
998
+ "step": 1400
999
+ },
1000
+ {
1001
+ "epoch": 1.7407407407407407,
1002
+ "grad_norm": 1.3259227275848389,
1003
+ "learning_rate": 7.916102919650826e-05,
1004
+ "loss": 0.5199,
1005
+ "step": 1410
1006
+ },
1007
+ {
1008
+ "epoch": 1.7530864197530864,
1009
+ "grad_norm": 1.3886185884475708,
1010
+ "learning_rate": 7.785930335686843e-05,
1011
+ "loss": 0.5027,
1012
+ "step": 1420
1013
+ },
1014
+ {
1015
+ "epoch": 1.765432098765432,
1016
+ "grad_norm": 1.3747823238372803,
1017
+ "learning_rate": 7.656151089294553e-05,
1018
+ "loss": 0.5039,
1019
+ "step": 1430
1020
+ },
1021
+ {
1022
+ "epoch": 1.7777777777777777,
1023
+ "grad_norm": 1.3446813821792603,
1024
+ "learning_rate": 7.526788236231621e-05,
1025
+ "loss": 0.4968,
1026
+ "step": 1440
1027
+ },
1028
+ {
1029
+ "epoch": 1.7901234567901234,
1030
+ "grad_norm": 1.2697521448135376,
1031
+ "learning_rate": 7.397864758281909e-05,
1032
+ "loss": 0.492,
1033
+ "step": 1450
1034
+ },
1035
+ {
1036
+ "epoch": 1.8024691358024691,
1037
+ "grad_norm": 1.2015522718429565,
1038
+ "learning_rate": 7.26940355917269e-05,
1039
+ "loss": 0.5009,
1040
+ "step": 1460
1041
+ },
1042
+ {
1043
+ "epoch": 1.8148148148148149,
1044
+ "grad_norm": 1.1260066032409668,
1045
+ "learning_rate": 7.141427460505712e-05,
1046
+ "loss": 0.4872,
1047
+ "step": 1470
1048
+ },
1049
+ {
1050
+ "epoch": 1.8271604938271606,
1051
+ "grad_norm": 1.410532832145691,
1052
+ "learning_rate": 7.013959197702851e-05,
1053
+ "loss": 0.5243,
1054
+ "step": 1480
1055
+ },
1056
+ {
1057
+ "epoch": 1.8395061728395061,
1058
+ "grad_norm": 1.280680537223816,
1059
+ "learning_rate": 6.887021415967081e-05,
1060
+ "loss": 0.4939,
1061
+ "step": 1490
1062
+ },
1063
+ {
1064
+ "epoch": 1.8518518518518519,
1065
+ "grad_norm": 1.3018995523452759,
1066
+ "learning_rate": 6.760636666259485e-05,
1067
+ "loss": 0.5088,
1068
+ "step": 1500
1069
+ },
1070
+ {
1071
+ "epoch": 1.8641975308641974,
1072
+ "grad_norm": 1.2070612907409668,
1073
+ "learning_rate": 6.634827401292981e-05,
1074
+ "loss": 0.4701,
1075
+ "step": 1510
1076
+ },
1077
+ {
1078
+ "epoch": 1.876543209876543,
1079
+ "grad_norm": 1.1327502727508545,
1080
+ "learning_rate": 6.50961597154351e-05,
1081
+ "loss": 0.4843,
1082
+ "step": 1520
1083
+ },
1084
+ {
1085
+ "epoch": 1.8888888888888888,
1086
+ "grad_norm": 1.2304959297180176,
1087
+ "learning_rate": 6.385024621279411e-05,
1088
+ "loss": 0.4749,
1089
+ "step": 1530
1090
+ },
1091
+ {
1092
+ "epoch": 1.9012345679012346,
1093
+ "grad_norm": 1.3501994609832764,
1094
+ "learning_rate": 6.261075484609634e-05,
1095
+ "loss": 0.4999,
1096
+ "step": 1540
1097
+ },
1098
+ {
1099
+ "epoch": 1.9135802469135803,
1100
+ "grad_norm": 1.2083770036697388,
1101
+ "learning_rate": 6.137790581551525e-05,
1102
+ "loss": 0.4813,
1103
+ "step": 1550
1104
+ },
1105
+ {
1106
+ "epoch": 1.925925925925926,
1107
+ "grad_norm": 1.230406641960144,
1108
+ "learning_rate": 6.0151918141189156e-05,
1109
+ "loss": 0.4897,
1110
+ "step": 1560
1111
+ },
1112
+ {
1113
+ "epoch": 1.9382716049382716,
1114
+ "grad_norm": 1.3704227209091187,
1115
+ "learning_rate": 5.893300962431123e-05,
1116
+ "loss": 0.4931,
1117
+ "step": 1570
1118
+ },
1119
+ {
1120
+ "epoch": 1.9506172839506173,
1121
+ "grad_norm": 1.153743863105774,
1122
+ "learning_rate": 5.772139680843651e-05,
1123
+ "loss": 0.4776,
1124
+ "step": 1580
1125
+ },
1126
+ {
1127
+ "epoch": 1.9629629629629628,
1128
+ "grad_norm": 1.553464651107788,
1129
+ "learning_rate": 5.651729494101201e-05,
1130
+ "loss": 0.4899,
1131
+ "step": 1590
1132
+ },
1133
+ {
1134
+ "epoch": 1.9753086419753085,
1135
+ "grad_norm": 1.3536055088043213,
1136
+ "learning_rate": 5.532091793513732e-05,
1137
+ "loss": 0.4734,
1138
+ "step": 1600
1139
+ },
1140
+ {
1141
+ "epoch": 1.9876543209876543,
1142
+ "grad_norm": 1.220577597618103,
1143
+ "learning_rate": 5.413247833156219e-05,
1144
+ "loss": 0.4677,
1145
+ "step": 1610
1146
+ },
1147
+ {
1148
+ "epoch": 2.0,
1149
+ "grad_norm": 1.194543480873108,
1150
+ "learning_rate": 5.2952187260927675e-05,
1151
+ "loss": 0.4738,
1152
+ "step": 1620
1153
+ },
1154
+ {
1155
+ "epoch": 2.0,
1156
+ "eval_loss": 0.6842637062072754,
1157
+ "eval_runtime": 410.4698,
1158
+ "eval_samples_per_second": 3.508,
1159
+ "eval_steps_per_second": 0.877,
1160
+ "step": 1620
1161
+ }
1162
+ ],
1163
+ "logging_steps": 10,
1164
+ "max_steps": 2430,
1165
+ "num_input_tokens_seen": 0,
1166
+ "num_train_epochs": 3,
1167
+ "save_steps": 500,
1168
+ "stateful_callbacks": {
1169
+ "TrainerControl": {
1170
+ "args": {
1171
+ "should_epoch_stop": false,
1172
+ "should_evaluate": false,
1173
+ "should_log": false,
1174
+ "should_save": true,
1175
+ "should_training_stop": false
1176
+ },
1177
+ "attributes": {}
1178
+ }
1179
+ },
1180
+ "total_flos": 2.8474547416911053e+17,
1181
+ "train_batch_size": 4,
1182
+ "trial_name": null,
1183
+ "trial_params": null
1184
+ }
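The trainer state above records the loss curve for the first two of three planned epochs: training loss drops from about 1.86 at step 10 to about 0.47 at step 1620, and the evaluation loss improves from 0.7334 after epoch 1 to 0.6843 after epoch 2, which is also the best checkpoint. A short sketch for extracting those numbers, assuming the file is saved locally as `trainer_state-6.json`:

```python
# Sketch: summarise the loss curve recorded in trainer_state-6.json.
import json

with open("trainer_state-6.json") as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"best checkpoint: {state['best_model_checkpoint']} (eval_loss={state['best_metric']:.4f})")
print(f"train loss: {train[0][1]:.3f} (step {train[0][0]}) -> {train[-1][1]:.3f} (step {train[-1][0]})")
for step, loss in evals:
    print(f"eval_loss at step {step}: {loss:.4f}")
```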
training_args-6.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c63a8058f1f43024b3dc1982d0bdea55cef0e28323149cf48e044a325a6f066
3
+ size 5841