dwnmf committed on Mar 8

Commit

0dc26a0

verified ·

1 Parent(s): ccd4ce8

Upload folder using huggingface_hub

Browse files

Files changed (24) hide show

.gitattributes +2 -0
checkpoints/README.md +59 -0
checkpoints/checkpoint-1200/README.md +210 -0
checkpoints/checkpoint-1200/adapter_config.json +50 -0
checkpoints/checkpoint-1200/adapter_model.safetensors +3 -0
checkpoints/checkpoint-1200/chat_template.jinja +8 -0
checkpoints/checkpoint-1200/optimizer.pt +3 -0
checkpoints/checkpoint-1200/rng_state.pth +3 -0
checkpoints/checkpoint-1200/scheduler.pt +3 -0
checkpoints/checkpoint-1200/tokenizer.json +3 -0
checkpoints/checkpoint-1200/tokenizer_config.json +9 -0
checkpoints/checkpoint-1200/trainer_state.json +874 -0
checkpoints/checkpoint-1200/training_args.bin +3 -0
checkpoints/checkpoint-1264/README.md +210 -0
checkpoints/checkpoint-1264/adapter_config.json +50 -0
checkpoints/checkpoint-1264/adapter_model.safetensors +3 -0
checkpoints/checkpoint-1264/chat_template.jinja +8 -0
checkpoints/checkpoint-1264/optimizer.pt +3 -0
checkpoints/checkpoint-1264/rng_state.pth +3 -0
checkpoints/checkpoint-1264/scheduler.pt +3 -0
checkpoints/checkpoint-1264/tokenizer.json +3 -0
checkpoints/checkpoint-1264/tokenizer_config.json +9 -0
checkpoints/checkpoint-1264/trainer_state.json +916 -0
checkpoints/checkpoint-1264/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/checkpoint-1264/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+---
+base_model: Nanbeige/Nanbeige4.1-3B
+library_name: transformers
+model_name: checkpoints
+tags:
+- generated_from_trainer
+- sft
+- unsloth
+- trl
+licence: license
+---
+# Model Card for checkpoints
+This model is a fine-tuned version of [Nanbeige/Nanbeige4.1-3B](https://huggingface.co/Nanbeige/Nanbeige4.1-3B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
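+generator = pipeline("text-generation", model="<hub-repo-id-or-local-checkpoint-path>", device="cuda")  # placeholder: replace with this model's Hub repo ID or a local path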
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with SFT.
+### Framework versions
+- TRL: 0.22.2
+- Transformers: 5.2.0
+- Pytorch: 2.8.0
+- Datasets: 4.3.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

checkpoints/checkpoint-1200/README.md ADDED Viewed

	@@ -0,0 +1,210 @@

+---
+base_model: Nanbeige/Nanbeige4.1-3B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Nanbeige/Nanbeige4.1-3B
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoints/checkpoint-1200/adapter_config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "Nanbeige/Nanbeige4.1-3B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/checkpoint-1200/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51e63b780a72dea5fb265dd60ccfe4a1dffb4cff0f519c4717631fd64bf38918
+size 455142376

checkpoints/checkpoint-1200/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,8 @@

+{% for message in messages %}{% if message['role'] == 'user' %}{{'<|im_start|>user
+' + message['content'] + '<|im_end|>
+'}}{% elif message['role'] == 'assistant' %}{{'<|im_start|>assistant
+' + message['content'] + '<|im_end|>
+' }}{% else %}{{ '<|im_start|>system
+' + message['content'] + '<|im_end|>
+' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/checkpoint-1200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59062d6a2b6fde81ef8d50a9e894d2f4bc612f5d3a4c695d6d7e6dd83beb38dd
+size 231623653

checkpoints/checkpoint-1200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c800b778fa7e115e4c34de8529902de8b61c9a1b4bab3eb8295d06dafff030e
+size 14645

checkpoints/checkpoint-1200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc25d9e1d0e95b057038fa6c0f7cef25ec39c084034f1bf655dfcb4214ff7555
+size 1465

checkpoints/checkpoint-1200/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d8f0326910136aca20831249220b38ce5299527647bc8c6b65404485c479740
+size 18451122

checkpoints/checkpoint-1200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|im_end|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

checkpoints/checkpoint-1200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,874 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.8990898298377523,
+  "eval_steps": 500,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.015829046299960427,
+      "grad_norm": 3.5748817920684814,
+      "learning_rate": 3.5433070866141735e-06,
+      "loss": 1.7827543258666991,
+      "step": 10
+    },
+    {
+      "epoch": 0.031658092599920855,
+      "grad_norm": 1.181498408317566,
+      "learning_rate": 7.4803149606299226e-06,
+      "loss": 1.6411018371582031,
+      "step": 20
+    },
+    {
+      "epoch": 0.04748713889988128,
+      "grad_norm": 0.5824679732322693,
+      "learning_rate": 1.141732283464567e-05,
+      "loss": 1.4533835411071778,
+      "step": 30
+    },
+    {
+      "epoch": 0.06331618519984171,
+      "grad_norm": 0.5377753973007202,
+      "learning_rate": 1.535433070866142e-05,
+      "loss": 1.3427258491516114,
+      "step": 40
+    },
+    {
+      "epoch": 0.07914523149980214,
+      "grad_norm": 0.5031867027282715,
+      "learning_rate": 1.9291338582677166e-05,
+      "loss": 1.1043410301208496,
+      "step": 50
+    },
+    {
+      "epoch": 0.09497427779976256,
+      "grad_norm": 0.2011471688747406,
+      "learning_rate": 2.3228346456692916e-05,
+      "loss": 0.963739013671875,
+      "step": 60
+    },
+    {
+      "epoch": 0.11080332409972299,
+      "grad_norm": 0.1923132687807083,
+      "learning_rate": 2.7165354330708666e-05,
+      "loss": 0.9648835182189941,
+      "step": 70
+    },
+    {
+      "epoch": 0.12663237039968342,
+      "grad_norm": 0.19071003794670105,
+      "learning_rate": 3.110236220472441e-05,
+      "loss": 0.9172444343566895,
+      "step": 80
+    },
+    {
+      "epoch": 0.14246141669964385,
+      "grad_norm": 0.16827628016471863,
+      "learning_rate": 3.5039370078740156e-05,
+      "loss": 0.9281296730041504,
+      "step": 90
+    },
+    {
+      "epoch": 0.15829046299960428,
+      "grad_norm": 0.17542941868305206,
+      "learning_rate": 3.8976377952755905e-05,
+      "loss": 0.9268568992614746,
+      "step": 100
+    },
+    {
+      "epoch": 0.1741195092995647,
+      "grad_norm": 0.18605543673038483,
+      "learning_rate": 4.2913385826771655e-05,
+      "loss": 0.9156542778015136,
+      "step": 110
+    },
+    {
+      "epoch": 0.18994855559952512,
+      "grad_norm": 0.19004584848880768,
+      "learning_rate": 4.6850393700787405e-05,
+      "loss": 0.912873363494873,
+      "step": 120
+    },
+    {
+      "epoch": 0.20577760189948555,
+      "grad_norm": 0.1750367432832718,
+      "learning_rate": 4.999961827753896e-05,
+      "loss": 0.9295230865478515,
+      "step": 130
+    },
+    {
+      "epoch": 0.22160664819944598,
+      "grad_norm": 0.18699847161769867,
+      "learning_rate": 4.998625921534381e-05,
+      "loss": 0.9282792091369629,
+      "step": 140
+    },
+    {
+      "epoch": 0.2374356944994064,
+      "grad_norm": 0.18123294413089752,
+      "learning_rate": 4.9953825685459635e-05,
+      "loss": 0.9071338653564454,
+      "step": 150
+    },
+    {
+      "epoch": 0.25326474079936684,
+      "grad_norm": 0.19085095822811127,
+      "learning_rate": 4.990234244758785e-05,
+      "loss": 0.8944445610046386,
+      "step": 160
+    },
+    {
+      "epoch": 0.26909378709932724,
+      "grad_norm": 0.1798243671655655,
+      "learning_rate": 4.9831848803944474e-05,
+      "loss": 0.9216786384582519,
+      "step": 170
+    },
+    {
+      "epoch": 0.2849228333992877,
+      "grad_norm": 0.16492608189582825,
+      "learning_rate": 4.9742398569256896e-05,
+      "loss": 0.8954303741455079,
+      "step": 180
+    },
+    {
+      "epoch": 0.3007518796992481,
+      "grad_norm": 0.1820351481437683,
+      "learning_rate": 4.963406002968179e-05,
+      "loss": 0.8920731544494629,
+      "step": 190
+    },
+    {
+      "epoch": 0.31658092599920856,
+      "grad_norm": 0.1755346953868866,
+      "learning_rate": 4.950691589067557e-05,
+      "loss": 0.8888743400573731,
+      "step": 200
+    },
+    {
+      "epoch": 0.33240997229916897,
+      "grad_norm": 0.17681735754013062,
+      "learning_rate": 4.9361063213857194e-05,
+      "loss": 0.8969594955444335,
+      "step": 210
+    },
+    {
+      "epoch": 0.3482390185991294,
+      "grad_norm": 0.1814550906419754,
+      "learning_rate": 4.91966133429115e-05,
+      "loss": 0.8851950645446778,
+      "step": 220
+    },
+    {
+      "epoch": 0.3640680648990898,
+      "grad_norm": 0.17385666072368622,
+      "learning_rate": 4.901369181858964e-05,
+      "loss": 0.9051773071289062,
+      "step": 230
+    },
+    {
+      "epoch": 0.37989711119905023,
+      "grad_norm": 0.1662660837173462,
+      "learning_rate": 4.88124382828714e-05,
+      "loss": 0.8571414947509766,
+      "step": 240
+    },
+    {
+      "epoch": 0.3957261574990107,
+      "grad_norm": 0.18324530124664307,
+      "learning_rate": 4.859300637236289e-05,
+      "loss": 0.8993622779846191,
+      "step": 250
+    },
+    {
+      "epoch": 0.4115552037989711,
+      "grad_norm": 0.17792873084545135,
+      "learning_rate": 4.83555636010105e-05,
+      "loss": 0.888437557220459,
+      "step": 260
+    },
+    {
+      "epoch": 0.42738425009893155,
+      "grad_norm": 0.17820881307125092,
+      "learning_rate": 4.810029123222109e-05,
+      "loss": 0.873835277557373,
+      "step": 270
+    },
+    {
+      "epoch": 0.44321329639889195,
+      "grad_norm": 0.17558087408542633,
+      "learning_rate": 4.782738414048581e-05,
+      "loss": 0.8789983749389648,
+      "step": 280
+    },
+    {
+      "epoch": 0.4590423426988524,
+      "grad_norm": 0.18305832147598267,
+      "learning_rate": 4.753705066261326e-05,
+      "loss": 0.8827195167541504,
+      "step": 290
+    },
+    {
+      "epoch": 0.4748713889988128,
+      "grad_norm": 0.1801442950963974,
+      "learning_rate": 4.722951243868548e-05,
+      "loss": 0.8785966873168946,
+      "step": 300
+    },
+    {
+      "epoch": 0.4907004352987733,
+      "grad_norm": 0.16704759001731873,
+      "learning_rate": 4.690500424285833e-05,
+      "loss": 0.8711637496948242,
+      "step": 310
+    },
+    {
+      "epoch": 0.5065294815987337,
+      "grad_norm": 0.1603054255247116,
+      "learning_rate": 4.6563773804135305e-05,
+      "loss": 0.8773131370544434,
+      "step": 320
+    },
+    {
+      "epoch": 0.5223585278986941,
+      "grad_norm": 0.17996226251125336,
+      "learning_rate": 4.62060816172516e-05,
+      "loss": 0.8801346778869629,
+      "step": 330
+    },
+    {
+      "epoch": 0.5381875741986545,
+      "grad_norm": 0.16810840368270874,
+      "learning_rate": 4.583220074381288e-05,
+      "loss": 0.8802348136901855,
+      "step": 340
+    },
+    {
+      "epoch": 0.554016620498615,
+      "grad_norm": 0.18790695071220398,
+      "learning_rate": 4.544241660384057e-05,
+      "loss": 0.8991734504699707,
+      "step": 350
+    },
+    {
+      "epoch": 0.5698456667985754,
+      "grad_norm": 0.17079883813858032,
+      "learning_rate": 4.503702675788262e-05,
+      "loss": 0.8455135345458984,
+      "step": 360
+    },
+    {
+      "epoch": 0.5856747130985358,
+      "grad_norm": 0.1780555248260498,
+      "learning_rate": 4.4616340679856336e-05,
+      "loss": 0.8810781478881836,
+      "step": 370
+    },
+    {
+      "epoch": 0.6015037593984962,
+      "grad_norm": 0.17815925180912018,
+      "learning_rate": 4.418067952079651e-05,
+      "loss": 0.8761508941650391,
+      "step": 380
+    },
+    {
+      "epoch": 0.6173328056984567,
+      "grad_norm": 0.1707397699356079,
+      "learning_rate": 4.3730375863689256e-05,
+      "loss": 0.8592373847961425,
+      "step": 390
+    },
+    {
+      "epoch": 0.6331618519984171,
+      "grad_norm": 0.16248367726802826,
+      "learning_rate": 4.326577346957875e-05,
+      "loss": 0.8755002021789551,
+      "step": 400
+    },
+    {
+      "epoch": 0.6489908982983775,
+      "grad_norm": 0.165540874004364,
+      "learning_rate": 4.278722701514061e-05,
+      "loss": 0.867929744720459,
+      "step": 410
+    },
+    {
+      "epoch": 0.6648199445983379,
+      "grad_norm": 0.16495656967163086,
+      "learning_rate": 4.229510182192235e-05,
+      "loss": 0.8697072982788085,
+      "step": 420
+    },
+    {
+      "epoch": 0.6806489908982983,
+      "grad_norm": 0.16391977667808533,
+      "learning_rate": 4.178977357745749e-05,
+      "loss": 0.8804462432861329,
+      "step": 430
+    },
+    {
+      "epoch": 0.6964780371982588,
+      "grad_norm": 0.16260066628456116,
+      "learning_rate": 4.12716280484664e-05,
+      "loss": 0.8600423812866211,
+      "step": 440
+    },
+    {
+      "epoch": 0.7123070834982193,
+      "grad_norm": 0.16413277387619019,
+      "learning_rate": 4.0741060786362585e-05,
+      "loss": 0.8589424133300781,
+      "step": 450
+    },
+    {
+      "epoch": 0.7281361297981797,
+      "grad_norm": 0.16915135085582733,
+      "learning_rate": 4.0198476825289434e-05,
+      "loss": 0.8609626770019532,
+      "step": 460
+    },
+    {
+      "epoch": 0.7439651760981401,
+      "grad_norm": 0.17290306091308594,
+      "learning_rate": 3.9644290372917844e-05,
+      "loss": 0.8830280303955078,
+      "step": 470
+    },
+    {
+      "epoch": 0.7597942223981005,
+      "grad_norm": 0.16407273709774017,
+      "learning_rate": 3.907892449424081e-05,
+      "loss": 0.8745547294616699,
+      "step": 480
+    },
+    {
+      "epoch": 0.775623268698061,
+      "grad_norm": 0.16706892848014832,
+      "learning_rate": 3.850281078860627e-05,
+      "loss": 0.8945063591003418,
+      "step": 490
+    },
+    {
+      "epoch": 0.7914523149980214,
+      "grad_norm": 0.17343616485595703,
+      "learning_rate": 3.7916389060234964e-05,
+      "loss": 0.8862369537353516,
+      "step": 500
+    },
+    {
+      "epoch": 0.8072813612979818,
+      "grad_norm": 0.17225787043571472,
+      "learning_rate": 3.7320106982474625e-05,
+      "loss": 0.8728190422058105,
+      "step": 510
+    },
+    {
+      "epoch": 0.8231104075979422,
+      "grad_norm": 0.1631278693675995,
+      "learning_rate": 3.6714419756046885e-05,
+      "loss": 0.8206952095031739,
+      "step": 520
+    },
+    {
+      "epoch": 0.8389394538979027,
+      "grad_norm": 0.1752168983221054,
+      "learning_rate": 3.6099789761547834e-05,
+      "loss": 0.8330535888671875,
+      "step": 530
+    },
+    {
+      "epoch": 0.8547685001978631,
+      "grad_norm": 0.1791313886642456,
+      "learning_rate": 3.5476686206467466e-05,
+      "loss": 0.8688525199890137,
+      "step": 540
+    },
+    {
+      "epoch": 0.8705975464978235,
+      "grad_norm": 0.16879358887672424,
+      "learning_rate": 3.484558476699748e-05,
+      "loss": 0.8748814582824707,
+      "step": 550
+    },
+    {
+      "epoch": 0.8864265927977839,
+      "grad_norm": 0.1700020432472229,
+      "learning_rate": 3.4206967224900884e-05,
+      "loss": 0.8380928993225097,
+      "step": 560
+    },
+    {
+      "epoch": 0.9022556390977443,
+      "grad_norm": 0.16251260042190552,
+      "learning_rate": 3.35613210997206e-05,
+      "loss": 0.8361630439758301,
+      "step": 570
+    },
+    {
+      "epoch": 0.9180846853977048,
+      "grad_norm": 0.1657862812280655,
+      "learning_rate": 3.290913927660793e-05,
+      "loss": 0.8503948211669922,
+      "step": 580
+    },
+    {
+      "epoch": 0.9339137316976652,
+      "grad_norm": 0.17638036608695984,
+      "learning_rate": 3.22509196300548e-05,
+      "loss": 0.8723633766174317,
+      "step": 590
+    },
+    {
+      "epoch": 0.9497427779976256,
+      "grad_norm": 0.17021600902080536,
+      "learning_rate": 3.158716464381728e-05,
+      "loss": 0.8567726135253906,
+      "step": 600
+    },
+    {
+      "epoch": 0.965571824297586,
+      "grad_norm": 0.17130261659622192,
+      "learning_rate": 3.091838102732031e-05,
+      "loss": 0.8626362800598144,
+      "step": 610
+    },
+    {
+      "epoch": 0.9814008705975465,
+      "grad_norm": 0.16850486397743225,
+      "learning_rate": 3.024507932883659e-05,
+      "loss": 0.8559526443481446,
+      "step": 620
+    },
+    {
+      "epoch": 0.997229916897507,
+      "grad_norm": 0.1741161048412323,
+      "learning_rate": 2.9567773545734916e-05,
+      "loss": 0.8548683166503906,
+      "step": 630
+    },
+    {
+      "epoch": 1.0126632370399684,
+      "grad_norm": 0.1648906171321869,
+      "learning_rate": 2.8886980732095464e-05,
+      "loss": 0.8245421409606933,
+      "step": 640
+    },
+    {
+      "epoch": 1.0284922833399288,
+      "grad_norm": 0.1882442682981491,
+      "learning_rate": 2.820322060399156e-05,
+      "loss": 0.8283076286315918,
+      "step": 650
+    },
+    {
+      "epoch": 1.0443213296398892,
+      "grad_norm": 0.16794943809509277,
+      "learning_rate": 2.7517015142739335e-05,
+      "loss": 0.8383002281188965,
+      "step": 660
+    },
+    {
+      "epoch": 1.0601503759398496,
+      "grad_norm": 0.17361986637115479,
+      "learning_rate": 2.6828888196418088e-05,
+      "loss": 0.8475232124328613,
+      "step": 670
+    },
+    {
+      "epoch": 1.07597942223981,
+      "grad_norm": 0.16774870455265045,
+      "learning_rate": 2.6139365079965538e-05,
+      "loss": 0.8095685005187988,
+      "step": 680
+    },
+    {
+      "epoch": 1.0918084685397704,
+      "grad_norm": 0.1640879362821579,
+      "learning_rate": 2.5448972174153318e-05,
+      "loss": 0.8009868621826172,
+      "step": 690
+    },
+    {
+      "epoch": 1.1076375148397308,
+      "grad_norm": 0.18289130926132202,
+      "learning_rate": 2.4758236523748733e-05,
+      "loss": 0.8200379371643066,
+      "step": 700
+    },
+    {
+      "epoch": 1.1234665611396912,
+      "grad_norm": 0.1810799241065979,
+      "learning_rate": 2.4067685435169772e-05,
+      "loss": 0.8436824798583984,
+      "step": 710
+    },
+    {
+      "epoch": 1.1392956074396516,
+      "grad_norm": 0.18245543539524078,
+      "learning_rate": 2.3377846073940207e-05,
+      "loss": 0.8141367912292481,
+      "step": 720
+    },
+    {
+      "epoch": 1.1551246537396123,
+      "grad_norm": 0.18102015554904938,
+      "learning_rate": 2.2689245062252398e-05,
+      "loss": 0.8481219291687012,
+      "step": 730
+    },
+    {
+      "epoch": 1.1709537000395727,
+      "grad_norm": 0.19379396736621857,
+      "learning_rate": 2.200240807694474e-05,
+      "loss": 0.8380316734313965,
+      "step": 740
+    },
+    {
+      "epoch": 1.186782746339533,
+      "grad_norm": 0.17735467851161957,
+      "learning_rate": 2.131785944820092e-05,
+      "loss": 0.8194567680358886,
+      "step": 750
+    },
+    {
+      "epoch": 1.2026117926394935,
+      "grad_norm": 0.18803226947784424,
+      "learning_rate": 2.0636121759277137e-05,
+      "loss": 0.8280925750732422,
+      "step": 760
+    },
+    {
+      "epoch": 1.2184408389394539,
+      "grad_norm": 0.19116626679897308,
+      "learning_rate": 1.995771544756287e-05,
+      "loss": 0.8177350044250489,
+      "step": 770
+    },
+    {
+      "epoch": 1.2342698852394143,
+      "grad_norm": 0.17773468792438507,
+      "learning_rate": 1.9283158407279977e-05,
+      "loss": 0.8399795532226563,
+      "step": 780
+    },
+    {
+      "epoch": 1.2500989315393747,
+      "grad_norm": 0.18355493247509003,
+      "learning_rate": 1.861296559412303e-05,
+      "loss": 0.8454893112182618,
+      "step": 790
+    },
+    {
+      "epoch": 1.2659279778393353,
+      "grad_norm": 0.18507550656795502,
+      "learning_rate": 1.7947648632143076e-05,
+      "loss": 0.8371585845947266,
+      "step": 800
+    },
+    {
+      "epoch": 1.2817570241392957,
+      "grad_norm": 0.18332813680171967,
+      "learning_rate": 1.728771542317466e-05,
+      "loss": 0.8566581726074218,
+      "step": 810
+    },
+    {
+      "epoch": 1.2975860704392561,
+      "grad_norm": 0.18364256620407104,
+      "learning_rate": 1.6633669759104488e-05,
+      "loss": 0.8170791625976562,
+      "step": 820
+    },
+    {
+      "epoch": 1.3134151167392165,
+      "grad_norm": 0.1980493664741516,
+      "learning_rate": 1.598601093727749e-05,
+      "loss": 0.852113151550293,
+      "step": 830
+    },
+    {
+      "epoch": 1.329244163039177,
+      "grad_norm": 0.20105807483196259,
+      "learning_rate": 1.5345233379334155e-05,
+      "loss": 0.8228271484375,
+      "step": 840
+    },
+    {
+      "epoch": 1.3450732093391373,
+      "grad_norm": 0.18281184136867523,
+      "learning_rate": 1.4711826253769827e-05,
+      "loss": 0.8469521522521972,
+      "step": 850
+    },
+    {
+      "epoch": 1.3609022556390977,
+      "grad_norm": 0.19825202226638794,
+      "learning_rate": 1.4086273102504341e-05,
+      "loss": 0.8229537010192871,
+      "step": 860
+    },
+    {
+      "epoch": 1.3767313019390581,
+      "grad_norm": 0.1836749017238617,
+      "learning_rate": 1.346905147174694e-05,
+      "loss": 0.8486099243164062,
+      "step": 870
+    },
+    {
+      "epoch": 1.3925603482390185,
+      "grad_norm": 0.18626976013183594,
+      "learning_rate": 1.2860632547438333e-05,
+      "loss": 0.8549923896789551,
+      "step": 880
+    },
+    {
+      "epoch": 1.408389394538979,
+      "grad_norm": 0.1800047755241394,
+      "learning_rate": 1.2261480795548123e-05,
+      "loss": 0.8070058822631836,
+      "step": 890
+    },
+    {
+      "epoch": 1.4242184408389393,
+      "grad_norm": 0.1864195019006729,
+      "learning_rate": 1.167205360750227e-05,
+      "loss": 0.831914234161377,
+      "step": 900
+    },
+    {
+      "epoch": 1.4400474871389,
+      "grad_norm": 0.18130667507648468,
+      "learning_rate": 1.1092800951011285e-05,
+      "loss": 0.826213264465332,
+      "step": 910
+    },
+    {
+      "epoch": 1.4558765334388604,
+      "grad_norm": 0.19690996408462524,
+      "learning_rate": 1.0524165026565654e-05,
+      "loss": 0.8235694885253906,
+      "step": 920
+    },
+    {
+      "epoch": 1.4717055797388208,
+      "grad_norm": 0.1875549703836441,
+      "learning_rate": 9.966579929860704e-06,
+      "loss": 0.8336921691894531,
+      "step": 930
+    },
+    {
+      "epoch": 1.4875346260387812,
+      "grad_norm": 0.18889833986759186,
+      "learning_rate": 9.420471320408669e-06,
+      "loss": 0.7959968090057373,
+      "step": 940
+    },
+    {
+      "epoch": 1.5033636723387416,
+      "grad_norm": 0.18145473301410675,
+      "learning_rate": 8.886256096591048e-06,
+      "loss": 0.8122786521911621,
+      "step": 950
+    },
+    {
+      "epoch": 1.519192718638702,
+      "grad_norm": 0.18262115120887756,
+      "learning_rate": 8.364342077398971e-06,
+      "loss": 0.8186595916748047,
+      "step": 960
+    },
+    {
+      "epoch": 1.5350217649386626,
+      "grad_norm": 0.19150428473949432,
+      "learning_rate": 7.855127691104943e-06,
+      "loss": 0.8238736152648926,
+      "step": 970
+    },
+    {
+      "epoch": 1.550850811238623,
+      "grad_norm": 0.1934870034456253,
+      "learning_rate": 7.359001671103361e-06,
+      "loss": 0.8132981300354004,
+      "step": 980
+    },
+    {
+      "epoch": 1.5666798575385834,
+      "grad_norm": 0.1910739690065384,
+      "learning_rate": 6.8763427591521215e-06,
+      "loss": 0.8252732276916503,
+      "step": 990
+    },
+    {
+      "epoch": 1.5825089038385438,
+      "grad_norm": 0.19539424777030945,
+      "learning_rate": 6.407519416241778e-06,
+      "loss": 0.8613625526428222,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5983379501385042,
+      "grad_norm": 0.19278913736343384,
+      "learning_rate": 5.95288954131307e-06,
+      "loss": 0.8396816253662109,
+      "step": 1010
+    },
+    {
+      "epoch": 1.6141669964384646,
+      "grad_norm": 0.19722530245780945,
+      "learning_rate": 5.512800198037477e-06,
+      "loss": 0.849429988861084,
+      "step": 1020
+    },
+    {
+      "epoch": 1.629996042738425,
+      "grad_norm": 0.19314146041870117,
+      "learning_rate": 5.087587349869396e-06,
+      "loss": 0.853305721282959,
+      "step": 1030
+    },
+    {
+      "epoch": 1.6458250890383854,
+      "grad_norm": 0.18120236694812775,
+      "learning_rate": 4.677575603572235e-06,
+      "loss": 0.8267157554626465,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6616541353383458,
+      "grad_norm": 0.1896786242723465,
+      "learning_rate": 4.283077961414125e-06,
+      "loss": 0.811297607421875,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6774831816383062,
+      "grad_norm": 0.19132214784622192,
+      "learning_rate": 3.904395582222578e-06,
+      "loss": 0.8262212753295899,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6933122279382666,
+      "grad_norm": 0.18690507113933563,
+      "learning_rate": 3.541817551480292e-06,
+      "loss": 0.8474508285522461,
+      "step": 1070
+    },
+    {
+      "epoch": 1.709141274238227,
+      "grad_norm": 0.1666112095117569,
+      "learning_rate": 3.1956206606378187e-06,
+      "loss": 0.8323755264282227,
+      "step": 1080
+    },
+    {
+      "epoch": 1.7249703205381874,
+      "grad_norm": 0.1838321089744568,
+      "learning_rate": 2.8660691958114382e-06,
+      "loss": 0.8172253608703614,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7407993668381478,
+      "grad_norm": 0.1931111067533493,
+      "learning_rate": 2.553414736027601e-06,
+      "loss": 0.8497982978820801,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7566284131381085,
+      "grad_norm": 0.19277822971343994,
+      "learning_rate": 2.257895961167886e-06,
+      "loss": 0.8303280830383301,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7724574594380689,
+      "grad_norm": 0.1878863126039505,
+      "learning_rate": 1.9797384697612276e-06,
+      "loss": 0.8127182006835938,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7882865057380293,
+      "grad_norm": 0.20031479001045227,
+      "learning_rate": 1.7191546067623772e-06,
+      "loss": 0.8480099678039551,
+      "step": 1130
+    },
+    {
+      "epoch": 1.8041155520379897,
+      "grad_norm": 0.18454383313655853,
+      "learning_rate": 1.4763433014481103e-06,
+      "loss": 0.837119197845459,
+      "step": 1140
+    },
+    {
+      "epoch": 1.8199445983379503,
+      "grad_norm": 0.2058745175600052,
+      "learning_rate": 1.2514899155549626e-06,
+      "loss": 0.8471168518066406,
+      "step": 1150
+    },
+    {
+      "epoch": 1.8357736446379107,
+      "grad_norm": 0.19359512627124786,
+      "learning_rate": 1.044766101774397e-06,
+      "loss": 0.8292506217956543,
+      "step": 1160
+    },
+    {
+      "epoch": 1.851602690937871,
+      "grad_norm": 0.2011612206697464,
+      "learning_rate": 8.563296727134435e-07,
+      "loss": 0.8529328346252442,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8674317372378315,
+      "grad_norm": 0.1884250044822693,
+      "learning_rate": 6.863244804208052e-07,
+      "loss": 0.8327092170715332,
+      "step": 1180
+    },
+    {
+      "epoch": 1.883260783537792,
+      "grad_norm": 0.19094131886959076,
+      "learning_rate": 5.348803065704483e-07,
+      "loss": 0.843638801574707,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8990898298377523,
+      "grad_norm": 0.20150353014469147,
+      "learning_rate": 4.021127633865196e-07,
+      "loss": 0.8138184547424316,
+      "step": 1200
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1264,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3554170580086354e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/checkpoint-1200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c014ebd63dab9deb812f3c92192ce130717d3332c747af68efe98595e1e3890
+size 5713

checkpoints/checkpoint-1264/README.md ADDED Viewed

	@@ -0,0 +1,210 @@

+---
+base_model: Nanbeige/Nanbeige4.1-3B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Nanbeige/Nanbeige4.1-3B
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoints/checkpoint-1264/adapter_config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "Nanbeige/Nanbeige4.1-3B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/checkpoint-1264/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81bcffd4d2674e1f8d95a6c41a921fba9b5e05aed96acbbb0ef1dea117b8d660
+size 455142376

checkpoints/checkpoint-1264/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,8 @@

+{% for message in messages %}{% if message['role'] == 'user' %}{{'<|im_start|>user
+' + message['content'] + '<|im_end|>
+'}}{% elif message['role'] == 'assistant' %}{{'<|im_start|>assistant
+' + message['content'] + '<|im_end|>
+' }}{% else %}{{ '<|im_start|>system
+' + message['content'] + '<|im_end|>
+' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/checkpoint-1264/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ffea5b8698b067f13a4a31d3317b3e6b7977d195f3ecf092fa13115e9e12b11
+size 231623653

checkpoints/checkpoint-1264/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c800b778fa7e115e4c34de8529902de8b61c9a1b4bab3eb8295d06dafff030e
+size 14645

checkpoints/checkpoint-1264/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78206c712fe04332840566adca2e32409b8392aa97505d085270d0a692a991e2
+size 1465

checkpoints/checkpoint-1264/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d8f0326910136aca20831249220b38ce5299527647bc8c6b65404485c479740
+size 18451122

checkpoints/checkpoint-1264/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|im_end|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

checkpoints/checkpoint-1264/trainer_state.json ADDED Viewed

	@@ -0,0 +1,916 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 1264,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.015829046299960427,
+      "grad_norm": 3.5748817920684814,
+      "learning_rate": 3.5433070866141735e-06,
+      "loss": 1.7827543258666991,
+      "step": 10
+    },
+    {
+      "epoch": 0.031658092599920855,
+      "grad_norm": 1.181498408317566,
+      "learning_rate": 7.4803149606299226e-06,
+      "loss": 1.6411018371582031,
+      "step": 20
+    },
+    {
+      "epoch": 0.04748713889988128,
+      "grad_norm": 0.5824679732322693,
+      "learning_rate": 1.141732283464567e-05,
+      "loss": 1.4533835411071778,
+      "step": 30
+    },
+    {
+      "epoch": 0.06331618519984171,
+      "grad_norm": 0.5377753973007202,
+      "learning_rate": 1.535433070866142e-05,
+      "loss": 1.3427258491516114,
+      "step": 40
+    },
+    {
+      "epoch": 0.07914523149980214,
+      "grad_norm": 0.5031867027282715,
+      "learning_rate": 1.9291338582677166e-05,
+      "loss": 1.1043410301208496,
+      "step": 50
+    },
+    {
+      "epoch": 0.09497427779976256,
+      "grad_norm": 0.2011471688747406,
+      "learning_rate": 2.3228346456692916e-05,
+      "loss": 0.963739013671875,
+      "step": 60
+    },
+    {
+      "epoch": 0.11080332409972299,
+      "grad_norm": 0.1923132687807083,
+      "learning_rate": 2.7165354330708666e-05,
+      "loss": 0.9648835182189941,
+      "step": 70
+    },
+    {
+      "epoch": 0.12663237039968342,
+      "grad_norm": 0.19071003794670105,
+      "learning_rate": 3.110236220472441e-05,
+      "loss": 0.9172444343566895,
+      "step": 80
+    },
+    {
+      "epoch": 0.14246141669964385,
+      "grad_norm": 0.16827628016471863,
+      "learning_rate": 3.5039370078740156e-05,
+      "loss": 0.9281296730041504,
+      "step": 90
+    },
+    {
+      "epoch": 0.15829046299960428,
+      "grad_norm": 0.17542941868305206,
+      "learning_rate": 3.8976377952755905e-05,
+      "loss": 0.9268568992614746,
+      "step": 100
+    },
+    {
+      "epoch": 0.1741195092995647,
+      "grad_norm": 0.18605543673038483,
+      "learning_rate": 4.2913385826771655e-05,
+      "loss": 0.9156542778015136,
+      "step": 110
+    },
+    {
+      "epoch": 0.18994855559952512,
+      "grad_norm": 0.19004584848880768,
+      "learning_rate": 4.6850393700787405e-05,
+      "loss": 0.912873363494873,
+      "step": 120
+    },
+    {
+      "epoch": 0.20577760189948555,
+      "grad_norm": 0.1750367432832718,
+      "learning_rate": 4.999961827753896e-05,
+      "loss": 0.9295230865478515,
+      "step": 130
+    },
+    {
+      "epoch": 0.22160664819944598,
+      "grad_norm": 0.18699847161769867,
+      "learning_rate": 4.998625921534381e-05,
+      "loss": 0.9282792091369629,
+      "step": 140
+    },
+    {
+      "epoch": 0.2374356944994064,
+      "grad_norm": 0.18123294413089752,
+      "learning_rate": 4.9953825685459635e-05,
+      "loss": 0.9071338653564454,
+      "step": 150
+    },
+    {
+      "epoch": 0.25326474079936684,
+      "grad_norm": 0.19085095822811127,
+      "learning_rate": 4.990234244758785e-05,
+      "loss": 0.8944445610046386,
+      "step": 160
+    },
+    {
+      "epoch": 0.26909378709932724,
+      "grad_norm": 0.1798243671655655,
+      "learning_rate": 4.9831848803944474e-05,
+      "loss": 0.9216786384582519,
+      "step": 170
+    },
+    {
+      "epoch": 0.2849228333992877,
+      "grad_norm": 0.16492608189582825,
+      "learning_rate": 4.9742398569256896e-05,
+      "loss": 0.8954303741455079,
+      "step": 180
+    },
+    {
+      "epoch": 0.3007518796992481,
+      "grad_norm": 0.1820351481437683,
+      "learning_rate": 4.963406002968179e-05,
+      "loss": 0.8920731544494629,
+      "step": 190
+    },
+    {
+      "epoch": 0.31658092599920856,
+      "grad_norm": 0.1755346953868866,
+      "learning_rate": 4.950691589067557e-05,
+      "loss": 0.8888743400573731,
+      "step": 200
+    },
+    {
+      "epoch": 0.33240997229916897,
+      "grad_norm": 0.17681735754013062,
+      "learning_rate": 4.9361063213857194e-05,
+      "loss": 0.8969594955444335,
+      "step": 210
+    },
+    {
+      "epoch": 0.3482390185991294,
+      "grad_norm": 0.1814550906419754,
+      "learning_rate": 4.91966133429115e-05,
+      "loss": 0.8851950645446778,
+      "step": 220
+    },
+    {
+      "epoch": 0.3640680648990898,
+      "grad_norm": 0.17385666072368622,
+      "learning_rate": 4.901369181858964e-05,
+      "loss": 0.9051773071289062,
+      "step": 230
+    },
+    {
+      "epoch": 0.37989711119905023,
+      "grad_norm": 0.1662660837173462,
+      "learning_rate": 4.88124382828714e-05,
+      "loss": 0.8571414947509766,
+      "step": 240
+    },
+    {
+      "epoch": 0.3957261574990107,
+      "grad_norm": 0.18324530124664307,
+      "learning_rate": 4.859300637236289e-05,
+      "loss": 0.8993622779846191,
+      "step": 250
+    },
+    {
+      "epoch": 0.4115552037989711,
+      "grad_norm": 0.17792873084545135,
+      "learning_rate": 4.83555636010105e-05,
+      "loss": 0.888437557220459,
+      "step": 260
+    },
+    {
+      "epoch": 0.42738425009893155,
+      "grad_norm": 0.17820881307125092,
+      "learning_rate": 4.810029123222109e-05,
+      "loss": 0.873835277557373,
+      "step": 270
+    },
+    {
+      "epoch": 0.44321329639889195,
+      "grad_norm": 0.17558087408542633,
+      "learning_rate": 4.782738414048581e-05,
+      "loss": 0.8789983749389648,
+      "step": 280
+    },
+    {
+      "epoch": 0.4590423426988524,
+      "grad_norm": 0.18305832147598267,
+      "learning_rate": 4.753705066261326e-05,
+      "loss": 0.8827195167541504,
+      "step": 290
+    },
+    {
+      "epoch": 0.4748713889988128,
+      "grad_norm": 0.1801442950963974,
+      "learning_rate": 4.722951243868548e-05,
+      "loss": 0.8785966873168946,
+      "step": 300
+    },
+    {
+      "epoch": 0.4907004352987733,
+      "grad_norm": 0.16704759001731873,
+      "learning_rate": 4.690500424285833e-05,
+      "loss": 0.8711637496948242,
+      "step": 310
+    },
+    {
+      "epoch": 0.5065294815987337,
+      "grad_norm": 0.1603054255247116,
+      "learning_rate": 4.6563773804135305e-05,
+      "loss": 0.8773131370544434,
+      "step": 320
+    },
+    {
+      "epoch": 0.5223585278986941,
+      "grad_norm": 0.17996226251125336,
+      "learning_rate": 4.62060816172516e-05,
+      "loss": 0.8801346778869629,
+      "step": 330
+    },
+    {
+      "epoch": 0.5381875741986545,
+      "grad_norm": 0.16810840368270874,
+      "learning_rate": 4.583220074381288e-05,
+      "loss": 0.8802348136901855,
+      "step": 340
+    },
+    {
+      "epoch": 0.554016620498615,
+      "grad_norm": 0.18790695071220398,
+      "learning_rate": 4.544241660384057e-05,
+      "loss": 0.8991734504699707,
+      "step": 350
+    },
+    {
+      "epoch": 0.5698456667985754,
+      "grad_norm": 0.17079883813858032,
+      "learning_rate": 4.503702675788262e-05,
+      "loss": 0.8455135345458984,
+      "step": 360
+    },
+    {
+      "epoch": 0.5856747130985358,
+      "grad_norm": 0.1780555248260498,
+      "learning_rate": 4.4616340679856336e-05,
+      "loss": 0.8810781478881836,
+      "step": 370
+    },
+    {
+      "epoch": 0.6015037593984962,
+      "grad_norm": 0.17815925180912018,
+      "learning_rate": 4.418067952079651e-05,
+      "loss": 0.8761508941650391,
+      "step": 380
+    },
+    {
+      "epoch": 0.6173328056984567,
+      "grad_norm": 0.1707397699356079,
+      "learning_rate": 4.3730375863689256e-05,
+      "loss": 0.8592373847961425,
+      "step": 390
+    },
+    {
+      "epoch": 0.6331618519984171,
+      "grad_norm": 0.16248367726802826,
+      "learning_rate": 4.326577346957875e-05,
+      "loss": 0.8755002021789551,
+      "step": 400
+    },
+    {
+      "epoch": 0.6489908982983775,
+      "grad_norm": 0.165540874004364,
+      "learning_rate": 4.278722701514061e-05,
+      "loss": 0.867929744720459,
+      "step": 410
+    },
+    {
+      "epoch": 0.6648199445983379,
+      "grad_norm": 0.16495656967163086,
+      "learning_rate": 4.229510182192235e-05,
+      "loss": 0.8697072982788085,
+      "step": 420
+    },
+    {
+      "epoch": 0.6806489908982983,
+      "grad_norm": 0.16391977667808533,
+      "learning_rate": 4.178977357745749e-05,
+      "loss": 0.8804462432861329,
+      "step": 430
+    },
+    {
+      "epoch": 0.6964780371982588,
+      "grad_norm": 0.16260066628456116,
+      "learning_rate": 4.12716280484664e-05,
+      "loss": 0.8600423812866211,
+      "step": 440
+    },
+    {
+      "epoch": 0.7123070834982193,
+      "grad_norm": 0.16413277387619019,
+      "learning_rate": 4.0741060786362585e-05,
+      "loss": 0.8589424133300781,
+      "step": 450
+    },
+    {
+      "epoch": 0.7281361297981797,
+      "grad_norm": 0.16915135085582733,
+      "learning_rate": 4.0198476825289434e-05,
+      "loss": 0.8609626770019532,
+      "step": 460
+    },
+    {
+      "epoch": 0.7439651760981401,
+      "grad_norm": 0.17290306091308594,
+      "learning_rate": 3.9644290372917844e-05,
+      "loss": 0.8830280303955078,
+      "step": 470
+    },
+    {
+      "epoch": 0.7597942223981005,
+      "grad_norm": 0.16407273709774017,
+      "learning_rate": 3.907892449424081e-05,
+      "loss": 0.8745547294616699,
+      "step": 480
+    },
+    {
+      "epoch": 0.775623268698061,
+      "grad_norm": 0.16706892848014832,
+      "learning_rate": 3.850281078860627e-05,
+      "loss": 0.8945063591003418,
+      "step": 490
+    },
+    {
+      "epoch": 0.7914523149980214,
+      "grad_norm": 0.17343616485595703,
+      "learning_rate": 3.7916389060234964e-05,
+      "loss": 0.8862369537353516,
+      "step": 500
+    },
+    {
+      "epoch": 0.8072813612979818,
+      "grad_norm": 0.17225787043571472,
+      "learning_rate": 3.7320106982474625e-05,
+      "loss": 0.8728190422058105,
+      "step": 510
+    },
+    {
+      "epoch": 0.8231104075979422,
+      "grad_norm": 0.1631278693675995,
+      "learning_rate": 3.6714419756046885e-05,
+      "loss": 0.8206952095031739,
+      "step": 520
+    },
+    {
+      "epoch": 0.8389394538979027,
+      "grad_norm": 0.1752168983221054,
+      "learning_rate": 3.6099789761547834e-05,
+      "loss": 0.8330535888671875,
+      "step": 530
+    },
+    {
+      "epoch": 0.8547685001978631,
+      "grad_norm": 0.1791313886642456,
+      "learning_rate": 3.5476686206467466e-05,
+      "loss": 0.8688525199890137,
+      "step": 540
+    },
+    {
+      "epoch": 0.8705975464978235,
+      "grad_norm": 0.16879358887672424,
+      "learning_rate": 3.484558476699748e-05,
+      "loss": 0.8748814582824707,
+      "step": 550
+    },
+    {
+      "epoch": 0.8864265927977839,
+      "grad_norm": 0.1700020432472229,
+      "learning_rate": 3.4206967224900884e-05,
+      "loss": 0.8380928993225097,
+      "step": 560
+    },
+    {
+      "epoch": 0.9022556390977443,
+      "grad_norm": 0.16251260042190552,
+      "learning_rate": 3.35613210997206e-05,
+      "loss": 0.8361630439758301,
+      "step": 570
+    },
+    {
+      "epoch": 0.9180846853977048,
+      "grad_norm": 0.1657862812280655,
+      "learning_rate": 3.290913927660793e-05,
+      "loss": 0.8503948211669922,
+      "step": 580
+    },
+    {
+      "epoch": 0.9339137316976652,
+      "grad_norm": 0.17638036608695984,
+      "learning_rate": 3.22509196300548e-05,
+      "loss": 0.8723633766174317,
+      "step": 590
+    },
+    {
+      "epoch": 0.9497427779976256,
+      "grad_norm": 0.17021600902080536,
+      "learning_rate": 3.158716464381728e-05,
+      "loss": 0.8567726135253906,
+      "step": 600
+    },
+    {
+      "epoch": 0.965571824297586,
+      "grad_norm": 0.17130261659622192,
+      "learning_rate": 3.091838102732031e-05,
+      "loss": 0.8626362800598144,
+      "step": 610
+    },
+    {
+      "epoch": 0.9814008705975465,
+      "grad_norm": 0.16850486397743225,
+      "learning_rate": 3.024507932883659e-05,
+      "loss": 0.8559526443481446,
+      "step": 620
+    },
+    {
+      "epoch": 0.997229916897507,
+      "grad_norm": 0.1741161048412323,
+      "learning_rate": 2.9567773545734916e-05,
+      "loss": 0.8548683166503906,
+      "step": 630
+    },
+    {
+      "epoch": 1.0126632370399684,
+      "grad_norm": 0.1648906171321869,
+      "learning_rate": 2.8886980732095464e-05,
+      "loss": 0.8245421409606933,
+      "step": 640
+    },
+    {
+      "epoch": 1.0284922833399288,
+      "grad_norm": 0.1882442682981491,
+      "learning_rate": 2.820322060399156e-05,
+      "loss": 0.8283076286315918,
+      "step": 650
+    },
+    {
+      "epoch": 1.0443213296398892,
+      "grad_norm": 0.16794943809509277,
+      "learning_rate": 2.7517015142739335e-05,
+      "loss": 0.8383002281188965,
+      "step": 660
+    },
+    {
+      "epoch": 1.0601503759398496,
+      "grad_norm": 0.17361986637115479,
+      "learning_rate": 2.6828888196418088e-05,
+      "loss": 0.8475232124328613,
+      "step": 670
+    },
+    {
+      "epoch": 1.07597942223981,
+      "grad_norm": 0.16774870455265045,
+      "learning_rate": 2.6139365079965538e-05,
+      "loss": 0.8095685005187988,
+      "step": 680
+    },
+    {
+      "epoch": 1.0918084685397704,
+      "grad_norm": 0.1640879362821579,
+      "learning_rate": 2.5448972174153318e-05,
+      "loss": 0.8009868621826172,
+      "step": 690
+    },
+    {
+      "epoch": 1.1076375148397308,
+      "grad_norm": 0.18289130926132202,
+      "learning_rate": 2.4758236523748733e-05,
+      "loss": 0.8200379371643066,
+      "step": 700
+    },
+    {
+      "epoch": 1.1234665611396912,
+      "grad_norm": 0.1810799241065979,
+      "learning_rate": 2.4067685435169772e-05,
+      "loss": 0.8436824798583984,
+      "step": 710
+    },
+    {
+      "epoch": 1.1392956074396516,
+      "grad_norm": 0.18245543539524078,
+      "learning_rate": 2.3377846073940207e-05,
+      "loss": 0.8141367912292481,
+      "step": 720
+    },
+    {
+      "epoch": 1.1551246537396123,
+      "grad_norm": 0.18102015554904938,
+      "learning_rate": 2.2689245062252398e-05,
+      "loss": 0.8481219291687012,
+      "step": 730
+    },
+    {
+      "epoch": 1.1709537000395727,
+      "grad_norm": 0.19379396736621857,
+      "learning_rate": 2.200240807694474e-05,
+      "loss": 0.8380316734313965,
+      "step": 740
+    },
+    {
+      "epoch": 1.186782746339533,
+      "grad_norm": 0.17735467851161957,
+      "learning_rate": 2.131785944820092e-05,
+      "loss": 0.8194567680358886,
+      "step": 750
+    },
+    {
+      "epoch": 1.2026117926394935,
+      "grad_norm": 0.18803226947784424,
+      "learning_rate": 2.0636121759277137e-05,
+      "loss": 0.8280925750732422,
+      "step": 760
+    },
+    {
+      "epoch": 1.2184408389394539,
+      "grad_norm": 0.19116626679897308,
+      "learning_rate": 1.995771544756287e-05,
+      "loss": 0.8177350044250489,
+      "step": 770
+    },
+    {
+      "epoch": 1.2342698852394143,
+      "grad_norm": 0.17773468792438507,
+      "learning_rate": 1.9283158407279977e-05,
+      "loss": 0.8399795532226563,
+      "step": 780
+    },
+    {
+      "epoch": 1.2500989315393747,
+      "grad_norm": 0.18355493247509003,
+      "learning_rate": 1.861296559412303e-05,
+      "loss": 0.8454893112182618,
+      "step": 790
+    },
+    {
+      "epoch": 1.2659279778393353,
+      "grad_norm": 0.18507550656795502,
+      "learning_rate": 1.7947648632143076e-05,
+      "loss": 0.8371585845947266,
+      "step": 800
+    },
+    {
+      "epoch": 1.2817570241392957,
+      "grad_norm": 0.18332813680171967,
+      "learning_rate": 1.728771542317466e-05,
+      "loss": 0.8566581726074218,
+      "step": 810
+    },
+    {
+      "epoch": 1.2975860704392561,
+      "grad_norm": 0.18364256620407104,
+      "learning_rate": 1.6633669759104488e-05,
+      "loss": 0.8170791625976562,
+      "step": 820
+    },
+    {
+      "epoch": 1.3134151167392165,
+      "grad_norm": 0.1980493664741516,
+      "learning_rate": 1.598601093727749e-05,
+      "loss": 0.852113151550293,
+      "step": 830
+    },
+    {
+      "epoch": 1.329244163039177,
+      "grad_norm": 0.20105807483196259,
+      "learning_rate": 1.5345233379334155e-05,
+      "loss": 0.8228271484375,
+      "step": 840
+    },
+    {
+      "epoch": 1.3450732093391373,
+      "grad_norm": 0.18281184136867523,
+      "learning_rate": 1.4711826253769827e-05,
+      "loss": 0.8469521522521972,
+      "step": 850
+    },
+    {
+      "epoch": 1.3609022556390977,
+      "grad_norm": 0.19825202226638794,
+      "learning_rate": 1.4086273102504341e-05,
+      "loss": 0.8229537010192871,
+      "step": 860
+    },
+    {
+      "epoch": 1.3767313019390581,
+      "grad_norm": 0.1836749017238617,
+      "learning_rate": 1.346905147174694e-05,
+      "loss": 0.8486099243164062,
+      "step": 870
+    },
+    {
+      "epoch": 1.3925603482390185,
+      "grad_norm": 0.18626976013183594,
+      "learning_rate": 1.2860632547438333e-05,
+      "loss": 0.8549923896789551,
+      "step": 880
+    },
+    {
+      "epoch": 1.408389394538979,
+      "grad_norm": 0.1800047755241394,
+      "learning_rate": 1.2261480795548123e-05,
+      "loss": 0.8070058822631836,
+      "step": 890
+    },
+    {
+      "epoch": 1.4242184408389393,
+      "grad_norm": 0.1864195019006729,
+      "learning_rate": 1.167205360750227e-05,
+      "loss": 0.831914234161377,
+      "step": 900
+    },
+    {
+      "epoch": 1.4400474871389,
+      "grad_norm": 0.18130667507648468,
+      "learning_rate": 1.1092800951011285e-05,
+      "loss": 0.826213264465332,
+      "step": 910
+    },
+    {
+      "epoch": 1.4558765334388604,
+      "grad_norm": 0.19690996408462524,
+      "learning_rate": 1.0524165026565654e-05,
+      "loss": 0.8235694885253906,
+      "step": 920
+    },
+    {
+      "epoch": 1.4717055797388208,
+      "grad_norm": 0.1875549703836441,
+      "learning_rate": 9.966579929860704e-06,
+      "loss": 0.8336921691894531,
+      "step": 930
+    },
+    {
+      "epoch": 1.4875346260387812,
+      "grad_norm": 0.18889833986759186,
+      "learning_rate": 9.420471320408669e-06,
+      "loss": 0.7959968090057373,
+      "step": 940
+    },
+    {
+      "epoch": 1.5033636723387416,
+      "grad_norm": 0.18145473301410675,
+      "learning_rate": 8.886256096591048e-06,
+      "loss": 0.8122786521911621,
+      "step": 950
+    },
+    {
+      "epoch": 1.519192718638702,
+      "grad_norm": 0.18262115120887756,
+      "learning_rate": 8.364342077398971e-06,
+      "loss": 0.8186595916748047,
+      "step": 960
+    },
+    {
+      "epoch": 1.5350217649386626,
+      "grad_norm": 0.19150428473949432,
+      "learning_rate": 7.855127691104943e-06,
+      "loss": 0.8238736152648926,
+      "step": 970
+    },
+    {
+      "epoch": 1.550850811238623,
+      "grad_norm": 0.1934870034456253,
+      "learning_rate": 7.359001671103361e-06,
+      "loss": 0.8132981300354004,
+      "step": 980
+    },
+    {
+      "epoch": 1.5666798575385834,
+      "grad_norm": 0.1910739690065384,
+      "learning_rate": 6.8763427591521215e-06,
+      "loss": 0.8252732276916503,
+      "step": 990
+    },
+    {
+      "epoch": 1.5825089038385438,
+      "grad_norm": 0.19539424777030945,
+      "learning_rate": 6.407519416241778e-06,
+      "loss": 0.8613625526428222,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5983379501385042,
+      "grad_norm": 0.19278913736343384,
+      "learning_rate": 5.95288954131307e-06,
+      "loss": 0.8396816253662109,
+      "step": 1010
+    },
+    {
+      "epoch": 1.6141669964384646,
+      "grad_norm": 0.19722530245780945,
+      "learning_rate": 5.512800198037477e-06,
+      "loss": 0.849429988861084,
+      "step": 1020
+    },
+    {
+      "epoch": 1.629996042738425,
+      "grad_norm": 0.19314146041870117,
+      "learning_rate": 5.087587349869396e-06,
+      "loss": 0.853305721282959,
+      "step": 1030
+    },
+    {
+      "epoch": 1.6458250890383854,
+      "grad_norm": 0.18120236694812775,
+      "learning_rate": 4.677575603572235e-06,
+      "loss": 0.8267157554626465,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6616541353383458,
+      "grad_norm": 0.1896786242723465,
+      "learning_rate": 4.283077961414125e-06,
+      "loss": 0.811297607421875,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6774831816383062,
+      "grad_norm": 0.19132214784622192,
+      "learning_rate": 3.904395582222578e-06,
+      "loss": 0.8262212753295899,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6933122279382666,
+      "grad_norm": 0.18690507113933563,
+      "learning_rate": 3.541817551480292e-06,
+      "loss": 0.8474508285522461,
+      "step": 1070
+    },
+    {
+      "epoch": 1.709141274238227,
+      "grad_norm": 0.1666112095117569,
+      "learning_rate": 3.1956206606378187e-06,
+      "loss": 0.8323755264282227,
+      "step": 1080
+    },
+    {
+      "epoch": 1.7249703205381874,
+      "grad_norm": 0.1838321089744568,
+      "learning_rate": 2.8660691958114382e-06,
+      "loss": 0.8172253608703614,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7407993668381478,
+      "grad_norm": 0.1931111067533493,
+      "learning_rate": 2.553414736027601e-06,
+      "loss": 0.8497982978820801,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7566284131381085,
+      "grad_norm": 0.19277822971343994,
+      "learning_rate": 2.257895961167886e-06,
+      "loss": 0.8303280830383301,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7724574594380689,
+      "grad_norm": 0.1878863126039505,
+      "learning_rate": 1.9797384697612276e-06,
+      "loss": 0.8127182006835938,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7882865057380293,
+      "grad_norm": 0.20031479001045227,
+      "learning_rate": 1.7191546067623772e-06,
+      "loss": 0.8480099678039551,
+      "step": 1130
+    },
+    {
+      "epoch": 1.8041155520379897,
+      "grad_norm": 0.18454383313655853,
+      "learning_rate": 1.4763433014481103e-06,
+      "loss": 0.837119197845459,
+      "step": 1140
+    },
+    {
+      "epoch": 1.8199445983379503,
+      "grad_norm": 0.2058745175600052,
+      "learning_rate": 1.2514899155549626e-06,
+      "loss": 0.8471168518066406,
+      "step": 1150
+    },
+    {
+      "epoch": 1.8357736446379107,
+      "grad_norm": 0.19359512627124786,
+      "learning_rate": 1.044766101774397e-06,
+      "loss": 0.8292506217956543,
+      "step": 1160
+    },
+    {
+      "epoch": 1.851602690937871,
+      "grad_norm": 0.2011612206697464,
+      "learning_rate": 8.563296727134435e-07,
+      "loss": 0.8529328346252442,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8674317372378315,
+      "grad_norm": 0.1884250044822693,
+      "learning_rate": 6.863244804208052e-07,
+      "loss": 0.8327092170715332,
+      "step": 1180
+    },
+    {
+      "epoch": 1.883260783537792,
+      "grad_norm": 0.19094131886959076,
+      "learning_rate": 5.348803065704483e-07,
+      "loss": 0.843638801574707,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8990898298377523,
+      "grad_norm": 0.20150353014469147,
+      "learning_rate": 4.021127633865196e-07,
+      "loss": 0.8138184547424316,
+      "step": 1200
+    },
+    {
+      "epoch": 1.9149188761377127,
+      "grad_norm": 0.18785829842090607,
+      "learning_rate": 2.881232053851435e-07,
+      "loss": 0.8096948623657226,
+      "step": 1210
+    },
+    {
+      "epoch": 1.9307479224376731,
+      "grad_norm": 0.18520639836788177,
+      "learning_rate": 1.9299865200057553e-07,
+      "loss": 0.8441803932189942,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9465769687376335,
+      "grad_norm": 0.17911553382873535,
+      "learning_rate": 1.1681172115469986e-07,
+      "loss": 0.8313416481018067,
+      "step": 1230
+    },
+    {
+      "epoch": 1.962406015037594,
+      "grad_norm": 0.19538141787052155,
+      "learning_rate": 5.96205738206429e-08,
+      "loss": 0.8286916732788085,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9782350613375543,
+      "grad_norm": 0.1857946664094925,
+      "learning_rate": 2.1468869622781606e-08,
+      "loss": 0.8293286323547363,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9940641076375147,
+      "grad_norm": 0.1903241127729416,
+      "learning_rate": 2.3857335070626154e-09,
+      "loss": 0.8224732398986816,
+      "step": 1260
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1264,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4272423190583951e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/checkpoint-1264/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c014ebd63dab9deb812f3c92192ce130717d3332c747af68efe98595e1e3890
+size 5713