Upload 11 files

Browse files

Files changed (11) hide show

README.md +202 -0
adapter_config.json +33 -0
adapter_model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer_config.json +70 -0
trainer_state.json +2133 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: vilm/vinallama-2.7b-chat
+library_name: peft
+---
+# Model Card for a LoRA adapter of vilm/vinallama-2.7b-chat
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** vilm/vinallama-2.7b-chat
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "vilm/vinallama-2.7b-chat",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5453cd380c9fac1bb563ae23f988362848bc222326feb24d9de3975e258d07a
+size 323013288

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b033661b80368b18ccc9906a9c1f00543ee04294a80f931fe2d5d020c19768ed
+size 646160890

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c8dbef056e0c29c6f7b5c80c56e6188effeae40a6463363cdfb8af9ea35a8d2
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac7de8907e79c9f47fccf3aa89da8ca3d3d851fdb56e005195640f80ea9c8917
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46303": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46304": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46305": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": false,
+  "max_length": 256,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|im_end|>",
+  "sp_model_kwargs": {},
+  "stride": 0,
+  "tokenizer_class": "LlamaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true,
+  "use_fast": true
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2133 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 7.733952049497293,
+  "eval_steps": 500,
+  "global_step": 30000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.025779840164990978,
+      "grad_norm": 0.5347822308540344,
+      "learning_rate": 0.0002,
+      "loss": 1.4509,
+      "step": 100
+    },
+    {
+      "epoch": 0.051559680329981955,
+      "grad_norm": 0.4712078273296356,
+      "learning_rate": 0.0002,
+      "loss": 1.1744,
+      "step": 200
+    },
+    {
+      "epoch": 0.07733952049497293,
+      "grad_norm": 0.5031601786613464,
+      "learning_rate": 0.0002,
+      "loss": 1.096,
+      "step": 300
+    },
+    {
+      "epoch": 0.10311936065996391,
+      "grad_norm": 0.49241065979003906,
+      "learning_rate": 0.0002,
+      "loss": 0.9847,
+      "step": 400
+    },
+    {
+      "epoch": 0.12889920082495487,
+      "grad_norm": 0.9957050681114197,
+      "learning_rate": 0.0002,
+      "loss": 0.9928,
+      "step": 500
+    },
+    {
+      "epoch": 0.15467904098994587,
+      "grad_norm": 0.38163048028945923,
+      "learning_rate": 0.0002,
+      "loss": 0.9008,
+      "step": 600
+    },
+    {
+      "epoch": 0.18045888115493683,
+      "grad_norm": 0.4322434663772583,
+      "learning_rate": 0.0002,
+      "loss": 0.9108,
+      "step": 700
+    },
+    {
+      "epoch": 0.20623872131992782,
+      "grad_norm": 0.4072737395763397,
+      "learning_rate": 0.0002,
+      "loss": 0.8713,
+      "step": 800
+    },
+    {
+      "epoch": 0.23201856148491878,
+      "grad_norm": 0.5637839436531067,
+      "learning_rate": 0.0002,
+      "loss": 0.8538,
+      "step": 900
+    },
+    {
+      "epoch": 0.25779840164990975,
+      "grad_norm": 0.6094131469726562,
+      "learning_rate": 0.0002,
+      "loss": 0.8154,
+      "step": 1000
+    },
+    {
+      "epoch": 0.28357824181490077,
+      "grad_norm": 0.4212701618671417,
+      "learning_rate": 0.0002,
+      "loss": 0.7897,
+      "step": 1100
+    },
+    {
+      "epoch": 0.30935808197989173,
+      "grad_norm": 0.4663824737071991,
+      "learning_rate": 0.0002,
+      "loss": 0.8021,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3351379221448827,
+      "grad_norm": 0.3774861693382263,
+      "learning_rate": 0.0002,
+      "loss": 0.7452,
+      "step": 1300
+    },
+    {
+      "epoch": 0.36091776230987366,
+      "grad_norm": 0.19446992874145508,
+      "learning_rate": 0.0002,
+      "loss": 0.737,
+      "step": 1400
+    },
+    {
+      "epoch": 0.3866976024748647,
+      "grad_norm": 0.25984033942222595,
+      "learning_rate": 0.0002,
+      "loss": 0.6966,
+      "step": 1500
+    },
+    {
+      "epoch": 0.41247744263985564,
+      "grad_norm": 0.3495163023471832,
+      "learning_rate": 0.0002,
+      "loss": 0.7179,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4382572828048466,
+      "grad_norm": 0.5092929601669312,
+      "learning_rate": 0.0002,
+      "loss": 0.7132,
+      "step": 1700
+    },
+    {
+      "epoch": 0.46403712296983757,
+      "grad_norm": 0.16095790266990662,
+      "learning_rate": 0.0002,
+      "loss": 0.6652,
+      "step": 1800
+    },
+    {
+      "epoch": 0.4898169631348286,
+      "grad_norm": 0.38502034544944763,
+      "learning_rate": 0.0002,
+      "loss": 0.6564,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5155968032998195,
+      "grad_norm": 0.3100506067276001,
+      "learning_rate": 0.0002,
+      "loss": 0.6082,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5413766434648105,
+      "grad_norm": 0.4585016965866089,
+      "learning_rate": 0.0002,
+      "loss": 0.6491,
+      "step": 2100
+    },
+    {
+      "epoch": 0.5671564836298015,
+      "grad_norm": 0.35394927859306335,
+      "learning_rate": 0.0002,
+      "loss": 0.6136,
+      "step": 2200
+    },
+    {
+      "epoch": 0.5929363237947924,
+      "grad_norm": 0.4828909933567047,
+      "learning_rate": 0.0002,
+      "loss": 0.5639,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6187161639597835,
+      "grad_norm": 0.7377568483352661,
+      "learning_rate": 0.0002,
+      "loss": 0.5998,
+      "step": 2400
+    },
+    {
+      "epoch": 0.6444960041247745,
+      "grad_norm": 0.33992356061935425,
+      "learning_rate": 0.0002,
+      "loss": 0.5535,
+      "step": 2500
+    },
+    {
+      "epoch": 0.6702758442897654,
+      "grad_norm": 0.40880173444747925,
+      "learning_rate": 0.0002,
+      "loss": 0.5839,
+      "step": 2600
+    },
+    {
+      "epoch": 0.6960556844547564,
+      "grad_norm": 0.6135886907577515,
+      "learning_rate": 0.0002,
+      "loss": 0.5697,
+      "step": 2700
+    },
+    {
+      "epoch": 0.7218355246197473,
+      "grad_norm": 0.14242181181907654,
+      "learning_rate": 0.0002,
+      "loss": 0.562,
+      "step": 2800
+    },
+    {
+      "epoch": 0.7476153647847383,
+      "grad_norm": 0.1636349856853485,
+      "learning_rate": 0.0002,
+      "loss": 0.5301,
+      "step": 2900
+    },
+    {
+      "epoch": 0.7733952049497294,
+      "grad_norm": 0.5300703644752502,
+      "learning_rate": 0.0002,
+      "loss": 0.5428,
+      "step": 3000
+    },
+    {
+      "epoch": 0.7991750451147203,
+      "grad_norm": 0.2816906273365021,
+      "learning_rate": 0.0002,
+      "loss": 0.5319,
+      "step": 3100
+    },
+    {
+      "epoch": 0.8249548852797113,
+      "grad_norm": 0.4165875315666199,
+      "learning_rate": 0.0002,
+      "loss": 0.5073,
+      "step": 3200
+    },
+    {
+      "epoch": 0.8507347254447022,
+      "grad_norm": 0.46957316994667053,
+      "learning_rate": 0.0002,
+      "loss": 0.4973,
+      "step": 3300
+    },
+    {
+      "epoch": 0.8765145656096932,
+      "grad_norm": 0.22382797300815582,
+      "learning_rate": 0.0002,
+      "loss": 0.5091,
+      "step": 3400
+    },
+    {
+      "epoch": 0.9022944057746842,
+      "grad_norm": 0.517814576625824,
+      "learning_rate": 0.0002,
+      "loss": 0.4879,
+      "step": 3500
+    },
+    {
+      "epoch": 0.9280742459396751,
+      "grad_norm": 0.44171011447906494,
+      "learning_rate": 0.0002,
+      "loss": 0.4711,
+      "step": 3600
+    },
+    {
+      "epoch": 0.9538540861046662,
+      "grad_norm": 0.3107047379016876,
+      "learning_rate": 0.0002,
+      "loss": 0.465,
+      "step": 3700
+    },
+    {
+      "epoch": 0.9796339262696572,
+      "grad_norm": 0.09984863549470901,
+      "learning_rate": 0.0002,
+      "loss": 0.4485,
+      "step": 3800
+    },
+    {
+      "epoch": 1.005413766434648,
+      "grad_norm": 0.43100592494010925,
+      "learning_rate": 0.0002,
+      "loss": 0.4752,
+      "step": 3900
+    },
+    {
+      "epoch": 1.031193606599639,
+      "grad_norm": 0.5259262919425964,
+      "learning_rate": 0.0002,
+      "loss": 0.3621,
+      "step": 4000
+    },
+    {
+      "epoch": 1.0569734467646301,
+      "grad_norm": 0.47033509612083435,
+      "learning_rate": 0.0002,
+      "loss": 0.3569,
+      "step": 4100
+    },
+    {
+      "epoch": 1.082753286929621,
+      "grad_norm": 0.5318751931190491,
+      "learning_rate": 0.0002,
+      "loss": 0.3512,
+      "step": 4200
+    },
+    {
+      "epoch": 1.108533127094612,
+      "grad_norm": 0.5434057116508484,
+      "learning_rate": 0.0002,
+      "loss": 0.3504,
+      "step": 4300
+    },
+    {
+      "epoch": 1.134312967259603,
+      "grad_norm": 0.47843560576438904,
+      "learning_rate": 0.0002,
+      "loss": 0.3712,
+      "step": 4400
+    },
+    {
+      "epoch": 1.160092807424594,
+      "grad_norm": 0.5956776142120361,
+      "learning_rate": 0.0002,
+      "loss": 0.3511,
+      "step": 4500
+    },
+    {
+      "epoch": 1.1858726475895849,
+      "grad_norm": 0.5072950720787048,
+      "learning_rate": 0.0002,
+      "loss": 0.3445,
+      "step": 4600
+    },
+    {
+      "epoch": 1.211652487754576,
+      "grad_norm": 0.5608052611351013,
+      "learning_rate": 0.0002,
+      "loss": 0.3377,
+      "step": 4700
+    },
+    {
+      "epoch": 1.237432327919567,
+      "grad_norm": 0.474223256111145,
+      "learning_rate": 0.0002,
+      "loss": 0.3276,
+      "step": 4800
+    },
+    {
+      "epoch": 1.2632121680845578,
+      "grad_norm": 0.5215118527412415,
+      "learning_rate": 0.0002,
+      "loss": 0.3375,
+      "step": 4900
+    },
+    {
+      "epoch": 1.288992008249549,
+      "grad_norm": 0.3922516405582428,
+      "learning_rate": 0.0002,
+      "loss": 0.342,
+      "step": 5000
+    },
+    {
+      "epoch": 1.3147718484145399,
+      "grad_norm": 0.4958643615245819,
+      "learning_rate": 0.0002,
+      "loss": 0.3553,
+      "step": 5100
+    },
+    {
+      "epoch": 1.3405516885795308,
+      "grad_norm": 0.564983069896698,
+      "learning_rate": 0.0002,
+      "loss": 0.3389,
+      "step": 5200
+    },
+    {
+      "epoch": 1.3663315287445217,
+      "grad_norm": 0.5662856698036194,
+      "learning_rate": 0.0002,
+      "loss": 0.3382,
+      "step": 5300
+    },
+    {
+      "epoch": 1.3921113689095128,
+      "grad_norm": 0.5040738582611084,
+      "learning_rate": 0.0002,
+      "loss": 0.3408,
+      "step": 5400
+    },
+    {
+      "epoch": 1.4178912090745037,
+      "grad_norm": 0.27346768975257874,
+      "learning_rate": 0.0002,
+      "loss": 0.3266,
+      "step": 5500
+    },
+    {
+      "epoch": 1.4436710492394949,
+      "grad_norm": 0.5055024027824402,
+      "learning_rate": 0.0002,
+      "loss": 0.3561,
+      "step": 5600
+    },
+    {
+      "epoch": 1.4694508894044858,
+      "grad_norm": 0.5442714691162109,
+      "learning_rate": 0.0002,
+      "loss": 0.3241,
+      "step": 5700
+    },
+    {
+      "epoch": 1.4952307295694767,
+      "grad_norm": 0.4862806499004364,
+      "learning_rate": 0.0002,
+      "loss": 0.344,
+      "step": 5800
+    },
+    {
+      "epoch": 1.5210105697344676,
+      "grad_norm": 0.6346714496612549,
+      "learning_rate": 0.0002,
+      "loss": 0.3195,
+      "step": 5900
+    },
+    {
+      "epoch": 1.5467904098994585,
+      "grad_norm": 0.5846338272094727,
+      "learning_rate": 0.0002,
+      "loss": 0.3232,
+      "step": 6000
+    },
+    {
+      "epoch": 1.5725702500644496,
+      "grad_norm": 0.41255345940589905,
+      "learning_rate": 0.0002,
+      "loss": 0.3379,
+      "step": 6100
+    },
+    {
+      "epoch": 1.5983500902294405,
+      "grad_norm": 0.6396617293357849,
+      "learning_rate": 0.0002,
+      "loss": 0.3099,
+      "step": 6200
+    },
+    {
+      "epoch": 1.6241299303944317,
+      "grad_norm": 0.3450670540332794,
+      "learning_rate": 0.0002,
+      "loss": 0.3129,
+      "step": 6300
+    },
+    {
+      "epoch": 1.6499097705594226,
+      "grad_norm": 0.30461055040359497,
+      "learning_rate": 0.0002,
+      "loss": 0.2978,
+      "step": 6400
+    },
+    {
+      "epoch": 1.6756896107244135,
+      "grad_norm": 0.4209739863872528,
+      "learning_rate": 0.0002,
+      "loss": 0.3323,
+      "step": 6500
+    },
+    {
+      "epoch": 1.7014694508894044,
+      "grad_norm": 0.3296062648296356,
+      "learning_rate": 0.0002,
+      "loss": 0.3047,
+      "step": 6600
+    },
+    {
+      "epoch": 1.7272492910543955,
+      "grad_norm": 0.9009484648704529,
+      "learning_rate": 0.0002,
+      "loss": 0.3046,
+      "step": 6700
+    },
+    {
+      "epoch": 1.7530291312193864,
+      "grad_norm": 0.7505986094474792,
+      "learning_rate": 0.0002,
+      "loss": 0.3123,
+      "step": 6800
+    },
+    {
+      "epoch": 1.7788089713843775,
+      "grad_norm": 0.3542492389678955,
+      "learning_rate": 0.0002,
+      "loss": 0.3259,
+      "step": 6900
+    },
+    {
+      "epoch": 1.8045888115493685,
+      "grad_norm": 0.4935378432273865,
+      "learning_rate": 0.0002,
+      "loss": 0.3262,
+      "step": 7000
+    },
+    {
+      "epoch": 1.8303686517143594,
+      "grad_norm": 0.3000539541244507,
+      "learning_rate": 0.0002,
+      "loss": 0.2887,
+      "step": 7100
+    },
+    {
+      "epoch": 1.8561484918793503,
+      "grad_norm": 0.2680779695510864,
+      "learning_rate": 0.0002,
+      "loss": 0.3108,
+      "step": 7200
+    },
+    {
+      "epoch": 1.8819283320443412,
+      "grad_norm": 0.5922934412956238,
+      "learning_rate": 0.0002,
+      "loss": 0.3211,
+      "step": 7300
+    },
+    {
+      "epoch": 1.9077081722093323,
+      "grad_norm": 0.38349688053131104,
+      "learning_rate": 0.0002,
+      "loss": 0.316,
+      "step": 7400
+    },
+    {
+      "epoch": 1.9334880123743234,
+      "grad_norm": 0.7654793858528137,
+      "learning_rate": 0.0002,
+      "loss": 0.3111,
+      "step": 7500
+    },
+    {
+      "epoch": 1.9592678525393143,
+      "grad_norm": 0.2399352639913559,
+      "learning_rate": 0.0002,
+      "loss": 0.3042,
+      "step": 7600
+    },
+    {
+      "epoch": 1.9850476927043053,
+      "grad_norm": 0.42787912487983704,
+      "learning_rate": 0.0002,
+      "loss": 0.2928,
+      "step": 7700
+    },
+    {
+      "epoch": 2.010827532869296,
+      "grad_norm": 0.4771544933319092,
+      "learning_rate": 0.0002,
+      "loss": 0.2487,
+      "step": 7800
+    },
+    {
+      "epoch": 2.036607373034287,
+      "grad_norm": 0.6133277416229248,
+      "learning_rate": 0.0002,
+      "loss": 0.2219,
+      "step": 7900
+    },
+    {
+      "epoch": 2.062387213199278,
+      "grad_norm": 0.43137651681900024,
+      "learning_rate": 0.0002,
+      "loss": 0.2158,
+      "step": 8000
+    },
+    {
+      "epoch": 2.0881670533642693,
+      "grad_norm": 0.41038885712623596,
+      "learning_rate": 0.0002,
+      "loss": 0.2127,
+      "step": 8100
+    },
+    {
+      "epoch": 2.1139468935292602,
+      "grad_norm": 0.351235568523407,
+      "learning_rate": 0.0002,
+      "loss": 0.2185,
+      "step": 8200
+    },
+    {
+      "epoch": 2.139726733694251,
+      "grad_norm": 0.41089433431625366,
+      "learning_rate": 0.0002,
+      "loss": 0.2346,
+      "step": 8300
+    },
+    {
+      "epoch": 2.165506573859242,
+      "grad_norm": 0.3464137613773346,
+      "learning_rate": 0.0002,
+      "loss": 0.2273,
+      "step": 8400
+    },
+    {
+      "epoch": 2.191286414024233,
+      "grad_norm": 0.2753762900829315,
+      "learning_rate": 0.0002,
+      "loss": 0.2359,
+      "step": 8500
+    },
+    {
+      "epoch": 2.217066254189224,
+      "grad_norm": 0.3630015552043915,
+      "learning_rate": 0.0002,
+      "loss": 0.2351,
+      "step": 8600
+    },
+    {
+      "epoch": 2.2428460943542152,
+      "grad_norm": 0.5501378178596497,
+      "learning_rate": 0.0002,
+      "loss": 0.2273,
+      "step": 8700
+    },
+    {
+      "epoch": 2.268625934519206,
+      "grad_norm": 0.31958362460136414,
+      "learning_rate": 0.0002,
+      "loss": 0.2306,
+      "step": 8800
+    },
+    {
+      "epoch": 2.294405774684197,
+      "grad_norm": 0.4495809078216553,
+      "learning_rate": 0.0002,
+      "loss": 0.2283,
+      "step": 8900
+    },
+    {
+      "epoch": 2.320185614849188,
+      "grad_norm": 0.45789313316345215,
+      "learning_rate": 0.0002,
+      "loss": 0.2191,
+      "step": 9000
+    },
+    {
+      "epoch": 2.345965455014179,
+      "grad_norm": 0.2430783361196518,
+      "learning_rate": 0.0002,
+      "loss": 0.2266,
+      "step": 9100
+    },
+    {
+      "epoch": 2.3717452951791698,
+      "grad_norm": 0.512585461139679,
+      "learning_rate": 0.0002,
+      "loss": 0.2293,
+      "step": 9200
+    },
+    {
+      "epoch": 2.3975251353441607,
+      "grad_norm": 0.42088598012924194,
+      "learning_rate": 0.0002,
+      "loss": 0.2388,
+      "step": 9300
+    },
+    {
+      "epoch": 2.423304975509152,
+      "grad_norm": 0.4196650981903076,
+      "learning_rate": 0.0002,
+      "loss": 0.2305,
+      "step": 9400
+    },
+    {
+      "epoch": 2.449084815674143,
+      "grad_norm": 0.45856234431266785,
+      "learning_rate": 0.0002,
+      "loss": 0.2294,
+      "step": 9500
+    },
+    {
+      "epoch": 2.474864655839134,
+      "grad_norm": 0.5690295100212097,
+      "learning_rate": 0.0002,
+      "loss": 0.2237,
+      "step": 9600
+    },
+    {
+      "epoch": 2.5006444960041248,
+      "grad_norm": 0.5325428247451782,
+      "learning_rate": 0.0002,
+      "loss": 0.2125,
+      "step": 9700
+    },
+    {
+      "epoch": 2.5264243361691157,
+      "grad_norm": 0.4254339933395386,
+      "learning_rate": 0.0002,
+      "loss": 0.2335,
+      "step": 9800
+    },
+    {
+      "epoch": 2.5522041763341066,
+      "grad_norm": 0.44463545083999634,
+      "learning_rate": 0.0002,
+      "loss": 0.2247,
+      "step": 9900
+    },
+    {
+      "epoch": 2.577984016499098,
+      "grad_norm": 0.4192294776439667,
+      "learning_rate": 0.0002,
+      "loss": 0.2328,
+      "step": 10000
+    },
+    {
+      "epoch": 2.603763856664089,
+      "grad_norm": 0.39080777764320374,
+      "learning_rate": 0.0002,
+      "loss": 0.2229,
+      "step": 10100
+    },
+    {
+      "epoch": 2.6295436968290797,
+      "grad_norm": 0.3375299870967865,
+      "learning_rate": 0.0002,
+      "loss": 0.2374,
+      "step": 10200
+    },
+    {
+      "epoch": 2.6553235369940706,
+      "grad_norm": 0.6126553416252136,
+      "learning_rate": 0.0002,
+      "loss": 0.2283,
+      "step": 10300
+    },
+    {
+      "epoch": 2.6811033771590616,
+      "grad_norm": 0.21654823422431946,
+      "learning_rate": 0.0002,
+      "loss": 0.2265,
+      "step": 10400
+    },
+    {
+      "epoch": 2.7068832173240525,
+      "grad_norm": 0.41668832302093506,
+      "learning_rate": 0.0002,
+      "loss": 0.2267,
+      "step": 10500
+    },
+    {
+      "epoch": 2.7326630574890434,
+      "grad_norm": 0.5655872225761414,
+      "learning_rate": 0.0002,
+      "loss": 0.2331,
+      "step": 10600
+    },
+    {
+      "epoch": 2.7584428976540343,
+      "grad_norm": 0.49956533312797546,
+      "learning_rate": 0.0002,
+      "loss": 0.2323,
+      "step": 10700
+    },
+    {
+      "epoch": 2.7842227378190256,
+      "grad_norm": 0.4230547547340393,
+      "learning_rate": 0.0002,
+      "loss": 0.2157,
+      "step": 10800
+    },
+    {
+      "epoch": 2.8100025779840165,
+      "grad_norm": 0.5253151655197144,
+      "learning_rate": 0.0002,
+      "loss": 0.2189,
+      "step": 10900
+    },
+    {
+      "epoch": 2.8357824181490074,
+      "grad_norm": 0.3807348906993866,
+      "learning_rate": 0.0002,
+      "loss": 0.2285,
+      "step": 11000
+    },
+    {
+      "epoch": 2.8615622583139984,
+      "grad_norm": 0.6454833149909973,
+      "learning_rate": 0.0002,
+      "loss": 0.228,
+      "step": 11100
+    },
+    {
+      "epoch": 2.8873420984789897,
+      "grad_norm": 0.2508118450641632,
+      "learning_rate": 0.0002,
+      "loss": 0.2139,
+      "step": 11200
+    },
+    {
+      "epoch": 2.9131219386439806,
+      "grad_norm": 0.32768428325653076,
+      "learning_rate": 0.0002,
+      "loss": 0.2206,
+      "step": 11300
+    },
+    {
+      "epoch": 2.9389017788089715,
+      "grad_norm": 0.4850573241710663,
+      "learning_rate": 0.0002,
+      "loss": 0.2235,
+      "step": 11400
+    },
+    {
+      "epoch": 2.9646816189739624,
+      "grad_norm": 0.6089478135108948,
+      "learning_rate": 0.0002,
+      "loss": 0.2081,
+      "step": 11500
+    },
+    {
+      "epoch": 2.9904614591389533,
+      "grad_norm": 0.47153401374816895,
+      "learning_rate": 0.0002,
+      "loss": 0.2463,
+      "step": 11600
+    },
+    {
+      "epoch": 3.0162412993039442,
+      "grad_norm": 0.3843853771686554,
+      "learning_rate": 0.0002,
+      "loss": 0.1911,
+      "step": 11700
+    },
+    {
+      "epoch": 3.042021139468935,
+      "grad_norm": 0.21224769949913025,
+      "learning_rate": 0.0002,
+      "loss": 0.1753,
+      "step": 11800
+    },
+    {
+      "epoch": 3.067800979633926,
+      "grad_norm": 0.3223534822463989,
+      "learning_rate": 0.0002,
+      "loss": 0.1799,
+      "step": 11900
+    },
+    {
+      "epoch": 3.0935808197989174,
+      "grad_norm": 0.399443656206131,
+      "learning_rate": 0.0002,
+      "loss": 0.1755,
+      "step": 12000
+    },
+    {
+      "epoch": 3.1193606599639083,
+      "grad_norm": 0.253034770488739,
+      "learning_rate": 0.0002,
+      "loss": 0.177,
+      "step": 12100
+    },
+    {
+      "epoch": 3.1451405001288992,
+      "grad_norm": 0.318568617105484,
+      "learning_rate": 0.0002,
+      "loss": 0.1772,
+      "step": 12200
+    },
+    {
+      "epoch": 3.17092034029389,
+      "grad_norm": 0.2624630928039551,
+      "learning_rate": 0.0002,
+      "loss": 0.1876,
+      "step": 12300
+    },
+    {
+      "epoch": 3.196700180458881,
+      "grad_norm": 0.46422523260116577,
+      "learning_rate": 0.0002,
+      "loss": 0.1717,
+      "step": 12400
+    },
+    {
+      "epoch": 3.222480020623872,
+      "grad_norm": 0.4504973888397217,
+      "learning_rate": 0.0002,
+      "loss": 0.1862,
+      "step": 12500
+    },
+    {
+      "epoch": 3.2482598607888633,
+      "grad_norm": 0.44676682353019714,
+      "learning_rate": 0.0002,
+      "loss": 0.1865,
+      "step": 12600
+    },
+    {
+      "epoch": 3.274039700953854,
+      "grad_norm": 0.44682949781417847,
+      "learning_rate": 0.0002,
+      "loss": 0.1797,
+      "step": 12700
+    },
+    {
+      "epoch": 3.299819541118845,
+      "grad_norm": 0.22240401804447174,
+      "learning_rate": 0.0002,
+      "loss": 0.1823,
+      "step": 12800
+    },
+    {
+      "epoch": 3.325599381283836,
+      "grad_norm": 0.3457636535167694,
+      "learning_rate": 0.0002,
+      "loss": 0.1839,
+      "step": 12900
+    },
+    {
+      "epoch": 3.351379221448827,
+      "grad_norm": 0.5065191388130188,
+      "learning_rate": 0.0002,
+      "loss": 0.1823,
+      "step": 13000
+    },
+    {
+      "epoch": 3.377159061613818,
+      "grad_norm": 0.516930341720581,
+      "learning_rate": 0.0002,
+      "loss": 0.1812,
+      "step": 13100
+    },
+    {
+      "epoch": 3.4029389017788088,
+      "grad_norm": 0.5823391079902649,
+      "learning_rate": 0.0002,
+      "loss": 0.1851,
+      "step": 13200
+    },
+    {
+      "epoch": 3.4287187419438,
+      "grad_norm": 0.4604497253894806,
+      "learning_rate": 0.0002,
+      "loss": 0.1897,
+      "step": 13300
+    },
+    {
+      "epoch": 3.454498582108791,
+      "grad_norm": 0.3871957063674927,
+      "learning_rate": 0.0002,
+      "loss": 0.1778,
+      "step": 13400
+    },
+    {
+      "epoch": 3.480278422273782,
+      "grad_norm": 0.40806278586387634,
+      "learning_rate": 0.0002,
+      "loss": 0.1854,
+      "step": 13500
+    },
+    {
+      "epoch": 3.506058262438773,
+      "grad_norm": 0.24849525094032288,
+      "learning_rate": 0.0002,
+      "loss": 0.1825,
+      "step": 13600
+    },
+    {
+      "epoch": 3.5318381026037637,
+      "grad_norm": 0.28265008330345154,
+      "learning_rate": 0.0002,
+      "loss": 0.1914,
+      "step": 13700
+    },
+    {
+      "epoch": 3.557617942768755,
+      "grad_norm": 0.18643364310264587,
+      "learning_rate": 0.0002,
+      "loss": 0.1728,
+      "step": 13800
+    },
+    {
+      "epoch": 3.583397782933746,
+      "grad_norm": 0.36125150322914124,
+      "learning_rate": 0.0002,
+      "loss": 0.184,
+      "step": 13900
+    },
+    {
+      "epoch": 3.609177623098737,
+      "grad_norm": 0.35003572702407837,
+      "learning_rate": 0.0002,
+      "loss": 0.1834,
+      "step": 14000
+    },
+    {
+      "epoch": 3.634957463263728,
+      "grad_norm": 0.29175901412963867,
+      "learning_rate": 0.0002,
+      "loss": 0.1845,
+      "step": 14100
+    },
+    {
+      "epoch": 3.6607373034287187,
+      "grad_norm": 0.37868496775627136,
+      "learning_rate": 0.0002,
+      "loss": 0.1893,
+      "step": 14200
+    },
+    {
+      "epoch": 3.6865171435937096,
+      "grad_norm": 0.3279033899307251,
+      "learning_rate": 0.0002,
+      "loss": 0.1908,
+      "step": 14300
+    },
+    {
+      "epoch": 3.7122969837587005,
+      "grad_norm": 0.31007370352745056,
+      "learning_rate": 0.0002,
+      "loss": 0.1832,
+      "step": 14400
+    },
+    {
+      "epoch": 3.7380768239236914,
+      "grad_norm": 0.298289030790329,
+      "learning_rate": 0.0002,
+      "loss": 0.1948,
+      "step": 14500
+    },
+    {
+      "epoch": 3.763856664088683,
+      "grad_norm": 0.6039551496505737,
+      "learning_rate": 0.0002,
+      "loss": 0.1828,
+      "step": 14600
+    },
+    {
+      "epoch": 3.7896365042536737,
+      "grad_norm": 0.449587345123291,
+      "learning_rate": 0.0002,
+      "loss": 0.1891,
+      "step": 14700
+    },
+    {
+      "epoch": 3.8154163444186646,
+      "grad_norm": 0.6465901136398315,
+      "learning_rate": 0.0002,
+      "loss": 0.1895,
+      "step": 14800
+    },
+    {
+      "epoch": 3.8411961845836555,
+      "grad_norm": 0.5226249098777771,
+      "learning_rate": 0.0002,
+      "loss": 0.1767,
+      "step": 14900
+    },
+    {
+      "epoch": 3.8669760247486464,
+      "grad_norm": 0.29470816254615784,
+      "learning_rate": 0.0002,
+      "loss": 0.1958,
+      "step": 15000
+    },
+    {
+      "epoch": 3.892755864913638,
+      "grad_norm": 0.4997386336326599,
+      "learning_rate": 0.0002,
+      "loss": 0.1984,
+      "step": 15100
+    },
+    {
+      "epoch": 3.9185357050786287,
+      "grad_norm": 0.35381177067756653,
+      "learning_rate": 0.0002,
+      "loss": 0.1839,
+      "step": 15200
+    },
+    {
+      "epoch": 3.9443155452436196,
+      "grad_norm": 0.29231759905815125,
+      "learning_rate": 0.0002,
+      "loss": 0.1812,
+      "step": 15300
+    },
+    {
+      "epoch": 3.9700953854086105,
+      "grad_norm": 0.40497833490371704,
+      "learning_rate": 0.0002,
+      "loss": 0.1798,
+      "step": 15400
+    },
+    {
+      "epoch": 3.9958752255736014,
+      "grad_norm": 0.1775328516960144,
+      "learning_rate": 0.0002,
+      "loss": 0.1931,
+      "step": 15500
+    },
+    {
+      "epoch": 4.021655065738592,
+      "grad_norm": 0.2625548243522644,
+      "learning_rate": 0.0002,
+      "loss": 0.1513,
+      "step": 15600
+    },
+    {
+      "epoch": 4.047434905903583,
+      "grad_norm": 0.47476592659950256,
+      "learning_rate": 0.0002,
+      "loss": 0.1607,
+      "step": 15700
+    },
+    {
+      "epoch": 4.073214746068574,
+      "grad_norm": 0.4454491138458252,
+      "learning_rate": 0.0002,
+      "loss": 0.1529,
+      "step": 15800
+    },
+    {
+      "epoch": 4.098994586233565,
+      "grad_norm": 0.12239188700914383,
+      "learning_rate": 0.0002,
+      "loss": 0.1539,
+      "step": 15900
+    },
+    {
+      "epoch": 4.124774426398556,
+      "grad_norm": 0.2339598536491394,
+      "learning_rate": 0.0002,
+      "loss": 0.1572,
+      "step": 16000
+    },
+    {
+      "epoch": 4.150554266563548,
+      "grad_norm": 0.19658803939819336,
+      "learning_rate": 0.0002,
+      "loss": 0.1571,
+      "step": 16100
+    },
+    {
+      "epoch": 4.176334106728539,
+      "grad_norm": 0.25842776894569397,
+      "learning_rate": 0.0002,
+      "loss": 0.155,
+      "step": 16200
+    },
+    {
+      "epoch": 4.20211394689353,
+      "grad_norm": 0.4655442535877228,
+      "learning_rate": 0.0002,
+      "loss": 0.1584,
+      "step": 16300
+    },
+    {
+      "epoch": 4.2278937870585205,
+      "grad_norm": 0.3778013586997986,
+      "learning_rate": 0.0002,
+      "loss": 0.1587,
+      "step": 16400
+    },
+    {
+      "epoch": 4.253673627223511,
+      "grad_norm": 0.22199797630310059,
+      "learning_rate": 0.0002,
+      "loss": 0.1573,
+      "step": 16500
+    },
+    {
+      "epoch": 4.279453467388502,
+      "grad_norm": 0.23724961280822754,
+      "learning_rate": 0.0002,
+      "loss": 0.1649,
+      "step": 16600
+    },
+    {
+      "epoch": 4.305233307553493,
+      "grad_norm": 0.4558769166469574,
+      "learning_rate": 0.0002,
+      "loss": 0.1633,
+      "step": 16700
+    },
+    {
+      "epoch": 4.331013147718484,
+      "grad_norm": 0.27720391750335693,
+      "learning_rate": 0.0002,
+      "loss": 0.1613,
+      "step": 16800
+    },
+    {
+      "epoch": 4.356792987883475,
+      "grad_norm": 0.3628349304199219,
+      "learning_rate": 0.0002,
+      "loss": 0.16,
+      "step": 16900
+    },
+    {
+      "epoch": 4.382572828048466,
+      "grad_norm": 0.6290438175201416,
+      "learning_rate": 0.0002,
+      "loss": 0.1658,
+      "step": 17000
+    },
+    {
+      "epoch": 4.408352668213457,
+      "grad_norm": 0.14983007311820984,
+      "learning_rate": 0.0002,
+      "loss": 0.1629,
+      "step": 17100
+    },
+    {
+      "epoch": 4.434132508378448,
+      "grad_norm": 0.30865323543548584,
+      "learning_rate": 0.0002,
+      "loss": 0.1603,
+      "step": 17200
+    },
+    {
+      "epoch": 4.459912348543439,
+      "grad_norm": 0.5674950480461121,
+      "learning_rate": 0.0002,
+      "loss": 0.1674,
+      "step": 17300
+    },
+    {
+      "epoch": 4.4856921887084305,
+      "grad_norm": 0.40429455041885376,
+      "learning_rate": 0.0002,
+      "loss": 0.1677,
+      "step": 17400
+    },
+    {
+      "epoch": 4.511472028873421,
+      "grad_norm": 0.27213749289512634,
+      "learning_rate": 0.0002,
+      "loss": 0.1642,
+      "step": 17500
+    },
+    {
+      "epoch": 4.537251869038412,
+      "grad_norm": 0.40964949131011963,
+      "learning_rate": 0.0002,
+      "loss": 0.1626,
+      "step": 17600
+    },
+    {
+      "epoch": 4.563031709203403,
+      "grad_norm": 0.3955250382423401,
+      "learning_rate": 0.0002,
+      "loss": 0.1564,
+      "step": 17700
+    },
+    {
+      "epoch": 4.588811549368394,
+      "grad_norm": 0.3900775611400604,
+      "learning_rate": 0.0002,
+      "loss": 0.1605,
+      "step": 17800
+    },
+    {
+      "epoch": 4.614591389533385,
+      "grad_norm": 0.2436327487230301,
+      "learning_rate": 0.0002,
+      "loss": 0.1603,
+      "step": 17900
+    },
+    {
+      "epoch": 4.640371229698376,
+      "grad_norm": 0.4188991189002991,
+      "learning_rate": 0.0002,
+      "loss": 0.163,
+      "step": 18000
+    },
+    {
+      "epoch": 4.666151069863367,
+      "grad_norm": 0.15686850249767303,
+      "learning_rate": 0.0002,
+      "loss": 0.1656,
+      "step": 18100
+    },
+    {
+      "epoch": 4.691930910028358,
+      "grad_norm": 0.30334389209747314,
+      "learning_rate": 0.0002,
+      "loss": 0.1612,
+      "step": 18200
+    },
+    {
+      "epoch": 4.717710750193349,
+      "grad_norm": 0.33619073033332825,
+      "learning_rate": 0.0002,
+      "loss": 0.1626,
+      "step": 18300
+    },
+    {
+      "epoch": 4.7434905903583395,
+      "grad_norm": 0.20497629046440125,
+      "learning_rate": 0.0002,
+      "loss": 0.1647,
+      "step": 18400
+    },
+    {
+      "epoch": 4.76927043052333,
+      "grad_norm": 0.20428726077079773,
+      "learning_rate": 0.0002,
+      "loss": 0.1726,
+      "step": 18500
+    },
+    {
+      "epoch": 4.795050270688321,
+      "grad_norm": 0.3606746196746826,
+      "learning_rate": 0.0002,
+      "loss": 0.1638,
+      "step": 18600
+    },
+    {
+      "epoch": 4.820830110853313,
+      "grad_norm": 0.3441687226295471,
+      "learning_rate": 0.0002,
+      "loss": 0.1676,
+      "step": 18700
+    },
+    {
+      "epoch": 4.846609951018304,
+      "grad_norm": 0.3479159474372864,
+      "learning_rate": 0.0002,
+      "loss": 0.1654,
+      "step": 18800
+    },
+    {
+      "epoch": 4.872389791183295,
+      "grad_norm": 0.39751461148262024,
+      "learning_rate": 0.0002,
+      "loss": 0.1592,
+      "step": 18900
+    },
+    {
+      "epoch": 4.898169631348286,
+      "grad_norm": 0.1793346256017685,
+      "learning_rate": 0.0002,
+      "loss": 0.1683,
+      "step": 19000
+    },
+    {
+      "epoch": 4.923949471513277,
+      "grad_norm": 0.100714772939682,
+      "learning_rate": 0.0002,
+      "loss": 0.1592,
+      "step": 19100
+    },
+    {
+      "epoch": 4.949729311678268,
+      "grad_norm": 0.6268895864486694,
+      "learning_rate": 0.0002,
+      "loss": 0.1667,
+      "step": 19200
+    },
+    {
+      "epoch": 4.975509151843259,
+      "grad_norm": 0.32232895493507385,
+      "learning_rate": 0.0002,
+      "loss": 0.1615,
+      "step": 19300
+    },
+    {
+      "epoch": 5.0012889920082495,
+      "grad_norm": 0.3094789683818817,
+      "learning_rate": 0.0002,
+      "loss": 0.1648,
+      "step": 19400
+    },
+    {
+      "epoch": 5.02706883217324,
+      "grad_norm": 0.3806459307670593,
+      "learning_rate": 0.0002,
+      "loss": 0.149,
+      "step": 19500
+    },
+    {
+      "epoch": 5.052848672338231,
+      "grad_norm": 0.28195375204086304,
+      "learning_rate": 0.0002,
+      "loss": 0.1409,
+      "step": 19600
+    },
+    {
+      "epoch": 5.078628512503222,
+      "grad_norm": 0.1819002479314804,
+      "learning_rate": 0.0002,
+      "loss": 0.1403,
+      "step": 19700
+    },
+    {
+      "epoch": 5.104408352668213,
+      "grad_norm": 0.27728572487831116,
+      "learning_rate": 0.0002,
+      "loss": 0.1426,
+      "step": 19800
+    },
+    {
+      "epoch": 5.130188192833204,
+      "grad_norm": 0.21889761090278625,
+      "learning_rate": 0.0002,
+      "loss": 0.1499,
+      "step": 19900
+    },
+    {
+      "epoch": 5.155968032998196,
+      "grad_norm": 0.3974555432796478,
+      "learning_rate": 0.0002,
+      "loss": 0.1427,
+      "step": 20000
+    },
+    {
+      "epoch": 5.181747873163187,
+      "grad_norm": 0.48159608244895935,
+      "learning_rate": 0.0002,
+      "loss": 0.1477,
+      "step": 20100
+    },
+    {
+      "epoch": 5.207527713328178,
+      "grad_norm": 0.3865210711956024,
+      "learning_rate": 0.0002,
+      "loss": 0.1424,
+      "step": 20200
+    },
+    {
+      "epoch": 5.233307553493169,
+      "grad_norm": 0.26485195755958557,
+      "learning_rate": 0.0002,
+      "loss": 0.1486,
+      "step": 20300
+    },
+    {
+      "epoch": 5.2590873936581595,
+      "grad_norm": 0.41939619183540344,
+      "learning_rate": 0.0002,
+      "loss": 0.151,
+      "step": 20400
+    },
+    {
+      "epoch": 5.28486723382315,
+      "grad_norm": 0.3483380973339081,
+      "learning_rate": 0.0002,
+      "loss": 0.1475,
+      "step": 20500
+    },
+    {
+      "epoch": 5.310647073988141,
+      "grad_norm": 0.40975695848464966,
+      "learning_rate": 0.0002,
+      "loss": 0.1461,
+      "step": 20600
+    },
+    {
+      "epoch": 5.336426914153132,
+      "grad_norm": 0.27101436257362366,
+      "learning_rate": 0.0002,
+      "loss": 0.1528,
+      "step": 20700
+    },
+    {
+      "epoch": 5.362206754318123,
+      "grad_norm": 0.27852606773376465,
+      "learning_rate": 0.0002,
+      "loss": 0.1484,
+      "step": 20800
+    },
+    {
+      "epoch": 5.387986594483114,
+      "grad_norm": 0.4176689684391022,
+      "learning_rate": 0.0002,
+      "loss": 0.1485,
+      "step": 20900
+    },
+    {
+      "epoch": 5.413766434648105,
+      "grad_norm": 0.4901387691497803,
+      "learning_rate": 0.0002,
+      "loss": 0.1479,
+      "step": 21000
+    },
+    {
+      "epoch": 5.439546274813096,
+      "grad_norm": 0.33768975734710693,
+      "learning_rate": 0.0002,
+      "loss": 0.15,
+      "step": 21100
+    },
+    {
+      "epoch": 5.465326114978087,
+      "grad_norm": 0.5349870324134827,
+      "learning_rate": 0.0002,
+      "loss": 0.1485,
+      "step": 21200
+    },
+    {
+      "epoch": 5.4911059551430785,
+      "grad_norm": 0.24405865371227264,
+      "learning_rate": 0.0002,
+      "loss": 0.146,
+      "step": 21300
+    },
+    {
+      "epoch": 5.516885795308069,
+      "grad_norm": 0.2870001494884491,
+      "learning_rate": 0.0002,
+      "loss": 0.1482,
+      "step": 21400
+    },
+    {
+      "epoch": 5.54266563547306,
+      "grad_norm": 0.34606364369392395,
+      "learning_rate": 0.0002,
+      "loss": 0.1535,
+      "step": 21500
+    },
+    {
+      "epoch": 5.568445475638051,
+      "grad_norm": 0.4999238848686218,
+      "learning_rate": 0.0002,
+      "loss": 0.1523,
+      "step": 21600
+    },
+    {
+      "epoch": 5.594225315803042,
+      "grad_norm": 0.2526559829711914,
+      "learning_rate": 0.0002,
+      "loss": 0.1524,
+      "step": 21700
+    },
+    {
+      "epoch": 5.620005155968033,
+      "grad_norm": 0.270786315202713,
+      "learning_rate": 0.0002,
+      "loss": 0.1511,
+      "step": 21800
+    },
+    {
+      "epoch": 5.645784996133024,
+      "grad_norm": 0.4440493881702423,
+      "learning_rate": 0.0002,
+      "loss": 0.1539,
+      "step": 21900
+    },
+    {
+      "epoch": 5.671564836298015,
+      "grad_norm": 0.4871107041835785,
+      "learning_rate": 0.0002,
+      "loss": 0.1505,
+      "step": 22000
+    },
+    {
+      "epoch": 5.697344676463006,
+      "grad_norm": 0.40973493456840515,
+      "learning_rate": 0.0002,
+      "loss": 0.1553,
+      "step": 22100
+    },
+    {
+      "epoch": 5.723124516627997,
+      "grad_norm": 0.4365851581096649,
+      "learning_rate": 0.0002,
+      "loss": 0.1502,
+      "step": 22200
+    },
+    {
+      "epoch": 5.748904356792988,
+      "grad_norm": 0.5478639602661133,
+      "learning_rate": 0.0002,
+      "loss": 0.1611,
+      "step": 22300
+    },
+    {
+      "epoch": 5.7746841969579785,
+      "grad_norm": 0.29485803842544556,
+      "learning_rate": 0.0002,
+      "loss": 0.157,
+      "step": 22400
+    },
+    {
+      "epoch": 5.800464037122969,
+      "grad_norm": 0.20778502523899078,
+      "learning_rate": 0.0002,
+      "loss": 0.1489,
+      "step": 22500
+    },
+    {
+      "epoch": 5.826243877287961,
+      "grad_norm": 0.1795939952135086,
+      "learning_rate": 0.0002,
+      "loss": 0.1517,
+      "step": 22600
+    },
+    {
+      "epoch": 5.852023717452952,
+      "grad_norm": 0.4165894687175751,
+      "learning_rate": 0.0002,
+      "loss": 0.1464,
+      "step": 22700
+    },
+    {
+      "epoch": 5.877803557617943,
+      "grad_norm": 0.35076722502708435,
+      "learning_rate": 0.0002,
+      "loss": 0.1499,
+      "step": 22800
+    },
+    {
+      "epoch": 5.903583397782934,
+      "grad_norm": 0.3190014362335205,
+      "learning_rate": 0.0002,
+      "loss": 0.1474,
+      "step": 22900
+    },
+    {
+      "epoch": 5.929363237947925,
+      "grad_norm": 0.6232258081436157,
+      "learning_rate": 0.0002,
+      "loss": 0.1521,
+      "step": 23000
+    },
+    {
+      "epoch": 5.955143078112916,
+      "grad_norm": 0.41889217495918274,
+      "learning_rate": 0.0002,
+      "loss": 0.1553,
+      "step": 23100
+    },
+    {
+      "epoch": 5.980922918277907,
+      "grad_norm": 0.4977259635925293,
+      "learning_rate": 0.0002,
+      "loss": 0.1543,
+      "step": 23200
+    },
+    {
+      "epoch": 6.006702758442898,
+      "grad_norm": 0.3092762231826782,
+      "learning_rate": 0.0002,
+      "loss": 0.145,
+      "step": 23300
+    },
+    {
+      "epoch": 6.0324825986078885,
+      "grad_norm": 0.15745452046394348,
+      "learning_rate": 0.0002,
+      "loss": 0.138,
+      "step": 23400
+    },
+    {
+      "epoch": 6.058262438772879,
+      "grad_norm": 0.10685788840055466,
+      "learning_rate": 0.0002,
+      "loss": 0.1345,
+      "step": 23500
+    },
+    {
+      "epoch": 6.08404227893787,
+      "grad_norm": 0.41699907183647156,
+      "learning_rate": 0.0002,
+      "loss": 0.1379,
+      "step": 23600
+    },
+    {
+      "epoch": 6.109822119102861,
+      "grad_norm": 0.18783129751682281,
+      "learning_rate": 0.0002,
+      "loss": 0.1306,
+      "step": 23700
+    },
+    {
+      "epoch": 6.135601959267852,
+      "grad_norm": 0.15569710731506348,
+      "learning_rate": 0.0002,
+      "loss": 0.1372,
+      "step": 23800
+    },
+    {
+      "epoch": 6.161381799432844,
+      "grad_norm": 0.4492259919643402,
+      "learning_rate": 0.0002,
+      "loss": 0.1414,
+      "step": 23900
+    },
+    {
+      "epoch": 6.187161639597835,
+      "grad_norm": 0.1448894888162613,
+      "learning_rate": 0.0002,
+      "loss": 0.1376,
+      "step": 24000
+    },
+    {
+      "epoch": 6.212941479762826,
+      "grad_norm": 0.2028491050004959,
+      "learning_rate": 0.0002,
+      "loss": 0.1349,
+      "step": 24100
+    },
+    {
+      "epoch": 6.238721319927817,
+      "grad_norm": 0.19205012917518616,
+      "learning_rate": 0.0002,
+      "loss": 0.1396,
+      "step": 24200
+    },
+    {
+      "epoch": 6.2645011600928076,
+      "grad_norm": 0.29885369539260864,
+      "learning_rate": 0.0002,
+      "loss": 0.1449,
+      "step": 24300
+    },
+    {
+      "epoch": 6.2902810002577985,
+      "grad_norm": 0.15814617276191711,
+      "learning_rate": 0.0002,
+      "loss": 0.1438,
+      "step": 24400
+    },
+    {
+      "epoch": 6.316060840422789,
+      "grad_norm": 0.2691551148891449,
+      "learning_rate": 0.0002,
+      "loss": 0.1406,
+      "step": 24500
+    },
+    {
+      "epoch": 6.34184068058778,
+      "grad_norm": 0.543335497379303,
+      "learning_rate": 0.0002,
+      "loss": 0.1389,
+      "step": 24600
+    },
+    {
+      "epoch": 6.367620520752771,
+      "grad_norm": 0.33116665482521057,
+      "learning_rate": 0.0002,
+      "loss": 0.1403,
+      "step": 24700
+    },
+    {
+      "epoch": 6.393400360917762,
+      "grad_norm": 0.5159612894058228,
+      "learning_rate": 0.0002,
+      "loss": 0.1408,
+      "step": 24800
+    },
+    {
+      "epoch": 6.419180201082753,
+      "grad_norm": 0.30205056071281433,
+      "learning_rate": 0.0002,
+      "loss": 0.1409,
+      "step": 24900
+    },
+    {
+      "epoch": 6.444960041247744,
+      "grad_norm": 0.44916966557502747,
+      "learning_rate": 0.0002,
+      "loss": 0.1432,
+      "step": 25000
+    },
+    {
+      "epoch": 6.470739881412735,
+      "grad_norm": 0.18665899336338043,
+      "learning_rate": 0.0002,
+      "loss": 0.1434,
+      "step": 25100
+    },
+    {
+      "epoch": 6.496519721577727,
+      "grad_norm": 0.4078758656978607,
+      "learning_rate": 0.0002,
+      "loss": 0.1411,
+      "step": 25200
+    },
+    {
+      "epoch": 6.5222995617427175,
+      "grad_norm": 0.39813536405563354,
+      "learning_rate": 0.0002,
+      "loss": 0.1445,
+      "step": 25300
+    },
+    {
+      "epoch": 6.548079401907708,
+      "grad_norm": 0.2587377727031708,
+      "learning_rate": 0.0002,
+      "loss": 0.1463,
+      "step": 25400
+    },
+    {
+      "epoch": 6.573859242072699,
+      "grad_norm": 0.41181057691574097,
+      "learning_rate": 0.0002,
+      "loss": 0.1487,
+      "step": 25500
+    },
+    {
+      "epoch": 6.59963908223769,
+      "grad_norm": 0.3136518597602844,
+      "learning_rate": 0.0002,
+      "loss": 0.1414,
+      "step": 25600
+    },
+    {
+      "epoch": 6.625418922402681,
+      "grad_norm": 0.4114777445793152,
+      "learning_rate": 0.0002,
+      "loss": 0.1434,
+      "step": 25700
+    },
+    {
+      "epoch": 6.651198762567672,
+      "grad_norm": 0.17142866551876068,
+      "learning_rate": 0.0002,
+      "loss": 0.1411,
+      "step": 25800
+    },
+    {
+      "epoch": 6.676978602732663,
+      "grad_norm": 0.5585296750068665,
+      "learning_rate": 0.0002,
+      "loss": 0.148,
+      "step": 25900
+    },
+    {
+      "epoch": 6.702758442897654,
+      "grad_norm": 0.23773185908794403,
+      "learning_rate": 0.0002,
+      "loss": 0.1468,
+      "step": 26000
+    },
+    {
+      "epoch": 6.728538283062645,
+      "grad_norm": 0.38246840238571167,
+      "learning_rate": 0.0002,
+      "loss": 0.1426,
+      "step": 26100
+    },
+    {
+      "epoch": 6.754318123227636,
+      "grad_norm": 0.5393186807632446,
+      "learning_rate": 0.0002,
+      "loss": 0.1456,
+      "step": 26200
+    },
+    {
+      "epoch": 6.780097963392627,
+      "grad_norm": 0.21433015167713165,
+      "learning_rate": 0.0002,
+      "loss": 0.1456,
+      "step": 26300
+    },
+    {
+      "epoch": 6.8058778035576175,
+      "grad_norm": 0.4375258982181549,
+      "learning_rate": 0.0002,
+      "loss": 0.1461,
+      "step": 26400
+    },
+    {
+      "epoch": 6.831657643722609,
+      "grad_norm": 0.515832781791687,
+      "learning_rate": 0.0002,
+      "loss": 0.1484,
+      "step": 26500
+    },
+    {
+      "epoch": 6.8574374838876,
+      "grad_norm": 0.496559739112854,
+      "learning_rate": 0.0002,
+      "loss": 0.1461,
+      "step": 26600
+    },
+    {
+      "epoch": 6.883217324052591,
+      "grad_norm": 0.30182015895843506,
+      "learning_rate": 0.0002,
+      "loss": 0.1471,
+      "step": 26700
+    },
+    {
+      "epoch": 6.908997164217582,
+      "grad_norm": 0.3858971893787384,
+      "learning_rate": 0.0002,
+      "loss": 0.1469,
+      "step": 26800
+    },
+    {
+      "epoch": 6.934777004382573,
+      "grad_norm": 0.30368533730506897,
+      "learning_rate": 0.0002,
+      "loss": 0.1466,
+      "step": 26900
+    },
+    {
+      "epoch": 6.960556844547564,
+      "grad_norm": 0.29557520151138306,
+      "learning_rate": 0.0002,
+      "loss": 0.1446,
+      "step": 27000
+    },
+    {
+      "epoch": 6.986336684712555,
+      "grad_norm": 0.34702664613723755,
+      "learning_rate": 0.0002,
+      "loss": 0.143,
+      "step": 27100
+    },
+    {
+      "epoch": 7.012116524877546,
+      "grad_norm": 0.18182627856731415,
+      "learning_rate": 0.0002,
+      "loss": 0.1467,
+      "step": 27200
+    },
+    {
+      "epoch": 7.037896365042537,
+      "grad_norm": 0.48641154170036316,
+      "learning_rate": 0.0002,
+      "loss": 0.1337,
+      "step": 27300
+    },
+    {
+      "epoch": 7.0636762052075275,
+      "grad_norm": 0.5797538757324219,
+      "learning_rate": 0.0002,
+      "loss": 0.1291,
+      "step": 27400
+    },
+    {
+      "epoch": 7.089456045372518,
+      "grad_norm": 0.20399855077266693,
+      "learning_rate": 0.0002,
+      "loss": 0.1372,
+      "step": 27500
+    },
+    {
+      "epoch": 7.115235885537509,
+      "grad_norm": 0.12141354382038116,
+      "learning_rate": 0.0002,
+      "loss": 0.1359,
+      "step": 27600
+    },
+    {
+      "epoch": 7.1410157257025,
+      "grad_norm": 0.13764117658138275,
+      "learning_rate": 0.0002,
+      "loss": 0.1276,
+      "step": 27700
+    },
+    {
+      "epoch": 7.166795565867492,
+      "grad_norm": 0.21888123452663422,
+      "learning_rate": 0.0002,
+      "loss": 0.1337,
+      "step": 27800
+    },
+    {
+      "epoch": 7.192575406032483,
+      "grad_norm": 0.1562834531068802,
+      "learning_rate": 0.0002,
+      "loss": 0.133,
+      "step": 27900
+    },
+    {
+      "epoch": 7.218355246197474,
+      "grad_norm": 0.3367880880832672,
+      "learning_rate": 0.0002,
+      "loss": 0.1335,
+      "step": 28000
+    },
+    {
+      "epoch": 7.244135086362465,
+      "grad_norm": 0.1075579896569252,
+      "learning_rate": 0.0002,
+      "loss": 0.1334,
+      "step": 28100
+    },
+    {
+      "epoch": 7.269914926527456,
+      "grad_norm": 0.11283877491950989,
+      "learning_rate": 0.0002,
+      "loss": 0.1356,
+      "step": 28200
+    },
+    {
+      "epoch": 7.2956947666924465,
+      "grad_norm": 0.24768362939357758,
+      "learning_rate": 0.0002,
+      "loss": 0.1374,
+      "step": 28300
+    },
+    {
+      "epoch": 7.3214746068574375,
+      "grad_norm": 0.22776305675506592,
+      "learning_rate": 0.0002,
+      "loss": 0.1307,
+      "step": 28400
+    },
+    {
+      "epoch": 7.347254447022428,
+      "grad_norm": 0.13827867805957794,
+      "learning_rate": 0.0002,
+      "loss": 0.1396,
+      "step": 28500
+    },
+    {
+      "epoch": 7.373034287187419,
+      "grad_norm": 0.2935916781425476,
+      "learning_rate": 0.0002,
+      "loss": 0.1355,
+      "step": 28600
+    },
+    {
+      "epoch": 7.39881412735241,
+      "grad_norm": 0.10991048812866211,
+      "learning_rate": 0.0002,
+      "loss": 0.1349,
+      "step": 28700
+    },
+    {
+      "epoch": 7.424593967517401,
+      "grad_norm": 0.30149704217910767,
+      "learning_rate": 0.0002,
+      "loss": 0.1374,
+      "step": 28800
+    },
+    {
+      "epoch": 7.450373807682392,
+      "grad_norm": 0.13918708264827728,
+      "learning_rate": 0.0002,
+      "loss": 0.141,
+      "step": 28900
+    },
+    {
+      "epoch": 7.476153647847383,
+      "grad_norm": 0.13292869925498962,
+      "learning_rate": 0.0002,
+      "loss": 0.1386,
+      "step": 29000
+    },
+    {
+      "epoch": 7.501933488012375,
+      "grad_norm": 0.5602275729179382,
+      "learning_rate": 0.0002,
+      "loss": 0.1421,
+      "step": 29100
+    },
+    {
+      "epoch": 7.527713328177366,
+      "grad_norm": 0.12204320728778839,
+      "learning_rate": 0.0002,
+      "loss": 0.1334,
+      "step": 29200
+    },
+    {
+      "epoch": 7.5534931683423565,
+      "grad_norm": 0.17424637079238892,
+      "learning_rate": 0.0002,
+      "loss": 0.1372,
+      "step": 29300
+    },
+    {
+      "epoch": 7.579273008507347,
+      "grad_norm": 0.4190254509449005,
+      "learning_rate": 0.0002,
+      "loss": 0.1458,
+      "step": 29400
+    },
+    {
+      "epoch": 7.605052848672338,
+      "grad_norm": 0.13242638111114502,
+      "learning_rate": 0.0002,
+      "loss": 0.1421,
+      "step": 29500
+    },
+    {
+      "epoch": 7.630832688837329,
+      "grad_norm": 0.23242244124412537,
+      "learning_rate": 0.0002,
+      "loss": 0.1429,
+      "step": 29600
+    },
+    {
+      "epoch": 7.65661252900232,
+      "grad_norm": 0.4323575794696808,
+      "learning_rate": 0.0002,
+      "loss": 0.1402,
+      "step": 29700
+    },
+    {
+      "epoch": 7.682392369167311,
+      "grad_norm": 0.1595413088798523,
+      "learning_rate": 0.0002,
+      "loss": 0.1403,
+      "step": 29800
+    },
+    {
+      "epoch": 7.708172209332302,
+      "grad_norm": 0.1448589414358139,
+      "learning_rate": 0.0002,
+      "loss": 0.136,
+      "step": 29900
+    },
+    {
+      "epoch": 7.733952049497293,
+      "grad_norm": 0.5433810353279114,
+      "learning_rate": 0.0002,
+      "loss": 0.139,
+      "step": 30000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 31032,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 3000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.186321886206116e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceb43f92398ef10cb2bcf7e1ee38c45391bd6c835dd3c9b264b5a740a5b0d28b
+size 5496