DatOneDue committed on Feb 26, 2025

Commit

d9e041d

verified ·

1 Parent(s): b4dbfcb

Upload 27 files

Browse files

Files changed (28) hide show

.gitattributes +1 -0
Stonk_Training_SFT/README.md +166 -0
Stonk_Training_SFT/adapter_config.json +37 -0
Stonk_Training_SFT/adapter_model.safetensors +3 -0
Stonk_Training_SFT/added_tokens.json +24 -0
Stonk_Training_SFT/checkpoint-300/README.md +202 -0
Stonk_Training_SFT/checkpoint-300/adapter_config.json +37 -0
Stonk_Training_SFT/checkpoint-300/adapter_model.safetensors +3 -0
Stonk_Training_SFT/checkpoint-300/optimizer.pt +3 -0
Stonk_Training_SFT/checkpoint-300/rng_state.pth +3 -0
Stonk_Training_SFT/checkpoint-300/scaler.pt +3 -0
Stonk_Training_SFT/checkpoint-300/scheduler.pt +3 -0
Stonk_Training_SFT/checkpoint-300/trainer_state.json +2133 -0
Stonk_Training_SFT/checkpoint-300/training_args.bin +3 -0
Stonk_Training_SFT/checkpoint-312/README.md +155 -0
Stonk_Training_SFT/checkpoint-312/adapter_config.json +37 -0
Stonk_Training_SFT/checkpoint-312/adapter_model.safetensors +3 -0
Stonk_Training_SFT/checkpoint-312/optimizer.pt +3 -0
Stonk_Training_SFT/checkpoint-312/rng_state.pth +3 -0
Stonk_Training_SFT/checkpoint-312/scaler.pt +3 -0
Stonk_Training_SFT/checkpoint-312/scheduler.pt +3 -0
Stonk_Training_SFT/checkpoint-312/trainer_state.json +2217 -0
Stonk_Training_SFT/checkpoint-312/training_args.bin +3 -0
Stonk_Training_SFT/merges.txt +0 -0
Stonk_Training_SFT/special_tokens_map.json +31 -0
Stonk_Training_SFT/tokenizer.json +3 -0
Stonk_Training_SFT/tokenizer_config.json +208 -0
Stonk_Training_SFT/vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Stonk_Training_SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text

Stonk_Training_SFT/README.md ADDED Viewed

	@@ -0,0 +1,166 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: peft
+---
+# Model Card for Stonk_Training_SFT
+This is a fine-tuned stock prediction model that analyzes company information and provides structured predictions on whether a stock will go UP or DOWN, including a percentage estimate.
+## Model Details
+### Model Description
+The Stonk_Training_SFT model is fine-tuned from Qwen2.5-1.5B-Instruct using Parameter-Efficient Fine-Tuning (PEFT) with Low-Rank Adaptation (LoRA). It was specifically trained to:
+1. Provide structured analysis of stock prospects using specific XML-style tags
+2. Analyze company information, recent price trends, and news
+3. Make balanced UP or DOWN predictions with percentage estimates
+- **Developed by:** 2084Collective
+- **Model type:** LoRA fine-tuned Qwen2.5-1.5B-Instruct
+- **Language(s) (NLP):** English
+- **License:** Research only - not for commercial or production use
+- **Finetuned from model:** Qwen/Qwen2.5-1.5B-Instruct
+## Uses
+### Direct Use
+This model is designed to analyze stock information and provide structured predictions. The model expects input about a company including:
+- Ticker symbol
+- Company name and description
+- Current and previous stock prices
+- Recent news headlines
+It produces responses with the following structure:
+```
+<reason>
+Detailed reasoning about stock movement prediction
+</reason>
+<answer>
+UP/DOWN X.X%
+</answer>
+```
+### Out-of-Scope Use
+This model should NOT be used for:
+- Actual financial decision making or investment advice
+- Production trading systems
+- Any commercial applications
+The model's predictions are for educational and research purposes only.
+## Bias, Risks, and Limitations
+- The model has no access to real-time market data
+- Predictions are based solely on the information provided in the prompt
+- The model was trained on synthetic data and may not accurately reflect real market dynamics
+- Stock market predictions are inherently uncertain and subject to numerous external factors
+### Recommendations
+- Use outputs for educational purposes only
+- Do not make financial decisions based on the model's predictions
+- Consider the model's predictions as one of many inputs in a broader analysis
+## How to Get Started with the Model
+Use the code below to get started with the model:
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+# Load the base model and tokenizer
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+# Load the LoRA adapter
+model = PeftModel.from_pretrained(model, "./Stonk_Training_SFT")
+# Prepare the prompt
+system_prompt = """You are an expert stock market analyst with decades of experience.
+IMPORTANT: You MUST use the following format for your response:
+<reason>
+Your detailed analysis explaining why the stock will move up or down. Include technical indicators, news impact, and market trends.
+</reason>
+<answer>
+State UP or DOWN followed by a percentage (e.g., "UP 2.3%" or "DOWN 1.5%")
+</answer>"""
+user_prompt = """Stock: NVDA
+Company: NVIDIA Corporation
+Description: NVIDIA designs GPUs and SoCs for gaming and professional markets.
+Current Price: $950.00
+Previous Close: $920.00
+Recent News:
+- NVIDIA announces new AI supercomputer
+- Record demand for AI chips drives revenue growth
+- NVIDIA partners with major cloud providers
+Question: Based on this information, analyze whether this stock will go UP or DOWN in the next trading day. Provide your reasoning and a specific percentage prediction."""
+# Format with chat template
+chat_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+# Generate prediction
+inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=300)
+response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(response)
+```
+## Training Details
+### Training Data
+The model was trained on a combination of:
+1. Synthetic data created specifically for stock prediction
+2. Data generated to ensure balanced UP and DOWN predictions
+3. Examples with various company profiles and news scenarios
+### Training Procedure
+The training used a two-stage approach:
+1. Initial supervised fine-tuning (SFT) focusing on proper tag usage
+2. GRPO (Group Relative Policy Optimization) to align with direction prediction
+#### Training Hyperparameters
+- **Training regime:** fp16 mixed precision
+- **LoRA Configuration:**
+  - r=16
+  - lora_alpha=32
+  - lora_dropout=0.05
+  - Target modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+## Evaluation
+### Testing Data, Factors & Metrics
+#### Metrics
+The model was evaluated on:
+- Tag accuracy: Correct use of `<reason>` and `<answer>` tags
+- Direction accuracy: Correctly predicting UP vs DOWN based on price movement
+- Prediction balance: Distribution of UP vs DOWN predictions
+### Results
+The model demonstrates:
+- High tag format accuracy
+- Balanced prediction capabilities between UP and DOWN
+- Reasonable percentage estimates based on price movements
+## Framework versions
+- PEFT 0.14.0
+- Transformers 4.37.0
+- PyTorch 2.1.0

Stonk_Training_SFT/adapter_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Stonk_Training_SFT/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e46977a6d1b60085f1a175cdee93f3868b58220446dcd045fbbb0045b5610bda
+size 73911112

Stonk_Training_SFT/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

Stonk_Training_SFT/checkpoint-300/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

Stonk_Training_SFT/checkpoint-300/adapter_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Stonk_Training_SFT/checkpoint-300/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f0a9b293923efd00858043fd4bc92d44537b0644c924c3126c3386b24ca0b09
+size 73911112

Stonk_Training_SFT/checkpoint-300/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03391cff097f734004460bac7ac42e7ffc2979330759b7fabeb3711a0e6fcacb
+size 148047722

Stonk_Training_SFT/checkpoint-300/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82f869af31da3ef95296df174b1770d19ca7604b1d0dbb12b2763db0619f9bac
+size 14244

Stonk_Training_SFT/checkpoint-300/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92873c8c89778fe11b8eeb338a181eefdf056f2f8096c36bf259c3fd791afb34
+size 988

Stonk_Training_SFT/checkpoint-300/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7e6d70253e07bbb30f40853516f498ad692d2ded7e26f21c00dae30b89f8c08
+size 1064

Stonk_Training_SFT/checkpoint-300/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2133 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.96,
+  "eval_steps": 500,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.0318742990493774,
+      "learning_rate": 0.00019935897435897437,
+      "loss": 2.444,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6106225252151489,
+      "learning_rate": 0.00019871794871794874,
+      "loss": 2.2868,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.4373067617416382,
+      "learning_rate": 0.0001980769230769231,
+      "loss": 2.1701,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.4335378408432007,
+      "learning_rate": 0.00019743589743589744,
+      "loss": 2.1096,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.49052542448043823,
+      "learning_rate": 0.00019679487179487178,
+      "loss": 1.9543,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5317227244377136,
+      "learning_rate": 0.00019615384615384615,
+      "loss": 1.8996,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5240360498428345,
+      "learning_rate": 0.0001955128205128205,
+      "loss": 1.8271,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6232216954231262,
+      "learning_rate": 0.00019487179487179487,
+      "loss": 1.554,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4871944785118103,
+      "learning_rate": 0.00019423076923076924,
+      "loss": 1.6762,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4941597878932953,
+      "learning_rate": 0.0001935897435897436,
+      "loss": 1.4755,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.49265003204345703,
+      "learning_rate": 0.00019294871794871797,
+      "loss": 1.4558,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5133666396141052,
+      "learning_rate": 0.00019230769230769233,
+      "loss": 1.4057,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5097830295562744,
+      "learning_rate": 0.00019166666666666667,
+      "loss": 1.2189,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4502263367176056,
+      "learning_rate": 0.00019102564102564104,
+      "loss": 1.4004,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.43125396966934204,
+      "learning_rate": 0.00019038461538461538,
+      "loss": 1.2261,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5045803189277649,
+      "learning_rate": 0.00018974358974358974,
+      "loss": 1.1501,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.385760098695755,
+      "learning_rate": 0.0001891025641025641,
+      "loss": 1.1648,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.40407794713974,
+      "learning_rate": 0.00018846153846153847,
+      "loss": 1.0138,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5278863310813904,
+      "learning_rate": 0.00018782051282051283,
+      "loss": 1.0064,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.42704564332962036,
+      "learning_rate": 0.0001871794871794872,
+      "loss": 1.033,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.40699487924575806,
+      "learning_rate": 0.00018653846153846154,
+      "loss": 1.096,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.49143341183662415,
+      "learning_rate": 0.0001858974358974359,
+      "loss": 1.0027,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.39408907294273376,
+      "learning_rate": 0.00018525641025641027,
+      "loss": 0.9692,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.40637388825416565,
+      "learning_rate": 0.00018461538461538463,
+      "loss": 0.8652,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.46840283274650574,
+      "learning_rate": 0.00018397435897435897,
+      "loss": 0.7854,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.49589014053344727,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 0.7093,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.426786333322525,
+      "learning_rate": 0.0001826923076923077,
+      "loss": 0.7803,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.43341225385665894,
+      "learning_rate": 0.00018205128205128207,
+      "loss": 0.6717,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.35544857382774353,
+      "learning_rate": 0.00018141025641025643,
+      "loss": 0.6876,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3088377118110657,
+      "learning_rate": 0.00018076923076923077,
+      "loss": 0.8013,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5275241732597351,
+      "learning_rate": 0.00018012820512820513,
+      "loss": 0.6156,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.32654592394828796,
+      "learning_rate": 0.0001794871794871795,
+      "loss": 0.6555,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.2824726402759552,
+      "learning_rate": 0.00017884615384615386,
+      "loss": 0.5935,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.2731788158416748,
+      "learning_rate": 0.00017820512820512823,
+      "loss": 0.7172,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.2869996428489685,
+      "learning_rate": 0.00017756410256410257,
+      "loss": 0.6094,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.2364930361509323,
+      "learning_rate": 0.00017692307692307693,
+      "loss": 0.6224,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.23826579749584198,
+      "learning_rate": 0.0001762820512820513,
+      "loss": 0.6619,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.2075989544391632,
+      "learning_rate": 0.00017564102564102566,
+      "loss": 0.5132,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.21996797621250153,
+      "learning_rate": 0.000175,
+      "loss": 0.5349,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.22936391830444336,
+      "learning_rate": 0.00017435897435897436,
+      "loss": 0.6609,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.22315745055675507,
+      "learning_rate": 0.00017371794871794873,
+      "loss": 0.601,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.23068805038928986,
+      "learning_rate": 0.0001730769230769231,
+      "loss": 0.5872,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.21181653439998627,
+      "learning_rate": 0.00017243589743589746,
+      "loss": 0.562,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2169111967086792,
+      "learning_rate": 0.0001717948717948718,
+      "loss": 0.5454,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.23763957619667053,
+      "learning_rate": 0.00017115384615384616,
+      "loss": 0.5714,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.23329143226146698,
+      "learning_rate": 0.00017051282051282053,
+      "loss": 0.6302,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.24837623536586761,
+      "learning_rate": 0.00016987179487179486,
+      "loss": 0.5195,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.23882092535495758,
+      "learning_rate": 0.00016923076923076923,
+      "loss": 0.5077,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.2507559359073639,
+      "learning_rate": 0.0001685897435897436,
+      "loss": 0.6475,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.249229297041893,
+      "learning_rate": 0.00016794871794871796,
+      "loss": 0.6017,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.24630750715732574,
+      "learning_rate": 0.00016730769230769232,
+      "loss": 0.7265,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.2189762443304062,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.5816,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.2525809705257416,
+      "learning_rate": 0.00016602564102564105,
+      "loss": 0.6504,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.2427346110343933,
+      "learning_rate": 0.0001653846153846154,
+      "loss": 0.607,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.25441974401474,
+      "learning_rate": 0.00016474358974358976,
+      "loss": 0.498,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3109598159790039,
+      "learning_rate": 0.0001641025641025641,
+      "loss": 0.7047,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.24834761023521423,
+      "learning_rate": 0.00016346153846153846,
+      "loss": 0.6676,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.2606664001941681,
+      "learning_rate": 0.00016282051282051282,
+      "loss": 0.5359,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.280716210603714,
+      "learning_rate": 0.0001621794871794872,
+      "loss": 0.5061,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.22753354907035828,
+      "learning_rate": 0.00016153846153846155,
+      "loss": 0.5317,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.2716667056083679,
+      "learning_rate": 0.00016089743589743592,
+      "loss": 0.525,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.29029580950737,
+      "learning_rate": 0.00016025641025641028,
+      "loss": 0.5692,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3005497455596924,
+      "learning_rate": 0.00015961538461538462,
+      "loss": 0.5708,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.2747126519680023,
+      "learning_rate": 0.00015897435897435896,
+      "loss": 0.5776,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.23698432743549347,
+      "learning_rate": 0.00015833333333333332,
+      "loss": 0.4512,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.2807236313819885,
+      "learning_rate": 0.0001576923076923077,
+      "loss": 0.6356,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.2509554326534271,
+      "learning_rate": 0.00015705128205128205,
+      "loss": 0.6171,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.2572317123413086,
+      "learning_rate": 0.00015641025641025642,
+      "loss": 0.5074,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2810407280921936,
+      "learning_rate": 0.00015576923076923078,
+      "loss": 0.5584,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.2606565058231354,
+      "learning_rate": 0.00015512820512820515,
+      "loss": 0.5131,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.2663954198360443,
+      "learning_rate": 0.00015448717948717951,
+      "loss": 0.5039,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.2655510902404785,
+      "learning_rate": 0.00015384615384615385,
+      "loss": 0.5378,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.2713690996170044,
+      "learning_rate": 0.00015320512820512822,
+      "loss": 0.4168,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.29010245203971863,
+      "learning_rate": 0.00015256410256410255,
+      "loss": 0.4522,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2629586160182953,
+      "learning_rate": 0.00015192307692307692,
+      "loss": 0.4264,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.2788578271865845,
+      "learning_rate": 0.00015128205128205128,
+      "loss": 0.4294,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.28636300563812256,
+      "learning_rate": 0.00015064102564102565,
+      "loss": 0.4514,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3492796719074249,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.5396,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.28230172395706177,
+      "learning_rate": 0.00014935897435897438,
+      "loss": 0.4536,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3269024193286896,
+      "learning_rate": 0.00014871794871794872,
+      "loss": 0.4935,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.2631271779537201,
+      "learning_rate": 0.00014807692307692308,
+      "loss": 0.4967,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.29715263843536377,
+      "learning_rate": 0.00014743589743589745,
+      "loss": 0.46,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.24541226029396057,
+      "learning_rate": 0.00014679487179487178,
+      "loss": 0.451,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.26244840025901794,
+      "learning_rate": 0.00014615384615384615,
+      "loss": 0.4505,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.28552278876304626,
+      "learning_rate": 0.00014551282051282051,
+      "loss": 0.4962,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3041422963142395,
+      "learning_rate": 0.00014487179487179488,
+      "loss": 0.5714,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.2605431079864502,
+      "learning_rate": 0.00014423076923076924,
+      "loss": 0.424,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.2429913431406021,
+      "learning_rate": 0.0001435897435897436,
+      "loss": 0.4929,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.263763964176178,
+      "learning_rate": 0.00014294871794871795,
+      "loss": 0.51,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.2757089138031006,
+      "learning_rate": 0.0001423076923076923,
+      "loss": 0.5876,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.25437983870506287,
+      "learning_rate": 0.00014166666666666668,
+      "loss": 0.5662,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.28170907497406006,
+      "learning_rate": 0.00014102564102564104,
+      "loss": 0.6068,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.2835545837879181,
+      "learning_rate": 0.00014038461538461538,
+      "loss": 0.5769,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.26598137617111206,
+      "learning_rate": 0.00013974358974358974,
+      "loss": 0.5344,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.27048271894454956,
+      "learning_rate": 0.0001391025641025641,
+      "loss": 0.5439,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.2872040867805481,
+      "learning_rate": 0.00013846153846153847,
+      "loss": 0.4907,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2894354462623596,
+      "learning_rate": 0.00013782051282051284,
+      "loss": 0.5225,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.2700275778770447,
+      "learning_rate": 0.00013717948717948718,
+      "loss": 0.5485,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.25223323702812195,
+      "learning_rate": 0.00013653846153846154,
+      "loss": 0.4103,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.26242420077323914,
+      "learning_rate": 0.0001358974358974359,
+      "loss": 0.4974,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.25841623544692993,
+      "learning_rate": 0.00013525641025641027,
+      "loss": 0.5663,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.2550147473812103,
+      "learning_rate": 0.00013461538461538464,
+      "loss": 0.4468,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.2532574534416199,
+      "learning_rate": 0.00013397435897435897,
+      "loss": 0.4785,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3210897743701935,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.348,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.2774418294429779,
+      "learning_rate": 0.0001326923076923077,
+      "loss": 0.5105,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.2911253571510315,
+      "learning_rate": 0.00013205128205128204,
+      "loss": 0.4461,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3193359971046448,
+      "learning_rate": 0.0001314102564102564,
+      "loss": 0.425,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.25757595896720886,
+      "learning_rate": 0.00013076923076923077,
+      "loss": 0.5055,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.29757291078567505,
+      "learning_rate": 0.00013012820512820514,
+      "loss": 0.4796,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.25091230869293213,
+      "learning_rate": 0.0001294871794871795,
+      "loss": 0.4215,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.24596236646175385,
+      "learning_rate": 0.00012884615384615387,
+      "loss": 0.4959,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.2935032844543457,
+      "learning_rate": 0.00012820512820512823,
+      "loss": 0.5622,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.247608482837677,
+      "learning_rate": 0.00012756410256410257,
+      "loss": 0.5097,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.24816389381885529,
+      "learning_rate": 0.00012692307692307693,
+      "loss": 0.4609,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.2884969115257263,
+      "learning_rate": 0.00012628205128205127,
+      "loss": 0.4562,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2685159742832184,
+      "learning_rate": 0.00012564102564102564,
+      "loss": 0.4159,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.27893656492233276,
+      "learning_rate": 0.000125,
+      "loss": 0.4143,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3658033311367035,
+      "learning_rate": 0.00012435897435897437,
+      "loss": 0.5775,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3107357621192932,
+      "learning_rate": 0.00012371794871794873,
+      "loss": 0.4669,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.303265780210495,
+      "learning_rate": 0.0001230769230769231,
+      "loss": 0.5763,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.21491239964962006,
+      "learning_rate": 0.00012243589743589746,
+      "loss": 0.2977,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.25996512174606323,
+      "learning_rate": 0.00012179487179487179,
+      "loss": 0.4028,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.25640368461608887,
+      "learning_rate": 0.00012115384615384615,
+      "loss": 0.5186,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.25851595401763916,
+      "learning_rate": 0.00012051282051282052,
+      "loss": 0.4153,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2687956988811493,
+      "learning_rate": 0.00011987179487179487,
+      "loss": 0.4214,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.2811547815799713,
+      "learning_rate": 0.00011923076923076923,
+      "loss": 0.4813,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3029988706111908,
+      "learning_rate": 0.0001185897435897436,
+      "loss": 0.6633,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.23572632670402527,
+      "learning_rate": 0.00011794871794871796,
+      "loss": 0.321,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.26160842180252075,
+      "learning_rate": 0.00011730769230769231,
+      "loss": 0.4805,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.2799210548400879,
+      "learning_rate": 0.00011666666666666668,
+      "loss": 0.5449,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.26176995038986206,
+      "learning_rate": 0.00011602564102564104,
+      "loss": 0.5371,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2574150562286377,
+      "learning_rate": 0.00011538461538461538,
+      "loss": 0.4654,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.2815636098384857,
+      "learning_rate": 0.00011474358974358975,
+      "loss": 0.4717,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.2238437682390213,
+      "learning_rate": 0.0001141025641025641,
+      "loss": 0.2716,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.28223180770874023,
+      "learning_rate": 0.00011346153846153846,
+      "loss": 0.4417,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.25392040610313416,
+      "learning_rate": 0.00011282051282051283,
+      "loss": 0.3974,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.26734161376953125,
+      "learning_rate": 0.00011217948717948718,
+      "loss": 0.4821,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3004485070705414,
+      "learning_rate": 0.00011153846153846154,
+      "loss": 0.5427,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3012203872203827,
+      "learning_rate": 0.00011089743589743591,
+      "loss": 0.4845,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.249158576130867,
+      "learning_rate": 0.00011025641025641027,
+      "loss": 0.4654,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.26157674193382263,
+      "learning_rate": 0.00010961538461538463,
+      "loss": 0.4359,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.2885046899318695,
+      "learning_rate": 0.00010897435897435896,
+      "loss": 0.5595,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.24725095927715302,
+      "learning_rate": 0.00010833333333333333,
+      "loss": 0.4303,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.2562006711959839,
+      "learning_rate": 0.0001076923076923077,
+      "loss": 0.4772,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.27621591091156006,
+      "learning_rate": 0.00010705128205128206,
+      "loss": 0.5012,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.24434815347194672,
+      "learning_rate": 0.00010641025641025641,
+      "loss": 0.4038,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.306618332862854,
+      "learning_rate": 0.00010576923076923077,
+      "loss": 0.6279,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.23926320672035217,
+      "learning_rate": 0.00010512820512820514,
+      "loss": 0.3658,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.2628524899482727,
+      "learning_rate": 0.0001044871794871795,
+      "loss": 0.4094,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3073101341724396,
+      "learning_rate": 0.00010384615384615386,
+      "loss": 0.5235,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.2647029757499695,
+      "learning_rate": 0.00010320512820512822,
+      "loss": 0.435,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.2945331335067749,
+      "learning_rate": 0.00010256410256410256,
+      "loss": 0.4805,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.24870315194129944,
+      "learning_rate": 0.00010192307692307692,
+      "loss": 0.3913,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.2954522371292114,
+      "learning_rate": 0.00010128205128205129,
+      "loss": 0.412,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.2745000720024109,
+      "learning_rate": 0.00010064102564102564,
+      "loss": 0.3781,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3542097806930542,
+      "learning_rate": 0.0001,
+      "loss": 0.6366,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.25741928815841675,
+      "learning_rate": 9.935897435897437e-05,
+      "loss": 0.4203,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.24221432209014893,
+      "learning_rate": 9.871794871794872e-05,
+      "loss": 0.3816,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.28175830841064453,
+      "learning_rate": 9.807692307692307e-05,
+      "loss": 0.5144,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.28136804699897766,
+      "learning_rate": 9.743589743589744e-05,
+      "loss": 0.5099,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.2572178542613983,
+      "learning_rate": 9.67948717948718e-05,
+      "loss": 0.4466,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.26523545384407043,
+      "learning_rate": 9.615384615384617e-05,
+      "loss": 0.4726,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.2543681859970093,
+      "learning_rate": 9.551282051282052e-05,
+      "loss": 0.4538,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.2630670964717865,
+      "learning_rate": 9.487179487179487e-05,
+      "loss": 0.396,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.2914413809776306,
+      "learning_rate": 9.423076923076924e-05,
+      "loss": 0.5105,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.23061206936836243,
+      "learning_rate": 9.35897435897436e-05,
+      "loss": 0.3556,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.29008427262306213,
+      "learning_rate": 9.294871794871795e-05,
+      "loss": 0.5202,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3016505241394043,
+      "learning_rate": 9.230769230769232e-05,
+      "loss": 0.5411,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.2557263970375061,
+      "learning_rate": 9.166666666666667e-05,
+      "loss": 0.319,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3112490773200989,
+      "learning_rate": 9.102564102564103e-05,
+      "loss": 0.3887,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.31632199883461,
+      "learning_rate": 9.038461538461538e-05,
+      "loss": 0.5968,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2825266122817993,
+      "learning_rate": 8.974358974358975e-05,
+      "loss": 0.4054,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.29404228925704956,
+      "learning_rate": 8.910256410256411e-05,
+      "loss": 0.4833,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2678660750389099,
+      "learning_rate": 8.846153846153847e-05,
+      "loss": 0.5036,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.25883761048316956,
+      "learning_rate": 8.782051282051283e-05,
+      "loss": 0.4133,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.25341886281967163,
+      "learning_rate": 8.717948717948718e-05,
+      "loss": 0.405,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.27091604471206665,
+      "learning_rate": 8.653846153846155e-05,
+      "loss": 0.4839,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.2976224422454834,
+      "learning_rate": 8.58974358974359e-05,
+      "loss": 0.4976,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2556227445602417,
+      "learning_rate": 8.525641025641026e-05,
+      "loss": 0.366,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.30424100160598755,
+      "learning_rate": 8.461538461538461e-05,
+      "loss": 0.5004,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.2819097638130188,
+      "learning_rate": 8.397435897435898e-05,
+      "loss": 0.4262,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.27605441212654114,
+      "learning_rate": 8.333333333333334e-05,
+      "loss": 0.4365,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.27287614345550537,
+      "learning_rate": 8.26923076923077e-05,
+      "loss": 0.4083,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.27792593836784363,
+      "learning_rate": 8.205128205128205e-05,
+      "loss": 0.4137,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3038672208786011,
+      "learning_rate": 8.141025641025641e-05,
+      "loss": 0.4728,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.28418487310409546,
+      "learning_rate": 8.076923076923078e-05,
+      "loss": 0.4671,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.2828688323497772,
+      "learning_rate": 8.012820512820514e-05,
+      "loss": 0.5008,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3183538019657135,
+      "learning_rate": 7.948717948717948e-05,
+      "loss": 0.5282,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.30755531787872314,
+      "learning_rate": 7.884615384615384e-05,
+      "loss": 0.5238,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.30897241830825806,
+      "learning_rate": 7.820512820512821e-05,
+      "loss": 0.5401,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.2755669355392456,
+      "learning_rate": 7.756410256410257e-05,
+      "loss": 0.4468,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.2606697380542755,
+      "learning_rate": 7.692307692307693e-05,
+      "loss": 0.387,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3042752742767334,
+      "learning_rate": 7.628205128205128e-05,
+      "loss": 0.5305,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.29267027974128723,
+      "learning_rate": 7.564102564102564e-05,
+      "loss": 0.419,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.30853956937789917,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 0.5416,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3460753262042999,
+      "learning_rate": 7.435897435897436e-05,
+      "loss": 0.5196,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.298348993062973,
+      "learning_rate": 7.371794871794872e-05,
+      "loss": 0.4421,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.2961275577545166,
+      "learning_rate": 7.307692307692307e-05,
+      "loss": 0.4032,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.30500563979148865,
+      "learning_rate": 7.243589743589744e-05,
+      "loss": 0.442,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.30142742395401,
+      "learning_rate": 7.17948717948718e-05,
+      "loss": 0.4375,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.26335790753364563,
+      "learning_rate": 7.115384615384616e-05,
+      "loss": 0.428,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.24412094056606293,
+      "learning_rate": 7.051282051282052e-05,
+      "loss": 0.3212,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.2796410918235779,
+      "learning_rate": 6.987179487179487e-05,
+      "loss": 0.4459,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.33633318543434143,
+      "learning_rate": 6.923076923076924e-05,
+      "loss": 0.5125,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3190767467021942,
+      "learning_rate": 6.858974358974359e-05,
+      "loss": 0.5465,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.30400681495666504,
+      "learning_rate": 6.794871794871795e-05,
+      "loss": 0.4667,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.294740229845047,
+      "learning_rate": 6.730769230769232e-05,
+      "loss": 0.4955,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.2467174381017685,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.3151,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.27507495880126953,
+      "learning_rate": 6.602564102564102e-05,
+      "loss": 0.4123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.2465631365776062,
+      "learning_rate": 6.538461538461539e-05,
+      "loss": 0.3117,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3061293363571167,
+      "learning_rate": 6.474358974358975e-05,
+      "loss": 0.5389,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.2974298596382141,
+      "learning_rate": 6.410256410256412e-05,
+      "loss": 0.4471,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.2876061797142029,
+      "learning_rate": 6.346153846153847e-05,
+      "loss": 0.4253,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.29899370670318604,
+      "learning_rate": 6.282051282051282e-05,
+      "loss": 0.4474,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.30990898609161377,
+      "learning_rate": 6.217948717948718e-05,
+      "loss": 0.3905,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.29554882645606995,
+      "learning_rate": 6.153846153846155e-05,
+      "loss": 0.4147,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.31249329447746277,
+      "learning_rate": 6.089743589743589e-05,
+      "loss": 0.4023,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.32120752334594727,
+      "learning_rate": 6.025641025641026e-05,
+      "loss": 0.4418,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.311110258102417,
+      "learning_rate": 5.9615384615384616e-05,
+      "loss": 0.463,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2687552571296692,
+      "learning_rate": 5.897435897435898e-05,
+      "loss": 0.3984,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.30315181612968445,
+      "learning_rate": 5.833333333333334e-05,
+      "loss": 0.5084,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.28704944252967834,
+      "learning_rate": 5.769230769230769e-05,
+      "loss": 0.4327,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.2689530551433563,
+      "learning_rate": 5.705128205128205e-05,
+      "loss": 0.3964,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.321878582239151,
+      "learning_rate": 5.6410256410256414e-05,
+      "loss": 0.4681,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.30374661087989807,
+      "learning_rate": 5.576923076923077e-05,
+      "loss": 0.4465,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.29247525334358215,
+      "learning_rate": 5.512820512820514e-05,
+      "loss": 0.4501,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.274850994348526,
+      "learning_rate": 5.448717948717948e-05,
+      "loss": 0.3943,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.24561376869678497,
+      "learning_rate": 5.384615384615385e-05,
+      "loss": 0.3225,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.28236258029937744,
+      "learning_rate": 5.3205128205128205e-05,
+      "loss": 0.3247,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.29767942428588867,
+      "learning_rate": 5.256410256410257e-05,
+      "loss": 0.4514,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3310510516166687,
+      "learning_rate": 5.192307692307693e-05,
+      "loss": 0.4417,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.28120821714401245,
+      "learning_rate": 5.128205128205128e-05,
+      "loss": 0.3511,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.38378533720970154,
+      "learning_rate": 5.0641025641025644e-05,
+      "loss": 0.5135,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3125383257865906,
+      "learning_rate": 5e-05,
+      "loss": 0.4614,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.2549707591533661,
+      "learning_rate": 4.935897435897436e-05,
+      "loss": 0.3333,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.30307722091674805,
+      "learning_rate": 4.871794871794872e-05,
+      "loss": 0.4185,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.2714983820915222,
+      "learning_rate": 4.8076923076923084e-05,
+      "loss": 0.3744,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.29321545362472534,
+      "learning_rate": 4.7435897435897435e-05,
+      "loss": 0.4177,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3908475637435913,
+      "learning_rate": 4.67948717948718e-05,
+      "loss": 0.5391,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.2907383143901825,
+      "learning_rate": 4.615384615384616e-05,
+      "loss": 0.4457,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.32630524039268494,
+      "learning_rate": 4.5512820512820516e-05,
+      "loss": 0.4652,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.33661359548568726,
+      "learning_rate": 4.4871794871794874e-05,
+      "loss": 0.4093,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.27634692192077637,
+      "learning_rate": 4.423076923076923e-05,
+      "loss": 0.3958,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.40667521953582764,
+      "learning_rate": 4.358974358974359e-05,
+      "loss": 0.4525,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.30798301100730896,
+      "learning_rate": 4.294871794871795e-05,
+      "loss": 0.4243,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3315953016281128,
+      "learning_rate": 4.230769230769231e-05,
+      "loss": 0.4833,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.29299864172935486,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 0.3727,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3195595443248749,
+      "learning_rate": 4.1025641025641023e-05,
+      "loss": 0.4243,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.28434109687805176,
+      "learning_rate": 4.038461538461539e-05,
+      "loss": 0.4001,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.28316378593444824,
+      "learning_rate": 3.974358974358974e-05,
+      "loss": 0.3611,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.2709490656852722,
+      "learning_rate": 3.9102564102564105e-05,
+      "loss": 0.337,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.2810059189796448,
+      "learning_rate": 3.846153846153846e-05,
+      "loss": 0.3383,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.31759873032569885,
+      "learning_rate": 3.782051282051282e-05,
+      "loss": 0.4404,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.27550122141838074,
+      "learning_rate": 3.717948717948718e-05,
+      "loss": 0.3898,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.29137155413627625,
+      "learning_rate": 3.653846153846154e-05,
+      "loss": 0.3887,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.30342838168144226,
+      "learning_rate": 3.58974358974359e-05,
+      "loss": 0.3747,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3385205566883087,
+      "learning_rate": 3.525641025641026e-05,
+      "loss": 0.4591,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.28427594900131226,
+      "learning_rate": 3.461538461538462e-05,
+      "loss": 0.3955,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3554467558860779,
+      "learning_rate": 3.397435897435898e-05,
+      "loss": 0.4754,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2872997224330902,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.3336,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.32470715045928955,
+      "learning_rate": 3.269230769230769e-05,
+      "loss": 0.5002,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.2823026180267334,
+      "learning_rate": 3.205128205128206e-05,
+      "loss": 0.4217,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.27532005310058594,
+      "learning_rate": 3.141025641025641e-05,
+      "loss": 0.3505,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.30065861344337463,
+      "learning_rate": 3.0769230769230774e-05,
+      "loss": 0.385,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3140447437763214,
+      "learning_rate": 3.012820512820513e-05,
+      "loss": 0.4599,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.30165910720825195,
+      "learning_rate": 2.948717948717949e-05,
+      "loss": 0.4027,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3036094307899475,
+      "learning_rate": 2.8846153846153845e-05,
+      "loss": 0.406,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.31652310490608215,
+      "learning_rate": 2.8205128205128207e-05,
+      "loss": 0.4115,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3006727397441864,
+      "learning_rate": 2.756410256410257e-05,
+      "loss": 0.4464,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.34472909569740295,
+      "learning_rate": 2.6923076923076923e-05,
+      "loss": 0.4643,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.2880076766014099,
+      "learning_rate": 2.6282051282051285e-05,
+      "loss": 0.3675,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.34488052129745483,
+      "learning_rate": 2.564102564102564e-05,
+      "loss": 0.4661,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.30622804164886475,
+      "learning_rate": 2.5e-05,
+      "loss": 0.405,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3236006200313568,
+      "learning_rate": 2.435897435897436e-05,
+      "loss": 0.4426,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3554222285747528,
+      "learning_rate": 2.3717948717948718e-05,
+      "loss": 0.4809,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.34005558490753174,
+      "learning_rate": 2.307692307692308e-05,
+      "loss": 0.417,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.30246976017951965,
+      "learning_rate": 2.2435897435897437e-05,
+      "loss": 0.4264,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3341597616672516,
+      "learning_rate": 2.1794871794871795e-05,
+      "loss": 0.4871,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.349656343460083,
+      "learning_rate": 2.1153846153846154e-05,
+      "loss": 0.5549,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.32281672954559326,
+      "learning_rate": 2.0512820512820512e-05,
+      "loss": 0.3897,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.34751957654953003,
+      "learning_rate": 1.987179487179487e-05,
+      "loss": 0.429,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.30903372168540955,
+      "learning_rate": 1.923076923076923e-05,
+      "loss": 0.3939,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.29195931553840637,
+      "learning_rate": 1.858974358974359e-05,
+      "loss": 0.3129,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3270445168018341,
+      "learning_rate": 1.794871794871795e-05,
+      "loss": 0.4435,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3003033995628357,
+      "learning_rate": 1.730769230769231e-05,
+      "loss": 0.3446,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.35107478499412537,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 0.5102,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.28601357340812683,
+      "learning_rate": 1.602564102564103e-05,
+      "loss": 0.3619,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2772061228752136,
+      "learning_rate": 1.5384615384615387e-05,
+      "loss": 0.3428,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3195333182811737,
+      "learning_rate": 1.4743589743589745e-05,
+      "loss": 0.4477,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.2672041654586792,
+      "learning_rate": 1.4102564102564104e-05,
+      "loss": 0.2811,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.36466383934020996,
+      "learning_rate": 1.3461538461538462e-05,
+      "loss": 0.5833,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.31829240918159485,
+      "learning_rate": 1.282051282051282e-05,
+      "loss": 0.4144,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3314779996871948,
+      "learning_rate": 1.217948717948718e-05,
+      "loss": 0.458,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.31584641337394714,
+      "learning_rate": 1.153846153846154e-05,
+      "loss": 0.4482,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2965344190597534,
+      "learning_rate": 1.0897435897435898e-05,
+      "loss": 0.3293,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.2547074556350708,
+      "learning_rate": 1.0256410256410256e-05,
+      "loss": 0.3007,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3165215849876404,
+      "learning_rate": 9.615384615384616e-06,
+      "loss": 0.4053,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.29612070322036743,
+      "learning_rate": 8.974358974358976e-06,
+      "loss": 0.4188,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3118067681789398,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 0.4429,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.29589030146598816,
+      "learning_rate": 7.692307692307694e-06,
+      "loss": 0.3348,
+      "step": 300
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.91880651046912e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

Stonk_Training_SFT/checkpoint-300/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46e61b3ea2344ac46fb729121ab3faf1d3cbf202ba9ecb6f78320571a83fec82
+size 5432

Stonk_Training_SFT/checkpoint-312/README.md ADDED Viewed

	@@ -0,0 +1,155 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: peft
+---
+# Model Card for Stonk_Training_SFT Checkpoint 312
+This checkpoint captures the stock prediction model at the last step of the training run (step 312 of 312, roughly one epoch). The model is fine-tuned to analyze company information and provide structured UP/DOWN predictions with percentage estimates.
+## Model Details
+### Model Description
+This checkpoint of the Stonk_Training_SFT model represents the model after 312 training steps. The model was fine-tuned from Qwen2.5-1.5B-Instruct using Parameter-Efficient Fine-Tuning (PEFT) with Low-Rank Adaptation (LoRA). It has been trained to:
+1. Follow a specific formatted output structure using XML-style tags
+2. Analyze company information, price trends, and news headlines
+3. Make reasoned predictions about stock price movements
+- **Developed by:** 2084Collective
+- **Model type:** LoRA fine-tuned Qwen2.5-1.5B-Instruct (Checkpoint 312)
+- **Language(s):** English
+- **License:** Research only - not for commercial or production use
+- **Finetuned from model:** Qwen/Qwen2.5-1.5B-Instruct
+## Uses
+### Direct Use
+This checkpoint can be used to analyze stock information and generate structured predictions. It expects input containing:
+- Company ticker symbol
+- Company description
+- Current and previous stock prices
+- Recent news headlines
+It produces responses with the following structure:
+```
+<reason>
+Detailed reasoning about stock movement prediction
+</reason>
+<answer>
+UP/DOWN X.X%
+</answer>
+```
+### Out-of-Scope Use
+This checkpoint should NOT be used for:
+- Actual financial advice or investment decisions
+- Production trading systems
+- Commercial applications
+The model's predictions are for research and educational purposes only.
+## Bias, Risks, and Limitations
+- This checkpoint may not have fully balanced UP/DOWN prediction capabilities
+- The model has no access to real market data beyond what's in the prompt
+- Predictions are based only on the limited information provided
+- Stock market predictions are inherently uncertain and influenced by many factors
+- As an intermediate checkpoint, it may have inconsistencies in output formatting
+### Recommendations
+- Use outputs for educational and research purposes only
+- Do not make financial decisions based on the model's predictions
+- Consider outputs as one of many inputs in a broader analytical process
+- For more refined predictions, use the final model rather than this checkpoint
+## How to Get Started with the Model
+Use the code below to get started with this checkpoint:
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+# Load the base model and tokenizer
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+# Load the LoRA adapter checkpoint
+model = PeftModel.from_pretrained(model, "./Stonk_Training_SFT/checkpoint-312")
+# Prepare the prompt
+system_prompt = """You are an expert stock market analyst with decades of experience.
+IMPORTANT: You MUST use the following format for your response:
+<reason>
+Your detailed analysis explaining why the stock will move up or down. Include technical indicators, news impact, and market trends.
+</reason>
+<answer>
+State UP or DOWN followed by a percentage (e.g., "UP 2.3%" or "DOWN 1.5%")
+</answer>"""
+user_prompt = """Stock: AMZN
+Company: Amazon.com, Inc.
+Description: Amazon.com, Inc. is an American multinational technology company focusing on e-commerce, cloud computing, online advertising, digital streaming, and artificial intelligence.
+Current Price: $175.00
+Previous Close: $172.50
+Recent News:
+- Amazon Web Services announces new AI capabilities
+- E-commerce sales exceed analyst expectations
+- Amazon increases Prime subscription cost
+Question: Based on this information, analyze whether this stock will go UP or DOWN in the next trading day. Provide your reasoning and a specific percentage prediction."""
+# Format with chat template
+chat_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+# Generate prediction
+inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=300)
+response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(response)
+```
+## Training Details
+### Training Data
+This checkpoint represents training on:
+1. Synthetic stock prediction examples
+2. A mix of UP and DOWN prediction scenarios
+3. Various company profiles with different types of news
+### Training Procedure
+This checkpoint was saved at the end of the training run, after 312 of 312 steps (roughly one epoch) of fine-tuning.
+#### Training Hyperparameters
+- **Training regime:** fp16 mixed precision
+- **LoRA Configuration:**
+  - r=16
+  - lora_alpha=32
+  - lora_dropout=0.05
+  - Target modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+## Evaluation
+As an intermediate checkpoint, this model may show:
+- Improving but not fully refined tag format usage
+- Developing balance between UP and DOWN predictions
+- Reasonable but not fully optimized percentage estimates
+## Framework versions
+- PEFT 0.14.0
+- Transformers 4.37.0
+- PyTorch 2.1.0

Stonk_Training_SFT/checkpoint-312/adapter_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Stonk_Training_SFT/checkpoint-312/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e46977a6d1b60085f1a175cdee93f3868b58220446dcd045fbbb0045b5610bda
+size 73911112

Stonk_Training_SFT/checkpoint-312/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:010c96eef481dcbf24fd904bd78799471a921591132e9dc96cc376b1474867d4
+size 148047722

Stonk_Training_SFT/checkpoint-312/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d3ce3643048833b4eb6d5d5aa3c14251b48b61ee442dbd7946f2671925793fa
+size 14244

Stonk_Training_SFT/checkpoint-312/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a333750d52e9ce5c74b2f1dcc9feae57a1d36067e56a00ec8f1d367a90dd5
+size 988

Stonk_Training_SFT/checkpoint-312/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07515c2b046d99c3743a994e794dd47b173671b79b64b92ecc764813da2c450b
+size 1064

Stonk_Training_SFT/checkpoint-312/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2217 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.0318742990493774,
+      "learning_rate": 0.00019935897435897437,
+      "loss": 2.444,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6106225252151489,
+      "learning_rate": 0.00019871794871794874,
+      "loss": 2.2868,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.4373067617416382,
+      "learning_rate": 0.0001980769230769231,
+      "loss": 2.1701,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.4335378408432007,
+      "learning_rate": 0.00019743589743589744,
+      "loss": 2.1096,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.49052542448043823,
+      "learning_rate": 0.00019679487179487178,
+      "loss": 1.9543,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5317227244377136,
+      "learning_rate": 0.00019615384615384615,
+      "loss": 1.8996,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5240360498428345,
+      "learning_rate": 0.0001955128205128205,
+      "loss": 1.8271,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6232216954231262,
+      "learning_rate": 0.00019487179487179487,
+      "loss": 1.554,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4871944785118103,
+      "learning_rate": 0.00019423076923076924,
+      "loss": 1.6762,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4941597878932953,
+      "learning_rate": 0.0001935897435897436,
+      "loss": 1.4755,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.49265003204345703,
+      "learning_rate": 0.00019294871794871797,
+      "loss": 1.4558,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5133666396141052,
+      "learning_rate": 0.00019230769230769233,
+      "loss": 1.4057,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5097830295562744,
+      "learning_rate": 0.00019166666666666667,
+      "loss": 1.2189,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4502263367176056,
+      "learning_rate": 0.00019102564102564104,
+      "loss": 1.4004,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.43125396966934204,
+      "learning_rate": 0.00019038461538461538,
+      "loss": 1.2261,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5045803189277649,
+      "learning_rate": 0.00018974358974358974,
+      "loss": 1.1501,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.385760098695755,
+      "learning_rate": 0.0001891025641025641,
+      "loss": 1.1648,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.40407794713974,
+      "learning_rate": 0.00018846153846153847,
+      "loss": 1.0138,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5278863310813904,
+      "learning_rate": 0.00018782051282051283,
+      "loss": 1.0064,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.42704564332962036,
+      "learning_rate": 0.0001871794871794872,
+      "loss": 1.033,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.40699487924575806,
+      "learning_rate": 0.00018653846153846154,
+      "loss": 1.096,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.49143341183662415,
+      "learning_rate": 0.0001858974358974359,
+      "loss": 1.0027,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.39408907294273376,
+      "learning_rate": 0.00018525641025641027,
+      "loss": 0.9692,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.40637388825416565,
+      "learning_rate": 0.00018461538461538463,
+      "loss": 0.8652,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.46840283274650574,
+      "learning_rate": 0.00018397435897435897,
+      "loss": 0.7854,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.49589014053344727,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 0.7093,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.426786333322525,
+      "learning_rate": 0.0001826923076923077,
+      "loss": 0.7803,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.43341225385665894,
+      "learning_rate": 0.00018205128205128207,
+      "loss": 0.6717,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.35544857382774353,
+      "learning_rate": 0.00018141025641025643,
+      "loss": 0.6876,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3088377118110657,
+      "learning_rate": 0.00018076923076923077,
+      "loss": 0.8013,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5275241732597351,
+      "learning_rate": 0.00018012820512820513,
+      "loss": 0.6156,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.32654592394828796,
+      "learning_rate": 0.0001794871794871795,
+      "loss": 0.6555,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.2824726402759552,
+      "learning_rate": 0.00017884615384615386,
+      "loss": 0.5935,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.2731788158416748,
+      "learning_rate": 0.00017820512820512823,
+      "loss": 0.7172,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.2869996428489685,
+      "learning_rate": 0.00017756410256410257,
+      "loss": 0.6094,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.2364930361509323,
+      "learning_rate": 0.00017692307692307693,
+      "loss": 0.6224,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.23826579749584198,
+      "learning_rate": 0.0001762820512820513,
+      "loss": 0.6619,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.2075989544391632,
+      "learning_rate": 0.00017564102564102566,
+      "loss": 0.5132,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.21996797621250153,
+      "learning_rate": 0.000175,
+      "loss": 0.5349,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.22936391830444336,
+      "learning_rate": 0.00017435897435897436,
+      "loss": 0.6609,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.22315745055675507,
+      "learning_rate": 0.00017371794871794873,
+      "loss": 0.601,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.23068805038928986,
+      "learning_rate": 0.0001730769230769231,
+      "loss": 0.5872,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.21181653439998627,
+      "learning_rate": 0.00017243589743589746,
+      "loss": 0.562,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2169111967086792,
+      "learning_rate": 0.0001717948717948718,
+      "loss": 0.5454,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.23763957619667053,
+      "learning_rate": 0.00017115384615384616,
+      "loss": 0.5714,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.23329143226146698,
+      "learning_rate": 0.00017051282051282053,
+      "loss": 0.6302,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.24837623536586761,
+      "learning_rate": 0.00016987179487179486,
+      "loss": 0.5195,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.23882092535495758,
+      "learning_rate": 0.00016923076923076923,
+      "loss": 0.5077,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.2507559359073639,
+      "learning_rate": 0.0001685897435897436,
+      "loss": 0.6475,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.249229297041893,
+      "learning_rate": 0.00016794871794871796,
+      "loss": 0.6017,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.24630750715732574,
+      "learning_rate": 0.00016730769230769232,
+      "loss": 0.7265,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.2189762443304062,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.5816,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.2525809705257416,
+      "learning_rate": 0.00016602564102564105,
+      "loss": 0.6504,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.2427346110343933,
+      "learning_rate": 0.0001653846153846154,
+      "loss": 0.607,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.25441974401474,
+      "learning_rate": 0.00016474358974358976,
+      "loss": 0.498,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3109598159790039,
+      "learning_rate": 0.0001641025641025641,
+      "loss": 0.7047,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.24834761023521423,
+      "learning_rate": 0.00016346153846153846,
+      "loss": 0.6676,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.2606664001941681,
+      "learning_rate": 0.00016282051282051282,
+      "loss": 0.5359,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.280716210603714,
+      "learning_rate": 0.0001621794871794872,
+      "loss": 0.5061,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.22753354907035828,
+      "learning_rate": 0.00016153846153846155,
+      "loss": 0.5317,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.2716667056083679,
+      "learning_rate": 0.00016089743589743592,
+      "loss": 0.525,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.29029580950737,
+      "learning_rate": 0.00016025641025641028,
+      "loss": 0.5692,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3005497455596924,
+      "learning_rate": 0.00015961538461538462,
+      "loss": 0.5708,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.2747126519680023,
+      "learning_rate": 0.00015897435897435896,
+      "loss": 0.5776,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.23698432743549347,
+      "learning_rate": 0.00015833333333333332,
+      "loss": 0.4512,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.2807236313819885,
+      "learning_rate": 0.0001576923076923077,
+      "loss": 0.6356,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.2509554326534271,
+      "learning_rate": 0.00015705128205128205,
+      "loss": 0.6171,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.2572317123413086,
+      "learning_rate": 0.00015641025641025642,
+      "loss": 0.5074,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2810407280921936,
+      "learning_rate": 0.00015576923076923078,
+      "loss": 0.5584,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.2606565058231354,
+      "learning_rate": 0.00015512820512820515,
+      "loss": 0.5131,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.2663954198360443,
+      "learning_rate": 0.00015448717948717951,
+      "loss": 0.5039,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.2655510902404785,
+      "learning_rate": 0.00015384615384615385,
+      "loss": 0.5378,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.2713690996170044,
+      "learning_rate": 0.00015320512820512822,
+      "loss": 0.4168,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.29010245203971863,
+      "learning_rate": 0.00015256410256410255,
+      "loss": 0.4522,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2629586160182953,
+      "learning_rate": 0.00015192307692307692,
+      "loss": 0.4264,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.2788578271865845,
+      "learning_rate": 0.00015128205128205128,
+      "loss": 0.4294,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.28636300563812256,
+      "learning_rate": 0.00015064102564102565,
+      "loss": 0.4514,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3492796719074249,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.5396,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.28230172395706177,
+      "learning_rate": 0.00014935897435897438,
+      "loss": 0.4536,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3269024193286896,
+      "learning_rate": 0.00014871794871794872,
+      "loss": 0.4935,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.2631271779537201,
+      "learning_rate": 0.00014807692307692308,
+      "loss": 0.4967,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.29715263843536377,
+      "learning_rate": 0.00014743589743589745,
+      "loss": 0.46,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.24541226029396057,
+      "learning_rate": 0.00014679487179487178,
+      "loss": 0.451,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.26244840025901794,
+      "learning_rate": 0.00014615384615384615,
+      "loss": 0.4505,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.28552278876304626,
+      "learning_rate": 0.00014551282051282051,
+      "loss": 0.4962,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3041422963142395,
+      "learning_rate": 0.00014487179487179488,
+      "loss": 0.5714,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.2605431079864502,
+      "learning_rate": 0.00014423076923076924,
+      "loss": 0.424,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.2429913431406021,
+      "learning_rate": 0.0001435897435897436,
+      "loss": 0.4929,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.263763964176178,
+      "learning_rate": 0.00014294871794871795,
+      "loss": 0.51,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.2757089138031006,
+      "learning_rate": 0.0001423076923076923,
+      "loss": 0.5876,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.25437983870506287,
+      "learning_rate": 0.00014166666666666668,
+      "loss": 0.5662,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.28170907497406006,
+      "learning_rate": 0.00014102564102564104,
+      "loss": 0.6068,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.2835545837879181,
+      "learning_rate": 0.00014038461538461538,
+      "loss": 0.5769,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.26598137617111206,
+      "learning_rate": 0.00013974358974358974,
+      "loss": 0.5344,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.27048271894454956,
+      "learning_rate": 0.0001391025641025641,
+      "loss": 0.5439,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.2872040867805481,
+      "learning_rate": 0.00013846153846153847,
+      "loss": 0.4907,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2894354462623596,
+      "learning_rate": 0.00013782051282051284,
+      "loss": 0.5225,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.2700275778770447,
+      "learning_rate": 0.00013717948717948718,
+      "loss": 0.5485,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.25223323702812195,
+      "learning_rate": 0.00013653846153846154,
+      "loss": 0.4103,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.26242420077323914,
+      "learning_rate": 0.0001358974358974359,
+      "loss": 0.4974,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.25841623544692993,
+      "learning_rate": 0.00013525641025641027,
+      "loss": 0.5663,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.2550147473812103,
+      "learning_rate": 0.00013461538461538464,
+      "loss": 0.4468,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.2532574534416199,
+      "learning_rate": 0.00013397435897435897,
+      "loss": 0.4785,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3210897743701935,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.348,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.2774418294429779,
+      "learning_rate": 0.0001326923076923077,
+      "loss": 0.5105,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.2911253571510315,
+      "learning_rate": 0.00013205128205128204,
+      "loss": 0.4461,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3193359971046448,
+      "learning_rate": 0.0001314102564102564,
+      "loss": 0.425,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.25757595896720886,
+      "learning_rate": 0.00013076923076923077,
+      "loss": 0.5055,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.29757291078567505,
+      "learning_rate": 0.00013012820512820514,
+      "loss": 0.4796,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.25091230869293213,
+      "learning_rate": 0.0001294871794871795,
+      "loss": 0.4215,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.24596236646175385,
+      "learning_rate": 0.00012884615384615387,
+      "loss": 0.4959,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.2935032844543457,
+      "learning_rate": 0.00012820512820512823,
+      "loss": 0.5622,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.247608482837677,
+      "learning_rate": 0.00012756410256410257,
+      "loss": 0.5097,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.24816389381885529,
+      "learning_rate": 0.00012692307692307693,
+      "loss": 0.4609,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.2884969115257263,
+      "learning_rate": 0.00012628205128205127,
+      "loss": 0.4562,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2685159742832184,
+      "learning_rate": 0.00012564102564102564,
+      "loss": 0.4159,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.27893656492233276,
+      "learning_rate": 0.000125,
+      "loss": 0.4143,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3658033311367035,
+      "learning_rate": 0.00012435897435897437,
+      "loss": 0.5775,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3107357621192932,
+      "learning_rate": 0.00012371794871794873,
+      "loss": 0.4669,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.303265780210495,
+      "learning_rate": 0.0001230769230769231,
+      "loss": 0.5763,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.21491239964962006,
+      "learning_rate": 0.00012243589743589746,
+      "loss": 0.2977,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.25996512174606323,
+      "learning_rate": 0.00012179487179487179,
+      "loss": 0.4028,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.25640368461608887,
+      "learning_rate": 0.00012115384615384615,
+      "loss": 0.5186,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.25851595401763916,
+      "learning_rate": 0.00012051282051282052,
+      "loss": 0.4153,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2687956988811493,
+      "learning_rate": 0.00011987179487179487,
+      "loss": 0.4214,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.2811547815799713,
+      "learning_rate": 0.00011923076923076923,
+      "loss": 0.4813,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3029988706111908,
+      "learning_rate": 0.0001185897435897436,
+      "loss": 0.6633,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.23572632670402527,
+      "learning_rate": 0.00011794871794871796,
+      "loss": 0.321,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.26160842180252075,
+      "learning_rate": 0.00011730769230769231,
+      "loss": 0.4805,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.2799210548400879,
+      "learning_rate": 0.00011666666666666668,
+      "loss": 0.5449,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.26176995038986206,
+      "learning_rate": 0.00011602564102564104,
+      "loss": 0.5371,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2574150562286377,
+      "learning_rate": 0.00011538461538461538,
+      "loss": 0.4654,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.2815636098384857,
+      "learning_rate": 0.00011474358974358975,
+      "loss": 0.4717,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.2238437682390213,
+      "learning_rate": 0.0001141025641025641,
+      "loss": 0.2716,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.28223180770874023,
+      "learning_rate": 0.00011346153846153846,
+      "loss": 0.4417,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.25392040610313416,
+      "learning_rate": 0.00011282051282051283,
+      "loss": 0.3974,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.26734161376953125,
+      "learning_rate": 0.00011217948717948718,
+      "loss": 0.4821,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3004485070705414,
+      "learning_rate": 0.00011153846153846154,
+      "loss": 0.5427,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3012203872203827,
+      "learning_rate": 0.00011089743589743591,
+      "loss": 0.4845,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.249158576130867,
+      "learning_rate": 0.00011025641025641027,
+      "loss": 0.4654,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.26157674193382263,
+      "learning_rate": 0.00010961538461538463,
+      "loss": 0.4359,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.2885046899318695,
+      "learning_rate": 0.00010897435897435896,
+      "loss": 0.5595,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.24725095927715302,
+      "learning_rate": 0.00010833333333333333,
+      "loss": 0.4303,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.2562006711959839,
+      "learning_rate": 0.0001076923076923077,
+      "loss": 0.4772,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.27621591091156006,
+      "learning_rate": 0.00010705128205128206,
+      "loss": 0.5012,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.24434815347194672,
+      "learning_rate": 0.00010641025641025641,
+      "loss": 0.4038,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.306618332862854,
+      "learning_rate": 0.00010576923076923077,
+      "loss": 0.6279,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.23926320672035217,
+      "learning_rate": 0.00010512820512820514,
+      "loss": 0.3658,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.2628524899482727,
+      "learning_rate": 0.0001044871794871795,
+      "loss": 0.4094,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3073101341724396,
+      "learning_rate": 0.00010384615384615386,
+      "loss": 0.5235,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.2647029757499695,
+      "learning_rate": 0.00010320512820512822,
+      "loss": 0.435,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.2945331335067749,
+      "learning_rate": 0.00010256410256410256,
+      "loss": 0.4805,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.24870315194129944,
+      "learning_rate": 0.00010192307692307692,
+      "loss": 0.3913,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.2954522371292114,
+      "learning_rate": 0.00010128205128205129,
+      "loss": 0.412,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.2745000720024109,
+      "learning_rate": 0.00010064102564102564,
+      "loss": 0.3781,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3542097806930542,
+      "learning_rate": 0.0001,
+      "loss": 0.6366,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.25741928815841675,
+      "learning_rate": 9.935897435897437e-05,
+      "loss": 0.4203,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.24221432209014893,
+      "learning_rate": 9.871794871794872e-05,
+      "loss": 0.3816,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.28175830841064453,
+      "learning_rate": 9.807692307692307e-05,
+      "loss": 0.5144,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.28136804699897766,
+      "learning_rate": 9.743589743589744e-05,
+      "loss": 0.5099,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.2572178542613983,
+      "learning_rate": 9.67948717948718e-05,
+      "loss": 0.4466,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.26523545384407043,
+      "learning_rate": 9.615384615384617e-05,
+      "loss": 0.4726,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.2543681859970093,
+      "learning_rate": 9.551282051282052e-05,
+      "loss": 0.4538,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.2630670964717865,
+      "learning_rate": 9.487179487179487e-05,
+      "loss": 0.396,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.2914413809776306,
+      "learning_rate": 9.423076923076924e-05,
+      "loss": 0.5105,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.23061206936836243,
+      "learning_rate": 9.35897435897436e-05,
+      "loss": 0.3556,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.29008427262306213,
+      "learning_rate": 9.294871794871795e-05,
+      "loss": 0.5202,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3016505241394043,
+      "learning_rate": 9.230769230769232e-05,
+      "loss": 0.5411,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.2557263970375061,
+      "learning_rate": 9.166666666666667e-05,
+      "loss": 0.319,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3112490773200989,
+      "learning_rate": 9.102564102564103e-05,
+      "loss": 0.3887,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.31632199883461,
+      "learning_rate": 9.038461538461538e-05,
+      "loss": 0.5968,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2825266122817993,
+      "learning_rate": 8.974358974358975e-05,
+      "loss": 0.4054,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.29404228925704956,
+      "learning_rate": 8.910256410256411e-05,
+      "loss": 0.4833,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2678660750389099,
+      "learning_rate": 8.846153846153847e-05,
+      "loss": 0.5036,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.25883761048316956,
+      "learning_rate": 8.782051282051283e-05,
+      "loss": 0.4133,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.25341886281967163,
+      "learning_rate": 8.717948717948718e-05,
+      "loss": 0.405,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.27091604471206665,
+      "learning_rate": 8.653846153846155e-05,
+      "loss": 0.4839,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.2976224422454834,
+      "learning_rate": 8.58974358974359e-05,
+      "loss": 0.4976,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2556227445602417,
+      "learning_rate": 8.525641025641026e-05,
+      "loss": 0.366,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.30424100160598755,
+      "learning_rate": 8.461538461538461e-05,
+      "loss": 0.5004,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.2819097638130188,
+      "learning_rate": 8.397435897435898e-05,
+      "loss": 0.4262,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.27605441212654114,
+      "learning_rate": 8.333333333333334e-05,
+      "loss": 0.4365,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.27287614345550537,
+      "learning_rate": 8.26923076923077e-05,
+      "loss": 0.4083,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.27792593836784363,
+      "learning_rate": 8.205128205128205e-05,
+      "loss": 0.4137,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3038672208786011,
+      "learning_rate": 8.141025641025641e-05,
+      "loss": 0.4728,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.28418487310409546,
+      "learning_rate": 8.076923076923078e-05,
+      "loss": 0.4671,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.2828688323497772,
+      "learning_rate": 8.012820512820514e-05,
+      "loss": 0.5008,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3183538019657135,
+      "learning_rate": 7.948717948717948e-05,
+      "loss": 0.5282,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.30755531787872314,
+      "learning_rate": 7.884615384615384e-05,
+      "loss": 0.5238,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.30897241830825806,
+      "learning_rate": 7.820512820512821e-05,
+      "loss": 0.5401,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.2755669355392456,
+      "learning_rate": 7.756410256410257e-05,
+      "loss": 0.4468,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.2606697380542755,
+      "learning_rate": 7.692307692307693e-05,
+      "loss": 0.387,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3042752742767334,
+      "learning_rate": 7.628205128205128e-05,
+      "loss": 0.5305,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.29267027974128723,
+      "learning_rate": 7.564102564102564e-05,
+      "loss": 0.419,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.30853956937789917,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 0.5416,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3460753262042999,
+      "learning_rate": 7.435897435897436e-05,
+      "loss": 0.5196,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.298348993062973,
+      "learning_rate": 7.371794871794872e-05,
+      "loss": 0.4421,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.2961275577545166,
+      "learning_rate": 7.307692307692307e-05,
+      "loss": 0.4032,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.30500563979148865,
+      "learning_rate": 7.243589743589744e-05,
+      "loss": 0.442,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.30142742395401,
+      "learning_rate": 7.17948717948718e-05,
+      "loss": 0.4375,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.26335790753364563,
+      "learning_rate": 7.115384615384616e-05,
+      "loss": 0.428,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.24412094056606293,
+      "learning_rate": 7.051282051282052e-05,
+      "loss": 0.3212,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.2796410918235779,
+      "learning_rate": 6.987179487179487e-05,
+      "loss": 0.4459,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.33633318543434143,
+      "learning_rate": 6.923076923076924e-05,
+      "loss": 0.5125,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3190767467021942,
+      "learning_rate": 6.858974358974359e-05,
+      "loss": 0.5465,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.30400681495666504,
+      "learning_rate": 6.794871794871795e-05,
+      "loss": 0.4667,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.294740229845047,
+      "learning_rate": 6.730769230769232e-05,
+      "loss": 0.4955,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.2467174381017685,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.3151,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.27507495880126953,
+      "learning_rate": 6.602564102564102e-05,
+      "loss": 0.4123,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.2465631365776062,
+      "learning_rate": 6.538461538461539e-05,
+      "loss": 0.3117,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3061293363571167,
+      "learning_rate": 6.474358974358975e-05,
+      "loss": 0.5389,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.2974298596382141,
+      "learning_rate": 6.410256410256412e-05,
+      "loss": 0.4471,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.2876061797142029,
+      "learning_rate": 6.346153846153847e-05,
+      "loss": 0.4253,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.29899370670318604,
+      "learning_rate": 6.282051282051282e-05,
+      "loss": 0.4474,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.30990898609161377,
+      "learning_rate": 6.217948717948718e-05,
+      "loss": 0.3905,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.29554882645606995,
+      "learning_rate": 6.153846153846155e-05,
+      "loss": 0.4147,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.31249329447746277,
+      "learning_rate": 6.089743589743589e-05,
+      "loss": 0.4023,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.32120752334594727,
+      "learning_rate": 6.025641025641026e-05,
+      "loss": 0.4418,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.311110258102417,
+      "learning_rate": 5.9615384615384616e-05,
+      "loss": 0.463,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2687552571296692,
+      "learning_rate": 5.897435897435898e-05,
+      "loss": 0.3984,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.30315181612968445,
+      "learning_rate": 5.833333333333334e-05,
+      "loss": 0.5084,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.28704944252967834,
+      "learning_rate": 5.769230769230769e-05,
+      "loss": 0.4327,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.2689530551433563,
+      "learning_rate": 5.705128205128205e-05,
+      "loss": 0.3964,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.321878582239151,
+      "learning_rate": 5.6410256410256414e-05,
+      "loss": 0.4681,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.30374661087989807,
+      "learning_rate": 5.576923076923077e-05,
+      "loss": 0.4465,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.29247525334358215,
+      "learning_rate": 5.512820512820514e-05,
+      "loss": 0.4501,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.274850994348526,
+      "learning_rate": 5.448717948717948e-05,
+      "loss": 0.3943,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.24561376869678497,
+      "learning_rate": 5.384615384615385e-05,
+      "loss": 0.3225,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.28236258029937744,
+      "learning_rate": 5.3205128205128205e-05,
+      "loss": 0.3247,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.29767942428588867,
+      "learning_rate": 5.256410256410257e-05,
+      "loss": 0.4514,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3310510516166687,
+      "learning_rate": 5.192307692307693e-05,
+      "loss": 0.4417,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.28120821714401245,
+      "learning_rate": 5.128205128205128e-05,
+      "loss": 0.3511,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.38378533720970154,
+      "learning_rate": 5.0641025641025644e-05,
+      "loss": 0.5135,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3125383257865906,
+      "learning_rate": 5e-05,
+      "loss": 0.4614,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.2549707591533661,
+      "learning_rate": 4.935897435897436e-05,
+      "loss": 0.3333,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.30307722091674805,
+      "learning_rate": 4.871794871794872e-05,
+      "loss": 0.4185,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.2714983820915222,
+      "learning_rate": 4.8076923076923084e-05,
+      "loss": 0.3744,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.29321545362472534,
+      "learning_rate": 4.7435897435897435e-05,
+      "loss": 0.4177,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3908475637435913,
+      "learning_rate": 4.67948717948718e-05,
+      "loss": 0.5391,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.2907383143901825,
+      "learning_rate": 4.615384615384616e-05,
+      "loss": 0.4457,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.32630524039268494,
+      "learning_rate": 4.5512820512820516e-05,
+      "loss": 0.4652,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.33661359548568726,
+      "learning_rate": 4.4871794871794874e-05,
+      "loss": 0.4093,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.27634692192077637,
+      "learning_rate": 4.423076923076923e-05,
+      "loss": 0.3958,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.40667521953582764,
+      "learning_rate": 4.358974358974359e-05,
+      "loss": 0.4525,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.30798301100730896,
+      "learning_rate": 4.294871794871795e-05,
+      "loss": 0.4243,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3315953016281128,
+      "learning_rate": 4.230769230769231e-05,
+      "loss": 0.4833,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.29299864172935486,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 0.3727,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3195595443248749,
+      "learning_rate": 4.1025641025641023e-05,
+      "loss": 0.4243,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.28434109687805176,
+      "learning_rate": 4.038461538461539e-05,
+      "loss": 0.4001,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.28316378593444824,
+      "learning_rate": 3.974358974358974e-05,
+      "loss": 0.3611,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.2709490656852722,
+      "learning_rate": 3.9102564102564105e-05,
+      "loss": 0.337,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.2810059189796448,
+      "learning_rate": 3.846153846153846e-05,
+      "loss": 0.3383,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.31759873032569885,
+      "learning_rate": 3.782051282051282e-05,
+      "loss": 0.4404,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.27550122141838074,
+      "learning_rate": 3.717948717948718e-05,
+      "loss": 0.3898,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.29137155413627625,
+      "learning_rate": 3.653846153846154e-05,
+      "loss": 0.3887,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.30342838168144226,
+      "learning_rate": 3.58974358974359e-05,
+      "loss": 0.3747,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3385205566883087,
+      "learning_rate": 3.525641025641026e-05,
+      "loss": 0.4591,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.28427594900131226,
+      "learning_rate": 3.461538461538462e-05,
+      "loss": 0.3955,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3554467558860779,
+      "learning_rate": 3.397435897435898e-05,
+      "loss": 0.4754,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2872997224330902,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.3336,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.32470715045928955,
+      "learning_rate": 3.269230769230769e-05,
+      "loss": 0.5002,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.2823026180267334,
+      "learning_rate": 3.205128205128206e-05,
+      "loss": 0.4217,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.27532005310058594,
+      "learning_rate": 3.141025641025641e-05,
+      "loss": 0.3505,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.30065861344337463,
+      "learning_rate": 3.0769230769230774e-05,
+      "loss": 0.385,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3140447437763214,
+      "learning_rate": 3.012820512820513e-05,
+      "loss": 0.4599,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.30165910720825195,
+      "learning_rate": 2.948717948717949e-05,
+      "loss": 0.4027,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3036094307899475,
+      "learning_rate": 2.8846153846153845e-05,
+      "loss": 0.406,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.31652310490608215,
+      "learning_rate": 2.8205128205128207e-05,
+      "loss": 0.4115,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3006727397441864,
+      "learning_rate": 2.756410256410257e-05,
+      "loss": 0.4464,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.34472909569740295,
+      "learning_rate": 2.6923076923076923e-05,
+      "loss": 0.4643,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.2880076766014099,
+      "learning_rate": 2.6282051282051285e-05,
+      "loss": 0.3675,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.34488052129745483,
+      "learning_rate": 2.564102564102564e-05,
+      "loss": 0.4661,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.30622804164886475,
+      "learning_rate": 2.5e-05,
+      "loss": 0.405,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3236006200313568,
+      "learning_rate": 2.435897435897436e-05,
+      "loss": 0.4426,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3554222285747528,
+      "learning_rate": 2.3717948717948718e-05,
+      "loss": 0.4809,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.34005558490753174,
+      "learning_rate": 2.307692307692308e-05,
+      "loss": 0.417,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.30246976017951965,
+      "learning_rate": 2.2435897435897437e-05,
+      "loss": 0.4264,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3341597616672516,
+      "learning_rate": 2.1794871794871795e-05,
+      "loss": 0.4871,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.349656343460083,
+      "learning_rate": 2.1153846153846154e-05,
+      "loss": 0.5549,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.32281672954559326,
+      "learning_rate": 2.0512820512820512e-05,
+      "loss": 0.3897,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.34751957654953003,
+      "learning_rate": 1.987179487179487e-05,
+      "loss": 0.429,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.30903372168540955,
+      "learning_rate": 1.923076923076923e-05,
+      "loss": 0.3939,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.29195931553840637,
+      "learning_rate": 1.858974358974359e-05,
+      "loss": 0.3129,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3270445168018341,
+      "learning_rate": 1.794871794871795e-05,
+      "loss": 0.4435,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3003033995628357,
+      "learning_rate": 1.730769230769231e-05,
+      "loss": 0.3446,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.35107478499412537,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 0.5102,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.28601357340812683,
+      "learning_rate": 1.602564102564103e-05,
+      "loss": 0.3619,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2772061228752136,
+      "learning_rate": 1.5384615384615387e-05,
+      "loss": 0.3428,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3195333182811737,
+      "learning_rate": 1.4743589743589745e-05,
+      "loss": 0.4477,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.2672041654586792,
+      "learning_rate": 1.4102564102564104e-05,
+      "loss": 0.2811,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.36466383934020996,
+      "learning_rate": 1.3461538461538462e-05,
+      "loss": 0.5833,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.31829240918159485,
+      "learning_rate": 1.282051282051282e-05,
+      "loss": 0.4144,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3314779996871948,
+      "learning_rate": 1.217948717948718e-05,
+      "loss": 0.458,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.31584641337394714,
+      "learning_rate": 1.153846153846154e-05,
+      "loss": 0.4482,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2965344190597534,
+      "learning_rate": 1.0897435897435898e-05,
+      "loss": 0.3293,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.2547074556350708,
+      "learning_rate": 1.0256410256410256e-05,
+      "loss": 0.3007,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3165215849876404,
+      "learning_rate": 9.615384615384616e-06,
+      "loss": 0.4053,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.29612070322036743,
+      "learning_rate": 8.974358974358976e-06,
+      "loss": 0.4188,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3118067681789398,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 0.4429,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.29589030146598816,
+      "learning_rate": 7.692307692307694e-06,
+      "loss": 0.3348,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3114546537399292,
+      "learning_rate": 7.051282051282052e-06,
+      "loss": 0.3836,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.320286363363266,
+      "learning_rate": 6.41025641025641e-06,
+      "loss": 0.4182,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.30404043197631836,
+      "learning_rate": 5.76923076923077e-06,
+      "loss": 0.3908,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.32467764616012573,
+      "learning_rate": 5.128205128205128e-06,
+      "loss": 0.4187,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3350898325443268,
+      "learning_rate": 4.487179487179488e-06,
+      "loss": 0.4535,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3279955983161926,
+      "learning_rate": 3.846153846153847e-06,
+      "loss": 0.4286,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.30812716484069824,
+      "learning_rate": 3.205128205128205e-06,
+      "loss": 0.3714,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3073851466178894,
+      "learning_rate": 2.564102564102564e-06,
+      "loss": 0.3603,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.29078444838523865,
+      "learning_rate": 1.9230769230769234e-06,
+      "loss": 0.3935,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3246036171913147,
+      "learning_rate": 1.282051282051282e-06,
+      "loss": 0.4224,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3184387683868408,
+      "learning_rate": 6.41025641025641e-07,
+      "loss": 0.4499,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.37150660157203674,
+      "learning_rate": 0.0,
+      "loss": 0.4383,
+      "step": 312
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.075558770887885e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

Stonk_Training_SFT/checkpoint-312/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46e61b3ea2344ac46fb729121ab3faf1d3cbf202ba9ecb6f78320571a83fec82
+size 5432

Stonk_Training_SFT/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

Stonk_Training_SFT/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Stonk_Training_SFT/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e667a7d18d94098aefa2473386a7a3e456dff729cdf04a1f060f32b0d8b8fe7
+size 11422176

Stonk_Training_SFT/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

Stonk_Training_SFT/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff