Campis committed on Jul 25, 2025

Commit

d7a037d

verified ·

1 Parent(s): fe3a9a3

Upload 18 files

Browse files

Files changed (19) hide show

.gitattributes +1 -0
README.md +61 -0
adapter_config.json +38 -0
adapter_model.safetensors +3 -0
all_results.json +13 -0
chat_template.jinja +93 -0
eval_results.json +8 -0
llamaboard_config.yaml +86 -0
running_log.txt +635 -0
special_tokens_map.json +26 -0
tokenizer.json +3 -0
tokenizer_config.json +2068 -0
train_results.json +9 -0
trainer_log.jsonl +79 -0
trainer_state.json +821 -0
training_args.bin +3 -0
training_args.yaml +44 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+library_name: peft
+license: other
+base_model: meta-llama/Llama-3.2-1B-Instruct
+tags:
+- llama-factory
+- lora
+- generated_from_trainer
+model-index:
+- name: train_1B-Instruct_pippo_v6
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# train_1B-Instruct_pippo_v6
+This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the pipo_persona dataset.
+It achieves the following results on the evaluation set:
+- Loss: 3.6431
+- Num Input Tokens Seen: 300000
+## Model description
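+This repository contains a LoRA adapter (rank 24, alpha 48, dropout 0.05; targeting q_proj, v_proj, o_proj, gate_proj, up_proj and down_proj) for meta-llama/Llama-3.2-1B-Instruct, trained with LLaMA-Factory as a continuation of the earlier train_1B-Instruct_pippo_v4 adapter. More information about the persona and the intended behaviour is still needed.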
+## Intended uses & limitations
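+Intended uses have not been documented yet (more information needed). Known limitation: the final evaluation loss (3.6431) is far above the average training loss (0.7933), which suggests the adapter overfits its training set; validate outputs on your own data before relying on it.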
+## Training and evaluation data
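+Trained on the pipo_persona dataset (file pippo_dataset_v02_1900_lines.json), capped at 1,900 samples with a 0.2 validation split: 1,490 training examples and 373 evaluation examples, using a cutoff length of 1,024 tokens. More information about the dataset contents is still needed.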
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 16
+- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: cosine
+- num_epochs: 4.0
+### Training results
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.6.0+cu124
+- Datasets 3.6.0
+- Tokenizers 0.21.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 48,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 24,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a009cfb702a3432cece99f7fdd21a929e3c5bccba7e2ae9b332f7c161eb874c8
+size 63726400

all_results.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "epoch": 4.0,
+    "eval_loss": 3.643085241317749,
+    "eval_runtime": 12.9097,
+    "eval_samples_per_second": 28.893,
+    "eval_steps_per_second": 14.485,
+    "num_input_tokens_seen": 300000,
+    "total_flos": 1780328448000000.0,
+    "train_loss": 0.7932832451101314,
+    "train_runtime": 895.9422,
+    "train_samples_per_second": 6.652,
+    "train_steps_per_second": 0.42
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,93 @@

+{#- Chat template: emits bos_token, a system turn, an optional tool-definition user turn, then every remaining message wrapped in start_header_id / end_header_id / eot_id markers. #}
+{{- bos_token }}
+{#- Optional caller inputs: custom_tools (when defined) replaces tools; tools_in_user_message defaults to true; date_string falls back to strftime_now or a fixed date. #}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- if strftime_now is defined %}
+        {%- set date_string = strftime_now("%d %b %Y") %}
+    {%- else %}
+        {%- set date_string = "26 Jul 2024" %}
+    {%- endif %}
+{%- endif %}
+{#- Normalise a missing tools variable to none so the later checks can test "is not none". #}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{#- Tool definitions are written into the system turn only when tools_in_user_message is false. #}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+    {#- The endif on the next line closes the "messages | length != 0" check, despite its column-0 indentation. #}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{#- Render the remaining conversation turns: plain messages, assistant tool calls, then tool/ipython results. #}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {#- Exactly one tool call per message is accepted; any other count raises. #}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+        {{- '{"name": "' + tool_call.name + '", ' }}
+        {{- '"parameters": ' }}
+        {{- tool_call.arguments | tojson }}
+        {{- "}" }}
+        {{- "<|eot_id|>" }}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {#- NOTE(review): the iterable test is presumably also true for plain strings, so string content would be JSON-encoded as well; confirm this is intended. #}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{#- Open an assistant turn for generation when add_generation_prompt is set. #}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.0,
+    "eval_loss": 3.643085241317749,
+    "eval_runtime": 12.9097,
+    "eval_samples_per_second": 28.893,
+    "eval_steps_per_second": 14.485,
+    "num_input_tokens_seen": 300000
+}

llamaboard_config.yaml ADDED Viewed

	@@ -0,0 +1,86 @@

+top.booster: auto
+top.checkpoint_path:
+- train_1B-Instruct_pippo_v4
+top.finetuning_type: lora
+top.model_name: Llama-3.2-1B-Instruct
+top.quantization_bit: none
+top.quantization_method: bnb
+top.rope_scaling: none
+top.template: llama3
+train.additional_target: ''
+train.apollo_rank: 16
+train.apollo_scale: 32
+train.apollo_target: all
+train.apollo_update_interval: 200
+train.badam_mode: layer
+train.badam_switch_interval: 50
+train.badam_switch_mode: ascending
+train.badam_update_ratio: 0.05
+train.batch_size: 2
+train.compute_type: bf16
+train.create_new_adapter: false
+train.cutoff_len: 1024
+train.dataset:
+- pipo_persona
+train.dataset_dir: data
+train.ds_offload: false
+train.ds_stage: none
+train.enable_thinking: true
+train.extra_args: '{"optim": "adamw_torch"}'
+train.freeze_extra_modules: ''
+train.freeze_language_model: false
+train.freeze_multi_modal_projector: true
+train.freeze_trainable_layers: 2
+train.freeze_trainable_modules: all
+train.freeze_vision_tower: true
+train.galore_rank: 16
+train.galore_scale: 2
+train.galore_target: all
+train.galore_update_interval: 200
+train.gradient_accumulation_steps: 8
+train.image_max_pixels: 768*768
+train.image_min_pixels: 32*32
+train.learning_rate: 2e-5
+train.logging_steps: 5
+train.lora_alpha: 32
+train.lora_dropout: 0.1
+train.lora_rank: 16
+train.lora_target: q_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
+train.loraplus_lr_ratio: 8
+train.lr_scheduler_type: cosine
+train.mask_history: false
+train.max_grad_norm: '1.0'
+train.max_samples: '1900'
+train.neat_packing: false
+train.neftune_alpha: 0
+train.num_train_epochs: '4'
+train.packing: false
+train.ppo_score_norm: false
+train.ppo_whiten_rewards: false
+train.pref_beta: 0.1
+train.pref_ftx: 0
+train.pref_loss: sigmoid
+train.report_to: none
+train.resize_vocab: false
+train.reward_model: []
+train.save_steps: 100
+train.swanlab_api_key: ''
+train.swanlab_link: ''
+train.swanlab_mode: cloud
+train.swanlab_project: llamafactory
+train.swanlab_run_name: ''
+train.swanlab_workspace: ''
+train.train_on_prompt: false
+train.training_stage: Supervised Fine-Tuning
+train.use_apollo: false
+train.use_badam: false
+train.use_dora: true
+train.use_galore: false
+train.use_llama_pro: false
+train.use_pissa: true
+train.use_rslora: true
+train.use_swanlab: false
+train.val_size: 0.2
+train.video_max_pixels: 256*256
+train.video_min_pixels: 16*16
+train.warmup_steps: 0

running_log.txt ADDED Viewed

	@@ -0,0 +1,635 @@

+[INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
+[INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
+[INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
+[INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
+[INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
+[INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
+[INFO|2025-07-24 16:02:16] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|2025-07-24 16:02:17] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:02:17] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
+[INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
+[INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
+[INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
+[INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
+[INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
+[INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|2025-07-24 16:02:17] logging.py:143 >> Add pad token: <|eot_id|>
+[INFO|2025-07-24 16:02:17] logging.py:143 >> Add <|eom_id|> to stop words.
+[INFO|2025-07-24 16:02:17] logging.py:143 >> Loading dataset pippo_dataset_v02_1900_lines.json...
+[INFO|2025-07-24 16:02:24] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:02:24] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:02:24] logging.py:143 >> KV cache is disabled during training.
+[INFO|2025-07-24 16:03:23] modeling_utils.py:1151 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/model.safetensors
+[INFO|2025-07-24 16:03:23] modeling_utils.py:2241 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
+[INFO|2025-07-24 16:03:23] configuration_utils.py:1135 >> Generate config GenerationConfig {
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "use_cache": false
+}
+[INFO|2025-07-24 16:03:25] modeling_utils.py:5131 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
+[INFO|2025-07-24 16:03:25] modeling_utils.py:5139 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
+[INFO|2025-07-24 16:03:25] configuration_utils.py:1090 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/generation_config.json
+[INFO|2025-07-24 16:03:25] configuration_utils.py:1135 >> Generate config GenerationConfig {
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "temperature": 0.6,
+  "top_p": 0.9
+}
+[INFO|2025-07-24 16:03:25] logging.py:143 >> Gradient checkpointing enabled.
+[INFO|2025-07-24 16:03:25] logging.py:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-07-24 16:03:25] logging.py:143 >> Upcasting trainable params to float32.
+[INFO|2025-07-24 16:03:25] logging.py:143 >> Fine-tuning method: DoRA
+[INFO|2025-07-24 16:03:27] logging.py:143 >> Loaded adapter(s): saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v4
+[INFO|2025-07-24 16:03:27] logging.py:143 >> trainable params: 15,925,248 || all params: 1,251,739,648 || trainable%: 1.2722
+[INFO|2025-07-24 16:03:27] trainer.py:756 >> Using auto half precision backend
+[INFO|2025-07-24 16:03:27] logging.py:143 >> Using LoRA+ optimizer with loraplus lr ratio 8.00.
+[INFO|2025-07-24 16:03:28] trainer.py:2409 >> ***** Running training *****
+[INFO|2025-07-24 16:03:28] trainer.py:2410 >>   Num examples = 1,490
+[INFO|2025-07-24 16:03:28] trainer.py:2411 >>   Num Epochs = 4
+[INFO|2025-07-24 16:03:28] trainer.py:2412 >>   Instantaneous batch size per device = 2
+[INFO|2025-07-24 16:03:28] trainer.py:2415 >>   Total train batch size (w. parallel, distributed & accumulation) = 16
+[INFO|2025-07-24 16:03:28] trainer.py:2416 >>   Gradient Accumulation steps = 8
+[INFO|2025-07-24 16:03:28] trainer.py:2417 >>   Total optimization steps = 376
+[INFO|2025-07-24 16:03:28] trainer.py:2418 >>   Number of trainable parameters = 15,925,248
+[INFO|2025-07-24 16:03:28] logging.py:143 >> Initial PiSSA adapter will be saved at: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/pissa_init.
+[INFO|2025-07-24 16:03:28] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:03:28] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:03:43] logging.py:143 >> {'loss': 1.1748, 'learning_rate': 1.9994e-05, 'epoch': 0.05, 'throughput': 263.90}
+[INFO|2025-07-24 16:03:54] logging.py:143 >> {'loss': 1.2634, 'learning_rate': 1.9972e-05, 'epoch': 0.11, 'throughput': 304.64}
+[INFO|2025-07-24 16:04:06] logging.py:143 >> {'loss': 1.4367, 'learning_rate': 1.9932e-05, 'epoch': 0.16, 'throughput': 315.40}
+[INFO|2025-07-24 16:04:18] logging.py:143 >> {'loss': 1.3167, 'learning_rate': 1.9874e-05, 'epoch': 0.21, 'throughput': 323.64}
+[INFO|2025-07-24 16:04:30] logging.py:143 >> {'loss': 1.4144, 'learning_rate': 1.9800e-05, 'epoch': 0.27, 'throughput': 325.70}
+[INFO|2025-07-24 16:04:41] logging.py:143 >> {'loss': 1.3944, 'learning_rate': 1.9708e-05, 'epoch': 0.32, 'throughput': 331.60}
+[INFO|2025-07-24 16:04:52] logging.py:143 >> {'loss': 1.3946, 'learning_rate': 1.9599e-05, 'epoch': 0.38, 'throughput': 334.75}
+[INFO|2025-07-24 16:05:04] logging.py:143 >> {'loss': 1.3827, 'learning_rate': 1.9474e-05, 'epoch': 0.43, 'throughput': 336.48}
+[INFO|2025-07-24 16:05:16] logging.py:143 >> {'loss': 1.4509, 'learning_rate': 1.9332e-05, 'epoch': 0.48, 'throughput': 336.58}
+[INFO|2025-07-24 16:05:29] logging.py:143 >> {'loss': 1.3347, 'learning_rate': 1.9174e-05, 'epoch': 0.54, 'throughput': 335.59}
+[INFO|2025-07-24 16:05:41] logging.py:143 >> {'loss': 1.3834, 'learning_rate': 1.8999e-05, 'epoch': 0.59, 'throughput': 335.37}
+[INFO|2025-07-24 16:05:53] logging.py:143 >> {'loss': 1.4679, 'learning_rate': 1.8809e-05, 'epoch': 0.64, 'throughput': 335.60}
+[INFO|2025-07-24 16:06:04] logging.py:143 >> {'loss': 1.3973, 'learning_rate': 1.8604e-05, 'epoch': 0.70, 'throughput': 336.68}
+[INFO|2025-07-24 16:06:17] logging.py:143 >> {'loss': 1.5408, 'learning_rate': 1.8384e-05, 'epoch': 0.75, 'throughput': 335.03}
+[INFO|2025-07-24 16:06:28] logging.py:143 >> {'loss': 1.4967, 'learning_rate': 1.8149e-05, 'epoch': 0.81, 'throughput': 335.55}
+[INFO|2025-07-24 16:06:40] logging.py:143 >> {'loss': 1.4644, 'learning_rate': 1.7900e-05, 'epoch': 0.86, 'throughput': 336.30}
+[INFO|2025-07-24 16:06:52] logging.py:143 >> {'loss': 1.4226, 'learning_rate': 1.7637e-05, 'epoch': 0.91, 'throughput': 336.02}
+[INFO|2025-07-24 16:07:03] logging.py:143 >> {'loss': 1.4847, 'learning_rate': 1.7360e-05, 'epoch': 0.97, 'throughput': 336.26}
+[INFO|2025-07-24 16:07:13] logging.py:143 >> {'loss': 1.2927, 'learning_rate': 1.7071e-05, 'epoch': 1.01, 'throughput': 336.91}
+[INFO|2025-07-24 16:07:23] logging.py:143 >> {'loss': 0.8505, 'learning_rate': 1.6770e-05, 'epoch': 1.06, 'throughput': 338.48}
+[INFO|2025-07-24 16:07:23] trainer.py:4327 >>
+***** Running Evaluation *****
+[INFO|2025-07-24 16:07:23] trainer.py:4329 >>   Num examples = 373
+[INFO|2025-07-24 16:07:23] trainer.py:4332 >>   Batch size = 2
+[INFO|2025-07-24 16:07:37] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100
+[INFO|2025-07-24 16:07:37] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:07:37] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:07:38] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100/chat_template.jinja
+[INFO|2025-07-24 16:07:38] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100/tokenizer_config.json
+[INFO|2025-07-24 16:07:38] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100/special_tokens_map.json
+[INFO|2025-07-24 16:07:51] logging.py:143 >> {'loss': 0.7497, 'learning_rate': 1.6456e-05, 'epoch': 1.12, 'throughput': 318.10}
+[INFO|2025-07-24 16:08:02] logging.py:143 >> {'loss': 0.7761, 'learning_rate': 1.6132e-05, 'epoch': 1.17, 'throughput': 319.79}
+[INFO|2025-07-24 16:08:14] logging.py:143 >> {'loss': 0.7997, 'learning_rate': 1.5796e-05, 'epoch': 1.23, 'throughput': 321.09}
+[INFO|2025-07-24 16:08:25] logging.py:143 >> {'loss': 0.8275, 'learning_rate': 1.5451e-05, 'epoch': 1.28, 'throughput': 322.08}
+[INFO|2025-07-24 16:08:37] logging.py:143 >> {'loss': 0.8599, 'learning_rate': 1.5096e-05, 'epoch': 1.33, 'throughput': 322.83}
+[INFO|2025-07-24 16:08:49] logging.py:143 >> {'loss': 0.9179, 'learning_rate': 1.4732e-05, 'epoch': 1.39, 'throughput': 323.42}
+[INFO|2025-07-24 16:09:00] logging.py:143 >> {'loss': 0.8255, 'learning_rate': 1.4360e-05, 'epoch': 1.44, 'throughput': 324.25}
+[INFO|2025-07-24 16:09:11] logging.py:143 >> {'loss': 0.8806, 'learning_rate': 1.3981e-05, 'epoch': 1.49, 'throughput': 325.46}
+[INFO|2025-07-24 16:09:23] logging.py:143 >> {'loss': 0.8366, 'learning_rate': 1.3594e-05, 'epoch': 1.55, 'throughput': 326.15}
+[INFO|2025-07-24 16:09:34] logging.py:143 >> {'loss': 0.8567, 'learning_rate': 1.3201e-05, 'epoch': 1.60, 'throughput': 327.21}
+[INFO|2025-07-24 16:09:45] logging.py:143 >> {'loss': 0.8939, 'learning_rate': 1.2803e-05, 'epoch': 1.66, 'throughput': 328.25}
+[INFO|2025-07-24 16:09:56] logging.py:143 >> {'loss': 0.9650, 'learning_rate': 1.2399e-05, 'epoch': 1.71, 'throughput': 329.28}
+[INFO|2025-07-24 16:10:07] logging.py:143 >> {'loss': 0.8915, 'learning_rate': 1.1992e-05, 'epoch': 1.76, 'throughput': 330.41}
+[INFO|2025-07-24 16:10:18] logging.py:143 >> {'loss': 0.9085, 'learning_rate': 1.1581e-05, 'epoch': 1.82, 'throughput': 331.65}
+[INFO|2025-07-24 16:10:29] logging.py:143 >> {'loss': 0.8831, 'learning_rate': 1.1167e-05, 'epoch': 1.87, 'throughput': 332.58}
+[INFO|2025-07-24 16:10:41] logging.py:143 >> {'loss': 1.0271, 'learning_rate': 1.0751e-05, 'epoch': 1.92, 'throughput': 332.56}
+[INFO|2025-07-24 16:10:52] logging.py:143 >> {'loss': 0.9636, 'learning_rate': 1.0334e-05, 'epoch': 1.98, 'throughput': 333.29}
+[INFO|2025-07-24 16:11:02] logging.py:143 >> {'loss': 0.7085, 'learning_rate': 9.9164e-06, 'epoch': 2.02, 'throughput': 333.30}
+[INFO|2025-07-24 16:11:13] logging.py:143 >> {'loss': 0.5186, 'learning_rate': 9.4989e-06, 'epoch': 2.08, 'throughput': 333.84}
+[INFO|2025-07-24 16:11:25] logging.py:143 >> {'loss': 0.5023, 'learning_rate': 9.0822e-06, 'epoch': 2.13, 'throughput': 334.41}
+[INFO|2025-07-24 16:11:25] trainer.py:4327 >>
+***** Running Evaluation *****
+[INFO|2025-07-24 16:11:25] trainer.py:4329 >>   Num examples = 373
+[INFO|2025-07-24 16:11:25] trainer.py:4332 >>   Batch size = 2
+[INFO|2025-07-24 16:11:38] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200
+[INFO|2025-07-24 16:11:39] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:11:39] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:11:39] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200/chat_template.jinja
+[INFO|2025-07-24 16:11:40] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200/tokenizer_config.json
+[INFO|2025-07-24 16:11:40] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200/special_tokens_map.json
+[INFO|2025-07-24 16:11:53] logging.py:143 >> {'loss': 0.5145, 'learning_rate': 8.6671e-06, 'epoch': 2.18, 'throughput': 323.15}
+[INFO|2025-07-24 16:12:05] logging.py:143 >> {'loss': 0.4971, 'learning_rate': 8.2544e-06, 'epoch': 2.24, 'throughput': 323.77}
+[INFO|2025-07-24 16:12:16] logging.py:143 >> {'loss': 0.5042, 'learning_rate': 7.8447e-06, 'epoch': 2.29, 'throughput': 324.87}
+[INFO|2025-07-24 16:12:27] logging.py:143 >> {'loss': 0.5423, 'learning_rate': 7.4387e-06, 'epoch': 2.34, 'throughput': 325.53}
+[INFO|2025-07-24 16:12:39] logging.py:143 >> {'loss': 0.5913, 'learning_rate': 7.0372e-06, 'epoch': 2.40, 'throughput': 325.95}
+[INFO|2025-07-24 16:12:51] logging.py:143 >> {'loss': 0.5403, 'learning_rate': 6.6409e-06, 'epoch': 2.45, 'throughput': 326.13}
+[INFO|2025-07-24 16:13:02] logging.py:143 >> {'loss': 0.5301, 'learning_rate': 6.2505e-06, 'epoch': 2.50, 'throughput': 326.79}
+[INFO|2025-07-24 16:13:13] logging.py:143 >> {'loss': 0.5417, 'learning_rate': 5.8666e-06, 'epoch': 2.56, 'throughput': 327.37}
+[INFO|2025-07-24 16:13:25] logging.py:143 >> {'loss': 0.5358, 'learning_rate': 5.4899e-06, 'epoch': 2.61, 'throughput': 327.79}
+[INFO|2025-07-24 16:13:37] logging.py:143 >> {'loss': 0.5249, 'learning_rate': 5.1211e-06, 'epoch': 2.67, 'throughput': 328.11}
+[INFO|2025-07-24 16:13:48] logging.py:143 >> {'loss': 0.5134, 'learning_rate': 4.7608e-06, 'epoch': 2.72, 'throughput': 328.62}
+[INFO|2025-07-24 16:14:00] logging.py:143 >> {'loss': 0.5160, 'learning_rate': 4.4096e-06, 'epoch': 2.77, 'throughput': 328.73}
+[INFO|2025-07-24 16:14:11] logging.py:143 >> {'loss': 0.5611, 'learning_rate': 4.0682e-06, 'epoch': 2.83, 'throughput': 329.02}
+[INFO|2025-07-24 16:14:22] logging.py:143 >> {'loss': 0.5084, 'learning_rate': 3.7371e-06, 'epoch': 2.88, 'throughput': 329.85}
+[INFO|2025-07-24 16:14:33] logging.py:143 >> {'loss': 0.5458, 'learning_rate': 3.4170e-06, 'epoch': 2.93, 'throughput': 330.35}
+[INFO|2025-07-24 16:14:43] logging.py:143 >> {'loss': 0.5205, 'learning_rate': 3.1084e-06, 'epoch': 2.99, 'throughput': 331.42}
+[INFO|2025-07-24 16:14:52] logging.py:143 >> {'loss': 0.4058, 'learning_rate': 2.8118e-06, 'epoch': 3.03, 'throughput': 332.17}
+[INFO|2025-07-24 16:15:04] logging.py:143 >> {'loss': 0.3859, 'learning_rate': 2.5277e-06, 'epoch': 3.09, 'throughput': 332.42}
+[INFO|2025-07-24 16:15:14] logging.py:143 >> {'loss': 0.3724, 'learning_rate': 2.2567e-06, 'epoch': 3.14, 'throughput': 333.25}
+[INFO|2025-07-24 16:15:25] logging.py:143 >> {'loss': 0.3916, 'learning_rate': 1.9991e-06, 'epoch': 3.19, 'throughput': 333.86}
+[INFO|2025-07-24 16:15:25] trainer.py:4327 >>
+***** Running Evaluation *****
+[INFO|2025-07-24 16:15:25] trainer.py:4329 >>   Num examples = 373
+[INFO|2025-07-24 16:15:25] trainer.py:4332 >>   Batch size = 2
+[INFO|2025-07-24 16:15:37] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300
+[INFO|2025-07-24 16:15:38] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:15:38] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:15:38] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300/chat_template.jinja
+[INFO|2025-07-24 16:15:39] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300/tokenizer_config.json
+[INFO|2025-07-24 16:15:39] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300/special_tokens_map.json
+[INFO|2025-07-24 16:15:51] logging.py:143 >> {'loss': 0.3676, 'learning_rate': 1.7556e-06, 'epoch': 3.25, 'throughput': 327.35}
+[INFO|2025-07-24 16:16:03] logging.py:143 >> {'loss': 0.3798, 'learning_rate': 1.5264e-06, 'epoch': 3.30, 'throughput': 327.69}
+[INFO|2025-07-24 16:16:13] logging.py:143 >> {'loss': 0.3606, 'learning_rate': 1.3120e-06, 'epoch': 3.35, 'throughput': 328.40}
+[INFO|2025-07-24 16:16:24] logging.py:143 >> {'loss': 0.3938, 'learning_rate': 1.1128e-06, 'epoch': 3.41, 'throughput': 329.00}
+[INFO|2025-07-24 16:16:35] logging.py:143 >> {'loss': 0.3605, 'learning_rate': 9.2909e-07, 'epoch': 3.46, 'throughput': 329.68}
+[INFO|2025-07-24 16:16:45] logging.py:143 >> {'loss': 0.3781, 'learning_rate': 7.6120e-07, 'epoch': 3.52, 'throughput': 330.66}
+[INFO|2025-07-24 16:16:55] logging.py:143 >> {'loss': 0.3794, 'learning_rate': 6.0944e-07, 'epoch': 3.57, 'throughput': 331.22}
+[INFO|2025-07-24 16:17:06] logging.py:143 >> {'loss': 0.3526, 'learning_rate': 4.7406e-07, 'epoch': 3.62, 'throughput': 332.12}
+[INFO|2025-07-24 16:17:16] logging.py:143 >> {'loss': 0.3846, 'learning_rate': 3.5531e-07, 'epoch': 3.68, 'throughput': 332.56}
+[INFO|2025-07-24 16:17:27] logging.py:143 >> {'loss': 0.3855, 'learning_rate': 2.5338e-07, 'epoch': 3.73, 'throughput': 333.19}
+[INFO|2025-07-24 16:17:37] logging.py:143 >> {'loss': 0.3955, 'learning_rate': 1.6847e-07, 'epoch': 3.78, 'throughput': 333.69}
+[INFO|2025-07-24 16:17:48] logging.py:143 >> {'loss': 0.3400, 'learning_rate': 1.0071e-07, 'epoch': 3.84, 'throughput': 334.24}
+[INFO|2025-07-24 16:17:58] logging.py:143 >> {'loss': 0.3584, 'learning_rate': 5.0222e-08, 'epoch': 3.89, 'throughput': 334.96}
+[INFO|2025-07-24 16:18:09] logging.py:143 >> {'loss': 0.3608, 'learning_rate': 1.7099e-08, 'epoch': 3.94, 'throughput': 335.57}
+[INFO|2025-07-24 16:18:20] logging.py:143 >> {'loss': 0.3659, 'learning_rate': 1.3962e-09, 'epoch': 4.00, 'throughput': 336.16}
+[INFO|2025-07-24 16:18:20] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376
+[INFO|2025-07-24 16:18:20] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:18:20] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:18:21] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376/chat_template.jinja
+[INFO|2025-07-24 16:18:21] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376/tokenizer_config.json
+[INFO|2025-07-24 16:18:21] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376/special_tokens_map.json
+[INFO|2025-07-24 16:18:23] trainer.py:2676 >>
+Training completed. Do not forget to share your model on huggingface.co/models =)
+[INFO|2025-07-24 16:18:24] logging.py:143 >> Converted PiSSA adapter will be saved at: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/pissa_converted.
+[INFO|2025-07-24 16:18:24] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:18:24] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:18:25] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:18:25] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:18:26] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:18:26] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:18:28] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6
+[INFO|2025-07-24 16:18:28] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
+[INFO|2025-07-24 16:18:28] configuration_utils.py:770 >> Model config LlamaConfig {
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+[INFO|2025-07-24 16:18:29] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/chat_template.jinja
+[INFO|2025-07-24 16:18:29] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/tokenizer_config.json
+[INFO|2025-07-24 16:18:29] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/special_tokens_map.json
+[WARNING|2025-07-24 16:18:30] logging.py:148 >> No metric eval_accuracy to plot.
+[INFO|2025-07-24 16:18:30] trainer.py:4327 >>
+***** Running Evaluation *****
+[INFO|2025-07-24 16:18:30] trainer.py:4329 >>   Num examples = 373
+[INFO|2025-07-24 16:18:30] trainer.py:4332 >>   Batch size = 2
+[INFO|2025-07-24 16:18:43] modelcard.py:450 >> Dropping the following result as it does not have all the necessary fields:
+{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|eom_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|eot_id|>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,2068 @@

+{
+  "added_tokens_decoder": {
+    "128000": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128002": {
+      "content": "<|reserved_special_token_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128003": {
+      "content": "<|reserved_special_token_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128004": {
+      "content": "<|finetune_right_pad_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128005": {
+      "content": "<|reserved_special_token_2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128006": {
+      "content": "<|start_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128007": {
+      "content": "<|end_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128008": {
+      "content": "<|eom_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "<|eot_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128010": {
+      "content": "<|python_tag|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128011": {
+      "content": "<|reserved_special_token_3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128012": {
+      "content": "<|reserved_special_token_4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128013": {
+      "content": "<|reserved_special_token_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128014": {
+      "content": "<|reserved_special_token_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128015": {
+      "content": "<|reserved_special_token_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128016": {
+      "content": "<|reserved_special_token_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128017": {
+      "content": "<|reserved_special_token_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128018": {
+      "content": "<|reserved_special_token_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128019": {
+      "content": "<|reserved_special_token_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128020": {
+      "content": "<|reserved_special_token_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128021": {
+      "content": "<|reserved_special_token_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128022": {
+      "content": "<|reserved_special_token_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128023": {
+      "content": "<|reserved_special_token_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128024": {
+      "content": "<|reserved_special_token_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128025": {
+      "content": "<|reserved_special_token_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128026": {
+      "content": "<|reserved_special_token_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128027": {
+      "content": "<|reserved_special_token_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128028": {
+      "content": "<|reserved_special_token_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128029": {
+      "content": "<|reserved_special_token_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128030": {
+      "content": "<|reserved_special_token_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128031": {
+      "content": "<|reserved_special_token_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128032": {
+      "content": "<|reserved_special_token_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128033": {
+      "content": "<|reserved_special_token_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128034": {
+      "content": "<|reserved_special_token_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128035": {
+      "content": "<|reserved_special_token_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128036": {
+      "content": "<|reserved_special_token_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128037": {
+      "content": "<|reserved_special_token_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128038": {
+      "content": "<|reserved_special_token_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128039": {
+      "content": "<|reserved_special_token_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128040": {
+      "content": "<|reserved_special_token_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128041": {
+      "content": "<|reserved_special_token_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128042": {
+      "content": "<|reserved_special_token_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128043": {
+      "content": "<|reserved_special_token_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128044": {
+      "content": "<|reserved_special_token_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128045": {
+      "content": "<|reserved_special_token_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128046": {
+      "content": "<|reserved_special_token_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128047": {
+      "content": "<|reserved_special_token_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128048": {
+      "content": "<|reserved_special_token_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128049": {
+      "content": "<|reserved_special_token_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128050": {
+      "content": "<|reserved_special_token_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128051": {
+      "content": "<|reserved_special_token_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128052": {
+      "content": "<|reserved_special_token_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128053": {
+      "content": "<|reserved_special_token_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128054": {
+      "content": "<|reserved_special_token_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128055": {
+      "content": "<|reserved_special_token_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128056": {
+      "content": "<|reserved_special_token_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128057": {
+      "content": "<|reserved_special_token_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128058": {
+      "content": "<|reserved_special_token_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128059": {
+      "content": "<|reserved_special_token_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128060": {
+      "content": "<|reserved_special_token_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128061": {
+      "content": "<|reserved_special_token_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128062": {
+      "content": "<|reserved_special_token_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128063": {
+      "content": "<|reserved_special_token_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128064": {
+      "content": "<|reserved_special_token_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128065": {
+      "content": "<|reserved_special_token_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128066": {
+      "content": "<|reserved_special_token_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128067": {
+      "content": "<|reserved_special_token_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128068": {
+      "content": "<|reserved_special_token_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128069": {
+      "content": "<|reserved_special_token_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128070": {
+      "content": "<|reserved_special_token_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128071": {
+      "content": "<|reserved_special_token_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128072": {
+      "content": "<|reserved_special_token_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128073": {
+      "content": "<|reserved_special_token_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128074": {
+      "content": "<|reserved_special_token_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128075": {
+      "content": "<|reserved_special_token_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128076": {
+      "content": "<|reserved_special_token_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128077": {
+      "content": "<|reserved_special_token_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128078": {
+      "content": "<|reserved_special_token_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128079": {
+      "content": "<|reserved_special_token_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128080": {
+      "content": "<|reserved_special_token_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128081": {
+      "content": "<|reserved_special_token_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128082": {
+      "content": "<|reserved_special_token_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128083": {
+      "content": "<|reserved_special_token_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128084": {
+      "content": "<|reserved_special_token_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128085": {
+      "content": "<|reserved_special_token_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128086": {
+      "content": "<|reserved_special_token_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128087": {
+      "content": "<|reserved_special_token_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128088": {
+      "content": "<|reserved_special_token_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128089": {
+      "content": "<|reserved_special_token_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128090": {
+      "content": "<|reserved_special_token_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128091": {
+      "content": "<|reserved_special_token_83|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128092": {
+      "content": "<|reserved_special_token_84|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128093": {
+      "content": "<|reserved_special_token_85|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128094": {
+      "content": "<|reserved_special_token_86|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128095": {
+      "content": "<|reserved_special_token_87|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128096": {
+      "content": "<|reserved_special_token_88|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128097": {
+      "content": "<|reserved_special_token_89|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128098": {
+      "content": "<|reserved_special_token_90|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128099": {
+      "content": "<|reserved_special_token_91|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128100": {
+      "content": "<|reserved_special_token_92|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128101": {
+      "content": "<|reserved_special_token_93|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128102": {
+      "content": "<|reserved_special_token_94|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128103": {
+      "content": "<|reserved_special_token_95|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128104": {
+      "content": "<|reserved_special_token_96|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128105": {
+      "content": "<|reserved_special_token_97|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128106": {
+      "content": "<|reserved_special_token_98|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128107": {
+      "content": "<|reserved_special_token_99|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128108": {
+      "content": "<|reserved_special_token_100|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128109": {
+      "content": "<|reserved_special_token_101|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128110": {
+      "content": "<|reserved_special_token_102|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128111": {
+      "content": "<|reserved_special_token_103|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128112": {
+      "content": "<|reserved_special_token_104|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128113": {
+      "content": "<|reserved_special_token_105|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128114": {
+      "content": "<|reserved_special_token_106|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128115": {
+      "content": "<|reserved_special_token_107|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128116": {
+      "content": "<|reserved_special_token_108|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128117": {
+      "content": "<|reserved_special_token_109|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128118": {
+      "content": "<|reserved_special_token_110|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128119": {
+      "content": "<|reserved_special_token_111|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128120": {
+      "content": "<|reserved_special_token_112|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128121": {
+      "content": "<|reserved_special_token_113|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128122": {
+      "content": "<|reserved_special_token_114|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128123": {
+      "content": "<|reserved_special_token_115|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128124": {
+      "content": "<|reserved_special_token_116|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128125": {
+      "content": "<|reserved_special_token_117|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128126": {
+      "content": "<|reserved_special_token_118|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128127": {
+      "content": "<|reserved_special_token_119|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128128": {
+      "content": "<|reserved_special_token_120|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128129": {
+      "content": "<|reserved_special_token_121|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128130": {
+      "content": "<|reserved_special_token_122|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128131": {
+      "content": "<|reserved_special_token_123|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128132": {
+      "content": "<|reserved_special_token_124|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128133": {
+      "content": "<|reserved_special_token_125|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128134": {
+      "content": "<|reserved_special_token_126|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128135": {
+      "content": "<|reserved_special_token_127|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128136": {
+      "content": "<|reserved_special_token_128|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128137": {
+      "content": "<|reserved_special_token_129|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128138": {
+      "content": "<|reserved_special_token_130|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128139": {
+      "content": "<|reserved_special_token_131|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128140": {
+      "content": "<|reserved_special_token_132|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128141": {
+      "content": "<|reserved_special_token_133|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128142": {
+      "content": "<|reserved_special_token_134|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128143": {
+      "content": "<|reserved_special_token_135|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128144": {
+      "content": "<|reserved_special_token_136|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128145": {
+      "content": "<|reserved_special_token_137|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128146": {
+      "content": "<|reserved_special_token_138|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128147": {
+      "content": "<|reserved_special_token_139|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128148": {
+      "content": "<|reserved_special_token_140|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128149": {
+      "content": "<|reserved_special_token_141|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128150": {
+      "content": "<|reserved_special_token_142|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128151": {
+      "content": "<|reserved_special_token_143|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128152": {
+      "content": "<|reserved_special_token_144|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128153": {
+      "content": "<|reserved_special_token_145|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128154": {
+      "content": "<|reserved_special_token_146|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128155": {
+      "content": "<|reserved_special_token_147|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128156": {
+      "content": "<|reserved_special_token_148|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128157": {
+      "content": "<|reserved_special_token_149|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128158": {
+      "content": "<|reserved_special_token_150|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128159": {
+      "content": "<|reserved_special_token_151|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128160": {
+      "content": "<|reserved_special_token_152|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128161": {
+      "content": "<|reserved_special_token_153|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128162": {
+      "content": "<|reserved_special_token_154|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128163": {
+      "content": "<|reserved_special_token_155|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128164": {
+      "content": "<|reserved_special_token_156|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128165": {
+      "content": "<|reserved_special_token_157|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128166": {
+      "content": "<|reserved_special_token_158|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128167": {
+      "content": "<|reserved_special_token_159|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128168": {
+      "content": "<|reserved_special_token_160|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128169": {
+      "content": "<|reserved_special_token_161|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128170": {
+      "content": "<|reserved_special_token_162|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128171": {
+      "content": "<|reserved_special_token_163|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128172": {
+      "content": "<|reserved_special_token_164|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128173": {
+      "content": "<|reserved_special_token_165|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128174": {
+      "content": "<|reserved_special_token_166|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128175": {
+      "content": "<|reserved_special_token_167|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128176": {
+      "content": "<|reserved_special_token_168|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128177": {
+      "content": "<|reserved_special_token_169|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128178": {
+      "content": "<|reserved_special_token_170|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128179": {
+      "content": "<|reserved_special_token_171|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128180": {
+      "content": "<|reserved_special_token_172|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128181": {
+      "content": "<|reserved_special_token_173|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128182": {
+      "content": "<|reserved_special_token_174|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128183": {
+      "content": "<|reserved_special_token_175|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128184": {
+      "content": "<|reserved_special_token_176|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128185": {
+      "content": "<|reserved_special_token_177|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128186": {
+      "content": "<|reserved_special_token_178|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128187": {
+      "content": "<|reserved_special_token_179|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128188": {
+      "content": "<|reserved_special_token_180|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128189": {
+      "content": "<|reserved_special_token_181|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128190": {
+      "content": "<|reserved_special_token_182|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128191": {
+      "content": "<|reserved_special_token_183|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128192": {
+      "content": "<|reserved_special_token_184|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128193": {
+      "content": "<|reserved_special_token_185|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128194": {
+      "content": "<|reserved_special_token_186|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128195": {
+      "content": "<|reserved_special_token_187|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128196": {
+      "content": "<|reserved_special_token_188|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128197": {
+      "content": "<|reserved_special_token_189|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128198": {
+      "content": "<|reserved_special_token_190|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128199": {
+      "content": "<|reserved_special_token_191|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128200": {
+      "content": "<|reserved_special_token_192|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128201": {
+      "content": "<|reserved_special_token_193|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128202": {
+      "content": "<|reserved_special_token_194|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128203": {
+      "content": "<|reserved_special_token_195|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128204": {
+      "content": "<|reserved_special_token_196|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128205": {
+      "content": "<|reserved_special_token_197|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128206": {
+      "content": "<|reserved_special_token_198|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128207": {
+      "content": "<|reserved_special_token_199|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128208": {
+      "content": "<|reserved_special_token_200|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128209": {
+      "content": "<|reserved_special_token_201|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128210": {
+      "content": "<|reserved_special_token_202|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128211": {
+      "content": "<|reserved_special_token_203|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128212": {
+      "content": "<|reserved_special_token_204|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128213": {
+      "content": "<|reserved_special_token_205|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128214": {
+      "content": "<|reserved_special_token_206|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128215": {
+      "content": "<|reserved_special_token_207|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128216": {
+      "content": "<|reserved_special_token_208|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128217": {
+      "content": "<|reserved_special_token_209|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128218": {
+      "content": "<|reserved_special_token_210|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128219": {
+      "content": "<|reserved_special_token_211|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128220": {
+      "content": "<|reserved_special_token_212|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128221": {
+      "content": "<|reserved_special_token_213|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128222": {
+      "content": "<|reserved_special_token_214|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128223": {
+      "content": "<|reserved_special_token_215|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128224": {
+      "content": "<|reserved_special_token_216|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128225": {
+      "content": "<|reserved_special_token_217|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128226": {
+      "content": "<|reserved_special_token_218|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128227": {
+      "content": "<|reserved_special_token_219|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128228": {
+      "content": "<|reserved_special_token_220|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128229": {
+      "content": "<|reserved_special_token_221|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128230": {
+      "content": "<|reserved_special_token_222|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128231": {
+      "content": "<|reserved_special_token_223|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128232": {
+      "content": "<|reserved_special_token_224|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128233": {
+      "content": "<|reserved_special_token_225|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128234": {
+      "content": "<|reserved_special_token_226|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128235": {
+      "content": "<|reserved_special_token_227|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128236": {
+      "content": "<|reserved_special_token_228|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128237": {
+      "content": "<|reserved_special_token_229|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128238": {
+      "content": "<|reserved_special_token_230|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128239": {
+      "content": "<|reserved_special_token_231|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128240": {
+      "content": "<|reserved_special_token_232|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128241": {
+      "content": "<|reserved_special_token_233|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128242": {
+      "content": "<|reserved_special_token_234|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128243": {
+      "content": "<|reserved_special_token_235|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128244": {
+      "content": "<|reserved_special_token_236|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128245": {
+      "content": "<|reserved_special_token_237|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128246": {
+      "content": "<|reserved_special_token_238|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128247": {
+      "content": "<|reserved_special_token_239|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128248": {
+      "content": "<|reserved_special_token_240|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128249": {
+      "content": "<|reserved_special_token_241|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128250": {
+      "content": "<|reserved_special_token_242|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128251": {
+      "content": "<|reserved_special_token_243|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128252": {
+      "content": "<|reserved_special_token_244|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128253": {
+      "content": "<|reserved_special_token_245|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128254": {
+      "content": "<|reserved_special_token_246|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128255": {
+      "content": "<|reserved_special_token_247|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|eom_id|>"
+  ],
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "PreTrainedTokenizer"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 4.0,
+    "num_input_tokens_seen": 300000,
+    "total_flos": 1780328448000000.0,
+    "train_loss": 0.7932832451101314,
+    "train_runtime": 895.9422,
+    "train_samples_per_second": 6.652,
+    "train_steps_per_second": 0.42
+}

trainer_log.jsonl ADDED Viewed

	@@ -0,0 +1,79 @@

+{"current_steps": 5, "total_steps": 376, "loss": 1.1748, "lr": 1.9994415637302545e-05, "epoch": 0.053691275167785234, "percentage": 1.33, "elapsed_time": "0:00:15", "remaining_time": "0:19:07", "throughput": 263.9, "total_tokens": 4080}
+{"current_steps": 10, "total_steps": 376, "loss": 1.2634, "lr": 1.9971739852847514e-05, "epoch": 0.10738255033557047, "percentage": 2.66, "elapsed_time": "0:00:26", "remaining_time": "0:16:10", "throughput": 304.64, "total_tokens": 8080}
+{"current_steps": 15, "total_steps": 376, "loss": 1.4367, "lr": 1.9931663163249744e-05, "epoch": 0.1610738255033557, "percentage": 3.99, "elapsed_time": "0:00:38", "remaining_time": "0:15:27", "throughput": 315.4, "total_tokens": 12160}
+{"current_steps": 20, "total_steps": 376, "loss": 1.3167, "lr": 1.9874255503213154e-05, "epoch": 0.21476510067114093, "percentage": 5.32, "elapsed_time": "0:00:50", "remaining_time": "0:14:51", "throughput": 323.64, "total_tokens": 16208}
+{"current_steps": 25, "total_steps": 376, "loss": 1.4144, "lr": 1.979961705036587e-05, "epoch": 0.2684563758389262, "percentage": 6.65, "elapsed_time": "0:01:02", "remaining_time": "0:14:33", "throughput": 325.7, "total_tokens": 20256}
+{"current_steps": 30, "total_steps": 376, "loss": 1.3944, "lr": 1.9707878050448074e-05, "epoch": 0.3221476510067114, "percentage": 7.98, "elapsed_time": "0:01:13", "remaining_time": "0:14:05", "throughput": 331.6, "total_tokens": 24304}
+{"current_steps": 35, "total_steps": 376, "loss": 1.3946, "lr": 1.9599198590030308e-05, "epoch": 0.37583892617449666, "percentage": 9.31, "elapsed_time": "0:01:24", "remaining_time": "0:13:46", "throughput": 334.75, "total_tokens": 28384}
+{"current_steps": 40, "total_steps": 376, "loss": 1.3827, "lr": 1.947376831715892e-05, "epoch": 0.42953020134228187, "percentage": 10.64, "elapsed_time": "0:01:36", "remaining_time": "0:13:30", "throughput": 336.48, "total_tokens": 32480}
+{"current_steps": 45, "total_steps": 376, "loss": 1.4509, "lr": 1.9331806110416027e-05, "epoch": 0.48322147651006714, "percentage": 11.97, "elapsed_time": "0:01:48", "remaining_time": "0:13:20", "throughput": 336.58, "total_tokens": 36608}
+{"current_steps": 50, "total_steps": 376, "loss": 1.3347, "lr": 1.9173559696971594e-05, "epoch": 0.5369127516778524, "percentage": 13.3, "elapsed_time": "0:02:01", "remaining_time": "0:13:08", "throughput": 335.59, "total_tokens": 40608}
+{"current_steps": 55, "total_steps": 376, "loss": 1.3834, "lr": 1.899930522029408e-05, "epoch": 0.5906040268456376, "percentage": 14.63, "elapsed_time": "0:02:13", "remaining_time": "0:12:57", "throughput": 335.37, "total_tokens": 44672}
+{"current_steps": 60, "total_steps": 376, "loss": 1.4679, "lr": 1.8809346758274014e-05, "epoch": 0.6442953020134228, "percentage": 15.96, "elapsed_time": "0:02:25", "remaining_time": "0:12:44", "throughput": 335.6, "total_tokens": 48704}
+{"current_steps": 65, "total_steps": 376, "loss": 1.3973, "lr": 1.8604015792601395e-05, "epoch": 0.697986577181208, "percentage": 17.29, "elapsed_time": "0:02:36", "remaining_time": "0:12:30", "throughput": 336.68, "total_tokens": 52832}
+{"current_steps": 70, "total_steps": 376, "loss": 1.5408, "lr": 1.8383670630322864e-05, "epoch": 0.7516778523489933, "percentage": 18.62, "elapsed_time": "0:02:49", "remaining_time": "0:12:19", "throughput": 335.03, "total_tokens": 56672}
+{"current_steps": 75, "total_steps": 376, "loss": 1.4967, "lr": 1.8148695778588034e-05, "epoch": 0.8053691275167785, "percentage": 19.95, "elapsed_time": "0:03:00", "remaining_time": "0:12:04", "throughput": 335.55, "total_tokens": 60608}
+{"current_steps": 80, "total_steps": 376, "loss": 1.4644, "lr": 1.789950127367606e-05, "epoch": 0.8590604026845637, "percentage": 21.28, "elapsed_time": "0:03:12", "remaining_time": "0:11:51", "throughput": 336.3, "total_tokens": 64640}
+{"current_steps": 85, "total_steps": 376, "loss": 1.4226, "lr": 1.7636521965473324e-05, "epoch": 0.912751677852349, "percentage": 22.61, "elapsed_time": "0:03:24", "remaining_time": "0:11:38", "throughput": 336.02, "total_tokens": 68560}
+{"current_steps": 90, "total_steps": 376, "loss": 1.4847, "lr": 1.7360216758650826e-05, "epoch": 0.9664429530201343, "percentage": 23.94, "elapsed_time": "0:03:35", "remaining_time": "0:11:25", "throughput": 336.26, "total_tokens": 72512}
+{"current_steps": 95, "total_steps": 376, "loss": 1.2927, "lr": 1.7071067811865477e-05, "epoch": 1.010738255033557, "percentage": 25.27, "elapsed_time": "0:03:45", "remaining_time": "0:11:05", "throughput": 336.91, "total_tokens": 75808}
+{"current_steps": 100, "total_steps": 376, "loss": 0.8505, "lr": 1.67695796963826e-05, "epoch": 1.0644295302013422, "percentage": 26.6, "elapsed_time": "0:03:55", "remaining_time": "0:10:51", "throughput": 338.48, "total_tokens": 79840}
+{"current_steps": 100, "total_steps": 376, "eval_loss": 2.885671615600586, "epoch": 1.0644295302013422, "percentage": 26.6, "elapsed_time": "0:04:09", "remaining_time": "0:11:27", "throughput": 320.58, "total_tokens": 79840}
+{"current_steps": 105, "total_steps": 376, "loss": 0.7497, "lr": 1.6456278515588023e-05, "epoch": 1.1181208053691276, "percentage": 27.93, "elapsed_time": "0:04:23", "remaining_time": "0:11:20", "throughput": 318.1, "total_tokens": 83840}
+{"current_steps": 110, "total_steps": 376, "loss": 0.7761, "lr": 1.613171098692611e-05, "epoch": 1.1718120805369128, "percentage": 29.26, "elapsed_time": "0:04:34", "remaining_time": "0:11:04", "throughput": 319.79, "total_tokens": 87872}
+{"current_steps": 115, "total_steps": 376, "loss": 0.7997, "lr": 1.5796443487865774e-05, "epoch": 1.225503355704698, "percentage": 30.59, "elapsed_time": "0:04:46", "remaining_time": "0:10:49", "throughput": 321.09, "total_tokens": 91888}
+{"current_steps": 120, "total_steps": 376, "loss": 0.8275, "lr": 1.54510610675594e-05, "epoch": 1.279194630872483, "percentage": 31.91, "elapsed_time": "0:04:57", "remaining_time": "0:10:35", "throughput": 322.08, "total_tokens": 95888}
+{"current_steps": 125, "total_steps": 376, "loss": 0.8599, "lr": 1.5096166425919176e-05, "epoch": 1.3328859060402685, "percentage": 33.24, "elapsed_time": "0:05:09", "remaining_time": "0:10:21", "throughput": 322.83, "total_tokens": 99888}
+{"current_steps": 130, "total_steps": 376, "loss": 0.9179, "lr": 1.4732378861892524e-05, "epoch": 1.3865771812080536, "percentage": 34.57, "elapsed_time": "0:05:21", "remaining_time": "0:10:07", "throughput": 323.42, "total_tokens": 103840}
+{"current_steps": 135, "total_steps": 376, "loss": 0.8255, "lr": 1.436033319277183e-05, "epoch": 1.440268456375839, "percentage": 35.9, "elapsed_time": "0:05:32", "remaining_time": "0:09:53", "throughput": 324.25, "total_tokens": 107824}
+{"current_steps": 140, "total_steps": 376, "loss": 0.8806, "lr": 1.3980678646424308e-05, "epoch": 1.4939597315436242, "percentage": 37.23, "elapsed_time": "0:05:43", "remaining_time": "0:09:39", "throughput": 325.46, "total_tokens": 111936}
+{"current_steps": 145, "total_steps": 376, "loss": 0.8366, "lr": 1.3594077728375129e-05, "epoch": 1.5476510067114093, "percentage": 38.56, "elapsed_time": "0:05:55", "remaining_time": "0:09:26", "throughput": 326.15, "total_tokens": 115952}
+{"current_steps": 150, "total_steps": 376, "loss": 0.8567, "lr": 1.3201205065720699e-05, "epoch": 1.6013422818791945, "percentage": 39.89, "elapsed_time": "0:06:06", "remaining_time": "0:09:12", "throughput": 327.21, "total_tokens": 119984}
+{"current_steps": 155, "total_steps": 376, "loss": 0.8939, "lr": 1.2802746229889563e-05, "epoch": 1.6550335570469799, "percentage": 41.22, "elapsed_time": "0:06:17", "remaining_time": "0:08:58", "throughput": 328.25, "total_tokens": 124032}
+{"current_steps": 160, "total_steps": 376, "loss": 0.965, "lr": 1.2399396540305205e-05, "epoch": 1.7087248322147652, "percentage": 42.55, "elapsed_time": "0:06:28", "remaining_time": "0:08:44", "throughput": 329.28, "total_tokens": 128000}
+{"current_steps": 165, "total_steps": 376, "loss": 0.8915, "lr": 1.1991859851038362e-05, "epoch": 1.7624161073825504, "percentage": 43.88, "elapsed_time": "0:06:39", "remaining_time": "0:08:31", "throughput": 330.41, "total_tokens": 132096}
+{"current_steps": 170, "total_steps": 376, "loss": 0.9085, "lr": 1.1580847322566224e-05, "epoch": 1.8161073825503355, "percentage": 45.21, "elapsed_time": "0:06:50", "remaining_time": "0:08:17", "throughput": 331.65, "total_tokens": 136208}
+{"current_steps": 175, "total_steps": 376, "loss": 0.8831, "lr": 1.1167076180781764e-05, "epoch": 1.8697986577181207, "percentage": 46.54, "elapsed_time": "0:07:01", "remaining_time": "0:08:04", "throughput": 332.58, "total_tokens": 140320}
+{"current_steps": 180, "total_steps": 376, "loss": 1.0271, "lr": 1.0751268465418784e-05, "epoch": 1.923489932885906, "percentage": 47.87, "elapsed_time": "0:07:13", "remaining_time": "0:07:52", "throughput": 332.56, "total_tokens": 144176}
+{"current_steps": 185, "total_steps": 376, "loss": 0.9636, "lr": 1.0334149770076747e-05, "epoch": 1.9771812080536912, "percentage": 49.2, "elapsed_time": "0:07:24", "remaining_time": "0:07:39", "throughput": 333.29, "total_tokens": 148256}
+{"current_steps": 190, "total_steps": 376, "loss": 0.7085, "lr": 9.916447976043972e-06, "epoch": 2.021476510067114, "percentage": 50.53, "elapsed_time": "0:07:34", "remaining_time": "0:07:24", "throughput": 333.3, "total_tokens": 151488}
+{"current_steps": 195, "total_steps": 376, "loss": 0.5186, "lr": 9.498891982128809e-06, "epoch": 2.0751677852348993, "percentage": 51.86, "elapsed_time": "0:07:45", "remaining_time": "0:07:12", "throughput": 333.84, "total_tokens": 155472}
+{"current_steps": 200, "total_steps": 376, "loss": 0.5023, "lr": 9.082210432715197e-06, "epoch": 2.1288590604026845, "percentage": 53.19, "elapsed_time": "0:07:56", "remaining_time": "0:06:59", "throughput": 334.41, "total_tokens": 159504}
+{"current_steps": 200, "total_steps": 376, "eval_loss": 3.4170751571655273, "epoch": 2.1288590604026845, "percentage": 53.19, "elapsed_time": "0:08:10", "remaining_time": "0:07:11", "throughput": 324.96, "total_tokens": 159504}
+{"current_steps": 205, "total_steps": 376, "loss": 0.5145, "lr": 8.667130446262214e-06, "epoch": 2.1825503355704696, "percentage": 54.52, "elapsed_time": "0:08:25", "remaining_time": "0:07:01", "throughput": 323.15, "total_tokens": 163472}
+{"current_steps": 210, "total_steps": 376, "loss": 0.4971, "lr": 8.25437634646637e-06, "epoch": 2.2362416107382552, "percentage": 55.85, "elapsed_time": "0:08:37", "remaining_time": "0:06:49", "throughput": 323.77, "total_tokens": 167584}
+{"current_steps": 215, "total_steps": 376, "loss": 0.5042, "lr": 7.844668398300866e-06, "epoch": 2.2899328859060404, "percentage": 57.18, "elapsed_time": "0:08:48", "remaining_time": "0:06:35", "throughput": 324.87, "total_tokens": 171712}
+{"current_steps": 220, "total_steps": 376, "loss": 0.5423, "lr": 7.438721551137367e-06, "epoch": 2.3436241610738255, "percentage": 58.51, "elapsed_time": "0:08:59", "remaining_time": "0:06:22", "throughput": 325.53, "total_tokens": 175696}
+{"current_steps": 225, "total_steps": 376, "loss": 0.5913, "lr": 7.037244191143662e-06, "epoch": 2.3973154362416107, "percentage": 59.84, "elapsed_time": "0:09:11", "remaining_time": "0:06:10", "throughput": 325.95, "total_tokens": 179728}
+{"current_steps": 230, "total_steps": 376, "loss": 0.5403, "lr": 6.640936905134212e-06, "epoch": 2.451006711409396, "percentage": 61.17, "elapsed_time": "0:09:23", "remaining_time": "0:05:57", "throughput": 326.13, "total_tokens": 183648}
+{"current_steps": 235, "total_steps": 376, "loss": 0.5301, "lr": 6.2504912580307905e-06, "epoch": 2.504697986577181, "percentage": 62.5, "elapsed_time": "0:09:34", "remaining_time": "0:05:44", "throughput": 326.79, "total_tokens": 187664}
+{"current_steps": 240, "total_steps": 376, "loss": 0.5417, "lr": 5.866588586066481e-06, "epoch": 2.558389261744966, "percentage": 63.83, "elapsed_time": "0:09:45", "remaining_time": "0:05:31", "throughput": 327.37, "total_tokens": 191776}
+{"current_steps": 245, "total_steps": 376, "loss": 0.5358, "lr": 5.48989880783898e-06, "epoch": 2.6120805369127518, "percentage": 65.16, "elapsed_time": "0:09:57", "remaining_time": "0:05:19", "throughput": 327.79, "total_tokens": 195872}
+{"current_steps": 250, "total_steps": 376, "loss": 0.5249, "lr": 5.121079255287953e-06, "epoch": 2.665771812080537, "percentage": 66.49, "elapsed_time": "0:10:08", "remaining_time": "0:05:06", "throughput": 328.11, "total_tokens": 199808}
+{"current_steps": 255, "total_steps": 376, "loss": 0.5134, "lr": 4.760773526636315e-06, "epoch": 2.719463087248322, "percentage": 67.82, "elapsed_time": "0:10:20", "remaining_time": "0:04:54", "throughput": 328.62, "total_tokens": 203888}
+{"current_steps": 260, "total_steps": 376, "loss": 0.516, "lr": 4.409610363297211e-06, "epoch": 2.7731543624161072, "percentage": 69.15, "elapsed_time": "0:10:32", "remaining_time": "0:04:42", "throughput": 328.73, "total_tokens": 207840}
+{"current_steps": 265, "total_steps": 376, "loss": 0.5611, "lr": 4.0682025527064486e-06, "epoch": 2.826845637583893, "percentage": 70.48, "elapsed_time": "0:10:43", "remaining_time": "0:04:29", "throughput": 329.02, "total_tokens": 211856}
+{"current_steps": 270, "total_steps": 376, "loss": 0.5084, "lr": 3.7371458589949337e-06, "epoch": 2.880536912751678, "percentage": 71.81, "elapsed_time": "0:10:54", "remaining_time": "0:04:16", "throughput": 329.85, "total_tokens": 215904}
+{"current_steps": 275, "total_steps": 376, "loss": 0.5458, "lr": 3.4170179833671847e-06, "epoch": 2.934228187919463, "percentage": 73.14, "elapsed_time": "0:11:05", "remaining_time": "0:04:04", "throughput": 330.35, "total_tokens": 219856}
+{"current_steps": 280, "total_steps": 376, "loss": 0.5205, "lr": 3.1083775560000373e-06, "epoch": 2.9879194630872483, "percentage": 74.47, "elapsed_time": "0:11:15", "remaining_time": "0:03:51", "throughput": 331.42, "total_tokens": 223984}
+{"current_steps": 285, "total_steps": 376, "loss": 0.4058, "lr": 2.8117631612207084e-06, "epoch": 3.032214765100671, "percentage": 75.8, "elapsed_time": "0:11:24", "remaining_time": "0:03:38", "throughput": 332.17, "total_tokens": 227440}
+{"current_steps": 290, "total_steps": 376, "loss": 0.3859, "lr": 2.527692397665311e-06, "epoch": 3.085906040268456, "percentage": 77.13, "elapsed_time": "0:11:36", "remaining_time": "0:03:26", "throughput": 332.42, "total_tokens": 231424}
+{"current_steps": 295, "total_steps": 376, "loss": 0.3724, "lr": 2.256660975057867e-06, "epoch": 3.1395973154362418, "percentage": 78.46, "elapsed_time": "0:11:46", "remaining_time": "0:03:13", "throughput": 333.25, "total_tokens": 235456}
+{"current_steps": 300, "total_steps": 376, "loss": 0.3916, "lr": 1.9991418491859383e-06, "epoch": 3.193288590604027, "percentage": 79.79, "elapsed_time": "0:11:57", "remaining_time": "0:03:01", "throughput": 333.86, "total_tokens": 239424}
+{"current_steps": 300, "total_steps": 376, "eval_loss": 3.5844736099243164, "epoch": 3.193288590604027, "percentage": 79.79, "elapsed_time": "0:12:09", "remaining_time": "0:03:04", "throughput": 328.08, "total_tokens": 239424}
+{"current_steps": 305, "total_steps": 376, "loss": 0.3676, "lr": 1.7555843965823992e-06, "epoch": 3.246979865771812, "percentage": 81.12, "elapsed_time": "0:12:23", "remaining_time": "0:02:53", "throughput": 327.35, "total_tokens": 243488}
+{"current_steps": 310, "total_steps": 376, "loss": 0.3798, "lr": 1.5264136303534893e-06, "epoch": 3.3006711409395972, "percentage": 82.45, "elapsed_time": "0:12:35", "remaining_time": "0:02:40", "throughput": 327.69, "total_tokens": 247456}
+{"current_steps": 315, "total_steps": 376, "loss": 0.3606, "lr": 1.3120294585216353e-06, "epoch": 3.3543624161073824, "percentage": 83.78, "elapsed_time": "0:12:45", "remaining_time": "0:02:28", "throughput": 328.4, "total_tokens": 251456}
+{"current_steps": 320, "total_steps": 376, "loss": 0.3938, "lr": 1.11280598617714e-06, "epoch": 3.4080536912751676, "percentage": 85.11, "elapsed_time": "0:12:56", "remaining_time": "0:02:15", "throughput": 329.0, "total_tokens": 255504}
+{"current_steps": 325, "total_steps": 376, "loss": 0.3605, "lr": 9.290908626565931e-07, "epoch": 3.461744966442953, "percentage": 86.44, "elapsed_time": "0:13:07", "remaining_time": "0:02:03", "throughput": 329.68, "total_tokens": 259472}
+{"current_steps": 330, "total_steps": 376, "loss": 0.3781, "lr": 7.612046748871327e-07, "epoch": 3.5154362416107383, "percentage": 87.77, "elapsed_time": "0:13:17", "remaining_time": "0:01:51", "throughput": 330.66, "total_tokens": 263536}
+{"current_steps": 335, "total_steps": 376, "loss": 0.3794, "lr": 6.094403879552213e-07, "epoch": 3.5691275167785235, "percentage": 89.1, "elapsed_time": "0:13:27", "remaining_time": "0:01:38", "throughput": 331.22, "total_tokens": 267584}
+{"current_steps": 340, "total_steps": 376, "loss": 0.3526, "lr": 4.740628338761255e-07, "epoch": 3.6228187919463086, "percentage": 90.43, "elapsed_time": "0:13:37", "remaining_time": "0:01:26", "throughput": 332.12, "total_tokens": 271664}
+{"current_steps": 345, "total_steps": 376, "loss": 0.3846, "lr": 3.553082494562354e-07, "epoch": 3.6765100671140942, "percentage": 91.76, "elapsed_time": "0:13:48", "remaining_time": "0:01:14", "throughput": 332.56, "total_tokens": 275664}
+{"current_steps": 350, "total_steps": 376, "loss": 0.3855, "lr": 2.533838640546438e-07, "epoch": 3.7302013422818794, "percentage": 93.09, "elapsed_time": "0:13:59", "remaining_time": "0:01:02", "throughput": 333.19, "total_tokens": 279648}
+{"current_steps": 355, "total_steps": 376, "loss": 0.3955, "lr": 1.6846753796336491e-07, "epoch": 3.7838926174496645, "percentage": 94.41, "elapsed_time": "0:14:09", "remaining_time": "0:00:50", "throughput": 333.69, "total_tokens": 283600}
+{"current_steps": 360, "total_steps": 376, "loss": 0.34, "lr": 1.0070745203721532e-07, "epoch": 3.8375838926174497, "percentage": 95.74, "elapsed_time": "0:14:20", "remaining_time": "0:00:38", "throughput": 334.24, "total_tokens": 287600}
+{"current_steps": 365, "total_steps": 376, "loss": 0.3584, "lr": 5.022184911495864e-08, "epoch": 3.891275167785235, "percentage": 97.07, "elapsed_time": "0:14:30", "remaining_time": "0:00:26", "throughput": 334.96, "total_tokens": 291712}
+{"current_steps": 370, "total_steps": 376, "loss": 0.3608, "lr": 1.7098827682970885e-08, "epoch": 3.94496644295302, "percentage": 98.4, "elapsed_time": "0:14:41", "remaining_time": "0:00:14", "throughput": 335.57, "total_tokens": 295856}
+{"current_steps": 375, "total_steps": 376, "loss": 0.3659, "lr": 1.3961881414292776e-09, "epoch": 3.998657718120805, "percentage": 99.73, "elapsed_time": "0:14:52", "remaining_time": "0:00:02", "throughput": 336.16, "total_tokens": 299904}
+{"current_steps": 376, "total_steps": 376, "epoch": 4.0, "percentage": 100.0, "elapsed_time": "0:14:55", "remaining_time": "0:00:00", "throughput": 334.85, "total_tokens": 300000}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,821 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.0,
+  "eval_steps": 100,
+  "global_step": 376,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.053691275167785234,
+      "grad_norm": 8.373551368713379,
+      "learning_rate": 1.9994415637302545e-05,
+      "loss": 1.1748,
+      "num_input_tokens_seen": 4080,
+      "step": 5,
+      "train_runtime": 15.4659,
+      "train_tokens_per_second": 263.806
+    },
+    {
+      "epoch": 0.10738255033557047,
+      "grad_norm": 6.174284934997559,
+      "learning_rate": 1.9971739852847514e-05,
+      "loss": 1.2634,
+      "num_input_tokens_seen": 8080,
+      "step": 10,
+      "train_runtime": 26.529,
+      "train_tokens_per_second": 304.572
+    },
+    {
+      "epoch": 0.1610738255033557,
+      "grad_norm": 8.544853210449219,
+      "learning_rate": 1.9931663163249744e-05,
+      "loss": 1.4367,
+      "num_input_tokens_seen": 12160,
+      "step": 15,
+      "train_runtime": 38.5597,
+      "train_tokens_per_second": 315.355
+    },
+    {
+      "epoch": 0.21476510067114093,
+      "grad_norm": 7.589258193969727,
+      "learning_rate": 1.9874255503213154e-05,
+      "loss": 1.3167,
+      "num_input_tokens_seen": 16208,
+      "step": 20,
+      "train_runtime": 50.0852,
+      "train_tokens_per_second": 323.609
+    },
+    {
+      "epoch": 0.2684563758389262,
+      "grad_norm": 8.169995307922363,
+      "learning_rate": 1.979961705036587e-05,
+      "loss": 1.4144,
+      "num_input_tokens_seen": 20256,
+      "step": 25,
+      "train_runtime": 62.1983,
+      "train_tokens_per_second": 325.668
+    },
+    {
+      "epoch": 0.3221476510067114,
+      "grad_norm": 8.032195091247559,
+      "learning_rate": 1.9707878050448074e-05,
+      "loss": 1.3944,
+      "num_input_tokens_seen": 24304,
+      "step": 30,
+      "train_runtime": 73.2989,
+      "train_tokens_per_second": 331.574
+    },
+    {
+      "epoch": 0.37583892617449666,
+      "grad_norm": 7.009624004364014,
+      "learning_rate": 1.9599198590030308e-05,
+      "loss": 1.3946,
+      "num_input_tokens_seen": 28384,
+      "step": 35,
+      "train_runtime": 84.7962,
+      "train_tokens_per_second": 334.732
+    },
+    {
+      "epoch": 0.42953020134228187,
+      "grad_norm": 7.077579498291016,
+      "learning_rate": 1.947376831715892e-05,
+      "loss": 1.3827,
+      "num_input_tokens_seen": 32480,
+      "step": 40,
+      "train_runtime": 96.5334,
+      "train_tokens_per_second": 336.464
+    },
+    {
+      "epoch": 0.48322147651006714,
+      "grad_norm": 6.77874755859375,
+      "learning_rate": 1.9331806110416027e-05,
+      "loss": 1.4509,
+      "num_input_tokens_seen": 36608,
+      "step": 45,
+      "train_runtime": 108.7688,
+      "train_tokens_per_second": 336.567
+    },
+    {
+      "epoch": 0.5369127516778524,
+      "grad_norm": 6.446975231170654,
+      "learning_rate": 1.9173559696971594e-05,
+      "loss": 1.3347,
+      "num_input_tokens_seen": 40608,
+      "step": 50,
+      "train_runtime": 121.0098,
+      "train_tokens_per_second": 335.576
+    },
+    {
+      "epoch": 0.5906040268456376,
+      "grad_norm": 7.034291744232178,
+      "learning_rate": 1.899930522029408e-05,
+      "loss": 1.3834,
+      "num_input_tokens_seen": 44672,
+      "step": 55,
+      "train_runtime": 133.2088,
+      "train_tokens_per_second": 335.353
+    },
+    {
+      "epoch": 0.6442953020134228,
+      "grad_norm": 6.512075424194336,
+      "learning_rate": 1.8809346758274014e-05,
+      "loss": 1.4679,
+      "num_input_tokens_seen": 48704,
+      "step": 60,
+      "train_runtime": 145.1301,
+      "train_tokens_per_second": 335.589
+    },
+    {
+      "epoch": 0.697986577181208,
+      "grad_norm": 5.968503475189209,
+      "learning_rate": 1.8604015792601395e-05,
+      "loss": 1.3973,
+      "num_input_tokens_seen": 52832,
+      "step": 65,
+      "train_runtime": 156.9248,
+      "train_tokens_per_second": 336.671
+    },
+    {
+      "epoch": 0.7516778523489933,
+      "grad_norm": 6.824077129364014,
+      "learning_rate": 1.8383670630322864e-05,
+      "loss": 1.5408,
+      "num_input_tokens_seen": 56672,
+      "step": 70,
+      "train_runtime": 169.159,
+      "train_tokens_per_second": 335.022
+    },
+    {
+      "epoch": 0.8053691275167785,
+      "grad_norm": 6.580747127532959,
+      "learning_rate": 1.8148695778588034e-05,
+      "loss": 1.4967,
+      "num_input_tokens_seen": 60608,
+      "step": 75,
+      "train_runtime": 180.627,
+      "train_tokens_per_second": 335.542
+    },
+    {
+      "epoch": 0.8590604026845637,
+      "grad_norm": 6.616082191467285,
+      "learning_rate": 1.789950127367606e-05,
+      "loss": 1.4644,
+      "num_input_tokens_seen": 64640,
+      "step": 80,
+      "train_runtime": 192.2131,
+      "train_tokens_per_second": 336.293
+    },
+    {
+      "epoch": 0.912751677852349,
+      "grad_norm": 6.736894607543945,
+      "learning_rate": 1.7636521965473324e-05,
+      "loss": 1.4226,
+      "num_input_tokens_seen": 68560,
+      "step": 85,
+      "train_runtime": 204.0434,
+      "train_tokens_per_second": 336.007
+    },
+    {
+      "epoch": 0.9664429530201343,
+      "grad_norm": 6.567346572875977,
+      "learning_rate": 1.7360216758650826e-05,
+      "loss": 1.4847,
+      "num_input_tokens_seen": 72512,
+      "step": 90,
+      "train_runtime": 215.6473,
+      "train_tokens_per_second": 336.253
+    },
+    {
+      "epoch": 1.010738255033557,
+      "grad_norm": 4.928441047668457,
+      "learning_rate": 1.7071067811865477e-05,
+      "loss": 1.2927,
+      "num_input_tokens_seen": 75808,
+      "step": 95,
+      "train_runtime": 225.0164,
+      "train_tokens_per_second": 336.9
+    },
+    {
+      "epoch": 1.0644295302013422,
+      "grad_norm": 5.473133087158203,
+      "learning_rate": 1.67695796963826e-05,
+      "loss": 0.8505,
+      "num_input_tokens_seen": 79840,
+      "step": 100,
+      "train_runtime": 235.8823,
+      "train_tokens_per_second": 338.474
+    },
+    {
+      "epoch": 1.0644295302013422,
+      "eval_loss": 2.885671615600586,
+      "eval_runtime": 13.1654,
+      "eval_samples_per_second": 28.332,
+      "eval_steps_per_second": 14.204,
+      "num_input_tokens_seen": 79840,
+      "step": 100
+    },
+    {
+      "epoch": 1.1181208053691276,
+      "grad_norm": 7.370046138763428,
+      "learning_rate": 1.6456278515588023e-05,
+      "loss": 0.7497,
+      "num_input_tokens_seen": 83840,
+      "step": 105,
+      "train_runtime": 263.568,
+      "train_tokens_per_second": 318.096
+    },
+    {
+      "epoch": 1.1718120805369128,
+      "grad_norm": 6.678887844085693,
+      "learning_rate": 1.613171098692611e-05,
+      "loss": 0.7761,
+      "num_input_tokens_seen": 87872,
+      "step": 110,
+      "train_runtime": 274.7842,
+      "train_tokens_per_second": 319.786
+    },
+    {
+      "epoch": 1.225503355704698,
+      "grad_norm": 5.9977707862854,
+      "learning_rate": 1.5796443487865774e-05,
+      "loss": 0.7997,
+      "num_input_tokens_seen": 91888,
+      "step": 115,
+      "train_runtime": 286.1784,
+      "train_tokens_per_second": 321.086
+    },
+    {
+      "epoch": 1.279194630872483,
+      "grad_norm": 6.782074451446533,
+      "learning_rate": 1.54510610675594e-05,
+      "loss": 0.8275,
+      "num_input_tokens_seen": 95888,
+      "step": 120,
+      "train_runtime": 297.7215,
+      "train_tokens_per_second": 322.073
+    },
+    {
+      "epoch": 1.3328859060402685,
+      "grad_norm": 6.483254909515381,
+      "learning_rate": 1.5096166425919176e-05,
+      "loss": 0.8599,
+      "num_input_tokens_seen": 99888,
+      "step": 125,
+      "train_runtime": 309.418,
+      "train_tokens_per_second": 322.825
+    },
+    {
+      "epoch": 1.3865771812080536,
+      "grad_norm": 6.478480815887451,
+      "learning_rate": 1.4732378861892524e-05,
+      "loss": 0.9179,
+      "num_input_tokens_seen": 103840,
+      "step": 130,
+      "train_runtime": 321.0761,
+      "train_tokens_per_second": 323.412
+    },
+    {
+      "epoch": 1.440268456375839,
+      "grad_norm": 5.326768398284912,
+      "learning_rate": 1.436033319277183e-05,
+      "loss": 0.8255,
+      "num_input_tokens_seen": 107824,
+      "step": 135,
+      "train_runtime": 332.5409,
+      "train_tokens_per_second": 324.243
+    },
+    {
+      "epoch": 1.4939597315436242,
+      "grad_norm": 5.588732719421387,
+      "learning_rate": 1.3980678646424308e-05,
+      "loss": 0.8806,
+      "num_input_tokens_seen": 111936,
+      "step": 140,
+      "train_runtime": 343.9421,
+      "train_tokens_per_second": 325.45
+    },
+    {
+      "epoch": 1.5476510067114093,
+      "grad_norm": 4.919476509094238,
+      "learning_rate": 1.3594077728375129e-05,
+      "loss": 0.8366,
+      "num_input_tokens_seen": 115952,
+      "step": 145,
+      "train_runtime": 355.5203,
+      "train_tokens_per_second": 326.147
+    },
+    {
+      "epoch": 1.6013422818791945,
+      "grad_norm": 5.641396522521973,
+      "learning_rate": 1.3201205065720699e-05,
+      "loss": 0.8567,
+      "num_input_tokens_seen": 119984,
+      "step": 150,
+      "train_runtime": 366.6895,
+      "train_tokens_per_second": 327.209
+    },
+    {
+      "epoch": 1.6550335570469799,
+      "grad_norm": 6.353975296020508,
+      "learning_rate": 1.2802746229889563e-05,
+      "loss": 0.8939,
+      "num_input_tokens_seen": 124032,
+      "step": 155,
+      "train_runtime": 377.8594,
+      "train_tokens_per_second": 328.249
+    },
+    {
+      "epoch": 1.7087248322147652,
+      "grad_norm": 6.592543125152588,
+      "learning_rate": 1.2399396540305205e-05,
+      "loss": 0.965,
+      "num_input_tokens_seen": 128000,
+      "step": 160,
+      "train_runtime": 388.7376,
+      "train_tokens_per_second": 329.271
+    },
+    {
+      "epoch": 1.7624161073825504,
+      "grad_norm": 6.394205570220947,
+      "learning_rate": 1.1991859851038362e-05,
+      "loss": 0.8915,
+      "num_input_tokens_seen": 132096,
+      "step": 165,
+      "train_runtime": 399.8008,
+      "train_tokens_per_second": 330.405
+    },
+    {
+      "epoch": 1.8161073825503355,
+      "grad_norm": 6.431347846984863,
+      "learning_rate": 1.1580847322566224e-05,
+      "loss": 0.9085,
+      "num_input_tokens_seen": 136208,
+      "step": 170,
+      "train_runtime": 410.7015,
+      "train_tokens_per_second": 331.647
+    },
+    {
+      "epoch": 1.8697986577181207,
+      "grad_norm": 6.055410385131836,
+      "learning_rate": 1.1167076180781764e-05,
+      "loss": 0.8831,
+      "num_input_tokens_seen": 140320,
+      "step": 175,
+      "train_runtime": 421.922,
+      "train_tokens_per_second": 332.573
+    },
+    {
+      "epoch": 1.923489932885906,
+      "grad_norm": 6.936720848083496,
+      "learning_rate": 1.0751268465418784e-05,
+      "loss": 1.0271,
+      "num_input_tokens_seen": 144176,
+      "step": 180,
+      "train_runtime": 433.5402,
+      "train_tokens_per_second": 332.555
+    },
+    {
+      "epoch": 1.9771812080536912,
+      "grad_norm": 6.763705253601074,
+      "learning_rate": 1.0334149770076747e-05,
+      "loss": 0.9636,
+      "num_input_tokens_seen": 148256,
+      "step": 185,
+      "train_runtime": 444.8309,
+      "train_tokens_per_second": 333.286
+    },
+    {
+      "epoch": 2.021476510067114,
+      "grad_norm": 3.6498420238494873,
+      "learning_rate": 9.916447976043972e-06,
+      "loss": 0.7085,
+      "num_input_tokens_seen": 151488,
+      "step": 190,
+      "train_runtime": 454.5144,
+      "train_tokens_per_second": 333.296
+    },
+    {
+      "epoch": 2.0751677852348993,
+      "grad_norm": 4.591700553894043,
+      "learning_rate": 9.498891982128809e-06,
+      "loss": 0.5186,
+      "num_input_tokens_seen": 155472,
+      "step": 195,
+      "train_runtime": 465.7186,
+      "train_tokens_per_second": 333.832
+    },
+    {
+      "epoch": 2.1288590604026845,
+      "grad_norm": 5.094573497772217,
+      "learning_rate": 9.082210432715197e-06,
+      "loss": 0.5023,
+      "num_input_tokens_seen": 159504,
+      "step": 200,
+      "train_runtime": 476.9786,
+      "train_tokens_per_second": 334.405
+    },
+    {
+      "epoch": 2.1288590604026845,
+      "eval_loss": 3.4170751571655273,
+      "eval_runtime": 13.8611,
+      "eval_samples_per_second": 26.91,
+      "eval_steps_per_second": 13.491,
+      "num_input_tokens_seen": 159504,
+      "step": 200
+    },
+    {
+      "epoch": 2.1825503355704696,
+      "grad_norm": 5.766697406768799,
+      "learning_rate": 8.667130446262214e-06,
+      "loss": 0.5145,
+      "num_input_tokens_seen": 163472,
+      "step": 205,
+      "train_runtime": 505.879,
+      "train_tokens_per_second": 323.144
+    },
+    {
+      "epoch": 2.2362416107382552,
+      "grad_norm": 4.821939945220947,
+      "learning_rate": 8.25437634646637e-06,
+      "loss": 0.4971,
+      "num_input_tokens_seen": 167584,
+      "step": 210,
+      "train_runtime": 517.6135,
+      "train_tokens_per_second": 323.763
+    },
+    {
+      "epoch": 2.2899328859060404,
+      "grad_norm": 5.04258394241333,
+      "learning_rate": 7.844668398300866e-06,
+      "loss": 0.5042,
+      "num_input_tokens_seen": 171712,
+      "step": 215,
+      "train_runtime": 528.5686,
+      "train_tokens_per_second": 324.862
+    },
+    {
+      "epoch": 2.3436241610738255,
+      "grad_norm": 3.9442596435546875,
+      "learning_rate": 7.438721551137367e-06,
+      "loss": 0.5423,
+      "num_input_tokens_seen": 175696,
+      "step": 220,
+      "train_runtime": 539.7243,
+      "train_tokens_per_second": 325.529
+    },
+    {
+      "epoch": 2.3973154362416107,
+      "grad_norm": 4.744536399841309,
+      "learning_rate": 7.037244191143662e-06,
+      "loss": 0.5913,
+      "num_input_tokens_seen": 179728,
+      "step": 225,
+      "train_runtime": 551.4106,
+      "train_tokens_per_second": 325.942
+    },
+    {
+      "epoch": 2.451006711409396,
+      "grad_norm": 3.8527259826660156,
+      "learning_rate": 6.640936905134212e-06,
+      "loss": 0.5403,
+      "num_input_tokens_seen": 183648,
+      "step": 230,
+      "train_runtime": 563.1128,
+      "train_tokens_per_second": 326.13
+    },
+    {
+      "epoch": 2.504697986577181,
+      "grad_norm": 5.313571453094482,
+      "learning_rate": 6.2504912580307905e-06,
+      "loss": 0.5301,
+      "num_input_tokens_seen": 187664,
+      "step": 235,
+      "train_runtime": 574.2745,
+      "train_tokens_per_second": 326.784
+    },
+    {
+      "epoch": 2.558389261744966,
+      "grad_norm": 4.798688888549805,
+      "learning_rate": 5.866588586066481e-06,
+      "loss": 0.5417,
+      "num_input_tokens_seen": 191776,
+      "step": 240,
+      "train_runtime": 585.8109,
+      "train_tokens_per_second": 327.368
+    },
+    {
+      "epoch": 2.6120805369127518,
+      "grad_norm": 4.535137176513672,
+      "learning_rate": 5.48989880783898e-06,
+      "loss": 0.5358,
+      "num_input_tokens_seen": 195872,
+      "step": 245,
+      "train_runtime": 597.5628,
+      "train_tokens_per_second": 327.785
+    },
+    {
+      "epoch": 2.665771812080537,
+      "grad_norm": 4.101404666900635,
+      "learning_rate": 5.121079255287953e-06,
+      "loss": 0.5249,
+      "num_input_tokens_seen": 199808,
+      "step": 250,
+      "train_runtime": 608.9706,
+      "train_tokens_per_second": 328.108
+    },
+    {
+      "epoch": 2.719463087248322,
+      "grad_norm": 4.927453994750977,
+      "learning_rate": 4.760773526636315e-06,
+      "loss": 0.5134,
+      "num_input_tokens_seen": 203888,
+      "step": 255,
+      "train_runtime": 620.4402,
+      "train_tokens_per_second": 328.618
+    },
+    {
+      "epoch": 2.7731543624161072,
+      "grad_norm": 4.203514575958252,
+      "learning_rate": 4.409610363297211e-06,
+      "loss": 0.516,
+      "num_input_tokens_seen": 207840,
+      "step": 260,
+      "train_runtime": 632.2498,
+      "train_tokens_per_second": 328.731
+    },
+    {
+      "epoch": 2.826845637583893,
+      "grad_norm": 3.7994656562805176,
+      "learning_rate": 4.0682025527064486e-06,
+      "loss": 0.5611,
+      "num_input_tokens_seen": 211856,
+      "step": 265,
+      "train_runtime": 643.9121,
+      "train_tokens_per_second": 329.014
+    },
+    {
+      "epoch": 2.880536912751678,
+      "grad_norm": 4.413597583770752,
+      "learning_rate": 3.7371458589949337e-06,
+      "loss": 0.5084,
+      "num_input_tokens_seen": 215904,
+      "step": 270,
+      "train_runtime": 654.5637,
+      "train_tokens_per_second": 329.844
+    },
+    {
+      "epoch": 2.934228187919463,
+      "grad_norm": 4.6744384765625,
+      "learning_rate": 3.4170179833671847e-06,
+      "loss": 0.5458,
+      "num_input_tokens_seen": 219856,
+      "step": 275,
+      "train_runtime": 665.522,
+      "train_tokens_per_second": 330.351
+    },
+    {
+      "epoch": 2.9879194630872483,
+      "grad_norm": 4.479729652404785,
+      "learning_rate": 3.1083775560000373e-06,
+      "loss": 0.5205,
+      "num_input_tokens_seen": 223984,
+      "step": 280,
+      "train_runtime": 675.8425,
+      "train_tokens_per_second": 331.414
+    },
+    {
+      "epoch": 3.032214765100671,
+      "grad_norm": 2.398268938064575,
+      "learning_rate": 2.8117631612207084e-06,
+      "loss": 0.4058,
+      "num_input_tokens_seen": 227440,
+      "step": 285,
+      "train_runtime": 684.7066,
+      "train_tokens_per_second": 332.171
+    },
+    {
+      "epoch": 3.085906040268456,
+      "grad_norm": 2.7196109294891357,
+      "learning_rate": 2.527692397665311e-06,
+      "loss": 0.3859,
+      "num_input_tokens_seen": 231424,
+      "step": 290,
+      "train_runtime": 696.1897,
+      "train_tokens_per_second": 332.415
+    },
+    {
+      "epoch": 3.1395973154362418,
+      "grad_norm": 2.9645609855651855,
+      "learning_rate": 2.256660975057867e-06,
+      "loss": 0.3724,
+      "num_input_tokens_seen": 235456,
+      "step": 295,
+      "train_runtime": 706.5398,
+      "train_tokens_per_second": 333.252
+    },
+    {
+      "epoch": 3.193288590604027,
+      "grad_norm": 3.25144624710083,
+      "learning_rate": 1.9991418491859383e-06,
+      "loss": 0.3916,
+      "num_input_tokens_seen": 239424,
+      "step": 300,
+      "train_runtime": 717.153,
+      "train_tokens_per_second": 333.853
+    },
+    {
+      "epoch": 3.193288590604027,
+      "eval_loss": 3.5844736099243164,
+      "eval_runtime": 12.6192,
+      "eval_samples_per_second": 29.558,
+      "eval_steps_per_second": 14.819,
+      "num_input_tokens_seen": 239424,
+      "step": 300
+    },
+    {
+      "epoch": 3.246979865771812,
+      "grad_norm": 3.184697151184082,
+      "learning_rate": 1.7555843965823992e-06,
+      "loss": 0.3676,
+      "num_input_tokens_seen": 243488,
+      "step": 305,
+      "train_runtime": 743.816,
+      "train_tokens_per_second": 327.35
+    },
+    {
+      "epoch": 3.3006711409395972,
+      "grad_norm": 3.306453227996826,
+      "learning_rate": 1.5264136303534893e-06,
+      "loss": 0.3798,
+      "num_input_tokens_seen": 247456,
+      "step": 310,
+      "train_runtime": 755.1587,
+      "train_tokens_per_second": 327.687
+    },
+    {
+      "epoch": 3.3543624161073824,
+      "grad_norm": 3.3960649967193604,
+      "learning_rate": 1.3120294585216353e-06,
+      "loss": 0.3606,
+      "num_input_tokens_seen": 251456,
+      "step": 315,
+      "train_runtime": 765.7156,
+      "train_tokens_per_second": 328.393
+    },
+    {
+      "epoch": 3.4080536912751676,
+      "grad_norm": 3.560805320739746,
+      "learning_rate": 1.11280598617714e-06,
+      "loss": 0.3938,
+      "num_input_tokens_seen": 255504,
+      "step": 320,
+      "train_runtime": 776.6023,
+      "train_tokens_per_second": 329.002
+    },
+    {
+      "epoch": 3.461744966442953,
+      "grad_norm": 3.646918773651123,
+      "learning_rate": 9.290908626565931e-07,
+      "loss": 0.3605,
+      "num_input_tokens_seen": 259472,
+      "step": 325,
+      "train_runtime": 787.0463,
+      "train_tokens_per_second": 329.678
+    },
+    {
+      "epoch": 3.5154362416107383,
+      "grad_norm": 2.9772727489471436,
+      "learning_rate": 7.612046748871327e-07,
+      "loss": 0.3781,
+      "num_input_tokens_seen": 263536,
+      "step": 330,
+      "train_runtime": 797.017,
+      "train_tokens_per_second": 330.653
+    },
+    {
+      "epoch": 3.5691275167785235,
+      "grad_norm": 2.8137423992156982,
+      "learning_rate": 6.094403879552213e-07,
+      "loss": 0.3794,
+      "num_input_tokens_seen": 267584,
+      "step": 335,
+      "train_runtime": 807.8723,
+      "train_tokens_per_second": 331.221
+    },
+    {
+      "epoch": 3.6228187919463086,
+      "grad_norm": 2.8694584369659424,
+      "learning_rate": 4.740628338761255e-07,
+      "loss": 0.3526,
+      "num_input_tokens_seen": 271664,
+      "step": 340,
+      "train_runtime": 817.9687,
+      "train_tokens_per_second": 332.12
+    },
+    {
+      "epoch": 3.6765100671140942,
+      "grad_norm": 3.0649948120117188,
+      "learning_rate": 3.553082494562354e-07,
+      "loss": 0.3846,
+      "num_input_tokens_seen": 275664,
+      "step": 345,
+      "train_runtime": 828.9328,
+      "train_tokens_per_second": 332.553
+    },
+    {
+      "epoch": 3.7302013422818794,
+      "grad_norm": 4.392162322998047,
+      "learning_rate": 2.533838640546438e-07,
+      "loss": 0.3855,
+      "num_input_tokens_seen": 279648,
+      "step": 350,
+      "train_runtime": 839.3162,
+      "train_tokens_per_second": 333.186
+    },
+    {
+      "epoch": 3.7838926174496645,
+      "grad_norm": 3.539325475692749,
+      "learning_rate": 1.6846753796336491e-07,
+      "loss": 0.3955,
+      "num_input_tokens_seen": 283600,
+      "step": 355,
+      "train_runtime": 849.891,
+      "train_tokens_per_second": 333.69
+    },
+    {
+      "epoch": 3.8375838926174497,
+      "grad_norm": 3.1544559001922607,
+      "learning_rate": 1.0070745203721532e-07,
+      "loss": 0.34,
+      "num_input_tokens_seen": 287600,
+      "step": 360,
+      "train_runtime": 860.4738,
+      "train_tokens_per_second": 334.234
+    },
+    {
+      "epoch": 3.891275167785235,
+      "grad_norm": 3.2322452068328857,
+      "learning_rate": 5.022184911495864e-08,
+      "loss": 0.3584,
+      "num_input_tokens_seen": 291712,
+      "step": 365,
+      "train_runtime": 870.8827,
+      "train_tokens_per_second": 334.961
+    },
+    {
+      "epoch": 3.94496644295302,
+      "grad_norm": 3.279101610183716,
+      "learning_rate": 1.7098827682970885e-08,
+      "loss": 0.3608,
+      "num_input_tokens_seen": 295856,
+      "step": 370,
+      "train_runtime": 881.666,
+      "train_tokens_per_second": 335.565
+    },
+    {
+      "epoch": 3.998657718120805,
+      "grad_norm": 3.604062080383301,
+      "learning_rate": 1.3961881414292776e-09,
+      "loss": 0.3659,
+      "num_input_tokens_seen": 299904,
+      "step": 375,
+      "train_runtime": 892.1532,
+      "train_tokens_per_second": 336.158
+    },
+    {
+      "epoch": 4.0,
+      "num_input_tokens_seen": 300000,
+      "step": 376,
+      "total_flos": 1780328448000000.0,
+      "train_loss": 0.7932832451101314,
+      "train_runtime": 895.9422,
+      "train_samples_per_second": 6.652,
+      "train_steps_per_second": 0.42
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 376,
+  "num_input_tokens_seen": 300000,
+  "num_train_epochs": 4,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1780328448000000.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33993b60fd6ce357901ae84544b5451fdd8d4d3dc51c04aa377491c86c3089cd
+size 5752

training_args.yaml ADDED Viewed

	@@ -0,0 +1,44 @@

+adapter_name_or_path: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v4
+bf16: true
+cutoff_len: 1024
+dataset: pipo_persona
+dataset_dir: data
+ddp_timeout: 180000000
+do_train: true
+enable_thinking: true
+eval_steps: 100
+eval_strategy: steps
+finetuning_type: lora
+flash_attn: auto
+gradient_accumulation_steps: 8
+include_num_input_tokens_seen: true
+learning_rate: 2.0e-05
+logging_steps: 5
+lora_alpha: 32
+lora_dropout: 0.1
+lora_rank: 16
+lora_target: q_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
+loraplus_lr_ratio: 8
+lr_scheduler_type: cosine
+max_grad_norm: 1.0
+max_samples: 1900
+model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+num_train_epochs: 4.0
+optim: adamw_torch
+output_dir: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6
+packing: false
+per_device_eval_batch_size: 2
+per_device_train_batch_size: 2
+pissa_convert: true
+pissa_init: true
+plot_loss: true
+preprocessing_num_workers: 16
+report_to: none
+save_steps: 100
+stage: sft
+template: llama3
+trust_remote_code: true
+use_dora: true
+use_rslora: true
+val_size: 0.2
+warmup_steps: 0

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed