Albert-CAC committed on
Commit
bdda4f6
·
verified ·
1 Parent(s): d19af85

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: Qwen/Qwen2.5-Coder-7B-Instruct
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: agent2
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # agent2
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) on the agent2 dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 2
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 8
44
+ - gradient_accumulation_steps: 8
45
+ - total_train_batch_size: 128
46
+ - total_eval_batch_size: 64
47
+ - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
48
+ - lr_scheduler_type: cosine
49
+ - num_epochs: 1.0
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - Transformers 4.52.4
58
+ - Pytorch 2.7.1+cu126
59
+ - Datasets 3.6.0
60
+ - Tokenizers 0.21.1
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "num_input_tokens_seen": 243707408,
4
+ "total_flos": 487302408765440.0,
5
+ "train_loss": 0.04227780149533199,
6
+ "train_runtime": 3149.3455,
7
+ "train_samples_per_second": 66.569,
8
+ "train_steps_per_second": 0.52
9
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 18944,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 28,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 28,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 4,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_scaling": null,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": 131072,
22
+ "tie_word_embeddings": false,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.52.4",
25
+ "use_cache": false,
26
+ "use_sliding_window": false,
27
+ "vocab_size": 152064
28
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.52.4"
14
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3838a17f006e0217a4a92a0c3c8fe03b886942c0107d38756d23b8cae0b03134
3
+ size 4877660776
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1d62e4e4a7fcc6a9ca18a5b41d8d3b758f559f2ce0102c0b061835a73834dea
3
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de91b3188bd68073b9ca53d2a539ff816e2fe7224708ec9b320e55a0c48a4b68
3
+ size 4330865200
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b17cdca930bcca6d6943ab517aab592a6d3d227497e0b9e6bfd88f51dfd6b31
3
+ size 1089994880
model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 15231233024
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00003-of-00004.safetensors"
345
+ }
346
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 32768,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "num_input_tokens_seen": 243707408,
4
+ "total_flos": 487302408765440.0,
5
+ "train_loss": 0.04227780149533199,
6
+ "train_runtime": 3149.3455,
7
+ "train_samples_per_second": 66.569,
8
+ "train_steps_per_second": 0.52
9
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 1638, "loss": 0.2014, "lr": 4.999926430159223e-05, "epoch": 0.0030525030525030525, "percentage": 0.31, "elapsed_time": "0:00:25", "remaining_time": "2:20:50", "throughput": 28961.68, "total_tokens": 749328}
2
+ {"current_steps": 10, "total_steps": 1638, "loss": 0.1612, "lr": 4.999627560102124e-05, "epoch": 0.006105006105006105, "percentage": 0.61, "elapsed_time": "0:00:50", "remaining_time": "2:16:24", "throughput": 30025.14, "total_tokens": 1509456}
3
+ {"current_steps": 15, "total_steps": 1638, "loss": 0.1526, "lr": 4.999098819177214e-05, "epoch": 0.009157509157509158, "percentage": 0.92, "elapsed_time": "0:01:14", "remaining_time": "2:13:53", "throughput": 30433.9, "total_tokens": 2259568}
4
+ {"current_steps": 20, "total_steps": 1638, "loss": 0.1573, "lr": 4.998340256008591e-05, "epoch": 0.01221001221001221, "percentage": 1.22, "elapsed_time": "0:01:37", "remaining_time": "2:11:21", "throughput": 30633.41, "total_tokens": 2984432}
5
+ {"current_steps": 25, "total_steps": 1638, "loss": 0.1597, "lr": 4.997351940355277e-05, "epoch": 0.015262515262515262, "percentage": 1.53, "elapsed_time": "0:02:01", "remaining_time": "2:10:18", "throughput": 30854.08, "total_tokens": 3738880}
6
+ {"current_steps": 30, "total_steps": 1638, "loss": 0.1463, "lr": 4.9961339631048035e-05, "epoch": 0.018315018315018316, "percentage": 1.83, "elapsed_time": "0:02:24", "remaining_time": "2:09:25", "throughput": 30874.98, "total_tokens": 4472992}
7
+ {"current_steps": 35, "total_steps": 1638, "loss": 0.1452, "lr": 4.9946864362648506e-05, "epoch": 0.021367521367521368, "percentage": 2.14, "elapsed_time": "0:02:49", "remaining_time": "2:09:08", "throughput": 30963.66, "total_tokens": 5238224}
8
+ {"current_steps": 40, "total_steps": 1638, "loss": 0.1469, "lr": 4.9930094929529506e-05, "epoch": 0.02442002442002442, "percentage": 2.44, "elapsed_time": "0:03:13", "remaining_time": "2:08:37", "throughput": 31008.57, "total_tokens": 5990176}
9
+ {"current_steps": 45, "total_steps": 1638, "loss": 0.1412, "lr": 4.991103287384244e-05, "epoch": 0.027472527472527472, "percentage": 2.75, "elapsed_time": "0:03:37", "remaining_time": "2:08:31", "throughput": 31010.22, "total_tokens": 6755296}
10
+ {"current_steps": 50, "total_steps": 1638, "loss": 0.1573, "lr": 4.9889679948572974e-05, "epoch": 0.030525030525030524, "percentage": 3.05, "elapsed_time": "0:04:02", "remaining_time": "2:08:08", "throughput": 30981.89, "total_tokens": 7500160}
11
+ {"current_steps": 55, "total_steps": 1638, "loss": 0.1423, "lr": 4.9866038117379824e-05, "epoch": 0.033577533577533576, "percentage": 3.36, "elapsed_time": "0:04:25", "remaining_time": "2:07:28", "throughput": 31021.49, "total_tokens": 8243712}
12
+ {"current_steps": 60, "total_steps": 1638, "loss": 0.1419, "lr": 4.984010955441418e-05, "epoch": 0.03663003663003663, "percentage": 3.66, "elapsed_time": "0:04:49", "remaining_time": "2:06:50", "throughput": 31093.65, "total_tokens": 8997632}
13
+ {"current_steps": 65, "total_steps": 1638, "loss": 0.1515, "lr": 4.981189664411981e-05, "epoch": 0.03968253968253968, "percentage": 3.97, "elapsed_time": "0:05:13", "remaining_time": "2:06:17", "throughput": 31136.45, "total_tokens": 9749984}
14
+ {"current_steps": 70, "total_steps": 1638, "loss": 0.1436, "lr": 4.978140198101366e-05, "epoch": 0.042735042735042736, "percentage": 4.27, "elapsed_time": "0:05:37", "remaining_time": "2:05:51", "throughput": 31107.87, "total_tokens": 10486624}
15
+ {"current_steps": 75, "total_steps": 1638, "loss": 0.1403, "lr": 4.97486283694474e-05, "epoch": 0.045787545787545784, "percentage": 4.58, "elapsed_time": "0:06:00", "remaining_time": "2:05:19", "throughput": 31172.61, "total_tokens": 11247040}
16
+ {"current_steps": 80, "total_steps": 1638, "loss": 0.141, "lr": 4.9713578823349456e-05, "epoch": 0.04884004884004884, "percentage": 4.88, "elapsed_time": "0:06:24", "remaining_time": "2:04:46", "throughput": 31169.56, "total_tokens": 11981936}
17
+ {"current_steps": 85, "total_steps": 1638, "loss": 0.1462, "lr": 4.967625656594782e-05, "epoch": 0.051892551892551896, "percentage": 5.19, "elapsed_time": "0:06:47", "remaining_time": "2:04:09", "throughput": 31175.02, "total_tokens": 12711360}
18
+ {"current_steps": 90, "total_steps": 1638, "loss": 0.1359, "lr": 4.9636665029473714e-05, "epoch": 0.054945054945054944, "percentage": 5.49, "elapsed_time": "0:07:11", "remaining_time": "2:03:42", "throughput": 31184.77, "total_tokens": 13457024}
19
+ {"current_steps": 95, "total_steps": 1638, "loss": 0.1376, "lr": 4.959480785484587e-05, "epoch": 0.057997557997558, "percentage": 5.8, "elapsed_time": "0:07:35", "remaining_time": "2:03:10", "throughput": 31253.48, "total_tokens": 14221888}
20
+ {"current_steps": 100, "total_steps": 1638, "loss": 0.1423, "lr": 4.955068889133576e-05, "epoch": 0.06105006105006105, "percentage": 6.11, "elapsed_time": "0:07:58", "remaining_time": "2:02:46", "throughput": 31212.99, "total_tokens": 14949152}
21
+ {"current_steps": 105, "total_steps": 1638, "loss": 0.141, "lr": 4.9504312196213596e-05, "epoch": 0.0641025641025641, "percentage": 6.41, "elapsed_time": "0:08:22", "remaining_time": "2:02:16", "throughput": 31248.46, "total_tokens": 15702960}
22
+ {"current_steps": 110, "total_steps": 1638, "loss": 0.135, "lr": 4.945568203437521e-05, "epoch": 0.06715506715506715, "percentage": 6.72, "elapsed_time": "0:08:46", "remaining_time": "2:01:59", "throughput": 31220.5, "total_tokens": 16451632}
23
+ {"current_steps": 115, "total_steps": 1638, "loss": 0.1326, "lr": 4.9404802877949843e-05, "epoch": 0.07020757020757021, "percentage": 7.02, "elapsed_time": "0:09:11", "remaining_time": "2:01:44", "throughput": 31206.83, "total_tokens": 17212336}
24
+ {"current_steps": 120, "total_steps": 1638, "loss": 0.1394, "lr": 4.935167940588887e-05, "epoch": 0.07326007326007326, "percentage": 7.33, "elapsed_time": "0:09:35", "remaining_time": "2:01:14", "throughput": 31251.26, "total_tokens": 17970192}
25
+ {"current_steps": 125, "total_steps": 1638, "loss": 0.1322, "lr": 4.929631650353555e-05, "epoch": 0.07631257631257632, "percentage": 7.63, "elapsed_time": "0:09:59", "remaining_time": "2:00:55", "throughput": 31276.28, "total_tokens": 18747280}
26
+ {"current_steps": 130, "total_steps": 1638, "loss": 0.1425, "lr": 4.9238719262175724e-05, "epoch": 0.07936507936507936, "percentage": 7.94, "elapsed_time": "0:10:23", "remaining_time": "2:00:37", "throughput": 31264.78, "total_tokens": 19506368}
27
+ {"current_steps": 135, "total_steps": 1638, "loss": 0.1383, "lr": 4.9178892978569625e-05, "epoch": 0.08241758241758242, "percentage": 8.24, "elapsed_time": "0:10:48", "remaining_time": "2:00:15", "throughput": 31256.06, "total_tokens": 20258064}
28
+ {"current_steps": 140, "total_steps": 1638, "loss": 0.1354, "lr": 4.911684315446477e-05, "epoch": 0.08547008547008547, "percentage": 8.55, "elapsed_time": "0:11:12", "remaining_time": "1:59:59", "throughput": 31226.35, "total_tokens": 21009984}
29
+ {"current_steps": 145, "total_steps": 1638, "loss": 0.1385, "lr": 4.9052575496090016e-05, "epoch": 0.08852258852258853, "percentage": 8.85, "elapsed_time": "0:11:36", "remaining_time": "1:59:32", "throughput": 31222.53, "total_tokens": 21749312}
30
+ {"current_steps": 150, "total_steps": 1638, "loss": 0.1353, "lr": 4.8986095913630806e-05, "epoch": 0.09157509157509157, "percentage": 9.16, "elapsed_time": "0:11:59", "remaining_time": "1:58:57", "throughput": 31240.78, "total_tokens": 22476624}
31
+ {"current_steps": 155, "total_steps": 1638, "loss": 0.1255, "lr": 4.8917410520685635e-05, "epoch": 0.09462759462759462, "percentage": 9.46, "elapsed_time": "0:12:23", "remaining_time": "1:58:32", "throughput": 31257.72, "total_tokens": 23237296}
32
+ {"current_steps": 160, "total_steps": 1638, "loss": 0.1386, "lr": 4.884652563370385e-05, "epoch": 0.09768009768009768, "percentage": 9.77, "elapsed_time": "0:12:46", "remaining_time": "1:57:58", "throughput": 31260.1, "total_tokens": 23955088}
33
+ {"current_steps": 165, "total_steps": 1638, "loss": 0.1434, "lr": 4.87734477714048e-05, "epoch": 0.10073260073260074, "percentage": 10.07, "elapsed_time": "0:13:10", "remaining_time": "1:57:34", "throughput": 31266.06, "total_tokens": 24708720}
34
+ {"current_steps": 170, "total_steps": 1638, "loss": 0.133, "lr": 4.86981836541783e-05, "epoch": 0.10378510378510379, "percentage": 10.38, "elapsed_time": "0:13:33", "remaining_time": "1:57:06", "throughput": 31254.12, "total_tokens": 25432544}
35
+ {"current_steps": 175, "total_steps": 1638, "loss": 0.1383, "lr": 4.862074020346664e-05, "epoch": 0.10683760683760683, "percentage": 10.68, "elapsed_time": "0:13:57", "remaining_time": "1:56:39", "throughput": 31300.74, "total_tokens": 26206496}
36
+ {"current_steps": 180, "total_steps": 1638, "loss": 0.1417, "lr": 4.854112454112811e-05, "epoch": 0.10989010989010989, "percentage": 10.99, "elapsed_time": "0:14:20", "remaining_time": "1:56:12", "throughput": 31277.91, "total_tokens": 26925872}
37
+ {"current_steps": 185, "total_steps": 1638, "loss": 0.1387, "lr": 4.845934398878202e-05, "epoch": 0.11294261294261294, "percentage": 11.29, "elapsed_time": "0:14:44", "remaining_time": "1:55:49", "throughput": 31273.05, "total_tokens": 27669680}
38
+ {"current_steps": 190, "total_steps": 1638, "loss": 0.142, "lr": 4.837540606713538e-05, "epoch": 0.115995115995116, "percentage": 11.6, "elapsed_time": "0:15:08", "remaining_time": "1:55:24", "throughput": 31281.27, "total_tokens": 28423664}
39
+ {"current_steps": 195, "total_steps": 1638, "loss": 0.1442, "lr": 4.828931849529129e-05, "epoch": 0.11904761904761904, "percentage": 11.9, "elapsed_time": "0:15:31", "remaining_time": "1:54:55", "throughput": 31293.32, "total_tokens": 29159680}
40
+ {"current_steps": 200, "total_steps": 1638, "loss": 0.1449, "lr": 4.820108919003913e-05, "epoch": 0.1221001221001221, "percentage": 12.21, "elapsed_time": "0:15:55", "remaining_time": "1:54:30", "throughput": 31269.84, "total_tokens": 29878656}
41
+ {"current_steps": 205, "total_steps": 1638, "loss": 0.1355, "lr": 4.811072626512642e-05, "epoch": 0.12515262515262515, "percentage": 12.52, "elapsed_time": "0:16:19", "remaining_time": "1:54:08", "throughput": 31248.43, "total_tokens": 30615968}
42
+ {"current_steps": 210, "total_steps": 1638, "loss": 0.1294, "lr": 4.801823803051274e-05, "epoch": 0.1282051282051282, "percentage": 12.82, "elapsed_time": "0:16:43", "remaining_time": "1:53:45", "throughput": 31248.72, "total_tokens": 31365168}
43
+ {"current_steps": 215, "total_steps": 1638, "loss": 0.1458, "lr": 4.79236329916055e-05, "epoch": 0.13125763125763126, "percentage": 13.13, "elapsed_time": "0:17:08", "remaining_time": "1:53:26", "throughput": 31252.52, "total_tokens": 32138352}
44
+ {"current_steps": 220, "total_steps": 1638, "loss": 0.1282, "lr": 4.782691984847773e-05, "epoch": 0.1343101343101343, "percentage": 13.43, "elapsed_time": "0:17:31", "remaining_time": "1:52:59", "throughput": 31250.75, "total_tokens": 32869952}
45
+ {"current_steps": 225, "total_steps": 1638, "loss": 0.1333, "lr": 4.77281074950681e-05, "epoch": 0.13736263736263737, "percentage": 13.74, "elapsed_time": "0:17:56", "remaining_time": "1:52:40", "throughput": 31240.11, "total_tokens": 33632032}
46
+ {"current_steps": 230, "total_steps": 1638, "loss": 0.1418, "lr": 4.76272050183629e-05, "epoch": 0.14041514041514042, "percentage": 14.04, "elapsed_time": "0:18:19", "remaining_time": "1:52:13", "throughput": 31243.62, "total_tokens": 34365776}
47
+ {"current_steps": 235, "total_steps": 1638, "loss": 0.1361, "lr": 4.752422169756048e-05, "epoch": 0.14346764346764346, "percentage": 14.35, "elapsed_time": "0:18:43", "remaining_time": "1:51:47", "throughput": 31250.12, "total_tokens": 35108640}
48
+ {"current_steps": 240, "total_steps": 1638, "loss": 0.1318, "lr": 4.741916700321785e-05, "epoch": 0.14652014652014653, "percentage": 14.65, "elapsed_time": "0:19:07", "remaining_time": "1:51:21", "throughput": 31266.14, "total_tokens": 35865600}
49
+ {"current_steps": 245, "total_steps": 1638, "loss": 0.149, "lr": 4.7312050596379764e-05, "epoch": 0.14957264957264957, "percentage": 14.96, "elapsed_time": "0:19:31", "remaining_time": "1:50:59", "throughput": 31260.12, "total_tokens": 36614272}
50
+ {"current_steps": 250, "total_steps": 1638, "loss": 0.1333, "lr": 4.7202882327690314e-05, "epoch": 0.15262515262515264, "percentage": 15.26, "elapsed_time": "0:19:54", "remaining_time": "1:50:31", "throughput": 31261.88, "total_tokens": 37340528}
51
+ {"current_steps": 255, "total_steps": 1638, "loss": 0.1315, "lr": 4.709167223648695e-05, "epoch": 0.15567765567765568, "percentage": 15.57, "elapsed_time": "0:20:17", "remaining_time": "1:50:05", "throughput": 31263.1, "total_tokens": 38075840}
52
+ {"current_steps": 260, "total_steps": 1638, "loss": 0.1372, "lr": 4.697843054987737e-05, "epoch": 0.15873015873015872, "percentage": 15.87, "elapsed_time": "0:20:41", "remaining_time": "1:49:39", "throughput": 31269.02, "total_tokens": 38817888}
53
+ {"current_steps": 265, "total_steps": 1638, "loss": 0.1436, "lr": 4.686316768179889e-05, "epoch": 0.1617826617826618, "percentage": 16.18, "elapsed_time": "0:21:04", "remaining_time": "1:49:13", "throughput": 31275.17, "total_tokens": 39556304}
54
+ {"current_steps": 270, "total_steps": 1638, "loss": 0.1326, "lr": 4.674589423206083e-05, "epoch": 0.16483516483516483, "percentage": 16.48, "elapsed_time": "0:21:28", "remaining_time": "1:48:49", "throughput": 31269.91, "total_tokens": 40299952}
55
+ {"current_steps": 275, "total_steps": 1638, "loss": 0.1395, "lr": 4.6626620985369724e-05, "epoch": 0.16788766788766787, "percentage": 16.79, "elapsed_time": "0:21:52", "remaining_time": "1:48:25", "throughput": 31269.05, "total_tokens": 41042560}
56
+ {"current_steps": 280, "total_steps": 1638, "loss": 0.1381, "lr": 4.650535891033752e-05, "epoch": 0.17094017094017094, "percentage": 17.09, "elapsed_time": "0:22:15", "remaining_time": "1:47:58", "throughput": 31278.47, "total_tokens": 41779920}
57
+ {"current_steps": 285, "total_steps": 1638, "loss": 0.1359, "lr": 4.6382119158472895e-05, "epoch": 0.17399267399267399, "percentage": 17.4, "elapsed_time": "0:22:40", "remaining_time": "1:47:39", "throughput": 31263.33, "total_tokens": 42537040}
58
+ {"current_steps": 290, "total_steps": 1638, "loss": 0.1309, "lr": 4.625691306315572e-05, "epoch": 0.17704517704517705, "percentage": 17.7, "elapsed_time": "0:23:04", "remaining_time": "1:47:16", "throughput": 31256.64, "total_tokens": 43279440}
59
+ {"current_steps": 295, "total_steps": 1638, "loss": 0.134, "lr": 4.6129752138594874e-05, "epoch": 0.1800976800976801, "percentage": 18.01, "elapsed_time": "0:23:28", "remaining_time": "1:46:50", "throughput": 31249.25, "total_tokens": 43999504}
60
+ {"current_steps": 300, "total_steps": 1638, "loss": 0.1272, "lr": 4.600064807876929e-05, "epoch": 0.18315018315018314, "percentage": 18.32, "elapsed_time": "0:23:51", "remaining_time": "1:46:26", "throughput": 31247.25, "total_tokens": 44745040}
61
+ {"current_steps": 305, "total_steps": 1638, "loss": 0.1294, "lr": 4.586961275635263e-05, "epoch": 0.1862026862026862, "percentage": 18.62, "elapsed_time": "0:24:15", "remaining_time": "1:46:02", "throughput": 31250.53, "total_tokens": 45495264}
62
+ {"current_steps": 310, "total_steps": 1638, "loss": 0.1363, "lr": 4.57366582216214e-05, "epoch": 0.18925518925518925, "percentage": 18.93, "elapsed_time": "0:24:39", "remaining_time": "1:45:38", "throughput": 31259.44, "total_tokens": 46249280}
63
+ {"current_steps": 315, "total_steps": 1638, "loss": 0.1319, "lr": 4.560179670134681e-05, "epoch": 0.19230769230769232, "percentage": 19.23, "elapsed_time": "0:25:03", "remaining_time": "1:45:14", "throughput": 31248.28, "total_tokens": 46983664}
64
+ {"current_steps": 320, "total_steps": 1638, "loss": 0.1284, "lr": 4.546504059767035e-05, "epoch": 0.19536019536019536, "percentage": 19.54, "elapsed_time": "0:25:27", "remaining_time": "1:44:52", "throughput": 31255.48, "total_tokens": 47753184}
65
+ {"current_steps": 325, "total_steps": 1638, "loss": 0.1317, "lr": 4.532640248696331e-05, "epoch": 0.1984126984126984, "percentage": 19.84, "elapsed_time": "0:25:51", "remaining_time": "1:44:29", "throughput": 31257.32, "total_tokens": 48503792}
66
+ {"current_steps": 330, "total_steps": 1638, "loss": 0.1371, "lr": 4.518589511867017e-05, "epoch": 0.20146520146520147, "percentage": 20.15, "elapsed_time": "0:26:15", "remaining_time": "1:44:06", "throughput": 31256.97, "total_tokens": 49255984}
67
+ {"current_steps": 335, "total_steps": 1638, "loss": 0.141, "lr": 4.504353141413616e-05, "epoch": 0.2045177045177045, "percentage": 20.45, "elapsed_time": "0:26:39", "remaining_time": "1:43:39", "throughput": 31268.79, "total_tokens": 49998832}
68
+ {"current_steps": 340, "total_steps": 1638, "loss": 0.135, "lr": 4.4899324465419036e-05, "epoch": 0.20757020757020758, "percentage": 20.76, "elapsed_time": "0:27:03", "remaining_time": "1:43:16", "throughput": 31265.44, "total_tokens": 50749456}
69
+ {"current_steps": 345, "total_steps": 1638, "loss": 0.1311, "lr": 4.475328753408499e-05, "epoch": 0.21062271062271062, "percentage": 21.06, "elapsed_time": "0:27:27", "remaining_time": "1:42:54", "throughput": 31262.04, "total_tokens": 51499824}
70
+ {"current_steps": 350, "total_steps": 1638, "loss": 0.1308, "lr": 4.460543404998924e-05, "epoch": 0.21367521367521367, "percentage": 21.37, "elapsed_time": "0:27:51", "remaining_time": "1:42:31", "throughput": 31251.93, "total_tokens": 52242768}
71
+ {"current_steps": 355, "total_steps": 1638, "loss": 0.1277, "lr": 4.4455777610040846e-05, "epoch": 0.21672771672771673, "percentage": 21.67, "elapsed_time": "0:28:15", "remaining_time": "1:42:06", "throughput": 31258.26, "total_tokens": 52986432}
72
+ {"current_steps": 360, "total_steps": 1638, "loss": 0.1446, "lr": 4.4304331976952426e-05, "epoch": 0.21978021978021978, "percentage": 21.98, "elapsed_time": "0:28:38", "remaining_time": "1:41:41", "throughput": 31253.77, "total_tokens": 53716960}
73
+ {"current_steps": 365, "total_steps": 1638, "loss": 0.1321, "lr": 4.415111107797445e-05, "epoch": 0.22283272283272285, "percentage": 22.28, "elapsed_time": "0:29:02", "remaining_time": "1:41:16", "throughput": 31260.1, "total_tokens": 54467760}
74
+ {"current_steps": 370, "total_steps": 1638, "loss": 0.1336, "lr": 4.3996129003614476e-05, "epoch": 0.2258852258852259, "percentage": 22.59, "elapsed_time": "0:29:26", "remaining_time": "1:40:53", "throughput": 31251.43, "total_tokens": 55205056}
75
+ {"current_steps": 375, "total_steps": 1638, "loss": 0.1242, "lr": 4.3839400006341335e-05, "epoch": 0.22893772893772893, "percentage": 22.89, "elapsed_time": "0:29:49", "remaining_time": "1:40:26", "throughput": 31255.07, "total_tokens": 55926512}
76
+ {"current_steps": 380, "total_steps": 1638, "loss": 0.13, "lr": 4.3680938499274485e-05, "epoch": 0.231990231990232, "percentage": 23.2, "elapsed_time": "0:30:12", "remaining_time": "1:40:01", "throughput": 31249.2, "total_tokens": 56649424}
77
+ {"current_steps": 385, "total_steps": 1638, "loss": 0.1343, "lr": 4.352075905485854e-05, "epoch": 0.23504273504273504, "percentage": 23.5, "elapsed_time": "0:30:36", "remaining_time": "1:39:37", "throughput": 31260.99, "total_tokens": 57414592}
78
+ {"current_steps": 390, "total_steps": 1638, "loss": 0.1317, "lr": 4.335887640352312e-05, "epoch": 0.23809523809523808, "percentage": 23.81, "elapsed_time": "0:31:00", "remaining_time": "1:39:12", "throughput": 31257.28, "total_tokens": 58141200}
79
+ {"current_steps": 395, "total_steps": 1638, "loss": 0.1232, "lr": 4.319530543232827e-05, "epoch": 0.24114774114774115, "percentage": 24.11, "elapsed_time": "0:31:24", "remaining_time": "1:38:48", "throughput": 31259.27, "total_tokens": 58893360}
80
+ {"current_steps": 400, "total_steps": 1638, "loss": 0.1264, "lr": 4.303006118359537e-05, "epoch": 0.2442002442002442, "percentage": 24.42, "elapsed_time": "0:31:47", "remaining_time": "1:38:22", "throughput": 31264.4, "total_tokens": 59629152}
81
+ {"current_steps": 405, "total_steps": 1638, "loss": 0.128, "lr": 4.286315885352382e-05, "epoch": 0.24725274725274726, "percentage": 24.73, "elapsed_time": "0:32:10", "remaining_time": "1:37:58", "throughput": 31265.36, "total_tokens": 60368640}
82
+ {"current_steps": 410, "total_steps": 1638, "loss": 0.1287, "lr": 4.2694613790793604e-05, "epoch": 0.2503052503052503, "percentage": 25.03, "elapsed_time": "0:32:35", "remaining_time": "1:37:35", "throughput": 31266.52, "total_tokens": 61126592}
83
+ {"current_steps": 415, "total_steps": 1638, "loss": 0.1251, "lr": 4.252444149515374e-05, "epoch": 0.25335775335775335, "percentage": 25.34, "elapsed_time": "0:32:58", "remaining_time": "1:37:10", "throughput": 31261.84, "total_tokens": 61847552}
84
+ {"current_steps": 420, "total_steps": 1638, "loss": 0.1295, "lr": 4.235265761599691e-05, "epoch": 0.2564102564102564, "percentage": 25.64, "elapsed_time": "0:33:22", "remaining_time": "1:36:46", "throughput": 31265.65, "total_tokens": 62599056}
85
+ {"current_steps": 425, "total_steps": 1638, "loss": 0.1294, "lr": 4.217927795092034e-05, "epoch": 0.2594627594627595, "percentage": 25.95, "elapsed_time": "0:33:46", "remaining_time": "1:36:23", "throughput": 31257.58, "total_tokens": 63343712}
86
+ {"current_steps": 430, "total_steps": 1638, "loss": 0.1249, "lr": 4.2004318444272985e-05, "epoch": 0.2625152625152625, "percentage": 26.25, "elapsed_time": "0:34:10", "remaining_time": "1:36:00", "throughput": 31256.5, "total_tokens": 64088256}
87
+ {"current_steps": 435, "total_steps": 1638, "loss": 0.1292, "lr": 4.182779518568926e-05, "epoch": 0.26556776556776557, "percentage": 26.56, "elapsed_time": "0:34:34", "remaining_time": "1:35:36", "throughput": 31254.43, "total_tokens": 64832512}
88
+ {"current_steps": 440, "total_steps": 1638, "loss": 0.1276, "lr": 4.1649724408609406e-05, "epoch": 0.2686202686202686, "percentage": 26.86, "elapsed_time": "0:34:57", "remaining_time": "1:35:11", "throughput": 31260.27, "total_tokens": 65576640}
89
+ {"current_steps": 445, "total_steps": 1638, "loss": 0.1225, "lr": 4.1470122488786645e-05, "epoch": 0.27167277167277165, "percentage": 27.17, "elapsed_time": "0:35:21", "remaining_time": "1:34:47", "throughput": 31261.74, "total_tokens": 66320960}
90
+ {"current_steps": 450, "total_steps": 1638, "loss": 0.1324, "lr": 4.128900594278122e-05, "epoch": 0.27472527472527475, "percentage": 27.47, "elapsed_time": "0:35:44", "remaining_time": "1:34:22", "throughput": 31256.7, "total_tokens": 67041040}
91
+ {"current_steps": 455, "total_steps": 1638, "loss": 0.1277, "lr": 4.110639142644149e-05, "epoch": 0.2777777777777778, "percentage": 27.78, "elapsed_time": "0:36:08", "remaining_time": "1:33:58", "throughput": 31254.42, "total_tokens": 67784672}
92
+ {"current_steps": 460, "total_steps": 1638, "loss": 0.1276, "lr": 4.092229573337223e-05, "epoch": 0.28083028083028083, "percentage": 28.08, "elapsed_time": "0:36:32", "remaining_time": "1:33:34", "throughput": 31257.25, "total_tokens": 68528416}
93
+ {"current_steps": 465, "total_steps": 1638, "loss": 0.1324, "lr": 4.073673579339028e-05, "epoch": 0.2838827838827839, "percentage": 28.39, "elapsed_time": "0:36:57", "remaining_time": "1:33:13", "throughput": 31251.18, "total_tokens": 69293616}
94
+ {"current_steps": 470, "total_steps": 1638, "loss": 0.1333, "lr": 4.05497286709676e-05, "epoch": 0.2869352869352869, "percentage": 28.69, "elapsed_time": "0:37:21", "remaining_time": "1:32:49", "throughput": 31246.13, "total_tokens": 70026160}
95
+ {"current_steps": 475, "total_steps": 1638, "loss": 0.1371, "lr": 4.036129156366203e-05, "epoch": 0.28998778998779, "percentage": 29.0, "elapsed_time": "0:37:44", "remaining_time": "1:32:25", "throughput": 31247.53, "total_tokens": 70771216}
96
+ {"current_steps": 480, "total_steps": 1638, "loss": 0.1292, "lr": 4.017144180053572e-05, "epoch": 0.29304029304029305, "percentage": 29.3, "elapsed_time": "0:38:09", "remaining_time": "1:32:02", "throughput": 31239.39, "total_tokens": 71516128}
97
+ {"current_steps": 485, "total_steps": 1638, "loss": 0.1291, "lr": 3.998019684056158e-05, "epoch": 0.2960927960927961, "percentage": 29.61, "elapsed_time": "0:38:33", "remaining_time": "1:31:38", "throughput": 31241.65, "total_tokens": 72262192}
98
+ {"current_steps": 490, "total_steps": 1638, "loss": 0.1236, "lr": 3.978757427101764e-05, "epoch": 0.29914529914529914, "percentage": 29.91, "elapsed_time": "0:38:57", "remaining_time": "1:31:15", "throughput": 31247.58, "total_tokens": 73028336}
99
+ {"current_steps": 495, "total_steps": 1638, "loss": 0.116, "lr": 3.959359180586975e-05, "epoch": 0.3021978021978022, "percentage": 30.22, "elapsed_time": "0:39:21", "remaining_time": "1:30:53", "throughput": 31245.4, "total_tokens": 73793216}
100
+ {"current_steps": 500, "total_steps": 1638, "loss": 0.1284, "lr": 3.939826728414254e-05, "epoch": 0.3052503052503053, "percentage": 30.53, "elapsed_time": "0:39:44", "remaining_time": "1:30:28", "throughput": 31248.38, "total_tokens": 74526624}
101
+ {"current_steps": 505, "total_steps": 1638, "loss": 0.1194, "lr": 3.920161866827889e-05, "epoch": 0.3083028083028083, "percentage": 30.83, "elapsed_time": "0:40:07", "remaining_time": "1:30:02", "throughput": 31246.95, "total_tokens": 75239984}
102
+ {"current_steps": 510, "total_steps": 1638, "loss": 0.1303, "lr": 3.9003664042488144e-05, "epoch": 0.31135531135531136, "percentage": 31.14, "elapsed_time": "0:40:31", "remaining_time": "1:29:38", "throughput": 31238.03, "total_tokens": 75969680}
103
+ {"current_steps": 515, "total_steps": 1638, "loss": 0.128, "lr": 3.8804421611082916e-05, "epoch": 0.3144078144078144, "percentage": 31.44, "elapsed_time": "0:40:55", "remaining_time": "1:29:15", "throughput": 31229.78, "total_tokens": 76699136}
104
+ {"current_steps": 520, "total_steps": 1638, "loss": 0.1186, "lr": 3.8603909696805104e-05, "epoch": 0.31746031746031744, "percentage": 31.75, "elapsed_time": "0:41:19", "remaining_time": "1:28:51", "throughput": 31227.37, "total_tokens": 77431088}
105
+ {"current_steps": 525, "total_steps": 1638, "loss": 0.1277, "lr": 3.8402146739140874e-05, "epoch": 0.32051282051282054, "percentage": 32.05, "elapsed_time": "0:41:43", "remaining_time": "1:28:28", "throughput": 31228.73, "total_tokens": 78191456}
106
+ {"current_steps": 530, "total_steps": 1638, "loss": 0.1289, "lr": 3.819915129262484e-05, "epoch": 0.3235653235653236, "percentage": 32.36, "elapsed_time": "0:42:07", "remaining_time": "1:28:04", "throughput": 31222.78, "total_tokens": 78920656}
107
+ {"current_steps": 535, "total_steps": 1638, "loss": 0.1259, "lr": 3.799494202513386e-05, "epoch": 0.3266178266178266, "percentage": 32.66, "elapsed_time": "0:42:30", "remaining_time": "1:27:39", "throughput": 31227.87, "total_tokens": 79659328}
108
+ {"current_steps": 540, "total_steps": 1638, "loss": 0.1245, "lr": 3.7789537716170256e-05, "epoch": 0.32967032967032966, "percentage": 32.97, "elapsed_time": "0:42:54", "remaining_time": "1:27:15", "throughput": 31228.72, "total_tokens": 80403760}
109
+ {"current_steps": 545, "total_steps": 1638, "loss": 0.1237, "lr": 3.7582957255134765e-05, "epoch": 0.3327228327228327, "percentage": 33.27, "elapsed_time": "0:43:18", "remaining_time": "1:26:50", "throughput": 31231.05, "total_tokens": 81145952}
110
+ {"current_steps": 550, "total_steps": 1638, "loss": 0.1275, "lr": 3.7375219639589536e-05, "epoch": 0.33577533577533575, "percentage": 33.58, "elapsed_time": "0:43:42", "remaining_time": "1:26:27", "throughput": 31228.56, "total_tokens": 81894240}
111
+ {"current_steps": 555, "total_steps": 1638, "loss": 0.1325, "lr": 3.716634397351097e-05, "epoch": 0.33882783882783885, "percentage": 33.88, "elapsed_time": "0:44:05", "remaining_time": "1:26:02", "throughput": 31233.29, "total_tokens": 82629360}
112
+ {"current_steps": 560, "total_steps": 1638, "loss": 0.1262, "lr": 3.695634946553296e-05, "epoch": 0.3418803418803419, "percentage": 34.19, "elapsed_time": "0:44:29", "remaining_time": "1:25:39", "throughput": 31230.79, "total_tokens": 83382976}
113
+ {"current_steps": 565, "total_steps": 1638, "loss": 0.1341, "lr": 3.674525542718035e-05, "epoch": 0.34493284493284493, "percentage": 34.49, "elapsed_time": "0:44:53", "remaining_time": "1:25:15", "throughput": 31233.2, "total_tokens": 84124464}
114
+ {"current_steps": 570, "total_steps": 1638, "loss": 0.1253, "lr": 3.653308127109309e-05, "epoch": 0.34798534798534797, "percentage": 34.8, "elapsed_time": "0:45:17", "remaining_time": "1:24:51", "throughput": 31230.16, "total_tokens": 84863584}
115
+ {"current_steps": 575, "total_steps": 1638, "loss": 0.1294, "lr": 3.631984650924094e-05, "epoch": 0.351037851037851, "percentage": 35.1, "elapsed_time": "0:45:40", "remaining_time": "1:24:26", "throughput": 31229.92, "total_tokens": 85589552}
116
+ {"current_steps": 580, "total_steps": 1638, "loss": 0.1231, "lr": 3.610557075112914e-05, "epoch": 0.3540903540903541, "percentage": 35.41, "elapsed_time": "0:46:04", "remaining_time": "1:24:02", "throughput": 31238.59, "total_tokens": 86354208}
117
+ {"current_steps": 585, "total_steps": 1638, "loss": 0.1258, "lr": 3.58902737019951e-05, "epoch": 0.35714285714285715, "percentage": 35.71, "elapsed_time": "0:46:27", "remaining_time": "1:23:38", "throughput": 31238.34, "total_tokens": 87086304}
118
+ {"current_steps": 590, "total_steps": 1638, "loss": 0.126, "lr": 3.567397516099621e-05, "epoch": 0.3601953601953602, "percentage": 36.02, "elapsed_time": "0:46:51", "remaining_time": "1:23:14", "throughput": 31245.28, "total_tokens": 87856016}
119
+ {"current_steps": 595, "total_steps": 1638, "loss": 0.1212, "lr": 3.545669501938913e-05, "epoch": 0.36324786324786323, "percentage": 36.32, "elapsed_time": "0:47:15", "remaining_time": "1:22:50", "throughput": 31240.25, "total_tokens": 88583632}
120
+ {"current_steps": 600, "total_steps": 1638, "loss": 0.1201, "lr": 3.5238453258700514e-05, "epoch": 0.3663003663003663, "percentage": 36.63, "elapsed_time": "0:47:39", "remaining_time": "1:22:27", "throughput": 31235.15, "total_tokens": 89328928}
121
+ {"current_steps": 605, "total_steps": 1638, "loss": 0.1249, "lr": 3.501926994888946e-05, "epoch": 0.3693528693528694, "percentage": 36.94, "elapsed_time": "0:48:03", "remaining_time": "1:22:03", "throughput": 31235.59, "total_tokens": 90063824}
122
+ {"current_steps": 610, "total_steps": 1638, "loss": 0.1251, "lr": 3.479916524650188e-05, "epoch": 0.3724053724053724, "percentage": 37.24, "elapsed_time": "0:48:26", "remaining_time": "1:21:37", "throughput": 31230.07, "total_tokens": 90761024}
123
+ {"current_steps": 615, "total_steps": 1638, "loss": 0.1178, "lr": 3.45781593928168e-05, "epoch": 0.37545787545787546, "percentage": 37.55, "elapsed_time": "0:48:50", "remaining_time": "1:21:14", "throughput": 31229.42, "total_tokens": 91512496}
124
+ {"current_steps": 620, "total_steps": 1638, "loss": 0.1199, "lr": 3.4356272711984994e-05, "epoch": 0.3785103785103785, "percentage": 37.85, "elapsed_time": "0:49:14", "remaining_time": "1:20:50", "throughput": 31227.0, "total_tokens": 92248080}
125
+ {"current_steps": 625, "total_steps": 1638, "loss": 0.124, "lr": 3.413352560915988e-05, "epoch": 0.38156288156288154, "percentage": 38.16, "elapsed_time": "0:49:38", "remaining_time": "1:20:27", "throughput": 31222.42, "total_tokens": 92989936}
126
+ {"current_steps": 630, "total_steps": 1638, "loss": 0.1264, "lr": 3.390993856862106e-05, "epoch": 0.38461538461538464, "percentage": 38.46, "elapsed_time": "0:50:01", "remaining_time": "1:20:02", "throughput": 31228.42, "total_tokens": 93726464}
127
+ {"current_steps": 635, "total_steps": 1638, "loss": 0.1186, "lr": 3.368553215189052e-05, "epoch": 0.3876678876678877, "percentage": 38.77, "elapsed_time": "0:50:25", "remaining_time": "1:19:39", "throughput": 31229.73, "total_tokens": 94492800}
128
+ {"current_steps": 640, "total_steps": 1638, "loss": 0.1278, "lr": 3.346032699584176e-05, "epoch": 0.3907203907203907, "percentage": 39.07, "elapsed_time": "0:50:49", "remaining_time": "1:19:15", "throughput": 31229.63, "total_tokens": 95231392}
129
+ {"current_steps": 645, "total_steps": 1638, "loss": 0.1247, "lr": 3.323434381080199e-05, "epoch": 0.39377289377289376, "percentage": 39.38, "elapsed_time": "0:51:13", "remaining_time": "1:18:51", "throughput": 31231.28, "total_tokens": 95982544}
130
+ {"current_steps": 650, "total_steps": 1638, "loss": 0.128, "lr": 3.300760337864755e-05, "epoch": 0.3968253968253968, "percentage": 39.68, "elapsed_time": "0:51:36", "remaining_time": "1:18:26", "throughput": 31230.14, "total_tokens": 96701152}
131
+ {"current_steps": 655, "total_steps": 1638, "loss": 0.1283, "lr": 3.278012655089277e-05, "epoch": 0.3998778998778999, "percentage": 39.99, "elapsed_time": "0:51:59", "remaining_time": "1:18:02", "throughput": 31231.86, "total_tokens": 97439328}
132
+ {"current_steps": 660, "total_steps": 1638, "loss": 0.1248, "lr": 3.255193424677244e-05, "epoch": 0.40293040293040294, "percentage": 40.29, "elapsed_time": "0:52:23", "remaining_time": "1:17:37", "throughput": 31228.96, "total_tokens": 98166064}
133
+ {"current_steps": 665, "total_steps": 1638, "loss": 0.111, "lr": 3.2323047451318023e-05, "epoch": 0.405982905982906, "percentage": 40.6, "elapsed_time": "0:52:47", "remaining_time": "1:17:14", "throughput": 31225.05, "total_tokens": 98902864}
134
+ {"current_steps": 670, "total_steps": 1638, "loss": 0.1187, "lr": 3.209348721342781e-05, "epoch": 0.409035409035409, "percentage": 40.9, "elapsed_time": "0:53:11", "remaining_time": "1:16:51", "throughput": 31223.4, "total_tokens": 99653088}
135
+ {"current_steps": 675, "total_steps": 1638, "loss": 0.1186, "lr": 3.1863274643931244e-05, "epoch": 0.41208791208791207, "percentage": 41.21, "elapsed_time": "0:53:36", "remaining_time": "1:16:28", "throughput": 31229.81, "total_tokens": 100446640}
136
+ {"current_steps": 680, "total_steps": 1638, "loss": 0.1226, "lr": 3.163243091364752e-05, "epoch": 0.41514041514041516, "percentage": 41.51, "elapsed_time": "0:54:00", "remaining_time": "1:16:05", "throughput": 31232.69, "total_tokens": 101212368}
137
+ {"current_steps": 685, "total_steps": 1638, "loss": 0.1262, "lr": 3.140097725143868e-05, "epoch": 0.4181929181929182, "percentage": 41.82, "elapsed_time": "0:54:24", "remaining_time": "1:15:42", "throughput": 31224.36, "total_tokens": 101947248}
138
+ {"current_steps": 690, "total_steps": 1638, "loss": 0.1266, "lr": 3.116893494225734e-05, "epoch": 0.42124542124542125, "percentage": 42.12, "elapsed_time": "0:54:48", "remaining_time": "1:15:18", "throughput": 31231.05, "total_tokens": 102702528}
139
+ {"current_steps": 695, "total_steps": 1638, "loss": 0.1191, "lr": 3.093632532518931e-05, "epoch": 0.4242979242979243, "percentage": 42.43, "elapsed_time": "0:55:12", "remaining_time": "1:14:54", "throughput": 31235.24, "total_tokens": 103463872}
140
+ {"current_steps": 700, "total_steps": 1638, "loss": 0.1206, "lr": 3.0703169791491184e-05, "epoch": 0.42735042735042733, "percentage": 42.74, "elapsed_time": "0:55:35", "remaining_time": "1:14:29", "throughput": 31241.27, "total_tokens": 104209488}
141
+ {"current_steps": 705, "total_steps": 1638, "loss": 0.1125, "lr": 3.0469489782623163e-05, "epoch": 0.43040293040293043, "percentage": 43.04, "elapsed_time": "0:55:59", "remaining_time": "1:14:05", "throughput": 31241.52, "total_tokens": 104951200}
142
+ {"current_steps": 710, "total_steps": 1638, "loss": 0.1203, "lr": 3.0235306788277275e-05, "epoch": 0.43345543345543347, "percentage": 43.35, "elapsed_time": "0:56:23", "remaining_time": "1:13:42", "throughput": 31244.99, "total_tokens": 105713856}
143
+ {"current_steps": 715, "total_steps": 1638, "loss": 0.1163, "lr": 3.0000642344401113e-05, "epoch": 0.4365079365079365, "percentage": 43.65, "elapsed_time": "0:56:47", "remaining_time": "1:13:18", "throughput": 31246.21, "total_tokens": 106470640}
144
+ {"current_steps": 720, "total_steps": 1638, "loss": 0.1128, "lr": 2.9765518031217353e-05, "epoch": 0.43956043956043955, "percentage": 43.96, "elapsed_time": "0:57:11", "remaining_time": "1:12:54", "throughput": 31244.77, "total_tokens": 107210304}
145
+ {"current_steps": 725, "total_steps": 1638, "loss": 0.1159, "lr": 2.952995547123919e-05, "epoch": 0.4426129426129426, "percentage": 44.26, "elapsed_time": "0:57:35", "remaining_time": "1:12:32", "throughput": 31239.23, "total_tokens": 107959696}
146
+ {"current_steps": 730, "total_steps": 1638, "loss": 0.131, "lr": 2.9293976327281908e-05, "epoch": 0.4456654456654457, "percentage": 44.57, "elapsed_time": "0:57:59", "remaining_time": "1:12:07", "throughput": 31240.23, "total_tokens": 108695072}
147
+ {"current_steps": 735, "total_steps": 1638, "loss": 0.1198, "lr": 2.905760230047068e-05, "epoch": 0.44871794871794873, "percentage": 44.87, "elapsed_time": "0:58:23", "remaining_time": "1:11:44", "throughput": 31233.46, "total_tokens": 109430784}
148
+ {"current_steps": 740, "total_steps": 1638, "loss": 0.1135, "lr": 2.882085512824495e-05, "epoch": 0.4517704517704518, "percentage": 45.18, "elapsed_time": "0:58:47", "remaining_time": "1:11:21", "throughput": 31230.41, "total_tokens": 110179680}
149
+ {"current_steps": 745, "total_steps": 1638, "loss": 0.1272, "lr": 2.8583756582359338e-05, "epoch": 0.4548229548229548, "percentage": 45.48, "elapsed_time": "0:59:11", "remaining_time": "1:10:57", "throughput": 31231.58, "total_tokens": 110924720}
150
+ {"current_steps": 750, "total_steps": 1638, "loss": 0.1233, "lr": 2.8346328466881545e-05, "epoch": 0.45787545787545786, "percentage": 45.79, "elapsed_time": "0:59:34", "remaining_time": "1:10:32", "throughput": 31234.96, "total_tokens": 111655808}
151
+ {"current_steps": 755, "total_steps": 1638, "loss": 0.1178, "lr": 2.8108592616187133e-05, "epoch": 0.4609279609279609, "percentage": 46.09, "elapsed_time": "0:59:58", "remaining_time": "1:10:08", "throughput": 31236.36, "total_tokens": 112410512}
152
+ {"current_steps": 760, "total_steps": 1638, "loss": 0.121, "lr": 2.7870570892951642e-05, "epoch": 0.463980463980464, "percentage": 46.4, "elapsed_time": "1:00:22", "remaining_time": "1:09:44", "throughput": 31235.51, "total_tokens": 113144208}
153
+ {"current_steps": 765, "total_steps": 1638, "loss": 0.1194, "lr": 2.763228518614004e-05, "epoch": 0.46703296703296704, "percentage": 46.7, "elapsed_time": "1:00:45", "remaining_time": "1:09:20", "throughput": 31241.14, "total_tokens": 113890288}
154
+ {"current_steps": 770, "total_steps": 1638, "loss": 0.1158, "lr": 2.739375740899375e-05, "epoch": 0.4700854700854701, "percentage": 47.01, "elapsed_time": "1:01:09", "remaining_time": "1:08:56", "throughput": 31238.43, "total_tokens": 114624800}
155
+ {"current_steps": 775, "total_steps": 1638, "loss": 0.1182, "lr": 2.715500949701549e-05, "epoch": 0.4731379731379731, "percentage": 47.31, "elapsed_time": "1:01:33", "remaining_time": "1:08:32", "throughput": 31233.58, "total_tokens": 115354240}
156
+ {"current_steps": 780, "total_steps": 1638, "loss": 0.1221, "lr": 2.6916063405952026e-05, "epoch": 0.47619047619047616, "percentage": 47.62, "elapsed_time": "1:01:56", "remaining_time": "1:08:08", "throughput": 31235.19, "total_tokens": 116095344}
157
+ {"current_steps": 785, "total_steps": 1638, "loss": 0.119, "lr": 2.667694110977506e-05, "epoch": 0.47924297924297926, "percentage": 47.92, "elapsed_time": "1:02:20", "remaining_time": "1:07:45", "throughput": 31234.72, "total_tokens": 116848064}
158
+ {"current_steps": 790, "total_steps": 1638, "loss": 0.124, "lr": 2.6437664598660516e-05, "epoch": 0.4822954822954823, "percentage": 48.23, "elapsed_time": "1:02:44", "remaining_time": "1:07:21", "throughput": 31233.67, "total_tokens": 117585264}
159
+ {"current_steps": 795, "total_steps": 1638, "loss": 0.1181, "lr": 2.6198255876966204e-05, "epoch": 0.48534798534798534, "percentage": 48.53, "elapsed_time": "1:03:08", "remaining_time": "1:06:57", "throughput": 31230.55, "total_tokens": 118323136}
160
+ {"current_steps": 800, "total_steps": 1638, "loss": 0.1197, "lr": 2.5958736961208314e-05, "epoch": 0.4884004884004884, "percentage": 48.84, "elapsed_time": "1:03:32", "remaining_time": "1:06:33", "throughput": 31229.06, "total_tokens": 119069008}
161
+ {"current_steps": 805, "total_steps": 1638, "loss": 0.1067, "lr": 2.5719129878036686e-05, "epoch": 0.49145299145299143, "percentage": 49.15, "elapsed_time": "1:03:56", "remaining_time": "1:06:09", "throughput": 31224.86, "total_tokens": 119794944}
162
+ {"current_steps": 810, "total_steps": 1638, "loss": 0.1213, "lr": 2.547945666220923e-05, "epoch": 0.4945054945054945, "percentage": 49.45, "elapsed_time": "1:04:19", "remaining_time": "1:05:45", "throughput": 31227.05, "total_tokens": 120536208}
163
+ {"current_steps": 815, "total_steps": 1638, "loss": 0.1216, "lr": 2.523973935456554e-05, "epoch": 0.49755799755799757, "percentage": 49.76, "elapsed_time": "1:04:43", "remaining_time": "1:05:21", "throughput": 31223.41, "total_tokens": 121263008}
164
+ {"current_steps": 820, "total_steps": 1638, "loss": 0.1152, "lr": 2.5e-05, "epoch": 0.5006105006105006, "percentage": 50.06, "elapsed_time": "1:05:07", "remaining_time": "1:04:57", "throughput": 31223.89, "total_tokens": 122000960}
165
+ {"current_steps": 825, "total_steps": 1638, "loss": 0.1277, "lr": 2.4760260645434462e-05, "epoch": 0.5036630036630036, "percentage": 50.37, "elapsed_time": "1:05:31", "remaining_time": "1:04:34", "throughput": 31224.22, "total_tokens": 122758368}
166
+ {"current_steps": 830, "total_steps": 1638, "loss": 0.1209, "lr": 2.452054333779078e-05, "epoch": 0.5067155067155067, "percentage": 50.67, "elapsed_time": "1:05:54", "remaining_time": "1:04:10", "throughput": 31221.96, "total_tokens": 123478800}
167
+ {"current_steps": 835, "total_steps": 1638, "loss": 0.1202, "lr": 2.4280870121963323e-05, "epoch": 0.5097680097680097, "percentage": 50.98, "elapsed_time": "1:06:18", "remaining_time": "1:03:46", "throughput": 31225.12, "total_tokens": 124232736}
168
+ {"current_steps": 840, "total_steps": 1638, "loss": 0.1263, "lr": 2.404126303879169e-05, "epoch": 0.5128205128205128, "percentage": 51.28, "elapsed_time": "1:06:43", "remaining_time": "1:03:22", "throughput": 31217.46, "total_tokens": 124967824}
169
+ {"current_steps": 845, "total_steps": 1638, "loss": 0.1157, "lr": 2.38017441230338e-05, "epoch": 0.5158730158730159, "percentage": 51.59, "elapsed_time": "1:07:07", "remaining_time": "1:02:59", "throughput": 31215.31, "total_tokens": 125720640}
170
+ {"current_steps": 850, "total_steps": 1638, "loss": 0.112, "lr": 2.3562335401339486e-05, "epoch": 0.518925518925519, "percentage": 51.89, "elapsed_time": "1:07:31", "remaining_time": "1:02:36", "throughput": 31217.88, "total_tokens": 126482544}
171
+ {"current_steps": 855, "total_steps": 1638, "loss": 0.1183, "lr": 2.3323058890224938e-05, "epoch": 0.521978021978022, "percentage": 52.2, "elapsed_time": "1:07:55", "remaining_time": "1:02:12", "throughput": 31218.83, "total_tokens": 127241776}
172
+ {"current_steps": 860, "total_steps": 1638, "loss": 0.1111, "lr": 2.3083936594047983e-05, "epoch": 0.525030525030525, "percentage": 52.5, "elapsed_time": "1:08:20", "remaining_time": "1:01:49", "throughput": 31222.51, "total_tokens": 128032656}
173
+ {"current_steps": 865, "total_steps": 1638, "loss": 0.1247, "lr": 2.2844990502984513e-05, "epoch": 0.5280830280830281, "percentage": 52.81, "elapsed_time": "1:08:43", "remaining_time": "1:01:25", "throughput": 31222.27, "total_tokens": 128759440}
174
+ {"current_steps": 870, "total_steps": 1638, "loss": 0.1144, "lr": 2.2606242591006253e-05, "epoch": 0.5311355311355311, "percentage": 53.11, "elapsed_time": "1:09:07", "remaining_time": "1:01:01", "throughput": 31225.69, "total_tokens": 129520032}
175
+ {"current_steps": 875, "total_steps": 1638, "loss": 0.115, "lr": 2.2367714813859967e-05, "epoch": 0.5341880341880342, "percentage": 53.42, "elapsed_time": "1:09:31", "remaining_time": "1:00:37", "throughput": 31228.93, "total_tokens": 130265744}
176
+ {"current_steps": 880, "total_steps": 1638, "loss": 0.1195, "lr": 2.2129429107048364e-05, "epoch": 0.5372405372405372, "percentage": 53.72, "elapsed_time": "1:09:55", "remaining_time": "1:00:13", "throughput": 31227.16, "total_tokens": 131007104}
177
+ {"current_steps": 885, "total_steps": 1638, "loss": 0.1264, "lr": 2.189140738381288e-05, "epoch": 0.5402930402930403, "percentage": 54.03, "elapsed_time": "1:10:18", "remaining_time": "0:59:49", "throughput": 31226.43, "total_tokens": 131729648}
178
+ {"current_steps": 890, "total_steps": 1638, "loss": 0.1116, "lr": 2.1653671533118468e-05, "epoch": 0.5433455433455433, "percentage": 54.33, "elapsed_time": "1:10:42", "remaining_time": "0:59:25", "throughput": 31227.78, "total_tokens": 132482960}
179
+ {"current_steps": 895, "total_steps": 1638, "loss": 0.123, "lr": 2.1416243417640668e-05, "epoch": 0.5463980463980463, "percentage": 54.64, "elapsed_time": "1:11:06", "remaining_time": "0:59:02", "throughput": 31228.15, "total_tokens": 133239472}
180
+ {"current_steps": 900, "total_steps": 1638, "loss": 0.115, "lr": 2.1179144871755056e-05, "epoch": 0.5494505494505495, "percentage": 54.95, "elapsed_time": "1:11:30", "remaining_time": "0:58:38", "throughput": 31226.87, "total_tokens": 133981072}
181
+ {"current_steps": 905, "total_steps": 1638, "loss": 0.1117, "lr": 2.0942397699529325e-05, "epoch": 0.5525030525030525, "percentage": 55.25, "elapsed_time": "1:11:54", "remaining_time": "0:58:14", "throughput": 31227.18, "total_tokens": 134739520}
182
+ {"current_steps": 910, "total_steps": 1638, "loss": 0.1177, "lr": 2.0706023672718098e-05, "epoch": 0.5555555555555556, "percentage": 55.56, "elapsed_time": "1:12:18", "remaining_time": "0:57:50", "throughput": 31226.28, "total_tokens": 135479872}
183
+ {"current_steps": 915, "total_steps": 1638, "loss": 0.1133, "lr": 2.047004452876081e-05, "epoch": 0.5586080586080586, "percentage": 55.86, "elapsed_time": "1:12:42", "remaining_time": "0:57:26", "throughput": 31224.5, "total_tokens": 136213072}
184
+ {"current_steps": 920, "total_steps": 1638, "loss": 0.1102, "lr": 2.0234481968782653e-05, "epoch": 0.5616605616605617, "percentage": 56.17, "elapsed_time": "1:13:05", "remaining_time": "0:57:02", "throughput": 31227.12, "total_tokens": 136958032}
185
+ {"current_steps": 925, "total_steps": 1638, "loss": 0.1148, "lr": 1.9999357655598893e-05, "epoch": 0.5647130647130647, "percentage": 56.47, "elapsed_time": "1:13:28", "remaining_time": "0:56:38", "throughput": 31227.97, "total_tokens": 137681904}
186
+ {"current_steps": 930, "total_steps": 1638, "loss": 0.1191, "lr": 1.9764693211722727e-05, "epoch": 0.5677655677655677, "percentage": 56.78, "elapsed_time": "1:13:53", "remaining_time": "0:56:14", "throughput": 31223.66, "total_tokens": 138418096}
187
+ {"current_steps": 935, "total_steps": 1638, "loss": 0.1194, "lr": 1.9530510217376843e-05, "epoch": 0.5708180708180708, "percentage": 57.08, "elapsed_time": "1:14:16", "remaining_time": "0:55:50", "throughput": 31227.85, "total_tokens": 139172816}
188
+ {"current_steps": 940, "total_steps": 1638, "loss": 0.1105, "lr": 1.929683020850883e-05, "epoch": 0.5738705738705738, "percentage": 57.39, "elapsed_time": "1:14:39", "remaining_time": "0:55:26", "throughput": 31230.35, "total_tokens": 139907184}
189
+ {"current_steps": 945, "total_steps": 1638, "loss": 0.1172, "lr": 1.9063674674810696e-05, "epoch": 0.5769230769230769, "percentage": 57.69, "elapsed_time": "1:15:03", "remaining_time": "0:55:02", "throughput": 31230.27, "total_tokens": 140656832}
190
+ {"current_steps": 950, "total_steps": 1638, "loss": 0.1169, "lr": 1.8831065057742657e-05, "epoch": 0.57997557997558, "percentage": 58.0, "elapsed_time": "1:15:27", "remaining_time": "0:54:39", "throughput": 31228.45, "total_tokens": 141397952}
191
+ {"current_steps": 955, "total_steps": 1638, "loss": 0.1205, "lr": 1.8599022748561325e-05, "epoch": 0.5830280830280831, "percentage": 58.3, "elapsed_time": "1:15:52", "remaining_time": "0:54:15", "throughput": 31228.21, "total_tokens": 142152656}
192
+ {"current_steps": 960, "total_steps": 1638, "loss": 0.1078, "lr": 1.8367569086352483e-05, "epoch": 0.5860805860805861, "percentage": 58.61, "elapsed_time": "1:16:15", "remaining_time": "0:53:51", "throughput": 31228.62, "total_tokens": 142883936}
193
+ {"current_steps": 965, "total_steps": 1638, "loss": 0.1208, "lr": 1.8136725356068762e-05, "epoch": 0.5891330891330891, "percentage": 58.91, "elapsed_time": "1:16:39", "remaining_time": "0:53:27", "throughput": 31232.08, "total_tokens": 143644704}
194
+ {"current_steps": 970, "total_steps": 1638, "loss": 0.1181, "lr": 1.7906512786572198e-05, "epoch": 0.5921855921855922, "percentage": 59.22, "elapsed_time": "1:17:02", "remaining_time": "0:53:03", "throughput": 31230.68, "total_tokens": 144375984}
195
+ {"current_steps": 975, "total_steps": 1638, "loss": 0.1137, "lr": 1.767695254868198e-05, "epoch": 0.5952380952380952, "percentage": 59.52, "elapsed_time": "1:17:26", "remaining_time": "0:52:39", "throughput": 31231.67, "total_tokens": 145118576}
196
+ {"current_steps": 980, "total_steps": 1638, "loss": 0.1181, "lr": 1.744806575322756e-05, "epoch": 0.5982905982905983, "percentage": 59.83, "elapsed_time": "1:17:50", "remaining_time": "0:52:15", "throughput": 31229.2, "total_tokens": 145859024}
197
+ {"current_steps": 985, "total_steps": 1638, "loss": 0.1175, "lr": 1.7219873449107233e-05, "epoch": 0.6013431013431013, "percentage": 60.13, "elapsed_time": "1:18:14", "remaining_time": "0:51:52", "throughput": 31229.91, "total_tokens": 146612992}
198
+ {"current_steps": 990, "total_steps": 1638, "loss": 0.1136, "lr": 1.699239662135246e-05, "epoch": 0.6043956043956044, "percentage": 60.44, "elapsed_time": "1:18:37", "remaining_time": "0:51:27", "throughput": 31230.44, "total_tokens": 147331680}
199
+ {"current_steps": 995, "total_steps": 1638, "loss": 0.1134, "lr": 1.6765656189198013e-05, "epoch": 0.6074481074481074, "percentage": 60.74, "elapsed_time": "1:19:01", "remaining_time": "0:51:04", "throughput": 31230.69, "total_tokens": 148081424}
200
+ {"current_steps": 1000, "total_steps": 1638, "loss": 0.1147, "lr": 1.653967300415824e-05, "epoch": 0.6105006105006106, "percentage": 61.05, "elapsed_time": "1:19:24", "remaining_time": "0:50:39", "throughput": 31229.66, "total_tokens": 148805696}
201
+ {"current_steps": 1005, "total_steps": 1638, "loss": 0.1194, "lr": 1.6314467848109483e-05, "epoch": 0.6135531135531136, "percentage": 61.36, "elapsed_time": "1:21:48", "remaining_time": "0:51:31", "throughput": 30467.69, "total_tokens": 149555088}
202
+ {"current_steps": 1010, "total_steps": 1638, "loss": 0.1099, "lr": 1.609006143137895e-05, "epoch": 0.6166056166056166, "percentage": 61.66, "elapsed_time": "1:22:12", "remaining_time": "0:51:06", "throughput": 30467.01, "total_tokens": 150280064}
203
+ {"current_steps": 1015, "total_steps": 1638, "loss": 0.1099, "lr": 1.5866474390840125e-05, "epoch": 0.6196581196581197, "percentage": 61.97, "elapsed_time": "1:22:35", "remaining_time": "0:50:41", "throughput": 30469.46, "total_tokens": 151000304}
204
+ {"current_steps": 1020, "total_steps": 1638, "loss": 0.1085, "lr": 1.564372728801501e-05, "epoch": 0.6227106227106227, "percentage": 62.27, "elapsed_time": "1:22:59", "remaining_time": "0:50:17", "throughput": 30474.96, "total_tokens": 151754352}
205
+ {"current_steps": 1025, "total_steps": 1638, "loss": 0.1078, "lr": 1.5421840607183203e-05, "epoch": 0.6257631257631258, "percentage": 62.58, "elapsed_time": "1:23:24", "remaining_time": "0:49:52", "throughput": 30481.44, "total_tokens": 152529360}
206
+ {"current_steps": 1030, "total_steps": 1638, "loss": 0.112, "lr": 1.5200834753498128e-05, "epoch": 0.6288156288156288, "percentage": 62.88, "elapsed_time": "1:23:47", "remaining_time": "0:49:27", "throughput": 30486.39, "total_tokens": 153265552}
207
+ {"current_steps": 1035, "total_steps": 1638, "loss": 0.1121, "lr": 1.4980730051110541e-05, "epoch": 0.6318681318681318, "percentage": 63.19, "elapsed_time": "1:24:11", "remaining_time": "0:49:02", "throughput": 30491.13, "total_tokens": 154019536}
208
+ {"current_steps": 1040, "total_steps": 1638, "loss": 0.115, "lr": 1.4761546741299495e-05, "epoch": 0.6349206349206349, "percentage": 63.49, "elapsed_time": "1:24:35", "remaining_time": "0:48:38", "throughput": 30493.02, "total_tokens": 154765472}
209
+ {"current_steps": 1045, "total_steps": 1638, "loss": 0.1181, "lr": 1.4543304980610878e-05, "epoch": 0.6379731379731379, "percentage": 63.8, "elapsed_time": "1:24:59", "remaining_time": "0:48:13", "throughput": 30494.44, "total_tokens": 155509408}
210
+ {"current_steps": 1050, "total_steps": 1638, "loss": 0.1148, "lr": 1.4326024839003804e-05, "epoch": 0.6410256410256411, "percentage": 64.1, "elapsed_time": "1:25:23", "remaining_time": "0:47:49", "throughput": 30496.92, "total_tokens": 156264480}
211
+ {"current_steps": 1055, "total_steps": 1638, "loss": 0.1134, "lr": 1.4109726298004911e-05, "epoch": 0.6440781440781441, "percentage": 64.41, "elapsed_time": "1:25:48", "remaining_time": "0:47:24", "throughput": 30502.44, "total_tokens": 157027088}
212
+ {"current_steps": 1060, "total_steps": 1638, "loss": 0.1093, "lr": 1.3894429248870866e-05, "epoch": 0.6471306471306472, "percentage": 64.71, "elapsed_time": "1:26:11", "remaining_time": "0:46:59", "throughput": 30505.37, "total_tokens": 157758048}
213
+ {"current_steps": 1065, "total_steps": 1638, "loss": 0.1138, "lr": 1.3680153490759073e-05, "epoch": 0.6501831501831502, "percentage": 65.02, "elapsed_time": "1:26:35", "remaining_time": "0:46:35", "throughput": 30510.61, "total_tokens": 158510720}
214
+ {"current_steps": 1070, "total_steps": 1638, "loss": 0.1149, "lr": 1.3466918728906919e-05, "epoch": 0.6532356532356532, "percentage": 65.32, "elapsed_time": "1:26:58", "remaining_time": "0:46:10", "throughput": 30522.54, "total_tokens": 159293376}
215
+ {"current_steps": 1075, "total_steps": 1638, "loss": 0.1116, "lr": 1.3254744572819658e-05, "epoch": 0.6562881562881563, "percentage": 65.63, "elapsed_time": "1:27:22", "remaining_time": "0:45:45", "throughput": 30528.01, "total_tokens": 160033920}
216
+ {"current_steps": 1080, "total_steps": 1638, "loss": 0.1126, "lr": 1.3043650534467053e-05, "epoch": 0.6593406593406593, "percentage": 65.93, "elapsed_time": "1:27:45", "remaining_time": "0:45:20", "throughput": 30532.12, "total_tokens": 160775472}
217
+ {"current_steps": 1085, "total_steps": 1638, "loss": 0.107, "lr": 1.2833656026489028e-05, "epoch": 0.6623931623931624, "percentage": 66.24, "elapsed_time": "1:28:08", "remaining_time": "0:44:55", "throughput": 30536.35, "total_tokens": 161494640}
218
+ {"current_steps": 1090, "total_steps": 1638, "loss": 0.1218, "lr": 1.2624780360410466e-05, "epoch": 0.6654456654456654, "percentage": 66.54, "elapsed_time": "1:28:33", "remaining_time": "0:44:31", "throughput": 30533.44, "total_tokens": 162241408}
219
+ {"current_steps": 1095, "total_steps": 1638, "loss": 0.1105, "lr": 1.2417042744865237e-05, "epoch": 0.6684981684981685, "percentage": 66.85, "elapsed_time": "1:28:57", "remaining_time": "0:44:06", "throughput": 30535.9, "total_tokens": 162990224}
220
+ {"current_steps": 1100, "total_steps": 1638, "loss": 0.1102, "lr": 1.2210462283829755e-05, "epoch": 0.6715506715506715, "percentage": 67.16, "elapsed_time": "1:29:21", "remaining_time": "0:43:42", "throughput": 30540.52, "total_tokens": 163757808}
221
+ {"current_steps": 1105, "total_steps": 1638, "loss": 0.1159, "lr": 1.2005057974866135e-05, "epoch": 0.6746031746031746, "percentage": 67.46, "elapsed_time": "1:29:45", "remaining_time": "0:43:17", "throughput": 30546.29, "total_tokens": 164501232}
222
+ {"current_steps": 1110, "total_steps": 1638, "loss": 0.1123, "lr": 1.180084870737516e-05, "epoch": 0.6776556776556777, "percentage": 67.77, "elapsed_time": "1:30:08", "remaining_time": "0:42:52", "throughput": 30549.85, "total_tokens": 165238192}
223
+ {"current_steps": 1115, "total_steps": 1638, "loss": 0.1091, "lr": 1.1597853260859128e-05, "epoch": 0.6807081807081807, "percentage": 68.07, "elapsed_time": "1:30:32", "remaining_time": "0:42:28", "throughput": 30553.46, "total_tokens": 165972784}
224
+ {"current_steps": 1120, "total_steps": 1638, "loss": 0.1179, "lr": 1.1396090303194893e-05, "epoch": 0.6837606837606838, "percentage": 68.38, "elapsed_time": "1:30:56", "remaining_time": "0:42:03", "throughput": 30552.35, "total_tokens": 166698432}
225
+ {"current_steps": 1125, "total_steps": 1638, "loss": 0.11, "lr": 1.1195578388917092e-05, "epoch": 0.6868131868131868, "percentage": 68.68, "elapsed_time": "1:31:20", "remaining_time": "0:41:38", "throughput": 30555.59, "total_tokens": 167445392}
226
+ {"current_steps": 1130, "total_steps": 1638, "loss": 0.1112, "lr": 1.0996335957511867e-05, "epoch": 0.6898656898656899, "percentage": 68.99, "elapsed_time": "1:31:43", "remaining_time": "0:41:14", "throughput": 30560.09, "total_tokens": 168191200}
227
+ {"current_steps": 1135, "total_steps": 1638, "loss": 0.1039, "lr": 1.0798381331721109e-05, "epoch": 0.6929181929181929, "percentage": 69.29, "elapsed_time": "1:32:07", "remaining_time": "0:40:49", "throughput": 30563.39, "total_tokens": 168933088}
228
+ {"current_steps": 1140, "total_steps": 1638, "loss": 0.1118, "lr": 1.060173271585747e-05, "epoch": 0.6959706959706959, "percentage": 69.6, "elapsed_time": "1:32:31", "remaining_time": "0:40:25", "throughput": 30566.29, "total_tokens": 169691472}
229
+ {"current_steps": 1145, "total_steps": 1638, "loss": 0.1175, "lr": 1.0406408194130259e-05, "epoch": 0.699023199023199, "percentage": 69.9, "elapsed_time": "1:32:55", "remaining_time": "0:40:00", "throughput": 30571.15, "total_tokens": 170450960}
230
+ {"current_steps": 1150, "total_steps": 1638, "loss": 0.1083, "lr": 1.021242572898237e-05, "epoch": 0.702075702075702, "percentage": 70.21, "elapsed_time": "1:33:19", "remaining_time": "0:39:36", "throughput": 30575.3, "total_tokens": 171199728}
231
+ {"current_steps": 1155, "total_steps": 1638, "loss": 0.1088, "lr": 1.0019803159438423e-05, "epoch": 0.7051282051282052, "percentage": 70.51, "elapsed_time": "1:33:42", "remaining_time": "0:39:11", "throughput": 30577.98, "total_tokens": 171929312}
232
+ {"current_steps": 1160, "total_steps": 1638, "loss": 0.1121, "lr": 9.82855819946428e-06, "epoch": 0.7081807081807082, "percentage": 70.82, "elapsed_time": "1:34:05", "remaining_time": "0:38:46", "throughput": 30581.74, "total_tokens": 172663152}
233
+ {"current_steps": 1165, "total_steps": 1638, "loss": 0.1147, "lr": 9.638708436337976e-06, "epoch": 0.7112332112332113, "percentage": 71.12, "elapsed_time": "1:34:30", "remaining_time": "0:38:22", "throughput": 30579.84, "total_tokens": 173407584}
234
+ {"current_steps": 1170, "total_steps": 1638, "loss": 0.1105, "lr": 9.450271329032404e-06, "epoch": 0.7142857142857143, "percentage": 71.43, "elapsed_time": "1:34:54", "remaining_time": "0:37:57", "throughput": 30582.84, "total_tokens": 174168016}
235
+ {"current_steps": 1175, "total_steps": 1638, "loss": 0.1064, "lr": 9.263264206609726e-06, "epoch": 0.7173382173382173, "percentage": 71.73, "elapsed_time": "1:35:18", "remaining_time": "0:37:33", "throughput": 30587.55, "total_tokens": 174926656}
236
+ {"current_steps": 1180, "total_steps": 1638, "loss": 0.0989, "lr": 9.077704266627776e-06, "epoch": 0.7203907203907204, "percentage": 72.04, "elapsed_time": "1:35:41", "remaining_time": "0:37:08", "throughput": 30590.89, "total_tokens": 175650240}
237
+ {"current_steps": 1005, "total_steps": 1638, "loss": 0.1194, "lr": 1.6314467848109483e-05, "epoch": 0.6135531135531136, "percentage": 61.36, "elapsed_time": "0:00:25", "remaining_time": "0:00:16", "throughput": 5850455.51, "total_tokens": 149555088}
238
+ {"current_steps": 1010, "total_steps": 1638, "loss": 0.1099, "lr": 1.609006143137895e-05, "epoch": 0.6166056166056166, "percentage": 61.66, "elapsed_time": "0:00:49", "remaining_time": "0:00:30", "throughput": 3025309.07, "total_tokens": 150280064}
239
+ {"current_steps": 1015, "total_steps": 1638, "loss": 0.1098, "lr": 1.5866474390840125e-05, "epoch": 0.6196581196581197, "percentage": 61.97, "elapsed_time": "0:01:13", "remaining_time": "0:00:44", "throughput": 2066643.65, "total_tokens": 151000304}
240
+ {"current_steps": 1020, "total_steps": 1638, "loss": 0.1085, "lr": 1.564372728801501e-05, "epoch": 0.6227106227106227, "percentage": 62.27, "elapsed_time": "0:01:37", "remaining_time": "0:00:58", "throughput": 1563845.36, "total_tokens": 151754352}
241
+ {"current_steps": 1025, "total_steps": 1638, "loss": 0.1078, "lr": 1.5421840607183203e-05, "epoch": 0.6257631257631258, "percentage": 62.58, "elapsed_time": "0:02:01", "remaining_time": "0:01:12", "throughput": 1255299.51, "total_tokens": 152529360}
242
+ {"current_steps": 1030, "total_steps": 1638, "loss": 0.112, "lr": 1.5200834753498128e-05, "epoch": 0.6288156288156288, "percentage": 62.88, "elapsed_time": "0:02:24", "remaining_time": "0:01:25", "throughput": 1057327.39, "total_tokens": 153265552}
243
+ {"current_steps": 1035, "total_steps": 1638, "loss": 0.1121, "lr": 1.4980730051110541e-05, "epoch": 0.6318681318681318, "percentage": 63.19, "elapsed_time": "0:02:49", "remaining_time": "0:01:38", "throughput": 911269.23, "total_tokens": 154019536}
244
+ {"current_steps": 1040, "total_steps": 1638, "loss": 0.1149, "lr": 1.4761546741299495e-05, "epoch": 0.6349206349206349, "percentage": 63.49, "elapsed_time": "0:03:13", "remaining_time": "0:01:51", "throughput": 800970.39, "total_tokens": 154765472}
245
+ {"current_steps": 1045, "total_steps": 1638, "loss": 0.1181, "lr": 1.4543304980610878e-05, "epoch": 0.6379731379731379, "percentage": 63.8, "elapsed_time": "0:03:37", "remaining_time": "0:02:03", "throughput": 715154.54, "total_tokens": 155509408}
246
+ {"current_steps": 1050, "total_steps": 1638, "loss": 0.1149, "lr": 1.4326024839003804e-05, "epoch": 0.6410256410256411, "percentage": 64.1, "elapsed_time": "0:04:01", "remaining_time": "0:02:15", "throughput": 646080.07, "total_tokens": 156264480}
247
+ {"current_steps": 1055, "total_steps": 1638, "loss": 0.1134, "lr": 1.4109726298004911e-05, "epoch": 0.6440781440781441, "percentage": 64.41, "elapsed_time": "0:04:25", "remaining_time": "0:02:26", "throughput": 590330.87, "total_tokens": 157027088}
248
+ {"current_steps": 1060, "total_steps": 1638, "loss": 0.1094, "lr": 1.3894429248870866e-05, "epoch": 0.6471306471306472, "percentage": 64.71, "elapsed_time": "0:04:49", "remaining_time": "0:02:37", "throughput": 544941.31, "total_tokens": 157758048}
249
+ {"current_steps": 1065, "total_steps": 1638, "loss": 0.1138, "lr": 1.3680153490759073e-05, "epoch": 0.6501831501831502, "percentage": 65.02, "elapsed_time": "0:05:13", "remaining_time": "0:02:48", "throughput": 505892.17, "total_tokens": 158510720}
250
+ {"current_steps": 1070, "total_steps": 1638, "loss": 0.1149, "lr": 1.3466918728906919e-05, "epoch": 0.6532356532356532, "percentage": 65.32, "elapsed_time": "0:05:37", "remaining_time": "0:02:58", "throughput": 472663.61, "total_tokens": 159293376}
251
+ {"current_steps": 1075, "total_steps": 1638, "loss": 0.1116, "lr": 1.3254744572819658e-05, "epoch": 0.6562881562881563, "percentage": 65.63, "elapsed_time": "0:06:00", "remaining_time": "0:03:08", "throughput": 444082.15, "total_tokens": 160033920}
252
+ {"current_steps": 1080, "total_steps": 1638, "loss": 0.1126, "lr": 1.3043650534467053e-05, "epoch": 0.6593406593406593, "percentage": 65.93, "elapsed_time": "0:06:23", "remaining_time": "0:03:18", "throughput": 418887.25, "total_tokens": 160775472}
253
+ {"current_steps": 1085, "total_steps": 1638, "loss": 0.107, "lr": 1.2833656026489028e-05, "epoch": 0.6623931623931624, "percentage": 66.24, "elapsed_time": "0:06:46", "remaining_time": "0:03:27", "throughput": 397121.56, "total_tokens": 161494640}
254
+ {"current_steps": 1090, "total_steps": 1638, "loss": 0.1218, "lr": 1.2624780360410466e-05, "epoch": 0.6654456654456654, "percentage": 66.54, "elapsed_time": "0:07:11", "remaining_time": "0:03:36", "throughput": 376220.96, "total_tokens": 162241408}
255
+ {"current_steps": 1095, "total_steps": 1638, "loss": 0.1105, "lr": 1.2417042744865237e-05, "epoch": 0.6684981684981685, "percentage": 66.85, "elapsed_time": "0:07:35", "remaining_time": "0:03:45", "throughput": 357937.03, "total_tokens": 162990224}
256
+ {"current_steps": 1100, "total_steps": 1638, "loss": 0.1102, "lr": 1.2210462283829755e-05, "epoch": 0.6715506715506715, "percentage": 67.16, "elapsed_time": "0:07:59", "remaining_time": "0:03:54", "throughput": 341352.48, "total_tokens": 163757808}
257
+ {"current_steps": 1105, "total_steps": 1638, "loss": 0.1159, "lr": 1.2005057974866135e-05, "epoch": 0.6746031746031746, "percentage": 67.46, "elapsed_time": "0:08:23", "remaining_time": "0:04:02", "throughput": 326998.79, "total_tokens": 164501232}
258
+ {"current_steps": 1110, "total_steps": 1638, "loss": 0.1122, "lr": 1.180084870737516e-05, "epoch": 0.6776556776556777, "percentage": 67.77, "elapsed_time": "0:08:46", "remaining_time": "0:04:10", "throughput": 313777.06, "total_tokens": 165238192}
259
+ {"current_steps": 1115, "total_steps": 1638, "loss": 0.109, "lr": 1.1597853260859128e-05, "epoch": 0.6807081807081807, "percentage": 68.07, "elapsed_time": "0:09:10", "remaining_time": "0:04:18", "throughput": 301747.03, "total_tokens": 165972784}
260
+ {"current_steps": 1120, "total_steps": 1638, "loss": 0.1179, "lr": 1.1396090303194893e-05, "epoch": 0.6837606837606838, "percentage": 68.38, "elapsed_time": "0:09:34", "remaining_time": "0:04:25", "throughput": 290400.95, "total_tokens": 166698432}
261
+ {"current_steps": 1125, "total_steps": 1638, "loss": 0.1099, "lr": 1.1195578388917092e-05, "epoch": 0.6868131868131868, "percentage": 68.68, "elapsed_time": "0:09:57", "remaining_time": "0:04:32", "throughput": 280053.58, "total_tokens": 167445392}
262
+ {"current_steps": 1130, "total_steps": 1638, "loss": 0.1113, "lr": 1.0996335957511867e-05, "epoch": 0.6898656898656899, "percentage": 68.99, "elapsed_time": "0:10:21", "remaining_time": "0:04:39", "throughput": 270604.08, "total_tokens": 168191200}
263
+ {"current_steps": 1135, "total_steps": 1638, "loss": 0.1039, "lr": 1.0798381331721109e-05, "epoch": 0.6929181929181929, "percentage": 69.29, "elapsed_time": "0:10:45", "remaining_time": "0:04:45", "throughput": 261813.54, "total_tokens": 168933088}
264
+ {"current_steps": 1140, "total_steps": 1638, "loss": 0.1118, "lr": 1.060173271585747e-05, "epoch": 0.6959706959706959, "percentage": 69.6, "elapsed_time": "0:11:09", "remaining_time": "0:04:52", "throughput": 253440.3, "total_tokens": 169691472}
265
+ {"current_steps": 1145, "total_steps": 1638, "loss": 0.1174, "lr": 1.0406408194130259e-05, "epoch": 0.699023199023199, "percentage": 69.9, "elapsed_time": "0:11:33", "remaining_time": "0:04:58", "throughput": 245759.13, "total_tokens": 170450960}
266
+ {"current_steps": 1150, "total_steps": 1638, "loss": 0.1082, "lr": 1.021242572898237e-05, "epoch": 0.702075702075702, "percentage": 70.21, "elapsed_time": "0:11:57", "remaining_time": "0:05:04", "throughput": 238661.81, "total_tokens": 171199728}
267
+ {"current_steps": 1155, "total_steps": 1638, "loss": 0.1089, "lr": 1.0019803159438423e-05, "epoch": 0.7051282051282052, "percentage": 70.51, "elapsed_time": "0:12:20", "remaining_time": "0:05:09", "throughput": 232095.2, "total_tokens": 171929312}
268
+ {"current_steps": 1160, "total_steps": 1638, "loss": 0.1121, "lr": 9.82855819946428e-06, "epoch": 0.7081807081807082, "percentage": 70.82, "elapsed_time": "0:12:44", "remaining_time": "0:05:14", "throughput": 225955.86, "total_tokens": 172663152}
269
+ {"current_steps": 1165, "total_steps": 1638, "loss": 0.1147, "lr": 9.638708436337976e-06, "epoch": 0.7112332112332113, "percentage": 71.12, "elapsed_time": "0:13:08", "remaining_time": "0:05:20", "throughput": 219823.85, "total_tokens": 173407584}
270
+ {"current_steps": 1170, "total_steps": 1638, "loss": 0.1105, "lr": 9.450271329032404e-06, "epoch": 0.7142857142857143, "percentage": 71.43, "elapsed_time": "0:13:33", "remaining_time": "0:05:25", "throughput": 214181.72, "total_tokens": 174168016}
271
+ {"current_steps": 1175, "total_steps": 1638, "loss": 0.1063, "lr": 9.263264206609726e-06, "epoch": 0.7173382173382173, "percentage": 71.73, "elapsed_time": "0:13:57", "remaining_time": "0:05:29", "throughput": 208969.31, "total_tokens": 174926656}
272
+ {"current_steps": 1180, "total_steps": 1638, "loss": 0.099, "lr": 9.077704266627776e-06, "epoch": 0.7203907203907204, "percentage": 72.04, "elapsed_time": "0:14:20", "remaining_time": "0:05:33", "throughput": 204212.22, "total_tokens": 175650240}
273
+ {"current_steps": 1185, "total_steps": 1638, "loss": 0.1061, "lr": 8.893608573558515e-06, "epoch": 0.7234432234432234, "percentage": 72.34, "elapsed_time": "0:14:44", "remaining_time": "0:05:38", "throughput": 199474.77, "total_tokens": 176397552}
274
+ {"current_steps": 1190, "total_steps": 1638, "loss": 0.1078, "lr": 8.710994057218782e-06, "epoch": 0.7264957264957265, "percentage": 72.65, "elapsed_time": "0:15:08", "remaining_time": "0:05:42", "throughput": 194981.69, "total_tokens": 177140960}
275
+ {"current_steps": 1195, "total_steps": 1638, "loss": 0.1165, "lr": 8.529877511213357e-06, "epoch": 0.7295482295482295, "percentage": 72.95, "elapsed_time": "0:15:32", "remaining_time": "0:05:45", "throughput": 190751.66, "total_tokens": 177893904}
276
+ {"current_steps": 1200, "total_steps": 1638, "loss": 0.1072, "lr": 8.3502755913906e-06, "epoch": 0.7326007326007326, "percentage": 73.26, "elapsed_time": "0:15:56", "remaining_time": "0:05:49", "throughput": 186788.44, "total_tokens": 178643056}
277
+ {"current_steps": 1205, "total_steps": 1638, "loss": 0.1039, "lr": 8.172204814310742e-06, "epoch": 0.7356532356532357, "percentage": 73.57, "elapsed_time": "0:16:20", "remaining_time": "0:05:52", "throughput": 183059.4, "total_tokens": 179400992}
278
+ {"current_steps": 1210, "total_steps": 1638, "loss": 0.1076, "lr": 7.99568155572701e-06, "epoch": 0.7387057387057387, "percentage": 73.87, "elapsed_time": "0:16:43", "remaining_time": "0:05:55", "throughput": 179473.65, "total_tokens": 180147952}
279
+ {"current_steps": 1215, "total_steps": 1638, "loss": 0.1069, "lr": 7.820722049079653e-06, "epoch": 0.7417582417582418, "percentage": 74.18, "elapsed_time": "0:17:07", "remaining_time": "0:05:57", "throughput": 176120.55, "total_tokens": 180883088}
280
+ {"current_steps": 1220, "total_steps": 1638, "loss": 0.1099, "lr": 7.647342384003087e-06, "epoch": 0.7448107448107448, "percentage": 74.48, "elapsed_time": "0:17:30", "remaining_time": "0:06:00", "throughput": 172843.43, "total_tokens": 181612288}
281
+ {"current_steps": 1225, "total_steps": 1638, "loss": 0.1106, "lr": 7.475558504846264e-06, "epoch": 0.7478632478632479, "percentage": 74.79, "elapsed_time": "0:17:54", "remaining_time": "0:06:02", "throughput": 169763.02, "total_tokens": 182345168}
282
+ {"current_steps": 1230, "total_steps": 1638, "loss": 0.1044, "lr": 7.305386209206397e-06, "epoch": 0.7509157509157509, "percentage": 75.09, "elapsed_time": "0:18:17", "remaining_time": "0:06:04", "throughput": 166843.02, "total_tokens": 183100912}
283
+ {"current_steps": 1235, "total_steps": 1638, "loss": 0.1104, "lr": 7.136841146476181e-06, "epoch": 0.753968253968254, "percentage": 75.4, "elapsed_time": "0:18:40", "remaining_time": "0:06:05", "throughput": 164021.24, "total_tokens": 183851904}
284
+ {"current_steps": 1240, "total_steps": 1638, "loss": 0.105, "lr": 6.969938816404639e-06, "epoch": 0.757020757020757, "percentage": 75.7, "elapsed_time": "0:19:04", "remaining_time": "0:06:07", "throughput": 161271.83, "total_tokens": 184595216}
285
+ {"current_steps": 1245, "total_steps": 1638, "loss": 0.1074, "lr": 6.8046945676717375e-06, "epoch": 0.76007326007326, "percentage": 76.01, "elapsed_time": "0:19:28", "remaining_time": "0:06:08", "throughput": 158625.05, "total_tokens": 185327184}
286
+ {"current_steps": 1250, "total_steps": 1638, "loss": 0.1041, "lr": 6.641123596476889e-06, "epoch": 0.7631257631257631, "percentage": 76.31, "elapsed_time": "0:19:52", "remaining_time": "0:06:10", "throughput": 156092.13, "total_tokens": 186065952}
287
+ {"current_steps": 1255, "total_steps": 1638, "loss": 0.102, "lr": 6.4792409451414735e-06, "epoch": 0.7661782661782662, "percentage": 76.62, "elapsed_time": "0:20:15", "remaining_time": "0:06:10", "throughput": 153682.96, "total_tokens": 186800528}
288
+ {"current_steps": 1260, "total_steps": 1638, "loss": 0.1099, "lr": 6.319061500725515e-06, "epoch": 0.7692307692307693, "percentage": 76.92, "elapsed_time": "0:20:39", "remaining_time": "0:06:11", "throughput": 151364.14, "total_tokens": 187557520}
289
+ {"current_steps": 1265, "total_steps": 1638, "loss": 0.1137, "lr": 6.1605999936586725e-06, "epoch": 0.7722832722832723, "percentage": 77.23, "elapsed_time": "0:21:03", "remaining_time": "0:06:12", "throughput": 149097.17, "total_tokens": 188316848}
290
+ {"current_steps": 1270, "total_steps": 1638, "loss": 0.1025, "lr": 6.003870996385533e-06, "epoch": 0.7753357753357754, "percentage": 77.53, "elapsed_time": "0:21:26", "remaining_time": "0:06:12", "throughput": 146935.86, "total_tokens": 189066896}
291
+ {"current_steps": 1275, "total_steps": 1638, "loss": 0.1051, "lr": 5.848888922025553e-06, "epoch": 0.7783882783882784, "percentage": 77.84, "elapsed_time": "0:21:50", "remaining_time": "0:06:13", "throughput": 144823.35, "total_tokens": 189803936}
292
+ {"current_steps": 1280, "total_steps": 1638, "loss": 0.1109, "lr": 5.695668023047579e-06, "epoch": 0.7814407814407814, "percentage": 78.14, "elapsed_time": "0:22:13", "remaining_time": "0:06:12", "throughput": 142893.68, "total_tokens": 190535536}
293
+ {"current_steps": 1285, "total_steps": 1638, "loss": 0.1128, "lr": 5.544222389959164e-06, "epoch": 0.7844932844932845, "percentage": 78.45, "elapsed_time": "0:22:37", "remaining_time": "0:06:12", "throughput": 140929.13, "total_tokens": 191260240}
294
+ {"current_steps": 1290, "total_steps": 1638, "loss": 0.1095, "lr": 5.394565950010769e-06, "epoch": 0.7875457875457875, "percentage": 78.75, "elapsed_time": "0:23:00", "remaining_time": "0:06:12", "throughput": 139039.17, "total_tokens": 192011984}
295
+ {"current_steps": 1295, "total_steps": 1638, "loss": 0.1073, "lr": 5.246712465915011e-06, "epoch": 0.7905982905982906, "percentage": 79.06, "elapsed_time": "0:23:25", "remaining_time": "0:06:12", "throughput": 137178.03, "total_tokens": 192759088}
296
+ {"current_steps": 1300, "total_steps": 1638, "loss": 0.105, "lr": 5.100675534580973e-06, "epoch": 0.7936507936507936, "percentage": 79.37, "elapsed_time": "0:23:48", "remaining_time": "0:06:11", "throughput": 135424.75, "total_tokens": 193513760}
297
+ {"current_steps": 1305, "total_steps": 1638, "loss": 0.109, "lr": 4.956468585863835e-06, "epoch": 0.7967032967032966, "percentage": 79.67, "elapsed_time": "0:24:13", "remaining_time": "0:06:10", "throughput": 133665.03, "total_tokens": 194263568}
298
+ {"current_steps": 1310, "total_steps": 1638, "loss": 0.1117, "lr": 4.814104881329828e-06, "epoch": 0.7997557997557998, "percentage": 79.98, "elapsed_time": "0:24:37", "remaining_time": "0:06:10", "throughput": 131946.19, "total_tokens": 195013952}
299
+ {"current_steps": 1315, "total_steps": 1638, "loss": 0.1062, "lr": 4.673597513036684e-06, "epoch": 0.8028083028083028, "percentage": 80.28, "elapsed_time": "0:25:01", "remaining_time": "0:06:08", "throughput": 130370.2, "total_tokens": 195750912}
300
+ {"current_steps": 1320, "total_steps": 1638, "loss": 0.1022, "lr": 4.5349594023296446e-06, "epoch": 0.8058608058608059, "percentage": 80.59, "elapsed_time": "0:25:25", "remaining_time": "0:06:07", "throughput": 128847.44, "total_tokens": 196505680}
301
+ {"current_steps": 1325, "total_steps": 1638, "loss": 0.1007, "lr": 4.398203298653195e-06, "epoch": 0.8089133089133089, "percentage": 80.89, "elapsed_time": "0:25:49", "remaining_time": "0:06:06", "throughput": 127278.71, "total_tokens": 197249152}
302
+ {"current_steps": 1330, "total_steps": 1638, "loss": 0.1029, "lr": 4.263341778378608e-06, "epoch": 0.811965811965812, "percentage": 81.2, "elapsed_time": "0:26:13", "remaining_time": "0:06:04", "throughput": 125855.93, "total_tokens": 197988416}
303
+ {"current_steps": 1335, "total_steps": 1638, "loss": 0.1118, "lr": 4.130387243647377e-06, "epoch": 0.815018315018315, "percentage": 81.5, "elapsed_time": "0:26:36", "remaining_time": "0:06:02", "throughput": 124438.2, "total_tokens": 198719120}
304
+ {"current_steps": 1340, "total_steps": 1638, "loss": 0.1084, "lr": 3.9993519212307154e-06, "epoch": 0.818070818070818, "percentage": 81.81, "elapsed_time": "0:27:00", "remaining_time": "0:06:00", "throughput": 123107.1, "total_tokens": 199458576}
305
+ {"current_steps": 1345, "total_steps": 1638, "loss": 0.1037, "lr": 3.8702478614051355e-06, "epoch": 0.8211233211233211, "percentage": 82.11, "elapsed_time": "0:27:22", "remaining_time": "0:05:57", "throughput": 121845.02, "total_tokens": 200186640}
306
+ {"current_steps": 1350, "total_steps": 1638, "loss": 0.1101, "lr": 3.7430869368442837e-06, "epoch": 0.8241758241758241, "percentage": 82.42, "elapsed_time": "0:27:46", "remaining_time": "0:05:55", "throughput": 120552.88, "total_tokens": 200921440}
307
+ {"current_steps": 1355, "total_steps": 1638, "loss": 0.1049, "lr": 3.6178808415271158e-06, "epoch": 0.8272283272283272, "percentage": 82.72, "elapsed_time": "0:28:10", "remaining_time": "0:05:53", "throughput": 119266.11, "total_tokens": 201666624}
308
+ {"current_steps": 1360, "total_steps": 1638, "loss": 0.1086, "lr": 3.4946410896624817e-06, "epoch": 0.8302808302808303, "percentage": 83.03, "elapsed_time": "0:28:34", "remaining_time": "0:05:50", "throughput": 118043.11, "total_tokens": 202417152}
309
+ {"current_steps": 1365, "total_steps": 1638, "loss": 0.1116, "lr": 3.373379014630279e-06, "epoch": 0.8333333333333334, "percentage": 83.33, "elapsed_time": "0:28:58", "remaining_time": "0:05:47", "throughput": 116868.01, "total_tokens": 203154720}
310
+ {"current_steps": 1370, "total_steps": 1638, "loss": 0.1108, "lr": 3.254105767939175e-06, "epoch": 0.8363858363858364, "percentage": 83.64, "elapsed_time": "0:29:22", "remaining_time": "0:05:44", "throughput": 115689.09, "total_tokens": 203918256}
311
+ {"current_steps": 1375, "total_steps": 1638, "loss": 0.1136, "lr": 3.136832318201119e-06, "epoch": 0.8394383394383395, "percentage": 83.94, "elapsed_time": "0:29:46", "remaining_time": "0:05:41", "throughput": 114578.56, "total_tokens": 204669728}
312
+ {"current_steps": 1380, "total_steps": 1638, "loss": 0.1179, "lr": 3.0215694501226384e-06, "epoch": 0.8424908424908425, "percentage": 84.25, "elapsed_time": "0:30:09", "remaining_time": "0:05:38", "throughput": 113495.21, "total_tokens": 205404912}
313
+ {"current_steps": 1385, "total_steps": 1638, "loss": 0.107, "lr": 2.9083277635130523e-06, "epoch": 0.8455433455433455, "percentage": 84.55, "elapsed_time": "0:30:33", "remaining_time": "0:05:34", "throughput": 112429.7, "total_tokens": 206131552}
314
+ {"current_steps": 1390, "total_steps": 1638, "loss": 0.1036, "lr": 2.7971176723096986e-06, "epoch": 0.8485958485958486, "percentage": 84.86, "elapsed_time": "0:30:56", "remaining_time": "0:05:31", "throughput": 111410.63, "total_tokens": 206875600}
315
+ {"current_steps": 1395, "total_steps": 1638, "loss": 0.1029, "lr": 2.687949403620235e-06, "epoch": 0.8516483516483516, "percentage": 85.16, "elapsed_time": "0:31:20", "remaining_time": "0:05:27", "throughput": 110414.25, "total_tokens": 207611712}
316
+ {"current_steps": 1400, "total_steps": 1638, "loss": 0.1059, "lr": 2.5808329967821563e-06, "epoch": 0.8547008547008547, "percentage": 85.47, "elapsed_time": "0:31:43", "remaining_time": "0:05:23", "throughput": 109448.89, "total_tokens": 208359376}
317
+ {"current_steps": 1405, "total_steps": 1638, "loss": 0.1075, "lr": 2.475778302439524e-06, "epoch": 0.8577533577533577, "percentage": 85.78, "elapsed_time": "0:32:07", "remaining_time": "0:05:19", "throughput": 108464.82, "total_tokens": 209108560}
318
+ {"current_steps": 1410, "total_steps": 1638, "loss": 0.1051, "lr": 2.3727949816371e-06, "epoch": 0.8608058608058609, "percentage": 86.08, "elapsed_time": "0:32:31", "remaining_time": "0:05:15", "throughput": 107517.7, "total_tokens": 209857904}
319
+ {"current_steps": 1415, "total_steps": 1638, "loss": 0.1134, "lr": 2.271892504931905e-06, "epoch": 0.8638583638583639, "percentage": 86.39, "elapsed_time": "0:32:55", "remaining_time": "0:05:11", "throughput": 106594.45, "total_tokens": 210611184}
320
+ {"current_steps": 1420, "total_steps": 1638, "loss": 0.1103, "lr": 2.173080151522272e-06, "epoch": 0.8669108669108669, "percentage": 86.69, "elapsed_time": "0:33:20", "remaining_time": "0:05:07", "throughput": 105672.82, "total_tokens": 211378688}
321
+ {"current_steps": 1425, "total_steps": 1638, "loss": 0.1046, "lr": 2.0763670083945114e-06, "epoch": 0.86996336996337, "percentage": 87.0, "elapsed_time": "0:33:43", "remaining_time": "0:05:02", "throughput": 104809.59, "total_tokens": 212108720}
322
+ {"current_steps": 1430, "total_steps": 1638, "loss": 0.0968, "lr": 1.9817619694872614e-06, "epoch": 0.873015873015873, "percentage": 87.3, "elapsed_time": "0:34:06", "remaining_time": "0:04:57", "throughput": 103978.23, "total_tokens": 212823344}
323
+ {"current_steps": 1435, "total_steps": 1638, "loss": 0.1049, "lr": 1.8892737348735812e-06, "epoch": 0.8760683760683761, "percentage": 87.61, "elapsed_time": "0:34:30", "remaining_time": "0:04:52", "throughput": 103150.08, "total_tokens": 213542016}
324
+ {"current_steps": 1440, "total_steps": 1638, "loss": 0.1052, "lr": 1.7989108099608742e-06, "epoch": 0.8791208791208791, "percentage": 87.91, "elapsed_time": "0:34:54", "remaining_time": "0:04:47", "throughput": 102323.68, "total_tokens": 214312144}
325
+ {"current_steps": 1445, "total_steps": 1638, "loss": 0.1088, "lr": 1.710681504708711e-06, "epoch": 0.8821733821733821, "percentage": 88.22, "elapsed_time": "0:35:18", "remaining_time": "0:04:42", "throughput": 101524.72, "total_tokens": 215050304}
326
+ {"current_steps": 1450, "total_steps": 1638, "loss": 0.1145, "lr": 1.624593932864632e-06, "epoch": 0.8852258852258852, "percentage": 88.52, "elapsed_time": "0:35:41", "remaining_time": "0:04:37", "throughput": 100752.82, "total_tokens": 215771968}
327
+ {"current_steps": 1455, "total_steps": 1638, "loss": 0.1132, "lr": 1.5406560112179864e-06, "epoch": 0.8882783882783882, "percentage": 88.83, "elapsed_time": "0:36:05", "remaining_time": "0:04:32", "throughput": 99992.57, "total_tokens": 216536640}
328
+ {"current_steps": 1460, "total_steps": 1638, "loss": 0.1166, "lr": 1.4588754588718862e-06, "epoch": 0.8913308913308914, "percentage": 89.13, "elapsed_time": "0:36:28", "remaining_time": "0:04:26", "throughput": 99264.18, "total_tokens": 217266608}
329
+ {"current_steps": 1465, "total_steps": 1638, "loss": 0.1027, "lr": 1.3792597965333581e-06, "epoch": 0.8943833943833944, "percentage": 89.44, "elapsed_time": "0:36:52", "remaining_time": "0:04:21", "throughput": 98522.93, "total_tokens": 218012736}
330
+ {"current_steps": 1470, "total_steps": 1638, "loss": 0.1046, "lr": 1.3018163458217076e-06, "epoch": 0.8974358974358975, "percentage": 89.74, "elapsed_time": "0:37:16", "remaining_time": "0:04:15", "throughput": 97818.6, "total_tokens": 218754448}
331
+ {"current_steps": 1475, "total_steps": 1638, "loss": 0.1075, "lr": 1.2265522285952013e-06, "epoch": 0.9004884004884005, "percentage": 90.05, "elapsed_time": "0:37:40", "remaining_time": "0:04:09", "throughput": 97117.51, "total_tokens": 219497360}
332
+ {"current_steps": 1480, "total_steps": 1638, "loss": 0.1008, "lr": 1.1534743662961477e-06, "epoch": 0.9035409035409036, "percentage": 90.35, "elapsed_time": "0:38:03", "remaining_time": "0:04:03", "throughput": 96433.1, "total_tokens": 220228720}
333
+ {"current_steps": 1485, "total_steps": 1638, "loss": 0.0993, "lr": 1.0825894793143721e-06, "epoch": 0.9065934065934066, "percentage": 90.66, "elapsed_time": "0:38:27", "remaining_time": "0:03:57", "throughput": 95779.36, "total_tokens": 220991776}
334
+ {"current_steps": 1490, "total_steps": 1638, "loss": 0.1115, "lr": 1.0139040863692023e-06, "epoch": 0.9096459096459096, "percentage": 90.96, "elapsed_time": "0:38:50", "remaining_time": "0:03:51", "throughput": 95140.85, "total_tokens": 221720064}
335
+ {"current_steps": 1495, "total_steps": 1638, "loss": 0.1071, "lr": 9.474245039099882e-07, "epoch": 0.9126984126984127, "percentage": 91.27, "elapsed_time": "0:39:13", "remaining_time": "0:03:45", "throughput": 94525.13, "total_tokens": 222445072}
336
+ {"current_steps": 1500, "total_steps": 1638, "loss": 0.1101, "lr": 8.831568455352352e-07, "epoch": 0.9157509157509157, "percentage": 91.58, "elapsed_time": "0:39:36", "remaining_time": "0:03:38", "throughput": 93905.13, "total_tokens": 223186944}
337
+ {"current_steps": 1505, "total_steps": 1638, "loss": 0.1135, "lr": 8.211070214303812e-07, "epoch": 0.9188034188034188, "percentage": 91.88, "elapsed_time": "0:40:00", "remaining_time": "0:03:32", "throughput": 93277.98, "total_tokens": 223916464}
338
+ {"current_steps": 1510, "total_steps": 1638, "loss": 0.1014, "lr": 7.612807378242798e-07, "epoch": 0.9218559218559218, "percentage": 92.19, "elapsed_time": "0:40:23", "remaining_time": "0:03:25", "throughput": 92681.0, "total_tokens": 224658112}
339
+ {"current_steps": 1515, "total_steps": 1638, "loss": 0.1004, "lr": 7.036834964644523e-07, "epoch": 0.924908424908425, "percentage": 92.49, "elapsed_time": "0:40:48", "remaining_time": "0:03:18", "throughput": 92059.8, "total_tokens": 225410512}
340
+ {"current_steps": 1520, "total_steps": 1638, "loss": 0.1137, "lr": 6.483205941111348e-07, "epoch": 0.927960927960928, "percentage": 92.8, "elapsed_time": "0:41:12", "remaining_time": "0:03:11", "throughput": 91471.19, "total_tokens": 226157008}
341
+ {"current_steps": 1525, "total_steps": 1638, "loss": 0.103, "lr": 5.951971220501645e-07, "epoch": 0.931013431013431, "percentage": 93.1, "elapsed_time": "0:41:36", "remaining_time": "0:03:04", "throughput": 90910.85, "total_tokens": 226914128}
342
+ {"current_steps": 1530, "total_steps": 1638, "loss": 0.115, "lr": 5.44317965624791e-07, "epoch": 0.9340659340659341, "percentage": 93.41, "elapsed_time": "0:41:59", "remaining_time": "0:02:57", "throughput": 90360.87, "total_tokens": 227663280}
343
+ {"current_steps": 1535, "total_steps": 1638, "loss": 0.1122, "lr": 4.956878037864043e-07, "epoch": 0.9371184371184371, "percentage": 93.71, "elapsed_time": "0:42:23", "remaining_time": "0:02:50", "throughput": 89785.38, "total_tokens": 228413792}
344
+ {"current_steps": 1540, "total_steps": 1638, "loss": 0.1088, "lr": 4.4931110866424375e-07, "epoch": 0.9401709401709402, "percentage": 94.02, "elapsed_time": "0:42:47", "remaining_time": "0:02:43", "throughput": 89257.15, "total_tokens": 229167792}
345
+ {"current_steps": 1545, "total_steps": 1638, "loss": 0.1039, "lr": 4.0519214515413463e-07, "epoch": 0.9432234432234432, "percentage": 94.32, "elapsed_time": "0:43:11", "remaining_time": "0:02:36", "throughput": 88710.44, "total_tokens": 229916896}
346
+ {"current_steps": 1550, "total_steps": 1638, "loss": 0.101, "lr": 3.6333497052629115e-07, "epoch": 0.9462759462759462, "percentage": 94.63, "elapsed_time": "0:43:35", "remaining_time": "0:02:28", "throughput": 88197.25, "total_tokens": 230648560}
347
+ {"current_steps": 1555, "total_steps": 1638, "loss": 0.1099, "lr": 3.237434340521789e-07, "epoch": 0.9493284493284493, "percentage": 94.93, "elapsed_time": "0:43:59", "remaining_time": "0:02:20", "throughput": 87668.18, "total_tokens": 231420400}
348
+ {"current_steps": 1560, "total_steps": 1638, "loss": 0.1005, "lr": 2.8642117665055034e-07, "epoch": 0.9523809523809523, "percentage": 95.24, "elapsed_time": "0:44:23", "remaining_time": "0:02:13", "throughput": 87164.83, "total_tokens": 232166064}
349
+ {"current_steps": 1565, "total_steps": 1638, "loss": 0.1143, "lr": 2.5137163055259926e-07, "epoch": 0.9554334554334555, "percentage": 95.54, "elapsed_time": "0:44:47", "remaining_time": "0:02:05", "throughput": 86666.18, "total_tokens": 232916848}
350
+ {"current_steps": 1570, "total_steps": 1638, "loss": 0.1135, "lr": 2.1859801898634347e-07, "epoch": 0.9584859584859585, "percentage": 95.85, "elapsed_time": "0:45:11", "remaining_time": "0:01:57", "throughput": 86171.51, "total_tokens": 233659040}
351
+ {"current_steps": 1575, "total_steps": 1638, "loss": 0.1032, "lr": 1.881033558802009e-07, "epoch": 0.9615384615384616, "percentage": 96.15, "elapsed_time": "0:45:34", "remaining_time": "0:01:49", "throughput": 85697.41, "total_tokens": 234381584}
352
+ {"current_steps": 1580, "total_steps": 1638, "loss": 0.1128, "lr": 1.598904455858169e-07, "epoch": 0.9645909645909646, "percentage": 96.46, "elapsed_time": "0:45:58", "remaining_time": "0:01:41", "throughput": 85235.51, "total_tokens": 235104208}
353
+ {"current_steps": 1585, "total_steps": 1638, "loss": 0.1011, "lr": 1.3396188262018438e-07, "epoch": 0.9676434676434676, "percentage": 96.76, "elapsed_time": "0:46:21", "remaining_time": "0:01:32", "throughput": 84796.29, "total_tokens": 235824192}
354
+ {"current_steps": 1590, "total_steps": 1638, "loss": 0.1099, "lr": 1.1032005142703195e-07, "epoch": 0.9706959706959707, "percentage": 97.07, "elapsed_time": "0:46:45", "remaining_time": "0:01:24", "throughput": 84334.42, "total_tokens": 236586656}
355
+ {"current_steps": 1595, "total_steps": 1638, "loss": 0.107, "lr": 8.896712615756308e-08, "epoch": 0.9737484737484737, "percentage": 97.37, "elapsed_time": "0:47:09", "remaining_time": "0:01:16", "throughput": 83880.66, "total_tokens": 237333776}
356
+ {"current_steps": 1600, "total_steps": 1638, "loss": 0.0996, "lr": 6.990507047049676e-08, "epoch": 0.9768009768009768, "percentage": 97.68, "elapsed_time": "0:47:33", "remaining_time": "0:01:07", "throughput": 83437.89, "total_tokens": 238071712}
357
+ {"current_steps": 1605, "total_steps": 1638, "loss": 0.1036, "lr": 5.313563735149796e-08, "epoch": 0.9798534798534798, "percentage": 97.99, "elapsed_time": "0:47:56", "remaining_time": "0:00:59", "throughput": 83009.57, "total_tokens": 238803952}
358
+ {"current_steps": 1610, "total_steps": 1638, "loss": 0.1043, "lr": 3.8660368951973224e-08, "epoch": 0.9829059829059829, "percentage": 98.29, "elapsed_time": "0:48:20", "remaining_time": "0:00:50", "throughput": 82596.72, "total_tokens": 239534384}
359
+ {"current_steps": 1615, "total_steps": 1638, "loss": 0.1058, "lr": 2.648059644723144e-08, "epoch": 0.985958485958486, "percentage": 98.6, "elapsed_time": "0:48:43", "remaining_time": "0:00:41", "throughput": 82182.94, "total_tokens": 240269104}
360
+ {"current_steps": 1620, "total_steps": 1638, "loss": 0.1051, "lr": 1.6597439914092794e-08, "epoch": 0.989010989010989, "percentage": 98.9, "elapsed_time": "0:49:07", "remaining_time": "0:00:32", "throughput": 81767.24, "total_tokens": 241022096}
361
+ {"current_steps": 1625, "total_steps": 1638, "loss": 0.104, "lr": 9.011808227865625e-09, "epoch": 0.9920634920634921, "percentage": 99.21, "elapsed_time": "0:49:31", "remaining_time": "0:00:23", "throughput": 81362.73, "total_tokens": 241767488}
362
+ {"current_steps": 1630, "total_steps": 1638, "loss": 0.1056, "lr": 3.7243989787633105e-09, "epoch": 0.9951159951159951, "percentage": 99.51, "elapsed_time": "0:49:55", "remaining_time": "0:00:14", "throughput": 80964.39, "total_tokens": 242515568}
363
+ {"current_steps": 1635, "total_steps": 1638, "loss": 0.1037, "lr": 7.356984077722117e-10, "epoch": 0.9981684981684982, "percentage": 99.82, "elapsed_time": "0:50:18", "remaining_time": "0:00:05", "throughput": 80575.97, "total_tokens": 243258848}
364
+ {"current_steps": 1638, "total_steps": 1638, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:52:29", "remaining_time": "0:00:00", "throughput": 77383.54, "total_tokens": 243707408}
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ca2da13c6bebf27e475db718e2cf6ec4f7d34151d61d884c484b32de925d0d4
3
+ size 8017
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff