atharva14 committed on
Commit
8662f1d
·
verified ·
1 Parent(s): 2398215

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-2814/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: Qwen/Qwen2.5-3B-Instruct
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: ace_reason_15k_difficulty
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # ace_reason_15k_difficulty
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) on the ace_reason_15k_difficulty dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-06
39
+ - train_batch_size: 4
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - gradient_accumulation_steps: 8
44
+ - total_train_batch_size: 32
45
+ - optimizer: adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
46
+ - lr_scheduler_type: cosine
47
+ - lr_scheduler_warmup_ratio: 0.04
48
+ - num_epochs: 3
49
+
50
+ ### Training results
51
+
52
+
53
+
54
+ ### Framework versions
55
+
56
+ - Transformers 4.57.0
57
+ - Pytorch 2.8.0+cu128
58
+ - Datasets 4.0.0
59
+ - Tokenizers 0.22.1
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 6.849963317700592e+18,
4
+ "train_loss": 0.09753237076913879,
5
+ "train_runtime": 6368.1203,
6
+ "train_samples_per_second": 7.066,
7
+ "train_steps_per_second": 0.442
8
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-2814/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-2814/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-2814/config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "float32",
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 11008,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention"
49
+ ],
50
+ "max_position_embeddings": 32768,
51
+ "max_window_layers": 70,
52
+ "model_type": "qwen2",
53
+ "num_attention_heads": 16,
54
+ "num_hidden_layers": 36,
55
+ "num_key_value_heads": 2,
56
+ "pad_token_id": 151643,
57
+ "rms_norm_eps": 1e-06,
58
+ "rope_scaling": null,
59
+ "rope_theta": 1000000.0,
60
+ "sliding_window": null,
61
+ "tie_word_embeddings": true,
62
+ "transformers_version": "4.57.0",
63
+ "use_cache": false,
64
+ "use_sliding_window": false,
65
+ "vocab_size": 151936
66
+ }
checkpoint-2814/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.05,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.57.0"
13
+ }
checkpoint-2814/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2814/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccbd581cf67e0b86efdcd7fcce9fbdf7ac578ff78b55db19f825d6b75f5785ee
3
+ size 4982131536
checkpoint-2814/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70386fbf32653ef1cb12d55e0581a64397b15f495e1b1ccb87f90723d9c28877
3
+ size 4932949336
checkpoint-2814/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6040e321cbf96d8d3056d9b29e18353e48d36ee29f1c0d1b0a6431137cef5a3
3
+ size 2428723160
checkpoint-2814/model.safetensors.index.json ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 3085938688,
4
+ "total_size": 12343754752
5
+ },
6
+ "weight_map": {
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
368
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
369
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
370
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
371
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
372
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
373
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
374
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
375
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
376
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
377
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
378
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
379
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
384
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
385
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
386
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
388
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
389
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
390
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
391
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
392
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
393
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
394
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
395
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
396
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
397
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
398
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
399
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
400
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
401
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
402
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
403
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
404
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
405
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
406
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
407
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
408
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
409
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
410
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
411
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
412
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
413
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
414
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
415
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
416
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
417
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
418
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
419
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
420
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
421
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
422
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
423
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
424
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
425
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
426
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
427
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
428
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
429
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
430
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
431
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
432
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
433
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
434
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
435
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
436
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
437
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
438
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
439
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
440
+ "model.norm.weight": "model-00003-of-00003.safetensors"
441
+ }
442
+ }
checkpoint-2814/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2e50e25cf1a29a48f010b18646bb19e0ae171e6f1fecfa3c8dfe9a2b72bfb44
3
+ size 24687895753
checkpoint-2814/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c68a129c0d20354821772a4dc646d088a4dd99d014e90c16ebd67e1598ec82fc
3
+ size 14645
checkpoint-2814/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f566acf8f9f46fbae9854e3a3156261fe76179507a935e49406538a2dcdfa241
3
+ size 1465
checkpoint-2814/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-2814/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-2814/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
checkpoint-2814/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2814/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46979f6b0e5272a84c7b78174fabe91a7fe3a4a5c2a861b63e43add65e9e27ca
3
+ size 6353
checkpoint-2814/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "float32",
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 11008,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention"
49
+ ],
50
+ "max_position_embeddings": 32768,
51
+ "max_window_layers": 70,
52
+ "model_type": "qwen2",
53
+ "num_attention_heads": 16,
54
+ "num_hidden_layers": 36,
55
+ "num_key_value_heads": 2,
56
+ "pad_token_id": 151643,
57
+ "rms_norm_eps": 1e-06,
58
+ "rope_scaling": null,
59
+ "rope_theta": 1000000.0,
60
+ "sliding_window": null,
61
+ "tie_word_embeddings": true,
62
+ "transformers_version": "4.57.0",
63
+ "use_cache": false,
64
+ "use_sliding_window": false,
65
+ "vocab_size": 151936
66
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.05,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.57.0"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccbd581cf67e0b86efdcd7fcce9fbdf7ac578ff78b55db19f825d6b75f5785ee
3
+ size 4982131536
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70386fbf32653ef1cb12d55e0581a64397b15f495e1b1ccb87f90723d9c28877
3
+ size 4932949336
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6040e321cbf96d8d3056d9b29e18353e48d36ee29f1c0d1b0a6431137cef5a3
3
+ size 2428723160
model.safetensors.index.json ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 3085938688,
4
+ "total_size": 12343754752
5
+ },
6
+ "weight_map": {
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
368
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
369
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
370
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
371
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
372
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
373
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
374
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
375
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
376
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
377
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
378
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
379
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
384
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
385
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
386
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
388
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
389
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
390
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
391
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
392
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
393
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
394
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
395
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
396
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
397
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
398
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
399
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
400
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
401
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
402
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
403
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
404
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
405
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
406
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
407
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
408
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
409
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
410
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
411
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
412
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
413
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
414
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
415
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
416
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
417
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
418
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
419
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
420
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
421
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
422
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
423
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
424
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
425
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
426
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
427
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
428
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
429
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
430
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
431
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
432
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
433
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
434
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
435
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
436
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
437
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
438
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
439
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
440
+ "model.norm.weight": "model-00003-of-00003.safetensors"
441
+ }
442
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 6.849963317700592e+18,
4
+ "train_loss": 0.09753237076913879,
5
+ "train_runtime": 6368.1203,
6
+ "train_samples_per_second": 7.066,
7
+ "train_steps_per_second": 0.442
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 2301, "total_steps": 2814, "loss": 0.5189, "lr": 4.336233691998401e-07, "epoch": 2.453333333333333, "percentage": 81.77, "elapsed_time": "0:00:14", "remaining_time": "0:00:03"}
2
+ {"current_steps": 2302, "total_steps": 2814, "loss": 0.5345, "lr": 4.319880733115647e-07, "epoch": 2.4544, "percentage": 81.81, "elapsed_time": "0:00:24", "remaining_time": "0:00:05"}
3
+ {"current_steps": 2303, "total_steps": 2814, "loss": 0.4828, "lr": 4.3035557513890603e-07, "epoch": 2.4554666666666667, "percentage": 81.84, "elapsed_time": "0:00:37", "remaining_time": "0:00:08"}
4
+ {"current_steps": 2304, "total_steps": 2814, "loss": 0.459, "lr": 4.2872587689039486e-07, "epoch": 2.4565333333333332, "percentage": 81.88, "elapsed_time": "0:00:49", "remaining_time": "0:00:10"}
5
+ {"current_steps": 2305, "total_steps": 2814, "loss": 0.5333, "lr": 4.270989807707723e-07, "epoch": 2.4576000000000002, "percentage": 81.91, "elapsed_time": "0:01:00", "remaining_time": "0:00:13"}
6
+ {"current_steps": 2306, "total_steps": 2814, "loss": 0.5494, "lr": 4.25474888980989e-07, "epoch": 2.458666666666667, "percentage": 81.95, "elapsed_time": "0:01:11", "remaining_time": "0:00:15"}
7
+ {"current_steps": 2307, "total_steps": 2814, "loss": 0.6038, "lr": 4.2385360371820254e-07, "epoch": 2.4597333333333333, "percentage": 81.98, "elapsed_time": "0:01:21", "remaining_time": "0:00:17"}
8
+ {"current_steps": 2308, "total_steps": 2814, "loss": 0.5416, "lr": 4.222351271757727e-07, "epoch": 2.4608, "percentage": 82.02, "elapsed_time": "0:01:31", "remaining_time": "0:00:19"}
9
+ {"current_steps": 2309, "total_steps": 2814, "loss": 0.469, "lr": 4.2061946154326043e-07, "epoch": 2.4618666666666664, "percentage": 82.05, "elapsed_time": "0:01:42", "remaining_time": "0:00:22"}
10
+ {"current_steps": 2310, "total_steps": 2814, "loss": 0.5155, "lr": 4.1900660900642265e-07, "epoch": 2.4629333333333334, "percentage": 82.09, "elapsed_time": "0:01:52", "remaining_time": "0:00:24"}
11
+ {"current_steps": 2311, "total_steps": 2814, "loss": 0.4927, "lr": 4.1739657174721225e-07, "epoch": 2.464, "percentage": 82.13, "elapsed_time": "0:02:04", "remaining_time": "0:00:27"}
12
+ {"current_steps": 2312, "total_steps": 2814, "loss": 0.5577, "lr": 4.157893519437717e-07, "epoch": 2.4650666666666665, "percentage": 82.16, "elapsed_time": "0:02:15", "remaining_time": "0:00:29"}
13
+ {"current_steps": 2313, "total_steps": 2814, "loss": 0.4864, "lr": 4.1418495177043383e-07, "epoch": 2.4661333333333335, "percentage": 82.2, "elapsed_time": "0:02:26", "remaining_time": "0:00:31"}
14
+ {"current_steps": 2314, "total_steps": 2814, "loss": 0.5379, "lr": 4.1258337339771443e-07, "epoch": 2.4672, "percentage": 82.23, "elapsed_time": "0:02:38", "remaining_time": "0:00:34"}
15
+ {"current_steps": 2315, "total_steps": 2814, "loss": 0.6081, "lr": 4.1098461899231446e-07, "epoch": 2.4682666666666666, "percentage": 82.27, "elapsed_time": "0:02:50", "remaining_time": "0:00:36"}
16
+ {"current_steps": 2316, "total_steps": 2814, "loss": 0.564, "lr": 4.093886907171121e-07, "epoch": 2.469333333333333, "percentage": 82.3, "elapsed_time": "0:03:01", "remaining_time": "0:00:39"}
17
+ {"current_steps": 2317, "total_steps": 2814, "loss": 0.5566, "lr": 4.0779559073116453e-07, "epoch": 2.4704, "percentage": 82.34, "elapsed_time": "0:03:13", "remaining_time": "0:00:41"}
18
+ {"current_steps": 2318, "total_steps": 2814, "loss": 0.5545, "lr": 4.0620532118970076e-07, "epoch": 2.4714666666666667, "percentage": 82.37, "elapsed_time": "0:03:23", "remaining_time": "0:00:43"}
19
+ {"current_steps": 2319, "total_steps": 2814, "loss": 0.58, "lr": 4.0461788424412065e-07, "epoch": 2.4725333333333332, "percentage": 82.41, "elapsed_time": "0:03:34", "remaining_time": "0:00:45"}
20
+ {"current_steps": 2320, "total_steps": 2814, "loss": 0.5084, "lr": 4.030332820419941e-07, "epoch": 2.4736000000000002, "percentage": 82.44, "elapsed_time": "0:03:45", "remaining_time": "0:00:47"}
21
+ {"current_steps": 2321, "total_steps": 2814, "loss": 0.5185, "lr": 4.0145151672705304e-07, "epoch": 2.474666666666667, "percentage": 82.48, "elapsed_time": "0:03:55", "remaining_time": "0:00:50"}
22
+ {"current_steps": 2322, "total_steps": 2814, "loss": 0.4911, "lr": 3.998725904391942e-07, "epoch": 2.4757333333333333, "percentage": 82.52, "elapsed_time": "0:04:06", "remaining_time": "0:00:52"}
23
+ {"current_steps": 2323, "total_steps": 2814, "loss": 0.5905, "lr": 3.982965053144716e-07, "epoch": 2.4768, "percentage": 82.55, "elapsed_time": "0:04:16", "remaining_time": "0:00:54"}
24
+ {"current_steps": 2324, "total_steps": 2814, "loss": 0.5296, "lr": 3.9672326348509704e-07, "epoch": 2.4778666666666664, "percentage": 82.59, "elapsed_time": "0:04:28", "remaining_time": "0:00:56"}
25
+ {"current_steps": 2325, "total_steps": 2814, "loss": 0.5228, "lr": 3.9515286707943413e-07, "epoch": 2.4789333333333334, "percentage": 82.62, "elapsed_time": "0:04:39", "remaining_time": "0:00:58"}
26
+ {"current_steps": 2326, "total_steps": 2814, "loss": 0.5941, "lr": 3.9358531822199926e-07, "epoch": 2.48, "percentage": 82.66, "elapsed_time": "0:04:49", "remaining_time": "0:01:00"}
27
+ {"current_steps": 2327, "total_steps": 2814, "loss": 0.5733, "lr": 3.920206190334536e-07, "epoch": 2.4810666666666665, "percentage": 82.69, "elapsed_time": "0:04:58", "remaining_time": "0:01:02"}
28
+ {"current_steps": 2328, "total_steps": 2814, "loss": 0.5297, "lr": 3.904587716306063e-07, "epoch": 2.4821333333333335, "percentage": 82.73, "elapsed_time": "0:05:09", "remaining_time": "0:01:04"}
29
+ {"current_steps": 2329, "total_steps": 2814, "loss": 0.5742, "lr": 3.8889977812640536e-07, "epoch": 2.4832, "percentage": 82.76, "elapsed_time": "0:05:19", "remaining_time": "0:01:06"}
30
+ {"current_steps": 2330, "total_steps": 2814, "loss": 0.5269, "lr": 3.8734364062994105e-07, "epoch": 2.4842666666666666, "percentage": 82.8, "elapsed_time": "0:05:30", "remaining_time": "0:01:08"}
31
+ {"current_steps": 2331, "total_steps": 2814, "loss": 0.5369, "lr": 3.857903612464367e-07, "epoch": 2.485333333333333, "percentage": 82.84, "elapsed_time": "0:05:42", "remaining_time": "0:01:11"}
32
+ {"current_steps": 2332, "total_steps": 2814, "loss": 0.5277, "lr": 3.8423994207725216e-07, "epoch": 2.4864, "percentage": 82.87, "elapsed_time": "0:05:54", "remaining_time": "0:01:13"}
33
+ {"current_steps": 2333, "total_steps": 2814, "loss": 0.524, "lr": 3.826923852198752e-07, "epoch": 2.4874666666666667, "percentage": 82.91, "elapsed_time": "0:06:07", "remaining_time": "0:01:15"}
34
+ {"current_steps": 2334, "total_steps": 2814, "loss": 0.5307, "lr": 3.811476927679228e-07, "epoch": 2.4885333333333333, "percentage": 82.94, "elapsed_time": "0:06:18", "remaining_time": "0:01:17"}
35
+ {"current_steps": 2335, "total_steps": 2814, "loss": 0.5276, "lr": 3.7960586681113546e-07, "epoch": 2.4896, "percentage": 82.98, "elapsed_time": "0:06:30", "remaining_time": "0:01:20"}
36
+ {"current_steps": 2336, "total_steps": 2814, "loss": 0.568, "lr": 3.780669094353781e-07, "epoch": 2.490666666666667, "percentage": 83.01, "elapsed_time": "0:06:40", "remaining_time": "0:01:22"}
37
+ {"current_steps": 2337, "total_steps": 2814, "loss": 0.5438, "lr": 3.765308227226322e-07, "epoch": 2.4917333333333334, "percentage": 83.05, "elapsed_time": "0:06:53", "remaining_time": "0:01:24"}
38
+ {"current_steps": 2338, "total_steps": 2814, "loss": 0.4962, "lr": 3.7499760875099737e-07, "epoch": 2.4928, "percentage": 83.08, "elapsed_time": "0:07:06", "remaining_time": "0:01:26"}
39
+ {"current_steps": 2339, "total_steps": 2814, "loss": 0.5215, "lr": 3.7346726959468697e-07, "epoch": 2.4938666666666665, "percentage": 83.12, "elapsed_time": "0:07:16", "remaining_time": "0:01:28"}
40
+ {"current_steps": 2340, "total_steps": 2814, "loss": 0.5069, "lr": 3.7193980732402353e-07, "epoch": 2.4949333333333334, "percentage": 83.16, "elapsed_time": "0:07:28", "remaining_time": "0:01:30"}
41
+ {"current_steps": 2341, "total_steps": 2814, "loss": 0.5482, "lr": 3.7041522400543953e-07, "epoch": 2.496, "percentage": 83.19, "elapsed_time": "0:07:40", "remaining_time": "0:01:33"}
42
+ {"current_steps": 2342, "total_steps": 2814, "loss": 0.5152, "lr": 3.688935217014705e-07, "epoch": 2.4970666666666665, "percentage": 83.23, "elapsed_time": "0:07:50", "remaining_time": "0:01:34"}
43
+ {"current_steps": 2343, "total_steps": 2814, "loss": 0.5404, "lr": 3.6737470247075693e-07, "epoch": 2.4981333333333335, "percentage": 83.26, "elapsed_time": "0:08:02", "remaining_time": "0:01:36"}
44
+ {"current_steps": 2344, "total_steps": 2814, "loss": 0.551, "lr": 3.6585876836803645e-07, "epoch": 2.4992, "percentage": 83.3, "elapsed_time": "0:08:14", "remaining_time": "0:01:39"}
45
+ {"current_steps": 2345, "total_steps": 2814, "loss": 0.544, "lr": 3.6434572144414564e-07, "epoch": 2.5002666666666666, "percentage": 83.33, "elapsed_time": "0:08:24", "remaining_time": "0:01:40"}
46
+ {"current_steps": 2346, "total_steps": 2814, "loss": 0.5062, "lr": 3.628355637460132e-07, "epoch": 2.501333333333333, "percentage": 83.37, "elapsed_time": "0:08:35", "remaining_time": "0:01:42"}
47
+ {"current_steps": 2347, "total_steps": 2814, "loss": 0.4998, "lr": 3.61328297316661e-07, "epoch": 2.5023999999999997, "percentage": 83.4, "elapsed_time": "0:08:46", "remaining_time": "0:01:44"}
48
+ {"current_steps": 2348, "total_steps": 2814, "loss": 0.5378, "lr": 3.598239241951984e-07, "epoch": 2.5034666666666667, "percentage": 83.44, "elapsed_time": "0:08:55", "remaining_time": "0:01:46"}
49
+ {"current_steps": 2349, "total_steps": 2814, "loss": 0.5492, "lr": 3.5832244641682055e-07, "epoch": 2.5045333333333333, "percentage": 83.48, "elapsed_time": "0:09:05", "remaining_time": "0:01:47"}
50
+ {"current_steps": 2350, "total_steps": 2814, "loss": 0.5689, "lr": 3.568238660128051e-07, "epoch": 2.5056000000000003, "percentage": 83.51, "elapsed_time": "0:09:16", "remaining_time": "0:01:49"}
51
+ {"current_steps": 2351, "total_steps": 2814, "loss": 0.6394, "lr": 3.553281850105117e-07, "epoch": 2.506666666666667, "percentage": 83.55, "elapsed_time": "0:09:25", "remaining_time": "0:01:51"}
52
+ {"current_steps": 2352, "total_steps": 2814, "loss": 0.4936, "lr": 3.538354054333756e-07, "epoch": 2.5077333333333334, "percentage": 83.58, "elapsed_time": "0:09:38", "remaining_time": "0:01:53"}
53
+ {"current_steps": 2353, "total_steps": 2814, "loss": 0.5255, "lr": 3.523455293009093e-07, "epoch": 2.5088, "percentage": 83.62, "elapsed_time": "0:09:49", "remaining_time": "0:01:55"}
54
+ {"current_steps": 2354, "total_steps": 2814, "loss": 0.5377, "lr": 3.508585586286936e-07, "epoch": 2.5098666666666665, "percentage": 83.65, "elapsed_time": "0:10:00", "remaining_time": "0:01:57"}
55
+ {"current_steps": 2355, "total_steps": 2814, "loss": 0.4954, "lr": 3.4937449542838317e-07, "epoch": 2.5109333333333335, "percentage": 83.69, "elapsed_time": "0:10:11", "remaining_time": "0:01:59"}
56
+ {"current_steps": 2356, "total_steps": 2814, "loss": 0.5384, "lr": 3.478933417076949e-07, "epoch": 2.512, "percentage": 83.72, "elapsed_time": "0:10:23", "remaining_time": "0:02:01"}
57
+ {"current_steps": 2357, "total_steps": 2814, "loss": 0.506, "lr": 3.464150994704127e-07, "epoch": 2.5130666666666666, "percentage": 83.76, "elapsed_time": "0:10:32", "remaining_time": "0:02:02"}
58
+ {"current_steps": 2358, "total_steps": 2814, "loss": 0.5172, "lr": 3.449397707163812e-07, "epoch": 2.5141333333333336, "percentage": 83.8, "elapsed_time": "0:10:45", "remaining_time": "0:02:04"}
59
+ {"current_steps": 2359, "total_steps": 2814, "loss": 0.5283, "lr": 3.434673574415018e-07, "epoch": 2.5152, "percentage": 83.83, "elapsed_time": "0:10:56", "remaining_time": "0:02:06"}
60
+ {"current_steps": 2360, "total_steps": 2814, "loss": 0.5418, "lr": 3.4199786163773395e-07, "epoch": 2.5162666666666667, "percentage": 83.87, "elapsed_time": "0:11:06", "remaining_time": "0:02:08"}
61
+ {"current_steps": 2361, "total_steps": 2814, "loss": 0.5024, "lr": 3.405312852930881e-07, "epoch": 2.517333333333333, "percentage": 83.9, "elapsed_time": "0:11:19", "remaining_time": "0:02:10"}
62
+ {"current_steps": 2362, "total_steps": 2814, "loss": 0.5106, "lr": 3.390676303916268e-07, "epoch": 2.5183999999999997, "percentage": 83.94, "elapsed_time": "0:11:31", "remaining_time": "0:02:12"}
63
+ {"current_steps": 2363, "total_steps": 2814, "loss": 0.5732, "lr": 3.3760689891345953e-07, "epoch": 2.5194666666666667, "percentage": 83.97, "elapsed_time": "0:11:42", "remaining_time": "0:02:14"}
64
+ {"current_steps": 2364, "total_steps": 2814, "loss": 0.5248, "lr": 3.3614909283474053e-07, "epoch": 2.5205333333333333, "percentage": 84.01, "elapsed_time": "0:11:53", "remaining_time": "0:02:15"}
65
+ {"current_steps": 2365, "total_steps": 2814, "loss": 0.5212, "lr": 3.346942141276666e-07, "epoch": 2.5216, "percentage": 84.04, "elapsed_time": "0:12:03", "remaining_time": "0:02:17"}
66
+ {"current_steps": 2366, "total_steps": 2814, "loss": 0.4839, "lr": 3.3324226476047496e-07, "epoch": 2.522666666666667, "percentage": 84.08, "elapsed_time": "0:12:15", "remaining_time": "0:02:19"}
67
+ {"current_steps": 2367, "total_steps": 2814, "loss": 0.503, "lr": 3.3179324669743857e-07, "epoch": 2.5237333333333334, "percentage": 84.12, "elapsed_time": "0:12:26", "remaining_time": "0:02:21"}
68
+ {"current_steps": 2368, "total_steps": 2814, "loss": 0.5367, "lr": 3.3034716189886656e-07, "epoch": 2.5248, "percentage": 84.15, "elapsed_time": "0:12:39", "remaining_time": "0:02:22"}
69
+ {"current_steps": 2369, "total_steps": 2814, "loss": 0.4543, "lr": 3.2890401232109736e-07, "epoch": 2.5258666666666665, "percentage": 84.19, "elapsed_time": "0:12:49", "remaining_time": "0:02:24"}
70
+ {"current_steps": 2370, "total_steps": 2814, "loss": 0.539, "lr": 3.2746379991650147e-07, "epoch": 2.5269333333333335, "percentage": 84.22, "elapsed_time": "0:13:00", "remaining_time": "0:02:26"}
71
+ {"current_steps": 2371, "total_steps": 2814, "loss": 0.5363, "lr": 3.260265266334725e-07, "epoch": 2.528, "percentage": 84.26, "elapsed_time": "0:13:11", "remaining_time": "0:02:27"}
72
+ {"current_steps": 2372, "total_steps": 2814, "loss": 0.4691, "lr": 3.2459219441643124e-07, "epoch": 2.5290666666666666, "percentage": 84.29, "elapsed_time": "0:13:23", "remaining_time": "0:02:29"}
73
+ {"current_steps": 2373, "total_steps": 2814, "loss": 0.5188, "lr": 3.2316080520581635e-07, "epoch": 2.5301333333333336, "percentage": 84.33, "elapsed_time": "0:13:37", "remaining_time": "0:02:31"}
74
+ {"current_steps": 2374, "total_steps": 2814, "loss": 0.4875, "lr": 3.217323609380882e-07, "epoch": 2.5312, "percentage": 84.36, "elapsed_time": "0:13:49", "remaining_time": "0:02:33"}
75
+ {"current_steps": 2375, "total_steps": 2814, "loss": 0.4964, "lr": 3.203068635457202e-07, "epoch": 2.5322666666666667, "percentage": 84.4, "elapsed_time": "0:14:02", "remaining_time": "0:02:35"}
76
+ {"current_steps": 2376, "total_steps": 2814, "loss": 0.4844, "lr": 3.1888431495720127e-07, "epoch": 2.533333333333333, "percentage": 84.43, "elapsed_time": "0:14:15", "remaining_time": "0:02:37"}
77
+ {"current_steps": 2377, "total_steps": 2814, "loss": 0.4902, "lr": 3.1746471709702963e-07, "epoch": 2.5343999999999998, "percentage": 84.47, "elapsed_time": "0:14:26", "remaining_time": "0:02:39"}
78
+ {"current_steps": 2378, "total_steps": 2814, "loss": 0.5137, "lr": 3.1604807188571283e-07, "epoch": 2.5354666666666668, "percentage": 84.51, "elapsed_time": "0:14:38", "remaining_time": "0:02:41"}
79
+ {"current_steps": 2379, "total_steps": 2814, "loss": 0.5056, "lr": 3.1463438123976286e-07, "epoch": 2.5365333333333333, "percentage": 84.54, "elapsed_time": "0:14:50", "remaining_time": "0:02:42"}
80
+ {"current_steps": 2380, "total_steps": 2814, "loss": 0.5126, "lr": 3.132236470716943e-07, "epoch": 2.5376, "percentage": 84.58, "elapsed_time": "0:15:01", "remaining_time": "0:02:44"}
81
+ {"current_steps": 2381, "total_steps": 2814, "loss": 0.5519, "lr": 3.1181587129002403e-07, "epoch": 2.538666666666667, "percentage": 84.61, "elapsed_time": "0:15:12", "remaining_time": "0:02:45"}
82
+ {"current_steps": 2382, "total_steps": 2814, "loss": 0.5238, "lr": 3.1041105579926426e-07, "epoch": 2.5397333333333334, "percentage": 84.65, "elapsed_time": "0:15:24", "remaining_time": "0:02:47"}
83
+ {"current_steps": 2383, "total_steps": 2814, "loss": 0.4824, "lr": 3.0900920249992446e-07, "epoch": 2.5408, "percentage": 84.68, "elapsed_time": "0:15:35", "remaining_time": "0:02:49"}
84
+ {"current_steps": 2384, "total_steps": 2814, "loss": 0.5285, "lr": 3.076103132885047e-07, "epoch": 2.5418666666666665, "percentage": 84.72, "elapsed_time": "0:15:47", "remaining_time": "0:02:50"}
85
+ {"current_steps": 2385, "total_steps": 2814, "loss": 0.5084, "lr": 3.062143900574971e-07, "epoch": 2.5429333333333335, "percentage": 84.75, "elapsed_time": "0:15:59", "remaining_time": "0:02:52"}
86
+ {"current_steps": 2386, "total_steps": 2814, "loss": 0.5121, "lr": 3.0482143469537955e-07, "epoch": 2.544, "percentage": 84.79, "elapsed_time": "0:16:11", "remaining_time": "0:02:54"}
87
+ {"current_steps": 2387, "total_steps": 2814, "loss": 0.5389, "lr": 3.034314490866161e-07, "epoch": 2.5450666666666666, "percentage": 84.83, "elapsed_time": "0:16:22", "remaining_time": "0:02:55"}
88
+ {"current_steps": 2388, "total_steps": 2814, "loss": 0.5485, "lr": 3.0204443511165167e-07, "epoch": 2.5461333333333336, "percentage": 84.86, "elapsed_time": "0:16:34", "remaining_time": "0:02:57"}
89
+ {"current_steps": 2389, "total_steps": 2814, "loss": 0.5117, "lr": 3.0066039464691304e-07, "epoch": 2.5472, "percentage": 84.9, "elapsed_time": "0:16:48", "remaining_time": "0:02:59"}
90
+ {"current_steps": 2390, "total_steps": 2814, "loss": 0.5698, "lr": 2.992793295648022e-07, "epoch": 2.5482666666666667, "percentage": 84.93, "elapsed_time": "0:16:58", "remaining_time": "0:03:00"}
91
+ {"current_steps": 2391, "total_steps": 2814, "loss": 0.5314, "lr": 2.9790124173369765e-07, "epoch": 2.5493333333333332, "percentage": 84.97, "elapsed_time": "0:17:09", "remaining_time": "0:03:02"}
92
+ {"current_steps": 2392, "total_steps": 2814, "loss": 0.5574, "lr": 2.9652613301794865e-07, "epoch": 2.5504, "percentage": 85.0, "elapsed_time": "0:17:19", "remaining_time": "0:03:03"}
93
+ {"current_steps": 2393, "total_steps": 2814, "loss": 0.5888, "lr": 2.9515400527787537e-07, "epoch": 2.5514666666666668, "percentage": 85.04, "elapsed_time": "0:17:28", "remaining_time": "0:03:04"}
94
+ {"current_steps": 2394, "total_steps": 2814, "loss": 0.5367, "lr": 2.937848603697643e-07, "epoch": 2.5525333333333333, "percentage": 85.07, "elapsed_time": "0:17:42", "remaining_time": "0:03:06"}
95
+ {"current_steps": 2395, "total_steps": 2814, "loss": 0.5259, "lr": 2.9241870014586645e-07, "epoch": 2.5536, "percentage": 85.11, "elapsed_time": "0:17:51", "remaining_time": "0:03:07"}
96
+ {"current_steps": 2396, "total_steps": 2814, "loss": 0.5501, "lr": 2.910555264543966e-07, "epoch": 2.554666666666667, "percentage": 85.15, "elapsed_time": "0:18:02", "remaining_time": "0:03:08"}
97
+ {"current_steps": 2397, "total_steps": 2814, "loss": 0.5682, "lr": 2.896953411395265e-07, "epoch": 2.5557333333333334, "percentage": 85.18, "elapsed_time": "0:18:14", "remaining_time": "0:03:10"}
98
+ {"current_steps": 2398, "total_steps": 2814, "loss": 0.5012, "lr": 2.883381460413881e-07, "epoch": 2.5568, "percentage": 85.22, "elapsed_time": "0:18:24", "remaining_time": "0:03:11"}
99
+ {"current_steps": 2399, "total_steps": 2814, "loss": 0.4842, "lr": 2.8698394299606495e-07, "epoch": 2.5578666666666665, "percentage": 85.25, "elapsed_time": "0:18:34", "remaining_time": "0:03:12"}
100
+ {"current_steps": 2400, "total_steps": 2814, "loss": 0.5043, "lr": 2.8563273383559606e-07, "epoch": 2.558933333333333, "percentage": 85.29, "elapsed_time": "0:18:46", "remaining_time": "0:03:14"}
101
+ {"current_steps": 2401, "total_steps": 2814, "loss": 0.5928, "lr": 2.842845203879671e-07, "epoch": 2.56, "percentage": 85.32, "elapsed_time": "0:20:42", "remaining_time": "0:03:33"}
102
+ {"current_steps": 2402, "total_steps": 2814, "loss": 0.5272, "lr": 2.829393044771131e-07, "epoch": 2.5610666666666666, "percentage": 85.36, "elapsed_time": "0:20:54", "remaining_time": "0:03:35"}
103
+ {"current_steps": 2403, "total_steps": 2814, "loss": 0.515, "lr": 2.815970879229127e-07, "epoch": 2.5621333333333336, "percentage": 85.39, "elapsed_time": "0:21:05", "remaining_time": "0:03:36"}
104
+ {"current_steps": 2404, "total_steps": 2814, "loss": 0.5759, "lr": 2.802578725411875e-07, "epoch": 2.5632, "percentage": 85.43, "elapsed_time": "0:21:15", "remaining_time": "0:03:37"}
105
+ {"current_steps": 2405, "total_steps": 2814, "loss": 0.557, "lr": 2.7892166014369834e-07, "epoch": 2.5642666666666667, "percentage": 85.47, "elapsed_time": "0:21:24", "remaining_time": "0:03:38"}
106
+ {"current_steps": 2406, "total_steps": 2814, "loss": 0.5105, "lr": 2.7758845253814474e-07, "epoch": 2.5653333333333332, "percentage": 85.5, "elapsed_time": "0:21:36", "remaining_time": "0:03:39"}
107
+ {"current_steps": 2407, "total_steps": 2814, "loss": 0.5525, "lr": 2.762582515281595e-07, "epoch": 2.5664, "percentage": 85.54, "elapsed_time": "0:21:46", "remaining_time": "0:03:40"}
108
+ {"current_steps": 2408, "total_steps": 2814, "loss": 0.5078, "lr": 2.7493105891330837e-07, "epoch": 2.567466666666667, "percentage": 85.57, "elapsed_time": "0:21:57", "remaining_time": "0:03:42"}
109
+ {"current_steps": 2409, "total_steps": 2814, "loss": 0.5769, "lr": 2.736068764890884e-07, "epoch": 2.5685333333333333, "percentage": 85.61, "elapsed_time": "0:22:06", "remaining_time": "0:03:43"}
110
+ {"current_steps": 2410, "total_steps": 2814, "loss": 0.5275, "lr": 2.7228570604692335e-07, "epoch": 2.5696, "percentage": 85.64, "elapsed_time": "0:22:17", "remaining_time": "0:03:44"}
111
+ {"current_steps": 2411, "total_steps": 2814, "loss": 0.5808, "lr": 2.709675493741612e-07, "epoch": 2.570666666666667, "percentage": 85.68, "elapsed_time": "0:22:27", "remaining_time": "0:03:45"}
112
+ {"current_steps": 2412, "total_steps": 2814, "loss": 0.5316, "lr": 2.6965240825407466e-07, "epoch": 2.5717333333333334, "percentage": 85.71, "elapsed_time": "0:22:38", "remaining_time": "0:03:46"}
113
+ {"current_steps": 2413, "total_steps": 2814, "loss": 0.5908, "lr": 2.683402844658564e-07, "epoch": 2.5728, "percentage": 85.75, "elapsed_time": "0:22:51", "remaining_time": "0:03:47"}
114
+ {"current_steps": 2414, "total_steps": 2814, "loss": 0.4956, "lr": 2.6703117978461564e-07, "epoch": 2.5738666666666665, "percentage": 85.79, "elapsed_time": "0:23:02", "remaining_time": "0:03:49"}
115
+ {"current_steps": 2415, "total_steps": 2814, "loss": 0.5812, "lr": 2.657250959813795e-07, "epoch": 2.574933333333333, "percentage": 85.82, "elapsed_time": "0:23:14", "remaining_time": "0:03:50"}
116
+ {"current_steps": 2416, "total_steps": 2814, "loss": 0.5151, "lr": 2.644220348230858e-07, "epoch": 2.576, "percentage": 85.86, "elapsed_time": "0:23:25", "remaining_time": "0:03:51"}
117
+ {"current_steps": 2417, "total_steps": 2814, "loss": 0.51, "lr": 2.6312199807258547e-07, "epoch": 2.5770666666666666, "percentage": 85.89, "elapsed_time": "0:23:36", "remaining_time": "0:03:52"}
118
+ {"current_steps": 2418, "total_steps": 2814, "loss": 0.5049, "lr": 2.618249874886358e-07, "epoch": 2.5781333333333336, "percentage": 85.93, "elapsed_time": "0:23:49", "remaining_time": "0:03:54"}
119
+ {"current_steps": 2419, "total_steps": 2814, "loss": 0.5073, "lr": 2.605310048259022e-07, "epoch": 2.5792, "percentage": 85.96, "elapsed_time": "0:24:00", "remaining_time": "0:03:55"}
120
+ {"current_steps": 2420, "total_steps": 2814, "loss": 0.5403, "lr": 2.592400518349517e-07, "epoch": 2.5802666666666667, "percentage": 86.0, "elapsed_time": "0:24:10", "remaining_time": "0:03:56"}
121
+ {"current_steps": 2421, "total_steps": 2814, "loss": 0.5084, "lr": 2.5795213026225427e-07, "epoch": 2.5813333333333333, "percentage": 86.03, "elapsed_time": "0:24:22", "remaining_time": "0:03:57"}
122
+ {"current_steps": 2422, "total_steps": 2814, "loss": 0.516, "lr": 2.5666724185017785e-07, "epoch": 2.5824, "percentage": 86.07, "elapsed_time": "0:24:34", "remaining_time": "0:03:58"}
123
+ {"current_steps": 2423, "total_steps": 2814, "loss": 0.4863, "lr": 2.553853883369875e-07, "epoch": 2.583466666666667, "percentage": 86.11, "elapsed_time": "0:24:47", "remaining_time": "0:04:00"}
124
+ {"current_steps": 2424, "total_steps": 2814, "loss": 0.5543, "lr": 2.5410657145684165e-07, "epoch": 2.5845333333333333, "percentage": 86.14, "elapsed_time": "0:24:58", "remaining_time": "0:04:01"}
125
+ {"current_steps": 2425, "total_steps": 2814, "loss": 0.519, "lr": 2.528307929397919e-07, "epoch": 2.5856, "percentage": 86.18, "elapsed_time": "0:25:08", "remaining_time": "0:04:01"}
126
+ {"current_steps": 2426, "total_steps": 2814, "loss": 0.5528, "lr": 2.5155805451177793e-07, "epoch": 2.586666666666667, "percentage": 86.21, "elapsed_time": "0:25:18", "remaining_time": "0:04:02"}
127
+ {"current_steps": 2427, "total_steps": 2814, "loss": 0.4754, "lr": 2.502883578946286e-07, "epoch": 2.5877333333333334, "percentage": 86.25, "elapsed_time": "0:25:29", "remaining_time": "0:04:03"}
128
+ {"current_steps": 2428, "total_steps": 2814, "loss": 0.5577, "lr": 2.490217048060553e-07, "epoch": 2.5888, "percentage": 86.28, "elapsed_time": "0:25:38", "remaining_time": "0:04:04"}
129
+ {"current_steps": 2429, "total_steps": 2814, "loss": 0.5588, "lr": 2.4775809695965436e-07, "epoch": 2.5898666666666665, "percentage": 86.32, "elapsed_time": "0:25:51", "remaining_time": "0:04:05"}
130
+ {"current_steps": 2430, "total_steps": 2814, "loss": 0.6043, "lr": 2.4649753606489966e-07, "epoch": 2.590933333333333, "percentage": 86.35, "elapsed_time": "0:26:00", "remaining_time": "0:04:06"}
131
+ {"current_steps": 2431, "total_steps": 2814, "loss": 0.5324, "lr": 2.452400238271455e-07, "epoch": 2.592, "percentage": 86.39, "elapsed_time": "0:26:13", "remaining_time": "0:04:07"}
132
+ {"current_steps": 2432, "total_steps": 2814, "loss": 0.5622, "lr": 2.439855619476206e-07, "epoch": 2.5930666666666666, "percentage": 86.43, "elapsed_time": "0:26:26", "remaining_time": "0:04:09"}
133
+ {"current_steps": 2433, "total_steps": 2814, "loss": 0.5026, "lr": 2.427341521234269e-07, "epoch": 2.594133333333333, "percentage": 86.46, "elapsed_time": "0:26:37", "remaining_time": "0:04:10"}
134
+ {"current_steps": 2434, "total_steps": 2814, "loss": 0.5259, "lr": 2.4148579604753805e-07, "epoch": 2.5952, "percentage": 86.5, "elapsed_time": "0:26:48", "remaining_time": "0:04:11"}
135
+ {"current_steps": 2435, "total_steps": 2814, "loss": 0.5747, "lr": 2.4024049540879555e-07, "epoch": 2.5962666666666667, "percentage": 86.53, "elapsed_time": "0:26:59", "remaining_time": "0:04:12"}
136
+ {"current_steps": 2436, "total_steps": 2814, "loss": 0.5452, "lr": 2.389982518919082e-07, "epoch": 2.5973333333333333, "percentage": 86.57, "elapsed_time": "0:27:10", "remaining_time": "0:04:13"}
137
+ {"current_steps": 2437, "total_steps": 2814, "loss": 0.5779, "lr": 2.3775906717744852e-07, "epoch": 2.5984, "percentage": 86.6, "elapsed_time": "0:27:20", "remaining_time": "0:04:13"}
138
+ {"current_steps": 2438, "total_steps": 2814, "loss": 0.4873, "lr": 2.3652294294185056e-07, "epoch": 2.599466666666667, "percentage": 86.64, "elapsed_time": "0:27:30", "remaining_time": "0:04:14"}
139
+ {"current_steps": 2439, "total_steps": 2814, "loss": 0.5392, "lr": 2.3528988085740807e-07, "epoch": 2.6005333333333334, "percentage": 86.67, "elapsed_time": "0:27:40", "remaining_time": "0:04:15"}
140
+ {"current_steps": 2440, "total_steps": 2814, "loss": 0.5535, "lr": 2.340598825922738e-07, "epoch": 2.6016, "percentage": 86.71, "elapsed_time": "0:27:51", "remaining_time": "0:04:16"}
141
+ {"current_steps": 2441, "total_steps": 2814, "loss": 0.5573, "lr": 2.3283294981045285e-07, "epoch": 2.602666666666667, "percentage": 86.74, "elapsed_time": "0:28:03", "remaining_time": "0:04:17"}
142
+ {"current_steps": 2442, "total_steps": 2814, "loss": 0.5478, "lr": 2.316090841718055e-07, "epoch": 2.6037333333333335, "percentage": 86.78, "elapsed_time": "0:28:14", "remaining_time": "0:04:18"}
143
+ {"current_steps": 2443, "total_steps": 2814, "loss": 0.4621, "lr": 2.3038828733204122e-07, "epoch": 2.6048, "percentage": 86.82, "elapsed_time": "0:28:25", "remaining_time": "0:04:19"}
144
+ {"current_steps": 2444, "total_steps": 2814, "loss": 0.5964, "lr": 2.2917056094271916e-07, "epoch": 2.6058666666666666, "percentage": 86.85, "elapsed_time": "0:28:36", "remaining_time": "0:04:19"}
145
+ {"current_steps": 2445, "total_steps": 2814, "loss": 0.5108, "lr": 2.2795590665124267e-07, "epoch": 2.606933333333333, "percentage": 86.89, "elapsed_time": "0:28:46", "remaining_time": "0:04:20"}
146
+ {"current_steps": 2446, "total_steps": 2814, "loss": 0.4799, "lr": 2.2674432610086145e-07, "epoch": 2.608, "percentage": 86.92, "elapsed_time": "0:28:58", "remaining_time": "0:04:21"}
147
+ {"current_steps": 2447, "total_steps": 2814, "loss": 0.513, "lr": 2.2553582093066405e-07, "epoch": 2.6090666666666666, "percentage": 86.96, "elapsed_time": "0:29:09", "remaining_time": "0:04:22"}
148
+ {"current_steps": 2448, "total_steps": 2814, "loss": 0.5035, "lr": 2.2433039277558134e-07, "epoch": 2.610133333333333, "percentage": 86.99, "elapsed_time": "0:29:20", "remaining_time": "0:04:23"}
149
+ {"current_steps": 2449, "total_steps": 2814, "loss": 0.5297, "lr": 2.231280432663796e-07, "epoch": 2.6112, "percentage": 87.03, "elapsed_time": "0:29:31", "remaining_time": "0:04:24"}
150
+ {"current_steps": 2450, "total_steps": 2814, "loss": 0.5666, "lr": 2.219287740296605e-07, "epoch": 2.6122666666666667, "percentage": 87.06, "elapsed_time": "0:29:43", "remaining_time": "0:04:24"}
151
+ {"current_steps": 2451, "total_steps": 2814, "loss": 0.5473, "lr": 2.2073258668785929e-07, "epoch": 2.6133333333333333, "percentage": 87.1, "elapsed_time": "0:29:54", "remaining_time": "0:04:25"}
152
+ {"current_steps": 2452, "total_steps": 2814, "loss": 0.4875, "lr": 2.1953948285924093e-07, "epoch": 2.6144, "percentage": 87.14, "elapsed_time": "0:30:06", "remaining_time": "0:04:26"}
153
+ {"current_steps": 2453, "total_steps": 2814, "loss": 0.5229, "lr": 2.1834946415789926e-07, "epoch": 2.615466666666667, "percentage": 87.17, "elapsed_time": "0:30:16", "remaining_time": "0:04:27"}
154
+ {"current_steps": 2454, "total_steps": 2814, "loss": 0.4858, "lr": 2.1716253219375372e-07, "epoch": 2.6165333333333334, "percentage": 87.21, "elapsed_time": "0:30:27", "remaining_time": "0:04:28"}
155
+ {"current_steps": 2455, "total_steps": 2814, "loss": 0.5591, "lr": 2.1597868857254955e-07, "epoch": 2.6176, "percentage": 87.24, "elapsed_time": "0:30:37", "remaining_time": "0:04:28"}
156
+ {"current_steps": 2456, "total_steps": 2814, "loss": 0.5202, "lr": 2.1479793489585176e-07, "epoch": 2.618666666666667, "percentage": 87.28, "elapsed_time": "0:30:48", "remaining_time": "0:04:29"}
157
+ {"current_steps": 2457, "total_steps": 2814, "loss": 0.5678, "lr": 2.1362027276104757e-07, "epoch": 2.6197333333333335, "percentage": 87.31, "elapsed_time": "0:30:59", "remaining_time": "0:04:30"}
158
+ {"current_steps": 2458, "total_steps": 2814, "loss": 0.5089, "lr": 2.1244570376133894e-07, "epoch": 2.6208, "percentage": 87.35, "elapsed_time": "0:31:09", "remaining_time": "0:04:30"}
159
+ {"current_steps": 2459, "total_steps": 2814, "loss": 0.5652, "lr": 2.112742294857459e-07, "epoch": 2.6218666666666666, "percentage": 87.38, "elapsed_time": "0:31:20", "remaining_time": "0:04:31"}
160
+ {"current_steps": 2460, "total_steps": 2814, "loss": 0.5714, "lr": 2.1010585151910012e-07, "epoch": 2.622933333333333, "percentage": 87.42, "elapsed_time": "0:31:30", "remaining_time": "0:04:32"}
161
+ {"current_steps": 2461, "total_steps": 2814, "loss": 0.522, "lr": 2.0894057144204505e-07, "epoch": 2.624, "percentage": 87.46, "elapsed_time": "0:31:44", "remaining_time": "0:04:33"}
162
+ {"current_steps": 2462, "total_steps": 2814, "loss": 0.6123, "lr": 2.0777839083103295e-07, "epoch": 2.6250666666666667, "percentage": 87.49, "elapsed_time": "0:31:55", "remaining_time": "0:04:33"}
163
+ {"current_steps": 2463, "total_steps": 2814, "loss": 0.5462, "lr": 2.0661931125832342e-07, "epoch": 2.626133333333333, "percentage": 87.53, "elapsed_time": "0:32:05", "remaining_time": "0:04:34"}
164
+ {"current_steps": 2464, "total_steps": 2814, "loss": 0.5554, "lr": 2.0546333429197984e-07, "epoch": 2.6272, "percentage": 87.56, "elapsed_time": "0:32:15", "remaining_time": "0:04:34"}
165
+ {"current_steps": 2465, "total_steps": 2814, "loss": 0.4546, "lr": 2.0431046149586935e-07, "epoch": 2.6282666666666668, "percentage": 87.6, "elapsed_time": "0:32:26", "remaining_time": "0:04:35"}
166
+ {"current_steps": 2466, "total_steps": 2814, "loss": 0.5163, "lr": 2.031606944296588e-07, "epoch": 2.6293333333333333, "percentage": 87.63, "elapsed_time": "0:32:38", "remaining_time": "0:04:36"}
167
+ {"current_steps": 2467, "total_steps": 2814, "loss": 0.6033, "lr": 2.020140346488139e-07, "epoch": 2.6304, "percentage": 87.67, "elapsed_time": "0:32:48", "remaining_time": "0:04:36"}
168
+ {"current_steps": 2468, "total_steps": 2814, "loss": 0.5275, "lr": 2.008704837045966e-07, "epoch": 2.6314666666666664, "percentage": 87.7, "elapsed_time": "0:33:00", "remaining_time": "0:04:37"}
169
+ {"current_steps": 2469, "total_steps": 2814, "loss": 0.5473, "lr": 1.997300431440624e-07, "epoch": 2.6325333333333334, "percentage": 87.74, "elapsed_time": "0:33:10", "remaining_time": "0:04:38"}
170
+ {"current_steps": 2470, "total_steps": 2814, "loss": 0.5295, "lr": 1.9859271451006024e-07, "epoch": 2.6336, "percentage": 87.78, "elapsed_time": "0:33:21", "remaining_time": "0:04:38"}
171
+ {"current_steps": 2471, "total_steps": 2814, "loss": 0.546, "lr": 1.9745849934122707e-07, "epoch": 2.634666666666667, "percentage": 87.81, "elapsed_time": "0:33:31", "remaining_time": "0:04:39"}
172
+ {"current_steps": 2472, "total_steps": 2814, "loss": 0.5379, "lr": 1.9632739917199052e-07, "epoch": 2.6357333333333335, "percentage": 87.85, "elapsed_time": "0:33:43", "remaining_time": "0:04:39"}
173
+ {"current_steps": 2473, "total_steps": 2814, "loss": 0.4922, "lr": 1.9519941553256122e-07, "epoch": 2.6368, "percentage": 87.88, "elapsed_time": "0:33:56", "remaining_time": "0:04:40"}
174
+ {"current_steps": 2474, "total_steps": 2814, "loss": 0.5518, "lr": 1.9407454994893578e-07, "epoch": 2.6378666666666666, "percentage": 87.92, "elapsed_time": "0:34:08", "remaining_time": "0:04:41"}
175
+ {"current_steps": 2475, "total_steps": 2814, "loss": 0.5561, "lr": 1.929528039428913e-07, "epoch": 2.638933333333333, "percentage": 87.95, "elapsed_time": "0:34:20", "remaining_time": "0:04:42"}
176
+ {"current_steps": 2476, "total_steps": 2814, "loss": 0.5441, "lr": 1.9183417903198524e-07, "epoch": 2.64, "percentage": 87.99, "elapsed_time": "0:34:32", "remaining_time": "0:04:42"}
177
+ {"current_steps": 2477, "total_steps": 2814, "loss": 0.5306, "lr": 1.9071867672955185e-07, "epoch": 2.6410666666666667, "percentage": 88.02, "elapsed_time": "0:34:43", "remaining_time": "0:04:43"}
178
+ {"current_steps": 2478, "total_steps": 2814, "loss": 0.5559, "lr": 1.8960629854470252e-07, "epoch": 2.6421333333333332, "percentage": 88.06, "elapsed_time": "0:34:53", "remaining_time": "0:04:43"}
179
+ {"current_steps": 2479, "total_steps": 2814, "loss": 0.5592, "lr": 1.8849704598231989e-07, "epoch": 2.6432, "percentage": 88.1, "elapsed_time": "0:35:04", "remaining_time": "0:04:44"}
180
+ {"current_steps": 2480, "total_steps": 2814, "loss": 0.4687, "lr": 1.8739092054306018e-07, "epoch": 2.6442666666666668, "percentage": 88.13, "elapsed_time": "0:35:18", "remaining_time": "0:04:45"}
181
+ {"current_steps": 2481, "total_steps": 2814, "loss": 0.5645, "lr": 1.8628792372334808e-07, "epoch": 2.6453333333333333, "percentage": 88.17, "elapsed_time": "0:35:28", "remaining_time": "0:04:45"}
182
+ {"current_steps": 2482, "total_steps": 2814, "loss": 0.5119, "lr": 1.851880570153755e-07, "epoch": 2.6464, "percentage": 88.2, "elapsed_time": "0:35:39", "remaining_time": "0:04:46"}
183
+ {"current_steps": 2483, "total_steps": 2814, "loss": 0.5385, "lr": 1.8409132190710056e-07, "epoch": 2.6474666666666664, "percentage": 88.24, "elapsed_time": "0:35:52", "remaining_time": "0:04:46"}
184
+ {"current_steps": 2484, "total_steps": 2814, "loss": 0.5526, "lr": 1.8299771988224442e-07, "epoch": 2.6485333333333334, "percentage": 88.27, "elapsed_time": "0:36:03", "remaining_time": "0:04:47"}
185
+ {"current_steps": 2485, "total_steps": 2814, "loss": 0.6014, "lr": 1.8190725242028888e-07, "epoch": 2.6496, "percentage": 88.31, "elapsed_time": "0:36:14", "remaining_time": "0:04:47"}
186
+ {"current_steps": 2486, "total_steps": 2814, "loss": 0.4951, "lr": 1.8081992099647616e-07, "epoch": 2.6506666666666665, "percentage": 88.34, "elapsed_time": "0:36:24", "remaining_time": "0:04:48"}
187
+ {"current_steps": 2487, "total_steps": 2814, "loss": 0.5393, "lr": 1.797357270818062e-07, "epoch": 2.6517333333333335, "percentage": 88.38, "elapsed_time": "0:36:35", "remaining_time": "0:04:48"}
188
+ {"current_steps": 2488, "total_steps": 2814, "loss": 0.5412, "lr": 1.7865467214303296e-07, "epoch": 2.6528, "percentage": 88.42, "elapsed_time": "0:36:46", "remaining_time": "0:04:49"}
189
+ {"current_steps": 2489, "total_steps": 2814, "loss": 0.5453, "lr": 1.7757675764266512e-07, "epoch": 2.6538666666666666, "percentage": 88.45, "elapsed_time": "0:36:59", "remaining_time": "0:04:49"}
190
+ {"current_steps": 2490, "total_steps": 2814, "loss": 0.5323, "lr": 1.7650198503896182e-07, "epoch": 2.654933333333333, "percentage": 88.49, "elapsed_time": "0:37:12", "remaining_time": "0:04:50"}
191
+ {"current_steps": 2491, "total_steps": 2814, "loss": 0.5236, "lr": 1.754303557859327e-07, "epoch": 2.656, "percentage": 88.52, "elapsed_time": "0:37:20", "remaining_time": "0:04:50"}
192
+ {"current_steps": 2492, "total_steps": 2814, "loss": 0.5268, "lr": 1.7436187133333337e-07, "epoch": 2.6570666666666667, "percentage": 88.56, "elapsed_time": "0:37:31", "remaining_time": "0:04:50"}
193
+ {"current_steps": 2493, "total_steps": 2814, "loss": 0.5311, "lr": 1.73296533126667e-07, "epoch": 2.6581333333333332, "percentage": 88.59, "elapsed_time": "0:37:45", "remaining_time": "0:04:51"}
194
+ {"current_steps": 2494, "total_steps": 2814, "loss": 0.5376, "lr": 1.7223434260717876e-07, "epoch": 2.6592000000000002, "percentage": 88.63, "elapsed_time": "0:37:56", "remaining_time": "0:04:52"}
195
+ {"current_steps": 2495, "total_steps": 2814, "loss": 0.5022, "lr": 1.711753012118561e-07, "epoch": 2.660266666666667, "percentage": 88.66, "elapsed_time": "0:38:11", "remaining_time": "0:04:52"}
196
+ {"current_steps": 2496, "total_steps": 2814, "loss": 0.5504, "lr": 1.7011941037342633e-07, "epoch": 2.6613333333333333, "percentage": 88.7, "elapsed_time": "0:38:23", "remaining_time": "0:04:53"}
197
+ {"current_steps": 2497, "total_steps": 2814, "loss": 0.5396, "lr": 1.690666715203537e-07, "epoch": 2.6624, "percentage": 88.73, "elapsed_time": "0:38:35", "remaining_time": "0:04:53"}
198
+ {"current_steps": 2498, "total_steps": 2814, "loss": 0.5104, "lr": 1.6801708607683958e-07, "epoch": 2.6634666666666664, "percentage": 88.77, "elapsed_time": "0:38:46", "remaining_time": "0:04:54"}
199
+ {"current_steps": 2499, "total_steps": 2814, "loss": 0.5358, "lr": 1.669706554628181e-07, "epoch": 2.6645333333333334, "percentage": 88.81, "elapsed_time": "0:38:57", "remaining_time": "0:04:54"}
200
+ {"current_steps": 2500, "total_steps": 2814, "loss": 0.5529, "lr": 1.6592738109395544e-07, "epoch": 2.6656, "percentage": 88.84, "elapsed_time": "0:39:11", "remaining_time": "0:04:55"}
201
+ {"current_steps": 2501, "total_steps": 2814, "loss": 0.5662, "lr": 1.6488726438164903e-07, "epoch": 2.6666666666666665, "percentage": 88.88, "elapsed_time": "0:40:51", "remaining_time": "0:05:06"}
202
+ {"current_steps": 2502, "total_steps": 2814, "loss": 0.5222, "lr": 1.638503067330227e-07, "epoch": 2.6677333333333335, "percentage": 88.91, "elapsed_time": "0:41:01", "remaining_time": "0:05:06"}
203
+ {"current_steps": 2503, "total_steps": 2814, "loss": 0.4981, "lr": 1.6281650955092792e-07, "epoch": 2.6688, "percentage": 88.95, "elapsed_time": "0:41:12", "remaining_time": "0:05:07"}
204
+ {"current_steps": 2504, "total_steps": 2814, "loss": 0.5224, "lr": 1.6178587423394066e-07, "epoch": 2.6698666666666666, "percentage": 88.98, "elapsed_time": "0:41:23", "remaining_time": "0:05:07"}
205
+ {"current_steps": 2505, "total_steps": 2814, "loss": 0.5272, "lr": 1.6075840217635752e-07, "epoch": 2.670933333333333, "percentage": 89.02, "elapsed_time": "0:41:32", "remaining_time": "0:05:07"}
206
+ {"current_steps": 2506, "total_steps": 2814, "loss": 0.5246, "lr": 1.5973409476819802e-07, "epoch": 2.672, "percentage": 89.05, "elapsed_time": "0:41:42", "remaining_time": "0:05:07"}
207
+ {"current_steps": 2507, "total_steps": 2814, "loss": 0.4995, "lr": 1.5871295339519838e-07, "epoch": 2.6730666666666667, "percentage": 89.09, "elapsed_time": "0:41:54", "remaining_time": "0:05:07"}
208
+ {"current_steps": 2508, "total_steps": 2814, "loss": 0.5277, "lr": 1.576949794388133e-07, "epoch": 2.6741333333333333, "percentage": 89.13, "elapsed_time": "0:42:05", "remaining_time": "0:05:08"}
209
+ {"current_steps": 2509, "total_steps": 2814, "loss": 0.5422, "lr": 1.5668017427621085e-07, "epoch": 2.6752000000000002, "percentage": 89.16, "elapsed_time": "0:42:16", "remaining_time": "0:05:08"}
210
+ {"current_steps": 2510, "total_steps": 2814, "loss": 0.4891, "lr": 1.556685392802737e-07, "epoch": 2.676266666666667, "percentage": 89.2, "elapsed_time": "0:42:26", "remaining_time": "0:05:08"}
211
+ {"current_steps": 2511, "total_steps": 2814, "loss": 0.4773, "lr": 1.5466007581959487e-07, "epoch": 2.6773333333333333, "percentage": 89.23, "elapsed_time": "0:42:39", "remaining_time": "0:05:08"}
212
+ {"current_steps": 2512, "total_steps": 2814, "loss": 0.5231, "lr": 1.536547852584766e-07, "epoch": 2.6784, "percentage": 89.27, "elapsed_time": "0:42:49", "remaining_time": "0:05:08"}
213
+ {"current_steps": 2513, "total_steps": 2814, "loss": 0.5257, "lr": 1.5265266895692883e-07, "epoch": 2.6794666666666664, "percentage": 89.3, "elapsed_time": "0:43:02", "remaining_time": "0:05:09"}
214
+ {"current_steps": 2514, "total_steps": 2814, "loss": 0.538, "lr": 1.5165372827066788e-07, "epoch": 2.6805333333333334, "percentage": 89.34, "elapsed_time": "0:43:12", "remaining_time": "0:05:09"}
215
+ {"current_steps": 2515, "total_steps": 2814, "loss": 0.5292, "lr": 1.5065796455111304e-07, "epoch": 2.6816, "percentage": 89.37, "elapsed_time": "0:43:24", "remaining_time": "0:05:09"}
216
+ {"current_steps": 2516, "total_steps": 2814, "loss": 0.5112, "lr": 1.496653791453864e-07, "epoch": 2.6826666666666665, "percentage": 89.41, "elapsed_time": "0:43:34", "remaining_time": "0:05:09"}
217
+ {"current_steps": 2517, "total_steps": 2814, "loss": 0.5687, "lr": 1.4867597339630918e-07, "epoch": 2.6837333333333335, "percentage": 89.45, "elapsed_time": "0:43:46", "remaining_time": "0:05:09"}
218
+ {"current_steps": 2518, "total_steps": 2814, "loss": 0.4996, "lr": 1.4768974864240255e-07, "epoch": 2.6848, "percentage": 89.48, "elapsed_time": "0:43:57", "remaining_time": "0:05:10"}
219
+ {"current_steps": 2519, "total_steps": 2814, "loss": 0.5781, "lr": 1.467067062178823e-07, "epoch": 2.6858666666666666, "percentage": 89.52, "elapsed_time": "0:44:08", "remaining_time": "0:05:10"}
220
+ {"current_steps": 2520, "total_steps": 2814, "loss": 0.4681, "lr": 1.457268474526613e-07, "epoch": 2.686933333333333, "percentage": 89.55, "elapsed_time": "0:44:20", "remaining_time": "0:05:10"}
221
+ {"current_steps": 2521, "total_steps": 2814, "loss": 0.5112, "lr": 1.4475017367234306e-07, "epoch": 2.6879999999999997, "percentage": 89.59, "elapsed_time": "0:44:31", "remaining_time": "0:05:10"}
222
+ {"current_steps": 2522, "total_steps": 2814, "loss": 0.5548, "lr": 1.43776686198224e-07, "epoch": 2.6890666666666667, "percentage": 89.62, "elapsed_time": "0:44:42", "remaining_time": "0:05:10"}
223
+ {"current_steps": 2523, "total_steps": 2814, "loss": 0.5334, "lr": 1.428063863472895e-07, "epoch": 2.6901333333333333, "percentage": 89.66, "elapsed_time": "0:44:53", "remaining_time": "0:05:10"}
224
+ {"current_steps": 2524, "total_steps": 2814, "loss": 0.5103, "lr": 1.4183927543221177e-07, "epoch": 2.6912000000000003, "percentage": 89.69, "elapsed_time": "0:45:04", "remaining_time": "0:05:10"}
225
+ {"current_steps": 2525, "total_steps": 2814, "loss": 0.5302, "lr": 1.4087535476135007e-07, "epoch": 2.692266666666667, "percentage": 89.73, "elapsed_time": "0:45:15", "remaining_time": "0:05:10"}
226
+ {"current_steps": 2526, "total_steps": 2814, "loss": 0.4707, "lr": 1.3991462563874708e-07, "epoch": 2.6933333333333334, "percentage": 89.77, "elapsed_time": "0:45:28", "remaining_time": "0:05:11"}
227
+ {"current_steps": 2527, "total_steps": 2814, "loss": 0.4776, "lr": 1.3895708936412734e-07, "epoch": 2.6944, "percentage": 89.8, "elapsed_time": "0:45:38", "remaining_time": "0:05:11"}
228
+ {"current_steps": 2528, "total_steps": 2814, "loss": 0.5747, "lr": 1.3800274723289653e-07, "epoch": 2.6954666666666665, "percentage": 89.84, "elapsed_time": "0:45:49", "remaining_time": "0:05:11"}
229
+ {"current_steps": 2529, "total_steps": 2814, "loss": 0.5338, "lr": 1.370516005361394e-07, "epoch": 2.6965333333333334, "percentage": 89.87, "elapsed_time": "0:45:59", "remaining_time": "0:05:11"}
230
+ {"current_steps": 2530, "total_steps": 2814, "loss": 0.5117, "lr": 1.3610365056061668e-07, "epoch": 2.6976, "percentage": 89.91, "elapsed_time": "0:46:11", "remaining_time": "0:05:11"}
231
+ {"current_steps": 2531, "total_steps": 2814, "loss": 0.5443, "lr": 1.3515889858876612e-07, "epoch": 2.6986666666666665, "percentage": 89.94, "elapsed_time": "0:46:24", "remaining_time": "0:05:11"}
232
+ {"current_steps": 2532, "total_steps": 2814, "loss": 0.5338, "lr": 1.34217345898697e-07, "epoch": 2.6997333333333335, "percentage": 89.98, "elapsed_time": "0:46:34", "remaining_time": "0:05:11"}
233
+ {"current_steps": 2533, "total_steps": 2814, "loss": 0.5848, "lr": 1.3327899376419234e-07, "epoch": 2.7008, "percentage": 90.01, "elapsed_time": "0:46:45", "remaining_time": "0:05:11"}
234
+ {"current_steps": 2534, "total_steps": 2814, "loss": 0.5453, "lr": 1.3234384345470364e-07, "epoch": 2.7018666666666666, "percentage": 90.05, "elapsed_time": "0:46:55", "remaining_time": "0:05:11"}
235
+ {"current_steps": 2535, "total_steps": 2814, "loss": 0.5008, "lr": 1.314118962353522e-07, "epoch": 2.702933333333333, "percentage": 90.09, "elapsed_time": "0:47:06", "remaining_time": "0:05:11"}
236
+ {"current_steps": 2536, "total_steps": 2814, "loss": 0.5, "lr": 1.3048315336692484e-07, "epoch": 2.7039999999999997, "percentage": 90.12, "elapsed_time": "0:47:18", "remaining_time": "0:05:11"}
237
+ {"current_steps": 2537, "total_steps": 2814, "loss": 0.5095, "lr": 1.2955761610587481e-07, "epoch": 2.7050666666666667, "percentage": 90.16, "elapsed_time": "0:47:28", "remaining_time": "0:05:11"}
238
+ {"current_steps": 2538, "total_steps": 2814, "loss": 0.5118, "lr": 1.2863528570431633e-07, "epoch": 2.7061333333333333, "percentage": 90.19, "elapsed_time": "0:47:40", "remaining_time": "0:05:11"}
239
+ {"current_steps": 2539, "total_steps": 2814, "loss": 0.604, "lr": 1.2771616341002796e-07, "epoch": 2.7072000000000003, "percentage": 90.23, "elapsed_time": "0:47:50", "remaining_time": "0:05:10"}
240
+ {"current_steps": 2540, "total_steps": 2814, "loss": 0.5456, "lr": 1.2680025046644562e-07, "epoch": 2.708266666666667, "percentage": 90.26, "elapsed_time": "0:48:01", "remaining_time": "0:05:10"}
241
+ {"current_steps": 2541, "total_steps": 2814, "loss": 0.5803, "lr": 1.2588754811266591e-07, "epoch": 2.7093333333333334, "percentage": 90.3, "elapsed_time": "0:48:11", "remaining_time": "0:05:10"}
242
+ {"current_steps": 2542, "total_steps": 2814, "loss": 0.576, "lr": 1.2497805758344002e-07, "epoch": 2.7104, "percentage": 90.33, "elapsed_time": "0:48:22", "remaining_time": "0:05:10"}
243
+ {"current_steps": 2543, "total_steps": 2814, "loss": 0.5501, "lr": 1.2407178010917426e-07, "epoch": 2.7114666666666665, "percentage": 90.37, "elapsed_time": "0:48:34", "remaining_time": "0:05:10"}
244
+ {"current_steps": 2544, "total_steps": 2814, "loss": 0.5378, "lr": 1.2316871691592959e-07, "epoch": 2.7125333333333335, "percentage": 90.41, "elapsed_time": "0:48:45", "remaining_time": "0:05:10"}
245
+ {"current_steps": 2545, "total_steps": 2814, "loss": 0.5382, "lr": 1.2226886922541674e-07, "epoch": 2.7136, "percentage": 90.44, "elapsed_time": "0:48:57", "remaining_time": "0:05:10"}
246
+ {"current_steps": 2546, "total_steps": 2814, "loss": 0.4992, "lr": 1.2137223825499755e-07, "epoch": 2.7146666666666666, "percentage": 90.48, "elapsed_time": "0:49:08", "remaining_time": "0:05:10"}
247
+ {"current_steps": 2547, "total_steps": 2814, "loss": 0.5266, "lr": 1.2047882521768167e-07, "epoch": 2.7157333333333336, "percentage": 90.51, "elapsed_time": "0:49:20", "remaining_time": "0:05:10"}
248
+ {"current_steps": 2548, "total_steps": 2814, "loss": 0.5374, "lr": 1.1958863132212534e-07, "epoch": 2.7168, "percentage": 90.55, "elapsed_time": "0:49:30", "remaining_time": "0:05:10"}
249
+ {"current_steps": 2549, "total_steps": 2814, "loss": 0.5343, "lr": 1.1870165777262937e-07, "epoch": 2.7178666666666667, "percentage": 90.58, "elapsed_time": "0:49:41", "remaining_time": "0:05:10"}
250
+ {"current_steps": 2550, "total_steps": 2814, "loss": 0.5431, "lr": 1.1781790576913948e-07, "epoch": 2.718933333333333, "percentage": 90.62, "elapsed_time": "0:49:52", "remaining_time": "0:05:09"}
251
+ {"current_steps": 2551, "total_steps": 2814, "loss": 0.4893, "lr": 1.1693737650724069e-07, "epoch": 2.7199999999999998, "percentage": 90.65, "elapsed_time": "0:50:02", "remaining_time": "0:05:09"}
252
+ {"current_steps": 2552, "total_steps": 2814, "loss": 0.5551, "lr": 1.1606007117816037e-07, "epoch": 2.7210666666666667, "percentage": 90.69, "elapsed_time": "0:50:12", "remaining_time": "0:05:09"}
253
+ {"current_steps": 2553, "total_steps": 2814, "loss": 0.5272, "lr": 1.1518599096876277e-07, "epoch": 2.7221333333333333, "percentage": 90.72, "elapsed_time": "0:50:22", "remaining_time": "0:05:08"}
254
+ {"current_steps": 2554, "total_steps": 2814, "loss": 0.5514, "lr": 1.1431513706155028e-07, "epoch": 2.7232, "percentage": 90.76, "elapsed_time": "0:50:32", "remaining_time": "0:05:08"}
255
+ {"current_steps": 2555, "total_steps": 2814, "loss": 0.5145, "lr": 1.1344751063465969e-07, "epoch": 2.724266666666667, "percentage": 90.8, "elapsed_time": "0:50:42", "remaining_time": "0:05:08"}
256
+ {"current_steps": 2556, "total_steps": 2814, "loss": 0.5075, "lr": 1.1258311286186208e-07, "epoch": 2.7253333333333334, "percentage": 90.83, "elapsed_time": "0:50:54", "remaining_time": "0:05:08"}
257
+ {"current_steps": 2557, "total_steps": 2814, "loss": 0.4767, "lr": 1.1172194491256006e-07, "epoch": 2.7264, "percentage": 90.87, "elapsed_time": "0:51:05", "remaining_time": "0:05:08"}
258
+ {"current_steps": 2558, "total_steps": 2814, "loss": 0.5385, "lr": 1.1086400795178697e-07, "epoch": 2.7274666666666665, "percentage": 90.9, "elapsed_time": "0:51:15", "remaining_time": "0:05:07"}
259
+ {"current_steps": 2559, "total_steps": 2814, "loss": 0.5823, "lr": 1.1000930314020553e-07, "epoch": 2.7285333333333335, "percentage": 90.94, "elapsed_time": "0:51:25", "remaining_time": "0:05:07"}
260
+ {"current_steps": 2560, "total_steps": 2814, "loss": 0.5575, "lr": 1.0915783163410493e-07, "epoch": 2.7296, "percentage": 90.97, "elapsed_time": "0:51:36", "remaining_time": "0:05:07"}
261
+ {"current_steps": 2561, "total_steps": 2814, "loss": 0.5471, "lr": 1.0830959458540152e-07, "epoch": 2.7306666666666666, "percentage": 91.01, "elapsed_time": "0:51:46", "remaining_time": "0:05:06"}
262
+ {"current_steps": 2562, "total_steps": 2814, "loss": 0.5701, "lr": 1.0746459314163432e-07, "epoch": 2.7317333333333336, "percentage": 91.04, "elapsed_time": "0:51:58", "remaining_time": "0:05:06"}
263
+ {"current_steps": 2563, "total_steps": 2814, "loss": 0.5104, "lr": 1.0662282844596666e-07, "epoch": 2.7328, "percentage": 91.08, "elapsed_time": "0:52:07", "remaining_time": "0:05:06"}
264
+ {"current_steps": 2564, "total_steps": 2814, "loss": 0.5618, "lr": 1.0578430163718151e-07, "epoch": 2.7338666666666667, "percentage": 91.12, "elapsed_time": "0:52:17", "remaining_time": "0:05:05"}
265
+ {"current_steps": 2565, "total_steps": 2814, "loss": 0.4723, "lr": 1.0494901384968281e-07, "epoch": 2.734933333333333, "percentage": 91.15, "elapsed_time": "0:52:30", "remaining_time": "0:05:05"}
266
+ {"current_steps": 2566, "total_steps": 2814, "loss": 0.4977, "lr": 1.0411696621349137e-07, "epoch": 2.7359999999999998, "percentage": 91.19, "elapsed_time": "0:52:40", "remaining_time": "0:05:05"}
267
+ {"current_steps": 2567, "total_steps": 2814, "loss": 0.5096, "lr": 1.0328815985424539e-07, "epoch": 2.7370666666666668, "percentage": 91.22, "elapsed_time": "0:52:50", "remaining_time": "0:05:05"}
268
+ {"current_steps": 2568, "total_steps": 2814, "loss": 0.5514, "lr": 1.0246259589319768e-07, "epoch": 2.7381333333333333, "percentage": 91.26, "elapsed_time": "0:52:59", "remaining_time": "0:05:04"}
269
+ {"current_steps": 2569, "total_steps": 2814, "loss": 0.4977, "lr": 1.0164027544721511e-07, "epoch": 2.7392, "percentage": 91.29, "elapsed_time": "0:53:12", "remaining_time": "0:05:04"}
270
+ {"current_steps": 2570, "total_steps": 2814, "loss": 0.53, "lr": 1.0082119962877584e-07, "epoch": 2.740266666666667, "percentage": 91.33, "elapsed_time": "0:53:22", "remaining_time": "0:05:04"}
271
+ {"current_steps": 2571, "total_steps": 2814, "loss": 0.5141, "lr": 1.0000536954596851e-07, "epoch": 2.7413333333333334, "percentage": 91.36, "elapsed_time": "0:53:34", "remaining_time": "0:05:03"}
272
+ {"current_steps": 2572, "total_steps": 2814, "loss": 0.5541, "lr": 9.91927863024919e-08, "epoch": 2.7424, "percentage": 91.4, "elapsed_time": "0:53:44", "remaining_time": "0:05:03"}
273
+ {"current_steps": 2573, "total_steps": 2814, "loss": 0.4827, "lr": 9.838345099765084e-08, "epoch": 2.7434666666666665, "percentage": 91.44, "elapsed_time": "0:53:57", "remaining_time": "0:05:03"}
274
+ {"current_steps": 2574, "total_steps": 2814, "loss": 0.5758, "lr": 9.757736472635698e-08, "epoch": 2.7445333333333335, "percentage": 91.47, "elapsed_time": "0:54:09", "remaining_time": "0:05:02"}
275
+ {"current_steps": 2575, "total_steps": 2814, "loss": 0.5064, "lr": 9.677452857912662e-08, "epoch": 2.7456, "percentage": 91.51, "elapsed_time": "0:54:20", "remaining_time": "0:05:02"}
276
+ {"current_steps": 2576, "total_steps": 2814, "loss": 0.5292, "lr": 9.597494364207816e-08, "epoch": 2.7466666666666666, "percentage": 91.54, "elapsed_time": "0:54:29", "remaining_time": "0:05:02"}
277
+ {"current_steps": 2577, "total_steps": 2814, "loss": 0.4914, "lr": 9.517861099693271e-08, "epoch": 2.7477333333333336, "percentage": 91.58, "elapsed_time": "0:54:41", "remaining_time": "0:05:01"}
278
+ {"current_steps": 2578, "total_steps": 2814, "loss": 0.5748, "lr": 9.438553172101156e-08, "epoch": 2.7488, "percentage": 91.61, "elapsed_time": "0:54:52", "remaining_time": "0:05:01"}
279
+ {"current_steps": 2579, "total_steps": 2814, "loss": 0.5841, "lr": 9.35957068872334e-08, "epoch": 2.7498666666666667, "percentage": 91.65, "elapsed_time": "0:55:05", "remaining_time": "0:05:01"}
280
+ {"current_steps": 2580, "total_steps": 2814, "loss": 0.5271, "lr": 9.280913756411547e-08, "epoch": 2.7509333333333332, "percentage": 91.68, "elapsed_time": "0:55:16", "remaining_time": "0:05:00"}
281
+ {"current_steps": 2581, "total_steps": 2814, "loss": 0.5621, "lr": 9.20258248157696e-08, "epoch": 2.752, "percentage": 91.72, "elapsed_time": "0:55:27", "remaining_time": "0:05:00"}
282
+ {"current_steps": 2582, "total_steps": 2814, "loss": 0.5501, "lr": 9.124576970190369e-08, "epoch": 2.7530666666666668, "percentage": 91.76, "elapsed_time": "0:55:39", "remaining_time": "0:05:00"}
283
+ {"current_steps": 2583, "total_steps": 2814, "loss": 0.6103, "lr": 9.046897327781607e-08, "epoch": 2.7541333333333333, "percentage": 91.79, "elapsed_time": "0:55:49", "remaining_time": "0:04:59"}
284
+ {"current_steps": 2584, "total_steps": 2814, "loss": 0.5617, "lr": 8.969543659439917e-08, "epoch": 2.7552, "percentage": 91.83, "elapsed_time": "0:56:00", "remaining_time": "0:04:59"}
285
+ {"current_steps": 2585, "total_steps": 2814, "loss": 0.5972, "lr": 8.892516069813367e-08, "epoch": 2.756266666666667, "percentage": 91.86, "elapsed_time": "0:56:12", "remaining_time": "0:04:58"}
286
+ {"current_steps": 2586, "total_steps": 2814, "loss": 0.5502, "lr": 8.815814663108935e-08, "epoch": 2.7573333333333334, "percentage": 91.9, "elapsed_time": "0:56:23", "remaining_time": "0:04:58"}
287
+ {"current_steps": 2587, "total_steps": 2814, "loss": 0.5608, "lr": 8.739439543092282e-08, "epoch": 2.7584, "percentage": 91.93, "elapsed_time": "0:56:35", "remaining_time": "0:04:57"}
288
+ {"current_steps": 2588, "total_steps": 2814, "loss": 0.5232, "lr": 8.663390813087813e-08, "epoch": 2.7594666666666665, "percentage": 91.97, "elapsed_time": "0:56:44", "remaining_time": "0:04:57"}
289
+ {"current_steps": 2589, "total_steps": 2814, "loss": 0.5245, "lr": 8.587668575978176e-08, "epoch": 2.760533333333333, "percentage": 92.0, "elapsed_time": "0:56:57", "remaining_time": "0:04:56"}
290
+ {"current_steps": 2590, "total_steps": 2814, "loss": 0.5338, "lr": 8.512272934204451e-08, "epoch": 2.7616, "percentage": 92.04, "elapsed_time": "0:57:08", "remaining_time": "0:04:56"}
291
+ {"current_steps": 2591, "total_steps": 2814, "loss": 0.5377, "lr": 8.437203989765802e-08, "epoch": 2.7626666666666666, "percentage": 92.08, "elapsed_time": "0:57:18", "remaining_time": "0:04:55"}
292
+ {"current_steps": 2592, "total_steps": 2814, "loss": 0.565, "lr": 8.362461844219516e-08, "epoch": 2.7637333333333336, "percentage": 92.11, "elapsed_time": "0:57:29", "remaining_time": "0:04:55"}
293
+ {"current_steps": 2593, "total_steps": 2814, "loss": 0.5222, "lr": 8.288046598680627e-08, "epoch": 2.7648, "percentage": 92.15, "elapsed_time": "0:57:39", "remaining_time": "0:04:54"}
294
+ {"current_steps": 2594, "total_steps": 2814, "loss": 0.4689, "lr": 8.213958353822076e-08, "epoch": 2.7658666666666667, "percentage": 92.18, "elapsed_time": "0:57:52", "remaining_time": "0:04:54"}
295
+ {"current_steps": 2595, "total_steps": 2814, "loss": 0.4959, "lr": 8.1401972098743e-08, "epoch": 2.7669333333333332, "percentage": 92.22, "elapsed_time": "0:58:03", "remaining_time": "0:04:53"}
296
+ {"current_steps": 2596, "total_steps": 2814, "loss": 0.5525, "lr": 8.066763266625283e-08, "epoch": 2.768, "percentage": 92.25, "elapsed_time": "0:58:15", "remaining_time": "0:04:53"}
297
+ {"current_steps": 2597, "total_steps": 2814, "loss": 0.5081, "lr": 7.993656623420359e-08, "epoch": 2.769066666666667, "percentage": 92.29, "elapsed_time": "0:58:26", "remaining_time": "0:04:52"}
298
+ {"current_steps": 2598, "total_steps": 2814, "loss": 0.5046, "lr": 7.920877379162029e-08, "epoch": 2.7701333333333333, "percentage": 92.32, "elapsed_time": "0:58:38", "remaining_time": "0:04:52"}
299
+ {"current_steps": 2599, "total_steps": 2814, "loss": 0.5802, "lr": 7.848425632309892e-08, "epoch": 2.7712, "percentage": 92.36, "elapsed_time": "0:58:49", "remaining_time": "0:04:51"}
300
+ {"current_steps": 2600, "total_steps": 2814, "loss": 0.5517, "lr": 7.776301480880516e-08, "epoch": 2.772266666666667, "percentage": 92.4, "elapsed_time": "0:58:58", "remaining_time": "0:04:51"}
301
+ {"current_steps": 2601, "total_steps": 2814, "loss": 0.5182, "lr": 7.70450502244724e-08, "epoch": 2.7733333333333334, "percentage": 92.43, "elapsed_time": "1:01:02", "remaining_time": "0:04:59"}
302
+ {"current_steps": 2602, "total_steps": 2814, "loss": 0.5157, "lr": 7.633036354140088e-08, "epoch": 2.7744, "percentage": 92.47, "elapsed_time": "1:01:14", "remaining_time": "0:04:59"}
303
+ {"current_steps": 2603, "total_steps": 2814, "loss": 0.5128, "lr": 7.561895572645666e-08, "epoch": 2.7754666666666665, "percentage": 92.5, "elapsed_time": "1:01:23", "remaining_time": "0:04:58"}
304
+ {"current_steps": 2604, "total_steps": 2814, "loss": 0.472, "lr": 7.491082774206931e-08, "epoch": 2.776533333333333, "percentage": 92.54, "elapsed_time": "1:01:37", "remaining_time": "0:04:58"}
305
+ {"current_steps": 2605, "total_steps": 2814, "loss": 0.5226, "lr": 7.420598054623224e-08, "epoch": 2.7776, "percentage": 92.57, "elapsed_time": "1:01:48", "remaining_time": "0:04:57"}
306
+ {"current_steps": 2606, "total_steps": 2814, "loss": 0.5879, "lr": 7.350441509249989e-08, "epoch": 2.7786666666666666, "percentage": 92.61, "elapsed_time": "1:01:58", "remaining_time": "0:04:56"}
307
+ {"current_steps": 2607, "total_steps": 2814, "loss": 0.5826, "lr": 7.280613232998668e-08, "epoch": 2.779733333333333, "percentage": 92.64, "elapsed_time": "1:02:09", "remaining_time": "0:04:56"}
308
+ {"current_steps": 2608, "total_steps": 2814, "loss": 0.4977, "lr": 7.211113320336665e-08, "epoch": 2.7808, "percentage": 92.68, "elapsed_time": "1:02:21", "remaining_time": "0:04:55"}
309
+ {"current_steps": 2609, "total_steps": 2814, "loss": 0.5358, "lr": 7.141941865287106e-08, "epoch": 2.7818666666666667, "percentage": 92.71, "elapsed_time": "1:02:31", "remaining_time": "0:04:54"}
310
+ {"current_steps": 2610, "total_steps": 2814, "loss": 0.4717, "lr": 7.073098961428799e-08, "epoch": 2.7829333333333333, "percentage": 92.75, "elapsed_time": "1:02:44", "remaining_time": "0:04:54"}
311
+ {"current_steps": 2611, "total_steps": 2814, "loss": 0.4954, "lr": 7.004584701896077e-08, "epoch": 2.784, "percentage": 92.79, "elapsed_time": "1:02:57", "remaining_time": "0:04:53"}
312
+ {"current_steps": 2612, "total_steps": 2814, "loss": 0.5408, "lr": 6.936399179378577e-08, "epoch": 2.785066666666667, "percentage": 92.82, "elapsed_time": "1:03:07", "remaining_time": "0:04:52"}
313
+ {"current_steps": 2613, "total_steps": 2814, "loss": 0.6383, "lr": 6.868542486121315e-08, "epoch": 2.7861333333333334, "percentage": 92.86, "elapsed_time": "1:03:17", "remaining_time": "0:04:52"}
314
+ {"current_steps": 2614, "total_steps": 2814, "loss": 0.5163, "lr": 6.801014713924442e-08, "epoch": 2.7872, "percentage": 92.89, "elapsed_time": "1:03:28", "remaining_time": "0:04:51"}
315
+ {"current_steps": 2615, "total_steps": 2814, "loss": 0.5047, "lr": 6.73381595414302e-08, "epoch": 2.788266666666667, "percentage": 92.93, "elapsed_time": "1:03:42", "remaining_time": "0:04:50"}
316
+ {"current_steps": 2616, "total_steps": 2814, "loss": 0.5421, "lr": 6.666946297687133e-08, "epoch": 2.7893333333333334, "percentage": 92.96, "elapsed_time": "1:03:54", "remaining_time": "0:04:50"}
317
+ {"current_steps": 2617, "total_steps": 2814, "loss": 0.5908, "lr": 6.600405835021529e-08, "epoch": 2.7904, "percentage": 93.0, "elapsed_time": "1:04:04", "remaining_time": "0:04:49"}
318
+ {"current_steps": 2618, "total_steps": 2814, "loss": 0.5551, "lr": 6.534194656165699e-08, "epoch": 2.7914666666666665, "percentage": 93.03, "elapsed_time": "1:04:15", "remaining_time": "0:04:48"}
319
+ {"current_steps": 2619, "total_steps": 2814, "loss": 0.5983, "lr": 6.468312850693576e-08, "epoch": 2.792533333333333, "percentage": 93.07, "elapsed_time": "1:04:24", "remaining_time": "0:04:47"}
320
+ {"current_steps": 2620, "total_steps": 2814, "loss": 0.5078, "lr": 6.402760507733585e-08, "epoch": 2.7936, "percentage": 93.11, "elapsed_time": "1:04:35", "remaining_time": "0:04:46"}
321
+ {"current_steps": 2621, "total_steps": 2814, "loss": 0.5701, "lr": 6.337537715968345e-08, "epoch": 2.7946666666666666, "percentage": 93.14, "elapsed_time": "1:04:45", "remaining_time": "0:04:46"}
322
+ {"current_steps": 2622, "total_steps": 2814, "loss": 0.5352, "lr": 6.272644563634744e-08, "epoch": 2.795733333333333, "percentage": 93.18, "elapsed_time": "1:04:56", "remaining_time": "0:04:45"}
323
+ {"current_steps": 2623, "total_steps": 2814, "loss": 0.5594, "lr": 6.208081138523614e-08, "epoch": 2.7968, "percentage": 93.21, "elapsed_time": "1:05:09", "remaining_time": "0:04:44"}
324
+ {"current_steps": 2624, "total_steps": 2814, "loss": 0.4807, "lr": 6.143847527979808e-08, "epoch": 2.7978666666666667, "percentage": 93.25, "elapsed_time": "1:05:20", "remaining_time": "0:04:43"}
325
+ {"current_steps": 2625, "total_steps": 2814, "loss": 0.4885, "lr": 6.079943818901895e-08, "epoch": 2.7989333333333333, "percentage": 93.28, "elapsed_time": "1:05:33", "remaining_time": "0:04:43"}
326
+ {"current_steps": 2626, "total_steps": 2814, "loss": 0.5547, "lr": 6.016370097742224e-08, "epoch": 2.8, "percentage": 93.32, "elapsed_time": "1:05:44", "remaining_time": "0:04:42"}
327
+ {"current_steps": 2627, "total_steps": 2814, "loss": 0.4851, "lr": 5.9531264505066354e-08, "epoch": 2.801066666666667, "percentage": 93.35, "elapsed_time": "1:05:58", "remaining_time": "0:04:41"}
328
+ {"current_steps": 2628, "total_steps": 2814, "loss": 0.5355, "lr": 5.8902129627545216e-08, "epoch": 2.8021333333333334, "percentage": 93.39, "elapsed_time": "1:06:09", "remaining_time": "0:04:40"}
329
+ {"current_steps": 2629, "total_steps": 2814, "loss": 0.5568, "lr": 5.827629719598521e-08, "epoch": 2.8032, "percentage": 93.43, "elapsed_time": "1:06:21", "remaining_time": "0:04:40"}
330
+ {"current_steps": 2630, "total_steps": 2814, "loss": 0.6018, "lr": 5.7653768057045757e-08, "epoch": 2.804266666666667, "percentage": 93.46, "elapsed_time": "1:06:32", "remaining_time": "0:04:39"}
331
+ {"current_steps": 2631, "total_steps": 2814, "loss": 0.5157, "lr": 5.7034543052917335e-08, "epoch": 2.8053333333333335, "percentage": 93.5, "elapsed_time": "1:06:41", "remaining_time": "0:04:38"}
332
+ {"current_steps": 2632, "total_steps": 2814, "loss": 0.5141, "lr": 5.641862302131928e-08, "epoch": 2.8064, "percentage": 93.53, "elapsed_time": "1:06:51", "remaining_time": "0:04:37"}
333
+ {"current_steps": 2633, "total_steps": 2814, "loss": 0.5397, "lr": 5.5806008795501745e-08, "epoch": 2.8074666666666666, "percentage": 93.57, "elapsed_time": "1:07:00", "remaining_time": "0:04:36"}
334
+ {"current_steps": 2634, "total_steps": 2814, "loss": 0.4664, "lr": 5.519670120424064e-08, "epoch": 2.808533333333333, "percentage": 93.6, "elapsed_time": "1:07:09", "remaining_time": "0:04:35"}
335
+ {"current_steps": 2635, "total_steps": 2814, "loss": 0.5425, "lr": 5.45907010718405e-08, "epoch": 2.8096, "percentage": 93.64, "elapsed_time": "1:07:20", "remaining_time": "0:04:34"}
336
+ {"current_steps": 2636, "total_steps": 2814, "loss": 0.5275, "lr": 5.3988009218129135e-08, "epoch": 2.8106666666666666, "percentage": 93.67, "elapsed_time": "1:07:31", "remaining_time": "0:04:33"}
337
+ {"current_steps": 2637, "total_steps": 2814, "loss": 0.5458, "lr": 5.3388626458460714e-08, "epoch": 2.811733333333333, "percentage": 93.71, "elapsed_time": "1:07:40", "remaining_time": "0:04:32"}
338
+ {"current_steps": 2638, "total_steps": 2814, "loss": 0.5699, "lr": 5.2792553603711326e-08, "epoch": 2.8128, "percentage": 93.75, "elapsed_time": "1:07:50", "remaining_time": "0:04:31"}
339
+ {"current_steps": 2639, "total_steps": 2814, "loss": 0.5077, "lr": 5.219979146028037e-08, "epoch": 2.8138666666666667, "percentage": 93.78, "elapsed_time": "1:08:00", "remaining_time": "0:04:30"}
340
+ {"current_steps": 2640, "total_steps": 2814, "loss": 0.4941, "lr": 5.161034083008693e-08, "epoch": 2.8149333333333333, "percentage": 93.82, "elapsed_time": "1:08:11", "remaining_time": "0:04:29"}
341
+ {"current_steps": 2641, "total_steps": 2814, "loss": 0.5459, "lr": 5.102420251057144e-08, "epoch": 2.816, "percentage": 93.85, "elapsed_time": "1:08:21", "remaining_time": "0:04:28"}
342
+ {"current_steps": 2642, "total_steps": 2814, "loss": 0.5001, "lr": 5.0441377294692675e-08, "epoch": 2.817066666666667, "percentage": 93.89, "elapsed_time": "1:08:31", "remaining_time": "0:04:27"}
343
+ {"current_steps": 2643, "total_steps": 2814, "loss": 0.525, "lr": 4.9861865970927404e-08, "epoch": 2.8181333333333334, "percentage": 93.92, "elapsed_time": "1:08:42", "remaining_time": "0:04:26"}
344
+ {"current_steps": 2644, "total_steps": 2814, "loss": 0.5591, "lr": 4.928566932326906e-08, "epoch": 2.8192, "percentage": 93.96, "elapsed_time": "1:08:53", "remaining_time": "0:04:25"}
345
+ {"current_steps": 2645, "total_steps": 2814, "loss": 0.5258, "lr": 4.871278813122632e-08, "epoch": 2.820266666666667, "percentage": 93.99, "elapsed_time": "1:09:05", "remaining_time": "0:04:24"}
346
+ {"current_steps": 2646, "total_steps": 2814, "loss": 0.6251, "lr": 4.814322316982395e-08, "epoch": 2.8213333333333335, "percentage": 94.03, "elapsed_time": "1:09:13", "remaining_time": "0:04:23"}
347
+ {"current_steps": 2647, "total_steps": 2814, "loss": 0.5052, "lr": 4.7576975209599195e-08, "epoch": 2.8224, "percentage": 94.07, "elapsed_time": "1:09:23", "remaining_time": "0:04:22"}
348
+ {"current_steps": 2648, "total_steps": 2814, "loss": 0.4799, "lr": 4.7014045016601784e-08, "epoch": 2.8234666666666666, "percentage": 94.1, "elapsed_time": "1:09:35", "remaining_time": "0:04:21"}
349
+ {"current_steps": 2649, "total_steps": 2814, "loss": 0.5598, "lr": 4.6454433352393634e-08, "epoch": 2.824533333333333, "percentage": 94.14, "elapsed_time": "1:09:45", "remaining_time": "0:04:20"}
350
+ {"current_steps": 2650, "total_steps": 2814, "loss": 0.5301, "lr": 4.589814097404694e-08, "epoch": 2.8256, "percentage": 94.17, "elapsed_time": "1:09:57", "remaining_time": "0:04:19"}
351
+ {"current_steps": 2651, "total_steps": 2814, "loss": 0.5262, "lr": 4.534516863414329e-08, "epoch": 2.8266666666666667, "percentage": 94.21, "elapsed_time": "1:10:09", "remaining_time": "0:04:18"}
352
+ {"current_steps": 2652, "total_steps": 2814, "loss": 0.5068, "lr": 4.4795517080773175e-08, "epoch": 2.827733333333333, "percentage": 94.24, "elapsed_time": "1:10:20", "remaining_time": "0:04:17"}
353
+ {"current_steps": 2653, "total_steps": 2814, "loss": 0.5667, "lr": 4.4249187057533725e-08, "epoch": 2.8288, "percentage": 94.28, "elapsed_time": "1:10:31", "remaining_time": "0:04:16"}
354
+ {"current_steps": 2654, "total_steps": 2814, "loss": 0.5633, "lr": 4.3706179303529276e-08, "epoch": 2.8298666666666668, "percentage": 94.31, "elapsed_time": "1:10:43", "remaining_time": "0:04:15"}
355
+ {"current_steps": 2655, "total_steps": 2814, "loss": 0.5019, "lr": 4.316649455336913e-08, "epoch": 2.8309333333333333, "percentage": 94.35, "elapsed_time": "1:10:54", "remaining_time": "0:04:14"}
356
+ {"current_steps": 2656, "total_steps": 2814, "loss": 0.53, "lr": 4.2630133537167885e-08, "epoch": 2.832, "percentage": 94.39, "elapsed_time": "1:11:05", "remaining_time": "0:04:13"}
357
+ {"current_steps": 2657, "total_steps": 2814, "loss": 0.5524, "lr": 4.209709698054232e-08, "epoch": 2.8330666666666664, "percentage": 94.42, "elapsed_time": "1:11:14", "remaining_time": "0:04:12"}
358
+ {"current_steps": 2658, "total_steps": 2814, "loss": 0.5574, "lr": 4.156738560461282e-08, "epoch": 2.8341333333333334, "percentage": 94.46, "elapsed_time": "1:11:25", "remaining_time": "0:04:11"}
359
+ {"current_steps": 2659, "total_steps": 2814, "loss": 0.6054, "lr": 4.1041000126000595e-08, "epoch": 2.8352, "percentage": 94.49, "elapsed_time": "1:11:38", "remaining_time": "0:04:10"}
360
+ {"current_steps": 2660, "total_steps": 2814, "loss": 0.6251, "lr": 4.051794125682767e-08, "epoch": 2.836266666666667, "percentage": 94.53, "elapsed_time": "1:11:48", "remaining_time": "0:04:09"}
361
+ {"current_steps": 2661, "total_steps": 2814, "loss": 0.568, "lr": 3.999820970471635e-08, "epoch": 2.8373333333333335, "percentage": 94.56, "elapsed_time": "1:12:00", "remaining_time": "0:04:08"}
362
+ {"current_steps": 2662, "total_steps": 2814, "loss": 0.5269, "lr": 3.948180617278613e-08, "epoch": 2.8384, "percentage": 94.6, "elapsed_time": "1:12:12", "remaining_time": "0:04:07"}
363
+ {"current_steps": 2663, "total_steps": 2814, "loss": 0.574, "lr": 3.896873135965484e-08, "epoch": 2.8394666666666666, "percentage": 94.63, "elapsed_time": "1:12:23", "remaining_time": "0:04:06"}
364
+ {"current_steps": 2664, "total_steps": 2814, "loss": 0.564, "lr": 3.845898595943809e-08, "epoch": 2.840533333333333, "percentage": 94.67, "elapsed_time": "1:12:32", "remaining_time": "0:04:05"}
365
+ {"current_steps": 2665, "total_steps": 2814, "loss": 0.4854, "lr": 3.795257066174535e-08, "epoch": 2.8416, "percentage": 94.71, "elapsed_time": "1:12:41", "remaining_time": "0:04:03"}
366
+ {"current_steps": 2666, "total_steps": 2814, "loss": 0.5019, "lr": 3.744948615168248e-08, "epoch": 2.8426666666666667, "percentage": 94.74, "elapsed_time": "1:12:51", "remaining_time": "0:04:02"}
367
+ {"current_steps": 2667, "total_steps": 2814, "loss": 0.5404, "lr": 3.6949733109848395e-08, "epoch": 2.8437333333333332, "percentage": 94.78, "elapsed_time": "1:13:02", "remaining_time": "0:04:01"}
368
+ {"current_steps": 2668, "total_steps": 2814, "loss": 0.512, "lr": 3.645331221233589e-08, "epoch": 2.8448, "percentage": 94.81, "elapsed_time": "1:13:13", "remaining_time": "0:04:00"}
369
+ {"current_steps": 2669, "total_steps": 2814, "loss": 0.4987, "lr": 3.596022413072886e-08, "epoch": 2.8458666666666668, "percentage": 94.85, "elapsed_time": "1:13:24", "remaining_time": "0:03:59"}
370
+ {"current_steps": 2670, "total_steps": 2814, "loss": 0.5153, "lr": 3.5470469532103146e-08, "epoch": 2.8469333333333333, "percentage": 94.88, "elapsed_time": "1:13:36", "remaining_time": "0:03:58"}
371
+ {"current_steps": 2671, "total_steps": 2814, "loss": 0.56, "lr": 3.4984049079024584e-08, "epoch": 2.848, "percentage": 94.92, "elapsed_time": "1:13:49", "remaining_time": "0:03:57"}
372
+ {"current_steps": 2672, "total_steps": 2814, "loss": 0.521, "lr": 3.450096342954817e-08, "epoch": 2.8490666666666664, "percentage": 94.95, "elapsed_time": "1:14:01", "remaining_time": "0:03:56"}
373
+ {"current_steps": 2673, "total_steps": 2814, "loss": 0.531, "lr": 3.40212132372178e-08, "epoch": 2.8501333333333334, "percentage": 94.99, "elapsed_time": "1:14:11", "remaining_time": "0:03:54"}
374
+ {"current_steps": 2674, "total_steps": 2814, "loss": 0.4978, "lr": 3.354479915106512e-08, "epoch": 2.8512, "percentage": 95.02, "elapsed_time": "1:14:22", "remaining_time": "0:03:53"}
375
+ {"current_steps": 2675, "total_steps": 2814, "loss": 0.5316, "lr": 3.307172181560791e-08, "epoch": 2.8522666666666665, "percentage": 95.06, "elapsed_time": "1:14:32", "remaining_time": "0:03:52"}
376
+ {"current_steps": 2676, "total_steps": 2814, "loss": 0.5223, "lr": 3.2601981870850065e-08, "epoch": 2.8533333333333335, "percentage": 95.1, "elapsed_time": "1:14:43", "remaining_time": "0:03:51"}
377
+ {"current_steps": 2677, "total_steps": 2814, "loss": 0.5383, "lr": 3.2135579952281005e-08, "epoch": 2.8544, "percentage": 95.13, "elapsed_time": "1:14:54", "remaining_time": "0:03:50"}
378
+ {"current_steps": 2678, "total_steps": 2814, "loss": 0.563, "lr": 3.167251669087323e-08, "epoch": 2.8554666666666666, "percentage": 95.17, "elapsed_time": "1:15:05", "remaining_time": "0:03:48"}
379
+ {"current_steps": 2679, "total_steps": 2814, "loss": 0.5344, "lr": 3.1212792713083696e-08, "epoch": 2.856533333333333, "percentage": 95.2, "elapsed_time": "1:15:18", "remaining_time": "0:03:47"}
380
+ {"current_steps": 2680, "total_steps": 2814, "loss": 0.5301, "lr": 3.075640864085072e-08, "epoch": 2.8576, "percentage": 95.24, "elapsed_time": "1:15:29", "remaining_time": "0:03:46"}
381
+ {"current_steps": 2681, "total_steps": 2814, "loss": 0.6039, "lr": 3.030336509159543e-08, "epoch": 2.8586666666666667, "percentage": 95.27, "elapsed_time": "1:15:40", "remaining_time": "0:03:45"}
382
+ {"current_steps": 2682, "total_steps": 2814, "loss": 0.5478, "lr": 2.9853662678218376e-08, "epoch": 2.8597333333333332, "percentage": 95.31, "elapsed_time": "1:15:51", "remaining_time": "0:03:44"}
383
+ {"current_steps": 2683, "total_steps": 2814, "loss": 0.5019, "lr": 2.9407302009101247e-08, "epoch": 2.8608000000000002, "percentage": 95.34, "elapsed_time": "1:16:02", "remaining_time": "0:03:42"}
384
+ {"current_steps": 2684, "total_steps": 2814, "loss": 0.5394, "lr": 2.896428368810378e-08, "epoch": 2.861866666666667, "percentage": 95.38, "elapsed_time": "1:16:15", "remaining_time": "0:03:41"}
385
+ {"current_steps": 2685, "total_steps": 2814, "loss": 0.549, "lr": 2.8524608314564882e-08, "epoch": 2.8629333333333333, "percentage": 95.42, "elapsed_time": "1:16:28", "remaining_time": "0:03:40"}
386
+ {"current_steps": 2686, "total_steps": 2814, "loss": 0.5627, "lr": 2.8088276483300425e-08, "epoch": 2.864, "percentage": 95.45, "elapsed_time": "1:16:39", "remaining_time": "0:03:39"}
387
+ {"current_steps": 2687, "total_steps": 2814, "loss": 0.5129, "lr": 2.765528878460322e-08, "epoch": 2.8650666666666664, "percentage": 95.49, "elapsed_time": "1:16:51", "remaining_time": "0:03:37"}
388
+ {"current_steps": 2688, "total_steps": 2814, "loss": 0.5313, "lr": 2.722564580424192e-08, "epoch": 2.8661333333333334, "percentage": 95.52, "elapsed_time": "1:17:02", "remaining_time": "0:03:36"}
389
+ {"current_steps": 2689, "total_steps": 2814, "loss": 0.5017, "lr": 2.679934812346019e-08, "epoch": 2.8672, "percentage": 95.56, "elapsed_time": "1:17:11", "remaining_time": "0:03:35"}
390
+ {"current_steps": 2690, "total_steps": 2814, "loss": 0.5435, "lr": 2.6376396318975874e-08, "epoch": 2.8682666666666665, "percentage": 95.59, "elapsed_time": "1:17:21", "remaining_time": "0:03:33"}
391
+ {"current_steps": 2691, "total_steps": 2814, "loss": 0.606, "lr": 2.5956790962980427e-08, "epoch": 2.8693333333333335, "percentage": 95.63, "elapsed_time": "1:17:30", "remaining_time": "0:03:32"}
392
+ {"current_steps": 2692, "total_steps": 2814, "loss": 0.5902, "lr": 2.5540532623138657e-08, "epoch": 2.8704, "percentage": 95.66, "elapsed_time": "1:17:41", "remaining_time": "0:03:31"}
393
+ {"current_steps": 2693, "total_steps": 2814, "loss": 0.5051, "lr": 2.5127621862585938e-08, "epoch": 2.8714666666666666, "percentage": 95.7, "elapsed_time": "1:17:53", "remaining_time": "0:03:29"}
394
+ {"current_steps": 2694, "total_steps": 2814, "loss": 0.5315, "lr": 2.4718059239930426e-08, "epoch": 2.872533333333333, "percentage": 95.74, "elapsed_time": "1:18:04", "remaining_time": "0:03:28"}
395
+ {"current_steps": 2695, "total_steps": 2814, "loss": 0.5151, "lr": 2.4311845309249737e-08, "epoch": 2.8736, "percentage": 95.77, "elapsed_time": "1:18:16", "remaining_time": "0:03:27"}
396
+ {"current_steps": 2696, "total_steps": 2814, "loss": 0.4949, "lr": 2.3908980620091783e-08, "epoch": 2.8746666666666667, "percentage": 95.81, "elapsed_time": "1:18:27", "remaining_time": "0:03:26"}
397
+ {"current_steps": 2697, "total_steps": 2814, "loss": 0.4955, "lr": 2.3509465717472536e-08, "epoch": 2.8757333333333333, "percentage": 95.84, "elapsed_time": "1:18:37", "remaining_time": "0:03:24"}
398
+ {"current_steps": 2698, "total_steps": 2814, "loss": 0.5264, "lr": 2.311330114187743e-08, "epoch": 2.8768000000000002, "percentage": 95.88, "elapsed_time": "1:18:47", "remaining_time": "0:03:23"}
399
+ {"current_steps": 2699, "total_steps": 2814, "loss": 0.5884, "lr": 2.2720487429258587e-08, "epoch": 2.877866666666667, "percentage": 95.91, "elapsed_time": "1:18:56", "remaining_time": "0:03:21"}
400
+ {"current_steps": 2700, "total_steps": 2814, "loss": 0.5089, "lr": 2.2331025111035353e-08, "epoch": 2.8789333333333333, "percentage": 95.95, "elapsed_time": "1:19:07", "remaining_time": "0:03:20"}
401
+ {"current_steps": 2701, "total_steps": 2814, "loss": 0.5599, "lr": 2.194491471409238e-08, "epoch": 2.88, "percentage": 95.98, "elapsed_time": "1:21:02", "remaining_time": "0:03:23"}
402
+ {"current_steps": 2702, "total_steps": 2814, "loss": 0.5511, "lr": 2.1562156760780715e-08, "epoch": 2.8810666666666664, "percentage": 96.02, "elapsed_time": "1:21:13", "remaining_time": "0:03:22"}
403
+ {"current_steps": 2703, "total_steps": 2814, "loss": 0.5205, "lr": 2.1182751768915044e-08, "epoch": 2.8821333333333334, "percentage": 96.06, "elapsed_time": "1:21:26", "remaining_time": "0:03:20"}
404
+ {"current_steps": 2704, "total_steps": 2814, "loss": 0.5015, "lr": 2.0806700251775057e-08, "epoch": 2.8832, "percentage": 96.09, "elapsed_time": "1:21:38", "remaining_time": "0:03:19"}
405
+ {"current_steps": 2705, "total_steps": 2814, "loss": 0.5374, "lr": 2.043400271810242e-08, "epoch": 2.8842666666666665, "percentage": 96.13, "elapsed_time": "1:21:49", "remaining_time": "0:03:17"}
406
+ {"current_steps": 2706, "total_steps": 2814, "loss": 0.5208, "lr": 2.0064659672102414e-08, "epoch": 2.8853333333333335, "percentage": 96.16, "elapsed_time": "1:22:01", "remaining_time": "0:03:16"}
407
+ {"current_steps": 2707, "total_steps": 2814, "loss": 0.4896, "lr": 1.9698671613441746e-08, "epoch": 2.8864, "percentage": 96.2, "elapsed_time": "1:22:13", "remaining_time": "0:03:15"}
408
+ {"current_steps": 2708, "total_steps": 2814, "loss": 0.5339, "lr": 1.9336039037247954e-08, "epoch": 2.8874666666666666, "percentage": 96.23, "elapsed_time": "1:22:25", "remaining_time": "0:03:13"}
409
+ {"current_steps": 2709, "total_steps": 2814, "loss": 0.504, "lr": 1.8976762434110284e-08, "epoch": 2.888533333333333, "percentage": 96.27, "elapsed_time": "1:22:38", "remaining_time": "0:03:12"}
410
+ {"current_steps": 2710, "total_steps": 2814, "loss": 0.5555, "lr": 1.8620842290076324e-08, "epoch": 2.8895999999999997, "percentage": 96.3, "elapsed_time": "1:22:51", "remaining_time": "0:03:10"}
411
+ {"current_steps": 2711, "total_steps": 2814, "loss": 0.5105, "lr": 1.8268279086654238e-08, "epoch": 2.8906666666666667, "percentage": 96.34, "elapsed_time": "1:23:02", "remaining_time": "0:03:09"}
412
+ {"current_steps": 2712, "total_steps": 2814, "loss": 0.548, "lr": 1.791907330080972e-08, "epoch": 2.8917333333333333, "percentage": 96.38, "elapsed_time": "1:23:14", "remaining_time": "0:03:07"}
413
+ {"current_steps": 2713, "total_steps": 2814, "loss": 0.5406, "lr": 1.757322540496681e-08, "epoch": 2.8928000000000003, "percentage": 96.41, "elapsed_time": "1:23:24", "remaining_time": "0:03:06"}
414
+ {"current_steps": 2714, "total_steps": 2814, "loss": 0.5479, "lr": 1.7230735867007075e-08, "epoch": 2.893866666666667, "percentage": 96.45, "elapsed_time": "1:23:37", "remaining_time": "0:03:04"}
415
+ {"current_steps": 2715, "total_steps": 2814, "loss": 0.4937, "lr": 1.689160515026822e-08, "epoch": 2.8949333333333334, "percentage": 96.48, "elapsed_time": "1:23:48", "remaining_time": "0:03:03"}
416
+ {"current_steps": 2716, "total_steps": 2814, "loss": 0.5049, "lr": 1.65558337135438e-08, "epoch": 2.896, "percentage": 96.52, "elapsed_time": "1:24:00", "remaining_time": "0:03:01"}
417
+ {"current_steps": 2717, "total_steps": 2814, "loss": 0.5448, "lr": 1.6223422011083788e-08, "epoch": 2.8970666666666665, "percentage": 96.55, "elapsed_time": "1:24:12", "remaining_time": "0:03:00"}
418
+ {"current_steps": 2718, "total_steps": 2814, "loss": 0.5096, "lr": 1.5894370492592072e-08, "epoch": 2.8981333333333335, "percentage": 96.59, "elapsed_time": "1:24:23", "remaining_time": "0:02:58"}
419
+ {"current_steps": 2719, "total_steps": 2814, "loss": 0.5497, "lr": 1.556867960322672e-08, "epoch": 2.8992, "percentage": 96.62, "elapsed_time": "1:24:35", "remaining_time": "0:02:57"}
420
+ {"current_steps": 2720, "total_steps": 2814, "loss": 0.5143, "lr": 1.5246349783599456e-08, "epoch": 2.9002666666666665, "percentage": 96.66, "elapsed_time": "1:24:45", "remaining_time": "0:02:55"}
421
+ {"current_steps": 2721, "total_steps": 2814, "loss": 0.5311, "lr": 1.4927381469775627e-08, "epoch": 2.9013333333333335, "percentage": 96.7, "elapsed_time": "1:24:57", "remaining_time": "0:02:54"}
422
+ {"current_steps": 2722, "total_steps": 2814, "loss": 0.5589, "lr": 1.4611775093271718e-08, "epoch": 2.9024, "percentage": 96.73, "elapsed_time": "1:25:10", "remaining_time": "0:02:52"}
423
+ {"current_steps": 2723, "total_steps": 2814, "loss": 0.5693, "lr": 1.4299531081057028e-08, "epoch": 2.9034666666666666, "percentage": 96.77, "elapsed_time": "1:25:21", "remaining_time": "0:02:51"}
424
+ {"current_steps": 2724, "total_steps": 2814, "loss": 0.5373, "lr": 1.399064985555143e-08, "epoch": 2.904533333333333, "percentage": 96.8, "elapsed_time": "1:25:31", "remaining_time": "0:02:49"}
425
+ {"current_steps": 2725, "total_steps": 2814, "loss": 0.5801, "lr": 1.368513183462622e-08, "epoch": 2.9055999999999997, "percentage": 96.84, "elapsed_time": "1:25:43", "remaining_time": "0:02:47"}
426
+ {"current_steps": 2726, "total_steps": 2814, "loss": 0.5244, "lr": 1.3382977431602162e-08, "epoch": 2.9066666666666667, "percentage": 96.87, "elapsed_time": "1:25:54", "remaining_time": "0:02:46"}
427
+ {"current_steps": 2727, "total_steps": 2814, "loss": 0.5161, "lr": 1.3084187055249497e-08, "epoch": 2.9077333333333333, "percentage": 96.91, "elapsed_time": "1:26:06", "remaining_time": "0:02:44"}
428
+ {"current_steps": 2728, "total_steps": 2814, "loss": 0.5364, "lr": 1.2788761109787662e-08, "epoch": 2.9088000000000003, "percentage": 96.94, "elapsed_time": "1:26:19", "remaining_time": "0:02:43"}
429
+ {"current_steps": 2729, "total_steps": 2814, "loss": 0.532, "lr": 1.2496699994884175e-08, "epoch": 2.909866666666667, "percentage": 96.98, "elapsed_time": "1:26:30", "remaining_time": "0:02:41"}
430
+ {"current_steps": 2730, "total_steps": 2814, "loss": 0.4543, "lr": 1.220800410565548e-08, "epoch": 2.9109333333333334, "percentage": 97.01, "elapsed_time": "1:26:40", "remaining_time": "0:02:40"}
431
+ {"current_steps": 2731, "total_steps": 2814, "loss": 0.4963, "lr": 1.1922673832663601e-08, "epoch": 2.912, "percentage": 97.05, "elapsed_time": "1:26:50", "remaining_time": "0:02:38"}
432
+ {"current_steps": 2732, "total_steps": 2814, "loss": 0.541, "lr": 1.1640709561919483e-08, "epoch": 2.9130666666666665, "percentage": 97.09, "elapsed_time": "1:27:02", "remaining_time": "0:02:36"}
433
+ {"current_steps": 2733, "total_steps": 2814, "loss": 0.5668, "lr": 1.1362111674878274e-08, "epoch": 2.9141333333333335, "percentage": 97.12, "elapsed_time": "1:27:13", "remaining_time": "0:02:35"}
434
+ {"current_steps": 2734, "total_steps": 2814, "loss": 0.5824, "lr": 1.1086880548442369e-08, "epoch": 2.9152, "percentage": 97.16, "elapsed_time": "1:27:23", "remaining_time": "0:02:33"}
435
+ {"current_steps": 2735, "total_steps": 2814, "loss": 0.571, "lr": 1.0815016554959202e-08, "epoch": 2.9162666666666666, "percentage": 97.19, "elapsed_time": "1:27:33", "remaining_time": "0:02:31"}
436
+ {"current_steps": 2736, "total_steps": 2814, "loss": 0.5495, "lr": 1.0546520062220123e-08, "epoch": 2.9173333333333336, "percentage": 97.23, "elapsed_time": "1:27:44", "remaining_time": "0:02:30"}
437
+ {"current_steps": 2737, "total_steps": 2814, "loss": 0.5611, "lr": 1.0281391433461795e-08, "epoch": 2.9184, "percentage": 97.26, "elapsed_time": "1:27:54", "remaining_time": "0:02:28"}
438
+ {"current_steps": 2738, "total_steps": 2814, "loss": 0.5602, "lr": 1.0019631027363975e-08, "epoch": 2.9194666666666667, "percentage": 97.3, "elapsed_time": "1:28:04", "remaining_time": "0:02:26"}
439
+ {"current_steps": 2739, "total_steps": 2814, "loss": 0.5725, "lr": 9.761239198050055e-09, "epoch": 2.920533333333333, "percentage": 97.33, "elapsed_time": "1:28:16", "remaining_time": "0:02:25"}
440
+ {"current_steps": 2740, "total_steps": 2814, "loss": 0.5576, "lr": 9.506216295086246e-09, "epoch": 2.9215999999999998, "percentage": 97.37, "elapsed_time": "1:28:26", "remaining_time": "0:02:23"}
441
+ {"current_steps": 2741, "total_steps": 2814, "loss": 0.5416, "lr": 9.25456266348046e-09, "epoch": 2.9226666666666667, "percentage": 97.41, "elapsed_time": "1:28:38", "remaining_time": "0:02:21"}
442
+ {"current_steps": 2742, "total_steps": 2814, "loss": 0.6077, "lr": 9.006278643683697e-09, "epoch": 2.9237333333333333, "percentage": 97.44, "elapsed_time": "1:28:49", "remaining_time": "0:02:19"}
443
+ {"current_steps": 2743, "total_steps": 2814, "loss": 0.5381, "lr": 8.761364571587273e-09, "epoch": 2.9248, "percentage": 97.48, "elapsed_time": "1:28:59", "remaining_time": "0:02:18"}
444
+ {"current_steps": 2744, "total_steps": 2814, "loss": 0.596, "lr": 8.519820778524201e-09, "epoch": 2.925866666666667, "percentage": 97.51, "elapsed_time": "1:29:08", "remaining_time": "0:02:16"}
445
+ {"current_steps": 2745, "total_steps": 2814, "loss": 0.4777, "lr": 8.281647591267262e-09, "epoch": 2.9269333333333334, "percentage": 97.55, "elapsed_time": "1:29:20", "remaining_time": "0:02:14"}
446
+ {"current_steps": 2746, "total_steps": 2814, "loss": 0.5553, "lr": 8.046845332029818e-09, "epoch": 2.928, "percentage": 97.58, "elapsed_time": "1:29:31", "remaining_time": "0:02:13"}
447
+ {"current_steps": 2747, "total_steps": 2814, "loss": 0.4906, "lr": 7.81541431846472e-09, "epoch": 2.9290666666666665, "percentage": 97.62, "elapsed_time": "1:29:44", "remaining_time": "0:02:11"}
448
+ {"current_steps": 2748, "total_steps": 2814, "loss": 0.6016, "lr": 7.587354863664298e-09, "epoch": 2.9301333333333335, "percentage": 97.65, "elapsed_time": "1:29:54", "remaining_time": "0:02:09"}
449
+ {"current_steps": 2749, "total_steps": 2814, "loss": 0.5151, "lr": 7.362667276159252e-09, "epoch": 2.9312, "percentage": 97.69, "elapsed_time": "1:30:04", "remaining_time": "0:02:07"}
450
+ {"current_steps": 2750, "total_steps": 2814, "loss": 0.5526, "lr": 7.141351859918655e-09, "epoch": 2.9322666666666666, "percentage": 97.73, "elapsed_time": "1:30:15", "remaining_time": "0:02:06"}
451
+ {"current_steps": 2751, "total_steps": 2814, "loss": 0.537, "lr": 6.92340891434995e-09, "epoch": 2.9333333333333336, "percentage": 97.76, "elapsed_time": "1:30:26", "remaining_time": "0:02:04"}
452
+ {"current_steps": 2752, "total_steps": 2814, "loss": 0.5321, "lr": 6.708838734297562e-09, "epoch": 2.9344, "percentage": 97.8, "elapsed_time": "1:30:37", "remaining_time": "0:02:02"}
453
+ {"current_steps": 2753, "total_steps": 2814, "loss": 0.5507, "lr": 6.497641610043737e-09, "epoch": 2.9354666666666667, "percentage": 97.83, "elapsed_time": "1:30:49", "remaining_time": "0:02:00"}
454
+ {"current_steps": 2754, "total_steps": 2814, "loss": 0.541, "lr": 6.289817827306588e-09, "epoch": 2.936533333333333, "percentage": 97.87, "elapsed_time": "1:31:00", "remaining_time": "0:01:58"}
455
+ {"current_steps": 2755, "total_steps": 2814, "loss": 0.5135, "lr": 6.085367667241493e-09, "epoch": 2.9375999999999998, "percentage": 97.9, "elapsed_time": "1:31:11", "remaining_time": "0:01:57"}
456
+ {"current_steps": 2756, "total_steps": 2814, "loss": 0.5004, "lr": 5.884291406439424e-09, "epoch": 2.9386666666666668, "percentage": 97.94, "elapsed_time": "1:31:23", "remaining_time": "0:01:55"}
457
+ {"current_steps": 2757, "total_steps": 2814, "loss": 0.4754, "lr": 5.686589316926672e-09, "epoch": 2.9397333333333333, "percentage": 97.97, "elapsed_time": "1:31:32", "remaining_time": "0:01:53"}
458
+ {"current_steps": 2758, "total_steps": 2814, "loss": 0.5283, "lr": 5.492261666165122e-09, "epoch": 2.9408, "percentage": 98.01, "elapsed_time": "1:31:44", "remaining_time": "0:01:51"}
459
+ {"current_steps": 2759, "total_steps": 2814, "loss": 0.4786, "lr": 5.301308717051423e-09, "epoch": 2.941866666666667, "percentage": 98.05, "elapsed_time": "1:31:53", "remaining_time": "0:01:49"}
460
+ {"current_steps": 2760, "total_steps": 2814, "loss": 0.5467, "lr": 5.113730727917265e-09, "epoch": 2.9429333333333334, "percentage": 98.08, "elapsed_time": "1:32:06", "remaining_time": "0:01:48"}
461
+ {"current_steps": 2761, "total_steps": 2814, "loss": 0.6021, "lr": 4.929527952527436e-09, "epoch": 2.944, "percentage": 98.12, "elapsed_time": "1:32:18", "remaining_time": "0:01:46"}
462
+ {"current_steps": 2762, "total_steps": 2814, "loss": 0.5712, "lr": 4.748700640081483e-09, "epoch": 2.9450666666666665, "percentage": 98.15, "elapsed_time": "1:32:31", "remaining_time": "0:01:44"}
463
+ {"current_steps": 2763, "total_steps": 2814, "loss": 0.5335, "lr": 4.5712490352120555e-09, "epoch": 2.9461333333333335, "percentage": 98.19, "elapsed_time": "1:32:42", "remaining_time": "0:01:42"}
464
+ {"current_steps": 2764, "total_steps": 2814, "loss": 0.5708, "lr": 4.397173377984898e-09, "epoch": 2.9472, "percentage": 98.22, "elapsed_time": "1:32:54", "remaining_time": "0:01:40"}
465
+ {"current_steps": 2765, "total_steps": 2814, "loss": 0.5736, "lr": 4.226473903899131e-09, "epoch": 2.9482666666666666, "percentage": 98.26, "elapsed_time": "1:33:06", "remaining_time": "0:01:39"}
466
+ {"current_steps": 2766, "total_steps": 2814, "loss": 0.537, "lr": 4.0591508438855846e-09, "epoch": 2.9493333333333336, "percentage": 98.29, "elapsed_time": "1:33:20", "remaining_time": "0:01:37"}
467
+ {"current_steps": 2767, "total_steps": 2814, "loss": 0.5276, "lr": 3.8952044243081855e-09, "epoch": 2.9504, "percentage": 98.33, "elapsed_time": "1:33:31", "remaining_time": "0:01:35"}
468
+ {"current_steps": 2768, "total_steps": 2814, "loss": 0.5593, "lr": 3.734634866961739e-09, "epoch": 2.9514666666666667, "percentage": 98.37, "elapsed_time": "1:33:43", "remaining_time": "0:01:33"}
469
+ {"current_steps": 2769, "total_steps": 2814, "loss": 0.5615, "lr": 3.5774423890735934e-09, "epoch": 2.9525333333333332, "percentage": 98.4, "elapsed_time": "1:33:56", "remaining_time": "0:01:31"}
470
+ {"current_steps": 2770, "total_steps": 2814, "loss": 0.5407, "lr": 3.4236272033019714e-09, "epoch": 2.9536, "percentage": 98.44, "elapsed_time": "1:34:08", "remaining_time": "0:01:29"}
471
+ {"current_steps": 2771, "total_steps": 2814, "loss": 0.5556, "lr": 3.2731895177362526e-09, "epoch": 2.9546666666666668, "percentage": 98.47, "elapsed_time": "1:34:19", "remaining_time": "0:01:27"}
472
+ {"current_steps": 2772, "total_steps": 2814, "loss": 0.6057, "lr": 3.1261295358961385e-09, "epoch": 2.9557333333333333, "percentage": 98.51, "elapsed_time": "1:34:30", "remaining_time": "0:01:25"}
473
+ {"current_steps": 2773, "total_steps": 2814, "loss": 0.5044, "lr": 2.9824474567324844e-09, "epoch": 2.9568, "percentage": 98.54, "elapsed_time": "1:34:41", "remaining_time": "0:01:24"}
474
+ {"current_steps": 2774, "total_steps": 2814, "loss": 0.5521, "lr": 2.842143474625636e-09, "epoch": 2.957866666666667, "percentage": 98.58, "elapsed_time": "1:34:52", "remaining_time": "0:01:22"}
475
+ {"current_steps": 2775, "total_steps": 2814, "loss": 0.5863, "lr": 2.7052177793865376e-09, "epoch": 2.9589333333333334, "percentage": 98.61, "elapsed_time": "1:35:02", "remaining_time": "0:01:20"}
476
+ {"current_steps": 2776, "total_steps": 2814, "loss": 0.5267, "lr": 2.5716705562553456e-09, "epoch": 2.96, "percentage": 98.65, "elapsed_time": "1:35:12", "remaining_time": "0:01:18"}
477
+ {"current_steps": 2777, "total_steps": 2814, "loss": 0.5388, "lr": 2.441501985901984e-09, "epoch": 2.9610666666666665, "percentage": 98.69, "elapsed_time": "1:35:25", "remaining_time": "0:01:16"}
478
+ {"current_steps": 2778, "total_steps": 2814, "loss": 0.5749, "lr": 2.3147122444250327e-09, "epoch": 2.962133333333333, "percentage": 98.72, "elapsed_time": "1:35:36", "remaining_time": "0:01:14"}
479
+ {"current_steps": 2779, "total_steps": 2814, "loss": 0.5724, "lr": 2.1913015033525607e-09, "epoch": 2.9632, "percentage": 98.76, "elapsed_time": "1:35:47", "remaining_time": "0:01:12"}
480
+ {"current_steps": 2780, "total_steps": 2814, "loss": 0.4732, "lr": 2.0712699296410176e-09, "epoch": 2.9642666666666666, "percentage": 98.79, "elapsed_time": "1:36:00", "remaining_time": "0:01:10"}
481
+ {"current_steps": 2781, "total_steps": 2814, "loss": 0.5204, "lr": 1.9546176856755083e-09, "epoch": 2.9653333333333336, "percentage": 98.83, "elapsed_time": "1:36:11", "remaining_time": "0:01:08"}
482
+ {"current_steps": 2782, "total_steps": 2814, "loss": 0.5091, "lr": 1.8413449292695174e-09, "epoch": 2.9664, "percentage": 98.86, "elapsed_time": "1:36:22", "remaining_time": "0:01:06"}
483
+ {"current_steps": 2783, "total_steps": 2814, "loss": 0.569, "lr": 1.7314518136640756e-09, "epoch": 2.9674666666666667, "percentage": 98.9, "elapsed_time": "1:36:33", "remaining_time": "0:01:04"}
484
+ {"current_steps": 2784, "total_steps": 2814, "loss": 0.5149, "lr": 1.6249384875285935e-09, "epoch": 2.9685333333333332, "percentage": 98.93, "elapsed_time": "1:36:43", "remaining_time": "0:01:02"}
485
+ {"current_steps": 2785, "total_steps": 2814, "loss": 0.5734, "lr": 1.5218050949597495e-09, "epoch": 2.9696, "percentage": 98.97, "elapsed_time": "1:36:53", "remaining_time": "0:01:00"}
486
+ {"current_steps": 2786, "total_steps": 2814, "loss": 0.5548, "lr": 1.4220517754820474e-09, "epoch": 2.970666666666667, "percentage": 99.0, "elapsed_time": "1:37:06", "remaining_time": "0:00:58"}
487
+ {"current_steps": 2787, "total_steps": 2814, "loss": 0.5718, "lr": 1.3256786640469809e-09, "epoch": 2.9717333333333333, "percentage": 99.04, "elapsed_time": "1:37:17", "remaining_time": "0:00:56"}
488
+ {"current_steps": 2788, "total_steps": 2814, "loss": 0.5332, "lr": 1.2326858910330364e-09, "epoch": 2.9728, "percentage": 99.08, "elapsed_time": "1:37:29", "remaining_time": "0:00:54"}
489
+ {"current_steps": 2789, "total_steps": 2814, "loss": 0.5641, "lr": 1.1430735822459681e-09, "epoch": 2.973866666666667, "percentage": 99.11, "elapsed_time": "1:37:41", "remaining_time": "0:00:52"}
490
+ {"current_steps": 2790, "total_steps": 2814, "loss": 0.5739, "lr": 1.0568418589176899e-09, "epoch": 2.9749333333333334, "percentage": 99.15, "elapsed_time": "1:37:53", "remaining_time": "0:00:50"}
491
+ {"current_steps": 2791, "total_steps": 2814, "loss": 0.6144, "lr": 9.739908377073836e-10, "epoch": 2.976, "percentage": 99.18, "elapsed_time": "1:38:03", "remaining_time": "0:00:48"}
492
+ {"current_steps": 2792, "total_steps": 2814, "loss": 0.4787, "lr": 8.945206307001131e-10, "epoch": 2.9770666666666665, "percentage": 99.22, "elapsed_time": "1:38:15", "remaining_time": "0:00:46"}
493
+ {"current_steps": 2793, "total_steps": 2814, "loss": 0.5586, "lr": 8.184313454073778e-10, "epoch": 2.978133333333333, "percentage": 99.25, "elapsed_time": "1:38:25", "remaining_time": "0:00:44"}
494
+ {"current_steps": 2794, "total_steps": 2814, "loss": 0.4842, "lr": 7.457230847668362e-10, "epoch": 2.9792, "percentage": 99.29, "elapsed_time": "1:38:38", "remaining_time": "0:00:42"}
495
+ {"current_steps": 2795, "total_steps": 2814, "loss": 0.5895, "lr": 6.763959471420278e-10, "epoch": 2.9802666666666666, "percentage": 99.32, "elapsed_time": "1:38:49", "remaining_time": "0:00:40"}
496
+ {"current_steps": 2796, "total_steps": 2814, "loss": 0.5064, "lr": 6.104500263223734e-10, "epoch": 2.981333333333333, "percentage": 99.36, "elapsed_time": "1:38:59", "remaining_time": "0:00:38"}
497
+ {"current_steps": 2797, "total_steps": 2814, "loss": 0.5569, "lr": 5.478854115228971e-10, "epoch": 2.9824, "percentage": 99.4, "elapsed_time": "1:39:08", "remaining_time": "0:00:36"}
498
+ {"current_steps": 2798, "total_steps": 2814, "loss": 0.5277, "lr": 4.887021873845043e-10, "epoch": 2.9834666666666667, "percentage": 99.43, "elapsed_time": "1:39:21", "remaining_time": "0:00:34"}
499
+ {"current_steps": 2799, "total_steps": 2814, "loss": 0.5268, "lr": 4.3290043397314905e-10, "epoch": 2.9845333333333333, "percentage": 99.47, "elapsed_time": "1:39:32", "remaining_time": "0:00:32"}
500
+ {"current_steps": 2800, "total_steps": 2814, "loss": 0.5459, "lr": 3.804802267806662e-10, "epoch": 2.9856, "percentage": 99.5, "elapsed_time": "1:39:43", "remaining_time": "0:00:29"}
501
+ {"current_steps": 2801, "total_steps": 2814, "loss": 0.4757, "lr": 3.3144163672338416e-10, "epoch": 2.986666666666667, "percentage": 99.54, "elapsed_time": "1:41:57", "remaining_time": "0:00:28"}
502
+ {"current_steps": 2802, "total_steps": 2814, "loss": 0.5669, "lr": 2.8578473014378994e-10, "epoch": 2.9877333333333334, "percentage": 99.57, "elapsed_time": "1:42:07", "remaining_time": "0:00:26"}
503
+ {"current_steps": 2803, "total_steps": 2814, "loss": 0.5241, "lr": 2.43509568808864e-10, "epoch": 2.9888, "percentage": 99.61, "elapsed_time": "1:42:21", "remaining_time": "0:00:24"}
504
+ {"current_steps": 2804, "total_steps": 2814, "loss": 0.4706, "lr": 2.0461620991063524e-10, "epoch": 2.989866666666667, "percentage": 99.64, "elapsed_time": "1:42:32", "remaining_time": "0:00:21"}
505
+ {"current_steps": 2805, "total_steps": 2814, "loss": 0.4373, "lr": 1.6910470606590345e-10, "epoch": 2.9909333333333334, "percentage": 99.68, "elapsed_time": "1:42:44", "remaining_time": "0:00:19"}
506
+ {"current_steps": 2806, "total_steps": 2814, "loss": 0.5659, "lr": 1.3697510531651693e-10, "epoch": 2.992, "percentage": 99.72, "elapsed_time": "1:42:56", "remaining_time": "0:00:17"}
507
+ {"current_steps": 2807, "total_steps": 2814, "loss": 0.5292, "lr": 1.0822745112964994e-10, "epoch": 2.9930666666666665, "percentage": 99.75, "elapsed_time": "1:43:07", "remaining_time": "0:00:15"}
508
+ {"current_steps": 2808, "total_steps": 2814, "loss": 0.523, "lr": 8.286178239585995e-11, "epoch": 2.994133333333333, "percentage": 99.79, "elapsed_time": "1:43:17", "remaining_time": "0:00:13"}
509
+ {"current_steps": 2809, "total_steps": 2814, "loss": 0.56, "lr": 6.087813343158555e-11, "epoch": 2.9952, "percentage": 99.82, "elapsed_time": "1:43:28", "remaining_time": "0:00:11"}
510
+ {"current_steps": 2810, "total_steps": 2814, "loss": 0.576, "lr": 4.2276533977481104e-11, "epoch": 2.9962666666666666, "percentage": 99.86, "elapsed_time": "1:43:38", "remaining_time": "0:00:08"}
511
+ {"current_steps": 2811, "total_steps": 2814, "loss": 0.5447, "lr": 2.705700919869436e-11, "epoch": 2.997333333333333, "percentage": 99.89, "elapsed_time": "1:43:51", "remaining_time": "0:00:06"}
512
+ {"current_steps": 2812, "total_steps": 2814, "loss": 0.5315, "lr": 1.521957968514398e-11, "epoch": 2.9984, "percentage": 99.93, "elapsed_time": "1:44:05", "remaining_time": "0:00:04"}
513
+ {"current_steps": 2813, "total_steps": 2814, "loss": 0.4681, "lr": 6.764261450686871e-12, "epoch": 2.9994666666666667, "percentage": 99.96, "elapsed_time": "1:44:17", "remaining_time": "0:00:02"}
514
+ {"current_steps": 2814, "total_steps": 2814, "loss": 0.3797, "lr": 1.6910659345059643e-12, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "1:44:21", "remaining_time": "0:00:00"}
515
+ {"current_steps": 2814, "total_steps": 2814, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "1:46:06", "remaining_time": "0:00:00"}
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46979f6b0e5272a84c7b78174fabe91a7fe3a4a5c2a861b63e43add65e9e27ca
3
+ size 6353
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff