Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

adapter_config.json +4 -4
adapter_model.safetensors +1 -1
chat_template.jinja +66 -27
optimizer.pt +1 -1
rng_state_0.pth +2 -2
rng_state_1.pth +2 -2
rng_state_2.pth +2 -2
rng_state_3.pth +2 -2
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
scheduler.pt +1 -1
trainer_state.json +120 -421
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "down_proj",
-    "o_proj",
     "k_proj",
     "v_proj",
-    "gate_proj",
     "up_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "k_proj",
+    "o_proj",
+    "down_proj",
     "v_proj",
+    "q_proj",
     "up_proj",
+    "gate_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e66061757a3f9c7e694bd0869920435ba5cd7a0117a26a35caaceea1882296a8
 size 645975704

 version https://git-lfs.github.com/spec/v1
+oid sha256:48f8910cfd9fa931ded44c23eb7711849b0630badf75472a5a02b4c83906bce3
 size 645975704

chat_template.jinja CHANGED Viewed

@@ -1,48 +1,84 @@
 {%- if tools %}
     {{- '<|im_start|>system\n' }}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- messages[0]['content'] }}
-    {%- else %}
-        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
     {%- endif %}
-    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
     {%- for tool in tools %}
         {{- "\n" }}
         {{- tool | tojson }}
     {%- endfor %}
     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
 {%- else %}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
-    {%- else %}
-        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
     {%- endif %}
 {%- endif %}
 {%- for message in messages %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
-        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
     {%- elif message.role == "assistant" %}
-        {{- '<|im_start|>' + message.role }}
-        {%- if message.content %}
-            {{- '\n' + message.content }}
         {%- endif %}
-        {%- for tool_call in message.tool_calls %}
-            {%- if tool_call.function is defined %}
-                {%- set tool_call = tool_call.function %}
             {%- endif %}
-            {{- '\n<tool_call>\n{"name": "' }}
-            {{- tool_call.name }}
-            {{- '", "arguments": ' }}
-            {{- tool_call.arguments | tojson }}
-            {{- '}\n</tool_call>' }}
-        {%- endfor %}
-        {{- '<|im_end|>\n' }}
     {%- elif message.role == "tool" %}
-        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
             {{- '<|im_start|>user' }}
         {%- endif %}
         {{- '\n<tool_response>\n' }}
-        {{- message.content }}
         {{- '\n</tool_response>' }}
         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
             {{- '<|im_end|>\n' }}
@@ -51,4 +87,7 @@
 {%- endfor %}
 {%- if add_generation_prompt %}
     {{- '<|im_start|>assistant\n' }}
-{%- endif %}

 {%- if tools %}
     {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
     {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
     {%- for tool in tools %}
         {{- "\n" }}
         {{- tool | tojson }}
     {%- endfor %}
     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
 {%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
     {%- endif %}
 {%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
 {%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
     {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
         {%- endif %}
+        {{- '<|im_start|>' + message.role }}
+        {% generation %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- content }}
             {%- endif %}
+        {%- else %}
+            {{- content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>' }}
+        {% endgeneration %}
     {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
             {{- '<|im_start|>user' }}
         {%- endif %}
         {{- '\n<tool_response>\n' }}
+        {{- content }}
         {{- '\n</tool_response>' }}
         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
             {{- '<|im_end|>\n' }}
 {%- endfor %}
 {%- if add_generation_prompt %}
     {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f46dc0b6cd60733e44769de8e8e0858d07c1b25a992eae50a74b529c3e4db236
 size 1292087499

 version https://git-lfs.github.com/spec/v1
+oid sha256:2e0503016c1f6ffd793680a057d6ac79ed7603f600c3c72fd3a91b8affb41ccb
 size 1292087499

rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7192a40ca4014f502efc59dacc942dad83ac6b1e5dbffc44a7e2368a17abffd4
-size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:7d7ab697f09475ecdec1ff8902097d1a8197b03c13be0377e062b5c7ea1a4ffa
+size 16389

rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:22f988a134540f1261cc919cb029bec83fd4a129faeedd432eda41941d88caca
-size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:7fb8c62fa6411959132eeca62b5f51f65129ff5f237a1b58ae21b4e705cf58cc
+size 16389

rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a00527b6663b32e26bbe8bf4772fd7934576df924e6fc0a26b27e451253945f
-size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:f039515b6152a470258ef1dfc9aee16b3afc5843e05bd410cc95b31eea233121
+size 16389

rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d158f929065935ff5e42e00f4c943d51cb4a72cc008d4cc7441b13796917f2b6
-size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:29618a16b277174f384730a53826fe4c1ad36502092c89b8fbab81a4e1ce0a3b
+size 16389

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f55c62a921c1c2ecb30cf1d7e4d5a7b0a0ea17498c42bd3415091e054f66478
+size 16389

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:929304abfeed9a99bd0fc122eca08ef1636734606497d74870f319482cf8486a
+size 16389

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9d398039ec5dbf1ee72fd1cdd0a2908f9ce68e1371843f30dfb1415fbe6d1f
+size 16389

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fd6b1af69b813d150d919a2d99343aa4b12721bbf48d4ef8242f66cc83bc5d5
+size 16389

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:452277350ad8f5efad158760275ed80bf2a10a74384da69a532b63915629f8f6
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:88c1f19fbaac09a7b01b826d2a3eb05434d8b50a36c13d838feee781e2642515
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,487 +2,186 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 10.0,
   "eval_steps": 500,
-  "global_step": 1040,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
-    {
-      "epoch": 0.24096385542168675,
-      "grad_norm": 0.17040389776229858,
-      "learning_rate": 0.000511,
-      "loss": 0.4303,
-      "mean_token_accuracy": 0.8983636182546616,
-      "num_tokens": 569446.0,
-      "step": 25
-    },
     {
       "epoch": 0.4819277108433735,
-      "grad_norm": 0.19750550389289856,
-      "learning_rate": 0.000511,
-      "loss": 0.2953,
-      "mean_token_accuracy": 0.9258013522624969,
-      "num_tokens": 1138403.0,
-      "step": 50
-    },
-    {
-      "epoch": 0.7228915662650602,
-      "grad_norm": 0.2441304326057434,
-      "learning_rate": 0.000511,
-      "loss": 0.2179,
-      "mean_token_accuracy": 0.9427212655544281,
-      "num_tokens": 1705606.0,
-      "step": 75
     },
     {
       "epoch": 0.963855421686747,
-      "grad_norm": 0.19773538410663605,
-      "learning_rate": 0.000511,
-      "loss": 0.194,
-      "mean_token_accuracy": 0.9479873812198639,
-      "num_tokens": 2271060.0,
-      "step": 100
     },
     {
       "epoch": 1.0,
-      "eval_loss": 0.1995953470468521,
-      "eval_mean_token_accuracy": 0.9460804703387808,
-      "eval_num_tokens": 2345411.0,
-      "eval_runtime": 4.3258,
-      "eval_samples_per_second": 85.303,
-      "eval_steps_per_second": 10.865,
-      "step": 104
-    },
-    {
-      "epoch": 1.202409638554217,
-      "grad_norm": 0.0789426863193512,
-      "learning_rate": 0.000511,
-      "loss": 0.16,
-      "mean_token_accuracy": 0.9584937607399141,
-      "num_tokens": 2836512.0,
-      "step": 125
     },
     {
       "epoch": 1.4433734939759035,
-      "grad_norm": 0.10643763095140457,
-      "learning_rate": 0.000511,
-      "loss": 0.1408,
-      "mean_token_accuracy": 0.9620217531919479,
-      "num_tokens": 3403395.0,
-      "step": 150
-    },
-    {
-      "epoch": 1.6843373493975904,
-      "grad_norm": 0.08403506129980087,
-      "learning_rate": 0.000511,
-      "loss": 0.1314,
-      "mean_token_accuracy": 0.9630346685647965,
-      "num_tokens": 3970344.0,
-      "step": 175
     },
     {
       "epoch": 1.9253012048192772,
-      "grad_norm": 0.10631190985441208,
-      "learning_rate": 0.000511,
-      "loss": 0.1272,
-      "mean_token_accuracy": 0.9646087974309921,
-      "num_tokens": 4538034.0,
-      "step": 200
     },
     {
       "epoch": 2.0,
-      "eval_loss": 0.14445580542087555,
-      "eval_mean_token_accuracy": 0.9612192435467497,
-      "eval_num_tokens": 4690349.0,
-      "eval_runtime": 4.2683,
-      "eval_samples_per_second": 86.451,
-      "eval_steps_per_second": 11.011,
-      "step": 208
-    },
-    {
-      "epoch": 2.163855421686747,
-      "grad_norm": 0.08164115995168686,
-      "learning_rate": 0.000511,
-      "loss": 0.1067,
-      "mean_token_accuracy": 0.9689010110768405,
-      "num_tokens": 5091219.0,
-      "step": 225
     },
     {
       "epoch": 2.404819277108434,
-      "grad_norm": 0.08637778460979462,
-      "learning_rate": 0.000511,
-      "loss": 0.0968,
-      "mean_token_accuracy": 0.9721185141801834,
-      "num_tokens": 5658328.0,
-      "step": 250
-    },
-    {
-      "epoch": 2.6457831325301204,
-      "grad_norm": 0.0902683362364769,
-      "learning_rate": 0.000511,
-      "loss": 0.091,
-      "mean_token_accuracy": 0.9744718617200852,
-      "num_tokens": 6225744.0,
-      "step": 275
     },
     {
       "epoch": 2.886746987951807,
-      "grad_norm": 0.09357521682977676,
-      "learning_rate": 0.000511,
-      "loss": 0.0929,
-      "mean_token_accuracy": 0.9727102434635162,
-      "num_tokens": 6794402.0,
-      "step": 300
     },
     {
       "epoch": 3.0,
-      "eval_loss": 0.14705069363117218,
-      "eval_mean_token_accuracy": 0.9624257531571896,
-      "eval_num_tokens": 7035273.0,
-      "eval_runtime": 4.2634,
-      "eval_samples_per_second": 86.55,
-      "eval_steps_per_second": 11.024,
-      "step": 312
-    },
-    {
-      "epoch": 3.125301204819277,
-      "grad_norm": 0.14760874211788177,
-      "learning_rate": 0.000511,
-      "loss": 0.0842,
-      "mean_token_accuracy": 0.9764150320881545,
-      "num_tokens": 7334495.0,
-      "step": 325
     },
     {
       "epoch": 3.3662650602409636,
-      "grad_norm": 0.09895172715187073,
-      "learning_rate": 0.000511,
-      "loss": 0.0777,
-      "mean_token_accuracy": 0.9774631917476654,
-      "num_tokens": 7903478.0,
-      "step": 350
-    },
-    {
-      "epoch": 3.6072289156626507,
-      "grad_norm": 0.10538128763437271,
-      "learning_rate": 0.000511,
-      "loss": 0.0742,
-      "mean_token_accuracy": 0.9783486902713776,
-      "num_tokens": 8469479.0,
-      "step": 375
     },
     {
       "epoch": 3.8481927710843373,
-      "grad_norm": 0.09741026163101196,
-      "learning_rate": 0.000511,
-      "loss": 0.0679,
-      "mean_token_accuracy": 0.9803410685062408,
-      "num_tokens": 9036138.0,
-      "step": 400
     },
     {
       "epoch": 4.0,
-      "eval_loss": 0.14550796151161194,
-      "eval_mean_token_accuracy": 0.9638357936067784,
-      "eval_num_tokens": 9380804.0,
-      "eval_runtime": 4.2387,
-      "eval_samples_per_second": 87.055,
-      "eval_steps_per_second": 11.088,
-      "step": 416
-    },
-    {
-      "epoch": 4.086746987951807,
-      "grad_norm": 0.10616449266672134,
-      "learning_rate": 0.000511,
-      "loss": 0.0625,
-      "mean_token_accuracy": 0.9811063475079007,
-      "num_tokens": 9598059.0,
-      "step": 425
     },
     {
       "epoch": 4.327710843373494,
-      "grad_norm": 0.09445353597402573,
-      "learning_rate": 0.000511,
-      "loss": 0.0545,
-      "mean_token_accuracy": 0.9833864039182663,
-      "num_tokens": 10165089.0,
-      "step": 450
-    },
-    {
-      "epoch": 4.5686746987951805,
-      "grad_norm": 0.07407805323600769,
-      "learning_rate": 0.000511,
-      "loss": 0.0545,
-      "mean_token_accuracy": 0.984256454706192,
-      "num_tokens": 10732931.0,
-      "step": 475
     },
     {
       "epoch": 4.809638554216868,
-      "grad_norm": 0.07322381436824799,
-      "learning_rate": 0.000511,
-      "loss": 0.0462,
-      "mean_token_accuracy": 0.9861221539974213,
-      "num_tokens": 11301466.0,
-      "step": 500
     },
     {
       "epoch": 5.0,
-      "eval_loss": 0.14242176711559296,
-      "eval_mean_token_accuracy": 0.9691625554510888,
-      "eval_num_tokens": 11726407.0,
-      "eval_runtime": 4.2397,
-      "eval_samples_per_second": 87.035,
-      "eval_steps_per_second": 11.086,
-      "step": 520
-    },
-    {
-      "epoch": 5.048192771084337,
-      "grad_norm": 0.06890378147363663,
-      "learning_rate": 0.000511,
-      "loss": 0.0538,
-      "mean_token_accuracy": 0.9846178467827614,
-      "num_tokens": 11856936.0,
-      "step": 525
     },
     {
       "epoch": 5.289156626506024,
-      "grad_norm": 0.05453705042600632,
-      "learning_rate": 0.000511,
-      "loss": 0.0485,
-      "mean_token_accuracy": 0.9858993107080459,
-      "num_tokens": 12423275.0,
-      "step": 550
-    },
-    {
-      "epoch": 5.530120481927711,
-      "grad_norm": 0.0743594691157341,
-      "learning_rate": 0.000511,
-      "loss": 0.0455,
-      "mean_token_accuracy": 0.9857969325780869,
-      "num_tokens": 12992045.0,
-      "step": 575
     },
     {
       "epoch": 5.771084337349397,
-      "grad_norm": 0.06587184965610504,
-      "learning_rate": 0.000511,
-      "loss": 0.0446,
-      "mean_token_accuracy": 0.9862634456157684,
-      "num_tokens": 13560037.0,
-      "step": 600
     },
     {
       "epoch": 6.0,
-      "eval_loss": 0.1241711750626564,
-      "eval_mean_token_accuracy": 0.9710005204728309,
-      "eval_num_tokens": 14071932.0,
-      "eval_runtime": 4.2228,
-      "eval_samples_per_second": 87.383,
-      "eval_steps_per_second": 11.13,
-      "step": 624
-    },
-    {
-      "epoch": 6.009638554216868,
-      "grad_norm": 0.06124307960271835,
-      "learning_rate": 0.000511,
-      "loss": 0.0368,
-      "mean_token_accuracy": 0.9886124525407348,
-      "num_tokens": 14102915.0,
-      "step": 625
-    },
-    {
-      "epoch": 6.250602409638554,
-      "grad_norm": 0.10462699830532074,
-      "learning_rate": 0.000511,
-      "loss": 0.0378,
-      "mean_token_accuracy": 0.9887062352895737,
-      "num_tokens": 14669309.0,
-      "step": 650
-    },
-    {
-      "epoch": 6.491566265060241,
-      "grad_norm": 0.09343062341213226,
-      "learning_rate": 0.000511,
-      "loss": 0.0353,
-      "mean_token_accuracy": 0.9892213380336762,
-      "num_tokens": 15237353.0,
-      "step": 675
-    },
-    {
-      "epoch": 6.732530120481927,
-      "grad_norm": 0.08443740010261536,
-      "learning_rate": 0.000511,
-      "loss": 0.0364,
-      "mean_token_accuracy": 0.9892494148015976,
-      "num_tokens": 15804441.0,
-      "step": 700
-    },
-    {
-      "epoch": 6.973493975903614,
-      "grad_norm": 0.07635796070098877,
-      "learning_rate": 0.000511,
-      "loss": 0.0397,
-      "mean_token_accuracy": 0.9886371964216232,
-      "num_tokens": 16368577.0,
-      "step": 725
-    },
-    {
-      "epoch": 7.0,
-      "eval_loss": 0.17054519057273865,
-      "eval_mean_token_accuracy": 0.9649876089806252,
-      "eval_num_tokens": 16416958.0,
-      "eval_runtime": 4.2384,
-      "eval_samples_per_second": 87.061,
-      "eval_steps_per_second": 11.089,
-      "step": 728
-    },
-    {
-      "epoch": 7.212048192771085,
-      "grad_norm": 0.0668734461069107,
-      "learning_rate": 0.000511,
-      "loss": 0.0333,
-      "mean_token_accuracy": 0.9895464802029157,
-      "num_tokens": 16927490.0,
-      "step": 750
-    },
-    {
-      "epoch": 7.453012048192771,
-      "grad_norm": 0.06639474630355835,
-      "learning_rate": 0.000511,
-      "loss": 0.0325,
-      "mean_token_accuracy": 0.9908391135931015,
-      "num_tokens": 17494143.0,
-      "step": 775
-    },
-    {
-      "epoch": 7.693975903614458,
-      "grad_norm": 0.10801058262586594,
-      "learning_rate": 0.000511,
-      "loss": 0.029,
-      "mean_token_accuracy": 0.9909009468555451,
-      "num_tokens": 18063423.0,
-      "step": 800
-    },
-    {
-      "epoch": 7.934939759036144,
-      "grad_norm": 0.048982683569192886,
-      "learning_rate": 0.000511,
-      "loss": 0.0313,
-      "mean_token_accuracy": 0.9908118671178818,
-      "num_tokens": 18630905.0,
-      "step": 825
-    },
-    {
-      "epoch": 8.0,
-      "eval_loss": 0.1113305315375328,
-      "eval_mean_token_accuracy": 0.9737493814306056,
-      "eval_num_tokens": 18762023.0,
-      "eval_runtime": 4.2275,
-      "eval_samples_per_second": 87.285,
-      "eval_steps_per_second": 11.118,
-      "step": 832
-    },
-    {
-      "epoch": 8.173493975903614,
-      "grad_norm": 0.05220003426074982,
-      "learning_rate": 0.000511,
-      "loss": 0.0277,
-      "mean_token_accuracy": 0.9916808304160533,
-      "num_tokens": 19184427.0,
-      "step": 850
-    },
-    {
-      "epoch": 8.4144578313253,
-      "grad_norm": 0.08605129271745682,
-      "learning_rate": 0.000511,
-      "loss": 0.0273,
-      "mean_token_accuracy": 0.9915985196828843,
-      "num_tokens": 19754417.0,
-      "step": 875
-    },
-    {
-      "epoch": 8.655421686746989,
-      "grad_norm": 0.0517394132912159,
-      "learning_rate": 0.000511,
-      "loss": 0.0258,
-      "mean_token_accuracy": 0.9922689855098724,
-      "num_tokens": 20322925.0,
-      "step": 900
-    },
-    {
-      "epoch": 8.896385542168675,
-      "grad_norm": 0.059128183871507645,
-      "learning_rate": 0.000511,
-      "loss": 0.0256,
-      "mean_token_accuracy": 0.9923817366361618,
-      "num_tokens": 20889889.0,
-      "step": 925
-    },
-    {
-      "epoch": 9.0,
-      "eval_loss": 0.1412837952375412,
-      "eval_mean_token_accuracy": 0.97177672893443,
-      "eval_num_tokens": 21107006.0,
-      "eval_runtime": 4.2426,
-      "eval_samples_per_second": 86.974,
-      "eval_steps_per_second": 11.078,
-      "step": 936
-    },
-    {
-      "epoch": 9.134939759036145,
-      "grad_norm": 0.08735097944736481,
-      "learning_rate": 0.000511,
-      "loss": 0.0214,
-      "mean_token_accuracy": 0.9934688914905895,
-      "num_tokens": 21433768.0,
-      "step": 950
-    },
-    {
-      "epoch": 9.375903614457831,
-      "grad_norm": 0.07706229388713837,
-      "learning_rate": 0.000511,
-      "loss": 0.0219,
-      "mean_token_accuracy": 0.9933523815870285,
-      "num_tokens": 22001855.0,
-      "step": 975
-    },
-    {
-      "epoch": 9.616867469879518,
-      "grad_norm": 0.0835743248462677,
-      "learning_rate": 0.000511,
-      "loss": 0.0202,
-      "mean_token_accuracy": 0.9938382267951965,
-      "num_tokens": 22568226.0,
-      "step": 1000
-    },
-    {
-      "epoch": 9.857831325301206,
-      "grad_norm": 0.10814040899276733,
-      "learning_rate": 0.000511,
-      "loss": 0.0222,
-      "mean_token_accuracy": 0.99358702480793,
-      "num_tokens": 23136558.0,
-      "step": 1025
-    },
-    {
-      "epoch": 10.0,
-      "eval_loss": 0.1364360749721527,
-      "eval_mean_token_accuracy": 0.9739730041077796,
-      "eval_num_tokens": 23451914.0,
-      "eval_runtime": 4.2514,
-      "eval_samples_per_second": 86.796,
-      "eval_steps_per_second": 11.055,
-      "step": 1040
     }
   ],
   "logging_steps": 25,
-  "max_steps": 1248,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 12,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -496,8 +195,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.0255354758734807e+18,
-  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 6.0,
   "eval_steps": 500,
+  "global_step": 312,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.4819277108433735,
+      "grad_norm": 0.04972561076283455,
+      "learning_rate": 0.000509291899366086,
+      "loss": 0.2396,
+      "mean_token_accuracy": 0.9078072667121887,
+      "num_tokens": 10242232.0,
+      "step": 25
     },
     {
       "epoch": 0.963855421686747,
+      "grad_norm": 0.04746510088443756,
+      "learning_rate": 0.0004965277770447238,
+      "loss": 0.1033,
+      "mean_token_accuracy": 0.9155593666434289,
+      "num_tokens": 20493886.0,
+      "step": 50
     },
     {
       "epoch": 1.0,
+      "eval_loss": NaN,
+      "eval_mean_token_accuracy": 0.9735881001391309,
+      "eval_num_tokens": 21054178.0,
+      "eval_runtime": 41.8061,
+      "eval_samples_per_second": 8.826,
+      "eval_steps_per_second": 1.124,
+      "step": 52
     },
     {
       "epoch": 1.4433734939759035,
+      "grad_norm": 0.048190850764513016,
+      "learning_rate": 0.00047188122815287187,
+      "loss": 0.0796,
+      "mean_token_accuracy": 0.9210376496890083,
+      "num_tokens": 30877786.0,
+      "step": 75
     },
     {
       "epoch": 1.9253012048192772,
+      "grad_norm": 0.07209828495979309,
+      "learning_rate": 0.0004365673027192623,
+      "loss": 0.063,
+      "mean_token_accuracy": 0.9394274836778641,
+      "num_tokens": 41107523.0,
+      "step": 100
     },
     {
       "epoch": 2.0,
+      "eval_loss": NaN,
+      "eval_mean_token_accuracy": 0.9802652305745064,
+      "eval_num_tokens": 42118832.0,
+      "eval_runtime": 43.7261,
+      "eval_samples_per_second": 8.439,
+      "eval_steps_per_second": 1.075,
+      "step": 104
     },
     {
       "epoch": 2.404819277108434,
+      "grad_norm": 0.05603017657995224,
+      "learning_rate": 0.00039232694168865086,
+      "loss": 0.0545,
+      "mean_token_accuracy": 0.9359195819452181,
+      "num_tokens": 51238167.0,
+      "step": 125
     },
     {
       "epoch": 2.886746987951807,
+      "grad_norm": 0.07570821046829224,
+      "learning_rate": 0.00034134115028725524,
+      "loss": 0.0391,
+      "mean_token_accuracy": 0.9296775516867638,
+      "num_tokens": 61475716.0,
+      "step": 150
     },
     {
       "epoch": 3.0,
+      "eval_loss": NaN,
+      "eval_mean_token_accuracy": 0.9825496534083752,
+      "eval_num_tokens": 63186871.0,
+      "eval_runtime": 42.5381,
+      "eval_samples_per_second": 8.675,
+      "eval_steps_per_second": 1.105,
+      "step": 156
     },
     {
       "epoch": 3.3662650602409636,
+      "grad_norm": 0.07343257963657379,
+      "learning_rate": 0.0002861234766624867,
+      "loss": 0.0345,
+      "mean_token_accuracy": 0.9456785363168573,
+      "num_tokens": 71027426.0,
+      "step": 175
     },
     {
       "epoch": 3.8481927710843373,
+      "grad_norm": 0.04622579738497734,
+      "learning_rate": 0.0002293960964917063,
+      "loss": 0.0259,
+      "mean_token_accuracy": 0.9330729904770851,
+      "num_tokens": 81341930.0,
+      "step": 200
     },
     {
       "epoch": 4.0,
+      "eval_loss": NaN,
+      "eval_mean_token_accuracy": 0.9852228583173549,
+      "eval_num_tokens": 84250831.0,
+      "eval_runtime": 41.6702,
+      "eval_samples_per_second": 8.855,
+      "eval_steps_per_second": 1.128,
+      "step": 208
     },
     {
       "epoch": 4.327710843373494,
+      "grad_norm": 0.04485568404197693,
+      "learning_rate": 0.0001739556124639496,
+      "loss": 0.0208,
+      "mean_token_accuracy": 0.9436070158253962,
+      "num_tokens": 91589095.0,
+      "step": 225
     },
     {
       "epoch": 4.809638554216868,
+      "grad_norm": 0.03364564850926399,
+      "learning_rate": 0.00012253518458496144,
+      "loss": 0.0185,
+      "mean_token_accuracy": 0.9403582978248596,
+      "num_tokens": 101826338.0,
+      "step": 250
     },
     {
       "epoch": 5.0,
+      "eval_loss": NaN,
+      "eval_mean_token_accuracy": 0.9858830913584283,
+      "eval_num_tokens": 105323752.0,
+      "eval_runtime": 41.5429,
+      "eval_samples_per_second": 8.882,
+      "eval_steps_per_second": 1.131,
+      "step": 260
     },
     {
       "epoch": 5.289156626506024,
+      "grad_norm": 0.04389314353466034,
+      "learning_rate": 7.766978814259806e-05,
+      "loss": 0.0122,
+      "mean_token_accuracy": 0.9414373160007611,
+      "num_tokens": 111938446.0,
+      "step": 275
     },
     {
       "epoch": 5.771084337349397,
+      "grad_norm": 0.03338490426540375,
+      "learning_rate": 4.1571241979147114e-05,
+      "loss": 0.0094,
+      "mean_token_accuracy": 0.9374336645007133,
+      "num_tokens": 122190732.0,
+      "step": 300
     },
     {
       "epoch": 6.0,
+      "eval_loss": NaN,
+      "eval_mean_token_accuracy": 0.983714316753631,
+      "eval_num_tokens": 126386752.0,
+      "eval_runtime": 42.0902,
+      "eval_samples_per_second": 8.767,
+      "eval_steps_per_second": 1.117,
+      "step": 312
     }
   ],
   "logging_steps": 25,
+  "max_steps": 364,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 5.549416727210623e+18,
+  "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9abd2e560826c250d1fd3df1ec10e14fef0b5ef3b175f43df982f2fc9f2f11d9
 size 6097

 version https://git-lfs.github.com/spec/v1
+oid sha256:f9ffd5f1c6a0b427299412ba1ef18436e0c585b3ee0c4bf80b6deb44a24dfc18
 size 6097