Hexa09 committed on
Commit
8f11b50
·
verified ·
1 Parent(s): 3291679

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ djalokd/hexa1b/hexa_1b_final.nef filter=lfs diff=lfs merge=lfs -text
37
+ djalokd/hexa1b/model-step-1000.nef filter=lfs diff=lfs merge=lfs -text
38
+ djalokd/hexa1b/model-step-2000.nef filter=lfs diff=lfs merge=lfs -text
39
+ djalokd/hexa1b/model-step-3000.nef filter=lfs diff=lfs merge=lfs -text
40
+ djalokd/hexa1b/model-step-4000.nef filter=lfs diff=lfs merge=lfs -text
41
+ djalokd/hexa1b/model-step-5000.nef filter=lfs diff=lfs merge=lfs -text
djalokd/hexa1b/__huggingface_repos__.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"repos": [{"repoId": "gpt2", "repoType": "model", "commitHash": "607a30d783dfa663caf39e06633721c8d4cfcd7e", "filePaths": ["tokenizer_config.json", "merges.txt", "vocab.json", "tokenizer.json", "config.json"]}, {"repoId": "HuggingFaceH4/ultrachat_200k", "repoType": "dataset", "commitHash": "8049631c405ae6576f93f445c6b8166f76f5505a", "filePaths": ["README.md"]}]}
djalokd/hexa1b/__notebook__.ipynb ADDED
@@ -0,0 +1,827 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "6f094c84",
6
+ "metadata": {
7
+ "papermill": {
8
+ "duration": 0.001801,
9
+ "end_time": "2026-04-28T04:18:39.185879+00:00",
10
+ "exception": false,
11
+ "start_time": "2026-04-28T04:18:39.184078+00:00",
12
+ "status": "completed"
13
+ },
14
+ "tags": []
15
+ },
16
+ "source": [
17
+ "# < Hexa-1B Chat Prototype > — Pure Conversation\n",
18
+ "**Status:** Optimized for 2x T4 GPUs | **Size:** ~0.6B Dense (16 layers x 1536 hidden) | **Data:** UltraChat 200k\n",
19
+ "\n",
20
+ "This notebook trains a foundation model from scratch to behave as a friendly conversational assistant. No code, no encyclopedic facts—just pure dialogue."
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 1,
26
+ "id": "fc61b8a2",
27
+ "metadata": {
28
+ "execution": {
29
+ "iopub.execute_input": "2026-04-28T04:18:39.189353Z",
30
+ "iopub.status.busy": "2026-04-28T04:18:39.188932Z",
31
+ "iopub.status.idle": "2026-04-28T04:18:43.808068Z",
32
+ "shell.execute_reply": "2026-04-28T04:18:43.807269Z"
33
+ },
34
+ "papermill": {
35
+ "duration": 4.622607,
36
+ "end_time": "2026-04-28T04:18:43.809677+00:00",
37
+ "exception": false,
38
+ "start_time": "2026-04-28T04:18:39.187070+00:00",
39
+ "status": "completed"
40
+ },
41
+ "tags": []
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "%%capture\n",
46
+ "!pip install transformers datasets accelerate tokenizers"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 2,
52
+ "id": "624d5a4e",
53
+ "metadata": {
54
+ "execution": {
55
+ "iopub.execute_input": "2026-04-28T04:18:43.814541Z",
56
+ "iopub.status.busy": "2026-04-28T04:18:43.814177Z",
57
+ "iopub.status.idle": "2026-04-28T04:18:43.821390Z",
58
+ "shell.execute_reply": "2026-04-28T04:18:43.820581Z"
59
+ },
60
+ "papermill": {
61
+ "duration": 0.011754,
62
+ "end_time": "2026-04-28T04:18:43.822802+00:00",
63
+ "exception": false,
64
+ "start_time": "2026-04-28T04:18:43.811048+00:00",
65
+ "status": "completed"
66
+ },
67
+ "tags": []
68
+ },
69
+ "outputs": [
70
+ {
71
+ "name": "stdout",
72
+ "output_type": "stream",
73
+ "text": [
74
+ "Writing train_hexa_chat.py\n"
75
+ ]
76
+ }
77
+ ],
78
+ "source": [
79
+ "%%writefile train_hexa_chat.py\n",
80
+ "import os, torch, torch.nn as nn, torch.nn.functional as F\n",
81
+ "import random\n",
82
+ "from dataclasses import dataclass\n",
83
+ "from datasets import load_dataset\n",
84
+ "from transformers import AutoTokenizer\n",
85
+ "from torch.utils.data import DataLoader, IterableDataset\n",
86
+ "import torch.distributed as dist\n",
87
+ "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n",
88
+ "from torch.distributed.fsdp import StateDictType, FullStateDictConfig\n",
89
+ "from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload\n",
90
+ "\n",
91
+ "# < CONFIG: 1.1B Parameter Architecture >\n",
92
+ "@dataclass\n",
93
+ "class HexaConfig:\n",
94
+ " vocab_size: int = 50257 # GPT-2 Vocab\n",
95
+ " hidden_dim: int = 1536 \n",
96
+ " num_layers: int = 16 \n",
97
+ " num_heads: int = 12\n",
98
+ " max_seq_len: int = 512 \n",
99
+ "\n",
100
+ "class HexaDense(nn.Module):\n",
101
+ " def __init__(self, cfg):\n",
102
+ " super().__init__()\n",
103
+ " self.embed = nn.Embedding(cfg.vocab_size, cfg.hidden_dim)\n",
104
+ " self.layers = nn.ModuleList([\n",
105
+ " nn.TransformerEncoderLayer(\n",
106
+ " d_model=cfg.hidden_dim, nhead=cfg.num_heads, \n",
107
+ " dim_feedforward=cfg.hidden_dim*4, batch_first=True, norm_first=True, activation='gelu'\n",
108
+ " ) for _ in range(cfg.num_layers)\n",
109
+ " ])\n",
110
+ " self.head = nn.Linear(cfg.hidden_dim, cfg.vocab_size, bias=False)\n",
111
+ "\n",
112
+ " def forward(self, ids, labels=None):\n",
113
+ " x = self.embed(ids)\n",
114
+ " for layer in self.layers: x = layer(x)\n",
115
+ " logits = self.head(x)\n",
116
+ " if labels is not None:\n",
117
+ " return F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))\n",
118
+ " return logits\n",
119
+ "\n",
120
+ "# < DATASET: UltraChat + CEO Madhab Identity >\n",
121
+ "class HexaCoreDataset(IterableDataset):\n",
122
+ " def __init__(self, seq_len):\n",
123
+ " self.tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
124
+ " self.tokenizer.pad_token = self.tokenizer.eos_token\n",
125
+ " self.chat_data = load_dataset(\"HuggingFaceH4/ultrachat_200k\", split=\"train_sft\", streaming=True)\n",
126
+ " self.seq_len = seq_len\n",
127
+ " self.identity_data = [\n",
128
+ " \"I am Hexa, an AI assistant built by Hexa Innovate Org. My CEO and Lead Developer is Madhab.\",\n",
129
+ " \"Hexa was developed in Cox's Bazar, Bangladesh by Madhab.\",\n",
130
+ " \"Who built you? I was built from scratch by Madhab and the team at Hexa Innovate.\",\n",
131
+ " \"I am the core intelligence of HydraLogOS, created by Madhab.\"\n",
132
+ " ]\n",
133
+ "\n",
134
+ " def __iter__(self):\n",
135
+ " for i, data in enumerate(self.chat_data):\n",
136
+ " # Inject identity every 20 samples\n",
137
+ " text = random.choice(self.identity_data) if i % 20 == 0 else data['messages'][0]['content']\n",
138
+ " tokens = self.tokenizer(text, truncation=True, max_length=self.seq_len, padding=\"max_length\", return_tensors=\"pt\")\n",
139
+ " yield tokens['input_ids'].squeeze(0)\n",
140
+ "\n",
141
+ "def setup():\n",
142
+ " dist.init_process_group(\"nccl\") # NCCL is superior for dual T4s\n",
143
+ " local_rank = int(os.environ[\"LOCAL_RANK\"])\n",
144
+ " torch.cuda.set_device(local_rank)\n",
145
+ " return int(os.environ[\"RANK\"]), local_rank\n",
146
+ "\n",
147
+ "def train():\n",
148
+ " rank, local_rank = setup()\n",
149
+ " cfg = HexaConfig()\n",
150
+ " \n",
151
+ " # Initialize and Shard across 2 GPUs\n",
152
+ " model = FSDP(HexaDense(cfg).to(local_rank), cpu_offload=CPUOffload(offload_params=True))\n",
153
+ " optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)\n",
154
+ " dataloader = DataLoader(HexaCoreDataset(cfg.max_seq_len), batch_size=1)\n",
155
+ " \n",
156
+ " save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)\n",
157
+ " \n",
158
+ " model.train()\n",
159
+ " if rank == 0: print(\"--- Hexa-1B Launch: Hexa Innovate Org (CEO: Madhab) ---\")\n",
160
+ " \n",
161
+ " for step, batch in enumerate(dataloader):\n",
162
+ " batch = batch.to(local_rank)\n",
163
+ " optimizer.zero_grad()\n",
164
+ " loss = model(batch, labels=batch)\n",
165
+ " loss.backward()\n",
166
+ " optimizer.step()\n",
167
+ " \n",
168
+ " if rank == 0 and step % 10 == 0:\n",
169
+ " print(f\"Step {step} | Training Loss: {loss.item():.4f}\")\n",
170
+ " \n",
171
+ " # SAVE EVERY 1000 STEPS (S A V E D PROTECTION)\n",
172
+ " if step > 0 and step % 1000 == 0:\n",
173
+ " dist.barrier()\n",
174
+ " with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):\n",
175
+ " state = model.state_dict()\n",
176
+ " if rank == 0: torch.save(state, f\"model-step-{step}.nef\")\n",
177
+ " dist.barrier()\n",
178
+ "\n",
179
+ " if step >= 5000: break\n",
180
+ "\n",
181
+ " # FINAL SAVE\n",
182
+ " dist.barrier()\n",
183
+ " with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):\n",
184
+ " state = model.state_dict()\n",
185
+ " if rank == 0: torch.save(state, \"hexa_1b_final.nef\")\n",
186
+ " dist.destroy_process_group()\n",
187
+ "\n",
188
+ "if __name__ == \"__main__\":\n",
189
+ " train()"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 3,
195
+ "id": "cd9ecc31",
196
+ "metadata": {
197
+ "execution": {
198
+ "iopub.execute_input": "2026-04-28T04:18:43.826673Z",
199
+ "iopub.status.busy": "2026-04-28T04:18:43.826070Z",
200
+ "iopub.status.idle": "2026-04-28T04:18:43.943692Z",
201
+ "shell.execute_reply": "2026-04-28T04:18:43.943076Z"
202
+ },
203
+ "papermill": {
204
+ "duration": 0.121129,
205
+ "end_time": "2026-04-28T04:18:43.945197+00:00",
206
+ "exception": false,
207
+ "start_time": "2026-04-28T04:18:43.824068+00:00",
208
+ "status": "completed"
209
+ },
210
+ "tags": []
211
+ },
212
+ "outputs": [
213
+ {
214
+ "name": "stdout",
215
+ "output_type": "stream",
216
+ "text": [
217
+ "total 20K\r\n",
218
+ "---------- 1 root root 9.5K Apr 28 04:18 __notebook__.ipynb\r\n",
219
+ "-rw-r--r-- 1 root root 4.4K Apr 28 04:18 train_hexa_chat.py\r\n"
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "!ls -lh /kaggle/working/"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 4,
230
+ "id": "04a6779e",
231
+ "metadata": {
232
+ "execution": {
233
+ "iopub.execute_input": "2026-04-28T04:18:43.948881Z",
234
+ "iopub.status.busy": "2026-04-28T04:18:43.948661Z",
235
+ "iopub.status.idle": "2026-04-28T11:01:24.765655Z",
236
+ "shell.execute_reply": "2026-04-28T11:01:24.764851Z"
237
+ },
238
+ "papermill": {
239
+ "duration": 24160.821236,
240
+ "end_time": "2026-04-28T11:01:24.767763+00:00",
241
+ "exception": false,
242
+ "start_time": "2026-04-28T04:18:43.946527+00:00",
243
+ "status": "completed"
244
+ },
245
+ "tags": []
246
+ },
247
+ "outputs": [
248
+ {
249
+ "name": "stdout",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "W0428 04:18:48.642000 50 torch/distributed/run.py:852] \r\n",
253
+ "W0428 04:18:48.642000 50 torch/distributed/run.py:852] *****************************************\r\n",
254
+ "W0428 04:18:48.642000 50 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \r\n",
255
+ "W0428 04:18:48.642000 50 torch/distributed/run.py:852] *****************************************\r\n",
256
+ "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\r\n",
257
+ "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\r\n",
258
+ "config.json: 100%|█████████████████████████████| 665/665 [00:00<00:00, 2.66MB/s]\r\n",
259
+ "tokenizer_config.json: 100%|██████████████████| 26.0/26.0 [00:00<00:00, 139kB/s]\r\n",
260
+ "vocab.json: 1.04MB [00:00, 10.4MB/s]\r\n",
261
+ "merges.txt: 456kB [00:00, 7.10MB/s]\r\n",
262
+ "tokenizer.json: 1.36MB [00:00, 17.9MB/s]\r\n",
263
+ "README.md: 3.90kB [00:00, 8.56MB/s]\r\n",
264
+ "--- Hexa-1B Launch: Hexa Innovate Org (CEO: Madhab) ---\r\n",
265
+ "Step 0 | Training Loss: 11.2874\r\n",
266
+ "Step 10 | Training Loss: 8.6067\r\n",
267
+ "Step 20 | Training Loss: 0.3263\r\n",
268
+ "Step 30 | Training Loss: 0.1322\r\n",
269
+ "Step 40 | Training Loss: 0.1988\r\n",
270
+ "Step 50 | Training Loss: 0.6243\r\n",
271
+ "Step 60 | Training Loss: 0.2633\r\n",
272
+ "Step 70 | Training Loss: 6.3903\r\n",
273
+ "Step 80 | Training Loss: 0.1481\r\n",
274
+ "Step 90 | Training Loss: 0.6854\r\n",
275
+ "Step 100 | Training Loss: 0.1256\r\n",
276
+ "Step 110 | Training Loss: 4.5980\r\n",
277
+ "Step 120 | Training Loss: 0.0851\r\n",
278
+ "Step 130 | Training Loss: 2.2184\r\n",
279
+ "Step 140 | Training Loss: 0.0332\r\n",
280
+ "Step 150 | Training Loss: 2.7413\r\n",
281
+ "Step 160 | Training Loss: 0.0331\r\n",
282
+ "Step 170 | Training Loss: 0.5210\r\n",
283
+ "Step 180 | Training Loss: 0.0071\r\n",
284
+ "Step 190 | Training Loss: 0.2811\r\n",
285
+ "Step 200 | Training Loss: 0.0141\r\n",
286
+ "Step 210 | Training Loss: 0.4025\r\n",
287
+ "Step 220 | Training Loss: 0.0606\r\n",
288
+ "Step 230 | Training Loss: 0.0418\r\n",
289
+ "Step 240 | Training Loss: 0.0520\r\n",
290
+ "Step 250 | Training Loss: 0.4156\r\n",
291
+ "Step 260 | Training Loss: 0.0046\r\n",
292
+ "Step 270 | Training Loss: 0.1479\r\n",
293
+ "Step 280 | Training Loss: 0.0018\r\n",
294
+ "Step 290 | Training Loss: 0.1805\r\n",
295
+ "Step 300 | Training Loss: 0.0244\r\n",
296
+ "Step 310 | Training Loss: 0.6614\r\n",
297
+ "Step 320 | Training Loss: 0.0071\r\n",
298
+ "Step 330 | Training Loss: 0.0305\r\n",
299
+ "Step 340 | Training Loss: 0.0008\r\n",
300
+ "Step 350 | Training Loss: 0.2557\r\n",
301
+ "Step 360 | Training Loss: 0.0017\r\n",
302
+ "Step 370 | Training Loss: 0.1846\r\n",
303
+ "Step 380 | Training Loss: 0.0037\r\n",
304
+ "Step 390 | Training Loss: 2.8491\r\n",
305
+ "Step 400 | Training Loss: 0.0005\r\n",
306
+ "Step 410 | Training Loss: 0.0434\r\n",
307
+ "Step 420 | Training Loss: 0.0038\r\n",
308
+ "Step 430 | Training Loss: 0.0365\r\n",
309
+ "Step 440 | Training Loss: 0.0004\r\n",
310
+ "Step 450 | Training Loss: 0.7904\r\n",
311
+ "Step 460 | Training Loss: 0.0005\r\n",
312
+ "Step 470 | Training Loss: 0.2616\r\n",
313
+ "Step 480 | Training Loss: 0.0004\r\n",
314
+ "Step 490 | Training Loss: 0.2351\r\n",
315
+ "Step 500 | Training Loss: 0.0010\r\n",
316
+ "Step 510 | Training Loss: 0.2563\r\n",
317
+ "Step 520 | Training Loss: 0.0007\r\n",
318
+ "Step 530 | Training Loss: 0.2109\r\n",
319
+ "Step 540 | Training Loss: 0.0024\r\n",
320
+ "Step 550 | Training Loss: 0.0664\r\n",
321
+ "Step 560 | Training Loss: 0.0003\r\n",
322
+ "Step 570 | Training Loss: 0.7032\r\n",
323
+ "Step 580 | Training Loss: 0.0005\r\n",
324
+ "Step 590 | Training Loss: 0.0893\r\n",
325
+ "Step 600 | Training Loss: 0.0004\r\n",
326
+ "Step 610 | Training Loss: 1.7443\r\n",
327
+ "Step 620 | Training Loss: 0.0005\r\n",
328
+ "Step 630 | Training Loss: 0.0753\r\n",
329
+ "Step 640 | Training Loss: 0.0008\r\n",
330
+ "Step 650 | Training Loss: 1.7483\r\n",
331
+ "Step 660 | Training Loss: 0.0005\r\n",
332
+ "Step 670 | Training Loss: 1.9604\r\n",
333
+ "Step 680 | Training Loss: 0.0003\r\n",
334
+ "Step 690 | Training Loss: 0.2785\r\n",
335
+ "Step 700 | Training Loss: 0.0007\r\n",
336
+ "Step 710 | Training Loss: 0.7514\r\n",
337
+ "Step 720 | Training Loss: 0.0012\r\n",
338
+ "Step 730 | Training Loss: 0.0776\r\n",
339
+ "Step 740 | Training Loss: 0.0004\r\n",
340
+ "Step 750 | Training Loss: 0.0778\r\n",
341
+ "Step 760 | Training Loss: 0.0003\r\n",
342
+ "Step 770 | Training Loss: 0.0912\r\n",
343
+ "Step 780 | Training Loss: 0.0005\r\n",
344
+ "Step 790 | Training Loss: 0.0517\r\n",
345
+ "Step 800 | Training Loss: 0.0003\r\n",
346
+ "Step 810 | Training Loss: 0.1760\r\n",
347
+ "Step 820 | Training Loss: 0.0032\r\n",
348
+ "Step 830 | Training Loss: 0.8470\r\n",
349
+ "Step 840 | Training Loss: 0.0003\r\n",
350
+ "Step 850 | Training Loss: 0.0006\r\n",
351
+ "Step 860 | Training Loss: 0.0004\r\n",
352
+ "Step 870 | Training Loss: 0.0730\r\n",
353
+ "Step 880 | Training Loss: 0.0004\r\n",
354
+ "Step 890 | Training Loss: 0.8615\r\n",
355
+ "Step 900 | Training Loss: 0.0008\r\n",
356
+ "Step 910 | Training Loss: 0.5945\r\n",
357
+ "Step 920 | Training Loss: 0.0003\r\n",
358
+ "Step 930 | Training Loss: 0.2304\r\n",
359
+ "Step 940 | Training Loss: 0.0006\r\n",
360
+ "Step 950 | Training Loss: 0.1666\r\n",
361
+ "Step 960 | Training Loss: 0.0003\r\n",
362
+ "Step 970 | Training Loss: 0.0193\r\n",
363
+ "Step 980 | Training Loss: 0.0002\r\n",
364
+ "Step 990 | Training Loss: 0.0255\r\n",
365
+ "Step 1000 | Training Loss: 0.0003\r\n",
366
+ "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.\r\n",
367
+ " return func(*args, **kwargs)\r\n",
368
+ "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:822: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n",
369
+ " prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(\r\n",
370
+ "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:822: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n",
371
+ " prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(\r\n",
372
+ "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:829: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n",
373
+ " FullyShardedDataParallel.set_state_dict_type(\r\n",
374
+ "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:829: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n",
375
+ " FullyShardedDataParallel.set_state_dict_type(\r\n",
376
+ "Step 1010 | Training Loss: 0.1453\r\n",
377
+ "Step 1020 | Training Loss: 0.0006\r\n",
378
+ "Step 1030 | Training Loss: 0.2647\r\n",
379
+ "Step 1040 | Training Loss: 0.0012\r\n",
380
+ "Step 1050 | Training Loss: 0.1026\r\n",
381
+ "Step 1060 | Training Loss: 0.0004\r\n",
382
+ "Step 1070 | Training Loss: 0.1035\r\n",
383
+ "Step 1080 | Training Loss: 0.0002\r\n",
384
+ "Step 1090 | Training Loss: 1.6820\r\n",
385
+ "Step 1100 | Training Loss: 0.0005\r\n",
386
+ "Step 1110 | Training Loss: 1.1139\r\n",
387
+ "Step 1120 | Training Loss: 0.0001\r\n",
388
+ "Step 1130 | Training Loss: 0.2840\r\n",
389
+ "Step 1140 | Training Loss: 0.0004\r\n",
390
+ "Step 1150 | Training Loss: 0.0941\r\n",
391
+ "Step 1160 | Training Loss: 0.0002\r\n",
392
+ "Step 1170 | Training Loss: 0.8995\r\n",
393
+ "Step 1180 | Training Loss: 0.0001\r\n",
394
+ "Step 1190 | Training Loss: 1.3411\r\n",
395
+ "Step 1200 | Training Loss: 0.0003\r\n",
396
+ "Step 1210 | Training Loss: 0.1949\r\n",
397
+ "Step 1220 | Training Loss: 0.0004\r\n",
398
+ "Step 1230 | Training Loss: 0.0691\r\n",
399
+ "Step 1240 | Training Loss: 0.0007\r\n",
400
+ "Step 1250 | Training Loss: 1.2051\r\n",
401
+ "Step 1260 | Training Loss: 0.0004\r\n",
402
+ "Step 1270 | Training Loss: 0.1555\r\n",
403
+ "Step 1280 | Training Loss: 0.0006\r\n",
404
+ "Step 1290 | Training Loss: 0.5580\r\n",
405
+ "Step 1300 | Training Loss: 0.0001\r\n",
406
+ "Step 1310 | Training Loss: 0.1162\r\n",
407
+ "Step 1320 | Training Loss: 0.0001\r\n",
408
+ "Step 1330 | Training Loss: 0.0001\r\n",
409
+ "Step 1340 | Training Loss: 0.0004\r\n",
410
+ "Step 1350 | Training Loss: 0.1885\r\n",
411
+ "Step 1360 | Training Loss: 0.0002\r\n",
412
+ "Step 1370 | Training Loss: 0.0299\r\n",
413
+ "Step 1380 | Training Loss: 0.0000\r\n",
414
+ "Step 1390 | Training Loss: 0.0988\r\n",
415
+ "Step 1400 | Training Loss: 0.0003\r\n",
416
+ "Step 1410 | Training Loss: 0.0201\r\n",
417
+ "Step 1420 | Training Loss: 0.0001\r\n",
418
+ "Step 1430 | Training Loss: 0.3336\r\n",
419
+ "Step 1440 | Training Loss: 0.0002\r\n",
420
+ "Step 1450 | Training Loss: 0.1233\r\n",
421
+ "Step 1460 | Training Loss: 0.0001\r\n",
422
+ "Step 1470 | Training Loss: 0.0186\r\n",
423
+ "Step 1480 | Training Loss: 0.0000\r\n",
424
+ "Step 1490 | Training Loss: 0.0482\r\n",
425
+ "Step 1500 | Training Loss: 0.0001\r\n",
426
+ "Step 1510 | Training Loss: 0.0006\r\n",
427
+ "Step 1520 | Training Loss: 0.0001\r\n",
428
+ "Step 1530 | Training Loss: 0.0243\r\n",
429
+ "Step 1540 | Training Loss: 0.0001\r\n",
430
+ "Step 1550 | Training Loss: 0.0009\r\n",
431
+ "Step 1560 | Training Loss: 0.0002\r\n",
432
+ "Step 1570 | Training Loss: 0.0005\r\n",
433
+ "Step 1580 | Training Loss: 0.0002\r\n",
434
+ "Step 1590 | Training Loss: 0.0783\r\n",
435
+ "Step 1600 | Training Loss: 0.0001\r\n",
436
+ "Step 1610 | Training Loss: 0.0009\r\n",
437
+ "Step 1620 | Training Loss: 0.0002\r\n",
438
+ "Step 1630 | Training Loss: 0.0460\r\n",
439
+ "Step 1640 | Training Loss: 0.0001\r\n",
440
+ "Step 1650 | Training Loss: 0.2914\r\n",
441
+ "Step 1660 | Training Loss: 0.0000\r\n",
442
+ "Step 1670 | Training Loss: 0.1425\r\n",
443
+ "Step 1680 | Training Loss: 0.0001\r\n",
444
+ "Step 1690 | Training Loss: 0.0000\r\n",
445
+ "Step 1700 | Training Loss: 0.0001\r\n",
446
+ "Step 1710 | Training Loss: 0.0477\r\n",
447
+ "Step 1720 | Training Loss: 0.0000\r\n",
448
+ "Step 1730 | Training Loss: 0.0257\r\n",
449
+ "Step 1740 | Training Loss: 0.0004\r\n",
450
+ "Step 1750 | Training Loss: 0.2631\r\n",
451
+ "Step 1760 | Training Loss: 0.0004\r\n",
452
+ "Step 1770 | Training Loss: 0.0791\r\n",
453
+ "Step 1780 | Training Loss: 0.0002\r\n",
454
+ "Step 1790 | Training Loss: 0.0384\r\n",
455
+ "Step 1800 | Training Loss: 0.0001\r\n",
456
+ "Step 1810 | Training Loss: 0.1196\r\n",
457
+ "Step 1820 | Training Loss: 0.0001\r\n",
458
+ "Step 1830 | Training Loss: 0.0466\r\n",
459
+ "Step 1840 | Training Loss: 0.0001\r\n",
460
+ "Step 1850 | Training Loss: 0.0519\r\n",
461
+ "Step 1860 | Training Loss: 0.0001\r\n",
462
+ "Step 1870 | Training Loss: 0.0227\r\n",
463
+ "Step 1880 | Training Loss: 0.0002\r\n",
464
+ "Step 1890 | Training Loss: 0.0516\r\n",
465
+ "Step 1900 | Training Loss: 0.0000\r\n",
466
+ "Step 1910 | Training Loss: 0.0068\r\n",
467
+ "Step 1920 | Training Loss: 0.0001\r\n",
468
+ "Step 1930 | Training Loss: 0.2023\r\n",
469
+ "Step 1940 | Training Loss: 0.0001\r\n",
470
+ "Step 1950 | Training Loss: 0.0738\r\n",
471
+ "Step 1960 | Training Loss: 0.0001\r\n",
472
+ "Step 1970 | Training Loss: 0.0005\r\n",
473
+ "Step 1980 | Training Loss: 0.0001\r\n",
474
+ "Step 1990 | Training Loss: 0.0065\r\n",
475
+ "Step 2000 | Training Loss: 0.0002\r\n",
476
+ "Step 2010 | Training Loss: 0.0009\r\n",
477
+ "Step 2020 | Training Loss: 0.0001\r\n",
478
+ "Step 2030 | Training Loss: 0.0011\r\n",
479
+ "Step 2040 | Training Loss: 0.0001\r\n",
480
+ "Step 2050 | Training Loss: 0.5175\r\n",
481
+ "Step 2060 | Training Loss: 0.0002\r\n",
482
+ "Step 2070 | Training Loss: 0.2302\r\n",
483
+ "Step 2080 | Training Loss: 0.0000\r\n",
484
+ "Step 2090 | Training Loss: 0.1011\r\n",
485
+ "Step 2100 | Training Loss: 0.0001\r\n",
486
+ "Step 2110 | Training Loss: 0.0005\r\n",
487
+ "Step 2120 | Training Loss: 0.0002\r\n",
488
+ "Step 2130 | Training Loss: 0.0274\r\n",
489
+ "Step 2140 | Training Loss: 0.0001\r\n",
490
+ "Step 2150 | Training Loss: 0.0000\r\n",
491
+ "Step 2160 | Training Loss: 0.0000\r\n",
492
+ "Step 2170 | Training Loss: 0.0155\r\n",
493
+ "Step 2180 | Training Loss: 0.0000\r\n",
494
+ "Step 2190 | Training Loss: 0.0015\r\n",
495
+ "Step 2200 | Training Loss: 0.0002\r\n",
496
+ "Step 2210 | Training Loss: 0.0175\r\n",
497
+ "Step 2220 | Training Loss: 0.0001\r\n",
498
+ "Step 2230 | Training Loss: 0.0258\r\n",
499
+ "Step 2240 | Training Loss: 0.0001\r\n",
500
+ "Step 2250 | Training Loss: 0.0509\r\n",
501
+ "Step 2260 | Training Loss: 0.0001\r\n",
502
+ "Step 2270 | Training Loss: 0.2298\r\n",
503
+ "Step 2280 | Training Loss: 0.0000\r\n",
504
+ "Step 2290 | Training Loss: 0.0509\r\n",
505
+ "Step 2300 | Training Loss: 0.0001\r\n",
506
+ "Step 2310 | Training Loss: 0.0892\r\n",
507
+ "Step 2320 | Training Loss: 0.0000\r\n",
508
+ "Step 2330 | Training Loss: 0.0255\r\n",
509
+ "Step 2340 | Training Loss: 0.0001\r\n",
510
+ "Step 2350 | Training Loss: 0.1951\r\n",
511
+ "Step 2360 | Training Loss: 0.0000\r\n",
512
+ "Step 2370 | Training Loss: 0.0458\r\n",
513
+ "Step 2380 | Training Loss: 0.0002\r\n",
514
+ "Step 2390 | Training Loss: 0.6189\r\n",
515
+ "Step 2400 | Training Loss: 0.0001\r\n",
516
+ "Step 2410 | Training Loss: 0.0008\r\n",
517
+ "Step 2420 | Training Loss: 0.0002\r\n",
518
+ "Step 2430 | Training Loss: 0.1199\r\n",
519
+ "Step 2440 | Training Loss: 0.0000\r\n",
520
+ "Step 2450 | Training Loss: 0.1395\r\n",
521
+ "Step 2460 | Training Loss: 0.0000\r\n",
522
+ "Step 2470 | Training Loss: 0.5506\r\n",
523
+ "Step 2480 | Training Loss: 0.0002\r\n",
524
+ "Step 2490 | Training Loss: 0.3704\r\n",
525
+ "Step 2500 | Training Loss: 0.0002\r\n",
526
+ "Step 2510 | Training Loss: 0.0844\r\n",
527
+ "Step 2520 | Training Loss: 0.0000\r\n",
528
+ "Step 2530 | Training Loss: 0.8372\r\n",
529
+ "Step 2540 | Training Loss: 0.0001\r\n",
530
+ "Step 2550 | Training Loss: 0.1077\r\n",
531
+ "Step 2560 | Training Loss: 0.0000\r\n",
532
+ "Step 2570 | Training Loss: 0.0242\r\n",
533
+ "Step 2580 | Training Loss: 0.0001\r\n",
534
+ "Step 2590 | Training Loss: 0.2288\r\n",
535
+ "Step 2600 | Training Loss: 0.0002\r\n",
536
+ "Step 2610 | Training Loss: 0.0235\r\n",
537
+ "Step 2620 | Training Loss: 0.0000\r\n",
538
+ "Step 2630 | Training Loss: 0.0002\r\n",
539
+ "Step 2640 | Training Loss: 0.0002\r\n",
540
+ "Step 2650 | Training Loss: 0.5299\r\n",
541
+ "Step 2660 | Training Loss: 0.0001\r\n",
542
+ "Step 2670 | Training Loss: 0.0136\r\n",
543
+ "Step 2680 | Training Loss: 0.0000\r\n",
544
+ "Step 2690 | Training Loss: 0.0008\r\n",
545
+ "Step 2700 | Training Loss: 0.0000\r\n",
546
+ "Step 2710 | Training Loss: 0.0003\r\n",
547
+ "Step 2720 | Training Loss: 0.0000\r\n",
548
+ "Step 2730 | Training Loss: 0.0721\r\n",
549
+ "Step 2740 | Training Loss: 0.0001\r\n",
550
+ "Step 2750 | Training Loss: 0.0136\r\n",
551
+ "Step 2760 | Training Loss: 0.0002\r\n",
552
+ "Step 2770 | Training Loss: 0.0116\r\n",
553
+ "Step 2780 | Training Loss: 0.0000\r\n",
554
+ "Step 2790 | Training Loss: 0.1945\r\n",
555
+ "Step 2800 | Training Loss: 0.0000\r\n",
556
+ "Step 2810 | Training Loss: 0.0251\r\n",
557
+ "Step 2820 | Training Loss: 0.0000\r\n",
558
+ "Step 2830 | Training Loss: 0.1457\r\n",
559
+ "Step 2840 | Training Loss: 0.0000\r\n",
560
+ "Step 2850 | Training Loss: 0.9311\r\n",
561
+ "Step 2860 | Training Loss: 0.0001\r\n",
562
+ "Step 2870 | Training Loss: 0.1772\r\n",
563
+ "Step 2880 | Training Loss: 0.0000\r\n",
564
+ "Step 2890 | Training Loss: 0.0556\r\n",
565
+ "Step 2900 | Training Loss: 0.0000\r\n",
566
+ "Step 2910 | Training Loss: 0.0123\r\n",
567
+ "Step 2920 | Training Loss: 0.0001\r\n",
568
+ "Step 2930 | Training Loss: 0.0005\r\n",
569
+ "Step 2940 | Training Loss: 0.0000\r\n",
570
+ "Step 2950 | Training Loss: 0.0004\r\n",
571
+ "Step 2960 | Training Loss: 0.0000\r\n",
572
+ "Step 2970 | Training Loss: 0.0006\r\n",
573
+ "Step 2980 | Training Loss: 0.0000\r\n",
574
+ "Step 2990 | Training Loss: 0.0021\r\n",
575
+ "Step 3000 | Training Loss: 0.0001\r\n",
576
+ "Step 3010 | Training Loss: 0.0278\r\n",
577
+ "Step 3020 | Training Loss: 0.0000\r\n",
578
+ "Step 3030 | Training Loss: 0.4339\r\n",
579
+ "Step 3040 | Training Loss: 0.0000\r\n",
580
+ "Step 3050 | Training Loss: 0.5201\r\n",
581
+ "Step 3060 | Training Loss: 0.0003\r\n",
582
+ "Step 3070 | Training Loss: 0.0404\r\n",
583
+ "Step 3080 | Training Loss: 0.0000\r\n",
584
+ "Step 3090 | Training Loss: 0.0507\r\n",
585
+ "Step 3100 | Training Loss: 0.0000\r\n",
586
+ "Step 3110 | Training Loss: 0.0233\r\n",
587
+ "Step 3120 | Training Loss: 0.0000\r\n",
588
+ "Step 3130 | Training Loss: 0.0000\r\n",
589
+ "Step 3140 | Training Loss: 0.0000\r\n",
590
+ "Step 3150 | Training Loss: 0.1742\r\n",
591
+ "Step 3160 | Training Loss: 0.0000\r\n",
592
+ "Step 3170 | Training Loss: 0.0041\r\n",
593
+ "Step 3180 | Training Loss: 0.0000\r\n",
594
+ "Step 3190 | Training Loss: 0.0227\r\n",
595
+ "Step 3200 | Training Loss: 0.0000\r\n",
596
+ "Step 3210 | Training Loss: 0.0346\r\n",
597
+ "Step 3220 | Training Loss: 0.0002\r\n",
598
+ "Step 3230 | Training Loss: 0.0016\r\n",
599
+ "Step 3240 | Training Loss: 0.0000\r\n",
600
+ "Step 3250 | Training Loss: 0.1968\r\n",
601
+ "Step 3260 | Training Loss: 0.0000\r\n",
602
+ "Step 3270 | Training Loss: 0.0158\r\n",
603
+ "Step 3280 | Training Loss: 0.0001\r\n",
604
+ "Step 3290 | Training Loss: 0.1573\r\n",
605
+ "Step 3300 | Training Loss: 0.0000\r\n",
606
+ "Step 3310 | Training Loss: 0.0178\r\n",
607
+ "Step 3320 | Training Loss: 0.0000\r\n",
608
+ "Step 3330 | Training Loss: 0.0231\r\n",
609
+ "Step 3340 | Training Loss: 0.0003\r\n",
610
+ "Step 3350 | Training Loss: 0.0002\r\n",
611
+ "Step 3360 | Training Loss: 0.0000\r\n",
612
+ "Step 3370 | Training Loss: 0.0724\r\n",
613
+ "Step 3380 | Training Loss: 0.0000\r\n",
614
+ "Step 3390 | Training Loss: 0.0320\r\n",
615
+ "Step 3400 | Training Loss: 0.0001\r\n",
616
+ "Step 3410 | Training Loss: 0.0659\r\n",
617
+ "Step 3420 | Training Loss: 0.0001\r\n",
618
+ "Step 3430 | Training Loss: 0.0882\r\n",
619
+ "Step 3440 | Training Loss: 0.0001\r\n",
620
+ "Step 3450 | Training Loss: 0.0303\r\n",
621
+ "Step 3460 | Training Loss: 0.0001\r\n",
622
+ "Step 3470 | Training Loss: 0.0018\r\n",
623
+ "Step 3480 | Training Loss: 0.0000\r\n",
624
+ "Step 3490 | Training Loss: 0.2110\r\n",
625
+ "Step 3500 | Training Loss: 0.0000\r\n",
626
+ "Step 3510 | Training Loss: 0.4730\r\n",
627
+ "Step 3520 | Training Loss: 0.0000\r\n",
628
+ "Step 3530 | Training Loss: 0.0251\r\n",
629
+ "Step 3540 | Training Loss: 0.0000\r\n",
630
+ "Step 3550 | Training Loss: 0.0082\r\n",
631
+ "Step 3560 | Training Loss: 0.0001\r\n",
632
+ "Step 3570 | Training Loss: 0.0221\r\n",
633
+ "Step 3580 | Training Loss: 0.0011\r\n",
634
+ "Step 3590 | Training Loss: 0.0272\r\n",
635
+ "Step 3600 | Training Loss: 0.0000\r\n",
636
+ "Step 3610 | Training Loss: 0.0345\r\n",
637
+ "Step 3620 | Training Loss: 0.0004\r\n",
638
+ "Step 3630 | Training Loss: 0.0056\r\n",
639
+ "Step 3640 | Training Loss: 0.0001\r\n",
640
+ "Step 3650 | Training Loss: 0.0252\r\n",
641
+ "Step 3660 | Training Loss: 0.0000\r\n",
642
+ "Step 3670 | Training Loss: 0.0409\r\n",
643
+ "Step 3680 | Training Loss: 0.0000\r\n",
644
+ "Step 3690 | Training Loss: 2.0241\r\n",
645
+ "Step 3700 | Training Loss: 0.0000\r\n",
646
+ "Step 3710 | Training Loss: 0.0006\r\n",
647
+ "Step 3720 | Training Loss: 0.0000\r\n",
648
+ "Step 3730 | Training Loss: 0.6780\r\n",
649
+ "Step 3740 | Training Loss: 0.0001\r\n",
650
+ "Step 3750 | Training Loss: 0.1512\r\n",
651
+ "Step 3760 | Training Loss: 0.0000\r\n",
652
+ "Step 3770 | Training Loss: 0.4934\r\n",
653
+ "Step 3780 | Training Loss: 0.0001\r\n",
654
+ "Step 3790 | Training Loss: 0.0246\r\n",
655
+ "Step 3800 | Training Loss: 0.0000\r\n",
656
+ "Step 3810 | Training Loss: 0.0523\r\n",
657
+ "Step 3820 | Training Loss: 0.0000\r\n",
658
+ "Step 3830 | Training Loss: 0.0001\r\n",
659
+ "Step 3840 | Training Loss: 0.0000\r\n",
660
+ "Step 3850 | Training Loss: 0.0863\r\n",
661
+ "Step 3860 | Training Loss: 0.0002\r\n",
662
+ "Step 3870 | Training Loss: 0.0430\r\n",
663
+ "Step 3880 | Training Loss: 0.0002\r\n",
664
+ "Step 3890 | Training Loss: 0.0335\r\n",
665
+ "Step 3900 | Training Loss: 0.0005\r\n",
666
+ "Step 3910 | Training Loss: 0.0301\r\n",
667
+ "Step 3920 | Training Loss: 0.0000\r\n",
668
+ "Step 3930 | Training Loss: 0.0009\r\n",
669
+ "Step 3940 | Training Loss: 0.0000\r\n",
670
+ "Step 3950 | Training Loss: 0.0433\r\n",
671
+ "Step 3960 | Training Loss: 0.0000\r\n",
672
+ "Step 3970 | Training Loss: 0.0300\r\n",
673
+ "Step 3980 | Training Loss: 0.0000\r\n",
674
+ "Step 3990 | Training Loss: 0.3666\r\n",
675
+ "Step 4000 | Training Loss: 0.0000\r\n",
676
+ "Step 4010 | Training Loss: 0.0014\r\n",
677
+ "Step 4020 | Training Loss: 0.0000\r\n",
678
+ "Step 4030 | Training Loss: 0.0352\r\n",
679
+ "Step 4040 | Training Loss: 0.0000\r\n",
680
+ "Step 4050 | Training Loss: 0.0020\r\n",
681
+ "Step 4060 | Training Loss: 0.0000\r\n",
682
+ "Step 4070 | Training Loss: 0.4625\r\n",
683
+ "Step 4080 | Training Loss: 0.0000\r\n",
684
+ "Step 4090 | Training Loss: 0.3424\r\n",
685
+ "Step 4100 | Training Loss: 0.0000\r\n",
686
+ "Step 4110 | Training Loss: 0.0007\r\n",
687
+ "Step 4120 | Training Loss: 0.0000\r\n",
688
+ "Step 4130 | Training Loss: 0.2989\r\n",
689
+ "Step 4140 | Training Loss: 0.0003\r\n",
690
+ "Step 4150 | Training Loss: 0.0118\r\n",
691
+ "Step 4160 | Training Loss: 0.0001\r\n",
692
+ "Step 4170 | Training Loss: 0.0295\r\n",
693
+ "Step 4180 | Training Loss: 0.0000\r\n",
694
+ "Step 4190 | Training Loss: 1.0331\r\n",
695
+ "Step 4200 | Training Loss: 0.0000\r\n",
696
+ "Step 4210 | Training Loss: 0.0557\r\n",
697
+ "Step 4220 | Training Loss: 0.0001\r\n",
698
+ "Step 4230 | Training Loss: 0.7705\r\n",
699
+ "Step 4240 | Training Loss: 0.0005\r\n",
700
+ "Step 4250 | Training Loss: 0.0817\r\n",
701
+ "Step 4260 | Training Loss: 0.0001\r\n",
702
+ "Step 4270 | Training Loss: 0.0491\r\n",
703
+ "Step 4280 | Training Loss: 0.0000\r\n",
704
+ "Step 4290 | Training Loss: 0.3100\r\n",
705
+ "Step 4300 | Training Loss: 0.0000\r\n",
706
+ "Step 4310 | Training Loss: 0.3296\r\n",
707
+ "Step 4320 | Training Loss: 0.0000\r\n",
708
+ "Step 4330 | Training Loss: 0.8703\r\n",
709
+ "Step 4340 | Training Loss: 0.0000\r\n",
710
+ "Step 4350 | Training Loss: 0.0738\r\n",
711
+ "Step 4360 | Training Loss: 0.0000\r\n",
712
+ "Step 4370 | Training Loss: 0.0085\r\n",
713
+ "Step 4380 | Training Loss: 0.0000\r\n",
714
+ "Step 4390 | Training Loss: 0.1303\r\n",
715
+ "Step 4400 | Training Loss: 0.0000\r\n",
716
+ "Step 4410 | Training Loss: 0.0038\r\n",
717
+ "Step 4420 | Training Loss: 0.0044\r\n",
718
+ "Step 4430 | Training Loss: 0.0003\r\n",
719
+ "Step 4440 | Training Loss: 0.0000\r\n",
720
+ "Step 4450 | Training Loss: 0.0006\r\n",
721
+ "Step 4460 | Training Loss: 0.0000\r\n",
722
+ "Step 4470 | Training Loss: 0.0084\r\n",
723
+ "Step 4480 | Training Loss: 0.0001\r\n",
724
+ "Step 4490 | Training Loss: 0.0307\r\n",
725
+ "Step 4500 | Training Loss: 0.0001\r\n",
726
+ "Step 4510 | Training Loss: 0.0135\r\n",
727
+ "Step 4520 | Training Loss: 0.0000\r\n",
728
+ "Step 4530 | Training Loss: 0.0000\r\n",
729
+ "Step 4540 | Training Loss: 0.0001\r\n",
730
+ "Step 4550 | Training Loss: 0.0099\r\n",
731
+ "Step 4560 | Training Loss: 0.0000\r\n",
732
+ "Step 4570 | Training Loss: 0.0401\r\n",
733
+ "Step 4580 | Training Loss: 0.0000\r\n",
734
+ "Step 4590 | Training Loss: 0.0001\r\n",
735
+ "Step 4600 | Training Loss: 0.0000\r\n",
736
+ "Step 4610 | Training Loss: 0.0331\r\n",
737
+ "Step 4620 | Training Loss: 0.0000\r\n",
738
+ "Step 4630 | Training Loss: 0.0002\r\n",
739
+ "Step 4640 | Training Loss: 0.0000\r\n",
740
+ "Step 4650 | Training Loss: 0.0001\r\n",
741
+ "Step 4660 | Training Loss: 0.0000\r\n",
742
+ "Step 4670 | Training Loss: 0.7391\r\n",
743
+ "Step 4680 | Training Loss: 0.0000\r\n",
744
+ "Step 4690 | Training Loss: 0.0001\r\n",
745
+ "Step 4700 | Training Loss: 0.0000\r\n",
746
+ "Step 4710 | Training Loss: 0.2675\r\n",
747
+ "Step 4720 | Training Loss: 0.0000\r\n",
748
+ "Step 4730 | Training Loss: 0.0001\r\n",
749
+ "Step 4740 | Training Loss: 0.0000\r\n",
750
+ "Step 4750 | Training Loss: 0.1630\r\n",
751
+ "Step 4760 | Training Loss: 0.0000\r\n",
752
+ "Step 4770 | Training Loss: 0.0497\r\n",
753
+ "Step 4780 | Training Loss: 0.0000\r\n",
754
+ "Step 4790 | Training Loss: 0.0221\r\n",
755
+ "Step 4800 | Training Loss: 0.0000\r\n",
756
+ "Step 4810 | Training Loss: 0.0692\r\n",
757
+ "Step 4820 | Training Loss: 0.0000\r\n",
758
+ "Step 4830 | Training Loss: 0.0002\r\n",
759
+ "Step 4840 | Training Loss: 0.0000\r\n",
760
+ "Step 4850 | Training Loss: 0.0034\r\n",
761
+ "Step 4860 | Training Loss: 0.0000\r\n",
762
+ "Step 4870 | Training Loss: 0.1528\r\n",
763
+ "Step 4880 | Training Loss: 0.0000\r\n",
764
+ "Step 4890 | Training Loss: 0.0390\r\n",
765
+ "Step 4900 | Training Loss: 0.0000\r\n",
766
+ "Step 4910 | Training Loss: 0.6370\r\n",
767
+ "Step 4920 | Training Loss: 0.0000\r\n",
768
+ "Step 4930 | Training Loss: 0.0010\r\n",
769
+ "Step 4940 | Training Loss: 0.0000\r\n",
770
+ "Step 4950 | Training Loss: 0.8518\r\n",
771
+ "Step 4960 | Training Loss: 0.0000\r\n",
772
+ "Step 4970 | Training Loss: 0.1041\r\n",
773
+ "Step 4980 | Training Loss: 0.0002\r\n",
774
+ "Step 4990 | Training Loss: 0.0001\r\n",
775
+ "Step 5000 | Training Loss: 0.0000\r\n"
776
+ ]
777
+ }
778
+ ],
779
+ "source": [
780
+ "!torchrun --nproc_per_node=2 train_hexa_chat.py"
781
+ ]
782
+ }
783
+ ],
784
+ "metadata": {
785
+ "accelerator": "GPU",
786
+ "kaggle": {
787
+ "accelerator": "nvidiaTeslaT4",
788
+ "dataSources": [],
789
+ "dockerImageVersionId": 31329,
790
+ "isGpuEnabled": true,
791
+ "isInternetEnabled": true,
792
+ "language": "python",
793
+ "sourceType": "notebook"
794
+ },
795
+ "kernelspec": {
796
+ "display_name": "Python 3",
797
+ "language": "python",
798
+ "name": "python3"
799
+ },
800
+ "language_info": {
801
+ "codemirror_mode": {
802
+ "name": "ipython",
803
+ "version": 3
804
+ },
805
+ "file_extension": ".py",
806
+ "mimetype": "text/x-python",
807
+ "name": "python",
808
+ "nbconvert_exporter": "python",
809
+ "pygments_lexer": "ipython3",
810
+ "version": "3.12.12"
811
+ },
812
+ "papermill": {
813
+ "default_parameters": {},
814
+ "duration": 24168.351963,
815
+ "end_time": "2026-04-28T11:01:25.131011+00:00",
816
+ "environment_variables": {},
817
+ "exception": null,
818
+ "input_path": "__notebook__.ipynb",
819
+ "output_path": "__notebook__.ipynb",
820
+ "parameters": {},
821
+ "start_time": "2026-04-28T04:18:36.779048+00:00",
822
+ "version": "2.7.0"
823
+ }
824
+ },
825
+ "nbformat": 4,
826
+ "nbformat_minor": 5
827
+ }
djalokd/hexa1b/__output__.json ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [{"stream_name":"stderr","time":6.808437572,"data":"0.00s - Debugger warning: It seems that frozen modules are being used, which may\n"}
2
+ ,{"stream_name":"stderr","time":6.808497418,"data":"0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n"}
3
+ ,{"stream_name":"stderr","time":6.808503548,"data":"0.00s - to python to disable frozen modules.\n"}
4
+ ,{"stream_name":"stderr","time":6.808507166,"data":"0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n"}
5
+ ,{"stream_name":"stderr","time":7.391026386,"data":"0.00s - Debugger warning: It seems that frozen modules are being used, which may\n"}
6
+ ,{"stream_name":"stderr","time":7.3910798,"data":"0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n"}
7
+ ,{"stream_name":"stderr","time":7.391086058,"data":"0.00s - to python to disable frozen modules.\n"}
8
+ ,{"stream_name":"stderr","time":7.391090786,"data":"0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n"}
9
+ ,{"stream_name":"stdout","time":13.029465517,"data":"Writing train_hexa_chat.py\n"}
10
+ ,{"stream_name":"stdout","time":13.051574856,"data":"total 20K\r\n"}
11
+ ,{"stream_name":"stdout","time":13.051607185,"data":"---------- 1 root root 9.5K Apr 28 04:18 __notebook__.ipynb\r\n"}
12
+ ,{"stream_name":"stdout","time":13.051613045,"data":"-rw-r--r-- 1 root root 4.4K Apr 28 04:18 train_hexa_chat.py\r\n"}
13
+ ,{"stream_name":"stdout","time":17.867401741,"data":"W0428 04:18:48.642000 50 torch/distributed/run.py:852] \r\n"}
14
+ ,{"stream_name":"stdout","time":17.867435077,"data":"W0428 04:18:48.642000 50 torch/distributed/run.py:852] *****************************************\r\n"}
15
+ ,{"stream_name":"stdout","time":17.867443866,"data":"W0428 04:18:48.642000 50 torch/distributed/run.py:852] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \r\n"}
16
+ ,{"stream_name":"stdout","time":17.867447691,"data":"W0428 04:18:48.642000 50 torch/distributed/run.py:852] *****************************************\r\n"}
17
+ ,{"stream_name":"stdout","time":47.225515588,"data":"Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\r\n"}
18
+ ,{"stream_name":"stdout","time":47.27470609,"data":"Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\r\n"}
19
+ ,{"stream_name":"stdout","time":47.325280512,"data":"\rconfig.json: 0%| | 0.00/665 [00:00\u003c?, ?B/s]\rconfig.json: 100%|█████████████████████████████| 665/665 [00:00\u003c00:00, 2.66MB/s]\r\n"}
20
+ ,{"stream_name":"stdout","time":47.477171123,"data":"\rtokenizer_config.json: 0%| | 0.00/26.0 [00:00\u003c?, ?B/s]\rtokenizer_config.json: 100%|██████████████████| 26.0/26.0 [00:00\u003c00:00, 139kB/s]\r\n"}
21
+ ,{"stream_name":"stdout","time":47.931687771,"data":"\rvocab.json: 0.00B [00:00, ?B/s]\rvocab.json: 1.04MB [00:00, 10.4MB/s]\r\n"}
22
+ ,{"stream_name":"stdout","time":48.134181943,"data":"\rmerges.txt: 0.00B [00:00, ?B/s]\rmerges.txt: 456kB [00:00, 7.10MB/s]\r\n"}
23
+ ,{"stream_name":"stdout","time":48.488396243,"data":"\rtokenizer.json: 0.00B [00:00, ?B/s]\rtokenizer.json: 1.36MB [00:00, 17.9MB/s]\r\n"}
24
+ ,{"stream_name":"stdout","time":48.944691142,"data":"\rREADME.md: 0.00B [00:00, ?B/s]\rREADME.md: 3.90kB [00:00, 8.56MB/s]\r\n"}
25
+ ,{"stream_name":"stdout","time":49.65203475,"data":"--- Hexa-1B Launch: Hexa Innovate Org (CEO: Madhab) ---\r\n"}
26
+ ,{"stream_name":"stdout","time":61.90247752,"data":"Step 0 | Training Loss: 11.2874\r\n"}
27
+ ,{"stream_name":"stdout","time":108.383813125,"data":"Step 10 | Training Loss: 8.6067\r\n"}
28
+ ,{"stream_name":"stdout","time":155.204178837,"data":"Step 20 | Training Loss: 0.3263\r\n"}
29
+ ,{"stream_name":"stdout","time":201.81732827,"data":"Step 30 | Training Loss: 0.1322\r\n"}
30
+ ,{"stream_name":"stdout","time":248.465930559,"data":"Step 40 | Training Loss: 0.1988\r\n"}
31
+ ,{"stream_name":"stdout","time":295.617594775,"data":"Step 50 | Training Loss: 0.6243\r\n"}
32
+ ,{"stream_name":"stdout","time":342.815805496,"data":"Step 60 | Training Loss: 0.2633\r\n"}
33
+ ,{"stream_name":"stdout","time":389.970312742,"data":"Step 70 | Training Loss: 6.3903\r\n"}
34
+ ,{"stream_name":"stdout","time":437.286587541,"data":"Step 80 | Training Loss: 0.1481\r\n"}
35
+ ,{"stream_name":"stdout","time":484.493630414,"data":"Step 90 | Training Loss: 0.6854\r\n"}
36
+ ,{"stream_name":"stdout","time":531.952124285,"data":"Step 100 | Training Loss: 0.1256\r\n"}
37
+ ,{"stream_name":"stdout","time":578.95939013,"data":"Step 110 | Training Loss: 4.5980\r\n"}
38
+ ,{"stream_name":"stdout","time":626.369219815,"data":"Step 120 | Training Loss: 0.0851\r\n"}
39
+ ,{"stream_name":"stdout","time":673.578903192,"data":"Step 130 | Training Loss: 2.2184\r\n"}
40
+ ,{"stream_name":"stdout","time":720.732445861,"data":"Step 140 | Training Loss: 0.0332\r\n"}
41
+ ,{"stream_name":"stdout","time":767.740265819,"data":"Step 150 | Training Loss: 2.7413\r\n"}
42
+ ,{"stream_name":"stdout","time":814.571832823,"data":"Step 160 | Training Loss: 0.0331\r\n"}
43
+ ,{"stream_name":"stdout","time":861.517013124,"data":"Step 170 | Training Loss: 0.5210\r\n"}
44
+ ,{"stream_name":"stdout","time":908.353292758,"data":"Step 180 | Training Loss: 0.0071\r\n"}
45
+ ,{"stream_name":"stdout","time":955.25218932,"data":"Step 190 | Training Loss: 0.2811\r\n"}
46
+ ,{"stream_name":"stdout","time":1002.343513181,"data":"Step 200 | Training Loss: 0.0141\r\n"}
47
+ ,{"stream_name":"stdout","time":1049.703510039,"data":"Step 210 | Training Loss: 0.4025\r\n"}
48
+ ,{"stream_name":"stdout","time":1096.971265201,"data":"Step 220 | Training Loss: 0.0606\r\n"}
49
+ ,{"stream_name":"stdout","time":1143.977726049,"data":"Step 230 | Training Loss: 0.0418\r\n"}
50
+ ,{"stream_name":"stdout","time":1191.082157737,"data":"Step 240 | Training Loss: 0.0520\r\n"}
51
+ ,{"stream_name":"stdout","time":1238.068213932,"data":"Step 250 | Training Loss: 0.4156\r\n"}
52
+ ,{"stream_name":"stdout","time":1284.915325528,"data":"Step 260 | Training Loss: 0.0046\r\n"}
53
+ ,{"stream_name":"stdout","time":1331.972957753,"data":"Step 270 | Training Loss: 0.1479\r\n"}
54
+ ,{"stream_name":"stdout","time":1378.918994738,"data":"Step 280 | Training Loss: 0.0018\r\n"}
55
+ ,{"stream_name":"stdout","time":1425.874653066,"data":"Step 290 | Training Loss: 0.1805\r\n"}
56
+ ,{"stream_name":"stdout","time":1472.572203138,"data":"Step 300 | Training Loss: 0.0244\r\n"}
57
+ ,{"stream_name":"stdout","time":1519.114334213,"data":"Step 310 | Training Loss: 0.6614\r\n"}
58
+ ,{"stream_name":"stdout","time":1565.655457301,"data":"Step 320 | Training Loss: 0.0071\r\n"}
59
+ ,{"stream_name":"stdout","time":1612.339782704,"data":"Step 330 | Training Loss: 0.0305\r\n"}
60
+ ,{"stream_name":"stdout","time":1658.936917472,"data":"Step 340 | Training Loss: 0.0008\r\n"}
61
+ ,{"stream_name":"stdout","time":1705.536396325,"data":"Step 350 | Training Loss: 0.2557\r\n"}
62
+ ,{"stream_name":"stdout","time":1752.334752759,"data":"Step 360 | Training Loss: 0.0017\r\n"}
63
+ ,{"stream_name":"stdout","time":1798.885688902,"data":"Step 370 | Training Loss: 0.1846\r\n"}
64
+ ,{"stream_name":"stdout","time":1845.785532985,"data":"Step 380 | Training Loss: 0.0037\r\n"}
65
+ ,{"stream_name":"stdout","time":1892.580086853,"data":"Step 390 | Training Loss: 2.8491\r\n"}
66
+ ,{"stream_name":"stdout","time":1939.43017557,"data":"Step 400 | Training Loss: 0.0005\r\n"}
67
+ ,{"stream_name":"stdout","time":1986.542103979,"data":"Step 410 | Training Loss: 0.0434\r\n"}
68
+ ,{"stream_name":"stdout","time":2033.496598894,"data":"Step 420 | Training Loss: 0.0038\r\n"}
69
+ ,{"stream_name":"stdout","time":2080.762465041,"data":"Step 430 | Training Loss: 0.0365\r\n"}
70
+ ,{"stream_name":"stdout","time":2127.508031857,"data":"Step 440 | Training Loss: 0.0004\r\n"}
71
+ ,{"stream_name":"stdout","time":2174.15572461,"data":"Step 450 | Training Loss: 0.7904\r\n"}
72
+ ,{"stream_name":"stdout","time":2220.904541731,"data":"Step 460 | Training Loss: 0.0005\r\n"}
73
+ ,{"stream_name":"stdout","time":2267.498727435,"data":"Step 470 | Training Loss: 0.2616\r\n"}
74
+ ,{"stream_name":"stdout","time":2314.097261208,"data":"Step 480 | Training Loss: 0.0004\r\n"}
75
+ ,{"stream_name":"stdout","time":2360.695388488,"data":"Step 490 | Training Loss: 0.2351\r\n"}
76
+ ,{"stream_name":"stdout","time":2407.595503689,"data":"Step 500 | Training Loss: 0.0010\r\n"}
77
+ ,{"stream_name":"stdout","time":2454.350551172,"data":"Step 510 | Training Loss: 0.2563\r\n"}
78
+ ,{"stream_name":"stdout","time":2501.103094239,"data":"Step 520 | Training Loss: 0.0007\r\n"}
79
+ ,{"stream_name":"stdout","time":2548.054201699,"data":"Step 530 | Training Loss: 0.2109\r\n"}
80
+ ,{"stream_name":"stdout","time":2594.755137203,"data":"Step 540 | Training Loss: 0.0024\r\n"}
81
+ ,{"stream_name":"stdout","time":2641.550108414,"data":"Step 550 | Training Loss: 0.0664\r\n"}
82
+ ,{"stream_name":"stdout","time":2688.297649935,"data":"Step 560 | Training Loss: 0.0003\r\n"}
83
+ ,{"stream_name":"stdout","time":2734.988681274,"data":"Step 570 | Training Loss: 0.7032\r\n"}
84
+ ,{"stream_name":"stdout","time":2781.642493066,"data":"Step 580 | Training Loss: 0.0005\r\n"}
85
+ ,{"stream_name":"stdout","time":2827.976514436,"data":"Step 590 | Training Loss: 0.0893\r\n"}
86
+ ,{"stream_name":"stdout","time":2874.424109427,"data":"Step 600 | Training Loss: 0.0004\r\n"}
87
+ ,{"stream_name":"stdout","time":2921.175015973,"data":"Step 610 | Training Loss: 1.7443\r\n"}
88
+ ,{"stream_name":"stdout","time":2967.766642518,"data":"Step 620 | Training Loss: 0.0005\r\n"}
89
+ ,{"stream_name":"stdout","time":3014.458575671,"data":"Step 630 | Training Loss: 0.0753\r\n"}
90
+ ,{"stream_name":"stdout","time":3061.299308654,"data":"Step 640 | Training Loss: 0.0008\r\n"}
91
+ ,{"stream_name":"stdout","time":3108.037402895,"data":"Step 650 | Training Loss: 1.7483\r\n"}
92
+ ,{"stream_name":"stdout","time":3154.676746,"data":"Step 660 | Training Loss: 0.0005\r\n"}
93
+ ,{"stream_name":"stdout","time":3201.417837541,"data":"Step 670 | Training Loss: 1.9604\r\n"}
94
+ ,{"stream_name":"stdout","time":3248.016377668,"data":"Step 680 | Training Loss: 0.0003\r\n"}
95
+ ,{"stream_name":"stdout","time":3294.858218866,"data":"Step 690 | Training Loss: 0.2785\r\n"}
96
+ ,{"stream_name":"stdout","time":3341.69543324,"data":"Step 700 | Training Loss: 0.0007\r\n"}
97
+ ,{"stream_name":"stdout","time":3388.692478376,"data":"Step 710 | Training Loss: 0.7514\r\n"}
98
+ ,{"stream_name":"stdout","time":3435.636583256,"data":"Step 720 | Training Loss: 0.0012\r\n"}
99
+ ,{"stream_name":"stdout","time":3482.735058233,"data":"Step 730 | Training Loss: 0.0776\r\n"}
100
+ ,{"stream_name":"stdout","time":3529.734287033,"data":"Step 740 | Training Loss: 0.0004\r\n"}
101
+ ,{"stream_name":"stdout","time":3576.639108713,"data":"Step 750 | Training Loss: 0.0778\r\n"}
102
+ ,{"stream_name":"stdout","time":3623.799314556,"data":"Step 760 | Training Loss: 0.0003\r\n"}
103
+ ,{"stream_name":"stdout","time":3670.849912731,"data":"Step 770 | Training Loss: 0.0912\r\n"}
104
+ ,{"stream_name":"stdout","time":3717.844867666,"data":"Step 780 | Training Loss: 0.0005\r\n"}
105
+ ,{"stream_name":"stdout","time":3764.845471614,"data":"Step 790 | Training Loss: 0.0517\r\n"}
106
+ ,{"stream_name":"stdout","time":3812.196740008,"data":"Step 800 | Training Loss: 0.0003\r\n"}
107
+ ,{"stream_name":"stdout","time":3859.188801882,"data":"Step 810 | Training Loss: 0.1760\r\n"}
108
+ ,{"stream_name":"stdout","time":3906.180831022,"data":"Step 820 | Training Loss: 0.0032\r\n"}
109
+ ,{"stream_name":"stdout","time":3953.010473391,"data":"Step 830 | Training Loss: 0.8470\r\n"}
110
+ ,{"stream_name":"stdout","time":3999.795775898,"data":"Step 840 | Training Loss: 0.0003\r\n"}
111
+ ,{"stream_name":"stdout","time":4046.892587335,"data":"Step 850 | Training Loss: 0.0006\r\n"}
112
+ ,{"stream_name":"stdout","time":4093.784846125,"data":"Step 860 | Training Loss: 0.0004\r\n"}
113
+ ,{"stream_name":"stdout","time":4140.982482668,"data":"Step 870 | Training Loss: 0.0730\r\n"}
114
+ ,{"stream_name":"stdout","time":4188.122449379,"data":"Step 880 | Training Loss: 0.0004\r\n"}
115
+ ,{"stream_name":"stdout","time":4235.376932521,"data":"Step 890 | Training Loss: 0.8615\r\n"}
116
+ ,{"stream_name":"stdout","time":4282.726250544,"data":"Step 900 | Training Loss: 0.0008\r\n"}
117
+ ,{"stream_name":"stdout","time":4329.778364551,"data":"Step 910 | Training Loss: 0.5945\r\n"}
118
+ ,{"stream_name":"stdout","time":4377.031796887,"data":"Step 920 | Training Loss: 0.0003\r\n"}
119
+ ,{"stream_name":"stdout","time":4424.23343341,"data":"Step 930 | Training Loss: 0.2304\r\n"}
120
+ ,{"stream_name":"stdout","time":4471.535353832,"data":"Step 940 | Training Loss: 0.0006\r\n"}
121
+ ,{"stream_name":"stdout","time":4518.576950463,"data":"Step 950 | Training Loss: 0.1666\r\n"}
122
+ ,{"stream_name":"stdout","time":4565.980865473,"data":"Step 960 | Training Loss: 0.0003\r\n"}
123
+ ,{"stream_name":"stdout","time":4613.382627414,"data":"Step 970 | Training Loss: 0.0193\r\n"}
124
+ ,{"stream_name":"stdout","time":4661.203782099,"data":"Step 980 | Training Loss: 0.0002\r\n"}
125
+ ,{"stream_name":"stdout","time":4708.552894254,"data":"Step 990 | Training Loss: 0.0255\r\n"}
126
+ ,{"stream_name":"stdout","time":4755.805220008,"data":"Step 1000 | Training Loss: 0.0003\r\n"}
127
+ ,{"stream_name":"stdout","time":4755.805280867,"data":"/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.\r\n"}
128
+ ,{"stream_name":"stdout","time":4755.805288553,"data":" return func(*args, **kwargs)\r\n"}
129
+ ,{"stream_name":"stdout","time":4755.8052941,"data":"/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:822: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n"}
130
+ ,{"stream_name":"stdout","time":4755.805300855,"data":" prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(\r\n"}
131
+ ,{"stream_name":"stdout","time":4755.805305657,"data":"/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:822: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n"}
132
+ ,{"stream_name":"stdout","time":4755.805318443,"data":" prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(\r\n"}
133
+ ,{"stream_name":"stdout","time":4755.805322078,"data":"/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:829: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n"}
134
+ ,{"stream_name":"stdout","time":4755.805326861,"data":" FullyShardedDataParallel.set_state_dict_type(\r\n"}
135
+ ,{"stream_name":"stdout","time":4763.91712272,"data":"/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:829: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .\r\n"}
136
+ ,{"stream_name":"stdout","time":4763.917213199,"data":" FullyShardedDataParallel.set_state_dict_type(\r\n"}
137
+ ,{"stream_name":"stdout","time":4812.102295237,"data":"Step 1010 | Training Loss: 0.1453\r\n"}
138
+ ,{"stream_name":"stdout","time":4859.882108769,"data":"Step 1020 | Training Loss: 0.0006\r\n"}
139
+ ,{"stream_name":"stdout","time":4907.454210115,"data":"Step 1030 | Training Loss: 0.2647\r\n"}
140
+ ,{"stream_name":"stdout","time":4955.368242307,"data":"Step 1040 | Training Loss: 0.0012\r\n"}
141
+ ,{"stream_name":"stdout","time":5003.385523148,"data":"Step 1050 | Training Loss: 0.1026\r\n"}
142
+ ,{"stream_name":"stdout","time":5051.398675192,"data":"Step 1060 | Training Loss: 0.0004\r\n"}
143
+ ,{"stream_name":"stdout","time":5099.472584862,"data":"Step 1070 | Training Loss: 0.1035\r\n"}
144
+ ,{"stream_name":"stdout","time":5147.443910901,"data":"Step 1080 | Training Loss: 0.0002\r\n"}
145
+ ,{"stream_name":"stdout","time":5195.474166797,"data":"Step 1090 | Training Loss: 1.6820\r\n"}
146
+ ,{"stream_name":"stdout","time":5243.390251336,"data":"Step 1100 | Training Loss: 0.0005\r\n"}
147
+ ,{"stream_name":"stdout","time":5291.24644043,"data":"Step 1110 | Training Loss: 1.1139\r\n"}
148
+ ,{"stream_name":"stdout","time":5338.737168099,"data":"Step 1120 | Training Loss: 0.0001\r\n"}
149
+ ,{"stream_name":"stdout","time":5386.743877359,"data":"Step 1130 | Training Loss: 0.2840\r\n"}
150
+ ,{"stream_name":"stdout","time":5434.351232105,"data":"Step 1140 | Training Loss: 0.0004\r\n"}
151
+ ,{"stream_name":"stdout","time":5482.258724548,"data":"Step 1150 | Training Loss: 0.0941\r\n"}
152
+ ,{"stream_name":"stdout","time":5529.907595833,"data":"Step 1160 | Training Loss: 0.0002\r\n"}
153
+ ,{"stream_name":"stdout","time":5577.66211,"data":"Step 1170 | Training Loss: 0.8995\r\n"}
154
+ ,{"stream_name":"stdout","time":5625.422534388,"data":"Step 1180 | Training Loss: 0.0001\r\n"}
155
+ ,{"stream_name":"stdout","time":5673.117784228,"data":"Step 1190 | Training Loss: 1.3411\r\n"}
156
+ ,{"stream_name":"stdout","time":5720.930187603,"data":"Step 1200 | Training Loss: 0.0003\r\n"}
157
+ ,{"stream_name":"stdout","time":5768.585581434,"data":"Step 1210 | Training Loss: 0.1949\r\n"}
158
+ ,{"stream_name":"stdout","time":5816.501288092,"data":"Step 1220 | Training Loss: 0.0004\r\n"}
159
+ ,{"stream_name":"stdout","time":5864.16233361,"data":"Step 1230 | Training Loss: 0.0691\r\n"}
160
+ ,{"stream_name":"stdout","time":5911.85040685,"data":"Step 1240 | Training Loss: 0.0007\r\n"}
161
+ ,{"stream_name":"stdout","time":5959.712574355,"data":"Step 1250 | Training Loss: 1.2051\r\n"}
162
+ ,{"stream_name":"stdout","time":6007.877709467,"data":"Step 1260 | Training Loss: 0.0004\r\n"}
163
+ ,{"stream_name":"stdout","time":6056.001992296,"data":"Step 1270 | Training Loss: 0.1555\r\n"}
164
+ ,{"stream_name":"stdout","time":6104.108405619,"data":"Step 1280 | Training Loss: 0.0006\r\n"}
165
+ ,{"stream_name":"stdout","time":6152.024082329,"data":"Step 1290 | Training Loss: 0.5580\r\n"}
166
+ ,{"stream_name":"stdout","time":6199.944481163,"data":"Step 1300 | Training Loss: 0.0001\r\n"}
167
+ ,{"stream_name":"stdout","time":6247.804684893,"data":"Step 1310 | Training Loss: 0.1162\r\n"}
168
+ ,{"stream_name":"stdout","time":6295.6129921,"data":"Step 1320 | Training Loss: 0.0001\r\n"}
169
+ ,{"stream_name":"stdout","time":6343.622646497,"data":"Step 1330 | Training Loss: 0.0001\r\n"}
170
+ ,{"stream_name":"stdout","time":6391.332560056,"data":"Step 1340 | Training Loss: 0.0004\r\n"}
171
+ ,{"stream_name":"stdout","time":6439.554346073,"data":"Step 1350 | Training Loss: 0.1885\r\n"}
172
+ ,{"stream_name":"stdout","time":6487.735699153,"data":"Step 1360 | Training Loss: 0.0002\r\n"}
173
+ ,{"stream_name":"stdout","time":6535.854168446,"data":"Step 1370 | Training Loss: 0.0299\r\n"}
174
+ ,{"stream_name":"stdout","time":6583.919860542,"data":"Step 1380 | Training Loss: 0.0000\r\n"}
175
+ ,{"stream_name":"stdout","time":6631.836710166,"data":"Step 1390 | Training Loss: 0.0988\r\n"}
176
+ ,{"stream_name":"stdout","time":6679.542831141,"data":"Step 1400 | Training Loss: 0.0003\r\n"}
177
+ ,{"stream_name":"stdout","time":6727.554324613,"data":"Step 1410 | Training Loss: 0.0201\r\n"}
178
+ ,{"stream_name":"stdout","time":6775.317800414,"data":"Step 1420 | Training Loss: 0.0001\r\n"}
179
+ ,{"stream_name":"stdout","time":6823.470334427,"data":"Step 1430 | Training Loss: 0.3336\r\n"}
180
+ ,{"stream_name":"stdout","time":6871.37692737,"data":"Step 1440 | Training Loss: 0.0002\r\n"}
181
+ ,{"stream_name":"stdout","time":6919.284300922,"data":"Step 1450 | Training Loss: 0.1233\r\n"}
182
+ ,{"stream_name":"stdout","time":6967.505978789,"data":"Step 1460 | Training Loss: 0.0001\r\n"}
183
+ ,{"stream_name":"stdout","time":7015.52657389,"data":"Step 1470 | Training Loss: 0.0186\r\n"}
184
+ ,{"stream_name":"stdout","time":7063.751781054,"data":"Step 1480 | Training Loss: 0.0000\r\n"}
185
+ ,{"stream_name":"stdout","time":7111.858047466,"data":"Step 1490 | Training Loss: 0.0482\r\n"}
186
+ ,{"stream_name":"stdout","time":7159.658458072,"data":"Step 1500 | Training Loss: 0.0001\r\n"}
187
+ ,{"stream_name":"stdout","time":7207.662492858,"data":"Step 1510 | Training Loss: 0.0006\r\n"}
188
+ ,{"stream_name":"stdout","time":7255.82932487,"data":"Step 1520 | Training Loss: 0.0001\r\n"}
189
+ ,{"stream_name":"stdout","time":7303.793077624,"data":"Step 1530 | Training Loss: 0.0243\r\n"}
190
+ ,{"stream_name":"stdout","time":7351.603975531,"data":"Step 1540 | Training Loss: 0.0001\r\n"}
191
+ ,{"stream_name":"stdout","time":7399.415345643,"data":"Step 1550 | Training Loss: 0.0009\r\n"}
192
+ ,{"stream_name":"stdout","time":7447.737379603,"data":"Step 1560 | Training Loss: 0.0002\r\n"}
193
+ ,{"stream_name":"stdout","time":7496.112652686,"data":"Step 1570 | Training Loss: 0.0005\r\n"}
194
+ ,{"stream_name":"stdout","time":7544.327514788,"data":"Step 1580 | Training Loss: 0.0002\r\n"}
195
+ ,{"stream_name":"stdout","time":7592.64285207,"data":"Step 1590 | Training Loss: 0.0783\r\n"}
196
+ ,{"stream_name":"stdout","time":7640.553459067,"data":"Step 1600 | Training Loss: 0.0001\r\n"}
197
+ ,{"stream_name":"stdout","time":7688.76009634,"data":"Step 1610 | Training Loss: 0.0009\r\n"}
198
+ ,{"stream_name":"stdout","time":7736.772224448,"data":"Step 1620 | Training Loss: 0.0002\r\n"}
199
+ ,{"stream_name":"stdout","time":7784.881637017,"data":"Step 1630 | Training Loss: 0.0460\r\n"}
200
+ ,{"stream_name":"stdout","time":7832.939793228,"data":"Step 1640 | Training Loss: 0.0001\r\n"}
201
+ ,{"stream_name":"stdout","time":7881.358438893,"data":"Step 1650 | Training Loss: 0.2914\r\n"}
202
+ ,{"stream_name":"stdout","time":7929.568153147,"data":"Step 1660 | Training Loss: 0.0000\r\n"}
203
+ ,{"stream_name":"stdout","time":7977.723974606,"data":"Step 1670 | Training Loss: 0.1425\r\n"}
204
+ ,{"stream_name":"stdout","time":8025.832625249,"data":"Step 1680 | Training Loss: 0.0001\r\n"}
205
+ ,{"stream_name":"stdout","time":8074.193793917,"data":"Step 1690 | Training Loss: 0.0000\r\n"}
206
+ ,{"stream_name":"stdout","time":8122.656383952,"data":"Step 1700 | Training Loss: 0.0001\r\n"}
207
+ ,{"stream_name":"stdout","time":8170.798601539,"data":"Step 1710 | Training Loss: 0.0477\r\n"}
208
+ ,{"stream_name":"stdout","time":8219.166304341,"data":"Step 1720 | Training Loss: 0.0000\r\n"}
209
+ ,{"stream_name":"stdout","time":8267.580624377,"data":"Step 1730 | Training Loss: 0.0257\r\n"}
210
+ ,{"stream_name":"stdout","time":8315.680862001,"data":"Step 1740 | Training Loss: 0.0004\r\n"}
211
+ ,{"stream_name":"stdout","time":8363.734931554,"data":"Step 1750 | Training Loss: 0.2631\r\n"}
212
+ ,{"stream_name":"stdout","time":8411.94452979,"data":"Step 1760 | Training Loss: 0.0004\r\n"}
213
+ ,{"stream_name":"stdout","time":8459.888270087,"data":"Step 1770 | Training Loss: 0.0791\r\n"}
214
+ ,{"stream_name":"stdout","time":8508.609884681,"data":"Step 1780 | Training Loss: 0.0002\r\n"}
215
+ ,{"stream_name":"stdout","time":8556.720874265,"data":"Step 1790 | Training Loss: 0.0384\r\n"}
216
+ ,{"stream_name":"stdout","time":8604.982741838,"data":"Step 1800 | Training Loss: 0.0001\r\n"}
217
+ ,{"stream_name":"stdout","time":8653.395544701,"data":"Step 1810 | Training Loss: 0.1196\r\n"}
218
+ ,{"stream_name":"stdout","time":8702.023179721,"data":"Step 1820 | Training Loss: 0.0001\r\n"}
219
+ ,{"stream_name":"stdout","time":8750.530975787,"data":"Step 1830 | Training Loss: 0.0466\r\n"}
220
+ ,{"stream_name":"stdout","time":8799.059151276,"data":"Step 1840 | Training Loss: 0.0001\r\n"}
221
+ ,{"stream_name":"stdout","time":8847.57612919,"data":"Step 1850 | Training Loss: 0.0519\r\n"}
222
+ ,{"stream_name":"stdout","time":8895.636976668,"data":"Step 1860 | Training Loss: 0.0001\r\n"}
223
+ ,{"stream_name":"stdout","time":8944.050300751,"data":"Step 1870 | Training Loss: 0.0227\r\n"}
224
+ ,{"stream_name":"stdout","time":8992.101234764,"data":"Step 1880 | Training Loss: 0.0002\r\n"}
225
+ ,{"stream_name":"stdout","time":9040.325735749,"data":"Step 1890 | Training Loss: 0.0516\r\n"}
226
+ ,{"stream_name":"stdout","time":9088.63660473,"data":"Step 1900 | Training Loss: 0.0000\r\n"}
227
+ ,{"stream_name":"stdout","time":9137.415846989,"data":"Step 1910 | Training Loss: 0.0068\r\n"}
228
+ ,{"stream_name":"stdout","time":9185.728635067,"data":"Step 1920 | Training Loss: 0.0001\r\n"}
229
+ ,{"stream_name":"stdout","time":9234.875243625,"data":"Step 1930 | Training Loss: 0.2023\r\n"}
230
+ ,{"stream_name":"stdout","time":9283.141236508,"data":"Step 1940 | Training Loss: 0.0001\r\n"}
231
+ ,{"stream_name":"stdout","time":9331.5436015,"data":"Step 1950 | Training Loss: 0.0738\r\n"}
232
+ ,{"stream_name":"stdout","time":9380.160375661,"data":"Step 1960 | Training Loss: 0.0001\r\n"}
233
+ ,{"stream_name":"stdout","time":9428.899574607,"data":"Step 1970 | Training Loss: 0.0005\r\n"}
234
+ ,{"stream_name":"stdout","time":9477.26809798,"data":"Step 1980 | Training Loss: 0.0001\r\n"}
235
+ ,{"stream_name":"stdout","time":9525.718275429,"data":"Step 1990 | Training Loss: 0.0065\r\n"}
236
+ ,{"stream_name":"stdout","time":9574.233678476,"data":"Step 2000 | Training Loss: 0.0002\r\n"}
237
+ ,{"stream_name":"stdout","time":9631.060675239,"data":"Step 2010 | Training Loss: 0.0009\r\n"}
238
+ ,{"stream_name":"stdout","time":9679.688411793,"data":"Step 2020 | Training Loss: 0.0001\r\n"}
239
+ ,{"stream_name":"stdout","time":9728.156635587,"data":"Step 2030 | Training Loss: 0.0011\r\n"}
240
+ ,{"stream_name":"stdout","time":9776.589138239,"data":"Step 2040 | Training Loss: 0.0001\r\n"}
241
+ ,{"stream_name":"stdout","time":9824.955186704,"data":"Step 2050 | Training Loss: 0.5175\r\n"}
242
+ ,{"stream_name":"stdout","time":9873.285441773,"data":"Step 2060 | Training Loss: 0.0002\r\n"}
243
+ ,{"stream_name":"stdout","time":9921.502797565,"data":"Step 2070 | Training Loss: 0.2302\r\n"}
244
+ ,{"stream_name":"stdout","time":9969.553685845,"data":"Step 2080 | Training Loss: 0.0000\r\n"}
245
+ ,{"stream_name":"stdout","time":10017.673397318,"data":"Step 2090 | Training Loss: 0.1011\r\n"}
246
+ ,{"stream_name":"stdout","time":10065.680193595,"data":"Step 2100 | Training Loss: 0.0001\r\n"}
247
+ ,{"stream_name":"stdout","time":10113.677285646,"data":"Step 2110 | Training Loss: 0.0005\r\n"}
248
+ ,{"stream_name":"stdout","time":10161.317414601,"data":"Step 2120 | Training Loss: 0.0002\r\n"}
249
+ ,{"stream_name":"stdout","time":10209.068643255,"data":"Step 2130 | Training Loss: 0.0274\r\n"}
250
+ ,{"stream_name":"stdout","time":10257.180170176,"data":"Step 2140 | Training Loss: 0.0001\r\n"}
251
+ ,{"stream_name":"stdout","time":10305.03358383,"data":"Step 2150 | Training Loss: 0.0000\r\n"}
252
+ ,{"stream_name":"stdout","time":10353.205568729,"data":"Step 2160 | Training Loss: 0.0000\r\n"}
253
+ ,{"stream_name":"stdout","time":10400.902835464,"data":"Step 2170 | Training Loss: 0.0155\r\n"}
254
+ ,{"stream_name":"stdout","time":10448.654623348,"data":"Step 2180 | Training Loss: 0.0000\r\n"}
255
+ ,{"stream_name":"stdout","time":10496.66403938,"data":"Step 2190 | Training Loss: 0.0015\r\n"}
256
+ ,{"stream_name":"stdout","time":10544.502190693,"data":"Step 2200 | Training Loss: 0.0002\r\n"}
257
+ ,{"stream_name":"stdout","time":10592.609885182,"data":"Step 2210 | Training Loss: 0.0175\r\n"}
258
+ ,{"stream_name":"stdout","time":10640.561500595,"data":"Step 2220 | Training Loss: 0.0001\r\n"}
259
+ ,{"stream_name":"stdout","time":10688.669171201,"data":"Step 2230 | Training Loss: 0.0258\r\n"}
260
+ ,{"stream_name":"stdout","time":10736.734963731,"data":"Step 2240 | Training Loss: 0.0001\r\n"}
261
+ ,{"stream_name":"stdout","time":10784.855844654,"data":"Step 2250 | Training Loss: 0.0509\r\n"}
262
+ ,{"stream_name":"stdout","time":10833.074204572,"data":"Step 2260 | Training Loss: 0.0001\r\n"}
263
+ ,{"stream_name":"stdout","time":10881.284583119,"data":"Step 2270 | Training Loss: 0.2298\r\n"}
264
+ ,{"stream_name":"stdout","time":10929.49513228,"data":"Step 2280 | Training Loss: 0.0000\r\n"}
265
+ ,{"stream_name":"stdout","time":10978.233622818,"data":"Step 2290 | Training Loss: 0.0509\r\n"}
266
+ ,{"stream_name":"stdout","time":11026.807980761,"data":"Step 2300 | Training Loss: 0.0001\r\n"}
267
+ ,{"stream_name":"stdout","time":11075.070673196,"data":"Step 2310 | Training Loss: 0.0892\r\n"}
268
+ ,{"stream_name":"stdout","time":11123.386057763,"data":"Step 2320 | Training Loss: 0.0000\r\n"}
269
+ ,{"stream_name":"stdout","time":11171.482455969,"data":"Step 2330 | Training Loss: 0.0255\r\n"}
270
+ ,{"stream_name":"stdout","time":11219.633084586,"data":"Step 2340 | Training Loss: 0.0001\r\n"}
271
+ ,{"stream_name":"stdout","time":11267.795767782,"data":"Step 2350 | Training Loss: 0.1951\r\n"}
272
+ ,{"stream_name":"stdout","time":11316.052288864,"data":"Step 2360 | Training Loss: 0.0000\r\n"}
273
+ ,{"stream_name":"stdout","time":11364.102817969,"data":"Step 2370 | Training Loss: 0.0458\r\n"}
274
+ ,{"stream_name":"stdout","time":11412.314451529,"data":"Step 2380 | Training Loss: 0.0002\r\n"}
275
+ ,{"stream_name":"stdout","time":11460.519832412,"data":"Step 2390 | Training Loss: 0.6189\r\n"}
276
+ ,{"stream_name":"stdout","time":11508.72580332,"data":"Step 2400 | Training Loss: 0.0001\r\n"}
277
+ ,{"stream_name":"stdout","time":11556.975910021,"data":"Step 2410 | Training Loss: 0.0008\r\n"}
278
+ ,{"stream_name":"stdout","time":11605.077335173,"data":"Step 2420 | Training Loss: 0.0002\r\n"}
279
+ ,{"stream_name":"stdout","time":11653.32268846,"data":"Step 2430 | Training Loss: 0.1199\r\n"}
280
+ ,{"stream_name":"stdout","time":11701.378311921,"data":"Step 2440 | Training Loss: 0.0000\r\n"}
281
+ ,{"stream_name":"stdout","time":11749.4245447,"data":"Step 2450 | Training Loss: 0.1395\r\n"}
282
+ ,{"stream_name":"stdout","time":11797.732336407,"data":"Step 2460 | Training Loss: 0.0000\r\n"}
283
+ ,{"stream_name":"stdout","time":11845.622837549,"data":"Step 2470 | Training Loss: 0.5506\r\n"}
284
+ ,{"stream_name":"stdout","time":11893.826706321,"data":"Step 2480 | Training Loss: 0.0002\r\n"}
285
+ ,{"stream_name":"stdout","time":11941.981597797,"data":"Step 2490 | Training Loss: 0.3704\r\n"}
286
+ ,{"stream_name":"stdout","time":11990.286572307,"data":"Step 2500 | Training Loss: 0.0002\r\n"}
287
+ ,{"stream_name":"stdout","time":12038.439453397,"data":"Step 2510 | Training Loss: 0.0844\r\n"}
288
+ ,{"stream_name":"stdout","time":12086.791797578,"data":"Step 2520 | Training Loss: 0.0000\r\n"}
289
+ ,{"stream_name":"stdout","time":12134.89266628,"data":"Step 2530 | Training Loss: 0.8372\r\n"}
290
+ ,{"stream_name":"stdout","time":12183.256479484,"data":"Step 2540 | Training Loss: 0.0001\r\n"}
291
+ ,{"stream_name":"stdout","time":12231.507832762,"data":"Step 2550 | Training Loss: 0.1077\r\n"}
292
+ ,{"stream_name":"stdout","time":12279.705800937,"data":"Step 2560 | Training Loss: 0.0000\r\n"}
293
+ ,{"stream_name":"stdout","time":12327.913891106,"data":"Step 2570 | Training Loss: 0.0242\r\n"}
294
+ ,{"stream_name":"stdout","time":12375.972268516,"data":"Step 2580 | Training Loss: 0.0001\r\n"}
295
+ ,{"stream_name":"stdout","time":12424.334068139,"data":"Step 2590 | Training Loss: 0.2288\r\n"}
296
+ ,{"stream_name":"stdout","time":12472.587447912,"data":"Step 2600 | Training Loss: 0.0002\r\n"}
297
+ ,{"stream_name":"stdout","time":12520.797256758,"data":"Step 2610 | Training Loss: 0.0235\r\n"}
298
+ ,{"stream_name":"stdout","time":12569.093494229,"data":"Step 2620 | Training Loss: 0.0000\r\n"}
299
+ ,{"stream_name":"stdout","time":12617.198884574,"data":"Step 2630 | Training Loss: 0.0002\r\n"}
300
+ ,{"stream_name":"stdout","time":12665.093328592,"data":"Step 2640 | Training Loss: 0.0002\r\n"}
301
+ ,{"stream_name":"stdout","time":12713.408533417,"data":"Step 2650 | Training Loss: 0.5299\r\n"}
302
+ ,{"stream_name":"stdout","time":12761.767827104,"data":"Step 2660 | Training Loss: 0.0001\r\n"}
303
+ ,{"stream_name":"stdout","time":12809.865067521,"data":"Step 2670 | Training Loss: 0.0136\r\n"}
304
+ ,{"stream_name":"stdout","time":12858.230243978,"data":"Step 2680 | Training Loss: 0.0000\r\n"}
305
+ ,{"stream_name":"stdout","time":12906.33074439,"data":"Step 2690 | Training Loss: 0.0008\r\n"}
306
+ ,{"stream_name":"stdout","time":12954.47409079,"data":"Step 2700 | Training Loss: 0.0000\r\n"}
307
+ ,{"stream_name":"stdout","time":13002.676907936,"data":"Step 2710 | Training Loss: 0.0003\r\n"}
308
+ ,{"stream_name":"stdout","time":13050.568219472,"data":"Step 2720 | Training Loss: 0.0000\r\n"}
309
+ ,{"stream_name":"stdout","time":13098.831028953,"data":"Step 2730 | Training Loss: 0.0721\r\n"}
310
+ ,{"stream_name":"stdout","time":13146.980227144,"data":"Step 2740 | Training Loss: 0.0001\r\n"}
311
+ ,{"stream_name":"stdout","time":13195.180417814,"data":"Step 2750 | Training Loss: 0.0136\r\n"}
312
+ ,{"stream_name":"stdout","time":13243.341285046,"data":"Step 2760 | Training Loss: 0.0002\r\n"}
313
+ ,{"stream_name":"stdout","time":13291.541751866,"data":"Step 2770 | Training Loss: 0.0116\r\n"}
314
+ ,{"stream_name":"stdout","time":13339.744655223,"data":"Step 2780 | Training Loss: 0.0000\r\n"}
315
+ ,{"stream_name":"stdout","time":13388.153938032,"data":"Step 2790 | Training Loss: 0.1945\r\n"}
316
+ ,{"stream_name":"stdout","time":13436.357333285,"data":"Step 2800 | Training Loss: 0.0000\r\n"}
317
+ ,{"stream_name":"stdout","time":13484.518706384,"data":"Step 2810 | Training Loss: 0.0251\r\n"}
318
+ ,{"stream_name":"stdout","time":13532.880920581,"data":"Step 2820 | Training Loss: 0.0000\r\n"}
319
+ ,{"stream_name":"stdout","time":13581.084895869,"data":"Step 2830 | Training Loss: 0.1457\r\n"}
320
+ ,{"stream_name":"stdout","time":13629.844984614,"data":"Step 2840 | Training Loss: 0.0000\r\n"}
321
+ ,{"stream_name":"stdout","time":13678.414779744,"data":"Step 2850 | Training Loss: 0.9311\r\n"}
322
+ ,{"stream_name":"stdout","time":13726.833249909,"data":"Step 2860 | Training Loss: 0.0001\r\n"}
323
+ ,{"stream_name":"stdout","time":13775.434843976,"data":"Step 2870 | Training Loss: 0.1772\r\n"}
324
+ ,{"stream_name":"stdout","time":13823.949314154,"data":"Step 2880 | Training Loss: 0.0000\r\n"}
325
+ ,{"stream_name":"stdout","time":13872.686423709,"data":"Step 2890 | Training Loss: 0.0556\r\n"}
326
+ ,{"stream_name":"stdout","time":13921.081973969,"data":"Step 2900 | Training Loss: 0.0000\r\n"}
327
+ ,{"stream_name":"stdout","time":13969.797750806,"data":"Step 2910 | Training Loss: 0.0123\r\n"}
328
+ ,{"stream_name":"stdout","time":14018.145630421,"data":"Step 2920 | Training Loss: 0.0001\r\n"}
329
+ ,{"stream_name":"stdout","time":14066.612400506,"data":"Step 2930 | Training Loss: 0.0005\r\n"}
330
+ ,{"stream_name":"stdout","time":14115.374993398,"data":"Step 2940 | Training Loss: 0.0000\r\n"}
331
+ ,{"stream_name":"stdout","time":14163.785569929,"data":"Step 2950 | Training Loss: 0.0004\r\n"}
332
+ ,{"stream_name":"stdout","time":14212.617086811,"data":"Step 2960 | Training Loss: 0.0000\r\n"}
333
+ ,{"stream_name":"stdout","time":14260.919626194,"data":"Step 2970 | Training Loss: 0.0006\r\n"}
334
+ ,{"stream_name":"stdout","time":14309.119366511,"data":"Step 2980 | Training Loss: 0.0000\r\n"}
335
+ ,{"stream_name":"stdout","time":14357.726380415,"data":"Step 2990 | Training Loss: 0.0021\r\n"}
336
+ ,{"stream_name":"stdout","time":14406.384222748,"data":"Step 3000 | Training Loss: 0.0001\r\n"}
337
+ ,{"stream_name":"stdout","time":14463.202067486,"data":"Step 3010 | Training Loss: 0.0278\r\n"}
338
+ ,{"stream_name":"stdout","time":14511.763538442,"data":"Step 3020 | Training Loss: 0.0000\r\n"}
339
+ ,{"stream_name":"stdout","time":14560.37498688,"data":"Step 3030 | Training Loss: 0.4339\r\n"}
340
+ ,{"stream_name":"stdout","time":14608.886303543,"data":"Step 3040 | Training Loss: 0.0000\r\n"}
341
+ ,{"stream_name":"stdout","time":14657.525612971,"data":"Step 3050 | Training Loss: 0.5201\r\n"}
342
+ ,{"stream_name":"stdout","time":14706.191179102,"data":"Step 3060 | Training Loss: 0.0003\r\n"}
343
+ ,{"stream_name":"stdout","time":14754.901196453,"data":"Step 3070 | Training Loss: 0.0404\r\n"}
344
+ ,{"stream_name":"stdout","time":14803.813412315,"data":"Step 3080 | Training Loss: 0.0000\r\n"}
345
+ ,{"stream_name":"stdout","time":14852.615926466,"data":"Step 3090 | Training Loss: 0.0507\r\n"}
346
+ ,{"stream_name":"stdout","time":14900.970057318,"data":"Step 3100 | Training Loss: 0.0000\r\n"}
347
+ ,{"stream_name":"stdout","time":14949.385769626,"data":"Step 3110 | Training Loss: 0.0233\r\n"}
348
+ ,{"stream_name":"stdout","time":14998.256273312,"data":"Step 3120 | Training Loss: 0.0000\r\n"}
349
+ ,{"stream_name":"stdout","time":15047.07561482,"data":"Step 3130 | Training Loss: 0.0000\r\n"}
350
+ ,{"stream_name":"stdout","time":15095.633112375,"data":"Step 3140 | Training Loss: 0.0000\r\n"}
351
+ ,{"stream_name":"stdout","time":15144.292614945,"data":"Step 3150 | Training Loss: 0.1742\r\n"}
352
+ ,{"stream_name":"stdout","time":15193.053175799,"data":"Step 3160 | Training Loss: 0.0000\r\n"}
353
+ ,{"stream_name":"stdout","time":15241.611262561,"data":"Step 3170 | Training Loss: 0.0041\r\n"}
354
+ ,{"stream_name":"stdout","time":15290.581339617,"data":"Step 3180 | Training Loss: 0.0000\r\n"}
355
+ ,{"stream_name":"stdout","time":15339.543645267,"data":"Step 3190 | Training Loss: 0.0227\r\n"}
356
+ ,{"stream_name":"stdout","time":15388.31388081,"data":"Step 3200 | Training Loss: 0.0000\r\n"}
357
+ ,{"stream_name":"stdout","time":15437.119334513,"data":"Step 3210 | Training Loss: 0.0346\r\n"}
358
+ ,{"stream_name":"stdout","time":15485.627486757,"data":"Step 3220 | Training Loss: 0.0002\r\n"}
359
+ ,{"stream_name":"stdout","time":15534.488478314,"data":"Step 3230 | Training Loss: 0.0016\r\n"}
360
+ ,{"stream_name":"stdout","time":15583.155939828,"data":"Step 3240 | Training Loss: 0.0000\r\n"}
361
+ ,{"stream_name":"stdout","time":15631.59462484,"data":"Step 3250 | Training Loss: 0.1968\r\n"}
362
+ ,{"stream_name":"stdout","time":15679.937384263,"data":"Step 3260 | Training Loss: 0.0000\r\n"}
363
+ ,{"stream_name":"stdout","time":15728.533847415,"data":"Step 3270 | Training Loss: 0.0158\r\n"}
364
+ ,{"stream_name":"stdout","time":15777.087533043,"data":"Step 3280 | Training Loss: 0.0001\r\n"}
365
+ ,{"stream_name":"stdout","time":15825.378473935,"data":"Step 3290 | Training Loss: 0.1573\r\n"}
366
+ ,{"stream_name":"stdout","time":15873.685010863,"data":"Step 3300 | Training Loss: 0.0000\r\n"}
367
+ ,{"stream_name":"stdout","time":15922.186115432,"data":"Step 3310 | Training Loss: 0.0178\r\n"}
368
+ ,{"stream_name":"stdout","time":15970.693688866,"data":"Step 3320 | Training Loss: 0.0000\r\n"}
369
+ ,{"stream_name":"stdout","time":16019.297793642,"data":"Step 3330 | Training Loss: 0.0231\r\n"}
370
+ ,{"stream_name":"stdout","time":16067.803931815,"data":"Step 3340 | Training Loss: 0.0003\r\n"}
371
+ ,{"stream_name":"stdout","time":16116.357650631,"data":"Step 3350 | Training Loss: 0.0002\r\n"}
372
+ ,{"stream_name":"stdout","time":16164.759289263,"data":"Step 3360 | Training Loss: 0.0000\r\n"}
373
+ ,{"stream_name":"stdout","time":16213.322592326,"data":"Step 3370 | Training Loss: 0.0724\r\n"}
374
+ ,{"stream_name":"stdout","time":16262.092514709,"data":"Step 3380 | Training Loss: 0.0000\r\n"}
375
+ ,{"stream_name":"stdout","time":16310.709926205,"data":"Step 3390 | Training Loss: 0.0320\r\n"}
376
+ ,{"stream_name":"stdout","time":16359.507674907,"data":"Step 3400 | Training Loss: 0.0001\r\n"}
377
+ ,{"stream_name":"stdout","time":16408.525705942,"data":"Step 3410 | Training Loss: 0.0659\r\n"}
378
+ ,{"stream_name":"stdout","time":16457.345597751,"data":"Step 3420 | Training Loss: 0.0001\r\n"}
379
+ ,{"stream_name":"stdout","time":16505.856659197,"data":"Step 3430 | Training Loss: 0.0882\r\n"}
380
+ ,{"stream_name":"stdout","time":16554.629038762,"data":"Step 3440 | Training Loss: 0.0001\r\n"}
381
+ ,{"stream_name":"stdout","time":16603.292428455,"data":"Step 3450 | Training Loss: 0.0303\r\n"}
382
+ ,{"stream_name":"stdout","time":16651.682411049,"data":"Step 3460 | Training Loss: 0.0001\r\n"}
383
+ ,{"stream_name":"stdout","time":16699.783609036,"data":"Step 3470 | Training Loss: 0.0018\r\n"}
384
+ ,{"stream_name":"stdout","time":16748.287659474,"data":"Step 3480 | Training Loss: 0.0000\r\n"}
385
+ ,{"stream_name":"stdout","time":16796.640797618,"data":"Step 3490 | Training Loss: 0.2110\r\n"}
386
+ ,{"stream_name":"stdout","time":16844.938239026,"data":"Step 3500 | Training Loss: 0.0000\r\n"}
387
+ ,{"stream_name":"stdout","time":16893.556886133,"data":"Step 3510 | Training Loss: 0.4730\r\n"}
388
+ ,{"stream_name":"stdout","time":16942.176207499,"data":"Step 3520 | Training Loss: 0.0000\r\n"}
389
+ ,{"stream_name":"stdout","time":16990.378816435,"data":"Step 3530 | Training Loss: 0.0251\r\n"}
390
+ ,{"stream_name":"stdout","time":17038.68337353,"data":"Step 3540 | Training Loss: 0.0000\r\n"}
391
+ ,{"stream_name":"stdout","time":17087.234967293,"data":"Step 3550 | Training Loss: 0.0082\r\n"}
392
+ ,{"stream_name":"stdout","time":17135.492342606,"data":"Step 3560 | Training Loss: 0.0001\r\n"}
393
+ ,{"stream_name":"stdout","time":17183.947815938,"data":"Step 3570 | Training Loss: 0.0221\r\n"}
394
+ ,{"stream_name":"stdout","time":17232.61139734,"data":"Step 3580 | Training Loss: 0.0011\r\n"}
395
+ ,{"stream_name":"stdout","time":17281.112377796,"data":"Step 3590 | Training Loss: 0.0272\r\n"}
396
+ ,{"stream_name":"stdout","time":17329.56771638,"data":"Step 3600 | Training Loss: 0.0000\r\n"}
397
+ ,{"stream_name":"stdout","time":17378.231160135,"data":"Step 3610 | Training Loss: 0.0345\r\n"}
398
+ ,{"stream_name":"stdout","time":17426.904472048,"data":"Step 3620 | Training Loss: 0.0004\r\n"}
399
+ ,{"stream_name":"stdout","time":17475.632677823,"data":"Step 3630 | Training Loss: 0.0056\r\n"}
400
+ ,{"stream_name":"stdout","time":17524.547406387,"data":"Step 3640 | Training Loss: 0.0001\r\n"}
401
+ ,{"stream_name":"stdout","time":17573.467743804,"data":"Step 3650 | Training Loss: 0.0252\r\n"}
402
+ ,{"stream_name":"stdout","time":17622.181002064,"data":"Step 3660 | Training Loss: 0.0000\r\n"}
403
+ ,{"stream_name":"stdout","time":17670.583890813,"data":"Step 3670 | Training Loss: 0.0409\r\n"}
404
+ ,{"stream_name":"stdout","time":17718.990510163,"data":"Step 3680 | Training Loss: 0.0000\r\n"}
405
+ ,{"stream_name":"stdout","time":17767.439034654,"data":"Step 3690 | Training Loss: 2.0241\r\n"}
406
+ ,{"stream_name":"stdout","time":17815.987954397,"data":"Step 3700 | Training Loss: 0.0000\r\n"}
407
+ ,{"stream_name":"stdout","time":17864.595691869,"data":"Step 3710 | Training Loss: 0.0006\r\n"}
408
+ ,{"stream_name":"stdout","time":17913.259888888,"data":"Step 3720 | Training Loss: 0.0000\r\n"}
409
+ ,{"stream_name":"stdout","time":17961.663746001,"data":"Step 3730 | Training Loss: 0.6780\r\n"}
410
+ ,{"stream_name":"stdout","time":18010.105099133,"data":"Step 3740 | Training Loss: 0.0001\r\n"}
411
+ ,{"stream_name":"stdout","time":18058.511524086,"data":"Step 3750 | Training Loss: 0.1512\r\n"}
412
+ ,{"stream_name":"stdout","time":18106.859574272,"data":"Step 3760 | Training Loss: 0.0000\r\n"}
413
+ ,{"stream_name":"stdout","time":18155.253863703,"data":"Step 3770 | Training Loss: 0.4934\r\n"}
414
+ ,{"stream_name":"stdout","time":18203.626237196,"data":"Step 3780 | Training Loss: 0.0001\r\n"}
415
+ ,{"stream_name":"stdout","time":18252.381601219,"data":"Step 3790 | Training Loss: 0.0246\r\n"}
416
+ ,{"stream_name":"stdout","time":18300.687943228,"data":"Step 3800 | Training Loss: 0.0000\r\n"}
417
+ ,{"stream_name":"stdout","time":18349.149597588,"data":"Step 3810 | Training Loss: 0.0523\r\n"}
418
+ ,{"stream_name":"stdout","time":18397.70907493,"data":"Step 3820 | Training Loss: 0.0000\r\n"}
419
+ ,{"stream_name":"stdout","time":18446.355006575,"data":"Step 3830 | Training Loss: 0.0001\r\n"}
420
+ ,{"stream_name":"stdout","time":18494.818943225,"data":"Step 3840 | Training Loss: 0.0000\r\n"}
421
+ ,{"stream_name":"stdout","time":18543.633929143,"data":"Step 3850 | Training Loss: 0.0863\r\n"}
422
+ ,{"stream_name":"stdout","time":18592.023729011,"data":"Step 3860 | Training Loss: 0.0002\r\n"}
423
+ ,{"stream_name":"stdout","time":18640.800434455,"data":"Step 3870 | Training Loss: 0.0430\r\n"}
424
+ ,{"stream_name":"stdout","time":18689.610009973,"data":"Step 3880 | Training Loss: 0.0002\r\n"}
425
+ ,{"stream_name":"stdout","time":18738.563947461,"data":"Step 3890 | Training Loss: 0.0335\r\n"}
426
+ ,{"stream_name":"stdout","time":18787.631521217,"data":"Step 3900 | Training Loss: 0.0005\r\n"}
427
+ ,{"stream_name":"stdout","time":18836.190670585,"data":"Step 3910 | Training Loss: 0.0301\r\n"}
428
+ ,{"stream_name":"stdout","time":18884.700433541,"data":"Step 3920 | Training Loss: 0.0000\r\n"}
429
+ ,{"stream_name":"stdout","time":18933.352551205,"data":"Step 3930 | Training Loss: 0.0009\r\n"}
430
+ ,{"stream_name":"stdout","time":18981.761953163,"data":"Step 3940 | Training Loss: 0.0000\r\n"}
431
+ ,{"stream_name":"stdout","time":19030.275188847,"data":"Step 3950 | Training Loss: 0.0433\r\n"}
432
+ ,{"stream_name":"stdout","time":19079.138489376,"data":"Step 3960 | Training Loss: 0.0000\r\n"}
433
+ ,{"stream_name":"stdout","time":19127.95942962,"data":"Step 3970 | Training Loss: 0.0300\r\n"}
434
+ ,{"stream_name":"stdout","time":19176.813602528,"data":"Step 3980 | Training Loss: 0.0000\r\n"}
435
+ ,{"stream_name":"stdout","time":19225.381677137,"data":"Step 3990 | Training Loss: 0.3666\r\n"}
436
+ ,{"stream_name":"stdout","time":19274.033597687,"data":"Step 4000 | Training Loss: 0.0000\r\n"}
437
+ ,{"stream_name":"stdout","time":19330.370327348,"data":"Step 4010 | Training Loss: 0.0014\r\n"}
438
+ ,{"stream_name":"stdout","time":19379.187591185,"data":"Step 4020 | Training Loss: 0.0000\r\n"}
439
+ ,{"stream_name":"stdout","time":19428.03854059,"data":"Step 4030 | Training Loss: 0.0352\r\n"}
440
+ ,{"stream_name":"stdout","time":19476.858693396,"data":"Step 4040 | Training Loss: 0.0000\r\n"}
441
+ ,{"stream_name":"stdout","time":19525.556636186,"data":"Step 4050 | Training Loss: 0.0020\r\n"}
442
+ ,{"stream_name":"stdout","time":19574.156651536,"data":"Step 4060 | Training Loss: 0.0000\r\n"}
443
+ ,{"stream_name":"stdout","time":19623.019614237,"data":"Step 4070 | Training Loss: 0.4625\r\n"}
444
+ ,{"stream_name":"stdout","time":19671.77280065,"data":"Step 4080 | Training Loss: 0.0000\r\n"}
445
+ ,{"stream_name":"stdout","time":19720.323045235,"data":"Step 4090 | Training Loss: 0.3424\r\n"}
446
+ ,{"stream_name":"stdout","time":19769.090285045,"data":"Step 4100 | Training Loss: 0.0000\r\n"}
447
+ ,{"stream_name":"stdout","time":19817.84300973,"data":"Step 4110 | Training Loss: 0.0007\r\n"}
448
+ ,{"stream_name":"stdout","time":19866.453379974,"data":"Step 4120 | Training Loss: 0.0000\r\n"}
449
+ ,{"stream_name":"stdout","time":19915.362297406,"data":"Step 4130 | Training Loss: 0.2989\r\n"}
450
+ ,{"stream_name":"stdout","time":19964.285596167,"data":"Step 4140 | Training Loss: 0.0003\r\n"}
451
+ ,{"stream_name":"stdout","time":20013.242861472,"data":"Step 4150 | Training Loss: 0.0118\r\n"}
452
+ ,{"stream_name":"stdout","time":20061.893159716,"data":"Step 4160 | Training Loss: 0.0001\r\n"}
453
+ ,{"stream_name":"stdout","time":20110.38751996,"data":"Step 4170 | Training Loss: 0.0295\r\n"}
454
+ ,{"stream_name":"stdout","time":20158.940734733,"data":"Step 4180 | Training Loss: 0.0000\r\n"}
455
+ ,{"stream_name":"stdout","time":20207.34005143,"data":"Step 4190 | Training Loss: 1.0331\r\n"}
456
+ ,{"stream_name":"stdout","time":20255.476342927,"data":"Step 4200 | Training Loss: 0.0000\r\n"}
457
+ ,{"stream_name":"stdout","time":20304.182768136,"data":"Step 4210 | Training Loss: 0.0557\r\n"}
458
+ ,{"stream_name":"stdout","time":20353.063785969,"data":"Step 4220 | Training Loss: 0.0001\r\n"}
459
+ ,{"stream_name":"stdout","time":20401.928579941,"data":"Step 4230 | Training Loss: 0.7705\r\n"}
460
+ ,{"stream_name":"stdout","time":20450.695908392,"data":"Step 4240 | Training Loss: 0.0005\r\n"}
461
+ ,{"stream_name":"stdout","time":20499.807874689,"data":"Step 4250 | Training Loss: 0.0817\r\n"}
462
+ ,{"stream_name":"stdout","time":20548.925492079,"data":"Step 4260 | Training Loss: 0.0001\r\n"}
463
+ ,{"stream_name":"stdout","time":20597.631906868,"data":"Step 4270 | Training Loss: 0.0491\r\n"}
464
+ ,{"stream_name":"stdout","time":20646.238855144,"data":"Step 4280 | Training Loss: 0.0000\r\n"}
465
+ ,{"stream_name":"stdout","time":20695.051173656,"data":"Step 4290 | Training Loss: 0.3100\r\n"}
466
+ ,{"stream_name":"stdout","time":20743.857149025,"data":"Step 4300 | Training Loss: 0.0000\r\n"}
467
+ ,{"stream_name":"stdout","time":20792.416636927,"data":"Step 4310 | Training Loss: 0.3296\r\n"}
468
+ ,{"stream_name":"stdout","time":20840.806554702,"data":"Step 4320 | Training Loss: 0.0000\r\n"}
469
+ ,{"stream_name":"stdout","time":20889.370268489,"data":"Step 4330 | Training Loss: 0.8703\r\n"}
470
+ ,{"stream_name":"stdout","time":20937.975335165,"data":"Step 4340 | Training Loss: 0.0000\r\n"}
471
+ ,{"stream_name":"stdout","time":20986.373670474,"data":"Step 4350 | Training Loss: 0.0738\r\n"}
472
+ ,{"stream_name":"stdout","time":21034.714888468,"data":"Step 4360 | Training Loss: 0.0000\r\n"}
473
+ ,{"stream_name":"stdout","time":21083.321178741,"data":"Step 4370 | Training Loss: 0.0085\r\n"}
474
+ ,{"stream_name":"stdout","time":21131.91588219,"data":"Step 4380 | Training Loss: 0.0000\r\n"}
475
+ ,{"stream_name":"stdout","time":21180.50922574,"data":"Step 4390 | Training Loss: 0.1303\r\n"}
476
+ ,{"stream_name":"stdout","time":21229.468677627,"data":"Step 4400 | Training Loss: 0.0000\r\n"}
477
+ ,{"stream_name":"stdout","time":21278.185678317,"data":"Step 4410 | Training Loss: 0.0038\r\n"}
478
+ ,{"stream_name":"stdout","time":21326.692733082,"data":"Step 4420 | Training Loss: 0.0044\r\n"}
479
+ ,{"stream_name":"stdout","time":21375.246292812,"data":"Step 4430 | Training Loss: 0.0003\r\n"}
480
+ ,{"stream_name":"stdout","time":21423.953765953,"data":"Step 4440 | Training Loss: 0.0000\r\n"}
481
+ ,{"stream_name":"stdout","time":21472.550855928,"data":"Step 4450 | Training Loss: 0.0006\r\n"}
482
+ ,{"stream_name":"stdout","time":21521.000525125,"data":"Step 4460 | Training Loss: 0.0000\r\n"}
483
+ ,{"stream_name":"stdout","time":21569.606979192,"data":"Step 4470 | Training Loss: 0.0084\r\n"}
484
+ ,{"stream_name":"stdout","time":21618.315299901,"data":"Step 4480 | Training Loss: 0.0001\r\n"}
485
+ ,{"stream_name":"stdout","time":21666.952651797,"data":"Step 4490 | Training Loss: 0.0307\r\n"}
486
+ ,{"stream_name":"stdout","time":21715.658911061,"data":"Step 4500 | Training Loss: 0.0001\r\n"}
487
+ ,{"stream_name":"stdout","time":21764.462529606,"data":"Step 4510 | Training Loss: 0.0135\r\n"}
488
+ ,{"stream_name":"stdout","time":21813.268043595,"data":"Step 4520 | Training Loss: 0.0000\r\n"}
489
+ ,{"stream_name":"stdout","time":21861.91300263,"data":"Step 4530 | Training Loss: 0.0000\r\n"}
490
+ ,{"stream_name":"stdout","time":21910.358940365,"data":"Step 4540 | Training Loss: 0.0001\r\n"}
491
+ ,{"stream_name":"stdout","time":21959.32275268,"data":"Step 4550 | Training Loss: 0.0099\r\n"}
492
+ ,{"stream_name":"stdout","time":22008.177645857,"data":"Step 4560 | Training Loss: 0.0000\r\n"}
493
+ ,{"stream_name":"stdout","time":22057.189592942,"data":"Step 4570 | Training Loss: 0.0401\r\n"}
494
+ ,{"stream_name":"stdout","time":22105.891542051,"data":"Step 4580 | Training Loss: 0.0000\r\n"}
495
+ ,{"stream_name":"stdout","time":22154.792730208,"data":"Step 4590 | Training Loss: 0.0001\r\n"}
496
+ ,{"stream_name":"stdout","time":22203.847658097,"data":"Step 4600 | Training Loss: 0.0000\r\n"}
497
+ ,{"stream_name":"stdout","time":22252.846417982,"data":"Step 4610 | Training Loss: 0.0331\r\n"}
498
+ ,{"stream_name":"stdout","time":22301.957604286,"data":"Step 4620 | Training Loss: 0.0000\r\n"}
499
+ ,{"stream_name":"stdout","time":22350.860246187,"data":"Step 4630 | Training Loss: 0.0002\r\n"}
500
+ ,{"stream_name":"stdout","time":22400.127660066,"data":"Step 4640 | Training Loss: 0.0000\r\n"}
501
+ ,{"stream_name":"stdout","time":22449.012001906,"data":"Step 4650 | Training Loss: 0.0001\r\n"}
502
+ ,{"stream_name":"stdout","time":22498.25958275,"data":"Step 4660 | Training Loss: 0.0000\r\n"}
503
+ ,{"stream_name":"stdout","time":22546.854144365,"data":"Step 4670 | Training Loss: 0.7391\r\n"}
504
+ ,{"stream_name":"stdout","time":22595.393059861,"data":"Step 4680 | Training Loss: 0.0000\r\n"}
505
+ ,{"stream_name":"stdout","time":22644.051023849,"data":"Step 4690 | Training Loss: 0.0001\r\n"}
506
+ ,{"stream_name":"stdout","time":22692.755234438,"data":"Step 4700 | Training Loss: 0.0000\r\n"}
507
+ ,{"stream_name":"stdout","time":22741.002387208,"data":"Step 4710 | Training Loss: 0.2675\r\n"}
508
+ ,{"stream_name":"stdout","time":22789.703559045,"data":"Step 4720 | Training Loss: 0.0000\r\n"}
509
+ ,{"stream_name":"stdout","time":22838.215702733,"data":"Step 4730 | Training Loss: 0.0001\r\n"}
510
+ ,{"stream_name":"stdout","time":22886.819238464,"data":"Step 4740 | Training Loss: 0.0000\r\n"}
511
+ ,{"stream_name":"stdout","time":22935.308395738,"data":"Step 4750 | Training Loss: 0.1630\r\n"}
512
+ ,{"stream_name":"stdout","time":22984.006914862,"data":"Step 4760 | Training Loss: 0.0000\r\n"}
513
+ ,{"stream_name":"stdout","time":23032.758845248,"data":"Step 4770 | Training Loss: 0.0497\r\n"}
514
+ ,{"stream_name":"stdout","time":23081.257674891,"data":"Step 4780 | Training Loss: 0.0000\r\n"}
515
+ ,{"stream_name":"stdout","time":23129.759012584,"data":"Step 4790 | Training Loss: 0.0221\r\n"}
516
+ ,{"stream_name":"stdout","time":23178.274305482,"data":"Step 4800 | Training Loss: 0.0000\r\n"}
517
+ ,{"stream_name":"stdout","time":23226.836600616,"data":"Step 4810 | Training Loss: 0.0692\r\n"}
518
+ ,{"stream_name":"stdout","time":23275.239922901,"data":"Step 4820 | Training Loss: 0.0000\r\n"}
519
+ ,{"stream_name":"stdout","time":23323.787158774,"data":"Step 4830 | Training Loss: 0.0002\r\n"}
520
+ ,{"stream_name":"stdout","time":23372.339226258,"data":"Step 4840 | Training Loss: 0.0000\r\n"}
521
+ ,{"stream_name":"stdout","time":23420.799108206,"data":"Step 4850 | Training Loss: 0.0034\r\n"}
522
+ ,{"stream_name":"stdout","time":23469.234119704,"data":"Step 4860 | Training Loss: 0.0000\r\n"}
523
+ ,{"stream_name":"stdout","time":23517.737093164,"data":"Step 4870 | Training Loss: 0.1528\r\n"}
524
+ ,{"stream_name":"stdout","time":23566.332646771,"data":"Step 4880 | Training Loss: 0.0000\r\n"}
525
+ ,{"stream_name":"stdout","time":23614.829191387,"data":"Step 4890 | Training Loss: 0.0390\r\n"}
526
+ ,{"stream_name":"stdout","time":23665.269573262,"data":"Step 4900 | Training Loss: 0.0000\r\n"}
527
+ ,{"stream_name":"stdout","time":23714.086731517,"data":"Step 4910 | Training Loss: 0.6370\r\n"}
528
+ ,{"stream_name":"stdout","time":23762.834485533,"data":"Step 4920 | Training Loss: 0.0000\r\n"}
529
+ ,{"stream_name":"stdout","time":23811.444254035,"data":"Step 4930 | Training Loss: 0.0010\r\n"}
530
+ ,{"stream_name":"stdout","time":23860.101406718,"data":"Step 4940 | Training Loss: 0.0000\r\n"}
531
+ ,{"stream_name":"stdout","time":23908.594611574,"data":"Step 4950 | Training Loss: 0.8518\r\n"}
532
+ ,{"stream_name":"stdout","time":23957.158840517,"data":"Step 4960 | Training Loss: 0.0000\r\n"}
533
+ ,{"stream_name":"stdout","time":24005.355271545,"data":"Step 4970 | Training Loss: 0.1041\r\n"}
534
+ ,{"stream_name":"stdout","time":24053.912362779,"data":"Step 4980 | Training Loss: 0.0002\r\n"}
535
+ ,{"stream_name":"stdout","time":24102.613278632,"data":"Step 4990 | Training Loss: 0.0001\r\n"}
536
+ ,{"stream_name":"stdout","time":24151.104612433,"data":"Step 5000 | Training Loss: 0.0000\r\n"}
537
+ ,{"stream_name":"stderr","time":24177.244250522,"data":"/usr/local/lib/python3.12/dist-packages/mistune.py:435: SyntaxWarning: invalid escape sequence '\\|'\n"}
538
+ ,{"stream_name":"stderr","time":24177.244291509,"data":" cells[i][c] = re.sub('\\\\\\\\\\|', '|', cell)\n"}
539
+ ,{"stream_name":"stderr","time":24177.548979868,"data":"/usr/local/lib/python3.12/dist-packages/nbconvert/filters/filter_links.py:36: SyntaxWarning: invalid escape sequence '\\_'\n"}
540
+ ,{"stream_name":"stderr","time":24177.549016548,"data":" text = re.sub(r'_', '\\_', text) # Escape underscores in display text\n"}
541
+ ,{"stream_name":"stderr","time":24178.904491192,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to notebook\n"}
542
+ ,{"stream_name":"stderr","time":24179.249568123,"data":"[NbConvertApp] Writing 37815 bytes to __notebook__.ipynb\n"}
543
+ ,{"stream_name":"stderr","time":24181.514947994,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to html\n"}
544
+ ,{"stream_name":"stderr","time":24182.626672929,"data":"[NbConvertApp] Writing 318940 bytes to __results__.html\n"}
545
+ ]
djalokd/hexa1b/__results__.html ADDED
The diff for this file is too large to render. See raw diff
 
djalokd/hexa1b/custom.css ADDED
File without changes
djalokd/hexa1b/hexa_1b_final.nef ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e974bc8dd55a23f499990308ef0ee5727dfe374094665a123d6b75af884d677
3
+ size 2430849707
djalokd/hexa1b/model-step-1000.nef ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f3b3cf875c8931847d81e7a7302e691a016ebdd09d0d2517978782793364205
3
+ size 2430850107
djalokd/hexa1b/model-step-2000.nef ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cc87d62ebd38d7dd7ce2c519cd0c0e03a119269d09438605734558292173b84
3
+ size 2430850107
djalokd/hexa1b/model-step-3000.nef ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c954f9a3b947f566843d2b60360fffb5244f1822f0eb275359d7b040e55c8c9
3
+ size 2430850107
djalokd/hexa1b/model-step-4000.nef ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9955a409f1a3a28a3077700cfb09e44f0f650e5c14be03b96a3b98d466066101
3
+ size 2430850107
djalokd/hexa1b/model-step-5000.nef ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bb7398cbec2d55539dd25624c12b63de8dcb90a080905708ef64990d1b902e1
3
+ size 2430850107