Upload 11 files
Browse files- .gitattributes +2 -0
- tb/20251213-0337/wandb/debug-internal.log +154 -0
- tb/20251213-0337/wandb/debug.log +26 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/config.yaml +248 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/output.log +3 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/requirements.txt +99 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-metadata.json +95 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-summary.json +1 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-core.log +16 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log +154 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log +26 -0
- tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/run--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337.wandb +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/output.log filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/run--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337.wandb filter=lfs diff=lfs merge=lfs -text
|
tb/20251213-0337/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-13T03:37:36.23132477+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-13T03:37:37.42793721+08:00","level":"INFO","msg":"stream: created new stream","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 3 |
+
{"time":"2025-12-13T03:37:37.42811984+08:00","level":"INFO","msg":"handler: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 4 |
+
{"time":"2025-12-13T03:37:37.428536603+08:00","level":"INFO","msg":"stream: started","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 5 |
+
{"time":"2025-12-13T03:37:37.428580841+08:00","level":"INFO","msg":"sender: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 6 |
+
{"time":"2025-12-13T03:37:37.428589238+08:00","level":"INFO","msg":"writer: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 7 |
+
{"time":"2025-12-13T06:05:39.583033219+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 8 |
+
{"time":"2025-12-13T07:55:37.649211253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59798->35.186.228.49:443: read: connection reset by peer"}
|
| 9 |
+
{"time":"2025-12-13T08:13:29.786188657+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 10 |
+
{"time":"2025-12-13T08:16:47.670018036+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 11 |
+
{"time":"2025-12-13T08:59:24.082351789+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 12 |
+
{"time":"2025-12-13T09:12:13.694820381+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 13 |
+
{"time":"2025-12-13T09:33:57.835049052+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 14 |
+
{"time":"2025-12-13T09:43:28.441384157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:35074->35.186.228.49:443: read: connection reset by peer"}
|
| 15 |
+
{"time":"2025-12-13T09:55:59.558604972+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55632->35.186.228.49:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-12-13T10:04:15.521407235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 17 |
+
{"time":"2025-12-13T10:20:21.499848188+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 18 |
+
{"time":"2025-12-13T10:35:23.478678967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 19 |
+
{"time":"2025-12-13T10:51:06.294920404+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 20 |
+
{"time":"2025-12-13T13:14:41.311911939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 21 |
+
{"time":"2025-12-13T14:59:00.895291828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42956->35.186.228.49:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-12-13T15:28:48.295525438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 23 |
+
{"time":"2025-12-13T17:26:06.240844168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 24 |
+
{"time":"2025-12-13T17:33:13.927600494+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 25 |
+
{"time":"2025-12-13T17:38:26.684259493+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 26 |
+
{"time":"2025-12-13T17:59:17.135876142+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 27 |
+
{"time":"2025-12-13T18:21:53.605231099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49898->35.186.228.49:443: read: connection reset by peer"}
|
| 28 |
+
{"time":"2025-12-13T20:53:31.081839781+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 29 |
+
{"time":"2025-12-13T21:01:51.042167406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 30 |
+
{"time":"2025-12-13T21:16:58.046134274+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 31 |
+
{"time":"2025-12-13T21:19:47.88199423+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 32 |
+
{"time":"2025-12-13T21:50:37.427793114+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 33 |
+
{"time":"2025-12-13T22:04:36.210586263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57666->35.186.228.49:443: read: connection reset by peer"}
|
| 34 |
+
{"time":"2025-12-13T22:05:58.336461319+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46820->35.186.228.49:443: read: connection reset by peer"}
|
| 35 |
+
{"time":"2025-12-13T22:27:53.913619965+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:37800->35.186.228.49:443: read: connection reset by peer"}
|
| 36 |
+
{"time":"2025-12-13T23:09:40.869315801+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 37 |
+
{"time":"2025-12-13T23:10:21.955543682+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 38 |
+
{"time":"2025-12-13T23:16:09.301636231+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 39 |
+
{"time":"2025-12-14T00:23:39.770203057+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 40 |
+
{"time":"2025-12-14T00:24:17.145903336+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 41 |
+
{"time":"2025-12-14T02:56:23.178244085+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43334->35.186.228.49:443: read: connection reset by peer"}
|
| 42 |
+
{"time":"2025-12-14T03:18:51.662361676+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 43 |
+
{"time":"2025-12-14T03:19:23.322670147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 44 |
+
{"time":"2025-12-14T03:47:51.213153679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 45 |
+
{"time":"2025-12-14T04:20:52.250790079+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 46 |
+
{"time":"2025-12-14T04:21:11.202821697+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 47 |
+
{"time":"2025-12-14T07:06:08.967902177+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 48 |
+
{"time":"2025-12-14T07:06:54.195739279+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 49 |
+
{"time":"2025-12-14T07:13:30.365530787+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 50 |
+
{"time":"2025-12-14T07:26:06.7209204+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:48478->35.186.228.49:443: read: connection reset by peer"}
|
| 51 |
+
{"time":"2025-12-14T09:25:09.070791287+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 52 |
+
{"time":"2025-12-14T10:46:12.82568069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 53 |
+
{"time":"2025-12-14T10:48:43.885806313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 54 |
+
{"time":"2025-12-14T11:26:11.723626581+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 55 |
+
{"time":"2025-12-14T11:31:48.355445234+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 56 |
+
{"time":"2025-12-14T11:41:59.68850527+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 57 |
+
{"time":"2025-12-14T12:27:33.380879248+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 58 |
+
{"time":"2025-12-14T12:36:32.807927117+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57628->35.186.228.49:443: read: connection reset by peer"}
|
| 59 |
+
{"time":"2025-12-14T12:47:10.357920901+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 60 |
+
{"time":"2025-12-14T12:47:47.340023823+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 61 |
+
{"time":"2025-12-14T13:06:22.738392825+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:54890->35.186.228.49:443: read: connection reset by peer"}
|
| 62 |
+
{"time":"2025-12-14T13:06:40.383011533+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49102->35.186.228.49:443: read: connection reset by peer"}
|
| 63 |
+
{"time":"2025-12-14T13:07:04.782062642+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 64 |
+
{"time":"2025-12-14T13:14:14.561018246+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 65 |
+
{"time":"2025-12-14T14:48:10.912184589+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 66 |
+
{"time":"2025-12-14T14:54:11.088938602+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 67 |
+
{"time":"2025-12-14T14:58:12.755764947+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 68 |
+
{"time":"2025-12-14T15:46:48.897060165+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57112->35.186.228.49:443: read: connection reset by peer"}
|
| 69 |
+
{"time":"2025-12-14T16:16:18.043599985+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 70 |
+
{"time":"2025-12-14T16:25:00.963481603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41088->35.186.228.49:443: read: connection reset by peer"}
|
| 71 |
+
{"time":"2025-12-14T17:14:49.422743755+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 72 |
+
{"time":"2025-12-14T17:21:40.725676457+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 73 |
+
{"time":"2025-12-14T18:22:33.919294777+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55446->35.186.228.49:443: read: connection reset by peer"}
|
| 74 |
+
{"time":"2025-12-14T18:24:49.798972384+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 75 |
+
{"time":"2025-12-14T18:25:31.346842356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 76 |
+
{"time":"2025-12-14T20:49:52.211146515+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 77 |
+
{"time":"2025-12-14T21:04:32.293978006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57160->35.186.228.49:443: read: connection reset by peer"}
|
| 78 |
+
{"time":"2025-12-14T21:10:28.561725795+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 79 |
+
{"time":"2025-12-14T21:55:52.598096464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 80 |
+
{"time":"2025-12-14T21:56:23.294629143+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 81 |
+
{"time":"2025-12-14T22:54:03.471360435+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 82 |
+
{"time":"2025-12-14T23:54:12.865835537+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 83 |
+
{"time":"2025-12-15T00:03:53.643020298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": context deadline exceeded"}
|
| 84 |
+
{"time":"2025-12-15T00:04:50.877883568+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 85 |
+
{"time":"2025-12-15T00:08:44.968783043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": http2: client conn is closed"}
|
| 86 |
+
{"time":"2025-12-15T00:29:18.768015366+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 87 |
+
{"time":"2025-12-15T00:38:18.876310425+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 88 |
+
{"time":"2025-12-15T00:38:47.676509586+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 89 |
+
{"time":"2025-12-15T02:19:16.46063906+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 90 |
+
{"time":"2025-12-15T02:28:23.925958967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 91 |
+
{"time":"2025-12-15T02:28:41.801869127+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 92 |
+
{"time":"2025-12-15T02:40:36.54023762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55584->35.186.228.49:443: read: connection reset by peer"}
|
| 93 |
+
{"time":"2025-12-15T02:41:12.850172357+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 94 |
+
{"time":"2025-12-15T02:48:24.98882141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 95 |
+
{"time":"2025-12-15T02:54:09.717068313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 96 |
+
{"time":"2025-12-15T10:55:04.891023931+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43212->35.186.228.49:443: read: connection reset by peer"}
|
| 97 |
+
{"time":"2025-12-15T11:29:29.21607297+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 98 |
+
{"time":"2025-12-15T12:32:24.098647438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57632->35.186.228.49:443: read: connection reset by peer"}
|
| 99 |
+
{"time":"2025-12-15T12:41:37.076316554+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42078->35.186.228.49:443: read: connection reset by peer"}
|
| 100 |
+
{"time":"2025-12-15T12:47:58.065901858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41220->35.186.228.49:443: read: connection reset by peer"}
|
| 101 |
+
{"time":"2025-12-15T12:50:55.201423813+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 102 |
+
{"time":"2025-12-15T12:55:29.869715501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 103 |
+
{"time":"2025-12-15T13:03:43.614622176+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43786->35.186.228.49:443: read: connection reset by peer"}
|
| 104 |
+
{"time":"2025-12-15T13:08:32.946667579+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 105 |
+
{"time":"2025-12-15T14:27:50.584368281+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 106 |
+
{"time":"2025-12-15T14:37:01.10117924+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50868->35.186.228.49:443: read: connection reset by peer"}
|
| 107 |
+
{"time":"2025-12-15T14:48:40.332940473+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46498->35.186.228.49:443: read: connection reset by peer"}
|
| 108 |
+
{"time":"2025-12-15T14:57:41.943269175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:47484->35.186.228.49:443: read: connection reset by peer"}
|
| 109 |
+
{"time":"2025-12-15T15:00:55.7967066+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 110 |
+
{"time":"2025-12-15T15:01:22.024522099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 111 |
+
{"time":"2025-12-15T15:08:02.464314661+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 112 |
+
{"time":"2025-12-15T15:17:47.835880263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:38190->35.186.228.49:443: read: connection reset by peer"}
|
| 113 |
+
{"time":"2025-12-15T18:11:47.925566439+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 114 |
+
{"time":"2025-12-15T20:09:04.562971386+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 115 |
+
{"time":"2025-12-15T20:34:41.644855631+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:44234->35.186.228.49:443: read: connection reset by peer"}
|
| 116 |
+
{"time":"2025-12-15T22:01:07.926213466+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 117 |
+
{"time":"2025-12-15T22:03:00.514128232+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 118 |
+
{"time":"2025-12-15T23:06:03.297559161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 119 |
+
{"time":"2025-12-16T12:24:09.162841753+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:34774->35.186.228.49:443: read: connection reset by peer"}
|
| 120 |
+
{"time":"2025-12-16T12:24:35.986195599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 121 |
+
{"time":"2025-12-16T12:33:31.359618103+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43794->35.186.228.49:443: read: connection reset by peer"}
|
| 122 |
+
{"time":"2025-12-16T15:38:33.839518601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41104->35.186.228.49:443: read: connection reset by peer"}
|
| 123 |
+
{"time":"2025-12-16T15:47:21.247612566+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 124 |
+
{"time":"2025-12-16T15:57:38.882102506+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 125 |
+
{"time":"2025-12-16T15:59:29.910928408+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 126 |
+
{"time":"2025-12-16T16:09:32.602645496+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:52806->35.186.228.49:443: read: connection reset by peer"}
|
| 127 |
+
{"time":"2025-12-16T16:32:07.534727269+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 128 |
+
{"time":"2025-12-16T17:31:52.152175035+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46382->35.186.228.49:443: read: connection reset by peer"}
|
| 129 |
+
{"time":"2025-12-17T07:31:40.798122999+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 130 |
+
{"time":"2025-12-17T07:58:44.050237073+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 131 |
+
{"time":"2025-12-17T09:00:16.094926589+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 132 |
+
{"time":"2025-12-17T17:26:32.547706698+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42782->35.186.228.49:443: read: connection reset by peer"}
|
| 133 |
+
{"time":"2025-12-17T19:56:26.256819603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58656->35.186.228.49:443: read: connection reset by peer"}
|
| 134 |
+
{"time":"2025-12-17T20:10:25.794344881+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58790->35.186.228.49:443: read: connection reset by peer"}
|
| 135 |
+
{"time":"2025-12-17T21:56:52.633835476+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 136 |
+
{"time":"2025-12-17T22:42:02.308668945+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 137 |
+
{"time":"2025-12-17T23:09:48.436361959+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50494->35.186.228.49:443: read: connection reset by peer"}
|
| 138 |
+
{"time":"2025-12-17T23:14:23.558114444+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 139 |
+
{"time":"2025-12-17T23:27:17.340085956+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 140 |
+
{"time":"2025-12-18T10:58:47.675459253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55386->35.186.228.49:443: read: connection reset by peer"}
|
| 141 |
+
{"time":"2025-12-18T16:25:51.98607741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 142 |
+
{"time":"2025-12-18T16:28:49.478972424+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 143 |
+
{"time":"2025-12-18T16:29:58.352287741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:33476->35.186.228.49:443: read: connection reset by peer"}
|
| 144 |
+
{"time":"2025-12-18T16:33:06.3310878+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59986->35.186.228.49:443: read: connection reset by peer"}
|
| 145 |
+
{"time":"2025-12-18T16:33:39.806302479+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 146 |
+
{"time":"2025-12-18T16:54:27.647048623+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 147 |
+
{"time":"2025-12-18T17:54:07.658597189+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 148 |
+
{"time":"2025-12-18T19:31:48.116846134+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 149 |
+
{"time":"2025-12-18T20:27:17.321941239+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 150 |
+
{"time":"2025-12-18T20:27:20.824274187+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
|
| 151 |
+
{"time":"2025-12-18T20:27:20.83649299+08:00","level":"INFO","msg":"stream: closing","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 152 |
+
{"time":"2025-12-18T20:27:20.83651967+08:00","level":"INFO","msg":"handler: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 153 |
+
{"time":"2025-12-18T20:27:20.836703664+08:00","level":"INFO","msg":"sender: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 154 |
+
{"time":"2025-12-18T20:27:20.836740475+08:00","level":"INFO","msg":"stream: closed","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
tb/20251213-0337/wandb/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Configure stats pid to 50239
|
| 3 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/autodl-tmp/flame/wandb/settings
|
| 5 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():714] Logging user logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log
|
| 7 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log
|
| 8 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'job': defaultdict(None, {'config_file': 'flame/models/fla.toml', 'dump_folder': 'exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine', 'description': 'default job', 'use_for_integration_test': False, 'print_args': True}), 'model': defaultdict(None, {'name': 'fla', 'config': 'configs/hamilton_340M.json', 'tokenizer_path': 'fla-hub/transformer-1.3B-100B', 'converters': [], 'print_after_conversion': False}), 'profiling': defaultdict(None, {'enable_profiling': True, 'save_traces_folder': 'profile_trace', 'profile_freq': 512, 'enable_memory_snapshot': False, 'save_memory_snapshot_folder': 'memory_snapshot'}), 'optimizer': defaultdict(None, {'name': 'AdamW', 'eps': 1e-15, 'lr': 0.001, 'beta1': 0.9, 'beta2': 0.95, 'weight_decay': 0.1, 'implementation': 'fused', 'early_step_in_backward': False}), 'lr_scheduler': defaultdict(None, {'warmup_steps': 1024, 'decay_ratio': None, 'decay_type': 'cosine', 'lr_min': 0.1}), 'training': defaultdict(None, {'batch_size': 1, 'seq_len': 11000, 'context_len': 4096, 'varlen': True, 'gradient_accumulation_steps': 6, 'steps': 81920, 'max_norm': 1.0, 'skip_nan_inf': True, 'dataset': '/autodl-fs', 'dataset_name': 'default', 'dataset_split': 'train', 'data_dir': None, 'data_files': None, 'data_probs': None, 'streaming': False, 'num_workers': 64, 'prefetch_factor': 2, 'data_parallel_replicate_degree': 1, 'data_parallel_shard_degree': -1, 'enable_cpu_offload': False, 'tensor_parallel_degree': 1, 'disable_loss_parallel': False, 'fsdp_reshard_after_forward': 'default', 'mixed_precision_param': 'bfloat16', 'mixed_precision_reduce': 'float32', 'compile': True, 'gc_freq': 50, 'seed': 42, 'deterministic': False, 'pin_memory': False, 'persistent_workers': False}), 'metrics': defaultdict(None, {'log_freq': 1, 'enable_tensorboard': False, 'disable_color_printing': False, 'save_tb_folder': 'tb', 'save_for_all_ranks': False, 'enable_wandb': True}), 'experimental': defaultdict(None, {'enable_async_tensor_parallel': False, 'pipeline_parallel_degree': 1, 'pipeline_parallel_split_points': [], 'pipeline_parallel_schedule': '1F1B', 'pipeline_parallel_schedule_csv': '', 'pipeline_parallel_microbatches': None, 'enable_compiled_autograd': False, 'context_parallel_degree': 1, 'context_parallel_rotate_method': 'allgather', 'custom_model_path': ''}), 'checkpoint': defaultdict(None, {'enable_checkpoint': True, 'folder': 'checkpoint', 'initial_load_path': None, 'initial_load_model_weights_only': True, 'interval': 2048, 'last_save_model_weights_only': False, 'export_dtype': 'float32', 'create_seed_checkpoint': False, 'async_mode': 'disabled', 'keep_latest_k': 2, 'load_step': -1, 'exclude_from_loading': [], 'interval_type': 'steps', 'model_weights_only': False}), 'activation_checkpoint': defaultdict(None, {'mode': 'none', 'selective_ac_option': '2'}), 'activation_offload': defaultdict(None, {'mode': 'none'}), 'float8': defaultdict(None, {'enable_fsdp_float8_all_gather': False, 'precompute_float8_dynamic_scale_for_fsdp': False, 'force_recompute_fp8_weight_in_bwd': False, 'recipe_name': None}), 'comm': defaultdict(None, {'init_timeout_seconds': 300, 'train_timeout_seconds': 100, 'trace_buf_size': 20000}), 'memory_estimation': defaultdict(None, {'enabled': False, 'disable_fake_mode': False}), 'fault_tolerance': defaultdict(None, {'enable': False, 'replica_id': 0, 'group_size': 0, 'min_replica_size': 1}), '_wandb': {}}
|
| 11 |
+
2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-13 03:37:36,219 INFO MainThread:50239 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-13 03:37:36,226 INFO MainThread:50239 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-13 03:37:36,227 INFO MainThread:50239 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-13 03:37:36,234 INFO MainThread:50239 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-13 03:37:38,639 INFO MainThread:50239 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-13 03:37:38,692 INFO MainThread:50239 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-18 20:26:58,151 INFO MainThread:50239 [wandb_run.py:_finish():2287] finishing run wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337
|
| 23 |
+
2025-12-18 20:26:58,154 INFO MainThread:50239 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
|
| 24 |
+
2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2468] restore
|
| 25 |
+
2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2474] restore done
|
| 26 |
+
2025-12-18 20:27:20,834 INFO MainThread:50239 [wandb_run.py:_footer_sync_info():3862] logging synced files
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/config.yaml
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.23.1
|
| 4 |
+
e:
|
| 5 |
+
i0cpzc2r8qmav38fgt0h4i6tgznjgbr7:
|
| 6 |
+
args:
|
| 7 |
+
- --job.config_file
|
| 8 |
+
- flame/models/fla.toml
|
| 9 |
+
- --job.dump_folder
|
| 10 |
+
- exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine
|
| 11 |
+
- --model.config
|
| 12 |
+
- configs/hamilton_340M.json
|
| 13 |
+
- --model.tokenizer_path
|
| 14 |
+
- fla-hub/transformer-1.3B-100B
|
| 15 |
+
- --optimizer.name
|
| 16 |
+
- AdamW
|
| 17 |
+
- --optimizer.eps
|
| 18 |
+
- "1e-15"
|
| 19 |
+
- --optimizer.lr
|
| 20 |
+
- "1e-3"
|
| 21 |
+
- --lr_scheduler.warmup_steps
|
| 22 |
+
- "1024"
|
| 23 |
+
- --lr_scheduler.lr_min
|
| 24 |
+
- "0.1"
|
| 25 |
+
- --lr_scheduler.decay_type
|
| 26 |
+
- cosine
|
| 27 |
+
- --training.batch_size
|
| 28 |
+
- "1"
|
| 29 |
+
- --training.seq_len
|
| 30 |
+
- "11000"
|
| 31 |
+
- --training.context_len
|
| 32 |
+
- "4096"
|
| 33 |
+
- --training.varlen
|
| 34 |
+
- --training.gradient_accumulation_steps
|
| 35 |
+
- "6"
|
| 36 |
+
- --training.steps
|
| 37 |
+
- "81920"
|
| 38 |
+
- --training.max_norm
|
| 39 |
+
- "1.0"
|
| 40 |
+
- --training.skip_nan_inf
|
| 41 |
+
- --training.dataset
|
| 42 |
+
- /autodl-fs
|
| 43 |
+
- --training.dataset_name
|
| 44 |
+
- default
|
| 45 |
+
- --training.dataset_split
|
| 46 |
+
- train
|
| 47 |
+
- --training.num_workers
|
| 48 |
+
- "64"
|
| 49 |
+
- --training.prefetch_factor
|
| 50 |
+
- "2"
|
| 51 |
+
- --training.seed
|
| 52 |
+
- "42"
|
| 53 |
+
- --checkpoint.interval
|
| 54 |
+
- "2048"
|
| 55 |
+
- --checkpoint.load_step
|
| 56 |
+
- "-1"
|
| 57 |
+
- --checkpoint.keep_latest_k
|
| 58 |
+
- "2"
|
| 59 |
+
- --metrics.log_freq
|
| 60 |
+
- "1"
|
| 61 |
+
- --training.compile
|
| 62 |
+
cpu_count: 104
|
| 63 |
+
cpu_count_logical: 208
|
| 64 |
+
cudaVersion: "13.0"
|
| 65 |
+
disk:
|
| 66 |
+
/:
|
| 67 |
+
total: "32212254720"
|
| 68 |
+
used: "22135824384"
|
| 69 |
+
email: wangzhenbin@stu.scu.edu.cn
|
| 70 |
+
executable: /root/miniconda3/envs/hami/bin/python3.11
|
| 71 |
+
git:
|
| 72 |
+
commit: e11e7be75b9e45e84dbecbe8f0efa27d6af7d101
|
| 73 |
+
remote: https://github.com/fla-org/flame.git
|
| 74 |
+
gpu: NVIDIA RTX PRO 6000 Blackwell Server Edition
|
| 75 |
+
gpu_count: 1
|
| 76 |
+
gpu_nvidia:
|
| 77 |
+
- architecture: Blackwell
|
| 78 |
+
cudaCores: 24064
|
| 79 |
+
memoryTotal: "102641958912"
|
| 80 |
+
name: NVIDIA RTX PRO 6000 Blackwell Server Edition
|
| 81 |
+
uuid: GPU-de95cbc9-4fb5-9a7b-0649-e302580dd50b
|
| 82 |
+
host: autodl-container-3b7944880d-07a10e2a
|
| 83 |
+
memory:
|
| 84 |
+
total: "1081796993024"
|
| 85 |
+
os: Linux-5.15.0-78-generic-x86_64-with-glibc2.35
|
| 86 |
+
program: -m flame.train
|
| 87 |
+
python: CPython 3.11.7
|
| 88 |
+
root: exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337
|
| 89 |
+
startedAt: "2025-12-12T19:37:35.978261Z"
|
| 90 |
+
writerId: i0cpzc2r8qmav38fgt0h4i6tgznjgbr7
|
| 91 |
+
m: []
|
| 92 |
+
python_version: 3.11.7
|
| 93 |
+
t:
|
| 94 |
+
"1":
|
| 95 |
+
- 1
|
| 96 |
+
- 11
|
| 97 |
+
- 49
|
| 98 |
+
- 51
|
| 99 |
+
"2":
|
| 100 |
+
- 1
|
| 101 |
+
- 11
|
| 102 |
+
- 49
|
| 103 |
+
- 51
|
| 104 |
+
"3":
|
| 105 |
+
- 2
|
| 106 |
+
- 13
|
| 107 |
+
- 14
|
| 108 |
+
- 16
|
| 109 |
+
- 61
|
| 110 |
+
"4": 3.11.7
|
| 111 |
+
"5": 0.23.1
|
| 112 |
+
"6": 4.57.3
|
| 113 |
+
"12": 0.23.1
|
| 114 |
+
"13": linux-x86_64
|
| 115 |
+
activation_checkpoint:
|
| 116 |
+
value:
|
| 117 |
+
mode: none
|
| 118 |
+
selective_ac_option: "2"
|
| 119 |
+
activation_offload:
|
| 120 |
+
value:
|
| 121 |
+
mode: none
|
| 122 |
+
checkpoint:
|
| 123 |
+
value:
|
| 124 |
+
async_mode: disabled
|
| 125 |
+
create_seed_checkpoint: false
|
| 126 |
+
enable_checkpoint: true
|
| 127 |
+
exclude_from_loading: []
|
| 128 |
+
export_dtype: float32
|
| 129 |
+
folder: checkpoint
|
| 130 |
+
initial_load_model_weights_only: true
|
| 131 |
+
initial_load_path: null
|
| 132 |
+
interval: 2048
|
| 133 |
+
interval_type: steps
|
| 134 |
+
keep_latest_k: 2
|
| 135 |
+
last_save_model_weights_only: false
|
| 136 |
+
load_step: -1
|
| 137 |
+
model_weights_only: false
|
| 138 |
+
comm:
|
| 139 |
+
value:
|
| 140 |
+
init_timeout_seconds: 300
|
| 141 |
+
trace_buf_size: 20000
|
| 142 |
+
train_timeout_seconds: 100
|
| 143 |
+
experimental:
|
| 144 |
+
value:
|
| 145 |
+
context_parallel_degree: 1
|
| 146 |
+
context_parallel_rotate_method: allgather
|
| 147 |
+
custom_model_path: ""
|
| 148 |
+
enable_async_tensor_parallel: false
|
| 149 |
+
enable_compiled_autograd: false
|
| 150 |
+
pipeline_parallel_degree: 1
|
| 151 |
+
pipeline_parallel_microbatches: null
|
| 152 |
+
pipeline_parallel_schedule: 1F1B
|
| 153 |
+
pipeline_parallel_schedule_csv: ""
|
| 154 |
+
pipeline_parallel_split_points: []
|
| 155 |
+
fault_tolerance:
|
| 156 |
+
value:
|
| 157 |
+
enable: false
|
| 158 |
+
group_size: 0
|
| 159 |
+
min_replica_size: 1
|
| 160 |
+
replica_id: 0
|
| 161 |
+
float8:
|
| 162 |
+
value:
|
| 163 |
+
enable_fsdp_float8_all_gather: false
|
| 164 |
+
force_recompute_fp8_weight_in_bwd: false
|
| 165 |
+
precompute_float8_dynamic_scale_for_fsdp: false
|
| 166 |
+
recipe_name: null
|
| 167 |
+
job:
|
| 168 |
+
value:
|
| 169 |
+
config_file: flame/models/fla.toml
|
| 170 |
+
description: default job
|
| 171 |
+
dump_folder: exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine
|
| 172 |
+
print_args: true
|
| 173 |
+
use_for_integration_test: false
|
| 174 |
+
lr_scheduler:
|
| 175 |
+
value:
|
| 176 |
+
decay_ratio: null
|
| 177 |
+
decay_type: cosine
|
| 178 |
+
lr_min: 0.1
|
| 179 |
+
warmup_steps: 1024
|
| 180 |
+
memory_estimation:
|
| 181 |
+
value:
|
| 182 |
+
disable_fake_mode: false
|
| 183 |
+
enabled: false
|
| 184 |
+
metrics:
|
| 185 |
+
value:
|
| 186 |
+
disable_color_printing: false
|
| 187 |
+
enable_tensorboard: false
|
| 188 |
+
enable_wandb: true
|
| 189 |
+
log_freq: 1
|
| 190 |
+
save_for_all_ranks: false
|
| 191 |
+
save_tb_folder: tb
|
| 192 |
+
model:
|
| 193 |
+
value:
|
| 194 |
+
config: configs/hamilton_340M.json
|
| 195 |
+
converters: []
|
| 196 |
+
name: fla
|
| 197 |
+
print_after_conversion: false
|
| 198 |
+
tokenizer_path: fla-hub/transformer-1.3B-100B
|
| 199 |
+
optimizer:
|
| 200 |
+
value:
|
| 201 |
+
beta1: 0.9
|
| 202 |
+
beta2: 0.95
|
| 203 |
+
early_step_in_backward: false
|
| 204 |
+
eps: 1e-15
|
| 205 |
+
implementation: fused
|
| 206 |
+
lr: 0.001
|
| 207 |
+
name: AdamW
|
| 208 |
+
weight_decay: 0.1
|
| 209 |
+
profiling:
|
| 210 |
+
value:
|
| 211 |
+
enable_memory_snapshot: false
|
| 212 |
+
enable_profiling: true
|
| 213 |
+
profile_freq: 512
|
| 214 |
+
save_memory_snapshot_folder: memory_snapshot
|
| 215 |
+
save_traces_folder: profile_trace
|
| 216 |
+
training:
|
| 217 |
+
value:
|
| 218 |
+
batch_size: 1
|
| 219 |
+
compile: true
|
| 220 |
+
context_len: 4096
|
| 221 |
+
data_dir: null
|
| 222 |
+
data_files: null
|
| 223 |
+
data_parallel_replicate_degree: 1
|
| 224 |
+
data_parallel_shard_degree: -1
|
| 225 |
+
data_probs: null
|
| 226 |
+
dataset: /autodl-fs
|
| 227 |
+
dataset_name: default
|
| 228 |
+
dataset_split: train
|
| 229 |
+
deterministic: false
|
| 230 |
+
disable_loss_parallel: false
|
| 231 |
+
enable_cpu_offload: false
|
| 232 |
+
fsdp_reshard_after_forward: default
|
| 233 |
+
gc_freq: 50
|
| 234 |
+
gradient_accumulation_steps: 6
|
| 235 |
+
max_norm: 1
|
| 236 |
+
mixed_precision_param: bfloat16
|
| 237 |
+
mixed_precision_reduce: float32
|
| 238 |
+
num_workers: 64
|
| 239 |
+
persistent_workers: false
|
| 240 |
+
pin_memory: false
|
| 241 |
+
prefetch_factor: 2
|
| 242 |
+
seed: 42
|
| 243 |
+
seq_len: 11000
|
| 244 |
+
skip_nan_inf: true
|
| 245 |
+
steps: 81920
|
| 246 |
+
streaming: false
|
| 247 |
+
tensor_parallel_degree: 1
|
| 248 |
+
varlen: true
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/output.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3237921edeadd36f7707bf546a21cbf7e273134dff308f78c6063f96ef8fd8f8
|
| 3 |
+
size 38074835
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/requirements.txt
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flame==0.1.0
|
| 2 |
+
absl-py==2.3.1
|
| 3 |
+
aiohappyeyeballs==2.6.1
|
| 4 |
+
aiohttp==3.13.2
|
| 5 |
+
aiosignal==1.4.0
|
| 6 |
+
annotated-types==0.7.0
|
| 7 |
+
anyio==4.12.0
|
| 8 |
+
attrs==25.4.0
|
| 9 |
+
blobfile==3.1.0
|
| 10 |
+
certifi==2025.11.12
|
| 11 |
+
charset-normalizer==3.4.4
|
| 12 |
+
click==8.3.1
|
| 13 |
+
datasets==4.4.1
|
| 14 |
+
dill==0.4.0
|
| 15 |
+
docstring_parser==0.17.0
|
| 16 |
+
einops==0.8.1
|
| 17 |
+
filelock==3.20.0
|
| 18 |
+
flame==0.1.0
|
| 19 |
+
flash-linear-attention==0.4.1
|
| 20 |
+
frozenlist==1.8.0
|
| 21 |
+
fsspec==2025.10.0
|
| 22 |
+
gitdb==4.0.12
|
| 23 |
+
GitPython==3.1.45
|
| 24 |
+
grpcio==1.76.0
|
| 25 |
+
h11==0.16.0
|
| 26 |
+
hf-xet==1.2.0
|
| 27 |
+
httpcore==1.0.9
|
| 28 |
+
httpx==0.28.1
|
| 29 |
+
huggingface-hub==0.36.0
|
| 30 |
+
idna==3.11
|
| 31 |
+
Jinja2==3.1.6
|
| 32 |
+
lxml==6.0.2
|
| 33 |
+
Markdown==3.10
|
| 34 |
+
MarkupSafe==3.0.3
|
| 35 |
+
mpmath==1.3.0
|
| 36 |
+
multidict==6.7.0
|
| 37 |
+
multiprocess==0.70.18
|
| 38 |
+
networkx==3.6.1
|
| 39 |
+
ninja==1.13.0
|
| 40 |
+
numpy==2.3.5
|
| 41 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 42 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 43 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 44 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 45 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 46 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 47 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 48 |
+
nvidia-curand-cu12==10.3.9.90
|
| 49 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 50 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 51 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 52 |
+
nvidia-nccl-cu12==2.27.5
|
| 53 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 54 |
+
nvidia-nvshmem-cu12==3.3.20
|
| 55 |
+
nvidia-nvtx-cu12==12.8.90
|
| 56 |
+
packaging==25.0
|
| 57 |
+
pandas==2.3.3
|
| 58 |
+
pillow==12.0.0
|
| 59 |
+
pip==25.3
|
| 60 |
+
platformdirs==4.5.1
|
| 61 |
+
propcache==0.4.1
|
| 62 |
+
protobuf==6.33.2
|
| 63 |
+
pyarrow==22.0.0
|
| 64 |
+
pycryptodomex==3.23.0
|
| 65 |
+
pydantic==2.12.5
|
| 66 |
+
pydantic_core==2.41.5
|
| 67 |
+
python-dateutil==2.9.0.post0
|
| 68 |
+
pytz==2025.2
|
| 69 |
+
PyYAML==6.0.3
|
| 70 |
+
regex==2025.11.3
|
| 71 |
+
requests==2.32.5
|
| 72 |
+
safetensors==0.7.0
|
| 73 |
+
sentry-sdk==2.47.0
|
| 74 |
+
setuptools==80.9.0
|
| 75 |
+
six==1.17.0
|
| 76 |
+
smmap==5.0.2
|
| 77 |
+
sympy==1.14.0
|
| 78 |
+
tensorboard==2.20.0
|
| 79 |
+
tensorboard-data-server==0.7.2
|
| 80 |
+
tiktoken==0.12.0
|
| 81 |
+
tokenizers==0.22.1
|
| 82 |
+
tomli==2.3.0
|
| 83 |
+
torch==2.9.1
|
| 84 |
+
torchdata==0.11.0
|
| 85 |
+
torchtitan==0.1.0
|
| 86 |
+
tqdm==4.67.1
|
| 87 |
+
transformers==4.57.3
|
| 88 |
+
triton==3.5.1
|
| 89 |
+
typeguard==4.4.4
|
| 90 |
+
typing_extensions==4.15.0
|
| 91 |
+
typing-inspection==0.4.2
|
| 92 |
+
tyro==1.0.1
|
| 93 |
+
tzdata==2025.2
|
| 94 |
+
urllib3==2.6.2
|
| 95 |
+
wandb==0.23.1
|
| 96 |
+
Werkzeug==3.1.4
|
| 97 |
+
wheel==0.45.1
|
| 98 |
+
xxhash==3.6.0
|
| 99 |
+
yarl==1.22.0
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-78-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.7",
|
| 4 |
+
"startedAt": "2025-12-12T19:37:35.978261Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--job.config_file",
|
| 7 |
+
"flame/models/fla.toml",
|
| 8 |
+
"--job.dump_folder",
|
| 9 |
+
"exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine",
|
| 10 |
+
"--model.config",
|
| 11 |
+
"configs/hamilton_340M.json",
|
| 12 |
+
"--model.tokenizer_path",
|
| 13 |
+
"fla-hub/transformer-1.3B-100B",
|
| 14 |
+
"--optimizer.name",
|
| 15 |
+
"AdamW",
|
| 16 |
+
"--optimizer.eps",
|
| 17 |
+
"1e-15",
|
| 18 |
+
"--optimizer.lr",
|
| 19 |
+
"1e-3",
|
| 20 |
+
"--lr_scheduler.warmup_steps",
|
| 21 |
+
"1024",
|
| 22 |
+
"--lr_scheduler.lr_min",
|
| 23 |
+
"0.1",
|
| 24 |
+
"--lr_scheduler.decay_type",
|
| 25 |
+
"cosine",
|
| 26 |
+
"--training.batch_size",
|
| 27 |
+
"1",
|
| 28 |
+
"--training.seq_len",
|
| 29 |
+
"11000",
|
| 30 |
+
"--training.context_len",
|
| 31 |
+
"4096",
|
| 32 |
+
"--training.varlen",
|
| 33 |
+
"--training.gradient_accumulation_steps",
|
| 34 |
+
"6",
|
| 35 |
+
"--training.steps",
|
| 36 |
+
"81920",
|
| 37 |
+
"--training.max_norm",
|
| 38 |
+
"1.0",
|
| 39 |
+
"--training.skip_nan_inf",
|
| 40 |
+
"--training.dataset",
|
| 41 |
+
"/autodl-fs",
|
| 42 |
+
"--training.dataset_name",
|
| 43 |
+
"default",
|
| 44 |
+
"--training.dataset_split",
|
| 45 |
+
"train",
|
| 46 |
+
"--training.num_workers",
|
| 47 |
+
"64",
|
| 48 |
+
"--training.prefetch_factor",
|
| 49 |
+
"2",
|
| 50 |
+
"--training.seed",
|
| 51 |
+
"42",
|
| 52 |
+
"--checkpoint.interval",
|
| 53 |
+
"2048",
|
| 54 |
+
"--checkpoint.load_step",
|
| 55 |
+
"-1",
|
| 56 |
+
"--checkpoint.keep_latest_k",
|
| 57 |
+
"2",
|
| 58 |
+
"--metrics.log_freq",
|
| 59 |
+
"1",
|
| 60 |
+
"--training.compile"
|
| 61 |
+
],
|
| 62 |
+
"program": "-m flame.train",
|
| 63 |
+
"git": {
|
| 64 |
+
"remote": "https://github.com/fla-org/flame.git",
|
| 65 |
+
"commit": "e11e7be75b9e45e84dbecbe8f0efa27d6af7d101"
|
| 66 |
+
},
|
| 67 |
+
"email": "wangzhenbin@stu.scu.edu.cn",
|
| 68 |
+
"root": "exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337",
|
| 69 |
+
"host": "autodl-container-3b7944880d-07a10e2a",
|
| 70 |
+
"executable": "/root/miniconda3/envs/hami/bin/python3.11",
|
| 71 |
+
"cpu_count": 104,
|
| 72 |
+
"cpu_count_logical": 208,
|
| 73 |
+
"gpu": "NVIDIA RTX PRO 6000 Blackwell Server Edition",
|
| 74 |
+
"gpu_count": 1,
|
| 75 |
+
"disk": {
|
| 76 |
+
"/": {
|
| 77 |
+
"total": "32212254720",
|
| 78 |
+
"used": "22135824384"
|
| 79 |
+
}
|
| 80 |
+
},
|
| 81 |
+
"memory": {
|
| 82 |
+
"total": "1081796993024"
|
| 83 |
+
},
|
| 84 |
+
"gpu_nvidia": [
|
| 85 |
+
{
|
| 86 |
+
"name": "NVIDIA RTX PRO 6000 Blackwell Server Edition",
|
| 87 |
+
"memoryTotal": "102641958912",
|
| 88 |
+
"cudaCores": 24064,
|
| 89 |
+
"architecture": "Blackwell",
|
| 90 |
+
"uuid": "GPU-de95cbc9-4fb5-9a7b-0649-e302580dd50b"
|
| 91 |
+
}
|
| 92 |
+
],
|
| 93 |
+
"cudaVersion": "13.0",
|
| 94 |
+
"writerId": "i0cpzc2r8qmav38fgt0h4i6tgznjgbr7"
|
| 95 |
+
}
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"time_metrics/data_loading(%)":0.5803219584911921,"loss_metrics/global_max_loss":2.9381303787231445,"optimizer/skipped_step":0,"mfu(%)":11.607521852909128,"memory/num_alloc_retries":0,"tflops":36.215468181076474,"throughput(tps)":10864.281324639476,"optimizer/lr":0.0001,"time_metrics/data_loading(s)":0.005875714511300127,"_wandb":{"runtime":492559},"memory/max_reserved(%)":93.53824322735011,"memory/max_active(GiB)":88.26861763000488,"_timestamp":1.7660608023964982e+09,"_runtime":492559.517721682,"time_metrics/end_to_end(s)":6.074953144881874,"memory/max_reserved(GiB)":88.833984375,"optimizer/grad_norm":0.8941753506660461,"memory/max_active(%)":92.94293713500184,"memory/num_ooms":0,"loss_metrics/global_avg_loss":2.9381303787231445,"_step":81920}
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-core.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-13T03:37:36.129026081+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpqdibn3bf/port-50239.txt","pid":50239,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-12-13T03:37:36.130196984+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":50239}
|
| 3 |
+
{"time":"2025-12-13T03:37:36.130017672+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-50239-50346-3756975333/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-12-13T03:37:36.218871979+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-12-13T03:37:36.231072223+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
|
| 6 |
+
{"time":"2025-12-13T03:37:37.428562008+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
|
| 7 |
+
{"time":"2025-12-18T20:27:20.836424535+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
|
| 8 |
+
{"time":"2025-12-18T20:27:20.841392619+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
|
| 9 |
+
{"time":"2025-12-18T20:27:28.809165523+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 10 |
+
{"time":"2025-12-18T20:27:28.809238229+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 11 |
+
{"time":"2025-12-18T20:27:28.809255391+08:00","level":"INFO","msg":"server is shutting down"}
|
| 12 |
+
{"time":"2025-12-18T20:27:28.809339818+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 13 |
+
{"time":"2025-12-18T20:27:28.809382419+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-50239-50346-3756975333/socket","Net":"unix"}}
|
| 14 |
+
{"time":"2025-12-18T20:27:28.809467482+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 15 |
+
{"time":"2025-12-18T20:27:28.809478249+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 16 |
+
{"time":"2025-12-18T20:27:28.809495971+08:00","level":"INFO","msg":"server is closed"}
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-13T03:37:36.23132477+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-13T03:37:37.42793721+08:00","level":"INFO","msg":"stream: created new stream","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 3 |
+
{"time":"2025-12-13T03:37:37.42811984+08:00","level":"INFO","msg":"handler: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 4 |
+
{"time":"2025-12-13T03:37:37.428536603+08:00","level":"INFO","msg":"stream: started","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 5 |
+
{"time":"2025-12-13T03:37:37.428580841+08:00","level":"INFO","msg":"sender: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 6 |
+
{"time":"2025-12-13T03:37:37.428589238+08:00","level":"INFO","msg":"writer: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 7 |
+
{"time":"2025-12-13T06:05:39.583033219+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 8 |
+
{"time":"2025-12-13T07:55:37.649211253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59798->35.186.228.49:443: read: connection reset by peer"}
|
| 9 |
+
{"time":"2025-12-13T08:13:29.786188657+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 10 |
+
{"time":"2025-12-13T08:16:47.670018036+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 11 |
+
{"time":"2025-12-13T08:59:24.082351789+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 12 |
+
{"time":"2025-12-13T09:12:13.694820381+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 13 |
+
{"time":"2025-12-13T09:33:57.835049052+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 14 |
+
{"time":"2025-12-13T09:43:28.441384157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:35074->35.186.228.49:443: read: connection reset by peer"}
|
| 15 |
+
{"time":"2025-12-13T09:55:59.558604972+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55632->35.186.228.49:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-12-13T10:04:15.521407235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 17 |
+
{"time":"2025-12-13T10:20:21.499848188+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 18 |
+
{"time":"2025-12-13T10:35:23.478678967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 19 |
+
{"time":"2025-12-13T10:51:06.294920404+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 20 |
+
{"time":"2025-12-13T13:14:41.311911939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 21 |
+
{"time":"2025-12-13T14:59:00.895291828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42956->35.186.228.49:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-12-13T15:28:48.295525438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 23 |
+
{"time":"2025-12-13T17:26:06.240844168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 24 |
+
{"time":"2025-12-13T17:33:13.927600494+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 25 |
+
{"time":"2025-12-13T17:38:26.684259493+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 26 |
+
{"time":"2025-12-13T17:59:17.135876142+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 27 |
+
{"time":"2025-12-13T18:21:53.605231099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49898->35.186.228.49:443: read: connection reset by peer"}
|
| 28 |
+
{"time":"2025-12-13T20:53:31.081839781+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 29 |
+
{"time":"2025-12-13T21:01:51.042167406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 30 |
+
{"time":"2025-12-13T21:16:58.046134274+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 31 |
+
{"time":"2025-12-13T21:19:47.88199423+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 32 |
+
{"time":"2025-12-13T21:50:37.427793114+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 33 |
+
{"time":"2025-12-13T22:04:36.210586263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57666->35.186.228.49:443: read: connection reset by peer"}
|
| 34 |
+
{"time":"2025-12-13T22:05:58.336461319+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46820->35.186.228.49:443: read: connection reset by peer"}
|
| 35 |
+
{"time":"2025-12-13T22:27:53.913619965+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:37800->35.186.228.49:443: read: connection reset by peer"}
|
| 36 |
+
{"time":"2025-12-13T23:09:40.869315801+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 37 |
+
{"time":"2025-12-13T23:10:21.955543682+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 38 |
+
{"time":"2025-12-13T23:16:09.301636231+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 39 |
+
{"time":"2025-12-14T00:23:39.770203057+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 40 |
+
{"time":"2025-12-14T00:24:17.145903336+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 41 |
+
{"time":"2025-12-14T02:56:23.178244085+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43334->35.186.228.49:443: read: connection reset by peer"}
|
| 42 |
+
{"time":"2025-12-14T03:18:51.662361676+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 43 |
+
{"time":"2025-12-14T03:19:23.322670147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 44 |
+
{"time":"2025-12-14T03:47:51.213153679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 45 |
+
{"time":"2025-12-14T04:20:52.250790079+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 46 |
+
{"time":"2025-12-14T04:21:11.202821697+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 47 |
+
{"time":"2025-12-14T07:06:08.967902177+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 48 |
+
{"time":"2025-12-14T07:06:54.195739279+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 49 |
+
{"time":"2025-12-14T07:13:30.365530787+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 50 |
+
{"time":"2025-12-14T07:26:06.7209204+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:48478->35.186.228.49:443: read: connection reset by peer"}
|
| 51 |
+
{"time":"2025-12-14T09:25:09.070791287+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 52 |
+
{"time":"2025-12-14T10:46:12.82568069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 53 |
+
{"time":"2025-12-14T10:48:43.885806313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 54 |
+
{"time":"2025-12-14T11:26:11.723626581+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 55 |
+
{"time":"2025-12-14T11:31:48.355445234+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 56 |
+
{"time":"2025-12-14T11:41:59.68850527+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 57 |
+
{"time":"2025-12-14T12:27:33.380879248+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 58 |
+
{"time":"2025-12-14T12:36:32.807927117+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57628->35.186.228.49:443: read: connection reset by peer"}
|
| 59 |
+
{"time":"2025-12-14T12:47:10.357920901+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 60 |
+
{"time":"2025-12-14T12:47:47.340023823+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 61 |
+
{"time":"2025-12-14T13:06:22.738392825+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:54890->35.186.228.49:443: read: connection reset by peer"}
|
| 62 |
+
{"time":"2025-12-14T13:06:40.383011533+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49102->35.186.228.49:443: read: connection reset by peer"}
|
| 63 |
+
{"time":"2025-12-14T13:07:04.782062642+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 64 |
+
{"time":"2025-12-14T13:14:14.561018246+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 65 |
+
{"time":"2025-12-14T14:48:10.912184589+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 66 |
+
{"time":"2025-12-14T14:54:11.088938602+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 67 |
+
{"time":"2025-12-14T14:58:12.755764947+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 68 |
+
{"time":"2025-12-14T15:46:48.897060165+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57112->35.186.228.49:443: read: connection reset by peer"}
|
| 69 |
+
{"time":"2025-12-14T16:16:18.043599985+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 70 |
+
{"time":"2025-12-14T16:25:00.963481603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41088->35.186.228.49:443: read: connection reset by peer"}
|
| 71 |
+
{"time":"2025-12-14T17:14:49.422743755+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 72 |
+
{"time":"2025-12-14T17:21:40.725676457+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 73 |
+
{"time":"2025-12-14T18:22:33.919294777+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55446->35.186.228.49:443: read: connection reset by peer"}
|
| 74 |
+
{"time":"2025-12-14T18:24:49.798972384+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 75 |
+
{"time":"2025-12-14T18:25:31.346842356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 76 |
+
{"time":"2025-12-14T20:49:52.211146515+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 77 |
+
{"time":"2025-12-14T21:04:32.293978006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57160->35.186.228.49:443: read: connection reset by peer"}
|
| 78 |
+
{"time":"2025-12-14T21:10:28.561725795+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 79 |
+
{"time":"2025-12-14T21:55:52.598096464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 80 |
+
{"time":"2025-12-14T21:56:23.294629143+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 81 |
+
{"time":"2025-12-14T22:54:03.471360435+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 82 |
+
{"time":"2025-12-14T23:54:12.865835537+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 83 |
+
{"time":"2025-12-15T00:03:53.643020298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": context deadline exceeded"}
|
| 84 |
+
{"time":"2025-12-15T00:04:50.877883568+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 85 |
+
{"time":"2025-12-15T00:08:44.968783043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": http2: client conn is closed"}
|
| 86 |
+
{"time":"2025-12-15T00:29:18.768015366+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 87 |
+
{"time":"2025-12-15T00:38:18.876310425+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 88 |
+
{"time":"2025-12-15T00:38:47.676509586+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 89 |
+
{"time":"2025-12-15T02:19:16.46063906+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 90 |
+
{"time":"2025-12-15T02:28:23.925958967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 91 |
+
{"time":"2025-12-15T02:28:41.801869127+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 92 |
+
{"time":"2025-12-15T02:40:36.54023762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55584->35.186.228.49:443: read: connection reset by peer"}
|
| 93 |
+
{"time":"2025-12-15T02:41:12.850172357+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 94 |
+
{"time":"2025-12-15T02:48:24.98882141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 95 |
+
{"time":"2025-12-15T02:54:09.717068313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 96 |
+
{"time":"2025-12-15T10:55:04.891023931+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43212->35.186.228.49:443: read: connection reset by peer"}
|
| 97 |
+
{"time":"2025-12-15T11:29:29.21607297+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 98 |
+
{"time":"2025-12-15T12:32:24.098647438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57632->35.186.228.49:443: read: connection reset by peer"}
|
| 99 |
+
{"time":"2025-12-15T12:41:37.076316554+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42078->35.186.228.49:443: read: connection reset by peer"}
|
| 100 |
+
{"time":"2025-12-15T12:47:58.065901858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41220->35.186.228.49:443: read: connection reset by peer"}
|
| 101 |
+
{"time":"2025-12-15T12:50:55.201423813+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 102 |
+
{"time":"2025-12-15T12:55:29.869715501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 103 |
+
{"time":"2025-12-15T13:03:43.614622176+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43786->35.186.228.49:443: read: connection reset by peer"}
|
| 104 |
+
{"time":"2025-12-15T13:08:32.946667579+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 105 |
+
{"time":"2025-12-15T14:27:50.584368281+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 106 |
+
{"time":"2025-12-15T14:37:01.10117924+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50868->35.186.228.49:443: read: connection reset by peer"}
|
| 107 |
+
{"time":"2025-12-15T14:48:40.332940473+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46498->35.186.228.49:443: read: connection reset by peer"}
|
| 108 |
+
{"time":"2025-12-15T14:57:41.943269175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:47484->35.186.228.49:443: read: connection reset by peer"}
|
| 109 |
+
{"time":"2025-12-15T15:00:55.7967066+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 110 |
+
{"time":"2025-12-15T15:01:22.024522099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 111 |
+
{"time":"2025-12-15T15:08:02.464314661+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 112 |
+
{"time":"2025-12-15T15:17:47.835880263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:38190->35.186.228.49:443: read: connection reset by peer"}
|
| 113 |
+
{"time":"2025-12-15T18:11:47.925566439+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 114 |
+
{"time":"2025-12-15T20:09:04.562971386+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 115 |
+
{"time":"2025-12-15T20:34:41.644855631+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:44234->35.186.228.49:443: read: connection reset by peer"}
|
| 116 |
+
{"time":"2025-12-15T22:01:07.926213466+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 117 |
+
{"time":"2025-12-15T22:03:00.514128232+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 118 |
+
{"time":"2025-12-15T23:06:03.297559161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 119 |
+
{"time":"2025-12-16T12:24:09.162841753+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:34774->35.186.228.49:443: read: connection reset by peer"}
|
| 120 |
+
{"time":"2025-12-16T12:24:35.986195599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 121 |
+
{"time":"2025-12-16T12:33:31.359618103+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43794->35.186.228.49:443: read: connection reset by peer"}
|
| 122 |
+
{"time":"2025-12-16T15:38:33.839518601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41104->35.186.228.49:443: read: connection reset by peer"}
|
| 123 |
+
{"time":"2025-12-16T15:47:21.247612566+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 124 |
+
{"time":"2025-12-16T15:57:38.882102506+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 125 |
+
{"time":"2025-12-16T15:59:29.910928408+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 126 |
+
{"time":"2025-12-16T16:09:32.602645496+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:52806->35.186.228.49:443: read: connection reset by peer"}
|
| 127 |
+
{"time":"2025-12-16T16:32:07.534727269+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 128 |
+
{"time":"2025-12-16T17:31:52.152175035+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46382->35.186.228.49:443: read: connection reset by peer"}
|
| 129 |
+
{"time":"2025-12-17T07:31:40.798122999+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 130 |
+
{"time":"2025-12-17T07:58:44.050237073+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 131 |
+
{"time":"2025-12-17T09:00:16.094926589+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 132 |
+
{"time":"2025-12-17T17:26:32.547706698+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42782->35.186.228.49:443: read: connection reset by peer"}
|
| 133 |
+
{"time":"2025-12-17T19:56:26.256819603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58656->35.186.228.49:443: read: connection reset by peer"}
|
| 134 |
+
{"time":"2025-12-17T20:10:25.794344881+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58790->35.186.228.49:443: read: connection reset by peer"}
|
| 135 |
+
{"time":"2025-12-17T21:56:52.633835476+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 136 |
+
{"time":"2025-12-17T22:42:02.308668945+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 137 |
+
{"time":"2025-12-17T23:09:48.436361959+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50494->35.186.228.49:443: read: connection reset by peer"}
|
| 138 |
+
{"time":"2025-12-17T23:14:23.558114444+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 139 |
+
{"time":"2025-12-17T23:27:17.340085956+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 140 |
+
{"time":"2025-12-18T10:58:47.675459253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55386->35.186.228.49:443: read: connection reset by peer"}
|
| 141 |
+
{"time":"2025-12-18T16:25:51.98607741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 142 |
+
{"time":"2025-12-18T16:28:49.478972424+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 143 |
+
{"time":"2025-12-18T16:29:58.352287741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:33476->35.186.228.49:443: read: connection reset by peer"}
|
| 144 |
+
{"time":"2025-12-18T16:33:06.3310878+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59986->35.186.228.49:443: read: connection reset by peer"}
|
| 145 |
+
{"time":"2025-12-18T16:33:39.806302479+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 146 |
+
{"time":"2025-12-18T16:54:27.647048623+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 147 |
+
{"time":"2025-12-18T17:54:07.658597189+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 148 |
+
{"time":"2025-12-18T19:31:48.116846134+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
|
| 149 |
+
{"time":"2025-12-18T20:27:17.321941239+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 150 |
+
{"time":"2025-12-18T20:27:20.824274187+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
|
| 151 |
+
{"time":"2025-12-18T20:27:20.83649299+08:00","level":"INFO","msg":"stream: closing","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 152 |
+
{"time":"2025-12-18T20:27:20.83651967+08:00","level":"INFO","msg":"handler: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 153 |
+
{"time":"2025-12-18T20:27:20.836703664+08:00","level":"INFO","msg":"sender: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
| 154 |
+
{"time":"2025-12-18T20:27:20.836740475+08:00","level":"INFO","msg":"stream: closed","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Configure stats pid to 50239
|
| 3 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/autodl-tmp/flame/wandb/settings
|
| 5 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():714] Logging user logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log
|
| 7 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log
|
| 8 |
+
2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'job': defaultdict(None, {'config_file': 'flame/models/fla.toml', 'dump_folder': 'exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine', 'description': 'default job', 'use_for_integration_test': False, 'print_args': True}), 'model': defaultdict(None, {'name': 'fla', 'config': 'configs/hamilton_340M.json', 'tokenizer_path': 'fla-hub/transformer-1.3B-100B', 'converters': [], 'print_after_conversion': False}), 'profiling': defaultdict(None, {'enable_profiling': True, 'save_traces_folder': 'profile_trace', 'profile_freq': 512, 'enable_memory_snapshot': False, 'save_memory_snapshot_folder': 'memory_snapshot'}), 'optimizer': defaultdict(None, {'name': 'AdamW', 'eps': 1e-15, 'lr': 0.001, 'beta1': 0.9, 'beta2': 0.95, 'weight_decay': 0.1, 'implementation': 'fused', 'early_step_in_backward': False}), 'lr_scheduler': defaultdict(None, {'warmup_steps': 1024, 'decay_ratio': None, 'decay_type': 'cosine', 'lr_min': 0.1}), 'training': defaultdict(None, {'batch_size': 1, 'seq_len': 11000, 'context_len': 4096, 'varlen': True, 'gradient_accumulation_steps': 6, 'steps': 81920, 'max_norm': 1.0, 'skip_nan_inf': True, 'dataset': '/autodl-fs', 'dataset_name': 'default', 'dataset_split': 'train', 'data_dir': None, 'data_files': None, 'data_probs': None, 'streaming': False, 'num_workers': 64, 'prefetch_factor': 2, 'data_parallel_replicate_degree': 1, 'data_parallel_shard_degree': -1, 'enable_cpu_offload': False, 'tensor_parallel_degree': 1, 'disable_loss_parallel': False, 'fsdp_reshard_after_forward': 'default', 'mixed_precision_param': 'bfloat16', 'mixed_precision_reduce': 'float32', 'compile': True, 'gc_freq': 50, 'seed': 42, 'deterministic': False, 'pin_memory': False, 'persistent_workers': False}), 'metrics': defaultdict(None, {'log_freq': 1, 'enable_tensorboard': False, 'disable_color_printing': False, 'save_tb_folder': 'tb', 'save_for_all_ranks': False, 'enable_wandb': True}), 'experimental': defaultdict(None, {'enable_async_tensor_parallel': False, 'pipeline_parallel_degree': 1, 'pipeline_parallel_split_points': [], 'pipeline_parallel_schedule': '1F1B', 'pipeline_parallel_schedule_csv': '', 'pipeline_parallel_microbatches': None, 'enable_compiled_autograd': False, 'context_parallel_degree': 1, 'context_parallel_rotate_method': 'allgather', 'custom_model_path': ''}), 'checkpoint': defaultdict(None, {'enable_checkpoint': True, 'folder': 'checkpoint', 'initial_load_path': None, 'initial_load_model_weights_only': True, 'interval': 2048, 'last_save_model_weights_only': False, 'export_dtype': 'float32', 'create_seed_checkpoint': False, 'async_mode': 'disabled', 'keep_latest_k': 2, 'load_step': -1, 'exclude_from_loading': [], 'interval_type': 'steps', 'model_weights_only': False}), 'activation_checkpoint': defaultdict(None, {'mode': 'none', 'selective_ac_option': '2'}), 'activation_offload': defaultdict(None, {'mode': 'none'}), 'float8': defaultdict(None, {'enable_fsdp_float8_all_gather': False, 'precompute_float8_dynamic_scale_for_fsdp': False, 'force_recompute_fp8_weight_in_bwd': False, 'recipe_name': None}), 'comm': defaultdict(None, {'init_timeout_seconds': 300, 'train_timeout_seconds': 100, 'trace_buf_size': 20000}), 'memory_estimation': defaultdict(None, {'enabled': False, 'disable_fake_mode': False}), 'fault_tolerance': defaultdict(None, {'enable': False, 'replica_id': 0, 'group_size': 0, 'min_replica_size': 1}), '_wandb': {}}
|
| 11 |
+
2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-13 03:37:36,219 INFO MainThread:50239 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-13 03:37:36,226 INFO MainThread:50239 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-13 03:37:36,227 INFO MainThread:50239 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-13 03:37:36,234 INFO MainThread:50239 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-13 03:37:38,639 INFO MainThread:50239 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-13 03:37:38,692 INFO MainThread:50239 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-18 20:26:58,151 INFO MainThread:50239 [wandb_run.py:_finish():2287] finishing run wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337
|
| 23 |
+
2025-12-18 20:26:58,154 INFO MainThread:50239 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
|
| 24 |
+
2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2468] restore
|
| 25 |
+
2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2474] restore done
|
| 26 |
+
2025-12-18 20:27:20,834 INFO MainThread:50239 [wandb_run.py:_footer_sync_info():3862] logging synced files
|
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/run--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74836b8ead1d0d7492e4b75607bb5cd88dcdf0b00ee7c0f719b9ac1df52dfcf1
|
| 3 |
+
size 356825749
|