ZhenbinWang commited on
Commit
d63bef2
·
verified ·
1 Parent(s): 19bead5

Upload 11 files

Browse files
Files changed (12) hide show
  1. .gitattributes +2 -0
  2. tb/20251213-0337/wandb/debug-internal.log +154 -0
  3. tb/20251213-0337/wandb/debug.log +26 -0
  4. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/config.yaml +248 -0
  5. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/output.log +3 -0
  6. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/requirements.txt +99 -0
  7. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-metadata.json +95 -0
  8. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-summary.json +1 -0
  9. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-core.log +16 -0
  10. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log +154 -0
  11. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log +26 -0
  12. tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/run--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/output.log filter=lfs diff=lfs merge=lfs -text
37
+ tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/run--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337.wandb filter=lfs diff=lfs merge=lfs -text
tb/20251213-0337/wandb/debug-internal.log ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-13T03:37:36.23132477+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-13T03:37:37.42793721+08:00","level":"INFO","msg":"stream: created new stream","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
3
+ {"time":"2025-12-13T03:37:37.42811984+08:00","level":"INFO","msg":"handler: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
4
+ {"time":"2025-12-13T03:37:37.428536603+08:00","level":"INFO","msg":"stream: started","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
5
+ {"time":"2025-12-13T03:37:37.428580841+08:00","level":"INFO","msg":"sender: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
6
+ {"time":"2025-12-13T03:37:37.428589238+08:00","level":"INFO","msg":"writer: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
7
+ {"time":"2025-12-13T06:05:39.583033219+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
8
+ {"time":"2025-12-13T07:55:37.649211253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59798->35.186.228.49:443: read: connection reset by peer"}
9
+ {"time":"2025-12-13T08:13:29.786188657+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
10
+ {"time":"2025-12-13T08:16:47.670018036+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
11
+ {"time":"2025-12-13T08:59:24.082351789+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
12
+ {"time":"2025-12-13T09:12:13.694820381+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
13
+ {"time":"2025-12-13T09:33:57.835049052+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
14
+ {"time":"2025-12-13T09:43:28.441384157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:35074->35.186.228.49:443: read: connection reset by peer"}
15
+ {"time":"2025-12-13T09:55:59.558604972+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55632->35.186.228.49:443: read: connection reset by peer"}
16
+ {"time":"2025-12-13T10:04:15.521407235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
17
+ {"time":"2025-12-13T10:20:21.499848188+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
18
+ {"time":"2025-12-13T10:35:23.478678967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
19
+ {"time":"2025-12-13T10:51:06.294920404+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
20
+ {"time":"2025-12-13T13:14:41.311911939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
21
+ {"time":"2025-12-13T14:59:00.895291828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42956->35.186.228.49:443: read: connection reset by peer"}
22
+ {"time":"2025-12-13T15:28:48.295525438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
23
+ {"time":"2025-12-13T17:26:06.240844168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
24
+ {"time":"2025-12-13T17:33:13.927600494+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
25
+ {"time":"2025-12-13T17:38:26.684259493+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
26
+ {"time":"2025-12-13T17:59:17.135876142+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
27
+ {"time":"2025-12-13T18:21:53.605231099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49898->35.186.228.49:443: read: connection reset by peer"}
28
+ {"time":"2025-12-13T20:53:31.081839781+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
29
+ {"time":"2025-12-13T21:01:51.042167406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
30
+ {"time":"2025-12-13T21:16:58.046134274+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
31
+ {"time":"2025-12-13T21:19:47.88199423+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
32
+ {"time":"2025-12-13T21:50:37.427793114+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
33
+ {"time":"2025-12-13T22:04:36.210586263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57666->35.186.228.49:443: read: connection reset by peer"}
34
+ {"time":"2025-12-13T22:05:58.336461319+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46820->35.186.228.49:443: read: connection reset by peer"}
35
+ {"time":"2025-12-13T22:27:53.913619965+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:37800->35.186.228.49:443: read: connection reset by peer"}
36
+ {"time":"2025-12-13T23:09:40.869315801+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
37
+ {"time":"2025-12-13T23:10:21.955543682+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
38
+ {"time":"2025-12-13T23:16:09.301636231+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
39
+ {"time":"2025-12-14T00:23:39.770203057+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
40
+ {"time":"2025-12-14T00:24:17.145903336+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
41
+ {"time":"2025-12-14T02:56:23.178244085+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43334->35.186.228.49:443: read: connection reset by peer"}
42
+ {"time":"2025-12-14T03:18:51.662361676+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
43
+ {"time":"2025-12-14T03:19:23.322670147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
44
+ {"time":"2025-12-14T03:47:51.213153679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
45
+ {"time":"2025-12-14T04:20:52.250790079+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
46
+ {"time":"2025-12-14T04:21:11.202821697+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
47
+ {"time":"2025-12-14T07:06:08.967902177+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
48
+ {"time":"2025-12-14T07:06:54.195739279+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
49
+ {"time":"2025-12-14T07:13:30.365530787+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
50
+ {"time":"2025-12-14T07:26:06.7209204+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:48478->35.186.228.49:443: read: connection reset by peer"}
51
+ {"time":"2025-12-14T09:25:09.070791287+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
52
+ {"time":"2025-12-14T10:46:12.82568069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
53
+ {"time":"2025-12-14T10:48:43.885806313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
54
+ {"time":"2025-12-14T11:26:11.723626581+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
55
+ {"time":"2025-12-14T11:31:48.355445234+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
56
+ {"time":"2025-12-14T11:41:59.68850527+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
57
+ {"time":"2025-12-14T12:27:33.380879248+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
58
+ {"time":"2025-12-14T12:36:32.807927117+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57628->35.186.228.49:443: read: connection reset by peer"}
59
+ {"time":"2025-12-14T12:47:10.357920901+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
60
+ {"time":"2025-12-14T12:47:47.340023823+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
61
+ {"time":"2025-12-14T13:06:22.738392825+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:54890->35.186.228.49:443: read: connection reset by peer"}
62
+ {"time":"2025-12-14T13:06:40.383011533+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49102->35.186.228.49:443: read: connection reset by peer"}
63
+ {"time":"2025-12-14T13:07:04.782062642+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
64
+ {"time":"2025-12-14T13:14:14.561018246+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
65
+ {"time":"2025-12-14T14:48:10.912184589+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
66
+ {"time":"2025-12-14T14:54:11.088938602+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
67
+ {"time":"2025-12-14T14:58:12.755764947+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
68
+ {"time":"2025-12-14T15:46:48.897060165+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57112->35.186.228.49:443: read: connection reset by peer"}
69
+ {"time":"2025-12-14T16:16:18.043599985+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
70
+ {"time":"2025-12-14T16:25:00.963481603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41088->35.186.228.49:443: read: connection reset by peer"}
71
+ {"time":"2025-12-14T17:14:49.422743755+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
72
+ {"time":"2025-12-14T17:21:40.725676457+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
73
+ {"time":"2025-12-14T18:22:33.919294777+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55446->35.186.228.49:443: read: connection reset by peer"}
74
+ {"time":"2025-12-14T18:24:49.798972384+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
75
+ {"time":"2025-12-14T18:25:31.346842356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
76
+ {"time":"2025-12-14T20:49:52.211146515+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
77
+ {"time":"2025-12-14T21:04:32.293978006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57160->35.186.228.49:443: read: connection reset by peer"}
78
+ {"time":"2025-12-14T21:10:28.561725795+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
79
+ {"time":"2025-12-14T21:55:52.598096464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
80
+ {"time":"2025-12-14T21:56:23.294629143+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
81
+ {"time":"2025-12-14T22:54:03.471360435+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
82
+ {"time":"2025-12-14T23:54:12.865835537+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
83
+ {"time":"2025-12-15T00:03:53.643020298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": context deadline exceeded"}
84
+ {"time":"2025-12-15T00:04:50.877883568+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
85
+ {"time":"2025-12-15T00:08:44.968783043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": http2: client conn is closed"}
86
+ {"time":"2025-12-15T00:29:18.768015366+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
87
+ {"time":"2025-12-15T00:38:18.876310425+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
88
+ {"time":"2025-12-15T00:38:47.676509586+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
89
+ {"time":"2025-12-15T02:19:16.46063906+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
90
+ {"time":"2025-12-15T02:28:23.925958967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
91
+ {"time":"2025-12-15T02:28:41.801869127+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
92
+ {"time":"2025-12-15T02:40:36.54023762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55584->35.186.228.49:443: read: connection reset by peer"}
93
+ {"time":"2025-12-15T02:41:12.850172357+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
94
+ {"time":"2025-12-15T02:48:24.98882141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
95
+ {"time":"2025-12-15T02:54:09.717068313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
96
+ {"time":"2025-12-15T10:55:04.891023931+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43212->35.186.228.49:443: read: connection reset by peer"}
97
+ {"time":"2025-12-15T11:29:29.21607297+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
98
+ {"time":"2025-12-15T12:32:24.098647438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57632->35.186.228.49:443: read: connection reset by peer"}
99
+ {"time":"2025-12-15T12:41:37.076316554+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42078->35.186.228.49:443: read: connection reset by peer"}
100
+ {"time":"2025-12-15T12:47:58.065901858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41220->35.186.228.49:443: read: connection reset by peer"}
101
+ {"time":"2025-12-15T12:50:55.201423813+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
102
+ {"time":"2025-12-15T12:55:29.869715501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
103
+ {"time":"2025-12-15T13:03:43.614622176+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43786->35.186.228.49:443: read: connection reset by peer"}
104
+ {"time":"2025-12-15T13:08:32.946667579+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
105
+ {"time":"2025-12-15T14:27:50.584368281+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
106
+ {"time":"2025-12-15T14:37:01.10117924+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50868->35.186.228.49:443: read: connection reset by peer"}
107
+ {"time":"2025-12-15T14:48:40.332940473+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46498->35.186.228.49:443: read: connection reset by peer"}
108
+ {"time":"2025-12-15T14:57:41.943269175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:47484->35.186.228.49:443: read: connection reset by peer"}
109
+ {"time":"2025-12-15T15:00:55.7967066+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
110
+ {"time":"2025-12-15T15:01:22.024522099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
111
+ {"time":"2025-12-15T15:08:02.464314661+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
112
+ {"time":"2025-12-15T15:17:47.835880263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:38190->35.186.228.49:443: read: connection reset by peer"}
113
+ {"time":"2025-12-15T18:11:47.925566439+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
114
+ {"time":"2025-12-15T20:09:04.562971386+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
115
+ {"time":"2025-12-15T20:34:41.644855631+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:44234->35.186.228.49:443: read: connection reset by peer"}
116
+ {"time":"2025-12-15T22:01:07.926213466+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
117
+ {"time":"2025-12-15T22:03:00.514128232+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
118
+ {"time":"2025-12-15T23:06:03.297559161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
119
+ {"time":"2025-12-16T12:24:09.162841753+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:34774->35.186.228.49:443: read: connection reset by peer"}
120
+ {"time":"2025-12-16T12:24:35.986195599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
121
+ {"time":"2025-12-16T12:33:31.359618103+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43794->35.186.228.49:443: read: connection reset by peer"}
122
+ {"time":"2025-12-16T15:38:33.839518601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41104->35.186.228.49:443: read: connection reset by peer"}
123
+ {"time":"2025-12-16T15:47:21.247612566+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
124
+ {"time":"2025-12-16T15:57:38.882102506+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
125
+ {"time":"2025-12-16T15:59:29.910928408+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
126
+ {"time":"2025-12-16T16:09:32.602645496+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:52806->35.186.228.49:443: read: connection reset by peer"}
127
+ {"time":"2025-12-16T16:32:07.534727269+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
128
+ {"time":"2025-12-16T17:31:52.152175035+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46382->35.186.228.49:443: read: connection reset by peer"}
129
+ {"time":"2025-12-17T07:31:40.798122999+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
130
+ {"time":"2025-12-17T07:58:44.050237073+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
131
+ {"time":"2025-12-17T09:00:16.094926589+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
132
+ {"time":"2025-12-17T17:26:32.547706698+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42782->35.186.228.49:443: read: connection reset by peer"}
133
+ {"time":"2025-12-17T19:56:26.256819603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58656->35.186.228.49:443: read: connection reset by peer"}
134
+ {"time":"2025-12-17T20:10:25.794344881+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58790->35.186.228.49:443: read: connection reset by peer"}
135
+ {"time":"2025-12-17T21:56:52.633835476+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
136
+ {"time":"2025-12-17T22:42:02.308668945+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
137
+ {"time":"2025-12-17T23:09:48.436361959+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50494->35.186.228.49:443: read: connection reset by peer"}
138
+ {"time":"2025-12-17T23:14:23.558114444+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
139
+ {"time":"2025-12-17T23:27:17.340085956+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
140
+ {"time":"2025-12-18T10:58:47.675459253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55386->35.186.228.49:443: read: connection reset by peer"}
141
+ {"time":"2025-12-18T16:25:51.98607741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
142
+ {"time":"2025-12-18T16:28:49.478972424+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
143
+ {"time":"2025-12-18T16:29:58.352287741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:33476->35.186.228.49:443: read: connection reset by peer"}
144
+ {"time":"2025-12-18T16:33:06.3310878+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59986->35.186.228.49:443: read: connection reset by peer"}
145
+ {"time":"2025-12-18T16:33:39.806302479+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
146
+ {"time":"2025-12-18T16:54:27.647048623+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
147
+ {"time":"2025-12-18T17:54:07.658597189+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
148
+ {"time":"2025-12-18T19:31:48.116846134+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
149
+ {"time":"2025-12-18T20:27:17.321941239+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
150
+ {"time":"2025-12-18T20:27:20.824274187+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
151
+ {"time":"2025-12-18T20:27:20.83649299+08:00","level":"INFO","msg":"stream: closing","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
152
+ {"time":"2025-12-18T20:27:20.83651967+08:00","level":"INFO","msg":"handler: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
153
+ {"time":"2025-12-18T20:27:20.836703664+08:00","level":"INFO","msg":"sender: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
154
+ {"time":"2025-12-18T20:27:20.836740475+08:00","level":"INFO","msg":"stream: closed","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
tb/20251213-0337/wandb/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Configure stats pid to 50239
3
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/autodl-tmp/flame/wandb/settings
5
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():714] Logging user logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log
7
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log
8
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'job': defaultdict(None, {'config_file': 'flame/models/fla.toml', 'dump_folder': 'exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine', 'description': 'default job', 'use_for_integration_test': False, 'print_args': True}), 'model': defaultdict(None, {'name': 'fla', 'config': 'configs/hamilton_340M.json', 'tokenizer_path': 'fla-hub/transformer-1.3B-100B', 'converters': [], 'print_after_conversion': False}), 'profiling': defaultdict(None, {'enable_profiling': True, 'save_traces_folder': 'profile_trace', 'profile_freq': 512, 'enable_memory_snapshot': False, 'save_memory_snapshot_folder': 'memory_snapshot'}), 'optimizer': defaultdict(None, {'name': 'AdamW', 'eps': 1e-15, 'lr': 0.001, 'beta1': 0.9, 'beta2': 0.95, 'weight_decay': 0.1, 'implementation': 'fused', 'early_step_in_backward': False}), 'lr_scheduler': defaultdict(None, {'warmup_steps': 1024, 'decay_ratio': None, 'decay_type': 'cosine', 'lr_min': 0.1}), 'training': defaultdict(None, {'batch_size': 1, 'seq_len': 11000, 'context_len': 4096, 'varlen': True, 'gradient_accumulation_steps': 6, 'steps': 81920, 'max_norm': 1.0, 'skip_nan_inf': True, 'dataset': '/autodl-fs', 'dataset_name': 'default', 'dataset_split': 'train', 'data_dir': None, 'data_files': None, 'data_probs': None, 'streaming': False, 'num_workers': 64, 'prefetch_factor': 2, 'data_parallel_replicate_degree': 1, 'data_parallel_shard_degree': -1, 'enable_cpu_offload': False, 'tensor_parallel_degree': 1, 'disable_loss_parallel': False, 'fsdp_reshard_after_forward': 'default', 'mixed_precision_param': 'bfloat16', 'mixed_precision_reduce': 'float32', 'compile': True, 'gc_freq': 50, 'seed': 42, 'deterministic': False, 'pin_memory': False, 'persistent_workers': False}), 'metrics': defaultdict(None, {'log_freq': 1, 'enable_tensorboard': False, 'disable_color_printing': False, 'save_tb_folder': 'tb', 'save_for_all_ranks': False, 'enable_wandb': True}), 'experimental': defaultdict(None, {'enable_async_tensor_parallel': False, 'pipeline_parallel_degree': 1, 'pipeline_parallel_split_points': [], 'pipeline_parallel_schedule': '1F1B', 'pipeline_parallel_schedule_csv': '', 'pipeline_parallel_microbatches': None, 'enable_compiled_autograd': False, 'context_parallel_degree': 1, 'context_parallel_rotate_method': 'allgather', 'custom_model_path': ''}), 'checkpoint': defaultdict(None, {'enable_checkpoint': True, 'folder': 'checkpoint', 'initial_load_path': None, 'initial_load_model_weights_only': True, 'interval': 2048, 'last_save_model_weights_only': False, 'export_dtype': 'float32', 'create_seed_checkpoint': False, 'async_mode': 'disabled', 'keep_latest_k': 2, 'load_step': -1, 'exclude_from_loading': [], 'interval_type': 'steps', 'model_weights_only': False}), 'activation_checkpoint': defaultdict(None, {'mode': 'none', 'selective_ac_option': '2'}), 'activation_offload': defaultdict(None, {'mode': 'none'}), 'float8': defaultdict(None, {'enable_fsdp_float8_all_gather': False, 'precompute_float8_dynamic_scale_for_fsdp': False, 'force_recompute_fp8_weight_in_bwd': False, 'recipe_name': None}), 'comm': defaultdict(None, {'init_timeout_seconds': 300, 'train_timeout_seconds': 100, 'trace_buf_size': 20000}), 'memory_estimation': defaultdict(None, {'enabled': False, 'disable_fake_mode': False}), 'fault_tolerance': defaultdict(None, {'enable': False, 'replica_id': 0, 'group_size': 0, 'min_replica_size': 1}), '_wandb': {}}
11
+ 2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():889] starting backend
12
+ 2025-12-13 03:37:36,219 INFO MainThread:50239 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-13 03:37:36,226 INFO MainThread:50239 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-13 03:37:36,227 INFO MainThread:50239 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-13 03:37:36,234 INFO MainThread:50239 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-13 03:37:38,639 INFO MainThread:50239 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-13 03:37:38,692 INFO MainThread:50239 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-18 20:26:58,151 INFO MainThread:50239 [wandb_run.py:_finish():2287] finishing run wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337
23
+ 2025-12-18 20:26:58,154 INFO MainThread:50239 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
24
+ 2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2468] restore
25
+ 2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2474] restore done
26
+ 2025-12-18 20:27:20,834 INFO MainThread:50239 [wandb_run.py:_footer_sync_info():3862] logging synced files
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/config.yaml ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.23.1
4
+ e:
5
+ i0cpzc2r8qmav38fgt0h4i6tgznjgbr7:
6
+ args:
7
+ - --job.config_file
8
+ - flame/models/fla.toml
9
+ - --job.dump_folder
10
+ - exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine
11
+ - --model.config
12
+ - configs/hamilton_340M.json
13
+ - --model.tokenizer_path
14
+ - fla-hub/transformer-1.3B-100B
15
+ - --optimizer.name
16
+ - AdamW
17
+ - --optimizer.eps
18
+ - "1e-15"
19
+ - --optimizer.lr
20
+ - "1e-3"
21
+ - --lr_scheduler.warmup_steps
22
+ - "1024"
23
+ - --lr_scheduler.lr_min
24
+ - "0.1"
25
+ - --lr_scheduler.decay_type
26
+ - cosine
27
+ - --training.batch_size
28
+ - "1"
29
+ - --training.seq_len
30
+ - "11000"
31
+ - --training.context_len
32
+ - "4096"
33
+ - --training.varlen
34
+ - --training.gradient_accumulation_steps
35
+ - "6"
36
+ - --training.steps
37
+ - "81920"
38
+ - --training.max_norm
39
+ - "1.0"
40
+ - --training.skip_nan_inf
41
+ - --training.dataset
42
+ - /autodl-fs
43
+ - --training.dataset_name
44
+ - default
45
+ - --training.dataset_split
46
+ - train
47
+ - --training.num_workers
48
+ - "64"
49
+ - --training.prefetch_factor
50
+ - "2"
51
+ - --training.seed
52
+ - "42"
53
+ - --checkpoint.interval
54
+ - "2048"
55
+ - --checkpoint.load_step
56
+ - "-1"
57
+ - --checkpoint.keep_latest_k
58
+ - "2"
59
+ - --metrics.log_freq
60
+ - "1"
61
+ - --training.compile
62
+ cpu_count: 104
63
+ cpu_count_logical: 208
64
+ cudaVersion: "13.0"
65
+ disk:
66
+ /:
67
+ total: "32212254720"
68
+ used: "22135824384"
69
+ email: wangzhenbin@stu.scu.edu.cn
70
+ executable: /root/miniconda3/envs/hami/bin/python3.11
71
+ git:
72
+ commit: e11e7be75b9e45e84dbecbe8f0efa27d6af7d101
73
+ remote: https://github.com/fla-org/flame.git
74
+ gpu: NVIDIA RTX PRO 6000 Blackwell Server Edition
75
+ gpu_count: 1
76
+ gpu_nvidia:
77
+ - architecture: Blackwell
78
+ cudaCores: 24064
79
+ memoryTotal: "102641958912"
80
+ name: NVIDIA RTX PRO 6000 Blackwell Server Edition
81
+ uuid: GPU-de95cbc9-4fb5-9a7b-0649-e302580dd50b
82
+ host: autodl-container-3b7944880d-07a10e2a
83
+ memory:
84
+ total: "1081796993024"
85
+ os: Linux-5.15.0-78-generic-x86_64-with-glibc2.35
86
+ program: -m flame.train
87
+ python: CPython 3.11.7
88
+ root: exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337
89
+ startedAt: "2025-12-12T19:37:35.978261Z"
90
+ writerId: i0cpzc2r8qmav38fgt0h4i6tgznjgbr7
91
+ m: []
92
+ python_version: 3.11.7
93
+ t:
94
+ "1":
95
+ - 1
96
+ - 11
97
+ - 49
98
+ - 51
99
+ "2":
100
+ - 1
101
+ - 11
102
+ - 49
103
+ - 51
104
+ "3":
105
+ - 2
106
+ - 13
107
+ - 14
108
+ - 16
109
+ - 61
110
+ "4": 3.11.7
111
+ "5": 0.23.1
112
+ "6": 4.57.3
113
+ "12": 0.23.1
114
+ "13": linux-x86_64
115
+ activation_checkpoint:
116
+ value:
117
+ mode: none
118
+ selective_ac_option: "2"
119
+ activation_offload:
120
+ value:
121
+ mode: none
122
+ checkpoint:
123
+ value:
124
+ async_mode: disabled
125
+ create_seed_checkpoint: false
126
+ enable_checkpoint: true
127
+ exclude_from_loading: []
128
+ export_dtype: float32
129
+ folder: checkpoint
130
+ initial_load_model_weights_only: true
131
+ initial_load_path: null
132
+ interval: 2048
133
+ interval_type: steps
134
+ keep_latest_k: 2
135
+ last_save_model_weights_only: false
136
+ load_step: -1
137
+ model_weights_only: false
138
+ comm:
139
+ value:
140
+ init_timeout_seconds: 300
141
+ trace_buf_size: 20000
142
+ train_timeout_seconds: 100
143
+ experimental:
144
+ value:
145
+ context_parallel_degree: 1
146
+ context_parallel_rotate_method: allgather
147
+ custom_model_path: ""
148
+ enable_async_tensor_parallel: false
149
+ enable_compiled_autograd: false
150
+ pipeline_parallel_degree: 1
151
+ pipeline_parallel_microbatches: null
152
+ pipeline_parallel_schedule: 1F1B
153
+ pipeline_parallel_schedule_csv: ""
154
+ pipeline_parallel_split_points: []
155
+ fault_tolerance:
156
+ value:
157
+ enable: false
158
+ group_size: 0
159
+ min_replica_size: 1
160
+ replica_id: 0
161
+ float8:
162
+ value:
163
+ enable_fsdp_float8_all_gather: false
164
+ force_recompute_fp8_weight_in_bwd: false
165
+ precompute_float8_dynamic_scale_for_fsdp: false
166
+ recipe_name: null
167
+ job:
168
+ value:
169
+ config_file: flame/models/fla.toml
170
+ description: default job
171
+ dump_folder: exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine
172
+ print_args: true
173
+ use_for_integration_test: false
174
+ lr_scheduler:
175
+ value:
176
+ decay_ratio: null
177
+ decay_type: cosine
178
+ lr_min: 0.1
179
+ warmup_steps: 1024
180
+ memory_estimation:
181
+ value:
182
+ disable_fake_mode: false
183
+ enabled: false
184
+ metrics:
185
+ value:
186
+ disable_color_printing: false
187
+ enable_tensorboard: false
188
+ enable_wandb: true
189
+ log_freq: 1
190
+ save_for_all_ranks: false
191
+ save_tb_folder: tb
192
+ model:
193
+ value:
194
+ config: configs/hamilton_340M.json
195
+ converters: []
196
+ name: fla
197
+ print_after_conversion: false
198
+ tokenizer_path: fla-hub/transformer-1.3B-100B
199
+ optimizer:
200
+ value:
201
+ beta1: 0.9
202
+ beta2: 0.95
203
+ early_step_in_backward: false
204
+ eps: 1e-15
205
+ implementation: fused
206
+ lr: 0.001
207
+ name: AdamW
208
+ weight_decay: 0.1
209
+ profiling:
210
+ value:
211
+ enable_memory_snapshot: false
212
+ enable_profiling: true
213
+ profile_freq: 512
214
+ save_memory_snapshot_folder: memory_snapshot
215
+ save_traces_folder: profile_trace
216
+ training:
217
+ value:
218
+ batch_size: 1
219
+ compile: true
220
+ context_len: 4096
221
+ data_dir: null
222
+ data_files: null
223
+ data_parallel_replicate_degree: 1
224
+ data_parallel_shard_degree: -1
225
+ data_probs: null
226
+ dataset: /autodl-fs
227
+ dataset_name: default
228
+ dataset_split: train
229
+ deterministic: false
230
+ disable_loss_parallel: false
231
+ enable_cpu_offload: false
232
+ fsdp_reshard_after_forward: default
233
+ gc_freq: 50
234
+ gradient_accumulation_steps: 6
235
+ max_norm: 1
236
+ mixed_precision_param: bfloat16
237
+ mixed_precision_reduce: float32
238
+ num_workers: 64
239
+ persistent_workers: false
240
+ pin_memory: false
241
+ prefetch_factor: 2
242
+ seed: 42
243
+ seq_len: 11000
244
+ skip_nan_inf: true
245
+ steps: 81920
246
+ streaming: false
247
+ tensor_parallel_degree: 1
248
+ varlen: true
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/output.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3237921edeadd36f7707bf546a21cbf7e273134dff308f78c6063f96ef8fd8f8
3
+ size 38074835
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/requirements.txt ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flame==0.1.0
2
+ absl-py==2.3.1
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.13.2
5
+ aiosignal==1.4.0
6
+ annotated-types==0.7.0
7
+ anyio==4.12.0
8
+ attrs==25.4.0
9
+ blobfile==3.1.0
10
+ certifi==2025.11.12
11
+ charset-normalizer==3.4.4
12
+ click==8.3.1
13
+ datasets==4.4.1
14
+ dill==0.4.0
15
+ docstring_parser==0.17.0
16
+ einops==0.8.1
17
+ filelock==3.20.0
18
+ flame==0.1.0
19
+ flash-linear-attention==0.4.1
20
+ frozenlist==1.8.0
21
+ fsspec==2025.10.0
22
+ gitdb==4.0.12
23
+ GitPython==3.1.45
24
+ grpcio==1.76.0
25
+ h11==0.16.0
26
+ hf-xet==1.2.0
27
+ httpcore==1.0.9
28
+ httpx==0.28.1
29
+ huggingface-hub==0.36.0
30
+ idna==3.11
31
+ Jinja2==3.1.6
32
+ lxml==6.0.2
33
+ Markdown==3.10
34
+ MarkupSafe==3.0.3
35
+ mpmath==1.3.0
36
+ multidict==6.7.0
37
+ multiprocess==0.70.18
38
+ networkx==3.6.1
39
+ ninja==1.13.0
40
+ numpy==2.3.5
41
+ nvidia-cublas-cu12==12.8.4.1
42
+ nvidia-cuda-cupti-cu12==12.8.90
43
+ nvidia-cuda-nvrtc-cu12==12.8.93
44
+ nvidia-cuda-runtime-cu12==12.8.90
45
+ nvidia-cudnn-cu12==9.10.2.21
46
+ nvidia-cufft-cu12==11.3.3.83
47
+ nvidia-cufile-cu12==1.13.1.3
48
+ nvidia-curand-cu12==10.3.9.90
49
+ nvidia-cusolver-cu12==11.7.3.90
50
+ nvidia-cusparse-cu12==12.5.8.93
51
+ nvidia-cusparselt-cu12==0.7.1
52
+ nvidia-nccl-cu12==2.27.5
53
+ nvidia-nvjitlink-cu12==12.8.93
54
+ nvidia-nvshmem-cu12==3.3.20
55
+ nvidia-nvtx-cu12==12.8.90
56
+ packaging==25.0
57
+ pandas==2.3.3
58
+ pillow==12.0.0
59
+ pip==25.3
60
+ platformdirs==4.5.1
61
+ propcache==0.4.1
62
+ protobuf==6.33.2
63
+ pyarrow==22.0.0
64
+ pycryptodomex==3.23.0
65
+ pydantic==2.12.5
66
+ pydantic_core==2.41.5
67
+ python-dateutil==2.9.0.post0
68
+ pytz==2025.2
69
+ PyYAML==6.0.3
70
+ regex==2025.11.3
71
+ requests==2.32.5
72
+ safetensors==0.7.0
73
+ sentry-sdk==2.47.0
74
+ setuptools==80.9.0
75
+ six==1.17.0
76
+ smmap==5.0.2
77
+ sympy==1.14.0
78
+ tensorboard==2.20.0
79
+ tensorboard-data-server==0.7.2
80
+ tiktoken==0.12.0
81
+ tokenizers==0.22.1
82
+ tomli==2.3.0
83
+ torch==2.9.1
84
+ torchdata==0.11.0
85
+ torchtitan==0.1.0
86
+ tqdm==4.67.1
87
+ transformers==4.57.3
88
+ triton==3.5.1
89
+ typeguard==4.4.4
90
+ typing_extensions==4.15.0
91
+ typing-inspection==0.4.2
92
+ tyro==1.0.1
93
+ tzdata==2025.2
94
+ urllib3==2.6.2
95
+ wandb==0.23.1
96
+ Werkzeug==3.1.4
97
+ wheel==0.45.1
98
+ xxhash==3.6.0
99
+ yarl==1.22.0
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-metadata.json ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-78-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.7",
4
+ "startedAt": "2025-12-12T19:37:35.978261Z",
5
+ "args": [
6
+ "--job.config_file",
7
+ "flame/models/fla.toml",
8
+ "--job.dump_folder",
9
+ "exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine",
10
+ "--model.config",
11
+ "configs/hamilton_340M.json",
12
+ "--model.tokenizer_path",
13
+ "fla-hub/transformer-1.3B-100B",
14
+ "--optimizer.name",
15
+ "AdamW",
16
+ "--optimizer.eps",
17
+ "1e-15",
18
+ "--optimizer.lr",
19
+ "1e-3",
20
+ "--lr_scheduler.warmup_steps",
21
+ "1024",
22
+ "--lr_scheduler.lr_min",
23
+ "0.1",
24
+ "--lr_scheduler.decay_type",
25
+ "cosine",
26
+ "--training.batch_size",
27
+ "1",
28
+ "--training.seq_len",
29
+ "11000",
30
+ "--training.context_len",
31
+ "4096",
32
+ "--training.varlen",
33
+ "--training.gradient_accumulation_steps",
34
+ "6",
35
+ "--training.steps",
36
+ "81920",
37
+ "--training.max_norm",
38
+ "1.0",
39
+ "--training.skip_nan_inf",
40
+ "--training.dataset",
41
+ "/autodl-fs",
42
+ "--training.dataset_name",
43
+ "default",
44
+ "--training.dataset_split",
45
+ "train",
46
+ "--training.num_workers",
47
+ "64",
48
+ "--training.prefetch_factor",
49
+ "2",
50
+ "--training.seed",
51
+ "42",
52
+ "--checkpoint.interval",
53
+ "2048",
54
+ "--checkpoint.load_step",
55
+ "-1",
56
+ "--checkpoint.keep_latest_k",
57
+ "2",
58
+ "--metrics.log_freq",
59
+ "1",
60
+ "--training.compile"
61
+ ],
62
+ "program": "-m flame.train",
63
+ "git": {
64
+ "remote": "https://github.com/fla-org/flame.git",
65
+ "commit": "e11e7be75b9e45e84dbecbe8f0efa27d6af7d101"
66
+ },
67
+ "email": "wangzhenbin@stu.scu.edu.cn",
68
+ "root": "exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337",
69
+ "host": "autodl-container-3b7944880d-07a10e2a",
70
+ "executable": "/root/miniconda3/envs/hami/bin/python3.11",
71
+ "cpu_count": 104,
72
+ "cpu_count_logical": 208,
73
+ "gpu": "NVIDIA RTX PRO 6000 Blackwell Server Edition",
74
+ "gpu_count": 1,
75
+ "disk": {
76
+ "/": {
77
+ "total": "32212254720",
78
+ "used": "22135824384"
79
+ }
80
+ },
81
+ "memory": {
82
+ "total": "1081796993024"
83
+ },
84
+ "gpu_nvidia": [
85
+ {
86
+ "name": "NVIDIA RTX PRO 6000 Blackwell Server Edition",
87
+ "memoryTotal": "102641958912",
88
+ "cudaCores": 24064,
89
+ "architecture": "Blackwell",
90
+ "uuid": "GPU-de95cbc9-4fb5-9a7b-0649-e302580dd50b"
91
+ }
92
+ ],
93
+ "cudaVersion": "13.0",
94
+ "writerId": "i0cpzc2r8qmav38fgt0h4i6tgznjgbr7"
95
+ }
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"time_metrics/data_loading(%)":0.5803219584911921,"loss_metrics/global_max_loss":2.9381303787231445,"optimizer/skipped_step":0,"mfu(%)":11.607521852909128,"memory/num_alloc_retries":0,"tflops":36.215468181076474,"throughput(tps)":10864.281324639476,"optimizer/lr":0.0001,"time_metrics/data_loading(s)":0.005875714511300127,"_wandb":{"runtime":492559},"memory/max_reserved(%)":93.53824322735011,"memory/max_active(GiB)":88.26861763000488,"_timestamp":1.7660608023964982e+09,"_runtime":492559.517721682,"time_metrics/end_to_end(s)":6.074953144881874,"memory/max_reserved(GiB)":88.833984375,"optimizer/grad_norm":0.8941753506660461,"memory/max_active(%)":92.94293713500184,"memory/num_ooms":0,"loss_metrics/global_avg_loss":2.9381303787231445,"_step":81920}
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-13T03:37:36.129026081+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpqdibn3bf/port-50239.txt","pid":50239,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-13T03:37:36.130196984+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":50239}
3
+ {"time":"2025-12-13T03:37:36.130017672+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-50239-50346-3756975333/socket","Net":"unix"}}
4
+ {"time":"2025-12-13T03:37:36.218871979+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-13T03:37:36.231072223+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
6
+ {"time":"2025-12-13T03:37:37.428562008+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
7
+ {"time":"2025-12-18T20:27:20.836424535+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
8
+ {"time":"2025-12-18T20:27:20.841392619+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337","id":"1(@)"}
9
+ {"time":"2025-12-18T20:27:28.809165523+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-12-18T20:27:28.809238229+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-12-18T20:27:28.809255391+08:00","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-12-18T20:27:28.809339818+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2025-12-18T20:27:28.809382419+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-50239-50346-3756975333/socket","Net":"unix"}}
14
+ {"time":"2025-12-18T20:27:28.809467482+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
15
+ {"time":"2025-12-18T20:27:28.809478249+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
16
+ {"time":"2025-12-18T20:27:28.809495971+08:00","level":"INFO","msg":"server is closed"}
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-13T03:37:36.23132477+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-13T03:37:37.42793721+08:00","level":"INFO","msg":"stream: created new stream","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
3
+ {"time":"2025-12-13T03:37:37.42811984+08:00","level":"INFO","msg":"handler: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
4
+ {"time":"2025-12-13T03:37:37.428536603+08:00","level":"INFO","msg":"stream: started","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
5
+ {"time":"2025-12-13T03:37:37.428580841+08:00","level":"INFO","msg":"sender: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
6
+ {"time":"2025-12-13T03:37:37.428589238+08:00","level":"INFO","msg":"writer: started","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
7
+ {"time":"2025-12-13T06:05:39.583033219+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
8
+ {"time":"2025-12-13T07:55:37.649211253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59798->35.186.228.49:443: read: connection reset by peer"}
9
+ {"time":"2025-12-13T08:13:29.786188657+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
10
+ {"time":"2025-12-13T08:16:47.670018036+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
11
+ {"time":"2025-12-13T08:59:24.082351789+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
12
+ {"time":"2025-12-13T09:12:13.694820381+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
13
+ {"time":"2025-12-13T09:33:57.835049052+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
14
+ {"time":"2025-12-13T09:43:28.441384157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:35074->35.186.228.49:443: read: connection reset by peer"}
15
+ {"time":"2025-12-13T09:55:59.558604972+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55632->35.186.228.49:443: read: connection reset by peer"}
16
+ {"time":"2025-12-13T10:04:15.521407235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
17
+ {"time":"2025-12-13T10:20:21.499848188+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
18
+ {"time":"2025-12-13T10:35:23.478678967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
19
+ {"time":"2025-12-13T10:51:06.294920404+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
20
+ {"time":"2025-12-13T13:14:41.311911939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
21
+ {"time":"2025-12-13T14:59:00.895291828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42956->35.186.228.49:443: read: connection reset by peer"}
22
+ {"time":"2025-12-13T15:28:48.295525438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
23
+ {"time":"2025-12-13T17:26:06.240844168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
24
+ {"time":"2025-12-13T17:33:13.927600494+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
25
+ {"time":"2025-12-13T17:38:26.684259493+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
26
+ {"time":"2025-12-13T17:59:17.135876142+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
27
+ {"time":"2025-12-13T18:21:53.605231099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49898->35.186.228.49:443: read: connection reset by peer"}
28
+ {"time":"2025-12-13T20:53:31.081839781+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
29
+ {"time":"2025-12-13T21:01:51.042167406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
30
+ {"time":"2025-12-13T21:16:58.046134274+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
31
+ {"time":"2025-12-13T21:19:47.88199423+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
32
+ {"time":"2025-12-13T21:50:37.427793114+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
33
+ {"time":"2025-12-13T22:04:36.210586263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57666->35.186.228.49:443: read: connection reset by peer"}
34
+ {"time":"2025-12-13T22:05:58.336461319+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46820->35.186.228.49:443: read: connection reset by peer"}
35
+ {"time":"2025-12-13T22:27:53.913619965+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:37800->35.186.228.49:443: read: connection reset by peer"}
36
+ {"time":"2025-12-13T23:09:40.869315801+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
37
+ {"time":"2025-12-13T23:10:21.955543682+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
38
+ {"time":"2025-12-13T23:16:09.301636231+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
39
+ {"time":"2025-12-14T00:23:39.770203057+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
40
+ {"time":"2025-12-14T00:24:17.145903336+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
41
+ {"time":"2025-12-14T02:56:23.178244085+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43334->35.186.228.49:443: read: connection reset by peer"}
42
+ {"time":"2025-12-14T03:18:51.662361676+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
43
+ {"time":"2025-12-14T03:19:23.322670147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
44
+ {"time":"2025-12-14T03:47:51.213153679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
45
+ {"time":"2025-12-14T04:20:52.250790079+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
46
+ {"time":"2025-12-14T04:21:11.202821697+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
47
+ {"time":"2025-12-14T07:06:08.967902177+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
48
+ {"time":"2025-12-14T07:06:54.195739279+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
49
+ {"time":"2025-12-14T07:13:30.365530787+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
50
+ {"time":"2025-12-14T07:26:06.7209204+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:48478->35.186.228.49:443: read: connection reset by peer"}
51
+ {"time":"2025-12-14T09:25:09.070791287+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
52
+ {"time":"2025-12-14T10:46:12.82568069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
53
+ {"time":"2025-12-14T10:48:43.885806313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
54
+ {"time":"2025-12-14T11:26:11.723626581+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
55
+ {"time":"2025-12-14T11:31:48.355445234+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
56
+ {"time":"2025-12-14T11:41:59.68850527+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
57
+ {"time":"2025-12-14T12:27:33.380879248+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
58
+ {"time":"2025-12-14T12:36:32.807927117+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57628->35.186.228.49:443: read: connection reset by peer"}
59
+ {"time":"2025-12-14T12:47:10.357920901+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
60
+ {"time":"2025-12-14T12:47:47.340023823+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
61
+ {"time":"2025-12-14T13:06:22.738392825+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:54890->35.186.228.49:443: read: connection reset by peer"}
62
+ {"time":"2025-12-14T13:06:40.383011533+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:49102->35.186.228.49:443: read: connection reset by peer"}
63
+ {"time":"2025-12-14T13:07:04.782062642+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
64
+ {"time":"2025-12-14T13:14:14.561018246+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
65
+ {"time":"2025-12-14T14:48:10.912184589+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
66
+ {"time":"2025-12-14T14:54:11.088938602+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
67
+ {"time":"2025-12-14T14:58:12.755764947+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
68
+ {"time":"2025-12-14T15:46:48.897060165+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57112->35.186.228.49:443: read: connection reset by peer"}
69
+ {"time":"2025-12-14T16:16:18.043599985+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
70
+ {"time":"2025-12-14T16:25:00.963481603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41088->35.186.228.49:443: read: connection reset by peer"}
71
+ {"time":"2025-12-14T17:14:49.422743755+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
72
+ {"time":"2025-12-14T17:21:40.725676457+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
73
+ {"time":"2025-12-14T18:22:33.919294777+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55446->35.186.228.49:443: read: connection reset by peer"}
74
+ {"time":"2025-12-14T18:24:49.798972384+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
75
+ {"time":"2025-12-14T18:25:31.346842356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
76
+ {"time":"2025-12-14T20:49:52.211146515+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
77
+ {"time":"2025-12-14T21:04:32.293978006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57160->35.186.228.49:443: read: connection reset by peer"}
78
+ {"time":"2025-12-14T21:10:28.561725795+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
79
+ {"time":"2025-12-14T21:55:52.598096464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
80
+ {"time":"2025-12-14T21:56:23.294629143+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
81
+ {"time":"2025-12-14T22:54:03.471360435+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
82
+ {"time":"2025-12-14T23:54:12.865835537+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
83
+ {"time":"2025-12-15T00:03:53.643020298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": context deadline exceeded"}
84
+ {"time":"2025-12-15T00:04:50.877883568+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
85
+ {"time":"2025-12-15T00:08:44.968783043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": http2: client conn is closed"}
86
+ {"time":"2025-12-15T00:29:18.768015366+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
87
+ {"time":"2025-12-15T00:38:18.876310425+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
88
+ {"time":"2025-12-15T00:38:47.676509586+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
89
+ {"time":"2025-12-15T02:19:16.46063906+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
90
+ {"time":"2025-12-15T02:28:23.925958967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
91
+ {"time":"2025-12-15T02:28:41.801869127+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
92
+ {"time":"2025-12-15T02:40:36.54023762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55584->35.186.228.49:443: read: connection reset by peer"}
93
+ {"time":"2025-12-15T02:41:12.850172357+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
94
+ {"time":"2025-12-15T02:48:24.98882141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
95
+ {"time":"2025-12-15T02:54:09.717068313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
96
+ {"time":"2025-12-15T10:55:04.891023931+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43212->35.186.228.49:443: read: connection reset by peer"}
97
+ {"time":"2025-12-15T11:29:29.21607297+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
98
+ {"time":"2025-12-15T12:32:24.098647438+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:57632->35.186.228.49:443: read: connection reset by peer"}
99
+ {"time":"2025-12-15T12:41:37.076316554+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42078->35.186.228.49:443: read: connection reset by peer"}
100
+ {"time":"2025-12-15T12:47:58.065901858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41220->35.186.228.49:443: read: connection reset by peer"}
101
+ {"time":"2025-12-15T12:50:55.201423813+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
102
+ {"time":"2025-12-15T12:55:29.869715501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
103
+ {"time":"2025-12-15T13:03:43.614622176+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43786->35.186.228.49:443: read: connection reset by peer"}
104
+ {"time":"2025-12-15T13:08:32.946667579+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
105
+ {"time":"2025-12-15T14:27:50.584368281+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
106
+ {"time":"2025-12-15T14:37:01.10117924+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50868->35.186.228.49:443: read: connection reset by peer"}
107
+ {"time":"2025-12-15T14:48:40.332940473+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46498->35.186.228.49:443: read: connection reset by peer"}
108
+ {"time":"2025-12-15T14:57:41.943269175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:47484->35.186.228.49:443: read: connection reset by peer"}
109
+ {"time":"2025-12-15T15:00:55.7967066+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
110
+ {"time":"2025-12-15T15:01:22.024522099+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
111
+ {"time":"2025-12-15T15:08:02.464314661+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
112
+ {"time":"2025-12-15T15:17:47.835880263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:38190->35.186.228.49:443: read: connection reset by peer"}
113
+ {"time":"2025-12-15T18:11:47.925566439+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
114
+ {"time":"2025-12-15T20:09:04.562971386+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
115
+ {"time":"2025-12-15T20:34:41.644855631+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:44234->35.186.228.49:443: read: connection reset by peer"}
116
+ {"time":"2025-12-15T22:01:07.926213466+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
117
+ {"time":"2025-12-15T22:03:00.514128232+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
118
+ {"time":"2025-12-15T23:06:03.297559161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
119
+ {"time":"2025-12-16T12:24:09.162841753+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:34774->35.186.228.49:443: read: connection reset by peer"}
120
+ {"time":"2025-12-16T12:24:35.986195599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
121
+ {"time":"2025-12-16T12:33:31.359618103+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:43794->35.186.228.49:443: read: connection reset by peer"}
122
+ {"time":"2025-12-16T15:38:33.839518601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:41104->35.186.228.49:443: read: connection reset by peer"}
123
+ {"time":"2025-12-16T15:47:21.247612566+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
124
+ {"time":"2025-12-16T15:57:38.882102506+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
125
+ {"time":"2025-12-16T15:59:29.910928408+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
126
+ {"time":"2025-12-16T16:09:32.602645496+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:52806->35.186.228.49:443: read: connection reset by peer"}
127
+ {"time":"2025-12-16T16:32:07.534727269+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
128
+ {"time":"2025-12-16T17:31:52.152175035+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:46382->35.186.228.49:443: read: connection reset by peer"}
129
+ {"time":"2025-12-17T07:31:40.798122999+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
130
+ {"time":"2025-12-17T07:58:44.050237073+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
131
+ {"time":"2025-12-17T09:00:16.094926589+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
132
+ {"time":"2025-12-17T17:26:32.547706698+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:42782->35.186.228.49:443: read: connection reset by peer"}
133
+ {"time":"2025-12-17T19:56:26.256819603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58656->35.186.228.49:443: read: connection reset by peer"}
134
+ {"time":"2025-12-17T20:10:25.794344881+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:58790->35.186.228.49:443: read: connection reset by peer"}
135
+ {"time":"2025-12-17T21:56:52.633835476+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
136
+ {"time":"2025-12-17T22:42:02.308668945+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
137
+ {"time":"2025-12-17T23:09:48.436361959+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:50494->35.186.228.49:443: read: connection reset by peer"}
138
+ {"time":"2025-12-17T23:14:23.558114444+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
139
+ {"time":"2025-12-17T23:27:17.340085956+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
140
+ {"time":"2025-12-18T10:58:47.675459253+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:55386->35.186.228.49:443: read: connection reset by peer"}
141
+ {"time":"2025-12-18T16:25:51.98607741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
142
+ {"time":"2025-12-18T16:28:49.478972424+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
143
+ {"time":"2025-12-18T16:29:58.352287741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:33476->35.186.228.49:443: read: connection reset by peer"}
144
+ {"time":"2025-12-18T16:33:06.3310878+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": read tcp 172.17.0.4:59986->35.186.228.49:443: read: connection reset by peer"}
145
+ {"time":"2025-12-18T16:33:39.806302479+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
146
+ {"time":"2025-12-18T16:54:27.647048623+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
147
+ {"time":"2025-12-18T17:54:07.658597189+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
148
+ {"time":"2025-12-18T19:31:48.116846134+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/file_stream\": unexpected EOF"}
149
+ {"time":"2025-12-18T20:27:17.321941239+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
150
+ {"time":"2025-12-18T20:27:20.824274187+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
151
+ {"time":"2025-12-18T20:27:20.83649299+08:00","level":"INFO","msg":"stream: closing","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
152
+ {"time":"2025-12-18T20:27:20.83651967+08:00","level":"INFO","msg":"handler: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
153
+ {"time":"2025-12-18T20:27:20.836703664+08:00","level":"INFO","msg":"sender: closed","stream_id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
154
+ {"time":"2025-12-18T20:27:20.836740475+08:00","level":"INFO","msg":"stream: closed","id":"-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337"}
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Configure stats pid to 50239
3
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from /root/autodl-tmp/flame/wandb/settings
5
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():714] Logging user logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug.log
7
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine/tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/logs/debug-internal.log
8
+ 2025-12-13 03:37:35,980 INFO MainThread:50239 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'job': defaultdict(None, {'config_file': 'flame/models/fla.toml', 'dump_folder': 'exp/hamilton-340M-4K-10B/batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine', 'description': 'default job', 'use_for_integration_test': False, 'print_args': True}), 'model': defaultdict(None, {'name': 'fla', 'config': 'configs/hamilton_340M.json', 'tokenizer_path': 'fla-hub/transformer-1.3B-100B', 'converters': [], 'print_after_conversion': False}), 'profiling': defaultdict(None, {'enable_profiling': True, 'save_traces_folder': 'profile_trace', 'profile_freq': 512, 'enable_memory_snapshot': False, 'save_memory_snapshot_folder': 'memory_snapshot'}), 'optimizer': defaultdict(None, {'name': 'AdamW', 'eps': 1e-15, 'lr': 0.001, 'beta1': 0.9, 'beta2': 0.95, 'weight_decay': 0.1, 'implementation': 'fused', 'early_step_in_backward': False}), 'lr_scheduler': defaultdict(None, {'warmup_steps': 1024, 'decay_ratio': None, 'decay_type': 'cosine', 'lr_min': 0.1}), 'training': defaultdict(None, {'batch_size': 1, 'seq_len': 11000, 'context_len': 4096, 'varlen': True, 'gradient_accumulation_steps': 6, 'steps': 81920, 'max_norm': 1.0, 'skip_nan_inf': True, 'dataset': '/autodl-fs', 'dataset_name': 'default', 'dataset_split': 'train', 'data_dir': None, 'data_files': None, 'data_probs': None, 'streaming': False, 'num_workers': 64, 'prefetch_factor': 2, 'data_parallel_replicate_degree': 1, 'data_parallel_shard_degree': -1, 'enable_cpu_offload': False, 'tensor_parallel_degree': 1, 'disable_loss_parallel': False, 'fsdp_reshard_after_forward': 'default', 'mixed_precision_param': 'bfloat16', 'mixed_precision_reduce': 'float32', 'compile': True, 'gc_freq': 50, 'seed': 42, 'deterministic': False, 'pin_memory': False, 'persistent_workers': False}), 'metrics': defaultdict(None, {'log_freq': 1, 'enable_tensorboard': False, 'disable_color_printing': False, 'save_tb_folder': 'tb', 'save_for_all_ranks': False, 'enable_wandb': True}), 'experimental': defaultdict(None, {'enable_async_tensor_parallel': False, 'pipeline_parallel_degree': 1, 'pipeline_parallel_split_points': [], 'pipeline_parallel_schedule': '1F1B', 'pipeline_parallel_schedule_csv': '', 'pipeline_parallel_microbatches': None, 'enable_compiled_autograd': False, 'context_parallel_degree': 1, 'context_parallel_rotate_method': 'allgather', 'custom_model_path': ''}), 'checkpoint': defaultdict(None, {'enable_checkpoint': True, 'folder': 'checkpoint', 'initial_load_path': None, 'initial_load_model_weights_only': True, 'interval': 2048, 'last_save_model_weights_only': False, 'export_dtype': 'float32', 'create_seed_checkpoint': False, 'async_mode': 'disabled', 'keep_latest_k': 2, 'load_step': -1, 'exclude_from_loading': [], 'interval_type': 'steps', 'model_weights_only': False}), 'activation_checkpoint': defaultdict(None, {'mode': 'none', 'selective_ac_option': '2'}), 'activation_offload': defaultdict(None, {'mode': 'none'}), 'float8': defaultdict(None, {'enable_fsdp_float8_all_gather': False, 'precompute_float8_dynamic_scale_for_fsdp': False, 'force_recompute_fp8_weight_in_bwd': False, 'recipe_name': None}), 'comm': defaultdict(None, {'init_timeout_seconds': 300, 'train_timeout_seconds': 100, 'trace_buf_size': 20000}), 'memory_estimation': defaultdict(None, {'enabled': False, 'disable_fake_mode': False}), 'fault_tolerance': defaultdict(None, {'enable': False, 'replica_id': 0, 'group_size': 0, 'min_replica_size': 1}), '_wandb': {}}
11
+ 2025-12-13 03:37:35,981 INFO MainThread:50239 [wandb_init.py:init():889] starting backend
12
+ 2025-12-13 03:37:36,219 INFO MainThread:50239 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-13 03:37:36,226 INFO MainThread:50239 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-13 03:37:36,227 INFO MainThread:50239 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-13 03:37:36,234 INFO MainThread:50239 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-13 03:37:38,639 INFO MainThread:50239 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-13 03:37:38,686 INFO MainThread:50239 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-13 03:37:38,692 INFO MainThread:50239 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-18 20:26:58,151 INFO MainThread:50239 [wandb_run.py:_finish():2287] finishing run wangzhenbin-sichuan-unversity/fla/-batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337
23
+ 2025-12-18 20:26:58,154 INFO MainThread:50239 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
24
+ 2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2468] restore
25
+ 2025-12-18 20:26:58,155 INFO MainThread:50239 [wandb_run.py:_restore():2474] restore done
26
+ 2025-12-18 20:27:20,834 INFO MainThread:50239 [wandb_run.py:_footer_sync_info():3862] logging synced files
tb/20251213-0337/wandb/run-20251213_033735--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337/run--batch1.seqlen4096.context4096.warmup1024.update1.steps20480.lr1e-3.cosine-202512130337.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74836b8ead1d0d7492e4b75607bb5cd88dcdf0b00ee7c0f719b9ac1df52dfcf1
3
+ size 356825749