diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_27.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a7592246c9e8ff30f9a992370623ec6a6dfa2f3 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6a8a717d3d40b6db1f5639b88bc5d47205b640e9386e4f37bd098e5eda5daa +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_28.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..a84cd33ce6293d1897f61598d42a91ee996960ad --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:775910d1a7a7706c1e3515d5d8a0db4801d53cd0b723951c968b7a3a20893c4a +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_29.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..12973002c88104a77ea63fdf6f984033bde68262 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8304cac36d71add982370e260223ae8ae0eddd699bb37d9268d5d7e8a8b31c13 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_3.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c727a4e9a5c5c71d847dd14ee25076837c1dbcf --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e40b8ac7caddc3307d2d64148bf6a0014ec887c8a73d20286fb9f5a46856f357 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_30.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..f91d1af0af95b9606394a32d43658dde4f196e3f --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36e83a28b25354cd8c80fb779bb1c6af4d7d8c6f55e5767f57d73184a7cb340 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_31.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..82887bf57fcefb775f1b02149472bc9f86700681 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8450c15fca4cbbb60c81b45fc613c0bc4d82e1af1fd87343dfd58894dd615375 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_4.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..bb02fc7fa54c6a0d45f706f703d018ada9ad1d85 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fd489f3430b21b2e0566aaf41a27676b720305912c40d7ddf5106ad8a1ddf22 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_5.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..a151f4380c66787fe4256d5fb16ee47f1f989d92 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79a0e270f9ee463dfd84a6f72aa8b87e907930304029aa0f17dfe68c675bc9c2 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_6.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ff9bc9d438c571aaafb3d7272a38e904aa0c0c3 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b54dfd7330b7451347970300c9f0023eda8e0a5349646c04610e4a06abc158 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_7.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..475e8030dca051ceb9bf7e606b080d40e8d77b74 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:458f6f7ab956ee8dc74076e55375b1015feb392a6162b5c0deb3acdac3da67c6 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_8.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef619975ecbc174799a5d5cf55b0e81410abe10c --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3381787bdda6713ec8dd3c4ef258d9b7017178976f030912854874c08c78102 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-16326/rng_state_9.pth b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..04aaa5e1df29eb5a9a95fb7498c4889ad5a53e21 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-16326/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cf979befc4905e0300b47c6bae1feda6d389cc5e9e416f222eb9a7be9ca2aab +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-3628/model.safetensors b/output_qwen3_0p6b_train/checkpoint-3628/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5590fa69b1b6462ef3d8e36ceb2166a02738494 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-3628/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abbbf5e4651e7be1770e2715f3ecbb20c23925b412b91878fdab85adf45d5ec7 +size 1632445832 diff --git a/output_qwen3_0p6b_train/checkpoint-4535/model.safetensors b/output_qwen3_0p6b_train/checkpoint-4535/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3e61be714416ac997c254bcd0ecd99ec74c904d --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-4535/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c13125b661c99289e8edf8583c9ea0b55f078ce278aefd6e2d6dd36cc8bcb16 +size 1632445832 diff --git a/output_qwen3_0p6b_train/checkpoint-5442/model.safetensors b/output_qwen3_0p6b_train/checkpoint-5442/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..72dfd9380a5b98ff92339b911865da1acecf626c --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-5442/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9214381f564ec5d489d4fd39f1ef051c9a93c3a1d2e4dc8e4430ffec0da5ee6c +size 1632445832 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_16.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5de41949a924ca4851c8a5304e47a1f0c98b880 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f76b87931cc1cd107bebba21782ed64014389b43720d00611bfab4a3b32d33 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_17.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8eea8b0cca49cdd83fdb0e666cf52435f62e2ea --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab80478282e10b3c89eadadcbed66537fa7bbbb670c19ac318278d38c7cc565e +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_18.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..a41a590ea8f83c3c29b8dfabfa5f885506e91dd2 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2344c0d24b44edc7d388331931439969b6669391126e3700937984de88cb58c0 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_19.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..a87d5c9fd98224c96fd336be10f001ac41bccd06 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bad4f1052a46e57db87b6189ed50e426bb61ebbe4dbca4c11963f9165d71146a +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_2.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2502e45e48d1171e19475fc1b813b7e50cf7347d --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3411cbea0e41c4235862faa08be5797cb49c4f117f88fb998145f76a5272748c +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_20.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..3077940cd67491d1da4aae48c80226d5c4dbd153 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6e1844ee8087ac39dae5e2aa6c95ff49144b0ab2c61d96f908816f672d31bea +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_21.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3767fd01947d8c3c48b1e0d092d333503b17a4a --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16ffe45c90625765fbe6c716a0a8f667f14c05693eb878926db65119150db6ef +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_22.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c8ca6b3ccb0f2dc74151ffeeb475032fc293299 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825d101310ecb5ead838859aae4b3ae99fe54083d7bb3a5cc96b6b1b3dd2eb4d +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_23.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..815c03933e10ce4d04cf3ac6e245495831d77d4d --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f6ba700a0fa44db333b40314288d83e12b7297c4fed2ecd01249cfb186700f +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_24.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..44fe560176dc3b7e69debfb1fc8d14aba050667d --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d780b10c82f4621af7d26854b317305ce0b6e6bc30b754e9dc914eb6803b57b +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_25.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..e9a83a75027009e3be27d49046a3c22edb4fe389 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab35de2630017c8a48696ee3744f51059ff37cb638ba55f9e382d90700afeeb9 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_26.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..50b8bfe73b8098a34981894969e2d830c3fcc0b4 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46fe1bd25af7420543e5b12c92a8211caa1f034f8b098898f2e554c3d4e4e299 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_27.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..c39b8dcd942160ef72ec15b7cd9807d7c76c24c6 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2f2e0afaff4a817d27c5a4b687a67d8816fbad159d8cd186c591033f8d880a +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_28.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..11c63ebe568a7101f2d5fb3bd123334af883566d --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e40bc9d163bdfc999eadf55aa665878288ee92100ce221d28d1728ee1ca134c +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_29.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..bc1827ca2c2f5a717a0bdce627e13bb300a7dee0 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f11d6a76a688d32e4f4900272ee60a0061782f355b1a436467ab3c662a29dd5c +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_3.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc14ee7f50df71e1b6b0e8bb8607fff7fda81dc8 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7691d11fbe82492fec111a5830155238c25728750e82698f7d23855cb82b53a8 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_30.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6d055c0c764f8ed70528e437de1f5125947d859 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e89477d783385923985f6ab50c1bae941e8ea664512eb5a755a9ab1a902b44 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_31.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..05271f77971704cb1af5d3be3e83f3de89cac320 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:295f0ac007a01b9d72962ebae83ce7cabb93d36c7315e5136deb35fc7450a502 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_4.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9042b6b9485acf93af602e07f2cb6f41bf9c233 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a5f79b0ed76b818e7fecb80a531561634336a3effe487528849a6a82ba15c9 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_5.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a0f3b9c8822d3a490ba059ebc14c00284a484ea --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5943f18dc74c59db00edeab291cde824bda135021d855f434a2aee3a9193eb86 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_6.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a674b8e5fb0ced7dc4de9c9b508aba6eb57a7652 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdfcea25b44f81a168f5068f564e1975162f3f49d310f3aab202c753ae3b0f28 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_7.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..25bcaecf1a40262c466b3e6d32512bc8e05f8c57 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af7ba0e9ebef5f4e51eb6bc8419abb1441975e527ae256c0be3bb6d5203f7466 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_8.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d0ccc36b5c8905b04b1b3500e54fce6139c3144 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e9e0f8b82c6e5c1043dbc6346451c1f31d6f7036c06346e6baff50e1172f61 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/rng_state_9.pth b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..07e9626908371e1a69f6174d9fc5b48db97191bc --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d3dedafc0af97b9d7c4ac8bcdf665eb3f096b308666a3e5f56ed2d7333a135 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/scheduler.pt b/output_qwen3_0p6b_train/checkpoint-6349/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..560112b86658c3da6cb1d8f75f741a2e177e09e8 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f586bdaed00b2d086cc464a58bb8bef96f5ba7cfb4b4ad6fe606fc0a4d6d0418 +size 1465 diff --git a/output_qwen3_0p6b_train/checkpoint-6349/training_args.bin b/output_qwen3_0p6b_train/checkpoint-6349/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3a0dbcba3f5a95f6168e57e344653b698d02d04e --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-6349/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:601e648bc1a82b1b1c801ec4413a004a2eda7392959c7b3593ab4abca8069c02 +size 6929 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_0.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fb080cbe0f537c2f2bf3fa2554b24bf6c39062e --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b521ba0c150a32d9aee7651b07b9b6088f4bae7a64f9253be331b5ef76b54a55 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_1.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7cd918aff6b0e71f1b773746416871a3f853fea1 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b7cb3785b4dc089095c8929381ed7795af6f9ae2f4f0411c0a36904bca5197 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_10.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..04c90245995abf9110526f99e47d18338c336c01 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f69707be87b63675b7a9c7d281f905ae885ef95a5d31ee3335e590acb54ee9d0 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_11.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..1647d1f261d2cfd883ea0a655311768070fbd3a1 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab70cf440b017f662e538619e85be9e1a90f0c196c27bed4ceb1daa02772ad3 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_12.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a6802568ea6516b96ee20a1acc687b5ac3b2402 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735900057a6a5f114a5fa94e5db685e2775539ab09327e71d1d7822606937a35 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_13.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..732cdad497db80782de74feb99556f31854c72c1 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:658cf316a9ea07b2b2233c3f24276a62fef9e8e17e2ea9f2f80871df7aff1a53 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_14.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..efcf03b0ca150e0b0538adb15e515d01f4743eda --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:def783c896e50a366f9c354e0f4c909ce323642b9c21a9815b58d8a28653e8e4 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_15.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc465b0695610efda49a01e499cf8fcad36784ce --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4d95955ab2df101a32a45c30e88135520734d9541392956b48ce77cc26d49d8 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_16.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..474dcc132160a0b6248e6b85f8d7e5f7a77d8b12 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a5baad63f7044f8196697be0890a2ce0dc3669a42e81df98544da3533ab045 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_17.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..10d0f68660e6422a6951cf0157f15102fd31b9f0 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e74f143a5bf00bd1f6872365ff8e9ea576e4a83ae3d49a70c17ca74548f32d +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_18.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..cee9a1bf06b2f44cd1a5405d45df5681fa347bb4 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3796534140b33f9f5c502255b98f3f381f3908a932f617e06b442d943d99935b +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_19.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd4a5244c407ad4915f37776425fcaa244ac3f0a --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8365b07b05849ae7f1512a6d51d5e6c2d65607bced9d240f142d85a3250f4548 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_2.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..8967ac66b457da41aa1734ccc49ef795565a0081 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9cc9ebb653b99b5ff947dcc8be11cfcb5a1683c5a88552ecf9e63eaab649c46 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_20.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..18e551114a2e3bdff3b88a5a95696dab94e773b0 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:867561ac3e9ca9dd4ec0573534c7813ba2408fed48f156153c22a8846bf34c5e +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_21.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..7192f73ea913fa291bfa305887034f2a843be138 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0db8a31388a5741b11a3f0538d4f772ed0e4b6d14a78adfd617d6777719cf05 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_22.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef04e3e57aa4cde5b9a3771b9c935bc2a863cc9 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f585ff1b6249a495143140481c7453ad77b978342e958f00e2cd45348a525d8a +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_23.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..de0950d88ce2d68216f8cecbb955ab698e7d7c66 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9fa94dd3f41c2c854af24e0f2a5c6c12c12027dac5f2d78191e76c69610748 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_24.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..e562681758e219f16c80f7b03850e3d9c082172b --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e1312f73b1e588b1b39b1ab7c29914d4770bc6d012f5f33b7bd01c0b1ac64de +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_25.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcb092d5f30cacb2f5c9bd0f2269ac6fb11763d3 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9732b5283e1e5e2ae84e58c66d3b1d22caba06548064148edf6b7bb033700f +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_26.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd246fc1c8360ef70f0a5980baf032dd1f05df8f --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c8459dcbf3fdeae2ea5360195c8e3dcec37724d67e19f70828330d3f2b63dee +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_27.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd344716dd15de0c6462f05c533d2d0f4b784495 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a43dba0c389ecca0801c27a275e1485b30c803690de4a78067e5214025e1270 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_28.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..c21b22b10e7b0c8618bfafc0865c92ce5a22d3fd --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e269db9e7aae41045797a13bbce33a86317fb52142b91018073ee0100015fe0 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_29.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..418ddb9b4bc4580734626aadfddb6b4df68d0fb9 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2f41ccecc7e70446e53648da51571d3748bc3a4423a8db6102345c303e04c1 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_3.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a58cada53f9e2e9429979bc550f30ff7cf8af76 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a86ad88d0740462efc3f696b60b39a70cc8825a90fd8e63e605f043255216394 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_30.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..e93db8c58e0745d99aa1cc2f9addefe1f593ff47 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d24740deaa3c8777864c969292d5cb102b7ec148c486af9f3e7788acc3fe860 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_31.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..aab48a121e34827d32eaa308df9461b44341a408 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9183a2a9acb07e01ccd09e34eeb62f2298b8f29e99b895d2b6bcccbc5845d4f1 +size 16340 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_4.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d435047b37ea7b51c80b197c8da2ecc94a01766 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ddd063b414d95d23c619690ab807ba826e43d4eb9ea8579022f7362489e78c5 +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_5.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a5aecafa2b24e27531b0e1b488fec91fc99e2c5 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11bc4f0af157fad1a98808d3f09fe0ef49419d97e0121dca83c796e8b4a41d4b +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_6.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..92704148fb892740f2c25a75a8e0551f49c3c0b2 --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23c64c82b711e7e80ba38745bc97f1ce2beb3a830a51b1eed6ef317ee65a53ce +size 16325 diff --git a/output_qwen3_0p6b_train/checkpoint-7256/rng_state_7.pth b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..67a49af2eb66f23bb112b447d97ce4947be76dcd --- /dev/null +++ b/output_qwen3_0p6b_train/checkpoint-7256/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba7dcf8d173a1a913cf2b9a2c8fc5df1e37049e01f7f4df1b67ff9b8c1e1aea9 +size 16325 diff --git a/vocab/__pycache__/__init__.cpython-310.pyc b/vocab/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91e89eee73ff5d37aaf442c4b88105a8a0ea2d70 Binary files /dev/null and b/vocab/__pycache__/__init__.cpython-310.pyc differ diff --git a/vocab/__pycache__/__init__.cpython-312.pyc b/vocab/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ce227d4ec9e642add7520e4998a2fbcab839806 Binary files /dev/null and b/vocab/__pycache__/__init__.cpython-312.pyc differ diff --git a/vocab/__pycache__/__init__.cpython-313.pyc b/vocab/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c032183d01619ab75a87d285d712607439d5137 Binary files /dev/null and b/vocab/__pycache__/__init__.cpython-313.pyc differ diff --git a/vocab/__pycache__/chord.cpython-310.pyc b/vocab/__pycache__/chord.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf14f54b95f9d087f06d45697db803d1e8ac87f0 Binary files /dev/null and b/vocab/__pycache__/chord.cpython-310.pyc differ diff --git a/vocab/__pycache__/chord.cpython-312.pyc b/vocab/__pycache__/chord.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84909d2b134463a59aac41fb80642496e2ef51e1 Binary files /dev/null and b/vocab/__pycache__/chord.cpython-312.pyc differ diff --git a/vocab/__pycache__/chord.cpython-313.pyc b/vocab/__pycache__/chord.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..849d0a310ba7361973d5a9d654ed902cbc9c3579 Binary files /dev/null and b/vocab/__pycache__/chord.cpython-313.pyc differ diff --git a/vocab/__pycache__/key.cpython-313.pyc b/vocab/__pycache__/key.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e91154fd17356ea9975fee4151d4d45d8d3b35b4 Binary files /dev/null and b/vocab/__pycache__/key.cpython-313.pyc differ diff --git a/vocab/__pycache__/sections.cpython-310.pyc b/vocab/__pycache__/sections.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77a60317f2a994659f1655c61ffe827e670a8263 Binary files /dev/null and b/vocab/__pycache__/sections.cpython-310.pyc differ diff --git a/vocab/__pycache__/sections.cpython-312.pyc b/vocab/__pycache__/sections.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0a0ea593892753d70c4d8df6692a6aa907e08cc Binary files /dev/null and b/vocab/__pycache__/sections.cpython-312.pyc differ diff --git a/vocab/__pycache__/sections.cpython-313.pyc b/vocab/__pycache__/sections.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f025271d59c76d1cfe39982dcc8de1430d961ca7 Binary files /dev/null and b/vocab/__pycache__/sections.cpython-313.pyc differ diff --git a/wandb/run-20260316_190118-i5gs23ey/files/config.yaml b/wandb/run-20260316_190118-i5gs23ey/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00fccafb272378253786d9d88809fdc77b749965 --- /dev/null +++ b/wandb/run-20260316_190118-i5gs23ey/files/config.yaml @@ -0,0 +1,415 @@ +_name_or_path: + value: checkpoints/Qwen3-0.6B +_wandb: + value: + cli_version: 0.22.2 + e: + k8wb0dbxt94an354uzhwza5srfbxtumt: + codePath: train.py + codePathLocal: train.py + cpu_count: 56 + cpu_count_logical: 13 + cudaVersion: "12.8" + disk: + /: + total: "633794920448" + used: "92198043648" + email: 897344367@qq.com + executable: /root/miniconda3/bin/python + gpu: NVIDIA A100-SXM4-80GB + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-d673e426-e388-2e5b-0a03-2566656f028f + host: di-20251115043529-w9pkz + memory: + total: "243873611776" + os: Linux-5.4.250-2-velinux1u1-amd64-x86_64-with-glibc2.35 + program: /algo-intern/user/leonchen/cond_gen/train.py + python: CPython 3.12.11 + root: /algo-intern/user/leonchen/cond_gen + startedAt: "2026-03-16T19:01:18.241085Z" + writerId: k8wb0dbxt94an354uzhwza5srfbxtumt + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.12.11 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "3": + - 7 + - 19 + - 66 + "4": 3.12.11 + "5": 0.22.2 + "6": 5.3.0 + "9": + "1": transformers_trainer + "12": 0.22.2 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: 151643 +chunk_size_feed_forward: + value: 0 +data_seed: + value: null +dataloader_drop_last: + value: true +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: false +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: + use_reentrant: false +greater_is_better: + value: null +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 1024 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: "no" +initializer_range: + value: 0.02 +intermediate_size: + value: 3072 +is_encoder_decoder: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 0.0001 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 10 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: linear +magel_chord_dropout_trigger_prob: + value: 0.6 +magel_num_audio_token: + value: 16384 +magel_structure_dropout_trigger_prob: + value: 0.6 +max_grad_norm: + value: 5 +max_position_embeddings: + value: 40960 +max_steps: + value: -1 +max_window_layers: + value: 28 +metric_for_best_model: + value: null +model/num_parameters: + value: 816200192 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 16 +num_hidden_layers: + value: 28 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 10 +optim: + value: adamw_torch_fused +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: ./output_qwen3_0p6b_train +output_hidden_states: + value: false +pad_token_id: + value: null +parallelism_config: + value: null +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: false +save_steps: + value: 500 +save_strategy: + value: epoch +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +tf32: + value: null +tie_word_embeddings: + value: true +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +train_sampling_strategy: + value: random +transformers_version: + value: 5.3.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 168056 +warmup_ratio: + value: null +warmup_steps: + value: 500 +weight_decay: + value: 0.01 diff --git a/wandb/run-20260316_190118-i5gs23ey/files/output.log b/wandb/run-20260316_190118-i5gs23ey/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20260316_190118-i5gs23ey/files/requirements.txt b/wandb/run-20260316_190118-i5gs23ey/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1503ad6526927a18addb5fc58657a17a99c622de --- /dev/null +++ b/wandb/run-20260316_190118-i5gs23ey/files/requirements.txt @@ -0,0 +1,323 @@ +Brotli==1.0.9 +MarkupSafe==3.0.2 +PySocks==1.7.1 +absl-py==2.3.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.13 +aiosignal==1.3.2 +archspec==0.2.3 +asttokens==3.0.0 +attrs==25.3.0 +boltons==23.0.0 +boto3==1.38.39 +botocore==1.38.39 +cffi==1.16.0 +charset-normalizer==2.0.4 +comm==0.2.1 +conda==24.11.3 +conda-content-trust==0.2.0 +conda-libmamba-solver==23.12.0 +conda-package-handling==2.2.0 +conda_package_streaming==0.9.0 +cryptography==41.0.7 +debugpy==1.8.11 +decorator==5.1.1 +distro==1.8.0 +executing==0.8.3 +frozendict==2.4.2 +frozenlist==1.7.0 +fsspec==2024.6.1 +grpcio==1.73.0 +idna==3.4 +ipykernel==6.29.5 +ipython==9.1.0 +ipython_pygments_lexers==1.1.1 +jedi==0.19.2 +Jinja2==3.1.4 +jmespath==1.0.1 +jsonpatch==1.32 +jsonpointer==2.1 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +libmambapy==1.5.3 +Markdown==3.8.1 +matplotlib-inline==0.1.6 +menuinst==2.0.2 +mpmath==1.3.0 +multidict==6.5.0 +nest-asyncio==1.6.0 +networkx==3.3 +ninja==1.11.1.4 +numpy==2.3.0 +packaging==24.2 +parso==0.8.4 +pexpect==4.9.0 +pillow==11.0.0 +pip==25.1 +platformdirs==4.3.7 +pluggy==1.0.0 +prompt-toolkit==3.0.43 +propcache==0.3.2 +psutil==5.9.0 +psutil==7.0.0 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pybind11==2.13.6 +pycosat==0.6.6 +pycparser==2.21 +Pygments==2.19.1 +pyprof==1.0.0 +python-dateutil==2.9.0.post0 +pyzmq==26.2.0 +regex==2024.11.6 +s3transfer==0.13.0 +setuptools==78.1.1 +six==1.17.0 +stack-data==0.2.0 +tensorboard-data-server==0.7.2 +tornado==6.5.1 +traitlets==5.14.3 +truststore==0.8.0 +urllib3==2.1.0 +Werkzeug==3.1.3 +wheel==0.45.1 +yarl==1.20.1 +zstandard==0.19.0 +dnspython==2.8.0 +ruamel.yaml.clib==0.2.14 +soupsieve==2.8 +huggingface_hub==1.4.1 +gast==0.6.0 +opentelemetry-proto==1.38.0 +torchcodec==0.8.1 +ml_collections==1.1.0 +GitPython==3.1.45 +opentelemetry-api==1.38.0 +smmap==5.0.2 +pydantic==2.12.3 +shellingham==1.5.4 +crcmod==1.7 +hydra-core==1.3.2 +anyio==4.11.0 +multiprocess==0.70.18 +pyarrow==22.0.0 +dill==0.4.0 +onnx2torch-py313==1.6.0 +nvidia-nvjitlink-cu12==12.8.93 +sniffio==1.3.1 +msgpack==1.1.2 +google-pasta==0.2.0 +beartype==0.18.5 +tf_keras==2.20.1 +httpcore==1.0.9 +datasets==4.4.2 +py-cpuinfo==9.0.0 +matplotlib==3.10.7 +requests==2.32.5 +argbind==0.3.9 +sentencepiece==0.2.1 +aria2==0.0.1b0 +tqdm==4.67.1 +soundfile==0.13.1 +opentelemetry-exporter-otlp-proto-http==1.38.0 +nvidia-nccl-cu12==2.27.5 +opentelemetry-exporter-otlp==1.38.0 +zipp==3.23.0 +arguments==76 +gitdb==4.0.12 +pytorch-lightning==2.5.5 +mypy_extensions==1.1.0 +fire==0.7.1 +speechbrain==1.0.3 +torch-audiomentations==0.12.0 +julius==0.2.7 +pymongo==4.15.5 +onnx-weekly==1.21.0.dev20251110 +rich==14.2.0 +pyannote.pipeline==3.0.1 +pytz==2025.2 +nvidia-cusparselt-cu12==0.7.1 +opt_einsum==3.4.0 +coloredlogs==15.0.1 +nvidia-curand-cu12==10.3.9.90 +optree==0.17.0 +einops==0.8.1 +torch-stoi==0.2.3 +importlib_resources==6.5.2 +ujson==5.11.0 +librosa==0.11.0 +primePy==1.3 +asteroid-filterbanks==0.4.0 +httpx==0.28.1 +conda-pack==0.9.1 +black==25.12.0 +nvidia-cublas-cu12==12.8.4.1 +audio-separator==0.39.1 +opentelemetry-sdk==1.38.0 +nvidia-cudnn-cu12==9.10.2.21 +sentry-sdk==2.43.0 +wcwidth==0.4.0 +typer==0.20.0 +lightning==2.6.0 +scikit-learn==1.7.2 +PyYAML==6.0.3 +pretty_midi==0.2.11 +opentelemetry-exporter-otlp-proto-grpc==1.38.0 +diffq==0.2.4 +randomname==0.2.1 +threadpoolctl==3.6.0 +docopt==0.6.2 +beautifulsoup4==4.14.2 +omegaconf==2.3.0 +Deprecated==1.3.1 +scipy==1.16.2 +markdown2==2.5.4 +kiwisolver==1.4.9 +hf-xet==1.2.0 +tos==2.9.0 +googleapis-common-protos==1.71.0 +antlr4-python3-runtime==4.9.3 +pyparsing==3.2.5 +ruamel.yaml==0.18.16 +pyannote.database==5.1.3 +llvmlite==0.45.1 +torchmetrics==1.8.2 +mir_eval==0.8.2 +torchaudio==2.9.0 +triton==3.5.0 +sympy==1.14.0 +docstring_parser==0.17.0 +bitsandbytes==0.48.1 +terminaltables==3.1.10 +einx==0.3.0 +pyloudnorm==0.1.1 +alias-free-torch==0.0.6 +pydantic_core==2.41.4 +fonttools==4.60.1 +tensorboardX==2.6.4 +tensorboard==2.20.0 +typing_extensions==4.15.0 +hjson==3.1.0 +gpustat==1.1.1 +nvidia-cuda-cupti-cu12==12.8.90 +certifi==2025.11.12 +diffusers==0.37.0 +nvidia-cusparse-cu12==12.5.8.93 +git-lfs==1.6 +rotary-embedding-torch==0.6.5 +future==1.0.0 +x-transformers==2.12.2 +tabulate==0.9.0 +consoleprinter==95 +annotated-types==0.7.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +praatio==6.2.2 +pydub==0.25.1 +keras==3.12.0 +wrapt==1.16.0 +astunparse==1.6.3 +vector-quantize-pytorch==1.25.1 +numba==0.62.1 +click==8.3.0 +optuna==4.5.0 +flatten-dict==0.4.2 +pyannote.core==5.0.0 +pathspec==0.12.1 +tzdata==2025.2 +loguru==0.7.3 +entmax==1.3 +torch==2.9.0 +onnxruntime-gpu==1.23.2 +greenlet==3.2.4 +opentelemetry-semantic-conventions==0.59b0 +samplerate==0.1.0 +lightning-utilities==0.15.2 +opentelemetry-exporter-otlp-proto-common==1.38.0 +sortedcontainers==2.4.0 +safetensors==0.6.2 +semver==3.0.4 +pyannote.audio==3.4.0 +typer-slim==0.20.0 +importlib_metadata==8.7.0 +Cython==3.2.1 +jq==1.10.0 +encodec==0.1.1 +flash_attn==2.8.3 +typing-inspection==0.4.2 +jsonnet==0.21.0 +colorlog==6.10.1 +humanfriendly==10.0 +onnxruntime==1.23.2 +mido==1.3.3 +soxr==1.0.0 +accelerate==1.11.0 +xxhash==3.6.0 +nvidia-cuda-runtime-cu12==12.8.90 +HyperPyYAML==1.2.2 +audioread==3.0.1 +flatbuffers==25.9.23 +nvidia-cufft-cu12==11.3.3.83 +ffmpy==1.0.0 +protobuf==3.19.6 +nvidia-cufile-cu12==1.13.1.3 +jsonargparse==4.42.0 +resampy==0.4.3 +descript-audio-codec==1.0.0 +mdurl==0.1.2 +deepspeed==0.18.5 +libclang==18.1.1 +onnx-ir==0.1.12 +pystoi==0.4.1 +filelock==3.20.0 +pytokens==0.3.0 +namex==0.1.0 +h5py==3.15.1 +typeshed_client==2.8.2 +mpi4py==4.1.1 +torchdiffeq==0.2.5 +nvidia-ml-py==13.590.48 +pooch==1.8.2 +pyannoteai-sdk==0.3.0 +nvidia-cusolver-cu12==11.7.3.90 +SQLAlchemy==2.0.44 +markdown-it-py==4.0.0 +torch_pitch_shift==1.2.5 +contourpy==1.3.3 +blessed==1.27.0 +redis-cli==1.0.1 +redis==7.0.1 +descript-audiotools==0.7.2 +pyannote.metrics==3.2.1 +ml_dtypes==0.5.3 +nvidia-nvshmem-cu12==3.3.20 +Mako==1.3.10 +pytorch-metric-learning==2.9.0 +alembic==1.17.2 +termcolor==3.2.0 +tokenizers==0.22.2 +h11==0.16.0 +joblib==1.5.2 +onnx==1.20.0 +cycler==0.12.1 +lazy_loader==0.4 +wandb==0.22.2 +transformers==5.3.0 +pandas==2.3.3 +nvidia-nvtx-cu12==12.8.90 +onnxscript==0.5.7 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.2 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.45.1 +zipp==3.19.2 diff --git a/wandb/run-20260316_190118-i5gs23ey/files/wandb-metadata.json b/wandb/run-20260316_190118-i5gs23ey/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6228ed86025bf6d8841902a29a05864c82e5de21 --- /dev/null +++ b/wandb/run-20260316_190118-i5gs23ey/files/wandb-metadata.json @@ -0,0 +1,36 @@ +{ + "os": "Linux-5.4.250-2-velinux1u1-amd64-x86_64-with-glibc2.35", + "python": "CPython 3.12.11", + "startedAt": "2026-03-16T19:01:18.241085Z", + "program": "/algo-intern/user/leonchen/cond_gen/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "email": "897344367@qq.com", + "root": "/algo-intern/user/leonchen/cond_gen", + "host": "di-20251115043529-w9pkz", + "executable": "/root/miniconda3/bin/python", + "cpu_count": 56, + "cpu_count_logical": 13, + "gpu": "NVIDIA A100-SXM4-80GB", + "gpu_count": 1, + "disk": { + "/": { + "total": "633794920448", + "used": "92198043648" + } + }, + "memory": { + "total": "243873611776" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-d673e426-e388-2e5b-0a03-2566656f028f" + } + ], + "cudaVersion": "12.8", + "writerId": "k8wb0dbxt94an354uzhwza5srfbxtumt" +} \ No newline at end of file diff --git a/wandb/run-20260316_190118-i5gs23ey/files/wandb-summary.json b/wandb/run-20260316_190118-i5gs23ey/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6e0a7db53cfedb915cdaf3936a23f53b6010c03f --- /dev/null +++ b/wandb/run-20260316_190118-i5gs23ey/files/wandb-summary.json @@ -0,0 +1 @@ +{"train/grad_norm":29.75,"train/learning_rate":5.8e-06,"_step":2,"_runtime":36,"_timestamp":1.7736877095672467e+09,"train/epoch":0.000258435774402798,"_wandb":{"runtime":36},"train/global_step":30,"train/loss":13.80834503173828} \ No newline at end of file diff --git a/wandb/run-20260316_190118-i5gs23ey/logs/debug-internal.log b/wandb/run-20260316_190118-i5gs23ey/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9b6b2b901c831eadfe3284773e3e39f36d3b733a --- /dev/null +++ b/wandb/run-20260316_190118-i5gs23ey/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-03-16T19:01:18.682879399Z","level":"INFO","msg":"stream: starting","core version":"0.22.2"} +{"time":"2026-03-16T19:01:19.476064119Z","level":"INFO","msg":"stream: created new stream","id":"i5gs23ey"} +{"time":"2026-03-16T19:01:19.476195364Z","level":"INFO","msg":"handler: started","stream_id":"i5gs23ey"} +{"time":"2026-03-16T19:01:19.484707102Z","level":"ERROR","msg":"stream: error opening transaction log for reading: transactionlog: bad header: leveldb/record: invalid W&B identifier: 00000000 (\"\\x00\\x00\\x00\\x00\")","id":"i5gs23ey"} +{"time":"2026-03-16T19:01:19.499504163Z","level":"INFO","msg":"stream: started","id":"i5gs23ey"} +{"time":"2026-03-16T19:01:19.499535475Z","level":"INFO","msg":"sender: started","stream_id":"i5gs23ey"} +{"time":"2026-03-16T19:01:21.377383684Z","level":"ERROR","msg":"runconsolelogs: failed to write to file: failed to append: write /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_190118-i5gs23ey/files/output.log: invalid argument"} +{"time":"2026-03-16T19:01:57.032682222Z","level":"INFO","msg":"stream: closing","id":"i5gs23ey"} +{"time":"2026-03-16T19:01:58.010160625Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-03-16T19:01:59.852388592Z","level":"INFO","msg":"handler: closed","stream_id":"i5gs23ey"} +{"time":"2026-03-16T19:01:59.852477512Z","level":"INFO","msg":"sender: closed","stream_id":"i5gs23ey"} +{"time":"2026-03-16T19:01:59.852537316Z","level":"INFO","msg":"stream: closed","id":"i5gs23ey"} diff --git a/wandb/run-20260316_190118-i5gs23ey/logs/debug.log b/wandb/run-20260316_190118-i5gs23ey/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..9605f5ce2306b19927dfd85789a9d9dc04a28e2c --- /dev/null +++ b/wandb/run-20260316_190118-i5gs23ey/logs/debug.log @@ -0,0 +1,26 @@ +2026-03-16 19:01:18,401 INFO MainThread:3676937 [wandb_setup.py:_flush():81] Current SDK version is 0.22.2 +2026-03-16 19:01:18,401 INFO MainThread:3676937 [wandb_setup.py:_flush():81] Configure stats pid to 3676937 +2026-03-16 19:01:18,401 INFO MainThread:3676937 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-03-16 19:01:18,401 INFO MainThread:3676937 [wandb_setup.py:_flush():81] Loading settings from /algo-intern/user/leonchen/cond_gen/wandb/settings +2026-03-16 19:01:18,401 INFO MainThread:3676937 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-16 19:01:18,402 INFO MainThread:3676937 [wandb_init.py:setup_run_log_directory():705] Logging user logs to /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_190118-i5gs23ey/logs/debug.log +2026-03-16 19:01:18,403 INFO MainThread:3676937 [wandb_init.py:setup_run_log_directory():706] Logging internal logs to /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_190118-i5gs23ey/logs/debug-internal.log +2026-03-16 19:01:18,403 INFO MainThread:3676937 [wandb_init.py:init():832] calling init triggers +2026-03-16 19:01:18,403 INFO MainThread:3676937 [wandb_init.py:init():837] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-16 19:01:18,403 INFO MainThread:3676937 [wandb_init.py:init():880] starting backend +2026-03-16 19:01:18,621 INFO MainThread:3676937 [wandb_init.py:init():883] sending inform_init request +2026-03-16 19:01:18,674 INFO MainThread:3676937 [wandb_init.py:init():891] backend started and connected +2026-03-16 19:01:18,677 INFO MainThread:3676937 [wandb_init.py:init():961] updated telemetry +2026-03-16 19:01:18,698 INFO MainThread:3676937 [wandb_init.py:init():985] communicating run to backend with 90.0 second timeout +2026-03-16 19:01:20,613 INFO MainThread:3676937 [wandb_init.py:init():1036] starting run threads in backend +2026-03-16 19:01:20,856 INFO MainThread:3676937 [wandb_run.py:_console_start():2509] atexit reg +2026-03-16 19:01:20,856 INFO MainThread:3676937 [wandb_run.py:_redirect():2357] redirect: wrap_raw +2026-03-16 19:01:20,857 INFO MainThread:3676937 [wandb_run.py:_redirect():2426] Wrapping output streams. +2026-03-16 19:01:20,857 INFO MainThread:3676937 [wandb_run.py:_redirect():2449] Redirects installed. +2026-03-16 19:01:20,863 INFO MainThread:3676937 [wandb_init.py:init():1076] run started, returning control to user process +2026-03-16 19:01:20,864 INFO MainThread:3676937 [wandb_run.py:_config_callback():1392] config_cb None None {'vocab_size': 168056, 'max_position_embeddings': 40960, 'hidden_size': 1024, 'intermediate_size': 3072, 'num_hidden_layers': 28, 'num_attention_heads': 16, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': None, 'bos_token_id': 151643, 'eos_token_id': 151645, 'tie_word_embeddings': True, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': 'checkpoints/Qwen3-0.6B', 'transformers_version': '5.3.0', 'model_type': 'qwen3', 'magel_chord_dropout_trigger_prob': 0.6, 'magel_structure_dropout_trigger_prob': 0.6, 'magel_num_audio_token': 16384, 'output_attentions': False, 'output_dir': './output_qwen3_0p6b_train', 'per_device_train_batch_size': 1, 'num_train_epochs': 10, 'max_steps': -1, 'learning_rate': 0.0001, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': None, 'warmup_steps': 500, 'optim': 'adamw_torch_fused', 'optim_args': None, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'optim_target_modules': None, 'gradient_accumulation_steps': 1, 'average_tokens_across_devices': True, 'max_grad_norm': 5.0, 'label_smoothing_factor': 0.0, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'use_liger_kernel': False, 'liger_kernel_config': None, 'neftune_noise_alpha': None, 'torch_empty_cache_steps': None, 'auto_find_batch_size': False, 'logging_strategy': 'steps', 'logging_steps': 10, 'logging_first_step': False, 'log_on_each_node': True, 'logging_nan_inf_filter': True, 'include_num_input_tokens_seen': 'no', 'log_level': 'passive', 'log_level_replica': 'warning', 'disable_tqdm': False, 'report_to': ['wandb'], 'run_name': None, 'project': 'huggingface', 'trackio_space_id': 'trackio', 'eval_strategy': 'no', 'eval_steps': None, 'eval_delay': 0, 'per_device_eval_batch_size': 8, 'prediction_loss_only': False, 'eval_on_start': False, 'eval_do_concat_batches': True, 'eval_use_gather_object': False, 'eval_accumulation_steps': None, 'include_for_metrics': [], 'batch_eval_metrics': False, 'save_only_model': False, 'save_strategy': 'epoch', 'save_steps': 500, 'save_on_each_node': False, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'push_to_hub': False, 'hub_token': '', 'hub_private_repo': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_always_push': False, 'hub_revision': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'restore_callback_states_from_checkpoint': False, 'full_determinism': False, 'seed': 42, 'data_seed': None, 'use_cpu': False, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'dataloader_drop_last': True, 'dataloader_num_workers': 0, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'dataloader_prefetch_factor': None, 'remove_unused_columns': False, 'label_names': None, 'train_sampling_strategy': 'random', 'length_column_name': 'length', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'ddp_backend': None, 'ddp_timeout': 1800, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'deepspeed': None, 'debug': [], 'skip_memory_metrics': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'resume_from_checkpoint': None, 'warmup_ratio': None, 'logging_dir': None, 'local_rank': -1} +2026-03-16 19:01:20,871 INFO MainThread:3676937 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 816200192 - > +2026-03-16 19:01:20,872 INFO MainThread:3676937 [wandb_run.py:_config_callback():1392] config_cb model/num_parameters 816200192 None +2026-03-16 19:01:57,032 INFO wandb-AsyncioManager-main:3676937 [service_client.py:_forward_responses():80] Reached EOF. +2026-03-16 19:01:57,032 INFO wandb-AsyncioManager-main:3676937 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/wandb/run-20260316_190118-i5gs23ey/run-i5gs23ey.wandb b/wandb/run-20260316_190118-i5gs23ey/run-i5gs23ey.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32 Binary files /dev/null and b/wandb/run-20260316_190118-i5gs23ey/run-i5gs23ey.wandb differ diff --git a/wandb/run-20260316_190844-eb7rexwd/files/output.log b/wandb/run-20260316_190844-eb7rexwd/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20260316_190844-eb7rexwd/files/requirements.txt b/wandb/run-20260316_190844-eb7rexwd/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..df6271ecfee13881ea45cb1a93031b183efef581 --- /dev/null +++ b/wandb/run-20260316_190844-eb7rexwd/files/requirements.txt @@ -0,0 +1,330 @@ +pycparser==2.21 +conda-content-trust==0.1.3 +PySocks==1.7.1 +jsonpointer==2.1 +cryptography==39.0.1 +transformers==5.3.0 +tokenizers==0.22.2 +mpi4py==4.1.1 +deepspeed==0.18.7 +huggingface_hub==1.6.0 +hf-xet==1.3.2 +dnspython==2.8.0 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cuda-cupti-cu12==12.8.90 +crcmod==1.7 +contourpy==1.3.2 +docopt==0.6.2 +pooch==1.8.2 +optuna==4.5.0 +Deprecated==1.2.18 +samplerate==0.1.0 +pyannote.audio==3.4.0 +nltk==3.9.1 +librosa==0.11.0 +lameenc==1.8.1 +opentelemetry-proto==1.38.0 +threadpoolctl==3.6.0 +aliyun-python-sdk-core==2.16.0 +silero-vad==6.0.0 +rotary-embedding-torch==0.6.5 +treetable==0.2.6 +tos==2.8.5 +nvidia-cusolver-cu11==11.4.0.1 +nvidia-cusolver-cu12==11.7.3.90 +umap-learn==0.5.9.post2 +jmespath==0.10.0 +accelerate==1.10.1 +modelscope==1.29.2 +fonttools==4.59.2 +dora_search==0.1.12 +ffmpeg-python==0.2.0 +jamo==0.4.1 +wrapt==1.17.3 +triton==3.4.0 +build==1.3.0 +sentencepiece==0.2.1 +pyannote.database==5.1.3 +lightning-utilities==0.15.2 +nvidia-cudnn-cu11==8.5.0.96 +pynndescent==0.5.13 +numba==0.61.2 +GPUtil==1.4.0 +tomli==2.2.1 +nvidia-cublas-cu11==11.10.3.66 +soundfile==0.13.1 +nvidia-cuda-nvrtc-cu12==12.8.93 +asteroid-filterbanks==0.4.0 +pyannote.metrics==3.2.1 +cmake==4.1.0 +audioread==3.0.1 +pyparsing==3.2.3 +evaluate==0.4.6 +onnxruntime-gpu==1.22.0 +nvidia-nvtx-cu11==11.7.91 +colorlog==6.9.0 +diffq==0.2.4 +greenlet==3.2.4 +markdown-it-py==4.0.0 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-curand-cu11==10.2.10.91 +demucs==4.0.1 +pyannoteai-sdk==0.3.0 +mido==1.3.3 +typer==0.17.4 +oss2==2.19.1 +opentelemetry-api==1.38.0 +jaconv==0.4.0 +nvidia-nccl-cu12==2.27.3 +torchmetrics==1.8.2 +tabulate==0.9.0 +av==15.1.0 +cloudpickle==3.1.1 +timm==0.4.5 +nvidia-cuda-nvrtc-cu11==11.7.99 +julius==0.2.7 +ml_dtypes==0.5.3 +lightning==2.5.5 +torch==2.8.0 +sympy==1.14.0 +pyproject_hooks==1.2.0 +ml_collections==1.1.0 +torchvision==0.23.0 +alembic==1.16.5 +certifi==2025.8.3 +rich==14.1.0 +future==1.0.0 +tiktoken==0.11.0 +whisper==1.1.10 +torchlibrosa==0.1.0 +addict==2.4.0 +numpy==2.2.6 +matplotlib==3.10.6 +omegaconf==2.3.0 +sortedcontainers==2.4.0 +openunmix==1.3.0 +onnx-weekly==1.20.0.dev20250901 +kaldiio==2.18.1 +semver==3.0.4 +faster-whisper==1.2.0 +speechbrain==1.0.3 +nvidia-cuda-cupti-cu11==11.7.101 +loguru==0.7.3 +stempeg==0.2.4 +fsspec==2024.2.0 +nvidia-cublas-cu12==12.8.4.1 +kiwisolver==1.4.9 +whisperx==3.4.2 +torch-complex==0.4.4 +jieba==0.42.1 +lazy_loader==0.4 +opentelemetry-semantic-conventions==0.59b0 +torchaudio==2.8.0 +torchcodec==0.7.0 +pytorch-lightning==2.5.5 +pydub==0.25.1 +datasets==4.0.0 +nvidia-curand-cu12==10.3.9.90 +torch_pitch_shift==1.2.5 +cycler==0.12.1 +scikit-learn==1.7.1 +redis==6.4.0 +nvidia-nvtx-cu12==12.8.90 +opentelemetry-exporter-otlp-proto-grpc==1.38.0 +allin1==1.1.0 +panns-inference==0.1.1 +resampy==0.4.3 +nvidia-cuda-runtime-cu11==11.7.99 +madmom==0.16.1 +opentelemetry-exporter-otlp-proto-http==1.38.0 +pyannote.core==5.0.0 +antlr4-python3-runtime==4.9.3 +nvidia-nvjitlink-cu12==12.8.93 +beartype==0.18.5 +pymongo==4.15.0 +nvidia-cufft-cu11==10.9.0.58 +nvidia-cuda-runtime-cu12==12.8.90 +openai-whisper==20250625 +filelock==3.20.0 +retrying==1.4.2 +pyannote.pipeline==3.0.1 +submitit==1.5.3 +primePy==1.3 +gpustat==1.1.1 +torch-audiomentations==0.12.0 +scipy==1.15.3 +nvidia-nccl-cu11==2.14.3 +googleapis-common-protos==1.70.0 +nvidia-cusparselt-cu12==0.7.1 +opentelemetry-exporter-otlp-proto-common==1.38.0 +soxr==0.5.0.post1 +audio-separator==0.36.1 +funasr==1.2.7 +pytorch-wpe==0.0.1 +lit==18.1.8 +Mako==1.3.10 +nvidia-cufft-cu12==11.3.3.83 +opentelemetry-sdk==1.38.0 +ctranslate2==4.4.0 +nvidia-cusparse-cu11==11.7.4.91 +hydra-core==1.3.2 +mdurl==0.1.2 +pytorch-metric-learning==2.9.0 +Cython==3.1.3 +SQLAlchemy==2.0.43 +blessed==1.21.0 +onnx2torch-py313==1.6.0 +joblib==1.5.2 +pyarrow-hotfix==0.7 +conda==25.7.0 +llvmlite==0.44.0 +pycryptodome==3.23.0 +tensorboardX==2.6.4 +opentelemetry-exporter-otlp==1.38.0 +aliyun-python-sdk-kms==2.16.5 +wget==3.2 +shellingham==1.5.4 +nvidia-cufile-cu12==1.13.1.3 +editdistance==0.8.1 +annotated-types==0.7.0 +py-cpuinfo==9.0.0 +HyperPyYAML==1.2.2 +httpx==0.28.1 +smmap==5.0.2 +nvidia-ml-py==13.580.65 +flash_attn==2.8.2 +pydantic_core==2.33.2 +hjson==3.1.0 +multiprocess==0.70.16 +pyarrow==21.0.0 +gitdb==4.0.12 +dill==0.3.8 +xxhash==3.5.0 +safetensors==0.6.2 +h11==0.16.0 +openai==1.102.0 +GitPython==3.1.45 +msgpack==1.1.1 +anyio==4.10.0 +pydantic==2.11.7 +tzdata==2025.2 +PyYAML==6.0.2 +sentry-sdk==2.35.1 +click==8.2.1 +pytz==2025.2 +einops==0.8.1 +pandas==2.3.2 +typing-inspection==0.4.1 +wandb==0.21.1 +easydict==1.13 +sniffio==1.3.1 +jiter==0.10.0 +httpcore==1.0.9 +networkx==3.3 +Jinja2==3.1.4 +mpmath==1.3.0 +pillow==11.0.0 +distro==1.9.0 +executing==0.8.3 +wcwidth==0.2.5 +aiohappyeyeballs==2.6.1 +comm==0.2.1 +pyzmq==26.2.0 +frozendict==2.4.2 +prompt-toolkit==3.0.43 +urllib3==2.3.0 +pycosat==0.6.6 +typing_extensions==4.12.2 +typing_extensions==4.14.0 +platformdirs==4.3.7 +boto3==1.38.39 +Werkzeug==3.1.3 +regex==2024.11.6 +nest-asyncio==1.6.0 +zstandard==0.23.0 +pybind11==2.13.6 +Brotli==1.0.9 +jedi==0.19.2 +botocore==1.38.39 +pip==25.1 +menuinst==2.2.0 +psutil==7.0.0 +psutil==5.9.0 +ruamel.yaml==0.18.10 +setuptools==78.1.1 +attrs==25.3.0 +boltons==24.1.0 +protobuf==6.31.1 +tornado==6.5.1 +Markdown==3.8.1 +pexpect==4.9.0 +libmambapy==2.0.5 +tensorboard==2.19.0 +charset-normalizer==3.3.2 +Pygments==2.19.1 +cffi==1.17.1 +conda-package-handling==2.4.0 +yarl==1.20.1 +ptyprocess==0.7.0 +asttokens==3.0.0 +idna==3.7 +archspec==0.2.3 +six==1.17.0 +aiohttp==3.12.13 +s3transfer==0.13.0 +grpcio==1.73.0 +decorator==5.1.1 +stack-data==0.2.0 +truststore==0.10.0 +tqdm==4.67.1 +jupyter_core==5.7.2 +ninja==1.11.1.4 +debugpy==1.8.11 +pluggy==1.5.0 +aiosignal==1.3.2 +tensorboard-data-server==0.7.2 +frozenlist==1.7.0 +python-dateutil==2.9.0.post0 +jsonpatch==1.33 +traitlets==5.14.3 +requests==2.32.3 +packaging==24.2 +async-timeout==5.0.1 +wheel==0.45.1 +pure-eval==0.2.2 +jupyter_client==8.6.3 +conda-libmamba-solver==25.4.0 +ipython==8.30.0 +ruamel.yaml.clib==0.2.12 +multidict==6.5.0 +conda_package_streaming==0.11.0 +pyprof==1.0.0 +matplotlib-inline==0.1.6 +absl-py==2.3.0 +MarkupSafe==3.0.2 +ipykernel==6.29.5 +propcache==0.3.2 +exceptiongroup==1.2.0 +parso==0.8.4 +fsmnvad==0.0.1 +kaldi-native-fbank==1.22.1 +setuptools==65.0.0 +flatbuffers==25.2.10 +coloredlogs==15.0.1 +humanfriendly==10.0 +importlib_metadata==8.0.0 +jaraco.text==3.12.1 +inflect==7.3.1 +typing_extensions==4.12.2 +typeguard==4.3.0 +tomli==2.0.1 +zipp==3.19.2 +jaraco.collections==5.1.0 +autocommand==2.2.2 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +platformdirs==4.2.2 +backports.tarfile==1.2.0 +packaging==24.2 +wheel==0.45.1 +jaraco.context==5.3.0 diff --git a/wandb/run-20260316_190844-eb7rexwd/files/wandb-metadata.json b/wandb/run-20260316_190844-eb7rexwd/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6488325aabd8ae9d04ecec03d55be3deb0f16e39 --- /dev/null +++ b/wandb/run-20260316_190844-eb7rexwd/files/wandb-metadata.json @@ -0,0 +1,90 @@ +{ + "os": "Linux-5.4.250-2-velinux1u1-amd64-x86_64-with-glibc2.35", + "python": "CPython 3.10.18", + "startedAt": "2026-03-16T19:08:44.876042Z", + "args": [ + "--gradient_checkpointing", + "--deepspeed", + "/algo-intern/user/leonchen/7B_model/ds_zero.json" + ], + "program": "/algo-intern/user/leonchen/cond_gen/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "email": "897344367@qq.com", + "root": "/algo-intern/user/leonchen/cond_gen", + "host": "t-20260317030440-9srmd-worker-0", + "executable": "/root/miniconda3/bin/python3.10", + "cpu_count": 56, + "cpu_count_logical": 112, + "gpu": "NVIDIA A100-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "3779302981632", + "used": "2782688886784" + } + }, + "memory": { + "total": "2021763125248" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-75884388-6972-86b6-ba8d-0f9b3b376afb" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-e9bd3860-f0d8-0308-3ac9-c935162efcf2" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1444bee1-7b17-dc25-c38f-e522d724c458" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-87209582-927f-facd-7bbd-260447ae38bc" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4c45d218-8f3c-0c2a-b1af-ddb4e3458ac8" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-7e9eedf7-f146-5c7d-65c0-f3b89c9d7e65" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-41fa2f62-c201-b9e3-d120-638d444345b3" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-0d1e641f-2055-b68c-968f-3045939102d7" + } + ], + "cudaVersion": "12.8", + "writerId": "vny7hqacf9y647rmq8b70vgiu83h7s35" +} \ No newline at end of file diff --git a/wandb/run-20260316_190844-eb7rexwd/logs/debug-internal.log b/wandb/run-20260316_190844-eb7rexwd/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..cfebb6c50efb09b69df5a8465188b37f891c8bca --- /dev/null +++ b/wandb/run-20260316_190844-eb7rexwd/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2026-03-16T19:08:45.288965049Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2026-03-16T19:08:46.095133995Z","level":"INFO","msg":"stream: created new stream","id":"eb7rexwd"} +{"time":"2026-03-16T19:08:46.095243588Z","level":"INFO","msg":"stream: started","id":"eb7rexwd"} +{"time":"2026-03-16T19:08:46.095302842Z","level":"INFO","msg":"writer: started","stream_id":"eb7rexwd"} +{"time":"2026-03-16T19:08:46.095275959Z","level":"INFO","msg":"sender: started","stream_id":"eb7rexwd"} +{"time":"2026-03-16T19:08:46.09526141Z","level":"INFO","msg":"handler: started","stream_id":"eb7rexwd"} +{"time":"2026-03-16T19:08:47.555778189Z","level":"ERROR","msg":"runconsolelogs: failed to write to file: write /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_190844-eb7rexwd/files/output.log: invalid argument"} diff --git a/wandb/run-20260316_190844-eb7rexwd/logs/debug.log b/wandb/run-20260316_190844-eb7rexwd/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..08a261a673afd32f5bf48423c0270368b03dccb9 --- /dev/null +++ b/wandb/run-20260316_190844-eb7rexwd/logs/debug.log @@ -0,0 +1,24 @@ +2026-03-16 19:08:45,038 INFO MainThread:3685 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2026-03-16 19:08:45,038 INFO MainThread:3685 [wandb_setup.py:_flush():80] Configure stats pid to 3685 +2026-03-16 19:08:45,038 INFO MainThread:3685 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-03-16 19:08:45,038 INFO MainThread:3685 [wandb_setup.py:_flush():80] Loading settings from /algo-intern/user/leonchen/cond_gen/wandb/settings +2026-03-16 19:08:45,038 INFO MainThread:3685 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-03-16 19:08:45,040 INFO MainThread:3685 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_190844-eb7rexwd/logs/debug.log +2026-03-16 19:08:45,041 INFO MainThread:3685 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_190844-eb7rexwd/logs/debug-internal.log +2026-03-16 19:08:45,041 INFO MainThread:3685 [wandb_init.py:init():830] calling init triggers +2026-03-16 19:08:45,041 INFO MainThread:3685 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-16 19:08:45,041 INFO MainThread:3685 [wandb_init.py:init():871] starting backend +2026-03-16 19:08:45,265 INFO MainThread:3685 [wandb_init.py:init():874] sending inform_init request +2026-03-16 19:08:45,281 INFO MainThread:3685 [wandb_init.py:init():882] backend started and connected +2026-03-16 19:08:45,284 INFO MainThread:3685 [wandb_init.py:init():953] updated telemetry +2026-03-16 19:08:45,308 INFO MainThread:3685 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2026-03-16 19:08:46,861 INFO MainThread:3685 [wandb_init.py:init():1029] starting run threads in backend +2026-03-16 19:08:47,138 INFO MainThread:3685 [wandb_run.py:_console_start():2494] atexit reg +2026-03-16 19:08:47,138 INFO MainThread:3685 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2026-03-16 19:08:47,138 INFO MainThread:3685 [wandb_run.py:_redirect():2411] Wrapping output streams. +2026-03-16 19:08:47,138 INFO MainThread:3685 [wandb_run.py:_redirect():2434] Redirects installed. +2026-03-16 19:08:47,142 INFO MainThread:3685 [wandb_init.py:init():1075] run started, returning control to user process +2026-03-16 19:08:47,144 INFO MainThread:3685 [wandb_run.py:_config_callback():1380] config_cb None None {'vocab_size': 168056, 'max_position_embeddings': 40960, 'hidden_size': 1024, 'intermediate_size': 3072, 'num_hidden_layers': 28, 'num_attention_heads': 16, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': None, 'bos_token_id': 151643, 'eos_token_id': 151645, 'tie_word_embeddings': True, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': 'checkpoints/Qwen3-0.6B', 'transformers_version': '5.3.0', 'model_type': 'qwen3', 'magel_chord_dropout_trigger_prob': 0.6, 'magel_structure_dropout_trigger_prob': 0.6, 'magel_num_audio_token': 16384, 'output_attentions': False, 'output_dir': './output_qwen3_0p6b_train', 'per_device_train_batch_size': 1, 'num_train_epochs': 10, 'max_steps': -1, 'learning_rate': 0.0001, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': None, 'warmup_steps': 1000, 'optim': 'adamw_torch_fused', 'optim_args': None, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'optim_target_modules': None, 'gradient_accumulation_steps': 4, 'average_tokens_across_devices': True, 'max_grad_norm': 5.0, 'label_smoothing_factor': 0.0, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'use_liger_kernel': False, 'liger_kernel_config': None, 'neftune_noise_alpha': None, 'torch_empty_cache_steps': None, 'auto_find_batch_size': False, 'logging_strategy': 'steps', 'logging_steps': 10, 'logging_first_step': False, 'log_on_each_node': True, 'logging_nan_inf_filter': True, 'include_num_input_tokens_seen': 'no', 'log_level': 'passive', 'log_level_replica': 'warning', 'disable_tqdm': False, 'report_to': ['wandb'], 'run_name': None, 'project': 'huggingface', 'trackio_space_id': 'trackio', 'eval_strategy': 'no', 'eval_steps': None, 'eval_delay': 0, 'per_device_eval_batch_size': 8, 'prediction_loss_only': False, 'eval_on_start': False, 'eval_do_concat_batches': True, 'eval_use_gather_object': False, 'eval_accumulation_steps': None, 'include_for_metrics': [], 'batch_eval_metrics': False, 'save_only_model': False, 'save_strategy': 'epoch', 'save_steps': 500, 'save_on_each_node': False, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'push_to_hub': False, 'hub_token': '', 'hub_private_repo': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_always_push': False, 'hub_revision': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'restore_callback_states_from_checkpoint': False, 'full_determinism': False, 'seed': 42, 'data_seed': None, 'use_cpu': False, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'dataloader_drop_last': True, 'dataloader_num_workers': 12, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'dataloader_prefetch_factor': None, 'remove_unused_columns': False, 'label_names': None, 'train_sampling_strategy': 'random', 'length_column_name': 'length', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'ddp_backend': None, 'ddp_timeout': 1800, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'deepspeed': '/algo-intern/user/leonchen/7B_model/ds_zero.json', 'debug': [], 'skip_memory_metrics': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'resume_from_checkpoint': None, 'warmup_ratio': None, 'logging_dir': None, 'local_rank': -1} +2026-03-16 19:08:47,148 INFO MainThread:3685 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 0 - > +2026-03-16 19:08:47,148 INFO MainThread:3685 [wandb_run.py:_config_callback():1380] config_cb model/num_parameters 0 None diff --git a/wandb/run-20260316_190844-eb7rexwd/run-eb7rexwd.wandb b/wandb/run-20260316_190844-eb7rexwd/run-eb7rexwd.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4c8543a984dbb28a32054603498a37985a68ff0c Binary files /dev/null and b/wandb/run-20260316_190844-eb7rexwd/run-eb7rexwd.wandb differ diff --git a/wandb/run-20260316_191457-oo5exfjc/files/output.log b/wandb/run-20260316_191457-oo5exfjc/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20260316_191457-oo5exfjc/files/requirements.txt b/wandb/run-20260316_191457-oo5exfjc/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..df6271ecfee13881ea45cb1a93031b183efef581 --- /dev/null +++ b/wandb/run-20260316_191457-oo5exfjc/files/requirements.txt @@ -0,0 +1,330 @@ +pycparser==2.21 +conda-content-trust==0.1.3 +PySocks==1.7.1 +jsonpointer==2.1 +cryptography==39.0.1 +transformers==5.3.0 +tokenizers==0.22.2 +mpi4py==4.1.1 +deepspeed==0.18.7 +huggingface_hub==1.6.0 +hf-xet==1.3.2 +dnspython==2.8.0 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cuda-cupti-cu12==12.8.90 +crcmod==1.7 +contourpy==1.3.2 +docopt==0.6.2 +pooch==1.8.2 +optuna==4.5.0 +Deprecated==1.2.18 +samplerate==0.1.0 +pyannote.audio==3.4.0 +nltk==3.9.1 +librosa==0.11.0 +lameenc==1.8.1 +opentelemetry-proto==1.38.0 +threadpoolctl==3.6.0 +aliyun-python-sdk-core==2.16.0 +silero-vad==6.0.0 +rotary-embedding-torch==0.6.5 +treetable==0.2.6 +tos==2.8.5 +nvidia-cusolver-cu11==11.4.0.1 +nvidia-cusolver-cu12==11.7.3.90 +umap-learn==0.5.9.post2 +jmespath==0.10.0 +accelerate==1.10.1 +modelscope==1.29.2 +fonttools==4.59.2 +dora_search==0.1.12 +ffmpeg-python==0.2.0 +jamo==0.4.1 +wrapt==1.17.3 +triton==3.4.0 +build==1.3.0 +sentencepiece==0.2.1 +pyannote.database==5.1.3 +lightning-utilities==0.15.2 +nvidia-cudnn-cu11==8.5.0.96 +pynndescent==0.5.13 +numba==0.61.2 +GPUtil==1.4.0 +tomli==2.2.1 +nvidia-cublas-cu11==11.10.3.66 +soundfile==0.13.1 +nvidia-cuda-nvrtc-cu12==12.8.93 +asteroid-filterbanks==0.4.0 +pyannote.metrics==3.2.1 +cmake==4.1.0 +audioread==3.0.1 +pyparsing==3.2.3 +evaluate==0.4.6 +onnxruntime-gpu==1.22.0 +nvidia-nvtx-cu11==11.7.91 +colorlog==6.9.0 +diffq==0.2.4 +greenlet==3.2.4 +markdown-it-py==4.0.0 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-curand-cu11==10.2.10.91 +demucs==4.0.1 +pyannoteai-sdk==0.3.0 +mido==1.3.3 +typer==0.17.4 +oss2==2.19.1 +opentelemetry-api==1.38.0 +jaconv==0.4.0 +nvidia-nccl-cu12==2.27.3 +torchmetrics==1.8.2 +tabulate==0.9.0 +av==15.1.0 +cloudpickle==3.1.1 +timm==0.4.5 +nvidia-cuda-nvrtc-cu11==11.7.99 +julius==0.2.7 +ml_dtypes==0.5.3 +lightning==2.5.5 +torch==2.8.0 +sympy==1.14.0 +pyproject_hooks==1.2.0 +ml_collections==1.1.0 +torchvision==0.23.0 +alembic==1.16.5 +certifi==2025.8.3 +rich==14.1.0 +future==1.0.0 +tiktoken==0.11.0 +whisper==1.1.10 +torchlibrosa==0.1.0 +addict==2.4.0 +numpy==2.2.6 +matplotlib==3.10.6 +omegaconf==2.3.0 +sortedcontainers==2.4.0 +openunmix==1.3.0 +onnx-weekly==1.20.0.dev20250901 +kaldiio==2.18.1 +semver==3.0.4 +faster-whisper==1.2.0 +speechbrain==1.0.3 +nvidia-cuda-cupti-cu11==11.7.101 +loguru==0.7.3 +stempeg==0.2.4 +fsspec==2024.2.0 +nvidia-cublas-cu12==12.8.4.1 +kiwisolver==1.4.9 +whisperx==3.4.2 +torch-complex==0.4.4 +jieba==0.42.1 +lazy_loader==0.4 +opentelemetry-semantic-conventions==0.59b0 +torchaudio==2.8.0 +torchcodec==0.7.0 +pytorch-lightning==2.5.5 +pydub==0.25.1 +datasets==4.0.0 +nvidia-curand-cu12==10.3.9.90 +torch_pitch_shift==1.2.5 +cycler==0.12.1 +scikit-learn==1.7.1 +redis==6.4.0 +nvidia-nvtx-cu12==12.8.90 +opentelemetry-exporter-otlp-proto-grpc==1.38.0 +allin1==1.1.0 +panns-inference==0.1.1 +resampy==0.4.3 +nvidia-cuda-runtime-cu11==11.7.99 +madmom==0.16.1 +opentelemetry-exporter-otlp-proto-http==1.38.0 +pyannote.core==5.0.0 +antlr4-python3-runtime==4.9.3 +nvidia-nvjitlink-cu12==12.8.93 +beartype==0.18.5 +pymongo==4.15.0 +nvidia-cufft-cu11==10.9.0.58 +nvidia-cuda-runtime-cu12==12.8.90 +openai-whisper==20250625 +filelock==3.20.0 +retrying==1.4.2 +pyannote.pipeline==3.0.1 +submitit==1.5.3 +primePy==1.3 +gpustat==1.1.1 +torch-audiomentations==0.12.0 +scipy==1.15.3 +nvidia-nccl-cu11==2.14.3 +googleapis-common-protos==1.70.0 +nvidia-cusparselt-cu12==0.7.1 +opentelemetry-exporter-otlp-proto-common==1.38.0 +soxr==0.5.0.post1 +audio-separator==0.36.1 +funasr==1.2.7 +pytorch-wpe==0.0.1 +lit==18.1.8 +Mako==1.3.10 +nvidia-cufft-cu12==11.3.3.83 +opentelemetry-sdk==1.38.0 +ctranslate2==4.4.0 +nvidia-cusparse-cu11==11.7.4.91 +hydra-core==1.3.2 +mdurl==0.1.2 +pytorch-metric-learning==2.9.0 +Cython==3.1.3 +SQLAlchemy==2.0.43 +blessed==1.21.0 +onnx2torch-py313==1.6.0 +joblib==1.5.2 +pyarrow-hotfix==0.7 +conda==25.7.0 +llvmlite==0.44.0 +pycryptodome==3.23.0 +tensorboardX==2.6.4 +opentelemetry-exporter-otlp==1.38.0 +aliyun-python-sdk-kms==2.16.5 +wget==3.2 +shellingham==1.5.4 +nvidia-cufile-cu12==1.13.1.3 +editdistance==0.8.1 +annotated-types==0.7.0 +py-cpuinfo==9.0.0 +HyperPyYAML==1.2.2 +httpx==0.28.1 +smmap==5.0.2 +nvidia-ml-py==13.580.65 +flash_attn==2.8.2 +pydantic_core==2.33.2 +hjson==3.1.0 +multiprocess==0.70.16 +pyarrow==21.0.0 +gitdb==4.0.12 +dill==0.3.8 +xxhash==3.5.0 +safetensors==0.6.2 +h11==0.16.0 +openai==1.102.0 +GitPython==3.1.45 +msgpack==1.1.1 +anyio==4.10.0 +pydantic==2.11.7 +tzdata==2025.2 +PyYAML==6.0.2 +sentry-sdk==2.35.1 +click==8.2.1 +pytz==2025.2 +einops==0.8.1 +pandas==2.3.2 +typing-inspection==0.4.1 +wandb==0.21.1 +easydict==1.13 +sniffio==1.3.1 +jiter==0.10.0 +httpcore==1.0.9 +networkx==3.3 +Jinja2==3.1.4 +mpmath==1.3.0 +pillow==11.0.0 +distro==1.9.0 +executing==0.8.3 +wcwidth==0.2.5 +aiohappyeyeballs==2.6.1 +comm==0.2.1 +pyzmq==26.2.0 +frozendict==2.4.2 +prompt-toolkit==3.0.43 +urllib3==2.3.0 +pycosat==0.6.6 +typing_extensions==4.12.2 +typing_extensions==4.14.0 +platformdirs==4.3.7 +boto3==1.38.39 +Werkzeug==3.1.3 +regex==2024.11.6 +nest-asyncio==1.6.0 +zstandard==0.23.0 +pybind11==2.13.6 +Brotli==1.0.9 +jedi==0.19.2 +botocore==1.38.39 +pip==25.1 +menuinst==2.2.0 +psutil==7.0.0 +psutil==5.9.0 +ruamel.yaml==0.18.10 +setuptools==78.1.1 +attrs==25.3.0 +boltons==24.1.0 +protobuf==6.31.1 +tornado==6.5.1 +Markdown==3.8.1 +pexpect==4.9.0 +libmambapy==2.0.5 +tensorboard==2.19.0 +charset-normalizer==3.3.2 +Pygments==2.19.1 +cffi==1.17.1 +conda-package-handling==2.4.0 +yarl==1.20.1 +ptyprocess==0.7.0 +asttokens==3.0.0 +idna==3.7 +archspec==0.2.3 +six==1.17.0 +aiohttp==3.12.13 +s3transfer==0.13.0 +grpcio==1.73.0 +decorator==5.1.1 +stack-data==0.2.0 +truststore==0.10.0 +tqdm==4.67.1 +jupyter_core==5.7.2 +ninja==1.11.1.4 +debugpy==1.8.11 +pluggy==1.5.0 +aiosignal==1.3.2 +tensorboard-data-server==0.7.2 +frozenlist==1.7.0 +python-dateutil==2.9.0.post0 +jsonpatch==1.33 +traitlets==5.14.3 +requests==2.32.3 +packaging==24.2 +async-timeout==5.0.1 +wheel==0.45.1 +pure-eval==0.2.2 +jupyter_client==8.6.3 +conda-libmamba-solver==25.4.0 +ipython==8.30.0 +ruamel.yaml.clib==0.2.12 +multidict==6.5.0 +conda_package_streaming==0.11.0 +pyprof==1.0.0 +matplotlib-inline==0.1.6 +absl-py==2.3.0 +MarkupSafe==3.0.2 +ipykernel==6.29.5 +propcache==0.3.2 +exceptiongroup==1.2.0 +parso==0.8.4 +fsmnvad==0.0.1 +kaldi-native-fbank==1.22.1 +setuptools==65.0.0 +flatbuffers==25.2.10 +coloredlogs==15.0.1 +humanfriendly==10.0 +importlib_metadata==8.0.0 +jaraco.text==3.12.1 +inflect==7.3.1 +typing_extensions==4.12.2 +typeguard==4.3.0 +tomli==2.0.1 +zipp==3.19.2 +jaraco.collections==5.1.0 +autocommand==2.2.2 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +platformdirs==4.2.2 +backports.tarfile==1.2.0 +packaging==24.2 +wheel==0.45.1 +jaraco.context==5.3.0 diff --git a/wandb/run-20260316_191457-oo5exfjc/files/wandb-metadata.json b/wandb/run-20260316_191457-oo5exfjc/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ae10326cb956ec0c3ffd36e3e485853d0f4c5f75 --- /dev/null +++ b/wandb/run-20260316_191457-oo5exfjc/files/wandb-metadata.json @@ -0,0 +1,90 @@ +{ + "os": "Linux-5.4.250-2-velinux1u1-amd64-x86_64-with-glibc2.35", + "python": "CPython 3.10.18", + "startedAt": "2026-03-16T19:14:57.192506Z", + "args": [ + "--gradient_checkpointing", + "--deepspeed", + "/algo-intern/user/leonchen/7B_model/ds_zero.json" + ], + "program": "/algo-intern/user/leonchen/cond_gen/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "email": "897344367@qq.com", + "root": "/algo-intern/user/leonchen/cond_gen", + "host": "t-20260317031104-gc465-worker-0", + "executable": "/root/miniconda3/bin/python3.10", + "cpu_count": 56, + "cpu_count_logical": 112, + "gpu": "NVIDIA A100-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "3779302981632", + "used": "2782735159296" + } + }, + "memory": { + "total": "2021763125248" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-75884388-6972-86b6-ba8d-0f9b3b376afb" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-e9bd3860-f0d8-0308-3ac9-c935162efcf2" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1444bee1-7b17-dc25-c38f-e522d724c458" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-87209582-927f-facd-7bbd-260447ae38bc" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4c45d218-8f3c-0c2a-b1af-ddb4e3458ac8" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-7e9eedf7-f146-5c7d-65c0-f3b89c9d7e65" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-41fa2f62-c201-b9e3-d120-638d444345b3" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-0d1e641f-2055-b68c-968f-3045939102d7" + } + ], + "cudaVersion": "12.8", + "writerId": "5f1od3cl6hm804czmttjhyhrfebh54g3" +} \ No newline at end of file diff --git a/wandb/run-20260316_191457-oo5exfjc/logs/debug-internal.log b/wandb/run-20260316_191457-oo5exfjc/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..268053a93ba28b641bc7e16cd665d6c70c211bd8 --- /dev/null +++ b/wandb/run-20260316_191457-oo5exfjc/logs/debug-internal.log @@ -0,0 +1,312 @@ +{"time":"2026-03-16T19:14:57.900213473Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2026-03-16T19:14:58.685557117Z","level":"INFO","msg":"stream: created new stream","id":"oo5exfjc"} +{"time":"2026-03-16T19:14:58.685642016Z","level":"INFO","msg":"stream: started","id":"oo5exfjc"} +{"time":"2026-03-16T19:14:58.685686478Z","level":"INFO","msg":"handler: started","stream_id":"oo5exfjc"} +{"time":"2026-03-16T19:14:58.685713026Z","level":"INFO","msg":"sender: started","stream_id":"oo5exfjc"} +{"time":"2026-03-16T19:14:58.6856662Z","level":"INFO","msg":"writer: started","stream_id":"oo5exfjc"} +{"time":"2026-03-16T19:15:00.060043511Z","level":"ERROR","msg":"runconsolelogs: failed to write to file: write /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_191457-oo5exfjc/files/output.log: invalid argument"} +{"time":"2026-03-16T22:44:52.878105634Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-03-17T07:46:18.843932176Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T09:08:16.689454984Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T09:27:23.192350282Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:27:55.534937311Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:28:30.18959691Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:28:32.715969776Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:37858->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:29:08.223848872Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:29:55.918263136Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:30:25.406256135Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:56496->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:31:04.557963137Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:31:17.407826338Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T09:32:34.560190511Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:33:37.503945191Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2026-03-17T09:34:04.561757606Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:35:34.562686945Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:36:08.918421631Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:56070->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:36:53.191466494Z","level":"WARN","msg":"sender: taking a long time","seconds":600.000027928,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"59og14jrhn91\" connection_id:\"1(@)\")"} +{"time":"2026-03-17T09:37:04.56327784Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:38:34.564592755Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:38:52.895952913Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2026-03-17T09:40:04.565759074Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:40:06.50335681Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T09:40:39.535396526Z","level":"WARN","msg":"runwork: taking a long time","seconds":600.00058653,"work":"WorkRecord(*service_go_proto.Record_OutputRaw); Control(connection_id:\"1(@)\")"} +{"time":"2026-03-17T09:40:44.399910136Z","level":"WARN","msg":"runwork: taking a long time","seconds":600.000902536,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2026-03-17T09:40:44.494085077Z","level":"WARN","msg":"runwork: taking a long time","seconds":600.000203902,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2026-03-17T09:41:34.568031418Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:43:04.56859921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:43:39.978549352Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:34562->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:44:16.893089255Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 192.168.1.14:37274->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:45:46.894288274Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:46:17.16236988Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:53230->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:46:53.202198903Z","level":"WARN","msg":"sender: taking a long time","seconds":1200.01074862,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"59og14jrhn91\" connection_id:\"1(@)\")"} +{"time":"2026-03-17T09:46:56.880669708Z","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1203.689294579,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"59og14jrhn91\" connection_id:\"1(@)\")"} +{"time":"2026-03-17T09:46:56.880841952Z","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":977.346075274,"work":"WorkRecord(*service_go_proto.Record_OutputRaw); Control(connection_id:\"1(@)\")"} +{"time":"2026-03-17T09:46:56.881055742Z","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":972.482077504,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2026-03-17T09:46:56.88108187Z","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":972.387207052,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2026-03-17T09:47:38.20852405Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:48:10.465672357Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:48:40.530256244Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:60566->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:51:51.135960603Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2026-03-17T09:52:23.213578644Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:52:55.295909368Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:53:06.925275977Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T09:53:30.272237206Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:54:09.25627332Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:54:58.04384561Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T09:55:21.815128544Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T09:56:04.947839902Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:57:04.371336699Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42848->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T09:57:38.219376743Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:58:10.63043232Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T09:58:31.925235626Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:49366->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:00:36.8100992Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:56554->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:01:23.221737561Z","level":"ERROR","msg":"sender: sendStopStatus: failed to get run stopped status: net/http: request canceled (Client.Timeout or context cancellation while reading body)"} +{"time":"2026-03-17T10:02:08.222938855Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:04:00.434815879Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:04:15.172618298Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:04:23.224852833Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:04:55.600870559Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:05:29.768168089Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T10:06:09.232539001Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:06:56.011382106Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:07:59.722948282Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:08:26.413796535Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:08:48.906601185Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:09:29.723988167Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:10:59.725449517Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:11:56.751010263Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:53724->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:12:29.727059437Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T10:15:13.344180424Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:47252->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:15:43.995626684Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:34490->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:15:53.482056009Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T10:17:18.641015832Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:18:19.525731387Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:49156->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:20:48.192351304Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:34800->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:23:50.452235584Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:24:39.246827362Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:26:38.528026681Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:26:41.788956783Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 192.168.1.14:37416->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:26:51.112660956Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:28:41.847187836Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:40726->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:31:46.127582641Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:35630->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:32:21.280381776Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": http2: client conn is closed"} +{"time":"2026-03-17T10:36:59.005145236Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:53562->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:38:19.938789664Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:40:18.682135756Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:34748->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:43:41.592493552Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:43:57.65557097Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42626->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:46:53.574655784Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T10:47:15.981047288Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:49:31.088882007Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:47060->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:50:53.997941399Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:40208->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:51:28.85336419Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:52:02.631821505Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:52:34.179775657Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T10:54:28.555910959Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42998->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:54:38.580865238Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:56:08.582208813Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:57:44.470380157Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T10:59:04.524652435Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:41822->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T10:59:40.228260797Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:01:04.832549127Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:60852->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:03:38.999341329Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:48552->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:08:41.545224121Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:33354->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:09:41.870048516Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:11:28.664262548Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:13:53.911441195Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:46568->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:16:58.847950247Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2026-03-17T11:17:18.009630538Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:19:04.693765235Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:57046->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:20:59.864016658Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:34012->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:21:55.323861768Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:35386->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:24:03.136490212Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42212->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:26:01.976544083Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:43492->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:27:30.045319473Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:33992->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:27:38.797373701Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T11:28:11.147109707Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T11:29:22.682646229Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:41540->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:30:27.857587025Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:32:52.545525597Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:32976->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:36:56.108094612Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": context deadline exceeded"} +{"time":"2026-03-17T11:38:27.712277027Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:43156->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:41:37.381697898Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": context deadline exceeded"} +{"time":"2026-03-17T11:43:42.79511835Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:51908->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:46:46.240043947Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:55284->35.186.228.49:443: read: connection timed out"} +{"time":"2026-03-17T11:47:13.117868505Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:48:12.61554823Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:48024->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:50:08.937877088Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:47026->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:51:38.812994668Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T11:52:11.107117658Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T11:52:24.671989854Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2026-03-17T11:52:45.628704046Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T11:53:24.094807963Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T11:53:59.437958364Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 192.168.1.14:54734->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:54:33.038473881Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:43966->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:54:51.969852434Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:55:01.615184382Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T11:56:04.682451756Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42772->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T11:56:31.616497401Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T11:56:54.080409156Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T11:58:54.077639531Z","level":"ERROR","msg":"sender: sendStopStatus: failed to get run stopped status: context deadline exceeded (Client.Timeout or context cancellation while reading body)"} +{"time":"2026-03-17T11:59:39.07934131Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T11:59:58.87127361Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": context deadline exceeded"} +{"time":"2026-03-17T12:00:11.571645311Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:02:09.081410979Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T12:02:41.107521417Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:03:04.054463346Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:38128->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:03:15.900987239Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:03:54.296336238Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:60078->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:03:55.350901491Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:04:08.627377628Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": http2: client conn is closed"} +{"time":"2026-03-17T12:04:43.687049332Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:05:50.035342081Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:05:56.670327892Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:36254->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:06:36.077595309Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:07:54.085647389Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T12:08:26.257350425Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:08:27.069614517Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42714->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:09:00.59831007Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:09:39.267505214Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:10:14.089001681Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:60104->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:10:41.507456889Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:11:54.089598646Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T12:12:26.583423275Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T12:15:29.112322675Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:46962->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:16:26.634978873Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:17:06.723851511Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:40980->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:19:39.096227492Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:22:24.098285485Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T12:22:46.003136763Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:48638->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:22:56.18143649Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:24:51.662649937Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:53794->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:27:16.057851821Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:56220->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:27:24.102694144Z","level":"ERROR","msg":"sender: sendStopStatus: failed to get run stopped status: net/http: request canceled (Client.Timeout or context cancellation while reading body)"} +{"time":"2026-03-17T12:28:18.702920368Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:30:33.586706456Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:32:09.106060561Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T12:32:41.163860825Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:33:15.230003147Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:33:15.36258496Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:33:53.951514182Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:34:40.910090608Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:35:49.223318022Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:37:19.224221104Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T12:37:53.01617309Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:41890->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:38:10.921047185Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:39:59.197010917Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:41:22.938301134Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:37782->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:41:56.043073169Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:42:33.024050951Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42212->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:45:51.226523099Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:60454->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:46:40.698271598Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:47:38.433134745Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:49:35.029646986Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:51:04.384963789Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:51:40.000786564Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:51:55.617381374Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:52:21.280166355Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:52:39.851156671Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:53:08.357644307Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:53:40.168699909Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:35770->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:56:06.089800447Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:44742->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:56:41.714668001Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T12:57:00.884244814Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:48112->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T12:58:50.961818717Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:34094->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:02:30.327356015Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:48318->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:05:39.033697533Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:09:20.681341883Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:54524->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:12:08.8000177Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2026-03-17T13:15:39.642041445Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:45002->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:18:12.824115064Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:48198->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:19:44.347488154Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:22:49.709670316Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42906->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:23:47.674625503Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:42188->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:27:59.31086044Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:30:45.884159695Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:33842->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:32:27.56290315Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:33:04.638691314Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:35:25.871378397Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:37:28.269359679Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:57768->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:39:09.464505796Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T13:39:41.901524701Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T13:40:32.39372356Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T13:43:37.932139228Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:41950->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:45:57.16240471Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:39402->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:46:14.233783803Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:46:59.569713801Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:48:23.832333279Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:49:29.007628132Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:49:47.242494825Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:52:16.554710879Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:52:39.473246006Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T13:55:23.002030209Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:50956->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T13:55:59.833894357Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:56:50.23870141Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:57:04.750781642Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T13:59:27.738904238Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:51318->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:00:50.121089508Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:46394->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:03:11.868849175Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:03:32.027063641Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:33452->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:03:54.560327555Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:05:37.387305673Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:07:03.318329982Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:07:52.969534954Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:08:36.532565202Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:11:23.514918049Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:48844->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:13:47.853821788Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:37664->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:16:39.931266877Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:56992->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:18:03.32972744Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:55344->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:20:40.524077117Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:35328->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:21:06.710293706Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:38488->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:25:03.534540412Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:38514->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:25:26.75354312Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:25:46.878571391Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:27:05.783156426Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:29:17.354067918Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:35280->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:30:14.480069084Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:31:30.008154771Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:34:19.730188505Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:41448->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:36:20.043042917Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:56462->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:38:31.628926976Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:60096->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:40:24.782327014Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:45338->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:41:51.302896899Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:45404->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:46:28.604304812Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:32902->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T14:49:21.571225762Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T14:58:19.562616516Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:53622->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T15:04:55.542348642Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T15:06:04.163933327Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T15:10:55.939447875Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": http2: client conn is closed"} +{"time":"2026-03-17T15:17:13.684618584Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T15:33:47.253638474Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T15:38:59.844557242Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T15:42:47.541686091Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T15:49:11.225436821Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T15:55:26.354902487Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T16:31:25.332060129Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2026-03-17T16:31:57.350827182Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T16:32:14.590593727Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:35536->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T16:32:43.85996472Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T16:33:01.424810603Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T16:34:26.646636618Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:58570->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T16:34:51.658357453Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:37046->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T16:38:02.239881232Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:59378->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T16:41:02.418272726Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:51460->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T16:41:25.555631176Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T16:43:51.424929906Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:33052->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T16:44:04.388245035Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T18:53:11.24180581Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:57054->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T18:53:56.859088092Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T18:55:05.629454748Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T18:57:11.055286006Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2026-03-17T18:57:18.065659231Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:51808->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T18:58:05.493390413Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": http2: client conn is closed"} +{"time":"2026-03-17T18:58:54.184377326Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T18:59:18.124606799Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T18:59:43.867514997Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T19:00:25.221542808Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-17T19:00:53.532865056Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:57662->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-17T19:03:55.517490264Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-18T00:02:53.019501069Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-18T03:09:03.028712545Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": read tcp 192.168.1.14:34340->35.186.228.49:443: read: connection reset by peer"} +{"time":"2026-03-18T04:36:11.580340834Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-18T04:56:05.320437086Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} +{"time":"2026-03-18T05:03:18.111547682Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/897344367-queen-mary-university-of-london/vaultum-qwen3-0p6b/oo5exfjc/file_stream\": unexpected EOF"} diff --git a/wandb/run-20260316_191457-oo5exfjc/logs/debug.log b/wandb/run-20260316_191457-oo5exfjc/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..9c775092fce5867b30cff61d5e01903699bbed88 --- /dev/null +++ b/wandb/run-20260316_191457-oo5exfjc/logs/debug.log @@ -0,0 +1,24 @@ +2026-03-16 19:14:57,656 INFO MainThread:3646 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2026-03-16 19:14:57,656 INFO MainThread:3646 [wandb_setup.py:_flush():80] Configure stats pid to 3646 +2026-03-16 19:14:57,656 INFO MainThread:3646 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-03-16 19:14:57,656 INFO MainThread:3646 [wandb_setup.py:_flush():80] Loading settings from /algo-intern/user/leonchen/cond_gen/wandb/settings +2026-03-16 19:14:57,656 INFO MainThread:3646 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-03-16 19:14:57,657 INFO MainThread:3646 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_191457-oo5exfjc/logs/debug.log +2026-03-16 19:14:57,659 INFO MainThread:3646 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /algo-intern/user/leonchen/cond_gen/wandb/run-20260316_191457-oo5exfjc/logs/debug-internal.log +2026-03-16 19:14:57,659 INFO MainThread:3646 [wandb_init.py:init():830] calling init triggers +2026-03-16 19:14:57,659 INFO MainThread:3646 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-16 19:14:57,659 INFO MainThread:3646 [wandb_init.py:init():871] starting backend +2026-03-16 19:14:57,878 INFO MainThread:3646 [wandb_init.py:init():874] sending inform_init request +2026-03-16 19:14:57,894 INFO MainThread:3646 [wandb_init.py:init():882] backend started and connected +2026-03-16 19:14:57,896 INFO MainThread:3646 [wandb_init.py:init():953] updated telemetry +2026-03-16 19:14:57,919 INFO MainThread:3646 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2026-03-16 19:14:59,377 INFO MainThread:3646 [wandb_init.py:init():1029] starting run threads in backend +2026-03-16 19:14:59,651 INFO MainThread:3646 [wandb_run.py:_console_start():2494] atexit reg +2026-03-16 19:14:59,651 INFO MainThread:3646 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2026-03-16 19:14:59,651 INFO MainThread:3646 [wandb_run.py:_redirect():2411] Wrapping output streams. +2026-03-16 19:14:59,652 INFO MainThread:3646 [wandb_run.py:_redirect():2434] Redirects installed. +2026-03-16 19:14:59,656 INFO MainThread:3646 [wandb_init.py:init():1075] run started, returning control to user process +2026-03-16 19:14:59,658 INFO MainThread:3646 [wandb_run.py:_config_callback():1380] config_cb None None {'vocab_size': 168056, 'max_position_embeddings': 40960, 'hidden_size': 1024, 'intermediate_size': 3072, 'num_hidden_layers': 28, 'num_attention_heads': 16, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': None, 'bos_token_id': 151643, 'eos_token_id': 151645, 'tie_word_embeddings': True, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': 'checkpoints/Qwen3-0.6B', 'transformers_version': '5.3.0', 'model_type': 'qwen3', 'magel_chord_dropout_trigger_prob': 0.6, 'magel_structure_dropout_trigger_prob': 0.6, 'magel_num_audio_token': 16384, 'output_attentions': False, 'output_dir': './output_qwen3_0p6b_train', 'per_device_train_batch_size': 1, 'num_train_epochs': 20, 'max_steps': -1, 'learning_rate': 0.0001, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': None, 'warmup_steps': 1000, 'optim': 'adamw_torch_fused', 'optim_args': None, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'optim_target_modules': None, 'gradient_accumulation_steps': 4, 'average_tokens_across_devices': True, 'max_grad_norm': 5.0, 'label_smoothing_factor': 0.0, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'use_liger_kernel': False, 'liger_kernel_config': None, 'neftune_noise_alpha': None, 'torch_empty_cache_steps': None, 'auto_find_batch_size': False, 'logging_strategy': 'steps', 'logging_steps': 10, 'logging_first_step': False, 'log_on_each_node': True, 'logging_nan_inf_filter': True, 'include_num_input_tokens_seen': 'no', 'log_level': 'passive', 'log_level_replica': 'warning', 'disable_tqdm': False, 'report_to': ['wandb'], 'run_name': None, 'project': 'huggingface', 'trackio_space_id': 'trackio', 'eval_strategy': 'no', 'eval_steps': None, 'eval_delay': 0, 'per_device_eval_batch_size': 8, 'prediction_loss_only': False, 'eval_on_start': False, 'eval_do_concat_batches': True, 'eval_use_gather_object': False, 'eval_accumulation_steps': None, 'include_for_metrics': [], 'batch_eval_metrics': False, 'save_only_model': False, 'save_strategy': 'epoch', 'save_steps': 500, 'save_on_each_node': False, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'push_to_hub': False, 'hub_token': '', 'hub_private_repo': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_always_push': False, 'hub_revision': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'restore_callback_states_from_checkpoint': False, 'full_determinism': False, 'seed': 42, 'data_seed': None, 'use_cpu': False, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'dataloader_drop_last': True, 'dataloader_num_workers': 12, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'dataloader_prefetch_factor': None, 'remove_unused_columns': False, 'label_names': None, 'train_sampling_strategy': 'random', 'length_column_name': 'length', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'ddp_backend': None, 'ddp_timeout': 1800, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'deepspeed': '/algo-intern/user/leonchen/7B_model/ds_zero.json', 'debug': [], 'skip_memory_metrics': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'resume_from_checkpoint': None, 'warmup_ratio': None, 'logging_dir': None, 'local_rank': -1} +2026-03-16 19:14:59,662 INFO MainThread:3646 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 0 - > +2026-03-16 19:14:59,662 INFO MainThread:3646 [wandb_run.py:_config_callback():1380] config_cb model/num_parameters 0 None