kisoku-3.2b-base / items /_sharding
0arch-io's picture
Upload Kisoku 3.2B base model (100K steps, loss 2.733)
6f30575 verified
{"b3B0X3N0YXRlLjAuY291bnQ=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIuZGVjb2Rlcl9ub3JtLnNjYWxl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubG9naXRzX2RlbnNlLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLm1scC53aV8wLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"], \"stage\", [\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLm1scC53aV8xLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"], \"stage\", [\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLm1scC53by5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"], \"stage\", [\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnBvc3Rfc2VsZl9hdHRlbnRpb25fbGF5ZXJfbm9ybS5zY2FsZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"], \"stage\"], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnByZV9zZWxmX2F0dGVudGlvbl9sYXllcl9ub3JtLnNjYWxl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"], \"stage\"], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLm91dC5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], \"stage\", null, [\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLmtleS5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLnF1ZXJ5Lmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLnZhbHVlLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubXUucGFyYW1zLnRva2VuX2VtYmVkZGVyLmVtYmVkZGluZw==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], [\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIuZGVjb2Rlcl9ub3JtLnNjYWxl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubG9naXRzX2RlbnNlLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLm1scC53aV8wLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"], \"stage\", [\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLm1scC53aV8xLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"], \"stage\", [\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLm1scC53by5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"], \"stage\", [\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnBvc3Rfc2VsZl9hdHRlbnRpb25fbGF5ZXJfbm9ybS5zY2FsZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"], \"stage\"], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnByZV9zZWxmX2F0dGVudGlvbl9sYXllcl9ub3JtLnNjYWxl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"], \"stage\"], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLm91dC5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], \"stage\", null, [\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLmtleS5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLnF1ZXJ5Lmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLmRlY29kZXIubGF5ZXJzLnNlbGZfYXR0ZW50aW9uLnZhbHVlLmtlcm5lbA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjAubnUucGFyYW1zLnRva2VuX2VtYmVkZGVyLmVtYmVkZGluZw==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], [\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","b3B0X3N0YXRlLjIuY291bnQ=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","c3RlcA==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy50b2tlbl9lbWJlZGRlci5lbWJlZGRpbmc=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], [\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmRlY29kZXJfbm9ybS5zY2FsZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5tbHAud28ua2VybmVs":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"], \"stage\", [\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5tbHAud2lfMC5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"], \"stage\", [\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5tbHAud2lfMS5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"sequence\", \"tensor_transpose\", \"context\", \"expert\"], \"stage\", [\"fsdp_transpose\", \"tensor\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5wb3N0X3NlbGZfYXR0ZW50aW9uX2xheWVyX25vcm0uc2NhbGU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"], \"stage\"], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5wcmVfc2VsZl9hdHRlbnRpb25fbGF5ZXJfbm9ybS5zY2FsZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\"], \"stage\"], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5zZWxmX2F0dGVudGlvbi52YWx1ZS5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5zZWxmX2F0dGVudGlvbi5rZXkua2VybmVs":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5zZWxmX2F0dGVudGlvbi5vdXQua2VybmVs":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], \"stage\", null, [\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxheWVycy5zZWxmX2F0dGVudGlvbi5xdWVyeS5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], \"stage\", [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"], null], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}","cGFyYW1zLnBhcmFtcy5kZWNvZGVyLmxvZ2l0c19kZW5zZS5rZXJuZWw=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"axis_names\": [\"data\", \"stage\", \"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"context_autoregressive\", \"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"expert\", \"autoregressive\"], \"partition_spec\": [[\"fsdp\", \"fsdp_transpose\", \"sequence\", \"context\", \"expert\"], [\"tensor\", \"tensor_transpose\", \"tensor_sequence\", \"autoregressive\"]], \"device_mesh\": {\"mesh\": [[[[[[[[[[[[{\"id\": 0}]]]]]]]]], [[[[[[[[[{\"id\": 4}]]]]]]]]], [[[[[[[[[{\"id\": 8}]]]]]]]]], [[[[[[[[[{\"id\": 12}]]]]]]]]], [[[[[[[[[{\"id\": 2}]]]]]]]]], [[[[[[[[[{\"id\": 6}]]]]]]]]], [[[[[[[[[{\"id\": 10}]]]]]]]]], [[[[[[[[[{\"id\": 14}]]]]]]]]], [[[[[[[[[{\"id\": 1}]]]]]]]]], [[[[[[[[[{\"id\": 5}]]]]]]]]], [[[[[[[[[{\"id\": 9}]]]]]]]]], [[[[[[[[[{\"id\": 13}]]]]]]]]], [[[[[[[[[{\"id\": 3}]]]]]]]]], [[[[[[[[[{\"id\": 7}]]]]]]]]], [[[[[[[[[{\"id\": 11}]]]]]]]]], [[[[[[[[[{\"id\": 15}]]]]]]]]]]]]}}"}