thiomajid committed on
Commit
5e82e8a
·
verified ·
1 Parent(s): d2f46f7

xLSTM is ready for image generation

Browse files
Files changed (30) hide show
  1. .gitattributes +4 -0
  2. config.json +81 -0
  3. logs/10000/_CHECKPOINT_METADATA +1 -0
  4. logs/10000/default/_METADATA +1 -0
  5. logs/10000/default/_sharding +1 -0
  6. logs/10000/default/array_metadatas/process_0 +1 -0
  7. logs/10000/default/d/5887d7cf72c64115a964c1a43d591c09 +0 -0
  8. logs/10000/default/manifest.ocdbt +0 -0
  9. logs/10000/default/ocdbt.process_0/d/399bbb6a26e3af7391ef4ee2c557d20e +3 -0
  10. logs/10000/default/ocdbt.process_0/d/3a746e10f4549f4e3741b3eb6050a69f +0 -0
  11. logs/10000/default/ocdbt.process_0/d/90175ba14c25d7ecb5c3934722be7b04 +0 -0
  12. logs/10000/default/ocdbt.process_0/d/b1408f6a776dffd6bb175e54fecaae3e +0 -0
  13. logs/10000/default/ocdbt.process_0/d/f957dd57345bd22cb0916bf0f123bcac +3 -0
  14. logs/10000/default/ocdbt.process_0/manifest.ocdbt +0 -0
  15. logs/8000/_CHECKPOINT_METADATA +1 -0
  16. logs/8000/default/_METADATA +1 -0
  17. logs/8000/default/_sharding +1 -0
  18. logs/8000/default/array_metadatas/process_0 +1 -0
  19. logs/8000/default/d/e3e0a03e844d7fec1a8d8dfdb3d50284 +0 -0
  20. logs/8000/default/manifest.ocdbt +0 -0
  21. logs/8000/default/ocdbt.process_0/d/2f9047fa57561a8b64e87e5c02a78301 +3 -0
  22. logs/8000/default/ocdbt.process_0/d/91152cc46185c182d6f78659e463756b +0 -0
  23. logs/8000/default/ocdbt.process_0/d/a922960ae9f2d2c29f6fcfcf28bd34e0 +0 -0
  24. logs/8000/default/ocdbt.process_0/d/b07075d27dbfe48c28f624fcfc889e48 +3 -0
  25. logs/8000/default/ocdbt.process_0/d/d2bc1d0475dd12ab55ff91956dbde771 +0 -0
  26. logs/8000/default/ocdbt.process_0/manifest.ocdbt +0 -0
  27. logs/xLSTM-tiny-stories/events.out.tfevents.1761144595.3abd4fca6fd6.6217.0.v2 +3 -0
  28. timing_summary.json +7 -0
  29. train_history.json +11 -0
  30. trainer_config.json +47 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ logs/10000/default/ocdbt.process_0/d/399bbb6a26e3af7391ef4ee2c557d20e filter=lfs diff=lfs merge=lfs -text
37
+ logs/10000/default/ocdbt.process_0/d/f957dd57345bd22cb0916bf0f123bcac filter=lfs diff=lfs merge=lfs -text
38
+ logs/8000/default/ocdbt.process_0/d/2f9047fa57561a8b64e87e5c02a78301 filter=lfs diff=lfs merge=lfs -text
39
+ logs/8000/default/ocdbt.process_0/d/b07075d27dbfe48c28f624fcfc889e48 filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mlstm_block": {
3
+ "mlstm": {
4
+ "proj_factor": 2.0,
5
+ "round_proj_up_dim_up": true,
6
+ "round_proj_up_to_multiple_of": 64,
7
+ "_proj_up_dim": 768,
8
+ "conv1d_kernel_size": 4,
9
+ "qkv_proj_blocksize": 32,
10
+ "num_heads": 4,
11
+ "embedding_dim": 384,
12
+ "bias": false,
13
+ "dropout": 0.0,
14
+ "context_length": 256,
15
+ "_num_blocks": 6,
16
+ "_inner_embedding_dim": 768
17
+ },
18
+ "_num_blocks": 6,
19
+ "_block_idx": null
20
+ },
21
+ "slstm_block": {
22
+ "slstm": {
23
+ "hidden_size": 384,
24
+ "num_heads": 4,
25
+ "num_states": 4,
26
+ "backend": "vanilla",
27
+ "function": "slstm",
28
+ "bias_init": "powerlaw_blockdependent",
29
+ "recurrent_weight_init": "zeros",
30
+ "_block_idx": null,
31
+ "_num_blocks": 6,
32
+ "num_gates": 4,
33
+ "gradient_recurrent_clipval": null,
34
+ "forward_clipval": null,
35
+ "batch_size": 8,
36
+ "input_shape": "BSGNH",
37
+ "internal_input_shape": "SBNGH",
38
+ "output_shape": "BNSH",
39
+ "dtype": "bfloat16",
40
+ "dtype_b": "float32",
41
+ "dtype_r": "bfloat16",
42
+ "dtype_w": "bfloat16",
43
+ "dtype_g": "bfloat16",
44
+ "dtype_s": "bfloat16",
45
+ "dtype_a": "float32",
46
+ "initial_val": 0.0,
47
+ "enable_automatic_mixed_precision": true,
48
+ "embedding_dim": 384,
49
+ "conv1d_kernel_size": 4,
50
+ "group_norm_weight": true,
51
+ "dropout": 0.0
52
+ },
53
+ "feedforward": {
54
+ "proj_factor": 1.3,
55
+ "round_proj_up_dim_up": true,
56
+ "round_proj_up_to_multiple_of": 64,
57
+ "_proj_up_dim": 0,
58
+ "act_fn": "swish",
59
+ "embedding_dim": -1,
60
+ "dropout": 0.0,
61
+ "bias": false,
62
+ "ff_type": "ffn_gated",
63
+ "_num_blocks": 1
64
+ },
65
+ "_num_blocks": 6,
66
+ "_block_idx": null
67
+ },
68
+ "context_length": 256,
69
+ "num_blocks": 6,
70
+ "embedding_dim": 384,
71
+ "add_post_blocks_norm": true,
72
+ "bias": false,
73
+ "dropout": 0.0,
74
+ "slstm_at": [],
75
+ "_block_map": "0,0,0,0,0,0",
76
+ "vocab_size": 49152,
77
+ "tie_weights": false,
78
+ "weight_decay_on_embedding": false,
79
+ "add_embedding_dropout": false,
80
+ "pad_token_id": 0
81
+ }
logs/10000/_CHECKPOINT_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761148034520662925, "commit_timestamp_nsecs": 1761148035000077197, "custom_metadata": {}}
logs/10000/default/_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [384, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [49152, 384]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 4, 1, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 24, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, 
{"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 2304, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": 
"jax.Array", "skip_deserialize": false, "write_shape": [6, 2304, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768, 384]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 384, 1536]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 24, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", 
"key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 24, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 384]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [384]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
logs/10000/default/_sharding ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 
0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 
0}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}"}
logs/10000/default/array_metadatas/process_0 ADDED
@@ -0,0 +1 @@
 
 
1
+ {"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [384, 49152], "chunk_shape": [384, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [49152, 384], "chunk_shape": [49152, 384], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [6, 768], "chunk_shape": [6, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [6, 4, 1, 768], "chunk_shape": [6, 4, 1, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [6, 24, 32, 32], "chunk_shape": [6, 24, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [6, 768], "chunk_shape": [6, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [6, 4], "chunk_shape": [6, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [6, 2304, 4], "chunk_shape": [6, 2304, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [6, 4], "chunk_shape": [6, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [6, 2304, 4], "chunk_shape": [6, 2304, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [6, 768], "chunk_shape": [6, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [6, 768, 384], "chunk_shape": 
[6, 768, 384], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [6, 384, 1536], "chunk_shape": [6, 384, 1536], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [6, 24, 32, 32], "chunk_shape": [6, 24, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [6, 24, 32, 32], "chunk_shape": [6, 24, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [6, 384], "chunk_shape": [6, 384], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [384], "chunk_shape": [384], "ext_metadata": null}}]}
logs/10000/default/d/5887d7cf72c64115a964c1a43d591c09 ADDED
Binary file (917 Bytes). View file
 
logs/10000/default/manifest.ocdbt ADDED
Binary file (116 Bytes). View file
 
logs/10000/default/ocdbt.process_0/d/399bbb6a26e3af7391ef4ee2c557d20e ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ce179d31fddd4ce753a41434da073f55bf53e49dc2d60102dd481fbfb0a3d94
3
+ size 27783168
logs/10000/default/ocdbt.process_0/d/3a746e10f4549f4e3741b3eb6050a69f ADDED
Binary file (545 Bytes). View file
 
logs/10000/default/ocdbt.process_0/d/90175ba14c25d7ecb5c3934722be7b04 ADDED
Binary file (577 Bytes). View file
 
logs/10000/default/ocdbt.process_0/d/b1408f6a776dffd6bb175e54fecaae3e ADDED
Binary file (198 Bytes). View file
 
logs/10000/default/ocdbt.process_0/d/f957dd57345bd22cb0916bf0f123bcac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce6bbdd8ea6862906ac976a6e73c24338a0eb23989a9717626769e7a6116e4b1
3
+ size 34254848
logs/10000/default/ocdbt.process_0/manifest.ocdbt ADDED
Binary file (298 Bytes). View file
 
logs/8000/_CHECKPOINT_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761147357293181298, "commit_timestamp_nsecs": 1761147357751493968, "custom_metadata": {}}
logs/8000/default/_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [384, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [49152, 384]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 4, 1, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 24, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, 
{"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 2304, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": 
"jax.Array", "skip_deserialize": false, "write_shape": [6, 2304, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 768, 384]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 384, 1536]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 24, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", 
"key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 24, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6, 384]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [384]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
logs/8000/default/_sharding ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 
0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 
0}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [1, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}]]}}"}
logs/8000/default/array_metadatas/process_0 ADDED
@@ -0,0 +1 @@
 
 
1
+ {"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [384, 49152], "chunk_shape": [384, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [49152, 384], "chunk_shape": [49152, 384], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [6, 768], "chunk_shape": [6, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [6, 4, 1, 768], "chunk_shape": [6, 4, 1, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [6, 24, 32, 32], "chunk_shape": [6, 24, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [6, 768], "chunk_shape": [6, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [6, 4], "chunk_shape": [6, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [6, 2304, 4], "chunk_shape": [6, 2304, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [6, 4], "chunk_shape": [6, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [6, 2304, 4], "chunk_shape": [6, 2304, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [6, 768], "chunk_shape": [6, 768], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [6, 768, 384], "chunk_shape": 
[6, 768, 384], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [6, 384, 1536], "chunk_shape": [6, 384, 1536], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [6, 24, 32, 32], "chunk_shape": [6, 24, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [6, 24, 32, 32], "chunk_shape": [6, 24, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [6, 384], "chunk_shape": [6, 384], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [384], "chunk_shape": [384], "ext_metadata": null}}]}
logs/8000/default/d/e3e0a03e844d7fec1a8d8dfdb3d50284 ADDED
Binary file (915 Bytes). View file
 
logs/8000/default/manifest.ocdbt ADDED
Binary file (116 Bytes). View file
 
logs/8000/default/ocdbt.process_0/d/2f9047fa57561a8b64e87e5c02a78301 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb853d680a4c6e8609c3c98c4bb72f44212454b261ac7e86bc32bdb8bc340511
3
+ size 36864000
logs/8000/default/ocdbt.process_0/d/91152cc46185c182d6f78659e463756b ADDED
Binary file (545 Bytes). View file
 
logs/8000/default/ocdbt.process_0/d/a922960ae9f2d2c29f6fcfcf28bd34e0 ADDED
Binary file (198 Bytes). View file
 
logs/8000/default/ocdbt.process_0/d/b07075d27dbfe48c28f624fcfc889e48 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abd2cdba182e5b3e8a0728ea35145f460a3a4dd876031323eae5b2791d834a9e
3
+ size 25210880
logs/8000/default/ocdbt.process_0/d/d2bc1d0475dd12ab55ff91956dbde771 ADDED
Binary file (577 Bytes). View file
 
logs/8000/default/ocdbt.process_0/manifest.ocdbt ADDED
Binary file (302 Bytes). View file
 
logs/xLSTM-tiny-stories/events.out.tfevents.1761144595.3abd4fca6fd6.6217.0.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:650a82806804d8d6a2806c2fc54b16dbd72675eb8a21016d3016e1a5ffe4e224
3
+ size 292082
timing_summary.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_training_duration_seconds": 3404.960225803,
3
+ "total_training_duration_hours": 0.9458222849452778,
4
+ "average_epoch_duration_seconds": 673.6284810999998,
5
+ "num_epochs_completed": 5,
6
+ "num_evaluations_completed": 5
7
+ }
train_history.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_training_duration": 3404.960225803,
3
+ "avg_epoch_duration": 673.6284810999998,
4
+ "num_epochs_completed": 5,
5
+ "global_steps": 10000,
6
+ "global_optimizer_steps": 2000,
7
+ "params": {
8
+ "millions": 43.65,
9
+ "billions": 0.04
10
+ }
11
+ }
trainer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer": "HuggingFaceTB/SmolLM2-135M",
3
+ "dtype": "fp32",
4
+ "param_dtype": "bf16",
5
+ "num_train_epochs": 5,
6
+ "per_device_train_batch_size": 16,
7
+ "per_device_eval_batch_size": 16,
8
+ "gradient_accumulation_steps": 5,
9
+ "seed": 42,
10
+ "learning_rate": 0.0003,
11
+ "weight_decay": 0.01,
12
+ "adam_beta1": 0.9,
13
+ "adam_beta2": 0.999,
14
+ "warmup_ratio": 0.2,
15
+ "max_grad_norm": 1.0,
16
+ "logging_steps": 200,
17
+ "output_dir": "./artifacts/",
18
+ "logging_dir": "./artifacts/logs/",
19
+ "run_name": "train",
20
+ "best_metric_key": "perplexity",
21
+ "best_n_to_keep": 3,
22
+ "hub_model_id": "thiomajid/xLSTM-tiny-stories",
23
+ "hub_private_repo": false,
24
+ "upload_message": "xLSTM is ready for image generation",
25
+ "train_dataset_url": "roneneldan/TinyStories",
26
+ "train_subset": null,
27
+ "train_split": "train",
28
+ "train_samples": 32000,
29
+ "eval_dataset_url": "roneneldan/TinyStories",
30
+ "eval_subset": null,
31
+ "eval_split": "validation",
32
+ "eval_samples": 3200,
33
+ "dataloader_drop_last": true,
34
+ "dataloader_num_workers": 4,
35
+ "worker_buffer_size": 2,
36
+ "text_column": "text",
37
+ "use_dataset_cache": true,
38
+ "dataset_cache_dir": "./.hf_data_cache",
39
+ "mesh_shape": [
40
+ 1,
41
+ 1
42
+ ],
43
+ "axis_names": [
44
+ "dp",
45
+ "tp"
46
+ ]
47
+ }