Epoch 0 - Val loss 0.2619

Files changed (3) hide show

README.md ADDED Viewed

+---
+license: apache-2.0
+tags:
+- model_hub_mixin
+- pytorch_model_hub_mixin
+---
+This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration. Further details:
+- Code: [More Information Needed]
+- Paper: [More Information Needed]
+- Docs: [More Information Needed]

config.json ADDED Viewed

+{
+  "att_dropout": 0.0,
+  "att_experts": null,
+  "att_groups": 8,
+  "att_heads": 16,
+  "att_query_experts": null,
+  "att_query_groups": 8,
+  "att_type": "sqa",
+  "debug_interval": 10,
+  "debug_mode": false,
+  "embed_dim": 512,
+  "interlayer_att_dropout": 0.0,
+  "interlayer_att_experts": null,
+  "interlayer_att_groups": 8,
+  "interlayer_att_query_experts": null,
+  "interlayer_att_query_groups": 8,
+  "interlayer_att_type": "sqa",
+  "norm_decay": 0.9,
+  "norm_init_gate": -2.0,
+  "norm_per_dim_scale": false,
+  "norm_type": "classic-rms",
+  "num_groups": 3,
+  "num_layers": 21,
+  "residual_gate_init": 3.0,
+  "residual_gate_slot_status_type": "mean",
+  "residual_gate_type": "elementwise",
+  "residual_per_slot_gate": true,
+  "rope_base": 100000,
+  "seq_len": 8192,
+  "stm_size": 4096,
+  "use_flash_attention": true,
+  "use_gated_residual": true,
+  "use_tanh_residual_gate": false
+}

model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:5db2d49e47a5ca77bd7cfd0e69665b32119c3c15868e1395f7169538a5018a9b
+size 133048240