Epoch 0 - Val loss 0.2619

Files changed (3) hide show

README.md ADDED Viewed

+---
+license: apache-2.0
+pipeline_tag: text-generation
+tags:
+- model_hub_mixin
+- pytorch_model_hub_mixin
+---
+This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+- Code: [More Information Needed]
+- Paper: [More Information Needed]
+- Docs: [More Information Needed]

config.json ADDED Viewed

+{
+  "att_groups": 4,
+  "att_heads": 16,
+  "att_query_groups": 8,
+  "cross_att_type": "sqa",
+  "dense_layer_dim": 1536,
+  "embed_dim": 512,
+  "ff_activation": "silu",
+  "ff_dim": 192,
+  "ff_dropout": 0.0,
+  "final_stateless_layers_config": [
+    "moe",
+    "moe"
+  ],
+  "head_norm_type": "rms_norm",
+  "moe_bias_mode": "global",
+  "moe_grouped_gemm": true,
+  "moe_shared_experts_bias_mode": "global",
+  "moe_top_k": 10,
+  "moe_use_cutlass_grouped_gemm": true,
+  "moe_use_weighted_shared_experts": false,
+  "num_experts": 384,
+  "num_layers": 21,
+  "num_shared_experts": 2,
+  "rope_base": 100000,
+  "router_amp": true,
+  "self_att_type": "sqa",
+  "seq_len": 8192,
+  "shared_expert_dim": 384,
+  "stateless_layers_config": [
+    "dense",
+    "moe"
+  ],
+  "stm_size": 4096,
+  "use_attention_output_bias": false,
+  "use_flash_attention": true,
+  "use_gated": true,
+  "use_gated_attention": true,
+  "use_gated_cross_attention": false,
+  "use_head_norm": true,
+  "use_moe": true,
+  "use_vectorized_moe": true,
+  "vocab_size": 65536
+}

model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:0af3c8516bbd486450d409cd4fe041db5036e2ff9c554eb93b9122b5fef7f8e7
+size 5772284720