Hazan-Lab
/

STU_500M

Model card · Files and versions

yagizdevre committed on Jan 29, 2025

Commit

43bd63d

·

1 Parent(s): c062128

added new model

Files changed (3) hide show

config.json +11 -8
configuration_ministu.py +1 -1
model.safetensors +2 -2

config.json CHANGED Viewed

@@ -2,10 +2,11 @@
   "model_type": "ministu",
   "_name_or_path": "STU_500M",
   "architectures": ["MiniSTU"],
-  "n_embd": 768,
   "n_heads": 8,
   "n_layers": 12,
   "seq_len": 8192,
   "window_size": 1024,
   "vocab_size": 200064,
   "mlp_scale": 12,
@@ -15,17 +16,18 @@
   "use_hankel_L": false,
   "num_epochs": 1,
   "global_bsz": 524288,
-  "bsz": 1,
   "warmup_steps": 1907,
-  "eval_period": 25,
-  "save_period": 4500,
   "max_lr": 3.0e-3,
   "min_lr": 3.0e-5,
   "max_norm": 1.0,
-  "dilation": 1,
   "fsdp": true,
   "ddp": false,
   "mixed_precision": true,
   "use_cpu_offload": false,
   "sharding_strategy": "full_shard",
   "state_dict_type": "full",
@@ -46,10 +48,11 @@
     "MLP"
   ],
   "use_activation_checkpointing": true,
-  "use_flash_fft": false,
   "use_approx": true,
   "use_attn": true,
   "softcap": 50.0,
   "torch_compile": false
-}

   "model_type": "ministu",
   "_name_or_path": "STU_500M",
   "architectures": ["MiniSTU"],
+  "n_embd": 896,
   "n_heads": 8,
   "n_layers": 12,
   "seq_len": 8192,
+  "weight_tying": true,
   "window_size": 1024,
   "vocab_size": 200064,
   "mlp_scale": 12,
   "use_hankel_L": false,
   "num_epochs": 1,
   "global_bsz": 524288,
+  "bsz": 2,
   "warmup_steps": 1907,
+  "eval_period": 50,
+  "save_period": 500,
   "max_lr": 3.0e-3,
   "min_lr": 3.0e-5,
   "max_norm": 1.0,
+  "dilation": 2,
   "fsdp": true,
   "ddp": false,
   "mixed_precision": true,
+  "torch_dtype": "bfloat16",
   "use_cpu_offload": false,
   "sharding_strategy": "full_shard",
   "state_dict_type": "full",
     "MLP"
   ],
   "use_activation_checkpointing": true,
+  "use_flash_fft": true,
   "use_approx": true,
   "use_attn": true,
   "softcap": 50.0,
+  "theta": 10000.0,
+  "use_alibi": false,
   "torch_compile": false
+}

configuration_ministu.py CHANGED Viewed

@@ -7,7 +7,7 @@ class MiniSTUConfig(PretrainedConfig):
     def __init__(
         self,
         bsz: int = 1,
-        n_embd: int = 768,
         n_heads: int = 8,
         n_layers: int = 12,
         seq_len: int = 8192,

     def __init__(
         self,
         bsz: int = 1,
+        n_embd: int = 896,
         n_heads: int = 8,
         n_layers: int = 12,
         seq_len: int = 8192,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:101763e8119c492ada9816aa38006cbf6ba8bbc0530224510d62b2c7e20a8bfd
-size 1140654808

 version https://git-lfs.github.com/spec/v1
+oid sha256:2b89bf828452423367e484d5922f23e381fe3cbcd1e9751036ed4c23b9f2af19
+size 1460045528