harshad317 committed on
Commit
2ca81ec
·
verified ·
1 Parent(s): 2860c73

Uploading the base model

Browse files
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sequence_len": 8192,
3
+ "vocab_size": 131072,
4
+ "n_layer": 16,
5
+ "n_head": 8,
6
+ "n_kv_head": 8,
7
+ "n_embd": 1024,
8
+ "moe_num_experts": 8,
9
+ "moe_top_k": 2,
10
+ "moe_layer_interval": 3,
11
+ "moe_group_size": 4,
12
+ "moe_expert_intermediate_size": 1792,
13
+ "moe_adjugate_intermediate_size": 0,
14
+ "moe_adjugate_scale": 0.05,
15
+ "moe_router_aux_loss_coef": 0.015,
16
+ "moe_router_bias_lr": 0.001,
17
+ "moe_activation_checkpoint": true,
18
+ "moe_capacity_factor": 0.75,
19
+ "rotary_scaling_type": "yarn",
20
+ "rotary_scale_factor": 4.0,
21
+ "residual_scale": -1.0,
22
+ "attn_dropout": 0.01,
23
+ "label_smoothing": 0.0,
24
+ "z_loss_weight": 0.0,
25
+ "use_flash_attention": true,
26
+ "domain_router_dim": 32,
27
+ "num_domain_tags": 128,
28
+ "domain_router_features": {
29
+ "dataset": {
30
+ "capacity": 128,
31
+ "mode": "one_hot"
32
+ },
33
+ "quality": {
34
+ "capacity": 32,
35
+ "mode": "one_hot"
36
+ },
37
+ "specialty": {
38
+ "capacity": 64,
39
+ "mode": "one_hot"
40
+ },
41
+ "modality": {
42
+ "capacity": 32,
43
+ "mode": "one_hot"
44
+ },
45
+ "language": {
46
+ "capacity": 32,
47
+ "mode": "one_hot"
48
+ },
49
+ "origin": {
50
+ "capacity": 8,
51
+ "mode": "one_hot"
52
+ }
53
+ }
54
+ }
meta.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 45444,
3
+ "val_bpb": 0.6811688407173832,
4
+ "model_config": {
5
+ "sequence_len": 8192,
6
+ "vocab_size": 131072,
7
+ "n_layer": 16,
8
+ "n_head": 8,
9
+ "n_kv_head": 8,
10
+ "n_embd": 1024,
11
+ "moe_num_experts": 8,
12
+ "moe_top_k": 2,
13
+ "moe_layer_interval": 3,
14
+ "moe_group_size": 4,
15
+ "moe_expert_intermediate_size": 1792,
16
+ "moe_adjugate_intermediate_size": 0,
17
+ "moe_adjugate_scale": 0.05,
18
+ "moe_router_aux_loss_coef": 0.015,
19
+ "moe_router_bias_lr": 0.001,
20
+ "moe_activation_checkpoint": true,
21
+ "moe_capacity_factor": 0.75,
22
+ "rotary_scaling_type": "yarn",
23
+ "rotary_scale_factor": 4.0,
24
+ "residual_scale": -1.0,
25
+ "attn_dropout": 0.01,
26
+ "label_smoothing": 0.0,
27
+ "z_loss_weight": 0.0,
28
+ "use_flash_attention": true,
29
+ "domain_router_dim": 32,
30
+ "num_domain_tags": 128,
31
+ "domain_router_features": {
32
+ "dataset": {
33
+ "capacity": 128,
34
+ "mode": "one_hot"
35
+ },
36
+ "quality": {
37
+ "capacity": 32,
38
+ "mode": "one_hot"
39
+ },
40
+ "specialty": {
41
+ "capacity": 64,
42
+ "mode": "one_hot"
43
+ },
44
+ "modality": {
45
+ "capacity": 32,
46
+ "mode": "one_hot"
47
+ },
48
+ "language": {
49
+ "capacity": 32,
50
+ "mode": "one_hot"
51
+ },
52
+ "origin": {
53
+ "capacity": 8,
54
+ "mode": "one_hot"
55
+ }
56
+ }
57
+ },
58
+ "user_config": {
59
+ "run": "continous_pretraining",
60
+ "device_type": "",
61
+ "depth": 16,
62
+ "max_seq_len": 8192,
63
+ "moe_num_experts": 8,
64
+ "moe_top_k": 2,
65
+ "moe_layer_interval": 3,
66
+ "moe_group_size": 4,
67
+ "moe_expert_intermediate_size": 1792,
68
+ "moe_adjugate_intermediate_size": 0,
69
+ "moe_adjugate_scale": 0.05,
70
+ "moe_router_aux_loss_coef": 0.015,
71
+ "moe_router_bias_lr": 0.001,
72
+ "moe_activation_checkpoint": true,
73
+ "moe_capacity_factor": 0.75,
74
+ "moe_router_bias_reset_interval": 0,
75
+ "domain_router_dim": 32,
76
+ "domain_tag_capacity": 128,
77
+ "quality_tag_capacity": 32,
78
+ "specialty_tag_capacity": 64,
79
+ "modality_tag_capacity": 32,
80
+ "language_tag_capacity": 32,
81
+ "origin_tag_capacity": 8,
82
+ "domain_router_feature_modes": "dataset:one_hot,quality:one_hot,specialty:one_hot,modality:one_hot,language:one_hot,origin:one_hot",
83
+ "rotary_scaling_type": "yarn",
84
+ "rotary_scale_factor": 4.0,
85
+ "residual_scale": -1.0,
86
+ "attn_dropout": 0.01,
87
+ "label_smoothing": 0.0,
88
+ "z_loss_weight": 0.0,
89
+ "use_flash_attention": true,
90
+ "use_distributed_muon": false,
91
+ "num_iterations": -1,
92
+ "target_flops": -1.0,
93
+ "target_param_data_ratio": 20,
94
+ "device_batch_size": 1,
95
+ "total_batch_size": 524288,
96
+ "max_grad_accum_steps": 0,
97
+ "embedding_lr": 0.2,
98
+ "unembedding_lr": 0.004,
99
+ "weight_decay": 0.0,
100
+ "freeze_embedding_optimizers": false,
101
+ "matrix_lr": 0.02,
102
+ "grad_clip": 1.0,
103
+ "warmup_ratio": 0.0,
104
+ "warmdown_ratio": 0.2,
105
+ "final_lr_frac": 0.0,
106
+ "eval_every": 500,
107
+ "eval_tokens": "[redacted]",
108
+ "core_metric_every": 1000,
109
+ "core_metric_max_per_task": 2500,
110
+ "sample_every": 1000,
111
+ "micro_eval_every": 250,
112
+ "micro_eval_tokens": "[redacted]",
113
+ "skip_initial_eval": false,
114
+ "fast_dev_run": false,
115
+ "fast_dev_num_iterations": 200,
116
+ "fast_dev_max_grad_accum_steps": 2,
117
+ "fast_dev_eval_tokens_multiplier": "[redacted]",
118
+ "model_tag": "d16_cont",
119
+ "hf_repo_id": "harshad317/base_Medical_continuous",
120
+ "hf_repo_type": "model",
121
+ "hf_path_in_repo": "",
122
+ "hf_commit_message": "Uploading the base model",
123
+ "hf_private": false,
124
+ "hf_token": "[redacted]",
125
+ "hf_max_shard_size": "2gb",
126
+ "resume_from_checkpoint": "d16",
127
+ "resume_from_step": 22722,
128
+ "resume_load_optimizer": true,
129
+ "base_dataset_num_shards": 1250,
130
+ "train_dataset_mix": "",
131
+ "train_dataset_mix_path": "configs/train_dataset_mix.json",
132
+ "loader_dedup_window": 300000,
133
+ "loader_enable_length_bucketing": true,
134
+ "loader_bucket_bins": "512,1024,2048,4096,8192",
135
+ "loader_prefetch_batches": 2,
136
+ "loader_tokenizer_threads": "[redacted]",
137
+ "loader_tokenizer_batch_size": "[redacted]",
138
+ "train_single_pass": true,
139
+ "enable_length_curriculum": false,
140
+ "sequence_length_schedule": "0.25:512,0.50:1024,0.75:2048,1.0:4096",
141
+ "length_schedule_round_to": 32,
142
+ "enable_dataset_reweighting": true,
143
+ "dataset_reweight_interval": 2000,
144
+ "dataset_reweight_strength": 0.75,
145
+ "dataset_reweight_smoothing": 0.2,
146
+ "dataset_reweight_min_multiplier": 0.25,
147
+ "dataset_reweight_warmup_steps": 4000,
148
+ "memory_profile_interval": 500,
149
+ "dataset_mix_source": "/home/jupyter/nanochat/configs/train_dataset_mix.json"
150
+ },
151
+ "device_batch_size": 1,
152
+ "max_seq_len": 8192
153
+ }
model-00001-of-00001.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a502f68c5bf94be29e88c56f45adce9cfe6e9894bc53ece50a9f76ce8f3416
3
+ size 1845827576
model.safetensors.index.json ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 1845802880,
4
+ "num_shards": 1,
5
+ "step": 45444,
6
+ "format": "safetensors_state_dict"
7
+ },
8
+ "weight_map": {
9
+ "transformer.wte.weight": "model-00001-of-00001.safetensors",
10
+ "transformer.h.0.attn.c_q.weight": "model-00001-of-00001.safetensors",
11
+ "transformer.h.0.attn.c_k.weight": "model-00001-of-00001.safetensors",
12
+ "transformer.h.0.attn.c_v.weight": "model-00001-of-00001.safetensors",
13
+ "transformer.h.0.attn.c_proj.weight": "model-00001-of-00001.safetensors",
14
+ "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
15
+ "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
16
+ "transformer.h.1.attn.c_q.weight": "model-00001-of-00001.safetensors",
17
+ "transformer.h.1.attn.c_k.weight": "model-00001-of-00001.safetensors",
18
+ "transformer.h.1.attn.c_v.weight": "model-00001-of-00001.safetensors",
19
+ "transformer.h.1.attn.c_proj.weight": "model-00001-of-00001.safetensors",
20
+ "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
21
+ "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
22
+ "transformer.h.2.attn.c_q.weight": "model-00001-of-00001.safetensors",
23
+ "transformer.h.2.attn.c_k.weight": "model-00001-of-00001.safetensors",
24
+ "transformer.h.2.attn.c_v.weight": "model-00001-of-00001.safetensors",
25
+ "transformer.h.2.attn.c_proj.weight": "model-00001-of-00001.safetensors",
26
+ "transformer.h.2.mlp.router_bias": "model-00001-of-00001.safetensors",
27
+ "transformer.h.2.mlp.uniform_load": "model-00001-of-00001.safetensors",
28
+ "transformer.h.2.mlp.router.weight": "model-00001-of-00001.safetensors",
29
+ "transformer.h.2.mlp.experts.0.w1.weight": "model-00001-of-00001.safetensors",
30
+ "transformer.h.2.mlp.experts.0.w2.weight": "model-00001-of-00001.safetensors",
31
+ "transformer.h.2.mlp.experts.1.w1.weight": "model-00001-of-00001.safetensors",
32
+ "transformer.h.2.mlp.experts.1.w2.weight": "model-00001-of-00001.safetensors",
33
+ "transformer.h.2.mlp.experts.2.w1.weight": "model-00001-of-00001.safetensors",
34
+ "transformer.h.2.mlp.experts.2.w2.weight": "model-00001-of-00001.safetensors",
35
+ "transformer.h.2.mlp.experts.3.w1.weight": "model-00001-of-00001.safetensors",
36
+ "transformer.h.2.mlp.experts.3.w2.weight": "model-00001-of-00001.safetensors",
37
+ "transformer.h.2.mlp.experts.4.w1.weight": "model-00001-of-00001.safetensors",
38
+ "transformer.h.2.mlp.experts.4.w2.weight": "model-00001-of-00001.safetensors",
39
+ "transformer.h.2.mlp.experts.5.w1.weight": "model-00001-of-00001.safetensors",
40
+ "transformer.h.2.mlp.experts.5.w2.weight": "model-00001-of-00001.safetensors",
41
+ "transformer.h.2.mlp.experts.6.w1.weight": "model-00001-of-00001.safetensors",
42
+ "transformer.h.2.mlp.experts.6.w2.weight": "model-00001-of-00001.safetensors",
43
+ "transformer.h.2.mlp.experts.7.w1.weight": "model-00001-of-00001.safetensors",
44
+ "transformer.h.2.mlp.experts.7.w2.weight": "model-00001-of-00001.safetensors",
45
+ "transformer.h.2.mlp.adjugate_experts.0.w1.weight": "model-00001-of-00001.safetensors",
46
+ "transformer.h.2.mlp.adjugate_experts.0.w2.weight": "model-00001-of-00001.safetensors",
47
+ "transformer.h.2.mlp.adjugate_experts.1.w1.weight": "model-00001-of-00001.safetensors",
48
+ "transformer.h.2.mlp.adjugate_experts.1.w2.weight": "model-00001-of-00001.safetensors",
49
+ "transformer.h.2.mlp.router_context_proj.weight": "model-00001-of-00001.safetensors",
50
+ "transformer.h.2.mlp.router_context_scale_proj.weight": "model-00001-of-00001.safetensors",
51
+ "transformer.h.2.mlp.router_context_selection_proj.weight": "model-00001-of-00001.safetensors",
52
+ "transformer.h.3.attn.c_q.weight": "model-00001-of-00001.safetensors",
53
+ "transformer.h.3.attn.c_k.weight": "model-00001-of-00001.safetensors",
54
+ "transformer.h.3.attn.c_v.weight": "model-00001-of-00001.safetensors",
55
+ "transformer.h.3.attn.c_proj.weight": "model-00001-of-00001.safetensors",
56
+ "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
57
+ "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
58
+ "transformer.h.4.attn.c_q.weight": "model-00001-of-00001.safetensors",
59
+ "transformer.h.4.attn.c_k.weight": "model-00001-of-00001.safetensors",
60
+ "transformer.h.4.attn.c_v.weight": "model-00001-of-00001.safetensors",
61
+ "transformer.h.4.attn.c_proj.weight": "model-00001-of-00001.safetensors",
62
+ "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
63
+ "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
64
+ "transformer.h.5.attn.c_q.weight": "model-00001-of-00001.safetensors",
65
+ "transformer.h.5.attn.c_k.weight": "model-00001-of-00001.safetensors",
66
+ "transformer.h.5.attn.c_v.weight": "model-00001-of-00001.safetensors",
67
+ "transformer.h.5.attn.c_proj.weight": "model-00001-of-00001.safetensors",
68
+ "transformer.h.5.mlp.router_bias": "model-00001-of-00001.safetensors",
69
+ "transformer.h.5.mlp.uniform_load": "model-00001-of-00001.safetensors",
70
+ "transformer.h.5.mlp.router.weight": "model-00001-of-00001.safetensors",
71
+ "transformer.h.5.mlp.experts.0.w1.weight": "model-00001-of-00001.safetensors",
72
+ "transformer.h.5.mlp.experts.0.w2.weight": "model-00001-of-00001.safetensors",
73
+ "transformer.h.5.mlp.experts.1.w1.weight": "model-00001-of-00001.safetensors",
74
+ "transformer.h.5.mlp.experts.1.w2.weight": "model-00001-of-00001.safetensors",
75
+ "transformer.h.5.mlp.experts.2.w1.weight": "model-00001-of-00001.safetensors",
76
+ "transformer.h.5.mlp.experts.2.w2.weight": "model-00001-of-00001.safetensors",
77
+ "transformer.h.5.mlp.experts.3.w1.weight": "model-00001-of-00001.safetensors",
78
+ "transformer.h.5.mlp.experts.3.w2.weight": "model-00001-of-00001.safetensors",
79
+ "transformer.h.5.mlp.experts.4.w1.weight": "model-00001-of-00001.safetensors",
80
+ "transformer.h.5.mlp.experts.4.w2.weight": "model-00001-of-00001.safetensors",
81
+ "transformer.h.5.mlp.experts.5.w1.weight": "model-00001-of-00001.safetensors",
82
+ "transformer.h.5.mlp.experts.5.w2.weight": "model-00001-of-00001.safetensors",
83
+ "transformer.h.5.mlp.experts.6.w1.weight": "model-00001-of-00001.safetensors",
84
+ "transformer.h.5.mlp.experts.6.w2.weight": "model-00001-of-00001.safetensors",
85
+ "transformer.h.5.mlp.experts.7.w1.weight": "model-00001-of-00001.safetensors",
86
+ "transformer.h.5.mlp.experts.7.w2.weight": "model-00001-of-00001.safetensors",
87
+ "transformer.h.5.mlp.adjugate_experts.0.w1.weight": "model-00001-of-00001.safetensors",
88
+ "transformer.h.5.mlp.adjugate_experts.0.w2.weight": "model-00001-of-00001.safetensors",
89
+ "transformer.h.5.mlp.adjugate_experts.1.w1.weight": "model-00001-of-00001.safetensors",
90
+ "transformer.h.5.mlp.adjugate_experts.1.w2.weight": "model-00001-of-00001.safetensors",
91
+ "transformer.h.5.mlp.router_context_proj.weight": "model-00001-of-00001.safetensors",
92
+ "transformer.h.5.mlp.router_context_scale_proj.weight": "model-00001-of-00001.safetensors",
93
+ "transformer.h.5.mlp.router_context_selection_proj.weight": "model-00001-of-00001.safetensors",
94
+ "transformer.h.6.attn.c_q.weight": "model-00001-of-00001.safetensors",
95
+ "transformer.h.6.attn.c_k.weight": "model-00001-of-00001.safetensors",
96
+ "transformer.h.6.attn.c_v.weight": "model-00001-of-00001.safetensors",
97
+ "transformer.h.6.attn.c_proj.weight": "model-00001-of-00001.safetensors",
98
+ "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
99
+ "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
100
+ "transformer.h.7.attn.c_q.weight": "model-00001-of-00001.safetensors",
101
+ "transformer.h.7.attn.c_k.weight": "model-00001-of-00001.safetensors",
102
+ "transformer.h.7.attn.c_v.weight": "model-00001-of-00001.safetensors",
103
+ "transformer.h.7.attn.c_proj.weight": "model-00001-of-00001.safetensors",
104
+ "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
105
+ "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
106
+ "transformer.h.8.attn.c_q.weight": "model-00001-of-00001.safetensors",
107
+ "transformer.h.8.attn.c_k.weight": "model-00001-of-00001.safetensors",
108
+ "transformer.h.8.attn.c_v.weight": "model-00001-of-00001.safetensors",
109
+ "transformer.h.8.attn.c_proj.weight": "model-00001-of-00001.safetensors",
110
+ "transformer.h.8.mlp.router_bias": "model-00001-of-00001.safetensors",
111
+ "transformer.h.8.mlp.uniform_load": "model-00001-of-00001.safetensors",
112
+ "transformer.h.8.mlp.router.weight": "model-00001-of-00001.safetensors",
113
+ "transformer.h.8.mlp.experts.0.w1.weight": "model-00001-of-00001.safetensors",
114
+ "transformer.h.8.mlp.experts.0.w2.weight": "model-00001-of-00001.safetensors",
115
+ "transformer.h.8.mlp.experts.1.w1.weight": "model-00001-of-00001.safetensors",
116
+ "transformer.h.8.mlp.experts.1.w2.weight": "model-00001-of-00001.safetensors",
117
+ "transformer.h.8.mlp.experts.2.w1.weight": "model-00001-of-00001.safetensors",
118
+ "transformer.h.8.mlp.experts.2.w2.weight": "model-00001-of-00001.safetensors",
119
+ "transformer.h.8.mlp.experts.3.w1.weight": "model-00001-of-00001.safetensors",
120
+ "transformer.h.8.mlp.experts.3.w2.weight": "model-00001-of-00001.safetensors",
121
+ "transformer.h.8.mlp.experts.4.w1.weight": "model-00001-of-00001.safetensors",
122
+ "transformer.h.8.mlp.experts.4.w2.weight": "model-00001-of-00001.safetensors",
123
+ "transformer.h.8.mlp.experts.5.w1.weight": "model-00001-of-00001.safetensors",
124
+ "transformer.h.8.mlp.experts.5.w2.weight": "model-00001-of-00001.safetensors",
125
+ "transformer.h.8.mlp.experts.6.w1.weight": "model-00001-of-00001.safetensors",
126
+ "transformer.h.8.mlp.experts.6.w2.weight": "model-00001-of-00001.safetensors",
127
+ "transformer.h.8.mlp.experts.7.w1.weight": "model-00001-of-00001.safetensors",
128
+ "transformer.h.8.mlp.experts.7.w2.weight": "model-00001-of-00001.safetensors",
129
+ "transformer.h.8.mlp.adjugate_experts.0.w1.weight": "model-00001-of-00001.safetensors",
130
+ "transformer.h.8.mlp.adjugate_experts.0.w2.weight": "model-00001-of-00001.safetensors",
131
+ "transformer.h.8.mlp.adjugate_experts.1.w1.weight": "model-00001-of-00001.safetensors",
132
+ "transformer.h.8.mlp.adjugate_experts.1.w2.weight": "model-00001-of-00001.safetensors",
133
+ "transformer.h.8.mlp.router_context_proj.weight": "model-00001-of-00001.safetensors",
134
+ "transformer.h.8.mlp.router_context_scale_proj.weight": "model-00001-of-00001.safetensors",
135
+ "transformer.h.8.mlp.router_context_selection_proj.weight": "model-00001-of-00001.safetensors",
136
+ "transformer.h.9.attn.c_q.weight": "model-00001-of-00001.safetensors",
137
+ "transformer.h.9.attn.c_k.weight": "model-00001-of-00001.safetensors",
138
+ "transformer.h.9.attn.c_v.weight": "model-00001-of-00001.safetensors",
139
+ "transformer.h.9.attn.c_proj.weight": "model-00001-of-00001.safetensors",
140
+ "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
141
+ "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
142
+ "transformer.h.10.attn.c_q.weight": "model-00001-of-00001.safetensors",
143
+ "transformer.h.10.attn.c_k.weight": "model-00001-of-00001.safetensors",
144
+ "transformer.h.10.attn.c_v.weight": "model-00001-of-00001.safetensors",
145
+ "transformer.h.10.attn.c_proj.weight": "model-00001-of-00001.safetensors",
146
+ "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
147
+ "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
148
+ "transformer.h.11.attn.c_q.weight": "model-00001-of-00001.safetensors",
149
+ "transformer.h.11.attn.c_k.weight": "model-00001-of-00001.safetensors",
150
+ "transformer.h.11.attn.c_v.weight": "model-00001-of-00001.safetensors",
151
+ "transformer.h.11.attn.c_proj.weight": "model-00001-of-00001.safetensors",
152
+ "transformer.h.11.mlp.router_bias": "model-00001-of-00001.safetensors",
153
+ "transformer.h.11.mlp.uniform_load": "model-00001-of-00001.safetensors",
154
+ "transformer.h.11.mlp.router.weight": "model-00001-of-00001.safetensors",
155
+ "transformer.h.11.mlp.experts.0.w1.weight": "model-00001-of-00001.safetensors",
156
+ "transformer.h.11.mlp.experts.0.w2.weight": "model-00001-of-00001.safetensors",
157
+ "transformer.h.11.mlp.experts.1.w1.weight": "model-00001-of-00001.safetensors",
158
+ "transformer.h.11.mlp.experts.1.w2.weight": "model-00001-of-00001.safetensors",
159
+ "transformer.h.11.mlp.experts.2.w1.weight": "model-00001-of-00001.safetensors",
160
+ "transformer.h.11.mlp.experts.2.w2.weight": "model-00001-of-00001.safetensors",
161
+ "transformer.h.11.mlp.experts.3.w1.weight": "model-00001-of-00001.safetensors",
162
+ "transformer.h.11.mlp.experts.3.w2.weight": "model-00001-of-00001.safetensors",
163
+ "transformer.h.11.mlp.experts.4.w1.weight": "model-00001-of-00001.safetensors",
164
+ "transformer.h.11.mlp.experts.4.w2.weight": "model-00001-of-00001.safetensors",
165
+ "transformer.h.11.mlp.experts.5.w1.weight": "model-00001-of-00001.safetensors",
166
+ "transformer.h.11.mlp.experts.5.w2.weight": "model-00001-of-00001.safetensors",
167
+ "transformer.h.11.mlp.experts.6.w1.weight": "model-00001-of-00001.safetensors",
168
+ "transformer.h.11.mlp.experts.6.w2.weight": "model-00001-of-00001.safetensors",
169
+ "transformer.h.11.mlp.experts.7.w1.weight": "model-00001-of-00001.safetensors",
170
+ "transformer.h.11.mlp.experts.7.w2.weight": "model-00001-of-00001.safetensors",
171
+ "transformer.h.11.mlp.adjugate_experts.0.w1.weight": "model-00001-of-00001.safetensors",
172
+ "transformer.h.11.mlp.adjugate_experts.0.w2.weight": "model-00001-of-00001.safetensors",
173
+ "transformer.h.11.mlp.adjugate_experts.1.w1.weight": "model-00001-of-00001.safetensors",
174
+ "transformer.h.11.mlp.adjugate_experts.1.w2.weight": "model-00001-of-00001.safetensors",
175
+ "transformer.h.11.mlp.router_context_proj.weight": "model-00001-of-00001.safetensors",
176
+ "transformer.h.11.mlp.router_context_scale_proj.weight": "model-00001-of-00001.safetensors",
177
+ "transformer.h.11.mlp.router_context_selection_proj.weight": "model-00001-of-00001.safetensors",
178
+ "transformer.h.12.attn.c_q.weight": "model-00001-of-00001.safetensors",
179
+ "transformer.h.12.attn.c_k.weight": "model-00001-of-00001.safetensors",
180
+ "transformer.h.12.attn.c_v.weight": "model-00001-of-00001.safetensors",
181
+ "transformer.h.12.attn.c_proj.weight": "model-00001-of-00001.safetensors",
182
+ "transformer.h.12.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
183
+ "transformer.h.12.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
184
+ "transformer.h.13.attn.c_q.weight": "model-00001-of-00001.safetensors",
185
+ "transformer.h.13.attn.c_k.weight": "model-00001-of-00001.safetensors",
186
+ "transformer.h.13.attn.c_v.weight": "model-00001-of-00001.safetensors",
187
+ "transformer.h.13.attn.c_proj.weight": "model-00001-of-00001.safetensors",
188
+ "transformer.h.13.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
189
+ "transformer.h.13.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
190
+ "transformer.h.14.attn.c_q.weight": "model-00001-of-00001.safetensors",
191
+ "transformer.h.14.attn.c_k.weight": "model-00001-of-00001.safetensors",
192
+ "transformer.h.14.attn.c_v.weight": "model-00001-of-00001.safetensors",
193
+ "transformer.h.14.attn.c_proj.weight": "model-00001-of-00001.safetensors",
194
+ "transformer.h.14.mlp.router_bias": "model-00001-of-00001.safetensors",
195
+ "transformer.h.14.mlp.uniform_load": "model-00001-of-00001.safetensors",
196
+ "transformer.h.14.mlp.router.weight": "model-00001-of-00001.safetensors",
197
+ "transformer.h.14.mlp.experts.0.w1.weight": "model-00001-of-00001.safetensors",
198
+ "transformer.h.14.mlp.experts.0.w2.weight": "model-00001-of-00001.safetensors",
199
+ "transformer.h.14.mlp.experts.1.w1.weight": "model-00001-of-00001.safetensors",
200
+ "transformer.h.14.mlp.experts.1.w2.weight": "model-00001-of-00001.safetensors",
201
+ "transformer.h.14.mlp.experts.2.w1.weight": "model-00001-of-00001.safetensors",
202
+ "transformer.h.14.mlp.experts.2.w2.weight": "model-00001-of-00001.safetensors",
203
+ "transformer.h.14.mlp.experts.3.w1.weight": "model-00001-of-00001.safetensors",
204
+ "transformer.h.14.mlp.experts.3.w2.weight": "model-00001-of-00001.safetensors",
205
+ "transformer.h.14.mlp.experts.4.w1.weight": "model-00001-of-00001.safetensors",
206
+ "transformer.h.14.mlp.experts.4.w2.weight": "model-00001-of-00001.safetensors",
207
+ "transformer.h.14.mlp.experts.5.w1.weight": "model-00001-of-00001.safetensors",
208
+ "transformer.h.14.mlp.experts.5.w2.weight": "model-00001-of-00001.safetensors",
209
+ "transformer.h.14.mlp.experts.6.w1.weight": "model-00001-of-00001.safetensors",
210
+ "transformer.h.14.mlp.experts.6.w2.weight": "model-00001-of-00001.safetensors",
211
+ "transformer.h.14.mlp.experts.7.w1.weight": "model-00001-of-00001.safetensors",
212
+ "transformer.h.14.mlp.experts.7.w2.weight": "model-00001-of-00001.safetensors",
213
+ "transformer.h.14.mlp.adjugate_experts.0.w1.weight": "model-00001-of-00001.safetensors",
214
+ "transformer.h.14.mlp.adjugate_experts.0.w2.weight": "model-00001-of-00001.safetensors",
215
+ "transformer.h.14.mlp.adjugate_experts.1.w1.weight": "model-00001-of-00001.safetensors",
216
+ "transformer.h.14.mlp.adjugate_experts.1.w2.weight": "model-00001-of-00001.safetensors",
217
+ "transformer.h.14.mlp.router_context_proj.weight": "model-00001-of-00001.safetensors",
218
+ "transformer.h.14.mlp.router_context_scale_proj.weight": "model-00001-of-00001.safetensors",
219
+ "transformer.h.14.mlp.router_context_selection_proj.weight": "model-00001-of-00001.safetensors",
220
+ "transformer.h.15.attn.c_q.weight": "model-00001-of-00001.safetensors",
221
+ "transformer.h.15.attn.c_k.weight": "model-00001-of-00001.safetensors",
222
+ "transformer.h.15.attn.c_v.weight": "model-00001-of-00001.safetensors",
223
+ "transformer.h.15.attn.c_proj.weight": "model-00001-of-00001.safetensors",
224
+ "transformer.h.15.mlp.c_fc.weight": "model-00001-of-00001.safetensors",
225
+ "transformer.h.15.mlp.c_proj.weight": "model-00001-of-00001.safetensors",
226
+ "lm_head.weight": "model-00001-of-00001.safetensors"
227
+ }
228
+ }
token_bytes.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:657d33e4365804c866b1c9e8c065c0f870428a0ece1f3b0771533dd6a0dc6076
3
+ size 525865
tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d2e58be3e2ec81124dbf4941bbab6bd0715eca1a34e5a773953b60d6369e750
3
+ size 1846082