bart1259 committed on
Commit
e8f9707
·
verified ·
1 Parent(s): 4e321bc

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -8,11 +8,11 @@ Chain of Thought (CoT) transformer model trained to do multi-step integer arithm
8
 
9
  Model details:
10
  - **Vocabulary Size**: 40 (Character Tokenization)
11
- - **Layer Count**: 8
12
  - **Attention Head Count**: 4
13
- - **Residual Stream Size**: 256
14
  - **Context Length**: 256
15
- - **Tokens Trained on**: 419,612,160
16
 
17
  Training Score During Training
18
 
 
8
 
9
  Model details:
10
  - **Vocabulary Size**: 40 (Character Tokenization)
11
+ - **Layer Count**: 12
12
  - **Attention Head Count**: 4
13
+ - **Residual Stream Size**: 512
14
  - **Context Length**: 256
15
+ - **Tokens Trained on**: 1,214,656,000
16
 
17
  Training Score During Training
18
 
config.json CHANGED
@@ -4,17 +4,18 @@
4
  ],
5
  "attention_dropout": 0.0,
6
  "bos_token_id": null,
 
7
  "embd_pdrop": 0.0,
8
  "eos_token_id": null,
9
  "hidden_act": "gelu_new",
10
- "hidden_size": 256,
11
  "initializer_range": 0.02,
12
- "intermediate_size": 1024,
13
  "layer_norm_eps": 1e-05,
14
  "max_position_embeddings": 256,
15
  "model_type": "phi",
16
  "num_attention_heads": 4,
17
- "num_hidden_layers": 8,
18
  "num_key_value_heads": 4,
19
  "partial_rotary_factor": 0.5,
20
  "qk_layernorm": false,
@@ -22,8 +23,7 @@
22
  "rope_scaling": null,
23
  "rope_theta": 10000.0,
24
  "tie_word_embeddings": false,
25
- "torch_dtype": "float32",
26
- "transformers_version": "4.53.0",
27
  "use_cache": true,
28
- "vocab_size": 40
29
  }
 
4
  ],
5
  "attention_dropout": 0.0,
6
  "bos_token_id": null,
7
+ "dtype": "float32",
8
  "embd_pdrop": 0.0,
9
  "eos_token_id": null,
10
  "hidden_act": "gelu_new",
11
+ "hidden_size": 512,
12
  "initializer_range": 0.02,
13
+ "intermediate_size": 2048,
14
  "layer_norm_eps": 1e-05,
15
  "max_position_embeddings": 256,
16
  "model_type": "phi",
17
  "num_attention_heads": 4,
18
+ "num_hidden_layers": 12,
19
  "num_key_value_heads": 4,
20
  "partial_rotary_factor": 0.5,
21
  "qk_layernorm": false,
 
23
  "rope_scaling": null,
24
  "rope_theta": 10000.0,
25
  "tie_word_embeddings": false,
26
+ "transformers_version": "4.57.1",
 
27
  "use_cache": true,
28
+ "vocab_size": 45
29
  }
generation_config.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
  "_from_model_config": true,
3
- "transformers_version": "4.53.0"
4
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "transformers_version": "4.57.1"
4
  }
hyperparameters.json CHANGED
@@ -1 +1 @@
1
- {"MIN_DIFFICULTY": 2, "MAX_DIFFICULTY": 4, "TRAINING_SAMPLES": 3000000, "CONTEXT_LENGTH": 256, "RESIDUAL_EMBEDDING_SIZE": 256, "MLP_EMBEDDING_SIZE": 1024, "NUM_ATTENTION_HEADS": 4, "NUM_LAYERS": 8, "VOCAB_SIZE": 40, "TOTAL_TOKENS": 419612160}
 
1
+ {"MIN_DIFFICULTY": 2, "MAX_DIFFICULTY": 6, "TRAINING_SAMPLES": 3000000, "CONTEXT_LENGTH": 256, "RESIDUAL_EMBEDDING_SIZE": 512, "MLP_EMBEDDING_SIZE": 2048, "NUM_ATTENTION_HEADS": 4, "NUM_LAYERS": 12, "VOCAB_SIZE": 40, "TOTAL_TOKENS": 1214656000, "EPOCHS": 2}
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80470ec0e1be3e0e2ae474707fdd150d02d666915dfaa5d54528418c03e7da66
3
- size 25352072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84cbed7983068ea560ef90a7f02e1408c244eb4e18315d3640ff1cd0114f6f80
3
+ size 151471932
score.png CHANGED
tokenizer.json CHANGED
@@ -115,9 +115,9 @@
115
  "Ġ": 39,
116
  "Ġ-": 40,
117
  "(-": 41,
118
- "Ġ1": 42,
119
- "St": 43,
120
- "ep": 44
121
  },
122
  "merges": [
123
  [
@@ -130,15 +130,15 @@
130
  ],
131
  [
132
  "Ġ",
133
- "1"
134
  ],
135
  [
136
- "S",
137
- "t"
138
  ],
139
  [
140
- "e",
141
- "p"
142
  ]
143
  ]
144
  }
 
115
  "Ġ": 39,
116
  "Ġ-": 40,
117
  "(-": 41,
118
+ "Ġ+": 42,
119
+ "Ġ*": 43,
120
+ "Ġ1": 44
121
  },
122
  "merges": [
123
  [
 
130
  ],
131
  [
132
  "Ġ",
133
+ "+"
134
  ],
135
  [
136
+ "Ġ",
137
+ "*"
138
  ],
139
  [
140
+ "Ġ",
141
+ "1"
142
  ]
143
  ]
144
  }