levossadtchi committed on
Commit
671cd4e
·
verified ·
1 Parent(s): 8efc7a6

Delete stage_2

Browse files
stage_2/checkpoint-250M/config.json DELETED
@@ -1,62 +0,0 @@
1
- {
2
- "_sliding_window_pattern": 6,
3
- "architectures": [
4
- "Gemma3ForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "attn_logit_softcapping": null,
9
- "bos_token_id": 2,
10
- "dtype": "bfloat16",
11
- "eos_token_id": 1,
12
- "final_logit_softcapping": null,
13
- "head_dim": 256,
14
- "hidden_activation": "gelu_pytorch_tanh",
15
- "hidden_size": 640,
16
- "initializer_range": 0.02,
17
- "intermediate_size": 2048,
18
- "layer_types": [
19
- "sliding_attention",
20
- "sliding_attention",
21
- "sliding_attention",
22
- "sliding_attention",
23
- "sliding_attention",
24
- "full_attention",
25
- "sliding_attention",
26
- "sliding_attention",
27
- "sliding_attention",
28
- "sliding_attention",
29
- "sliding_attention",
30
- "full_attention",
31
- "sliding_attention",
32
- "sliding_attention",
33
- "sliding_attention",
34
- "sliding_attention",
35
- "sliding_attention",
36
- "full_attention"
37
- ],
38
- "max_position_embeddings": 32768,
39
- "model_type": "gemma3_text",
40
- "num_attention_heads": 4,
41
- "num_hidden_layers": 18,
42
- "num_key_value_heads": 1,
43
- "pad_token_id": 0,
44
- "query_pre_attn_scalar": 256,
45
- "rms_norm_eps": 1e-06,
46
- "rope_parameters": {
47
- "full_attention": {
48
- "rope_theta": 1000000.0,
49
- "rope_type": "default"
50
- },
51
- "sliding_attention": {
52
- "rope_theta": 10000.0,
53
- "rope_type": "default"
54
- }
55
- },
56
- "sliding_window": 512,
57
- "tie_word_embeddings": true,
58
- "transformers_version": "5.3.0",
59
- "use_bidirectional_attention": false,
60
- "use_cache": true,
61
- "vocab_size": 262144
62
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
stage_2/checkpoint-250M/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9880d5e22527cb8ad15a984ce7cafcb4e34e900bdf168caff05b1f1021c3bd41
3
- size 465508967
 
 
 
 
stage_2/checkpoint-250M/state.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "stage": 2,
3
- "global_step": 30518,
4
- "total_tokens_seen": 250003456,
5
- "shard_index": 0,
6
- "shard_name": "stage_2/shard_2_0000.bin",
7
- "position_in_shard": 249995264,
8
- "checkpoint_name": "checkpoint-250M",
9
- "checkpoint_hf_path": "stage_2/checkpoint-250M",
10
- "timestamp": "2026-03-09T22:48:20.953928"
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
stage_2/checkpoint-500M/config.json DELETED
@@ -1,62 +0,0 @@
1
- {
2
- "_sliding_window_pattern": 6,
3
- "architectures": [
4
- "Gemma3ForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "attn_logit_softcapping": null,
9
- "bos_token_id": 2,
10
- "dtype": "bfloat16",
11
- "eos_token_id": 1,
12
- "final_logit_softcapping": null,
13
- "head_dim": 256,
14
- "hidden_activation": "gelu_pytorch_tanh",
15
- "hidden_size": 640,
16
- "initializer_range": 0.02,
17
- "intermediate_size": 2048,
18
- "layer_types": [
19
- "sliding_attention",
20
- "sliding_attention",
21
- "sliding_attention",
22
- "sliding_attention",
23
- "sliding_attention",
24
- "full_attention",
25
- "sliding_attention",
26
- "sliding_attention",
27
- "sliding_attention",
28
- "sliding_attention",
29
- "sliding_attention",
30
- "full_attention",
31
- "sliding_attention",
32
- "sliding_attention",
33
- "sliding_attention",
34
- "sliding_attention",
35
- "sliding_attention",
36
- "full_attention"
37
- ],
38
- "max_position_embeddings": 32768,
39
- "model_type": "gemma3_text",
40
- "num_attention_heads": 4,
41
- "num_hidden_layers": 18,
42
- "num_key_value_heads": 1,
43
- "pad_token_id": 0,
44
- "query_pre_attn_scalar": 256,
45
- "rms_norm_eps": 1e-06,
46
- "rope_parameters": {
47
- "full_attention": {
48
- "rope_theta": 1000000.0,
49
- "rope_type": "default"
50
- },
51
- "sliding_attention": {
52
- "rope_theta": 10000.0,
53
- "rope_type": "default"
54
- }
55
- },
56
- "sliding_window": 512,
57
- "tie_word_embeddings": true,
58
- "transformers_version": "5.3.0",
59
- "use_bidirectional_attention": false,
60
- "use_cache": true,
61
- "vocab_size": 262144
62
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
stage_2/checkpoint-500M/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:33959ba9cd853aaca95ea7c825efc63c0a39b64d4fd79d7de934689efe6bf286
3
- size 465508967
 
 
 
 
stage_2/checkpoint-500M/state.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "stage": 2,
3
- "global_step": 61036,
4
- "total_tokens_seen": 500004864,
5
- "shard_index": 1,
6
- "shard_name": "stage_2/shard_2_0001.bin",
7
- "position_in_shard": 231563264,
8
- "checkpoint_name": "checkpoint-500M",
9
- "checkpoint_hf_path": "stage_2/checkpoint-500M",
10
- "timestamp": "2026-03-10T00:21:20.782890"
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
stage_2/checkpoint-750M/config.json DELETED
@@ -1,62 +0,0 @@
1
- {
2
- "_sliding_window_pattern": 6,
3
- "architectures": [
4
- "Gemma3ForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "attn_logit_softcapping": null,
9
- "bos_token_id": 2,
10
- "dtype": "bfloat16",
11
- "eos_token_id": 1,
12
- "final_logit_softcapping": null,
13
- "head_dim": 256,
14
- "hidden_activation": "gelu_pytorch_tanh",
15
- "hidden_size": 640,
16
- "initializer_range": 0.02,
17
- "intermediate_size": 2048,
18
- "layer_types": [
19
- "sliding_attention",
20
- "sliding_attention",
21
- "sliding_attention",
22
- "sliding_attention",
23
- "sliding_attention",
24
- "full_attention",
25
- "sliding_attention",
26
- "sliding_attention",
27
- "sliding_attention",
28
- "sliding_attention",
29
- "sliding_attention",
30
- "full_attention",
31
- "sliding_attention",
32
- "sliding_attention",
33
- "sliding_attention",
34
- "sliding_attention",
35
- "sliding_attention",
36
- "full_attention"
37
- ],
38
- "max_position_embeddings": 32768,
39
- "model_type": "gemma3_text",
40
- "num_attention_heads": 4,
41
- "num_hidden_layers": 18,
42
- "num_key_value_heads": 1,
43
- "pad_token_id": 0,
44
- "query_pre_attn_scalar": 256,
45
- "rms_norm_eps": 1e-06,
46
- "rope_parameters": {
47
- "full_attention": {
48
- "rope_theta": 1000000.0,
49
- "rope_type": "default"
50
- },
51
- "sliding_attention": {
52
- "rope_theta": 10000.0,
53
- "rope_type": "default"
54
- }
55
- },
56
- "sliding_window": 512,
57
- "tie_word_embeddings": true,
58
- "transformers_version": "5.3.0",
59
- "use_bidirectional_attention": false,
60
- "use_cache": true,
61
- "vocab_size": 262144
62
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
stage_2/checkpoint-750M/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8af8a6bac7e34b6f41bf36302eb1ef338a4c5e5007ae07d6749d53efe90773d
3
- size 465508967
 
 
 
 
stage_2/checkpoint-750M/state.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "stage": 2,
3
- "global_step": 91554,
4
- "total_tokens_seen": 750006272,
5
- "shard_index": 2,
6
- "shard_name": "stage_2/shard_2_0002.bin",
7
- "position_in_shard": 213131264,
8
- "checkpoint_name": "checkpoint-750M",
9
- "checkpoint_hf_path": "stage_2/checkpoint-750M",
10
- "timestamp": "2026-03-10T01:56:34.856190"
11
- }