ankitdhiman committed on
Commit c907b33 · verified · 1 Parent(s): a3dbbd0

Checkpoint step 25, loss=10.4816

added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,78 @@
+ {
+   "architectures": [
+     "GraniteMoeHybridForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "attention_multiplier": 0.015625,
+   "bos_token_id": 100257,
+   "embedding_multiplier": 12,
+   "eos_token_id": 100257,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "init_method": "mup",
+   "initializer_range": 0.1,
+   "intermediate_size": 2048,
+   "layer_types": [
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention",
+     "attention"
+   ],
+   "logits_scaling": 4,
+   "mamba_chunk_size": 256,
+   "mamba_conv_bias": true,
+   "mamba_d_conv": 4,
+   "mamba_d_head": 16,
+   "mamba_d_state": 256,
+   "mamba_expand": 2,
+   "mamba_n_groups": 1,
+   "mamba_n_heads": 128,
+   "mamba_proj_bias": false,
+   "max_position_embeddings": 32768,
+   "model_type": "granitemoehybrid",
+   "normalization_function": "rmsnorm",
+   "num_attention_heads": 16,
+   "num_experts_per_tok": 0,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "num_local_experts": 0,
+   "output_router_logits": false,
+   "pad_token_id": 100256,
+   "position_embedding_type": "rope",
+   "residual_multiplier": 0.263,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000000,
+   "router_aux_loss_coef": 0.01,
+   "shared_intermediate_size": 2048,
+   "tie_word_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.53.3",
+   "use_cache": false,
+   "vocab_size": 116481
+ }
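As a quick sanity check of the added config, a minimal sketch of loading it with transformers; the repo id below is a placeholder, not the actual repository path.

from transformers import AutoConfig

# Placeholder repo id (assumption) -- substitute the real checkpoint location.
repo_id = "ankitdhiman/<checkpoint-repo>"

config = AutoConfig.from_pretrained(repo_id)
print(config.model_type)                      # "granitemoehybrid"
print(config.num_hidden_layers)               # 28
print(config.layer_types.count("attention"))  # 28 -- every layer is an attention block

With num_local_experts and num_experts_per_tok both 0 and every layer_types entry set to "attention", this checkpoint appears to run as a dense attention-only model despite the MoE and Mamba fields in the config.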
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 100257,
+   "eos_token_id": 100257,
+   "pad_token_id": 100256,
+   "transformers_version": "4.53.3"
+ }
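A small sketch of reading these generation defaults back, assuming the same placeholder repo id; the ids mirror config.json (BOS/EOS 100257, pad 100256).

from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("ankitdhiman/<checkpoint-repo>")  # placeholder repo id
print(gen.bos_token_id, gen.eos_token_id, gen.pad_token_id)  # 100257 100257 100256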
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:741fcb4eb683060877ae6d4e155bb8d19b9113cb57422d5a1f920ec6a9e399f2
+ size 1475610400
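The weights are stored as a Git LFS pointer; a standard-library sketch for verifying a locally fetched copy against the digest and size recorded above.

import hashlib
from pathlib import Path

# Assumes model.safetensors has already been fetched (e.g. via `git lfs pull`).
weights = Path("model.safetensors")
expected_sha256 = "741fcb4eb683060877ae6d4e155bb8d19b9113cb57422d5a1f920ec6a9e399f2"
expected_size = 1475610400

sha = hashlib.sha256()
with weights.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert weights.stat().st_size == expected_size, "size mismatch"
assert sha.hexdigest() == expected_sha256, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")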
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<AUDIO_SEP>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "<|end_of_text|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|end_of_text|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|unk|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
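A sketch for confirming the special-token mapping once the tokenizer files added in this commit are loaded; the repo id is again a placeholder. Note that both BOS and EOS map to <|end_of_text|>, matching the shared id 100257 in config.json.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ankitdhiman/<checkpoint-repo>")  # placeholder repo id

print(tokenizer.bos_token)  # "<|end_of_text|>"
print(tokenizer.eos_token)  # "<|end_of_text|>"
print(tokenizer.pad_token)  # "<|pad|>"
print(tokenizer.unk_token)  # "<|unk|>"
print("<AUDIO_SEP>" in tokenizer.additional_special_tokens)  # True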
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff