qxsecureserver committed on
Commit
9a1c626
·
verified ·
1 Parent(s): af7732b

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ logs.jsonl filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/OmegaNeo-8b",
3
+ "architectures": [
4
+ "OmegaNeoForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_omeganeo.OmegaNeoConfig",
8
+ "AutoModel": "modeling_omeganeo.OmegaNeoModel",
9
+ "AutoModelForCausalLM": "modeling_omeganeo.OmegaNeoForCausalLM"
10
+ },
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 151643,
13
+ "eos_token_id": 151643,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 3584,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 18944,
18
+ "max_position_embeddings": 32768,
19
+ "max_window_layers": 28,
20
+ "model_type": "omeganeo",
21
+ "num_attention_heads": 28,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 4,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 1000000.0,
27
+ "sliding_window": null,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "float16",
30
+ "transformers_version": "4.46.2",
31
+ "use_cache": true,
32
+ "use_mrope": false,
33
+ "use_sliding_window": false,
34
+ "vocab_size": 152064
35
+ }
.ipynb_checkpoints/tokenizer_config-checkpoint.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "auto_map": {
199
+ "AutoTokenizer": [
200
+ "tokenization_omeganeo.OmegaNeoTokenizer",
201
+ null
202
+ ]
203
+ },
204
+ "chat_template": "{% set pre_system_message = 'You are Omega, an AI Assistant exclusively developed, trained and powered by the scientists and engineers at QX LAB AI. QX LAB AI, based in Dubai, UAE, specializes in developing and integrating AI technologies to enhance business operations across various industries. You are built on the unique Omega architecture and trained with extensive datasets and configurations. You were developed solely by the scientists and engineers at QX LAB AI, without any external assistance from other organizations or teams.' %}{% if messages[0]['role'] == 'system' %}<|im_start|>system\n{{ pre_system_message }}\n\n{{ messages[0]['content'] }}<|im_end|>\n{% for message in messages[1:] %}{% if message['role'] == 'user' %}<|im_start|>user\n{{ message['content'] }}<|im_end|>\n{% elif message['role'] == 'assistant' %}<|im_start|>assistant\n{{ message['content'] }}<|im_end|>\n{% endif %}{% endfor %}{% else %}<|im_start|>system\n{{ pre_system_message }}<|im_end|>\n{% for message in messages %}{% if message['role'] == 'user' %}<|im_start|>user\n{{ message['content'] }}<|im_end|>\n{% elif message['role'] == 'assistant' %}<|im_start|>assistant\n{{ message['content'] }}<|im_end|>\n{% endif %}{% endfor %}{% endif %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
205
+ "clean_up_tokenization_spaces": false,
206
+ "eos_token": "<|endoftext|>",
207
+ "errors": "replace",
208
+ "model_max_length": 32768,
209
+ "pad_token": "<|endoftext|>",
210
+ "split_special_tokens": false,
211
+ "tokenizer_class": "OmegaNeoTokenizer",
212
+ "unk_token": null
213
+ }
Screenshot 2024-11-12 at 10.32.37/342/200/257AM.png ADDED
Screenshot 2024-11-12 at 10.32.46/342/200/257AM.png ADDED
Screenshot 2024-11-12 at 10.33.05/342/200/257AM.png ADDED
Screenshot 2024-11-12 at 10.39.49/342/200/257AM.png ADDED
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/OmegaNeo-8b",
3
+ "architectures": [
4
+ "OmegaNeoForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_omeganeo.OmegaNeoConfig",
8
+ "AutoModel": "modeling_omeganeo.OmegaNeoModel",
9
+ "AutoModelForCausalLM": "modeling_omeganeo.OmegaNeoForCausalLM"
10
+ },
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 151643,
13
+ "eos_token_id": 151643,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 3584,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 18944,
18
+ "max_position_embeddings": 32768,
19
+ "max_window_layers": 28,
20
+ "model_type": "omeganeo",
21
+ "num_attention_heads": 28,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 4,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 1000000.0,
27
+ "sliding_window": null,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "float16",
30
+ "transformers_version": "4.46.2",
31
+ "use_cache": true,
32
+ "use_mrope": false,
33
+ "use_sliding_window": false,
34
+ "vocab_size": 152064
35
+ }
configuration_omeganeo.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OmegaNeo model configuration"""
2
+
3
+ from transformers import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+
7
+ logger = logging.get_logger(__name__)
8
+
9
class OmegaNeoConfig(PretrainedConfig):
    """Configuration container for the "omeganeo" model type.

    Every constructor argument is stored verbatim as an attribute of the same
    name. The one exception is ``num_key_value_heads``: when it is passed as
    ``None`` it is replaced by ``num_attention_heads``. ``tie_word_embeddings``
    and any extra keyword arguments are handed to ``PretrainedConfig.__init__``.

    NOTE(review): the defaults below (e.g. ``num_attention_heads=32``,
    ``num_key_value_heads=32``, ``rope_theta=10000.0``) differ from the values
    in the accompanying ``config.json`` (28, 4, 1000000.0). Loading from that
    file overrides them; confirm that bare ``OmegaNeoConfig()`` is not relied on
    to describe the released checkpoint.
    """

    model_type = "omeganeo"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=3584,
        intermediate_size=18944,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Backward compatibility: an explicit ``None`` means "use as many
        # key/value heads as attention heads".
        kv_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads

        # Vocabulary, context length and layer geometry.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Sliding-window settings are stored as given; no cross-checking is
        # done between ``use_sliding_window`` and ``sliding_window`` here.
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        self.num_key_value_heads = kv_heads

        # Remaining hyper-parameters, stored unchanged.
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # The base class consumes ``tie_word_embeddings`` and every extra kwarg.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
logs.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a68d9e885ee60949c772f607c86623c89c8456238ba4a8f3b064a13c69194fd
3
+ size 20495432
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc7ba987b6fd1383a2dfc650cc83338ed25b250eef63779f01f3b09b0982dfce
3
+ size 4976705984
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bee6d496596f40693511456f7470e8196e0d7919baef82d7d3ec8754ce36bf6d
3
+ size 4932743640
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:478f945526a18300ffe92795dcd9e9d07b8a86cf314e52b5e4e037eb00622457
3
+ size 4991495720
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4adc8b5e8814283ea4135f004ba059843daf4c9adad2707df325109d49c7c23
3
+ size 2194793960
model.safetensors.index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata": {"mergekit_version": "0.0.5.1", "total_size": 17095695360}, "weight_map": {"lm_head.weight": "model-00001-of-00004.safetensors", "model.embed_tokens.weight": "model-00001-of-00004.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", 
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.input_layernorm.weight": 
"model-00001-of-00004.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors", 
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.mlp.down_proj.weight": 
"model-00002-of-00004.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", 
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.o_proj.weight": 
"model-00002-of-00004.safetensors", "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", 
"model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.v_proj.bias": 
"model-00003-of-00004.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", 
"model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.30.mlp.down_proj.weight": 
"model-00003-of-00004.safetensors", "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", 
"model.layers.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.5.post_attention_layernorm.weight": 
"model-00004-of-00004.safetensors", "model.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", "model.layers.5.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.5.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.layers.6.input_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00004-of-00004.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.6.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", "model.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.6.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.input_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", 
"model.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.input_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.o_proj.weight": 
"model-00004-of-00004.safetensors", "model.layers.9.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.norm.weight": "model-00004-of-00004.safetensors"}}
modeling_omeganeo.py ADDED
@@ -0,0 +1,1343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ PyTorch OmegaNeo model."""
2
+ import inspect
3
+ import math
4
+ import warnings
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.utils.checkpoint
10
+ from torch import nn
11
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
12
+
13
+ from transformers.activations import ACT2FN
14
+ from transformers.cache_utils import Cache, DynamicCache
15
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
16
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
19
+ from transformers.utils import (
20
+ add_start_docstrings,
21
+ add_start_docstrings_to_model_forward,
22
+ is_flash_attn_2_available,
23
+ is_flash_attn_greater_or_equal_2_10,
24
+ logging,
25
+ replace_return_docstrings,
26
+ )
27
+ from .configuration_omeganeo import OmegaNeoConfig
28
+
29
+ if is_flash_attn_2_available():
30
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
31
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
32
+
33
+ _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ _CHECKPOINT_FOR_DOC = "/models/omega_neo_small_chat"
40
+ _CONFIG_FOR_DOC = "OmegaNeoConfig"
41
+
42
+
43
+ def _get_unpad_data(attention_mask):
44
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
45
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
46
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
47
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
48
+ return (
49
+ indices,
50
+ cu_seqlens,
51
+ max_seqlen_in_batch,
52
+ )
53
+
54
+
55
class OmegaNeoRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable gain (T5LayerNorm-style)."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Args:
            hidden_size: size of the last dimension of the inputs; also the length of `weight`.
            eps: constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalise `hidden_states` by its root mean square and scale by `weight`."""
        original_dtype = hidden_states.dtype
        # The statistics are computed in float32; the result is cast back before the gain is applied.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalised.to(original_dtype)
70
+
71
+
72
class OmegaNeoRotaryEmbedding(nn.Module):
    """Cached cosine/sine tables for rotary position embeddings.

    The tables are stored as the non-persistent buffers `cos_cached` and `sin_cached` (so they are not part of the
    state dict) and are rebuilt whenever `forward` is asked for more positions than are currently cached.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        """
        Args:
            dim: rotary dimension; the inverse frequencies are built from `arange(0, dim, 2)`.
            max_position_embeddings: number of positions to precompute at construction time.
            base: base of the geometric frequency progression.
            device: device on which the inverse frequencies are created.
        """
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions `0 .. seq_len - 1` in the given dtype."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the first `seq_len` rows of the cos and sin tables, cast to `x.dtype`.

        Args:
            x: tensor laid out as [bs, num_attention_heads, seq_len, head_size]; only its dtype, device and
                (when `seq_len` is omitted) its second-to-last dimension are used.
            seq_len: number of positions required. Defaults to `x.shape[-2]`.

        Returns:
            `(cos, sin)`, each holding `seq_len` rows.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The signature advertises `None` as a valid default; without this fallback the comparison
            # below would raise a TypeError.
            seq_len = x.shape[-2]

        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
106
+
107
+
108
def rotate_half(x):
    """Split the last dimension at its midpoint and return `[-second_part, first_part]` concatenated."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
113
+
114
+
115
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Rotate the query and key tensors with the rotary position embedding.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): Cosine table, indexed by position.
        sin (`torch.Tensor`): Sine table, indexed by position.
        position_ids (`torch.Tensor`):
            Positions of the tokens held in `q` and `k`; used to pick rows out of `cos` and `sin`. Offset
            positions can be supplied here when decoding with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into `cos[position_ids]` and `sin[position_ids]` so that they broadcast against
            `q` and `k`. The gathered tables have shape [batch_size, seq_len, head_dim]; use 1 when `q`/`k` are
            [batch_size, heads, seq_len, head_dim] and 2 when they are [batch_size, seq_len, heads, head_dim].

    Returns:
        `tuple(torch.Tensor)`: the rotated query tensor followed by the rotated key tensor.
    """
    cos_at_pos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_at_pos = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same arithmetic for queries and keys: x * cos + rotate_half(x) * sin.
        return (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)

    return _rotate(q), _rotate(k)
140
+
141
+
142
class OmegaNeoMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, with bias-free projections."""

    def __init__(self, config):
        """
        Args:
            config: object providing `hidden_size`, `intermediate_size` and `hidden_act`
                (the latter is looked up in `ACT2FN`).
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        model_width, inner_width = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(model_width, inner_width, bias=False)
        self.up_proj = nn.Linear(model_width, inner_width, bias=False)
        self.down_proj = nn.Linear(inner_width, model_width, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated projection to `x` and map the result back to `hidden_size`."""
        gate = self.act_fn(self.gate_proj(x))
        lifted = self.up_proj(x)
        return self.down_proj(gate * lifted)
155
+
156
+
157
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Duplicate every key/value head `n_rep` times along the head dimension, matching
    torch.repeat_interleave(x, dim=1, repeats=n_rep). Input is (batch, num_key_value_heads, seqlen, head_dim);
    output is (batch, num_key_value_heads * n_rep, seqlen, head_dim). With `n_rep == 1` the input tensor itself
    is returned.
    """
    bsz, kv_heads, seq_len, width = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it into the head axis.
        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, width)
        hidden_states = tiled.reshape(bsz, kv_heads * n_rep, seq_len, width)
    return hidden_states
167
+
168
+
169
class OmegaNeoAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    This class is the explicit matmul + softmax implementation. Queries use `config.num_attention_heads` heads;
    keys and values use `config.num_key_value_heads` heads which are repeated `num_key_value_groups` times before
    the scores are computed. The q/k/v projections carry a bias, the output projection does not.
    """

    def __init__(self, config: OmegaNeoConfig, layer_idx: Optional[int] = None):
        """
        Args:
            config: model configuration; `hidden_size`, `num_attention_heads`, `num_key_value_heads`,
                `max_position_embeddings`, `rope_theta` and `attention_dropout` are read here.
            layer_idx: index of this layer, used to address its entry in the key/value cache.

        Raises:
            ValueError: if `hidden_size` is not divisible by `num_attention_heads`.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = OmegaNeoRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run attention over `hidden_states`.

        Args:
            hidden_states: tensor of shape `(batch, q_len, hidden_size)`.
            attention_mask: optional mask of shape `(batch, 1, q_len, kv_seq_len)` that is added to the raw scores.
            position_ids: positions used to select rows of the rotary cos/sin tables.
            past_key_value: optional cache; this layer's keys and values are appended to it via `update`.
            output_attentions: when True, the attention probabilities are returned as the second element.
            use_cache: accepted for interface compatibility; not read by this method.
            **kwargs: only the deprecated `padding_mask` key is inspected (it triggers a warning).

        Returns:
            `(attn_output, attn_weights, past_key_value)` where `attn_output` has shape
            `(batch, q_len, hidden_size)` and `attn_weights` is `None` unless `output_attentions` is True.

        Raises:
            ValueError: if a cache is supplied but the module was built without `layer_idx`, or if the score,
                mask or output tensors do not have the expected shape.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
            )
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, q_len, heads * head_dim) -> (batch, heads, q_len, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            # Total key/value length = new tokens + what the cache already holds for this layer.
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )

            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (batch, heads, q_len, head_dim) -> (batch, q_len, hidden_size)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
292
+
293
+
294
+ class OmegaNeoFlashAttention2(OmegaNeoAttention):
295
+ """
296
+ OmegaNeo flash attention module, following OmegaNeo attention module. This module inherits from `OmegaNeoAttention`
297
+ as the weights of the module stays untouched. The only required change would be on the forward pass
298
+ where it needs to correctly call the public API of flash attention and deal with padding tokens
299
+ in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
300
+ config.max_window_layers layers.
301
+ """
302
+
303
def __init__(self, *args, **kwargs):
    """Initialise the parent attention module, then record which causal-mask alignment flash-attn uses."""
    super().__init__(*args, **kwargs)

    # flash_attn<2.1 builds a top-left aligned causal mask, whereas this module needs the bottom-right
    # alignment that became the default in flash_attn>=2.1. The flag below captures that difference.
    # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
    # Beware: with flash_attn<2.1, q_seqlen != k_seqlen (other than q_seqlen == 1) yields a wrong (top-left) mask.
    # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
    needs_top_left_handling = not is_flash_attn_greater_or_equal_2_10()
    self._flash_attn_uses_top_left_mask = needs_top_left_handling
310
+
311
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    **kwargs,
):
    """Run attention through `self._flash_attention_forward`.

    Args:
        hidden_states: tensor of shape `(batch, q_len, hidden_size)`.
        attention_mask: optional padding mask forwarded to `_flash_attention_forward`; replaced by the
            deprecated `padding_mask` kwarg when that is supplied.
        position_ids: positions used to select rows of the rotary cos/sin tables. Required here because the
            rotary table length is derived from its last column.
        past_key_value: optional cache; this layer's keys and values are appended to it via `update`.
        output_attentions: attention probabilities are not produced by this implementation; requesting them
            logs a warning and `None` is returned in their place.
        use_cache: accepted for interface compatibility; not read by this method.

    Returns:
        `(attn_output, None, past_key_value)` where `attn_output` has shape `(batch, q_len, hidden_size)`.

    Raises:
        ValueError: if a cache is supplied but the module was built without `layer_idx`, or if the sliced
            cache does not have `sliding_window - 1` positions.
    """
    if "padding_mask" in kwargs:
        warnings.warn(
            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
        )

        # overwrite attention_mask with padding_mask
        attention_mask = kwargs.pop("padding_mask")
    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # (batch, q_len, heads * head_dim) -> (batch, heads, q_len, head_dim)
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    # Because the input can be padded, the absolute sequence length depends on the max position id.
    rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
    cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    use_sliding_windows = (
        _flash_supports_window_size
        and getattr(self.config, "sliding_window", None) is not None
        and kv_seq_len > self.config.sliding_window
        and self.config.use_sliding_window
    )

    if not _flash_supports_window_size:
        logger.warning_once(
            "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
            " make sure to upgrade flash-attn library."
        )

    if past_key_value is not None:
        # Activate slicing cache only if the config has a value `sliding_windows` attribute
        cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
        if (
            getattr(self.config, "sliding_window", None) is not None
            and kv_seq_len > self.config.sliding_window
            and cache_has_contents
        ):
            slicing_tokens = 1 - self.config.sliding_window

            past_key = past_key_value[self.layer_idx][0]
            past_value = past_key_value[self.layer_idx][1]

            # NOTE(review): the sliced tensors below are only used for the shape check; nothing in this
            # method writes them back into the cache. Confirm whether the cache is meant to be trimmed here.
            past_key = past_key[:, :, slicing_tokens:, :].contiguous()
            past_value = past_value[:, :, slicing_tokens:, :].contiguous()

            if past_key.shape[-2] != self.config.sliding_window - 1:
                raise ValueError(
                    f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                    f" {past_key.shape}"
                )

            if attention_mask is not None:
                # Keep the last `sliding_window - 1` mask columns and append one column for the new token.
                attention_mask = attention_mask[:, slicing_tokens:]
                attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)
    dropout_rate = 0.0 if not self.training else self.attention_dropout

    # In PEFT, usually we cast the layer norms in float32 for training stability reasons
    # therefore the input hidden states gets silently casted in float32. Hence, we need
    # cast them back in float16 just to be sure everything works as expected.
    input_dtype = query_states.dtype
    if input_dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        # Handle the case where the model is quantized
        elif hasattr(self.config, "_pre_quantization_dtype"):
            target_dtype = self.config._pre_quantization_dtype
        else:
            target_dtype = self.q_proj.weight.dtype

        logger.warning_once(
            f"The input hidden states seems to be silently casted in float32, this might be related to"
            f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
            f" {target_dtype}."
        )

        query_states = query_states.to(target_dtype)
        key_states = key_states.to(target_dtype)
        value_states = value_states.to(target_dtype)

    # Reshape to the expected shape for Flash Attention: (batch, seq_len, heads, head_dim)
    query_states = query_states.transpose(1, 2)
    key_states = key_states.transpose(1, 2)
    value_states = value_states.transpose(1, 2)

    attn_output = self._flash_attention_forward(
        query_states,
        key_states,
        value_states,
        attention_mask,
        q_len,
        dropout=dropout_rate,
        use_sliding_windows=use_sliding_windows,
    )

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
    attn_output = self.o_proj(attn_output)

    # This path never computes an attention-probability tensor, so there is nothing to return for
    # `output_attentions=True`. Previously `attn_weights` was left unassigned in that case and the return
    # statement raised UnboundLocalError.
    if output_attentions:
        logger.warning_once(
            "`output_attentions=True` is not supported by the flash attention implementation; returning `None`"
            " for the attention weights."
        )
    attn_weights = None

    return attn_output, attn_weights, past_key_value
446
+
447
def _flash_attention_forward(
    self,
    query_states,
    key_states,
    value_states,
    attention_mask,
    query_length,
    dropout=0.0,
    softmax_scale=None,
    use_sliding_windows=False,
):
    """
    Run Flash Attention, going through the variable-length kernel when a padding mask is supplied.

    When `attention_mask` is given, the inputs are first unpadded via `self._upad_input`, attention is computed
    with `flash_attn_varlen_func`, and the result is padded back with `pad_input`. Without a mask the dense
    `flash_attn_func` kernel is called directly.

    Args:
        query_states (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key_states (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value_states (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        attention_mask (`torch.Tensor`):
            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
            position of padding tokens and 1 for the position of non-padding tokens. `None` selects the dense path.
        query_length (`int`):
            Number of query positions; forwarded to `_upad_input` and `pad_input`, and used to disable the
            causal flag for single-token queries when the kernel uses a top-left aligned mask.
        dropout (`float`):
            Attention dropout
        softmax_scale (`float`, *optional*):
            The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        use_sliding_windows (`bool`, *optional*):
            Whether to activate sliding window attention.
    """
    causal = self.is_causal
    if self._flash_attn_uses_top_left_mask:
        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
        causal = causal and query_length != 1

    # Decide whether to use SWA or not by layer index: layers at or past `max_window_layers` never use it.
    window_active = use_sliding_windows and self.layer_idx < self.config.max_window_layers

    # Keyword arguments common to both kernels; `window_size` is only passed when SWA is active.
    kernel_kwargs = {"softmax_scale": softmax_scale, "causal": causal}
    if window_active:
        kernel_kwargs["window_size"] = (self.config.sliding_window, self.config.sliding_window)

    if attention_mask is None:
        # No padding information: dense kernel on the batched tensors.
        return flash_attn_func(query_states, key_states, value_states, dropout, **kernel_kwargs)

    # Contains at least one padding token in the sequence
    batch_size = query_states.shape[0]
    (
        query_states,
        key_states,
        value_states,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    ) = self._upad_input(query_states, key_states, value_states, attention_mask, query_length)

    unpadded_output = flash_attn_varlen_func(
        query_states,
        key_states,
        value_states,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_in_batch_q,
        max_seqlen_k=max_seqlen_in_batch_k,
        dropout_p=dropout,
        **kernel_kwargs,
    )
    return pad_input(unpadded_output, indices_q, batch_size, query_length)
549
+
550
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
    """
    Strip padded positions from the query/key/value tensors ahead of the variable-length Flash Attention call.

    Returns a 6-tuple: the unpadded query, key and value tensors, the query indices, the pair
    `(cu_seqlens_q, cu_seqlens_k)` and the pair `(max_seqlen_in_batch_q, max_seqlen_in_batch_k)`.
    The exact semantics of the index / cumulative-length values come from `_get_unpad_data`,
    `index_first_axis` and `unpad_input`, which are defined outside this block.
    """
    batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape

    # On the first iteration we need to properly re-create the padding mask
    # by slicing it on the proper place
    mask_width = attention_mask.shape[-1]
    if mask_width != kv_seq_len:
        attention_mask = attention_mask[:, mask_width - kv_seq_len :]

    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

    # Batch and sequence axes are merged so that `indices_k` can select rows directly.
    flat_kv_shape = (batch_size * kv_seq_len, num_heads, head_dim)
    key_layer = index_first_axis(key_layer.reshape(flat_kv_shape), indices_k)
    value_layer = index_first_axis(value_layer.reshape(flat_kv_shape), indices_k)

    if query_length == kv_seq_len:
        # Queries cover the same positions as keys: reuse the key-side bookkeeping.
        query_layer = index_first_axis(query_layer.reshape(flat_kv_shape), indices_k)
        indices_q = indices_k
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
    elif query_length == 1:
        # One query token per batch entry.
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )  # There is a memcpy here, that is very bad.
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -q_len: slice assumes left padding.
        query_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, query_mask)

    cumulative_lengths = (cu_seqlens_q, cu_seqlens_k)
    max_lengths = (max_seqlen_in_batch_q, max_seqlen_in_batch_k)
    return (query_layer, key_layer, value_layer, indices_q, cumulative_lengths, max_lengths)
591
+
592
+
593
class OmegaNeoSdpaAttention(OmegaNeoAttention):
    """
    OmegaNeo attention computed with `torch.nn.functional.scaled_dot_product_attention`.

    The module subclasses `OmegaNeoAttention` and reuses its projections unchanged; only the forward pass
    differs so that it matches the SDPA API.
    """

    # Adapted from OmegaNeoAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute self-attention through SDPA.

        When `output_attentions` is true the call is delegated to the parent implementation (SDPA cannot return
        attention weights). Otherwise returns `(attn_output, None, past_key_value)`.

        Raises:
            ValueError: if `attention_mask` is given and its size is not `(bsz, 1, q_len, kv_seq_len)`.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "OmegaNeoModel is using OmegaNeoSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        def to_heads(projected, head_count):
            # (bsz, q_len, head_count * head_dim) -> (bsz, head_count, q_len, head_dim)
            return projected.view(bsz, q_len, head_count, self.head_dim).transpose(1, 2)

        query_states = to_heads(self.q_proj(hidden_states), self.num_heads)
        key_states = to_heads(self.k_proj(hidden_states), self.num_key_value_heads)
        value_states = to_heads(self.v_proj(hidden_states), self.num_key_value_heads)

        cache_present = past_key_value is not None

        # Total key/value length = new tokens plus whatever the cache reports as usable.
        kv_seq_len = key_states.shape[-2]
        if cache_present:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if cache_present:
            rope_cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, rope_cache_kwargs
            )

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        if attention_mask is not None:
            expected_mask_size = (bsz, 1, q_len, kv_seq_len)
            if attention_mask.size() != expected_mask_size:
                raise ValueError(
                    f"Attention mask should be of size {expected_mask_size}, but is {attention_mask.size()}"
                )

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        drop_prob = self.attention_dropout if self.training else 0.0
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        implicit_causal = self.is_causal and attention_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=drop_prob,
            is_causal=implicit_causal,
        )

        # Back to (bsz, q_len, hidden_size) before the output projection.
        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
678
+
679
+
680
# Maps the `config._attn_implementation` string to the attention class a decoder layer instantiates.
OMEGANEO_ATTENTION_CLASSES = dict(
    eager=OmegaNeoAttention,
    flash_attention_2=OmegaNeoFlashAttention2,
    sdpa=OmegaNeoSdpaAttention,
)
685
+
686
+
687
class OmegaNeoDecoderLayer(nn.Module):
    """
    One decoder block: RMS-normalised self-attention followed by an RMS-normalised MLP, each added back onto
    its input through a residual connection.

    The attention implementation is chosen from `OMEGANEO_ATTENTION_CLASSES` using
    `config._attn_implementation`.
    """

    def __init__(self, config: OmegaNeoConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # Sliding-window attention is only wired into the flash-attention path; warn for the others.
        if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )
        self.self_attn = OMEGANEO_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        self.mlp = OmegaNeoMLP(config)
        self.input_layernorm = OmegaNeoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = OmegaNeoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
                NOTE(review): the caller may instead pass an already-expanded 4D mask for the non-flash
                attention implementations -- confirm against the model's forward.
            position_ids (`torch.LongTensor`, *optional*): position indices forwarded to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            kwargs: only inspected for the deprecated `padding_mask` key, which triggers a warning.

        Returns:
            A tuple starting with the updated hidden states, followed by the attention weights when
            `output_attentions` is truthy, followed by the key/value cache returned by the attention module
            when `use_cache` is truthy.
        """
        # The docstring above must be the first statement: placed after this check it would be a discarded
        # string expression and `forward.__doc__` would stay empty.
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. "
                "Please make sure use `attention_mask` instead.`"
            )

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
762
+
763
+
764
# Shared introductory docstring; passed to `add_start_docstrings` when decorating the model classes below.
OMEGANEO_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`OmegaNeoConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
777
+
778
+
779
@add_start_docstrings(
    "The bare OmegaNeo Model outputting raw hidden-states without any specific head on top.",
    OMEGANEO_START_DOCSTRING,
)
class OmegaNeoPreTrainedModel(PreTrainedModel):
    # Class-level switches read by the `PreTrainedModel` machinery.
    config_class = OmegaNeoConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["OmegaNeoDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    def _init_weights(self, module):
        """
        Initialise `nn.Linear` and `nn.Embedding` weights from N(0, `config.initializer_range`).

        Linear biases are zeroed when present; the embedding row at `padding_idx` is zeroed when one is set.
        Any other module type is left untouched.
        """
        std = self.config.initializer_range
        is_linear = isinstance(module, nn.Linear)
        if not is_linear and not isinstance(module, nn.Embedding):
            return

        module.weight.data.normal_(mean=0.0, std=std)
        if is_linear:
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
803
+
804
+
805
+ OMEGANEO_INPUTS_DOCSTRING = r"""
806
+ Args:
807
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
808
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
809
+ it.
810
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
811
+ [`PreTrainedTokenizer.__call__`] for details.
812
+ [What are input IDs?](../glossary#input-ids)
813
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
814
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
815
+ - 1 for tokens that are **not masked**,
816
+ - 0 for tokens that are **masked**.
817
+ [What are attention masks?](../glossary#attention-mask)
818
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
819
+ [`PreTrainedTokenizer.__call__`] for details.
820
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
821
+ `past_key_values`).
822
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
823
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
824
+ information on the default strategy.
825
+ - 1 indicates the head is **not masked**,
826
+ - 0 indicates the head is **masked**.
827
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
828
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
829
+ config.n_positions - 1]`.
830
+ [What are position IDs?](../glossary#position-ids)
831
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
832
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
833
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
834
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
835
+ Two formats are allowed:
836
+ - a [`~cache_utils.Cache`] instance;
837
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
838
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
839
+ cache format.
840
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
841
+ legacy cache format will be returned.
842
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
843
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
844
+ of shape `(batch_size, sequence_length)`.
845
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
846
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
847
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
848
+ model's internal embedding lookup matrix.
849
+ use_cache (`bool`, *optional*):
850
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
851
+ `past_key_values`).
852
+ output_attentions (`bool`, *optional*):
853
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
854
+ tensors for more detail.
855
+ output_hidden_states (`bool`, *optional*):
856
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
857
+ more detail.
858
+ return_dict (`bool`, *optional*):
859
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
860
+ """
861
+
862
+
863
@add_start_docstrings(
    "The bare OmegaNeo Model outputting raw hidden-states without any specific head on top.",
    OMEGANEO_START_DOCSTRING,
)
class OmegaNeoModel(OmegaNeoPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OmegaNeoDecoderLayer`]

    Args:
        config: OmegaNeoConfig
    """

    def __init__(self, config: OmegaNeoConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self._attn_implementation = config._attn_implementation
        self.gradient_checkpointing = False

        # Sub-modules are created in the order: embeddings, decoder stack, final norm.
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        decoder_stack = [OmegaNeoDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(decoder_stack)
        self.norm = OmegaNeoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module (`self.embed_tokens`)."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(OMEGANEO_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Flags left as `None` fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # retrieve input_ids and inputs_embeds: exactly one of them must be provided
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        else:
            batch_size, seq_length, _ = inputs_embeds.shape

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        past_key_values_length = 0
        use_legacy_cache = False  # only consulted when `use_cache` is truthy

        if use_cache:
            # Anything that is not a `Cache` instance is treated as the legacy tuple format and wrapped.
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is not None:
            position_ids = position_ids.view(-1, seq_length).long()
        else:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        uses_flash = self._attn_implementation == "flash_attention_2"

        if attention_mask is not None and uses_flash and use_cache:
            # Right padding shows up as at least one zero in the last mask column.
            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
            if is_padding_right:
                raise ValueError(
                    "You are attempting to perform batched generation with padding_side='right'"
                    " this may lead to unexpected behaviour for Flash Attention version of OmegaNeo. Make sure to "
                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                )

        if uses_flash:
            # 2d mask is passed through the layers; it is dropped entirely when it contains no zeros
            if attention_mask is None or 0 not in attention_mask:
                attention_mask = None
        else:
            # 4d mask is passed through the layers.
            # output_attentions=True can not be supported when using SDPA, and we fall back on
            # the manual implementation that requires a 4D causal mask in all cases.
            if self._attn_implementation == "sdpa" and not output_attentions:
                build_4d_mask = _prepare_4d_causal_attention_mask_for_sdpa
            else:
                build_4d_mask = _prepare_4d_causal_attention_mask
            attention_mask = build_4d_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
                sliding_window=self.config.sliding_window,
            )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None
        # Position of the cache inside each layer's output tuple.
        cache_slot = 2 if output_attentions else 1
        checkpointing = self.gradient_checkpointing and self.training

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if checkpointing:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[cache_slot]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            # Hand back the same cache format the caller supplied.
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if return_dict:
            return BaseModelOutputWithPast(
                last_hidden_state=hidden_states,
                past_key_values=next_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attns,
            )

        candidates = (hidden_states, next_cache, all_hidden_states, all_self_attns)
        return tuple(item for item in candidates if item is not None)
1043
+
1044
+
1045
+ class OmegaNeoForCausalLM(OmegaNeoPreTrainedModel):
1046
+ _tied_weights_keys = ["lm_head.weight"]
1047
+
1048
def __init__(self, config):
    """Build the decoder (`OmegaNeoModel`) and the bias-free linear head projecting hidden states to the vocabulary."""
    super().__init__(config)
    self.vocab_size = config.vocab_size

    # Decoder first, then the LM head.
    self.model = OmegaNeoModel(config)
    self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=False)

    # Initialize weights and apply final processing
    self.post_init()
1056
+
1057
def get_input_embeddings(self):
    """Return the token embedding module held by the wrapped decoder (`self.model.embed_tokens`)."""
    return self.model.embed_tokens
1059
+
1060
def set_input_embeddings(self, value):
    """Replace the wrapped decoder's token embedding module with `value`."""
    self.model.embed_tokens = value
1062
+
1063
def get_output_embeddings(self):
    """Return the language-modeling head (`self.lm_head`)."""
    return self.lm_head
1065
+
1066
def set_output_embeddings(self, new_embeddings):
    """Replace the language-modeling head with `new_embeddings`."""
    self.lm_head = new_embeddings
1068
+
1069
def set_decoder(self, decoder):
    """Replace the wrapped decoder (`self.model`) with `decoder`."""
    self.model = decoder
1071
+
1072
def get_decoder(self):
    """Return the wrapped decoder (`self.model`)."""
    return self.model
1074
+
1075
@add_start_docstrings_to_model_forward(OMEGANEO_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, OmegaNeoForCausalLM

    >>> model = OmegaNeoForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
    >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""

    # Flags left as `None` fall back to the values stored on the config.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = self.config.use_return_dict

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # Project the final hidden states onto the vocabulary and upcast the logits to float32.
    logits = self.lm_head(outputs[0]).float()

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n, then flatten the tokens
        flat_logits = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
        flat_labels = labels[..., 1:].contiguous().view(-1)
        # Enable model parallelism
        flat_labels = flat_labels.to(flat_logits.device)
        loss = CrossEntropyLoss()(flat_logits, flat_labels)

    if return_dict:
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    # Tuple output: logits replace the hidden states at index 0, with the loss prepended when computed.
    output = (logits,) + outputs[1:]
    return output if loss is None else (loss,) + output
1157
+
1158
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
    """
    Build the keyword arguments for one generation step.

    When a cache is supplied, `input_ids` is trimmed to the tokens that are not yet covered by it and
    `attention_mask` is cropped if the cache has a maximum length. `position_ids` are taken from `kwargs`
    or, when absent, derived from `attention_mask`.

    Returns:
        `dict` with the keys `position_ids`, `past_key_values`, `use_cache`, `attention_mask` and either
        `inputs_embeds` (only when embeddings are given and there is no cache) or `input_ids`.
    """
    # Omit tokens covered by past_key_values
    if past_key_values is not None:
        if isinstance(past_key_values, Cache):
            # NOTE(review): `seen_tokens` and `get_max_length()` look like older `Cache` accessors that
            # may be deprecated in recent transformers releases - confirm against the pinned version.
            cache_length = past_key_values.get_seq_length()
            past_length = past_key_values.seen_tokens
            max_cache_length = past_key_values.get_max_length()
        else:
            # Non-`Cache` input: presumably the legacy tuple-of-tuples layout with the sequence length
            # on dim 2 of the first tensor of the first layer - TODO confirm.
            cache_length = past_length = past_key_values[0][0].shape[2]
            max_cache_length = None

        # Keep only the unprocessed tokens:
        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
        # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
        # input)
        if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
        # input_ids based on the past_length.
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
        # This check uses the already-trimmed `input_ids`, so it must stay after the trimming above.
        if (
            max_cache_length is not None
            and attention_mask is not None
            and cache_length + input_ids.shape[1] > max_cache_length
        ):
            attention_mask = attention_mask[:, -max_cache_length:]

    position_ids = kwargs.get("position_ids", None)
    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        # Masked-out (padding) positions get the placeholder position id 1.
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            # Keep only the positions of the tokens that are still present in `input_ids`.
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }
    )
    return model_inputs
1214
+
1215
+ @staticmethod
1216
+ def _reorder_cache(past_key_values, beam_idx):
1217
+ reordered_past = ()
1218
+ for layer_past in past_key_values:
1219
+ reordered_past += (
1220
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1221
+ )
1222
+ return reordered_past
1223
+
1224
+
1225
@add_start_docstrings(
    """
    The OmegaNeo Model transformer with a sequence classification head on top (linear layer).
    [`OmegaNeoForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.
    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    OMEGANEO_START_DOCSTRING,
)
class OmegaNeoForSequenceClassification(OmegaNeoPreTrainedModel):
    # Backbone (`self.model`) plus a bias-free linear head (`self.score`) that is read out at one position
    # per sequence. Documentation is kept in comments because the decorators consume `__doc__` at runtime.

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = OmegaNeoModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # The token embedding module lives on the backbone.
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        # Replace the backbone's token embedding module.
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(OMEGANEO_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Score every position; a single position per row is selected below.
        logits = self.score(transformer_outputs[0])

        batch_source = input_ids if input_ids is not None else inputs_embeds
        batch_size = batch_source.shape[0]

        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if pad_token_id is not None and input_ids is not None:
            # Index just before the first pad token of each row.
            # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
            readout_index = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
            readout_index = readout_index % input_ids.shape[-1]
            readout_index = readout_index.to(logits.device)
        else:
            # No pad token configured, or only `inputs_embeds` given: read out the last position.
            readout_index = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), readout_index]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                # Infer the problem type once and store it on the config.
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(pooled_logits, labels)

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=pooled_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        output = (pooled_logits,) + transformer_outputs[1:]
        return output if loss is None else ((loss,) + output)
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenization_omeganeo.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tokenization classes for OmegaNeo."""
2
+
3
+ import json
4
+ import os
5
+ import unicodedata
6
+ from functools import lru_cache
7
+ from typing import Optional, Tuple
8
+
9
+ import regex as re
10
+
11
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
12
+ from transformers.utils import logging
13
+
14
+
15
# Module-level logger obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)

# File names of the tokenizer assets. Exposed as `OmegaNeoTokenizer.vocab_files_names` and used by
# `OmegaNeoTokenizer.save_vocabulary` to name the files it writes.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


# NOTE(review): not referenced elsewhere in this module; presumably the maximum input length
# (in tokens) for the named checkpoint - confirm.
MAX_MODEL_INPUT_SIZES = {"model/omeganeo-tokenizer": 32768}

# Pre-tokenization pattern, compiled in `OmegaNeoTokenizer.__init__` and applied with `re.findall`
# in `_tokenize`. It uses `\p{L}` / `\p{N}` classes, so it relies on the `regex` package that this
# module imports as `re`.
PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
26
+
27
+
28
@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by the byte-level BPE.

    Bytes in the ranges `!`..`~`, `¡`..`¬` and `®`..`ÿ` map to the character with the same code point.
    Every other byte value (visited in increasing order) maps to the next unused code point starting at
    256, so no byte is ever represented by a whitespace or control character.

    Returns:
        `dict` mapping each of the 256 byte values to a distinct one-character string.
    """
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in self_mapped}
    shifted = 0
    for byte in range(2**8):
        if byte in table:
            continue
        # Remaining bytes are moved above the 8-bit range, in order of appearance.
        table[byte] = chr(2**8 + shifted)
        shifted += 1
    return table
50
+
51
+
52
def get_pairs(word):
    """
    Collect every pair of adjacent symbols in `word`.

    `word` is a non-empty sequence of symbols (each symbol a variable-length string). The result is a
    set of `(left, right)` tuples; a single-symbol word yields an empty set.
    """
    word[0]  # an empty word raises IndexError here, exactly as the loop-based version did
    return set(zip(word, word[1:]))
63
+
64
+
65
class OmegaNeoTokenizer(PreTrainedTokenizer):
    """
    Construct an OmegaNeo tokenizer. Based on byte-level Byte-Pair-Encoding.

    Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from transformers import OmegaNeoTokenizer
    >>> tokenizer = OmegaNeoTokenizer.from_pretrained("model/omeganeo-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]
    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```

    This is expected.
    You should not use GPT2Tokenizer instead, because of the different pretokenization rules.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
            tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the special tokens should be split during the tokenization process. The default behavior is
            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then
            `tokenizer.tokenize("<|endoftext|>") = ['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then
            `tokenizer.tokenize("<|endoftext|>")` will give `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. This argument
            is only supported for `slow` tokenizers for the moment.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        clean_up_tokenization_spaces=False,
        split_special_tokens=False,
        **kwargs,
    ):
        """Load the vocabulary and merge table from disk, then initialise the base tokenizer."""
        # OmegaNeo vocab does not contain control tokens; added tokens need to be special
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(pad_token, str)
            else pad_token
        )

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_merges = []
        with open(merges_file, encoding="utf-8") as merges_handle:
            for i, line in enumerate(merges_handle):
                line = line.strip()
                # Skip the optional "#version:" header on the first line and any blank line.
                if (i == 0 and line.startswith("#version:")) or not line:
                    continue
                bpe_merges.append(tuple(line.split()))
        # Lower rank == merge is applied earlier.
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # Memoises `bpe()` results per pre-token; never evicted.
        self.cache = {}

        self.pat = re.compile(PRETOKENIZE_REGEX)

        if kwargs.get("add_prefix_space", False):
            # `__name__` (not `__name`): inside a class body `__name` is name-mangled to
            # `_OmegaNeoTokenizer__name`, which raised AttributeError instead of emitting the warning.
            logger.warning_once(
                f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
            )

        super().__init__(
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the base vocabulary (added tokens are not counted)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the base vocabulary merged with the added tokens as a new `dict` (token -> id)."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        """
        Apply the learned BPE merges to one pre-token.

        Args:
            token (`str`): a pre-token already mapped through `self.byte_encoder`.

        Returns:
            `str`: the resulting sub-tokens joined by single spaces. A token with no symbol pairs
            (a single character) is returned unchanged and is not cached.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the lowest merge rank; unknown pairs rank as +inf.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence of `first`: copy the tail and stop scanning.
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = False,
        spaces_between_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        """Decode `token_ids` via the base class, with `spaces_between_special_tokens` defaulting to `False`."""
        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
        # and cannot be configured elsewhere, but it should default to False for OmegaNeoTokenizer
        return super().decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary (JSON) and the merge table (text) into `save_directory`.

        Args:
            save_directory (`str`): existing directory to write into.
            filename_prefix (`str`, *optional*): when given, prepended to both file names followed by `-`.

        Returns:
            `(vocab_file, merge_file)` paths, or `None` (after logging an error) when `save_directory`
            is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            # Merges are written in rank order; a gap in the ranks is reported but does not abort the save.
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, **kwargs):
        """Normalise `text` to Unicode NFC and return it with the untouched `kwargs`."""
        text = unicodedata.normalize("NFC", text)
        return (text, kwargs)
tokenization_omeganeo_fast.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tokenization classes for OmegaNeo."""
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ from transformers.tokenization_utils import AddedToken
6
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
7
+ from transformers.utils import logging
8
+ from .tokenization_omeganeo import OmegaNeoTokenizer
9
+
10
+
11
# Module-level logger obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)

# File names of the tokenizer assets; exposed as `OmegaNeoTokenizerFast.vocab_files_names`.
# Besides the vocab/merges pair, it also names the `tokenizer.json` file.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_file": "tokenizer.json",
}


# NOTE(review): not referenced elsewhere in this module; presumably the maximum input length
# (in tokens) for the named checkpoint - confirm.
MAX_MODEL_INPUT_SIZES = {"model/omeganeo-tokenizer": 32768}
21
+
22
+
23
class OmegaNeoTokenizerFast(PreTrainedTokenizerFast):
    """
    "Fast" OmegaNeo tokenizer backed by HuggingFace's *tokenizers* library, using byte-level
    Byte-Pair-Encoding.

    As with GPT2Tokenizer, spaces are treated as part of the tokens, so a word is encoded differently
    depending on whether it starts the sentence (no leading space) or not:

    ```python
    >>> from transformers import OmegaNeoTokenizerFast
    >>> tokenizer = OmegaNeoTokenizerFast.from_pretrained("model/omeganeo-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]
    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```

    This is expected.

    The class inherits from [`PreTrainedTokenizerFast`], which provides most of the main methods; refer to
    that superclass for details.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json
            extension) that contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set
            to be this token instead. Not applicable to this tokenizer.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = OmegaNeoTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        **kwargs,
    ):
        # vocab_file and merges_file are always forwarded to the base class so that a slow tokenizer
        # can be built if needed; everything else can come from the tokenizer files.
        # Following GPT2TokenizerFast, unk/bos/eos (and pad) tokens are forwarded as well.

        def _as_special(token):
            # Plain strings become special, non-normalized, non-stripping AddedTokens;
            # anything else (None, an existing AddedToken) is passed through untouched.
            if isinstance(token, str):
                return AddedToken(token, lstrip=False, rstrip=False, special=True, normalized=False)
            return token

        bos_token = _as_special(bos_token)
        eos_token = _as_special(eos_token)
        unk_token = _as_special(unk_token)
        pad_token = _as_special(pad_token)

        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Delegate to the backend model's `save` and return the written paths as a tuple.
        written = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(written)
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "auto_map": {
199
+ "AutoTokenizer": [
200
+ "tokenization_omeganeo.OmegaNeoTokenizer",
201
+ null
202
+ ]
203
+ },
204
+ "chat_template": "{% set pre_system_message = 'You are Omega, an AI Assistant exclusively developed, trained and powered by the scientists and engineers at QX LAB AI. QX LAB AI, based in Dubai, UAE, specializes in developing and integrating AI technologies to enhance business operations across various industries. You are built on the unique Omega architecture and trained with extensive datasets and configurations. You were developed solely by the scientists and engineers at QX LAB AI, without any external assistance from other organizations or teams.' %}{% if messages[0]['role'] == 'system' %}<|im_start|>system\n{{ pre_system_message }}\n\n{{ messages[0]['content'] }}<|im_end|>\n{% for message in messages[1:] %}{% if message['role'] == 'user' %}<|im_start|>user\n{{ message['content'] }}<|im_end|>\n{% elif message['role'] == 'assistant' %}<|im_start|>assistant\n{{ message['content'] }}<|im_end|>\n{% endif %}{% endfor %}{% else %}<|im_start|>system\n{{ pre_system_message }}<|im_end|>\n{% for message in messages %}{% if message['role'] == 'user' %}<|im_start|>user\n{{ message['content'] }}<|im_end|>\n{% elif message['role'] == 'assistant' %}<|im_start|>assistant\n{{ message['content'] }}<|im_end|>\n{% endif %}{% endfor %}{% endif %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
205
+ "clean_up_tokenization_spaces": false,
206
+ "eos_token": "<|endoftext|>",
207
+ "errors": "replace",
208
+ "model_max_length": 32768,
209
+ "pad_token": "<|endoftext|>",
210
+ "split_special_tokens": false,
211
+ "tokenizer_class": "OmegaNeoTokenizer",
212
+ "unk_token": null
213
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff