ebisuke committed on
Commit
641e635
·
verified ·
1 Parent(s): aaa12b0

Training in progress, step 5000

Browse files
config.json ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "T5Gemma2ForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 155,
7
+ "classifier_dropout_rate": 0.0,
8
+ "decoder": {
9
+ "_sliding_window_pattern": 6,
10
+ "attention_bias": false,
11
+ "attention_dropout": 0.0,
12
+ "attn_logit_softcapping": null,
13
+ "dropout_rate": 0.0,
14
+ "dtype": "float32",
15
+ "final_logit_softcapping": null,
16
+ "head_dim": 256,
17
+ "hidden_activation": "gelu_pytorch_tanh",
18
+ "hidden_size": 1152,
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 6912,
21
+ "layer_types": [
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "sliding_attention"
60
+ ],
61
+ "max_position_embeddings": 32768,
62
+ "model_type": "t5gemma2_decoder",
63
+ "num_attention_heads": 4,
64
+ "num_hidden_layers": 38,
65
+ "num_key_value_heads": 1,
66
+ "query_pre_attn_scalar": 256,
67
+ "rms_norm_eps": 1e-06,
68
+ "rope_parameters": {
69
+ "full_attention": {
70
+ "factor": 8.0,
71
+ "rope_theta": 1000000,
72
+ "rope_type": "linear"
73
+ },
74
+ "sliding_attention": {
75
+ "rope_theta": 10000,
76
+ "rope_type": "default"
77
+ }
78
+ },
79
+ "sliding_window": 512,
80
+ "use_bidirectional_attention": false,
81
+ "use_cache": true,
82
+ "vocab_size": 157
83
+ },
84
+ "dropout_rate": 0.0,
85
+ "dtype": "float32",
86
+ "encoder": {
87
+ "attention_dropout": 0.0,
88
+ "boi_token_index": 255999,
89
+ "dropout_rate": 0.0,
90
+ "dtype": "float32",
91
+ "eoi_token_index": 256000,
92
+ "image_token_index": 256001,
93
+ "initializer_range": 0.02,
94
+ "mm_tokens_per_image": 256,
95
+ "model_type": "t5gemma2_encoder",
96
+ "text_config": {
97
+ "_name_or_path": "",
98
+ "_sliding_window_pattern": 6,
99
+ "add_cross_attention": false,
100
+ "architectures": null,
101
+ "attention_bias": false,
102
+ "attention_dropout": 0.0,
103
+ "attn_logit_softcapping": null,
104
+ "bos_token_id": 2,
105
+ "chunk_size_feed_forward": 0,
106
+ "cross_attention_hidden_size": null,
107
+ "decoder_start_token_id": null,
108
+ "dropout_rate": 0.0,
109
+ "dtype": "float32",
110
+ "eos_token_id": 1,
111
+ "final_logit_softcapping": null,
112
+ "finetuning_task": null,
113
+ "head_dim": 256,
114
+ "hidden_activation": "gelu_pytorch_tanh",
115
+ "hidden_size": 1152,
116
+ "id2label": {
117
+ "0": "LABEL_0",
118
+ "1": "LABEL_1"
119
+ },
120
+ "initializer_range": 0.02,
121
+ "intermediate_size": 6912,
122
+ "is_decoder": false,
123
+ "is_encoder_decoder": false,
124
+ "label2id": {
125
+ "LABEL_0": 0,
126
+ "LABEL_1": 1
127
+ },
128
+ "layer_types": [
129
+ "sliding_attention",
130
+ "sliding_attention",
131
+ "sliding_attention",
132
+ "sliding_attention",
133
+ "sliding_attention",
134
+ "sliding_attention",
135
+ "sliding_attention",
136
+ "sliding_attention",
137
+ "sliding_attention",
138
+ "sliding_attention",
139
+ "sliding_attention",
140
+ "full_attention",
141
+ "sliding_attention",
142
+ "sliding_attention",
143
+ "sliding_attention",
144
+ "sliding_attention",
145
+ "sliding_attention",
146
+ "full_attention",
147
+ "sliding_attention",
148
+ "sliding_attention",
149
+ "sliding_attention",
150
+ "sliding_attention",
151
+ "sliding_attention",
152
+ "full_attention",
153
+ "sliding_attention",
154
+ "sliding_attention",
155
+ "sliding_attention",
156
+ "sliding_attention",
157
+ "sliding_attention",
158
+ "full_attention",
159
+ "sliding_attention",
160
+ "sliding_attention",
161
+ "sliding_attention",
162
+ "sliding_attention",
163
+ "sliding_attention",
164
+ "sliding_attention",
165
+ "sliding_attention",
166
+ "sliding_attention"
167
+ ],
168
+ "max_position_embeddings": 32768,
169
+ "model_type": "t5gemma2_text",
170
+ "num_attention_heads": 4,
171
+ "num_hidden_layers": 38,
172
+ "num_key_value_heads": 1,
173
+ "output_attentions": false,
174
+ "output_hidden_states": false,
175
+ "pad_token_id": 0,
176
+ "prefix": null,
177
+ "problem_type": null,
178
+ "query_pre_attn_scalar": 256,
179
+ "return_dict": true,
180
+ "rms_norm_eps": 1e-06,
181
+ "rope_parameters": {
182
+ "full_attention": {
183
+ "factor": 8.0,
184
+ "rope_theta": 1000000,
185
+ "rope_type": "linear"
186
+ },
187
+ "sliding_attention": {
188
+ "rope_theta": 10000,
189
+ "rope_type": "default"
190
+ }
191
+ },
192
+ "sep_token_id": null,
193
+ "sliding_window": 512,
194
+ "task_specific_params": null,
195
+ "tie_encoder_decoder": false,
196
+ "tie_word_embeddings": true,
197
+ "tokenizer_class": null,
198
+ "use_bidirectional_attention": false,
199
+ "use_cache": true,
200
+ "vocab_size": 157
201
+ },
202
+ "vision_config": {
203
+ "_name_or_path": "",
204
+ "add_cross_attention": false,
205
+ "architectures": null,
206
+ "attention_dropout": 0.0,
207
+ "bos_token_id": null,
208
+ "chunk_size_feed_forward": 0,
209
+ "cross_attention_hidden_size": null,
210
+ "decoder_start_token_id": null,
211
+ "dropout_rate": 0.0,
212
+ "dtype": "float32",
213
+ "eos_token_id": null,
214
+ "finetuning_task": null,
215
+ "hidden_act": "gelu_pytorch_tanh",
216
+ "hidden_size": 1152,
217
+ "id2label": {
218
+ "0": "LABEL_0",
219
+ "1": "LABEL_1"
220
+ },
221
+ "image_size": 896,
222
+ "intermediate_size": 4304,
223
+ "is_decoder": false,
224
+ "is_encoder_decoder": false,
225
+ "label2id": {
226
+ "LABEL_0": 0,
227
+ "LABEL_1": 1
228
+ },
229
+ "layer_norm_eps": 1e-06,
230
+ "model_type": "siglip_vision_model",
231
+ "num_attention_heads": 16,
232
+ "num_channels": 3,
233
+ "num_hidden_layers": 27,
234
+ "output_attentions": false,
235
+ "output_hidden_states": false,
236
+ "pad_token_id": null,
237
+ "patch_size": 14,
238
+ "prefix": null,
239
+ "problem_type": null,
240
+ "return_dict": true,
241
+ "sep_token_id": null,
242
+ "task_specific_params": null,
243
+ "tie_encoder_decoder": false,
244
+ "tie_word_embeddings": true,
245
+ "tokenizer_class": null,
246
+ "vision_use_head": false,
247
+ "vocab_size": 157
248
+ },
249
+ "vocab_size": 157
250
+ },
251
+ "eoi_token_index": 256000,
252
+ "eos_token_id": 156,
253
+ "image_token_index": 256001,
254
+ "initializer_range": 0.02,
255
+ "is_encoder_decoder": true,
256
+ "model_type": "t5gemma2",
257
+ "pad_token_id": 156,
258
+ "transformers_version": "5.0.0rc1",
259
+ "use_cache": false,
260
+ "vocab_size": 157
261
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 155,
4
+ "eos_token_id": 156,
5
+ "pad_token_id": 156,
6
+ "transformers_version": "5.0.0rc1"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9f4dc5e7b21fea6f5aa41b3f6d2d32c9623bbf726467d11b41c61e7e34f00af
3
+ size 4916948360
runs/Jan03_07-26-51_computeinstance-e00z5zf785kejrw7zn/events.out.tfevents.1767425211.computeinstance-e00z5zf785kejrw7zn.4576.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4afdee16782644b9b3480048ad8bd67fb1b22ea9e4a20efe14235a5dc81989b
3
+ size 22131
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25f746a3b8483e3bf5ef8c9be52557807f3e28b63114755f34a2025696c185d3
3
+ size 5265