Ceyda Cinarel committed on
Commit
8ecd28d
·
1 Parent(s): 3d243f4

deadline push, add new native clip output

Browse files
clippy.py ADDED
@@ -0,0 +1,860 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Pre-training/Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
18
+
19
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
20
+ https://huggingface.co/models?filter=causal-lm
21
+ """
22
+ # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
23
+
24
+ import logging
25
+ import math
26
+ import os
27
+ import sys
28
+ import time
29
+ from dataclasses import dataclass, field
30
+ from pathlib import Path
31
+ from typing import Callable, Optional
32
+ import json
33
+ import jsonlines
34
+ import shutil
35
+ import numpy as np
36
+
37
+ import datasets
38
+ from datasets import Dataset, load_dataset
39
+ from flax import training
40
+ from tqdm import tqdm
41
+
42
+ import torch
43
+ from torchvision.datasets import VisionDataset
44
+ from torchvision.io import ImageReadMode, read_image
45
+ from torchvision.transforms import (
46
+ # added for image augmentation
47
+ ToPILImage,
48
+ RandomCrop,
49
+ ColorJitter,
50
+ RandomHorizontalFlip,
51
+ RandomVerticalFlip,
52
+ RandomResizedCrop,
53
+ ToTensor,
54
+ # /added for image augmentation
55
+ CenterCrop,
56
+ ConvertImageDtype,
57
+ Normalize,
58
+ Resize
59
+ )
60
+ from torchvision.transforms.functional import InterpolationMode
61
+
62
+ import jax
63
+ import jax.profiler
64
+ import jax.numpy as jnp
65
+ import optax
66
+ import transformers
67
+ from flax import jax_utils, traverse_util
68
+ from flax.jax_utils import unreplicate
69
+ from flax.training import train_state
70
+ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
71
+ from flax.training.checkpoints import save_checkpoint, restore_checkpoint
72
+ from flax.serialization import to_bytes, from_bytes
73
+ from transformers import (
74
+ CONFIG_MAPPING,
75
+ AutoConfig,
76
+ FlaxCLIPModel,
77
+ CLIPProcessor,
78
+ CLIPTokenizerFast,
79
+ HfArgumentParser,
80
+ TrainingArguments,
81
+ is_tensorboard_available,
82
+ IntervalStrategy
83
+
84
+ )
85
+ from transformers.testing_utils import CaptureLogger
86
+
87
+ from importlib.util import find_spec
88
+
89
+ logger = logging.getLogger(__name__)
90
+
91
+
92
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    # Checkpoint to initialise from; leave unset to train a model from scratch.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization."
            "Don't set if you want to train a model from scratch."
        },
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from s3"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    # One of float32 / float16 / bfloat16; resolved later via getattr(jnp, dtype).
    dtype: Optional[str] = field(
        default="float32",
        metadata={
            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
        },
    )
    save_optimizer: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to store full train state including optimizer."},
    )
    repo_path_or_name: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the modelhub repo directory"},
    )
132
+
133
+
134
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # NOTE(review): `train_file`, `validation_file` and `overwrite_cache` were each
    # declared twice in the original; at runtime the last declaration silently won,
    # so only that declaration is kept here (field order is unchanged).
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    data_dir: Optional[str] = field(
        default=None, metadata={"help": "Path to local folder containing data files."}
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    text_column_name: Optional[str] = field(
        default='text',
        metadata={"help": "Column containing main text data."},
    )
    augment_images: Optional[bool] = field(
        default=True,
        metadata={"help": "Augment input training images"},
    )
    # Fixed copy-paste error: this flag concerns captions, not images.
    augment_captions: Optional[bool] = field(
        default=True,
        metadata={"help": "Augment input training captions"},
    )
    captions_per_image: Optional[int] = field(
        default=5,
        metadata={"help": "Number of captions per image to use when creating train dataset."},
    )

    def __post_init__(self):
        """Validate that a data source was provided and that file extensions are supported.

        Raises:
            ValueError: if neither a dataset name nor any data file was given.
            AssertionError: if a provided file has an unsupported extension.
        """
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        if self.train_file is not None:
            extension = self.train_file.split(".")[-1]
            assert extension in ["csv", "json", "txt", "jsonl"], "`train_file` should be a csv, a json, a jsonl or a txt file."
        if self.validation_file is not None:
            extension = self.validation_file.split(".")[-1]
            assert extension in ["csv", "json", "txt", "jsonl"], "`validation_file` should be a csv, a json, a jsonl or a txt file."
223
+
224
+
225
@dataclass
class ImageAugmentationArguments:
    """
    Arguments for image augmentations configuration
    """

    # Probabilities in [0, 1] passed straight to torchvision's random flip transforms.
    random_horizontal_flip: Optional[float] = field(
        default=0.5,
        metadata={"help": "Probability of applying random horizontal flip"},
    )
    # Fixed typo in help text: "vartical" -> "vertical".
    random_vertical_flip: Optional[float] = field(
        default=0.5,
        metadata={"help": "Probability of applying random vertical flip"},
    )
238
+
239
# We use torchvision for faster image pre-processing.
# We need to ensure faster processing speed as it can become a bottleneck on TPU
class Transform(torch.nn.Module):
    """Image preprocessing pipeline for CLIP-style training.

    When ``augment_images`` is True, random augmentations (crop, jitter, flips,
    resized crop) are applied before the deterministic tail; otherwise only the
    deterministic resize/crop/normalize tail runs (used for evaluation).

    Args:
        image_size: target side length expected by the vision encoder.
        augment_images: whether to prepend the random augmentation stages.
        augmentation_args: provides flip probabilities
            (``random_horizontal_flip`` / ``random_vertical_flip``).
    """

    def __init__(self, image_size, augment_images, augmentation_args):
        super().__init__()
        # Deterministic tail shared by the train and eval pipelines
        # (was duplicated in both branches of the original).
        common = [
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            # Normalization constants match the pretrained CLIP preprocessing
            # used elsewhere in this script.
            Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ]
        if augment_images:
            crop_size = int(image_size * 0.8)
            augmentations = [
                RandomCrop(crop_size),
                ColorJitter(),
                RandomHorizontalFlip(augmentation_args.random_horizontal_flip),
                RandomVerticalFlip(augmentation_args.random_vertical_flip),
                RandomResizedCrop(crop_size, scale=(0.8, 1.2), ratio=(1.0, 1.0)),
            ]
        else:
            augmentations = []
        self.transforms = torch.nn.Sequential(*augmentations, *common)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pure preprocessing: gradients are never needed here.
        with torch.no_grad():
            x = self.transforms(x)
        return x
271
+
272
class ImageTextDataset(VisionDataset):
    """
    Dataset for loading image-text data for tasks like CLIP training, Image Captioning.

    Args:
        file_path: (string): Path to the file containing the image_paths and associated captions.
            The expected format is jsonlines where each line is a json object containing two keys:
            `image_path`: The path to the image.
            `captions`: An `array` of captions.
        split: (string): Split name; currently unused, kept for API symmetry with other datasets.
        captions_per_image: (int): At most this many captions are kept per image.
        augment_captions: (bool): NOTE(review): currently has no effect — the original
            computed a "textaug_" filename prefix from it that was never used; that dead
            code was removed but the parameter is kept so call sites remain unchanged.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.ToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.
    """

    def __init__(
        self,
        file_path: str,
        split: str,
        captions_per_image: int = 5,
        augment_captions: bool = True,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
    ):
        super().__init__(None, transforms, transform, target_transform)

        with open(file_path, "r") as f:
            examples = [json.loads(line) for line in f.readlines()]

        # Flatten to two parallel lists: one (image_path, caption) pair per sample.
        self.captions = []
        self.image_paths = []

        for example in examples:
            kept = example["captions"][:captions_per_image]
            self.captions.extend(kept)
            # BUGFIX: replicate the path once per *kept* caption, not a fixed
            # `captions_per_image` times — an example with fewer captions than
            # `captions_per_image` previously desynchronized the two lists.
            self.image_paths.extend([example["image_path"]] * len(kept))

    def _load_image(self, idx: int):
        # Decode straight to an RGB tensor (grayscale/alpha images are converted).
        path = f"{self.image_paths[idx]}"
        return read_image(path, mode=ImageReadMode.RGB)

    def _load_target(self, idx):
        return self.captions[idx]

    def __getitem__(self, index: int):
        image = self._load_image(index)
        target = self._load_target(index)

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def __len__(self) -> int:
        # One sample per (image, caption) pair.
        return len(self.captions)
334
+
335
+
336
class TrainState(train_state.TrainState):
    """Flax train state extended with a dropout PRNG key."""

    # Per-host dropout PRNG key; sharded per-device by replicate() below.
    dropout_rng: jnp.ndarray

    def replicate(self):
        # Replicate params/opt_state across local devices, but give each device
        # its own distinct dropout key so dropout masks differ per device.
        return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
341
+
342
+
343
def write_train_metric(summary_writer, train_metrics, train_time, step):
    """Log accumulated training metrics (and cumulative train wall time) to TensorBoard."""
    summary_writer.scalar("train_time", train_time, step)

    # Stack the list of per-step metric dicts into one dict of arrays, then
    # write each value back at the step it was originally recorded.
    stacked = get_metrics(train_metrics)
    for name, values in stacked.items():
        first_step = step - len(values) + 1
        for offset, value in enumerate(values):
            summary_writer.scalar(f"train_{name}", value, first_step + offset)
351
+
352
+
353
def write_eval_metric(summary_writer, eval_metrics, step):
    """Log each evaluation metric to TensorBoard under an ``eval_`` prefix."""
    for name in eval_metrics:
        summary_writer.scalar(f"eval_{name}", eval_metrics[name], step)
356
+
357
+
358
def create_learning_rate_fn(
    train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
) -> Callable[[int], jnp.array]:
    """Returns a linear warmup, linear_decay learning rate function."""
    total_steps = (train_ds_size // train_batch_size) * num_train_epochs
    # Ramp from 0 up to the peak learning rate over the warmup steps...
    warmup = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
    # ...then decay linearly back to 0 over the remaining training steps.
    decay = optax.linear_schedule(
        init_value=learning_rate, end_value=0, transition_steps=total_steps - num_warmup_steps
    )
    return optax.join_schedules(schedules=[warmup, decay], boundaries=[num_warmup_steps])
370
+
371
+ # utils
372
def mb_item(x):
    """Return ``x.item()`` for scalar arrays/tensors, otherwise ``x`` unchanged ("maybe item")."""
    if hasattr(x, "item"):
        return x.item()
    return x
374
+
375
def make_batch(samples):
    """Convert a dict of array-likes into a dict of ``jnp`` device arrays."""
    return {key: jnp.array(value) for key, value in samples.items()}
378
+
379
+ #checkpoint functions
380
+ # def save_checkpoint(model, save_dir, state, with_opt:bool=True, push_to_hub:bool=False):
381
+ # state = jax_utils.unreplicate(state)
382
+ # logger.info(f"SAVING CHECKPOINT IN {save_dir}...")
383
+ # save_dir = f"{save_dir}/ckpt-{mb_item(state.step)-1}"
384
+ # model.save_pretrained(
385
+ # save_dir,
386
+ # params=state.params,
387
+ # push_to_hub=push_to_hub,
388
+ # commit_message=f"Saving weights and logs at step {mb_item(state.step)-1}",
389
+ # )
390
+ # if with_opt:
391
+ # with open(os.path.join(save_dir, "opt_state.msgpack"), "wb") as f:
392
+ # f.write(to_bytes(state.opt_state))
393
+ # with open(os.path.join(save_dir, "training_state.json"), "w") as f:
394
+ # json.dump({"step": state.step.item()}, f)
395
+ # logger.info("checkpoint saved")
396
+
397
+ # def restore_checkpoint(save_dir, state):
398
+ # logger.info(f"RESTORING CHECKPOINT FROM {save_dir}...")
399
+ # with open(os.path.join(save_dir, "flax_model.msgpack"), "rb") as f:
400
+ # params = from_bytes(state.params, f.read())
401
+
402
+ # with open(os.path.join(save_dir, "opt_state.msgpack"), "rb") as f:
403
+ # opt_state = from_bytes(state.opt_state, f.read())
404
+
405
+ # with open(os.path.join(save_dir, "training_state.json"), "r") as f:
406
+ # training_state = json.load(f)
407
+ # step = training_state["step"]
408
+
409
+ # logger.info("checkpoint restored")
410
+ # return state.replace(step=step, params=params, opt_state=opt_state), step
411
+
412
def rotate_checkpoints(ckpt_dir: str, save_total_limit: int):
    """Removes older checkpoints so that `save_total_limit` checkpoints are kept.

    Checkpoints are directories named ``ckpt-<step>`` inside ``ckpt_dir``; the
    ones with the smallest step numbers are deleted first.
    """
    # Guard added: a None or non-positive limit means "keep everything".
    # Previously None raised a TypeError in the slice below.
    if save_total_limit is None or save_total_limit <= 0:
        return
    # TODO: what to remove is decided using step number only, we might want to improve that
    ckpts = [str(x) for x in Path(ckpt_dir).glob("ckpt-*")]
    # sort checkpoints by step (the integer after the last '-')
    ckpts_sorted = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))
    for ckpt in ckpts_sorted[:-save_total_limit]:
        logger.info(f"Deleting older checkpoint [{ckpt}] due to save_total_limit ({save_total_limit})")
        shutil.rmtree(ckpt)
422
+
423
+ def main():
424
+ # See all possible arguments in src/transformers/training_args.py
425
+ # or by passing the --help flag to this script.
426
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
427
+
428
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, ImageAugmentationArguments))
429
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
430
+ # If we pass only one argument to the script and it's the path to a json file,
431
+ # let's parse it to get our arguments.
432
+ model_args, data_args, training_args, augmentation_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
433
+ else:
434
+ model_args, data_args, training_args, augmentation_args = parser.parse_args_into_dataclasses()
435
+
436
+ if (
437
+ os.path.exists(training_args.output_dir)
438
+ and os.listdir(training_args.output_dir)
439
+ and training_args.do_train
440
+ and not training_args.overwrite_output_dir
441
+ ):
442
+ raise ValueError(
443
+ f"Output directory ({training_args.output_dir}) already exists and is not empty."
444
+ "Use --overwrite_output_dir to overcome."
445
+ )
446
+
447
+ # Make one log on every process with the configuration for debugging.
448
+ logging.basicConfig(
449
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
450
+ datefmt="%m/%d/%Y %H:%M:%S",
451
+ level=logging.INFO,
452
+ )
453
+ # Setup logging, we only want one process per machine to log things on the screen.
454
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
455
+ if jax.process_index() == 0:
456
+ datasets.utils.logging.set_verbosity_warning()
457
+ transformers.utils.logging.set_verbosity_info()
458
+ else:
459
+ datasets.utils.logging.set_verbosity_error()
460
+ transformers.utils.logging.set_verbosity_error()
461
+
462
+ # Set the verbosity to info of the Transformers logger (on main process only):
463
+ logger.info(f"Training/evaluation parameters {training_args}")
464
+
465
+ # Load pretrained model and tokenizer
466
+
467
+ # Distributed training:
468
+ # The .from_pretrained methods guarantee that only one local process can concurrently
469
+ # download model & vocab.
470
+ if model_args.config_name:
471
+ config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
472
+ elif model_args.model_name_or_path:
473
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
474
+ else:
475
+ config = CONFIG_MAPPING[model_args.model_type]()
476
+ logger.warning("You are instantiating a new config instance from scratch.")
477
+
478
+ # if model_args.tokenizer_name:
479
+ # tokenizer = AutoTokenizer.from_pretrained(
480
+ # model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
481
+ # )
482
+ # elif model_args.model_name_or_path:
483
+ # tokenizer = AutoTokenizer.from_pretrained(
484
+ # model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
485
+ # )
486
+ # else:
487
+ # raise ValueError(
488
+ # "You are instantiating a new tokenizer from scratch. This is not supported by this script."
489
+ # "You can do it from another script, save it, and load it from here, using --tokenizer_name."
490
+ # )
491
+
492
+ processor = CLIPProcessor.from_pretrained(model_args.model_name_or_path)
493
+ # tokenizer = CLIPTokenizerFast.from_pretrained(model_args.model_name_or_path)
494
+ tokenizer = processor.tokenizer
495
+ if model_args.model_name_or_path:
496
+ model = FlaxCLIPModel.from_pretrained(
497
+ model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
498
+ )
499
+ else:
500
+ model = FlaxCLIPModel.from_config(
501
+ config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
502
+ )
503
+
504
+ config = model.config
505
+ # Initialize torchvision transforms and jit them for faster processing
506
+ # preprocess = Transform(config.vision_config.image_size)
507
+ preprocess = Transform(config.vision_config.image_size, data_args.augment_images, augmentation_args)
508
+ preprocess = torch.jit.script(preprocess)
509
+
510
+ eval_preprocess = Transform(config.vision_config.image_size, False, augmentation_args)
511
+ eval_preprocess = torch.jit.script(eval_preprocess)
512
+
513
+ # Initialize the image-text dataset
514
+ train_dataset = ImageTextDataset(
515
+ data_args.train_file,
516
+ "train",
517
+ captions_per_image=data_args.captions_per_image,
518
+ augment_captions=data_args.augment_captions,
519
+ transform=preprocess,
520
+ )
521
+
522
+ eval_dataset = ImageTextDataset(
523
+ data_args.validation_file,
524
+ "valid",
525
+ captions_per_image=1,
526
+ augment_captions=False,
527
+ transform=eval_preprocess,
528
+ )
529
+
530
+ # Enable tensorboard only on the master node
531
+ has_tensorboard = is_tensorboard_available()
532
+ if has_tensorboard and jax.process_index() == 0:
533
+ try:
534
+ from flax.metrics.tensorboard import SummaryWriter
535
+
536
+ summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
537
+ except ImportError as ie:
538
+ has_tensorboard = False
539
+ logger.warning(
540
+ f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
541
+ )
542
+ else:
543
+ logger.warning(
544
+ "Unable to display metrics through TensorBoard because the package is not installed: "
545
+ "Please run pip install tensorboard to enable."
546
+ )
547
+
548
+ # Use collate function to tokenizer the text and convert the processed images to numpy
549
+ def collate_fn(examples):
550
+ pixel_values = torch.stack([example[0] for example in examples]).numpy()
551
+ captions = [example[1] for example in examples]
552
+ inputs = tokenizer(captions, max_length=128, padding="max_length", return_tensors="np", truncation=True)
553
+
554
+ batch = {
555
+ "pixel_values": pixel_values,
556
+ "input_ids": inputs["input_ids"],
557
+ "attention_mask": inputs["attention_mask"],
558
+ }
559
+
560
+ return batch
561
+
562
+ # Store some constant
563
+ num_epochs = int(training_args.num_train_epochs)
564
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() * training_args.gradient_accumulation_steps
565
+ eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
566
+ steps_per_epoch = len(train_dataset) // train_batch_size
567
+ total_train_steps = steps_per_epoch * num_epochs
568
+
569
+ # Create data loaders
570
+ train_loader = torch.utils.data.DataLoader(
571
+ train_dataset,
572
+ batch_size=train_batch_size,
573
+ shuffle=True,
574
+ num_workers=data_args.preprocessing_num_workers,
575
+ persistent_workers=True,
576
+ drop_last=True,
577
+ collate_fn=collate_fn,
578
+ )
579
+
580
+ eval_loader = torch.utils.data.DataLoader(
581
+ eval_dataset,
582
+ batch_size=eval_batch_size,
583
+ shuffle=False,
584
+ num_workers=data_args.preprocessing_num_workers,
585
+ persistent_workers=True,
586
+ drop_last=True,
587
+ collate_fn=collate_fn,
588
+ )
589
+
590
+ # enable wandb tracking
591
+ has_wandb = find_spec("wandb") is not None
592
+ if jax.process_index() == 0 and has_wandb and ("wandb" in training_args.report_to):
593
+ try:
594
+ import wandb
595
+ if training_args.run_name is None:
596
+ run_name = training_args.output_dir.split("/")[-1]
597
+ else:
598
+ run_name = training_args.run_name
599
+ wandb.init(
600
+ name=run_name,
601
+ project="clip-flax",
602
+ sync_tensorboard=True
603
+ )
604
+ wandb.config.update(training_args)
605
+ wandb.config.update(model_args)
606
+ wandb.config.update(data_args)
607
+ except ImportError as e:
608
+ print(e)
609
+ has_wandb = False
610
+
611
+ # Initialize our training
612
+ rng = jax.random.PRNGKey(training_args.seed)
613
+ rng, dropout_rng = jax.random.split(rng)
614
+
615
+ # Create learning rate schedule
616
+ linear_decay_lr_schedule_fn = create_learning_rate_fn(
617
+ len(train_dataset),
618
+ train_batch_size,
619
+ training_args.num_train_epochs,
620
+ training_args.warmup_steps,
621
+ training_args.learning_rate,
622
+ )
623
+
624
+ # We use Optax's "masking" functionality to not apply weight decay
625
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
626
+ # mask boolean with the same structure as the parameters.
627
+ # The mask is True for parameters that should be decayed.
628
+ # Note that this mask is specifically adapted for FlaxGPT2.
629
+ # For other models, one should correct the layer norm parameter naming
630
+ # accordingly.
631
+ def decay_mask_fn(params):
632
+ flat_params = traverse_util.flatten_dict(params)
633
+ flat_mask = {
634
+ path: (path[-1] != "bias" and path[-2:] not in [("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale")])
635
+ for path in flat_params
636
+ }
637
+ return traverse_util.unflatten_dict(flat_mask)
638
+
639
+ # create optimizer
640
+ if training_args.adafactor:
641
+ # We use the default parameters here to initialize adafactor,
642
+ # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
643
+ optimizer = optax.adafactor(
644
+ learning_rate=linear_decay_lr_schedule_fn,
645
+ )
646
+ else:
647
+ optimizer = optax.adamw(
648
+ learning_rate=linear_decay_lr_schedule_fn,
649
+ b1=training_args.adam_beta1,
650
+ b2=training_args.adam_beta2,
651
+ eps=training_args.adam_epsilon,
652
+ weight_decay=training_args.weight_decay,
653
+ mask=decay_mask_fn,
654
+ )
655
+ optimizer = optax.chain(
656
+ optax.clip_by_global_norm(1.),
657
+ optimizer
658
+ )
659
+ if training_args.gradient_accumulation_steps > 1:
660
+ optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
661
+ grad_accum_steps = training_args.gradient_accumulation_steps
662
+
663
+ # Setup train state
664
+ state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer, dropout_rng=dropout_rng)
665
+
666
+ if training_args.resume_from_checkpoint:
667
+ state = restore_checkpoint(training_args.resume_from_checkpoint, state)
668
+ resume_step = mb_item(state.step)
669
+ else:
670
+ resume_step = 0
671
+
672
+ def cross_entropy(logits, axis):
673
+ logprobs = jax.nn.log_softmax(logits, axis=axis)
674
+ nll = jnp.diag(logprobs)
675
+ ce = -jnp.mean(nll)
676
+ return ce
677
+
678
+ def clip_loss(similarity):
679
+ loss = (cross_entropy(similarity, axis=0) + cross_entropy(similarity, axis=1)) / 2
680
+ return loss
681
+
682
+ # Define gradient update step fn
683
+ def train_step(state, batch):
684
+ dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
685
+
686
+ def compute_loss(params):
687
+ logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
688
+ loss = clip_loss(logits)
689
+ return loss
690
+
691
+ grad_fn = jax.value_and_grad(compute_loss)
692
+ loss, grad = grad_fn(state.params)
693
+ grad = jax.lax.pmean(grad, "batch")
694
+
695
+ new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
696
+
697
+ metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step // grad_accum_steps)}
698
+ metrics = jax.lax.pmean(metrics, axis_name="batch")
699
+
700
+ return new_state, metrics
701
+
702
+ # Define eval fn
703
+ def eval_step(params, batch):
704
+ logits = model(**batch, params=params, train=False)[0]
705
+ loss = clip_loss(logits)
706
+
707
+ # summarize metrics
708
+ metrics = {"loss": loss}
709
+ metrics = jax.lax.pmean(metrics, axis_name="batch")
710
+ return metrics
711
+
712
+ # Create parallel version of the train and eval step
713
+ p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
714
+ p_eval_step = jax.pmap(eval_step, "batch")
715
+
716
+ # Replicate the train state on each device
717
+ state = state.replicate()
718
+
719
+ logger.info("***** Running training *****")
720
+ logger.info(f" Num examples = {len(train_dataset)}")
721
+ logger.info(f" Num Epochs = {num_epochs}")
722
+ logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
723
+ logger.info(f" Total train batch size (w. parallel, distributed and grad_accum) = {train_batch_size}")
724
+ logger.info(f" Total optimization steps = {total_train_steps}")
725
+
726
+ if not training_args.skip_memory_metrics:
727
+ server = jax.profiler.start_server(9999)
728
+
729
+ train_time = 0
730
+ train_metrics = []
731
+ epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
732
+ for epoch in epochs:
733
+ # ======================== Training ================================
734
+ train_start = time.time()
735
+
736
+ # Create sampling rng
737
+ rng, input_rng = jax.random.split(rng)
738
+
739
+ # Generate an epoch by shuffling sampling indices from the train dataset
740
+ steps_per_epoch = len(train_dataset) // train_batch_size
741
+ # train
742
+ steps_trained_progress_bar = tqdm(range(steps_per_epoch), desc="Training...", position=1,
743
+ leave=False, initial=(resume_step // grad_accum_steps))
744
+ for step, batch in enumerate(train_loader):
745
+ cur_step = epoch * (len(train_dataset) // train_batch_size) + step
746
+ # skip to the step from which we are resuming
747
+ if cur_step < resume_step:
748
+ continue
749
+
750
+ batch = shard(make_batch(batch))
751
+
752
+ state, train_metric = p_train_step(state, batch)
753
+ train_metrics.append(train_metric)
754
+ if step % grad_accum_steps == 0:
755
+ steps_trained_progress_bar.update(1)
756
+
757
+ if cur_step % (training_args.logging_steps * grad_accum_steps)== 0 and cur_step > 0:
758
+ # Save metrics
759
+ train_metric = unreplicate(train_metric)
760
+ train_time += time.time() - train_start
761
+ if has_tensorboard and jax.process_index() == 0:
762
+ write_train_metric(summary_writer, train_metrics, train_time, cur_step)
763
+ if has_wandb and jax.process_index() == 0 and ("wandb" in training_args.report_to):
764
+ # TODO: add accumulation of metrics
765
+ _metrics = {k if k=="learning_rate" else f"train_{k}":mb_item(v.mean()) for k, v in train_metric.items()}
766
+ wandb.log({"training_step":cur_step, **_metrics}, commit=True)
767
+
768
+ epochs.write(
769
+ f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
770
+ )
771
+
772
+ train_metrics = []
773
+
774
+ # if (cur_step % (training_args.eval_steps * grad_accum_steps) == 0 and
775
+ # cur_step > 0 and
776
+ # model_args.eval_strategy == "steps"):
777
+ # # ======================== Evaluating ==============================
778
+ # eval_metrics = []
779
+ # eval_steps = len(eval_dataset) // eval_batch_size
780
+ # eval_iter = iter(eval_loader)
781
+ # for batch in tqdm(eval_loader, desc="Evaluating...", position=2, leave=False):
782
+ # # Model forward
783
+ # batch = shard(make_batch(batch))
784
+ # metrics = p_eval_step(state.params, batch)
785
+ # eval_metrics.append(metrics)
786
+
787
+ # # normalize eval metrics
788
+ # eval_metrics = get_metrics(eval_metrics)
789
+ # eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
790
+
791
+ # # Print metrics and update progress bar
792
+ # desc = f"Step... ({cur_step} | Eval Loss: {eval_metrics['loss']})"
793
+ # epochs.write(desc)
794
+ # epochs.desc = desc
795
+
796
+ # # Save metrics
797
+ # if has_tensorboard and jax.process_index() == 0:
798
+ # # cur_step = epoch * (len(train_dataset) // train_batch_size)
799
+ # write_eval_metric(summary_writer, eval_metrics, cur_step)
800
+ # if has_wandb and jax.process_index() == 0 and ("wandb" in training_args.report_to):
801
+ # _metrics = {f"eval_{k}":mb_item(v) for k, v in eval_metrics.items()}
802
+ # wandb.log({"eval_step":cur_step, **_metrics})
803
+
804
+ # we can add an argument to select eval strategy; for now its done every epoch
805
+ if True:
806
+ # ======================== Evaluating ==============================
807
+ eval_metrics = []
808
+ eval_steps = len(eval_dataset) // eval_batch_size
809
+ for batch in tqdm(eval_loader, desc="Evaluating...", position=2, leave=False):
810
+ # Model forward
811
+ batch = shard(make_batch(batch))
812
+ metrics = p_eval_step(state.params, batch)
813
+ eval_metrics.append(metrics)
814
+
815
+ # normalize eval metrics
816
+ eval_metrics = get_metrics(eval_metrics)
817
+ eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
818
+
819
+ # Print metrics and update progress bar
820
+ desc = f"Step... ({cur_step} | Eval Loss: {eval_metrics['loss']})"
821
+ epochs.write(desc)
822
+ epochs.desc = desc
823
+
824
+ # Save metrics
825
+ if has_tensorboard and jax.process_index() == 0:
826
+ # cur_step = epoch * (len(train_dataset) // train_batch_size)
827
+ write_eval_metric(summary_writer, eval_metrics, cur_step)
828
+ if has_wandb and jax.process_index() == 0 and ("wandb" in training_args.report_to):
829
+ _metrics = {f"eval_{k}":mb_item(v) for k, v in eval_metrics.items()}
830
+ wandb.log({"eval_step":cur_step, **_metrics})
831
+
832
+ # save checkpoint after each epoch
833
+ if jax.process_index() == 0 and training_args.save_strategy == IntervalStrategy.EPOCH:
834
+ save_dir = f"{training_args.output_dir}/ckpt-{epoch}"
835
+ model.save_pretrained(
836
+ save_dir,
837
+ params=unreplicate(state.params),
838
+ push_to_hub=False, # training_args.push_to_hub, # we don't push intermediate steps
839
+ commit_message=f"Saving weights and logs at epoch {epoch}",
840
+ repo_name_or_path=training_args.output_dir
841
+ )
842
+ if model_args.save_optimizer:
843
+ save_checkpoint(training_args.output_dir, unreplicate(state), cur_step, keep=training_args.save_total_limit, overwrite=True)
844
+ if training_args.save_total_limit is not None:
845
+ rotate_checkpoints(training_args.output_dir, training_args.save_total_limit)
846
+
847
+ # save model after training is over
848
+ model.save_pretrained(
849
+ training_args.output_dir,
850
+ params=unreplicate(state.params),
851
+ push_to_hub=training_args.push_to_hub,
852
+ commit_message=f"Saving weights and logs at step {cur_step}",
853
+ repo_name_or_path=training_args.output_dir
854
+ )
855
+
856
+
857
+
858
+
859
+ if __name__ == "__main__":
860
+ main()
configuration_hybrid_clip.py ADDED
@@ -0,0 +1 @@
 
 
1
+ /home/ceyda/code/transformers/examples/research_projects/jax-projects/hybrid_clip/configuration_hybrid_clip.py
convert_model.ipynb ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "source": [
7
+ "from modeling_hybrid_clip import FlaxHybridCLIP\n",
8
+ "model = FlaxHybridCLIP.from_pretrained(\"output\")\n",
9
+ "# model.save_pretrained(\"./\")"
10
+ ],
11
+ "outputs": [],
12
+ "metadata": {}
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 36,
17
+ "source": [
18
+ "from transformers.modeling_flax_pytorch_utils import load_flax_checkpoint_in_pytorch_model\n",
19
+ "from modeling_hybrid_clip import FlaxHybridCLIP\n",
20
+ "\n",
21
+ "model = FlaxHybridCLIP.from_pretrained(\"output\")\n"
22
+ ],
23
+ "outputs": [],
24
+ "metadata": {}
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "source": [
30
+ "load_flax_checkpoint_in_pytorch_model(model, \"./flax_model.msgpack\")\n",
31
+ "model.save_pretrained(\"./\")"
32
+ ],
33
+ "outputs": [],
34
+ "metadata": {}
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 10,
39
+ "source": [
40
+ "model.save_pretrained(\"./test\",)"
41
+ ],
42
+ "outputs": [
43
+ {
44
+ "output_type": "stream",
45
+ "name": "stderr",
46
+ "text": [
47
+ "tcmalloc: large alloc 1284136960 bytes == 0x133cb8000 @ 0x7f0ad1bd7680 0x7f0ad1bf7bdd 0x7f07e8e4420d 0x7f07e8e52340 0x7f07e8e51e87 0x7f07e8e51e87 0x7f07e8e51e87 0x7f07e8e51e87 0x7f07e8e51e87 0x7f07e8e51e87 0x7f07e8e51e87 0x7f07e8e51e87 0x7f07e8e4dbd3 0x7f07e8e4e1fe 0x504d56 0x56acb6 0x568d9a 0x5f5b33 0x56bc9b 0x5f5956 0x56aadf 0x5f5956 0x56aadf 0x568d9a 0x5f5b33 0x56acb6 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf\n"
48
+ ]
49
+ }
50
+ ],
51
+ "metadata": {}
52
+ }
53
+ ],
54
+ "metadata": {
55
+ "orig_nbformat": 4,
56
+ "language_info": {
57
+ "name": "python"
58
+ }
59
+ },
60
+ "nbformat": 4,
61
+ "nbformat_minor": 2
62
+ }
modeling_hybrid_clip.py ADDED
@@ -0,0 +1 @@
 
 
1
+ /home/ceyda/code/transformers/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
out_clip/ckpt-0/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-0/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b086f40f91383efabb1bd36e97ee5be264158e20e2571e78c9bfc70bfbe650e1
3
+ size 605123003
out_clip/ckpt-1/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-1/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6270ee59f0cdb4fe8da15c5c27d6deb68bcc3877c3fd6035ca6815261b806a81
3
+ size 605123003
out_clip/ckpt-2/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-2/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9752386dc3af0b4dba81f0d035a71705b92f898cb61cb778f90058086f82259
3
+ size 605123003
out_clip/ckpt-3/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-3/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5baa7d16fbefba029a2fa1739774cd07c51e8a1c734678151091ccce7a2a82ed
3
+ size 605123003
out_clip/ckpt-4/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-4/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:060711623aafe04d58e38ac99971cd68c6af6113bc88edfb446b3bd8cf8836b7
3
+ size 605123003
out_clip/ckpt-5/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-5/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba70f201cba6493dfda1a0e203927577a3b82ff011aebed1594908d7a7f7246
3
+ size 605123003
out_clip/ckpt-6/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-6/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39dcaec6f0d4e974a180679acf47ec18872ac50efe8b752d5614def01d45a278
3
+ size 605123003
out_clip/ckpt-7/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-7/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81aa037f9aad91717276034eb29e7fa31d0442ce4703b865306c8ee19028337f
3
+ size 605123003
out_clip/ckpt-8/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-8/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f3da1a61c4c62ed02d05bab267a160191f720a4a58c6d756419d2c2d630d29a
3
+ size 605123003
out_clip/ckpt-9/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/ckpt-9/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18340fa3ccbdec25f09d7fbeaa55f997cf1b9a781369304e8e9165cdc895ed46
3
+ size 605123003
out_clip/config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "clip",
7
+ "projection_dim": 512,
8
+ "text_config": {
9
+ "_name_or_path": "",
10
+ "add_cross_attention": false,
11
+ "architectures": null,
12
+ "attention_dropout": 0.0,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 0,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "dropout": 0.0,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": 2,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "quick_gelu",
28
+ "hidden_size": 512,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_factor": 1.0,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 2048,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "length_penalty": 1.0,
44
+ "max_length": 20,
45
+ "max_position_embeddings": 77,
46
+ "min_length": 0,
47
+ "model_type": "clip_text_model",
48
+ "no_repeat_ngram_size": 0,
49
+ "num_attention_heads": 8,
50
+ "num_beam_groups": 1,
51
+ "num_beams": 1,
52
+ "num_hidden_layers": 12,
53
+ "num_return_sequences": 1,
54
+ "output_attentions": false,
55
+ "output_hidden_states": false,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "prefix": null,
59
+ "problem_type": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torch_dtype": null,
74
+ "torchscript": false,
75
+ "transformers_version": "4.9.0.dev0",
76
+ "use_bfloat16": false,
77
+ "vocab_size": 49408
78
+ },
79
+ "text_config_dict": null,
80
+ "transformers_version": null,
81
+ "vision_config": {
82
+ "_name_or_path": "",
83
+ "add_cross_attention": false,
84
+ "architectures": null,
85
+ "attention_dropout": 0.0,
86
+ "bad_words_ids": null,
87
+ "bos_token_id": null,
88
+ "chunk_size_feed_forward": 0,
89
+ "decoder_start_token_id": null,
90
+ "diversity_penalty": 0.0,
91
+ "do_sample": false,
92
+ "dropout": 0.0,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "quick_gelu",
101
+ "hidden_size": 768,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1"
105
+ },
106
+ "image_size": 224,
107
+ "initializer_factor": 1.0,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-05,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "min_length": 0,
120
+ "model_type": "clip_vision_model",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": null,
131
+ "patch_size": 32,
132
+ "prefix": null,
133
+ "problem_type": null,
134
+ "pruned_heads": {},
135
+ "remove_invalid_values": false,
136
+ "repetition_penalty": 1.0,
137
+ "return_dict": true,
138
+ "return_dict_in_generate": false,
139
+ "sep_token_id": null,
140
+ "task_specific_params": null,
141
+ "temperature": 1.0,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "transformers_version": "4.9.0.dev0",
150
+ "use_bfloat16": false
151
+ },
152
+ "vision_config_dict": null
153
+ }
out_clip/events.out.tfevents.1626479984.t1v-n-04cac4e5-w-0.3664172.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b279516a6941942fb1174d1871239727e87de10d032a729e3fe576b72d1b6fa
3
+ size 40
out_clip/events.out.tfevents.1626480047.t1v-n-04cac4e5-w-0.3665107.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58a13f56871873489a4b01e6403c7f060ac118121415df97f185dc7ed02468f0
3
+ size 40
out_clip/events.out.tfevents.1626480149.t1v-n-04cac4e5-w-0.3667807.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12238548366917b9e94898297ba0d85fc21d02ed8faceb2acd19c7725a60ccce
3
+ size 1076476
out_clip/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18340fa3ccbdec25f09d7fbeaa55f997cf1b9a781369304e8e9165cdc895ed46
3
+ size 605123003
test/config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "HybridCLIP"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "hybrid-clip",
7
+ "projection_dim": 512,
8
+ "seed": 42,
9
+ "text_config": {
10
+ "_name_or_path": "tweeteval_new/roberta-base-rt-emoji/",
11
+ "add_cross_attention": false,
12
+ "architectures": [
13
+ "RobertaForSequenceClassification"
14
+ ],
15
+ "attention_probs_dropout_prob": 0.1,
16
+ "bad_words_ids": null,
17
+ "bos_token_id": 0,
18
+ "chunk_size_feed_forward": 0,
19
+ "decoder_start_token_id": null,
20
+ "diversity_penalty": 0.0,
21
+ "do_sample": false,
22
+ "early_stopping": false,
23
+ "encoder_no_repeat_ngram_size": 0,
24
+ "eos_token_id": 2,
25
+ "finetuning_task": null,
26
+ "forced_bos_token_id": null,
27
+ "forced_eos_token_id": null,
28
+ "gradient_checkpointing": false,
29
+ "hidden_act": "gelu",
30
+ "hidden_dropout_prob": 0.1,
31
+ "hidden_size": 768,
32
+ "id2label": {
33
+ "0": "LABEL_0",
34
+ "1": "LABEL_1",
35
+ "2": "LABEL_2",
36
+ "3": "LABEL_3",
37
+ "4": "LABEL_4",
38
+ "5": "LABEL_5",
39
+ "6": "LABEL_6",
40
+ "7": "LABEL_7",
41
+ "8": "LABEL_8",
42
+ "9": "LABEL_9",
43
+ "10": "LABEL_10",
44
+ "11": "LABEL_11",
45
+ "12": "LABEL_12",
46
+ "13": "LABEL_13",
47
+ "14": "LABEL_14",
48
+ "15": "LABEL_15",
49
+ "16": "LABEL_16",
50
+ "17": "LABEL_17",
51
+ "18": "LABEL_18",
52
+ "19": "LABEL_19"
53
+ },
54
+ "initializer_range": 0.02,
55
+ "intermediate_size": 3072,
56
+ "is_decoder": false,
57
+ "is_encoder_decoder": false,
58
+ "label2id": {
59
+ "LABEL_0": 0,
60
+ "LABEL_1": 1,
61
+ "LABEL_10": 10,
62
+ "LABEL_11": 11,
63
+ "LABEL_12": 12,
64
+ "LABEL_13": 13,
65
+ "LABEL_14": 14,
66
+ "LABEL_15": 15,
67
+ "LABEL_16": 16,
68
+ "LABEL_17": 17,
69
+ "LABEL_18": 18,
70
+ "LABEL_19": 19,
71
+ "LABEL_2": 2,
72
+ "LABEL_3": 3,
73
+ "LABEL_4": 4,
74
+ "LABEL_5": 5,
75
+ "LABEL_6": 6,
76
+ "LABEL_7": 7,
77
+ "LABEL_8": 8,
78
+ "LABEL_9": 9
79
+ },
80
+ "layer_norm_eps": 1e-05,
81
+ "length_penalty": 1.0,
82
+ "max_length": 20,
83
+ "max_position_embeddings": 514,
84
+ "min_length": 0,
85
+ "model_type": "roberta",
86
+ "no_repeat_ngram_size": 0,
87
+ "num_attention_heads": 12,
88
+ "num_beam_groups": 1,
89
+ "num_beams": 1,
90
+ "num_hidden_layers": 12,
91
+ "num_return_sequences": 1,
92
+ "output_attentions": false,
93
+ "output_hidden_states": false,
94
+ "output_scores": false,
95
+ "pad_token_id": 1,
96
+ "position_embedding_type": "absolute",
97
+ "prefix": null,
98
+ "problem_type": null,
99
+ "pruned_heads": {},
100
+ "remove_invalid_values": false,
101
+ "repetition_penalty": 1.0,
102
+ "return_dict": true,
103
+ "return_dict_in_generate": false,
104
+ "sep_token_id": null,
105
+ "task_specific_params": null,
106
+ "temperature": 1.0,
107
+ "tie_encoder_decoder": false,
108
+ "tie_word_embeddings": true,
109
+ "tokenizer_class": null,
110
+ "top_k": 50,
111
+ "top_p": 1.0,
112
+ "torch_dtype": null,
113
+ "torchscript": false,
114
+ "transformers_version": "4.9.0.dev0",
115
+ "type_vocab_size": 1,
116
+ "use_bfloat16": false,
117
+ "use_cache": true,
118
+ "vocab_size": 50265
119
+ },
120
+ "transformers_version": null,
121
+ "vision_config": {
122
+ "_name_or_path": "",
123
+ "add_cross_attention": false,
124
+ "architectures": null,
125
+ "attention_dropout": 0.0,
126
+ "bad_words_ids": null,
127
+ "bos_token_id": null,
128
+ "chunk_size_feed_forward": 0,
129
+ "decoder_start_token_id": null,
130
+ "diversity_penalty": 0.0,
131
+ "do_sample": false,
132
+ "dropout": 0.0,
133
+ "early_stopping": false,
134
+ "encoder_no_repeat_ngram_size": 0,
135
+ "eos_token_id": null,
136
+ "finetuning_task": null,
137
+ "forced_bos_token_id": null,
138
+ "forced_eos_token_id": null,
139
+ "gradient_checkpointing": false,
140
+ "hidden_act": "quick_gelu",
141
+ "hidden_size": 768,
142
+ "id2label": {
143
+ "0": "LABEL_0",
144
+ "1": "LABEL_1"
145
+ },
146
+ "image_size": 224,
147
+ "initializer_factor": 1.0,
148
+ "initializer_range": 0.02,
149
+ "intermediate_size": 3072,
150
+ "is_decoder": false,
151
+ "is_encoder_decoder": false,
152
+ "label2id": {
153
+ "LABEL_0": 0,
154
+ "LABEL_1": 1
155
+ },
156
+ "layer_norm_eps": 1e-05,
157
+ "length_penalty": 1.0,
158
+ "max_length": 20,
159
+ "min_length": 0,
160
+ "model_type": "clip_vision_model",
161
+ "no_repeat_ngram_size": 0,
162
+ "num_attention_heads": 12,
163
+ "num_beam_groups": 1,
164
+ "num_beams": 1,
165
+ "num_hidden_layers": 12,
166
+ "num_return_sequences": 1,
167
+ "output_attentions": false,
168
+ "output_hidden_states": false,
169
+ "output_scores": false,
170
+ "pad_token_id": null,
171
+ "patch_size": 32,
172
+ "prefix": null,
173
+ "problem_type": null,
174
+ "pruned_heads": {},
175
+ "remove_invalid_values": false,
176
+ "repetition_penalty": 1.0,
177
+ "return_dict": true,
178
+ "return_dict_in_generate": false,
179
+ "sep_token_id": null,
180
+ "task_specific_params": null,
181
+ "temperature": 1.0,
182
+ "tie_encoder_decoder": false,
183
+ "tie_word_embeddings": true,
184
+ "tokenizer_class": null,
185
+ "top_k": 50,
186
+ "top_p": 1.0,
187
+ "torch_dtype": null,
188
+ "torchscript": false,
189
+ "transformers_version": "4.9.0.dev0",
190
+ "use_bfloat16": false
191
+ }
192
+ }
test/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2bc552b4bf159294bd53b247a56f90ff489b3323a8c544bd4e3ff5f918451ef
3
+ size 851566424
train_clip.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /bin/bash
2
+ MODEL_DIR=./out_clip
3
+ BASE_V_MODEL=openai/clip-vit-base-patch32
4
+ python clippy.py \
5
+ --output_dir ${MODEL_DIR} \
6
+ --model_name_or_path ${BASE_V_MODEL} \
7
+ --train_file="/home/ceyda/data/train.json" \
8
+ --validation_file="/home/ceyda/data/val.json" \
9
+ --do_train --do_eval \
10
+ --per_device_train_batch_size="32" \
11
+ --per_device_eval_batch_size="8" \
12
+ --preprocessing_num_workers="16" \
13
+ --learning_rate="1e-5" \
14
+ --adafactor false \
15
+ --warmup_steps="50" \
16
+ --adam_beta1="0.9" \
17
+ --adam_beta2="0.98" \
18
+ --weight_decay="0.1" \
19
+ --overwrite_output_dir \
20
+ --num_train_epochs 10 \
21
+ --logging_steps="2" \
22
+ --eval_steps="20" \
23
+ --push_to_hub="False" \
24
+ --dtype="bfloat16" \
25
+ --skip_memory_metrics="True" \
26
+ --save_steps="200" \
27
+ --save_total_limit 10 \
28
+ --gradient_accumulation_steps 1 \
29
+ --report_to all \
30
+ --save_strategy epoch \
31
+ --save_optimizer="False" \
32
+ --captions_per_image 1 \
33
+ --augment_images false \
34
+ --augment_captions false
35
+ # --run_name="test_run" \
36
+ # --resume_from_checkpoint $HOME/gpt-neo-125M-code-clippy/ckpt_201 \
37
+ # --max_train_samples 10240 \
38
+ # --max_eval_samples="1000"