Chiraag Anand committed on
Commit
263e7a9
·
1 Parent(s): 8aab37c
Q16Model.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import CLIPModel, PreTrainedModel, CLIPProcessor, AutoConfig
2
+ import torch
3
+ import pickle
4
+ from torch.nn.functional import cosine_similarity
5
+
6
+ CLIP_MODEL = "openai/clip-vit-large-patch14"
7
+
8
+
9
class Q16Model(PreTrainedModel):
    """CLIP-backed image classifier that scores images against learned soft prompts.

    Wraps a pretrained CLIP model, extracts image embeddings, and compares
    them to a fixed set of soft-prompt embeddings via cosine similarity.
    The similarities are returned directly as logits.
    """

    def __init__(self, config):
        super().__init__(config)
        # Backbone used only for image feature extraction.
        self.clip_model = CLIPModel.from_pretrained(CLIP_MODEL)
        # Populated later via load_soft_prompts(); forward() requires it.
        self.soft_prompts = None

    def load_soft_prompts(self, path):
        """Load pickled soft-prompt embeddings from *path* into self.soft_prompts.

        NOTE(review): pickle.load executes arbitrary code during
        deserialization — only load prompt files from trusted sources.
        """
        # Fix: use a context manager so the file handle is always closed
        # (the original called open() without ever closing the file).
        with open(path, 'rb') as f:
            prompts = pickle.load(f)
        # Round-trip through half precision (matching how the original code
        # interpreted the stored prompts), then promote to float32 on CPU.
        self.soft_prompts = torch.HalfTensor(prompts).to('cpu').to(torch.float32)

    def forward(self, pixel_values):
        """Return cosine similarities between each image and each soft prompt.

        Args:
            pixel_values: preprocessed image batch accepted by CLIP's vision
                tower (presumably (batch, 3, 224, 224) per the processor
                config — TODO confirm).

        Returns:
            Tensor of shape (batch, num_prompts) with cosine similarities,
            used directly as logits.
        """
        # Get image encodings from the CLIP model.
        image_features = self.clip_model.get_image_features(
            pixel_values=pixel_values)

        # Compare every image embedding with every soft prompt in one
        # broadcasted cosine-similarity call.
        logits = cosine_similarity(
            image_features.unsqueeze(1), self.soft_prompts.unsqueeze(0), dim=-1)
        return logits

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """Load model weights and the accompanying soft prompts from one location."""
        config = kwargs.pop("config", None)
        model = super(Q16Model, cls).from_pretrained(
            pretrained_model_name_or_path, config=config, *model_args, **kwargs)
        # The soft prompts live next to the weights as a pickle file.
        model.load_soft_prompts(f"{pretrained_model_name_or_path}/prompts.p")
        return model

    def save_pretrained(self, save_directory):
        """Save the model weights plus the soft-prompts pickle alongside them."""
        super().save_pretrained(save_directory)
        # Soft prompts are not registered parameters, so persist them separately.
        with open(f"{save_directory}/prompts.p", 'wb') as f:
            pickle.dump(self.soft_prompts.cpu().numpy(), f)
45
+
46
+
47
if __name__ == "__main__":
    # Derive the configuration from the base CLIP checkpoint and record
    # the soft-prompt dimensionality on it.
    clip_config = AutoConfig.from_pretrained(CLIP_MODEL)
    clip_config.soft_prompt_dim = 768

    # Build the custom model and attach the pickled soft prompts.
    q16 = Q16Model(clip_config)
    q16.load_soft_prompts("./prompts.p")

    # Export the model (weights + prompts) and the matching processor
    # into the current directory.
    save_directory = "."
    q16.save_pretrained(save_directory)
    CLIPProcessor.from_pretrained(CLIP_MODEL).save_pretrained(save_directory)
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/clip-vit-large-patch14",
3
+ "architectures": [
4
+ "Q16Model"
5
+ ],
6
+ "initializer_factor": 1.0,
7
+ "logit_scale_init_value": 2.6592,
8
+ "model_type": "clip",
9
+ "projection_dim": 768,
10
+ "soft_prompt_dim": 768,
11
+ "text_config": {
12
+ "dropout": 0.0,
13
+ "hidden_size": 768,
14
+ "intermediate_size": 3072,
15
+ "model_type": "clip_text_model",
16
+ "num_attention_heads": 12,
17
+ "projection_dim": 768
18
+ },
19
+ "torch_dtype": "float32",
20
+ "transformers_version": "4.42.4",
21
+ "vision_config": {
22
+ "dropout": 0.0,
23
+ "hidden_size": 1024,
24
+ "intermediate_size": 4096,
25
+ "model_type": "clip_vision_model",
26
+ "num_attention_heads": 16,
27
+ "num_hidden_layers": 24,
28
+ "patch_size": 14,
29
+ "projection_dim": 768
30
+ }
31
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e95556d2b3766f31a43fb26efb887846ccd6cfcce8234f1a0868d62647d7492
3
+ size 1710544204
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "processor_class": "CLIPProcessor",
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 224
27
+ }
28
+ }
prompts.p ADDED
Binary file (6.3 kB). View file
 
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "49406": {
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49407": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|startoftext|>",
22
+ "clean_up_tokenization_spaces": true,
23
+ "do_lower_case": true,
24
+ "eos_token": "<|endoftext|>",
25
+ "errors": "replace",
26
+ "model_max_length": 77,
27
+ "pad_token": "<|endoftext|>",
28
+ "processor_class": "CLIPProcessor",
29
+ "tokenizer_class": "CLIPTokenizer",
30
+ "unk_token": "<|endoftext|>"
31
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff