Update files for transformers integration

#3
by guarin HF Staff - opened
Files changed (4) hide show
  1. config.json +67 -16
  2. configuration_tips.py +29 -31
  3. processor_config.json +16 -0
  4. tokenizer_config.json +12 -0
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "model_type": "tipsv2",
3
  "architectures": [
4
  "TIPSv2Model"
5
  ],
@@ -7,18 +6,70 @@
7
  "AutoConfig": "configuration_tips.TIPSv2Config",
8
  "AutoModel": "modeling_tips.TIPSv2Model"
9
  },
10
- "patch_size": 14,
11
- "img_size": 448,
12
- "init_values": 1.0,
13
- "num_register_tokens": 1,
14
- "vocab_size": 32000,
15
- "max_len": 64,
16
- "vision_fn": "vit_base",
17
- "embed_dim": 768,
18
- "text_hidden_size": 768,
19
- "text_mlp_dim": 3072,
20
- "text_num_heads": 12,
21
- "text_num_layers": 12,
22
- "ffn_layer": "mlp",
23
- "temperature": 0.005065968260169029
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
 
2
  "architectures": [
3
  "TIPSv2Model"
4
  ],
 
6
  "AutoConfig": "configuration_tips.TIPSv2Config",
7
  "AutoModel": "modeling_tips.TIPSv2Model"
8
  },
9
+ "model_type": "tipsv2",
10
+ "temperature_init_value": 0.005065968260169029,
11
+ "text_config": {
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": null,
14
+ "eos_token_id": null,
15
+ "hidden_act": "relu",
16
+ "hidden_size": 768,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "layer_norm_eps": 1e-05,
20
+ "max_position_embeddings": 64,
21
+ "model_type": "tipsv2_text_model",
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 12,
24
+ "pad_token_id": 0,
25
+ "pooling_epsilon": 1e-08,
26
+ "scale_sqrt_depth": true,
27
+ "vocab_size": 32000
28
+ },
29
+ "transformers_version": "5.10.0.dev0",
30
+ "vision_config": {
31
+ "apply_layernorm": true,
32
+ "attention_probs_dropout_prob": 0.0,
33
+ "drop_path_rate": 0.0,
34
+ "hidden_act": "gelu",
35
+ "hidden_dropout_prob": 0.0,
36
+ "hidden_size": 768,
37
+ "image_size": 448,
38
+ "initializer_range": 0.02,
39
+ "interpolate_antialias": true,
40
+ "interpolate_offset": 0.0,
41
+ "layer_norm_eps": 1e-06,
42
+ "layerscale_value": 1.0,
43
+ "mlp_ratio": 4,
44
+ "model_type": "tipsv2_vision_model",
45
+ "num_attention_heads": 12,
46
+ "num_channels": 3,
47
+ "num_hidden_layers": 12,
48
+ "num_register_tokens": 1,
49
+ "out_features": [
50
+ "stage12"
51
+ ],
52
+ "out_indices": [
53
+ 12
54
+ ],
55
+ "patch_size": 14,
56
+ "qkv_bias": true,
57
+ "reshape_hidden_states": true,
58
+ "stage_names": [
59
+ "stem",
60
+ "stage1",
61
+ "stage2",
62
+ "stage3",
63
+ "stage4",
64
+ "stage5",
65
+ "stage6",
66
+ "stage7",
67
+ "stage8",
68
+ "stage9",
69
+ "stage10",
70
+ "stage11",
71
+ "stage12"
72
+ ],
73
+ "use_swiglu_ffn": false
74
+ }
75
+ }
configuration_tips.py CHANGED
@@ -3,6 +3,14 @@
3
  from transformers import PretrainedConfig
4
 
5
 
 
 
 
 
 
 
 
 
6
  class TIPSv2Config(PretrainedConfig):
7
  """Configuration for TIPSv2 vision-language model."""
8
 
@@ -10,37 +18,27 @@ class TIPSv2Config(PretrainedConfig):
10
 
11
  def __init__(
12
  self,
13
- # Vision encoder
14
- vision_fn="vit_base",
15
- embed_dim=768,
16
- patch_size=14,
17
- img_size=448,
18
- ffn_layer="mlp",
19
- init_values=1.0,
20
- num_register_tokens=1,
21
- # Text encoder
22
- text_hidden_size=768,
23
- text_mlp_dim=3072,
24
- text_num_heads=12,
25
- text_num_layers=12,
26
- vocab_size=32000,
27
- max_len=64,
28
- # Contrastive
29
- temperature=0.01,
30
  **kwargs,
31
  ):
32
  super().__init__(**kwargs)
33
- self.vision_fn = vision_fn
34
- self.embed_dim = embed_dim
35
- self.patch_size = patch_size
36
- self.img_size = img_size
37
- self.ffn_layer = ffn_layer
38
- self.init_values = init_values
39
- self.num_register_tokens = num_register_tokens
40
- self.text_hidden_size = text_hidden_size
41
- self.text_mlp_dim = text_mlp_dim
42
- self.text_num_heads = text_num_heads
43
- self.text_num_layers = text_num_layers
44
- self.vocab_size = vocab_size
45
- self.max_len = max_len
46
- self.temperature = temperature
 
 
 
 
 
3
  from transformers import PretrainedConfig
4
 
5
 
6
+ _VISION_FN_BY_GEOMETRY = {
7
+ (768, 12): "vit_base",
8
+ (1024, 24): "vit_large",
9
+ (1152, 27): "vit_so400m",
10
+ (1536, 40): "vit_giant2",
11
+ }
12
+
13
+
14
  class TIPSv2Config(PretrainedConfig):
15
  """Configuration for TIPSv2 vision-language model."""
16
 
 
18
 
19
  def __init__(
20
  self,
21
+ vision_config=None,
22
+ text_config=None,
23
+ temperature_init_value=0.01,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  **kwargs,
25
  ):
26
  super().__init__(**kwargs)
27
+ vision_config = vision_config or {}
28
+ text_config = text_config or {}
29
+ hidden_size = vision_config.get("hidden_size", 768)
30
+ num_hidden_layers = vision_config.get("num_hidden_layers", 12)
31
+ self.vision_fn = _VISION_FN_BY_GEOMETRY[(hidden_size, num_hidden_layers)]
32
+ self.embed_dim = hidden_size
33
+ self.patch_size = vision_config.get("patch_size", 14)
34
+ self.img_size = vision_config.get("image_size", 448)
35
+ self.ffn_layer = "swiglu" if vision_config.get("use_swiglu_ffn", False) else "mlp"
36
+ self.init_values = vision_config.get("layerscale_value", 1.0)
37
+ self.num_register_tokens = vision_config.get("num_register_tokens", 1)
38
+ self.text_hidden_size = text_config.get("hidden_size", 768)
39
+ self.text_mlp_dim = text_config.get("intermediate_size", 3072)
40
+ self.text_num_heads = text_config.get("num_attention_heads", 12)
41
+ self.text_num_layers = text_config.get("num_hidden_layers", 12)
42
+ self.vocab_size = text_config.get("vocab_size", 32000)
43
+ self.max_len = text_config.get("max_position_embeddings", 64)
44
+ self.temperature = temperature_init_value
processor_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "do_convert_rgb": true,
4
+ "do_normalize": false,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_processor_type": "Tipsv2ImageProcessor",
8
+ "resample": 2,
9
+ "rescale_factor": 0.00392156862745098,
10
+ "size": {
11
+ "height": 448,
12
+ "width": 448
13
+ }
14
+ },
15
+ "processor_class": "Tipsv2Processor"
16
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": null,
4
+ "do_lower_case": true,
5
+ "eos_token": null,
6
+ "model_max_length": 64,
7
+ "pad_token": "<pad>",
8
+ "processor_class": "Tipsv2Processor",
9
+ "token_type_ids_pattern": "all_zeros",
10
+ "tokenizer_class": "Tipsv2Tokenizer",
11
+ "unk_token": "<unk>"
12
+ }