evanec committed
Commit 94a0812 · verified · 1 Parent(s): 1d05670

Upload 5 files

models/__init__.py ADDED
@@ -0,0 +1 @@
+ # Makes this directory a python package
models/base_encoder.py ADDED
@@ -0,0 +1,20 @@
+ # models/base_encoder.py
+
+ import torch
+ import torch.nn as nn
+ from abc import ABC, abstractmethod
+
+
+ class BaseVisionEncoder(nn.Module, ABC):
+     def __init__(self, embed_dim: int):
+         super().__init__()
+         self.embed_dim = embed_dim
+
+     @abstractmethod
+     def forward(self, pixel_values: torch.Tensor):
+         pass
+
+     @abstractmethod
+     def get_output_dim(self):
+         """Return the dimensionality of the encoder output embedding."""
+         pass
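For reference, a minimal sketch (hypothetical, not part of this commit) of a subclass that satisfies this interface; the concrete encoders in models/encoders.py follow the same contract: forward() returns a dict {"image_embeds": tensor} and get_output_dim() reports the embedding width.

# Hypothetical example only, not included in the upload
import torch
import torch.nn as nn
from models.base_encoder import BaseVisionEncoder

class DummyEncoder(BaseVisionEncoder):
    def __init__(self, embed_dim: int = 768):
        super().__init__(embed_dim=embed_dim)
        # A single 16x16-stride conv stands in for a real backbone
        self.patchify = nn.Conv2d(3, embed_dim, kernel_size=16, stride=16)

    def forward(self, pixel_values: torch.Tensor):
        x = self.patchify(pixel_values)        # (B, D, H/16, W/16)
        tokens = x.flatten(2).transpose(1, 2)  # (B, S, D)
        return {"image_embeds": tokens}

    def get_output_dim(self):
        return self.embed_dim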
models/encoder_projection_t5.py ADDED
@@ -0,0 +1,27 @@
+ # models/encoder_projection_t5.py
+
+ import torch
+ import torch.nn as nn
+
+
+ class ImageProjection(nn.Module):
+     """
+     Projects encoder image embeddings into the T5 hidden size.
+
+     Example:
+     - CLIP ViT-L/14 gives 1024-d embeddings
+     - T5-small expects 512-d hidden states
+     → This linear layer maps 1024 → 512
+
+     Forward:
+         image_embeds: (B, D_enc) or (B, S, D_enc)
+     Returns:
+         projected_embeds: (B, D_t5) or (B, S, D_t5)
+     """
+
+     def __init__(self, encoder_dim: int, t5_hidden_size: int):
+         super().__init__()
+         self.proj = nn.Linear(encoder_dim, t5_hidden_size)
+
+     def forward(self, image_embeds: torch.Tensor):
+         return self.proj(image_embeds)
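A quick shape check of the docstring example (illustrative only; the 257-token length assumes CLIP ViT-L/14 at 224 px, i.e. 256 patches plus the CLS token):

import torch
from models.encoder_projection_t5 import ImageProjection

proj = ImageProjection(encoder_dim=1024, t5_hidden_size=512)
patch_tokens = torch.randn(2, 257, 1024)  # (B, S, D_enc) from a CLIP ViT-L/14 vision tower
print(proj(patch_tokens).shape)           # torch.Size([2, 257, 512]) -> (B, S, D_t5)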
models/encoders.py ADDED
@@ -0,0 +1,223 @@
+ # models/encoders.py
+ # Just for setting up the pipeline, this will be replaced
+
+ import torch
+ import torch.nn as nn
+ from torchvision import models
+ from .base_encoder import BaseVisionEncoder
+ from transformers import CLIPVisionModel
+ from transformers import AutoModel
+
+
+ class ResnetCNNEncoder(nn.Module):
+     def __init__(self, model_name="resnet50", fine_tune_all_encoder_layers=False, freeze_encoder_entirely=False, freeze_until=3, vision_mode="patch"):
+         super().__init__()
+         self.embed_dim = 2048  # Fixed output dimension for ResNet-50/101
+         self.vision_mode = vision_mode
+
+         if model_name == "resnet50":
+             resnet = models.resnet50(
+                 weights=models.ResNet50_Weights.IMAGENET1K_V2
+             )
+         elif model_name == "resnet101":
+             resnet = models.resnet101(
+                 weights=models.ResNet101_Weights.IMAGENET1K_V2
+             )
+         else:
+             raise ValueError("model_name must be 'resnet50' or 'resnet101'")
+
+         # Keep the convolutional encoder only (drop avgpool and the classifier head)
+         # ResNet children indices: 0:conv1, 1:bn1, 2:relu, 3:maxpool, 4:layer1, 5:layer2, 6:layer3, 7:layer4
+         self.features = nn.Sequential(*list(resnet.children())[:-2])
+         self.model = self.features
+
+         # Full fine-tuning mode
+         if fine_tune_all_encoder_layers:
+             print(f"[INFO] {model_name}: Fine-tuning ALL layers (1-4).")
+             # PyTorch defaults to requires_grad=True, so no action is needed here.
+             return
+
+         # Full freezing mode
+         elif freeze_encoder_entirely:
+             print(f"[INFO] {model_name}: Freezing ALL layers (1-4).")
+             for param in self.features.parameters():
+                 param.requires_grad = False
+             return
+
+         # Dynamic partial freezing mode
+         else:
+             # freeze_until=3 is the default behavior (freeze L1-L3, train L4)
+             # freeze_until=2 means freeze L1-L2, train L3-L4
+             print(f"[INFO] {model_name}: Dynamic partial fine-tuning (freezing layers 1-{freeze_until}).")
+
+             # To freeze up to layer N (L1, L2, L3, or L4), freeze all child indices from 0 up to N+3
+             max_freeze_idx = freeze_until + 3
+
+             # Set of child indices to freeze (from 0 up to max_freeze_idx, inclusive)
+             freeze_indices = set(range(max_freeze_idx + 1))
+
+             for idx, layer in enumerate(self.features):
+                 # Only freeze children up to the target index
+                 if idx in freeze_indices:
+                     for param in layer.parameters():
+                         param.requires_grad = False
+
+     def forward(self, pixel_values):
+         x = self.features(pixel_values)  # (B, 2048, H', W') feature map
+
+         if self.vision_mode == "cls":
+             # Global-average-pool the feature map into one vector per image
+             pooled = x.mean(dim=(2, 3))  # (B, 2048)
+             return {"image_embeds": pooled}
+
+         tokens = x.flatten(2).transpose(1, 2)  # (B, S, 2048), S = H'*W'
+         return {"image_embeds": tokens}
+
+     def get_output_dim(self):
+         return self.embed_dim
+
+
+ # ViT Encoders
+
+ class ViTEncoder(BaseVisionEncoder):
+
+     def __init__(self, model_name="google/vit-base-patch16-224", train_last_n_layers=4, vision_mode="patch"):
+         super().__init__(embed_dim=None)
+
+         self.model = AutoModel.from_pretrained(model_name)
+         self.vision_mode = vision_mode
+
+         self.embed_dim = self.model.config.hidden_size
+         if self.embed_dim is None:
+             raise ValueError("Could not determine embed_dim from model config.")
+
+         # Partial fine-tuning strategy:
+         # freeze everything, then unfreeze the last `train_last_n_layers` blocks
+         # (default 4 of 12 for ViT-Base), plus positional embeddings and the final LayerNorm.
+
+         # Freeze all parameters initially
+         for param in self.model.parameters():
+             param.requires_grad = False
+
+         # Unfreeze the final N transformer blocks
+         NUM_LAYERS_TO_TRAIN = train_last_n_layers
+
+         try:
+             # The blocks are typically stored in .encoder.layer
+             encoder_layers = self.model.encoder.layer
+             num_layers = len(encoder_layers)
+
+             # Unfreeze the last NUM_LAYERS_TO_TRAIN blocks
+             for i in range(num_layers - NUM_LAYERS_TO_TRAIN, num_layers):
+                 layer = encoder_layers[i]
+                 for param in layer.parameters():
+                     param.requires_grad = True
+
+             print(f"ViT Encoder: Unfroze the final {NUM_LAYERS_TO_TRAIN} blocks ({num_layers - NUM_LAYERS_TO_TRAIN} to {num_layers - 1}).")
+
+         except AttributeError:
+             print("Warning: Could not find standard ViT layer structure for partial fine-tuning.")
+
+         # Unfreeze positional embeddings (often gives a small boost);
+         # in ViTModel this is an nn.Parameter, so setting requires_grad directly works
+         if hasattr(self.model.embeddings, 'position_embeddings'):
+             self.model.embeddings.position_embeddings.requires_grad = True
+             print("ViT Encoder: Unfroze positional embeddings.")
+
+         # Unfreeze the final LayerNorm (for stabilization);
+         # in ViTModel it lives on the model itself, not on the encoder
+         if hasattr(self.model, 'layernorm'):
+             for param in self.model.layernorm.parameters():
+                 param.requires_grad = True
+             print("ViT Encoder: Unfroze final LayerNorm.")
+
+     def forward(self, pixel_values):
+         out = self.model(pixel_values=pixel_values)
+
+         # CLS mode
+         if self.vision_mode == "cls":
+             if hasattr(out, 'pooler_output') and out.pooler_output is not None:
+                 pooled = out.pooler_output  # (B, D)
+             elif hasattr(out, 'last_hidden_state'):
+                 pooled = out.last_hidden_state[:, 0, :]  # CLS token (B, D)
+             else:
+                 raise RuntimeError("Model output format not recognized.")
+
+             return {"image_embeds": pooled}
+
+         # Patch mode
+         seq = out.last_hidden_state  # (B, S, D)
+         return {"image_embeds": seq}
+
+     def get_output_dim(self):
+         return self.embed_dim
+
+
+ # CLIP Encoders
+
+ class CLIPEncoder(BaseVisionEncoder):
+
+     def __init__(self, model_name="openai/clip-vit-base-patch32", train_last_n_layers=4, vision_mode="patch"):
+         # The output dimension (hidden size) will be set after loading the model config
+         super().__init__(embed_dim=None)
+
+         self.model = CLIPVisionModel.from_pretrained(model_name)
+         self.vision_mode = vision_mode
+
+         self.embed_dim = self.model.config.hidden_size
+         if self.embed_dim is None:
+             raise ValueError("Could not determine embed_dim from model config.")
+
+         # Partial fine-tuning strategy:
+         # freeze everything, then unfreeze the last `train_last_n_layers` blocks
+         # (default 4 of 12 for ViT-Base), plus positional embeddings and the final LayerNorm.
+
+         # Freeze all parameters initially
+         for param in self.model.parameters():
+             param.requires_grad = False
+
+         # Unfreeze the final N transformer blocks
+         NUM_LAYERS_TO_TRAIN = train_last_n_layers
+
+         try:
+             encoder_layers = self.model.vision_model.encoder.layers
+             num_layers = len(encoder_layers)
+
+             for i in range(num_layers - NUM_LAYERS_TO_TRAIN, num_layers):
+                 layer = encoder_layers[i]
+                 for param in layer.parameters():
+                     param.requires_grad = True
+
+             print(f"CLIP Encoder: Unfroze the final {NUM_LAYERS_TO_TRAIN} blocks ({num_layers - NUM_LAYERS_TO_TRAIN} to {num_layers - 1}).")
+
+         except AttributeError:
+             print("Warning: Could not find standard CLIP layer structure for partial fine-tuning. Ensure model structure is correct.")
+
+         # Unfreeze positional embeddings; position_embedding is an nn.Embedding module,
+         # so unfreeze its parameters (setting requires_grad on the module itself has no effect)
+         if hasattr(self.model.vision_model.embeddings, 'position_embedding'):
+             for param in self.model.vision_model.embeddings.position_embedding.parameters():
+                 param.requires_grad = True
+             print("CLIP Encoder: Unfroze positional embeddings.")
+
+         # Unfreeze the final LayerNorm
+         if hasattr(self.model.vision_model, 'post_layernorm'):
+             for param in self.model.vision_model.post_layernorm.parameters():
+                 param.requires_grad = True
+             print("CLIP Encoder: Unfroze final LayerNorm.")
+
+     def forward(self, pixel_values):
+         out = self.model(pixel_values=pixel_values)
+         seq = out.last_hidden_state  # (B, S, D)
+
+         if self.vision_mode == "cls":
+             return {"image_embeds": seq[:, 0, :]}  # (B, D)
+
+         return {"image_embeds": seq}  # (B, S, D)
+
+     def get_output_dim(self):
+         return self.embed_dim
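How these encoders are presumably meant to compose with the projection layer (an illustrative sketch, not part of the commit; it downloads Hugging Face weights and feeds random pixels instead of a real image processor):

import torch
from models.encoders import CLIPEncoder
from models.encoder_projection_t5 import ImageProjection

encoder = CLIPEncoder("openai/clip-vit-base-patch32", vision_mode="patch")
projector = ImageProjection(encoder_dim=encoder.get_output_dim(), t5_hidden_size=512)

pixel_values = torch.randn(1, 3, 224, 224)            # use CLIPImageProcessor on real images
image_embeds = encoder(pixel_values)["image_embeds"]  # (1, 50, 768): 49 patches + CLS for patch32 @ 224 px
print(projector(image_embeds).shape)                  # torch.Size([1, 50, 512])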
models/vision_t5.py ADDED
@@ -0,0 +1,187 @@
+ import torch
+ import torch.nn as nn
+ from transformers import T5ForConditionalGeneration
+ from transformers.modeling_outputs import BaseModelOutput
+ from peft import LoraConfig, get_peft_model
+ import os
+ from transformers import AutoConfig
+
+
+ def safe_load_t5(model_name, local_path):
+     has_local = os.path.exists(local_path)
+
+     try:
+         print(f"[INFO] Trying to load {model_name} from HuggingFace…")
+         model = T5ForConditionalGeneration.from_pretrained(model_name)
+         print("[INFO] Loaded from HF successfully.")
+         return model
+
+     except Exception as e:
+         print(f"[WARN] HF load failed: {e}")
+
+         if not has_local:
+             raise RuntimeError(
+                 f"No local copy available at {local_path} and HF download failed."
+             )
+
+         print("[INFO] Falling back to local Drive copy...")
+         return T5ForConditionalGeneration.from_pretrained(local_path)
+
+
+ class VisionT5(nn.Module):
+
+     def __init__(self, vision_encoder, projector, t5_name="t5-small", decoder_params=None):
+         super().__init__()
+
+         decoder_params = decoder_params or {}
+
+         self.vision_encoder = vision_encoder
+         self.projector = projector
+
+         # Load the full T5; its text encoder is bypassed (projected image embeddings
+         # are fed in as encoder_outputs), so effectively only the decoder is used.
+         local_large = "/content/drive/MyDrive/Models/t5-large"
+
+         if t5_name == "t5-large":
+             self.t5 = safe_load_t5("t5-large", local_large)
+         else:
+             self.t5 = T5ForConditionalGeneration.from_pretrained(t5_name)
+
+         self.apply_decoder_options(decoder_params)
+
+         for p in self.t5.encoder.parameters():
+             p.requires_grad = False
+
+         self.hidden_size = self.t5.config.d_model
+
+     def apply_decoder_options(self, params):
+
+         # LoRA setup
+         if params.get("use_lora", False):
+             lora_rank = params.get("lora_rank", 8)
+             lora_alpha = params.get("lora_alpha", 16)
+
+             print(f"[INFO] LoRA enabled for T5 decoder (Rank={lora_rank})")
+
+             # Target query and value matrices in all T5 attention blocks
+             lora_config = LoraConfig(
+                 r=lora_rank,
+                 lora_alpha=lora_alpha,
+                 target_modules=["q", "v"],
+                 lora_dropout=params.get("lora_dropout", 0.1),
+                 bias="none",
+                 task_type="SEQ_2_SEQ_LM",  # T5 is an encoder-decoder (seq2seq) model
+             )
+
+             self.t5 = get_peft_model(self.t5, lora_config)
+
+             self.t5.print_trainable_parameters()
+
+             # The freeze_decoder flag (if present) is ignored when using LoRA,
+             # since LoRA freezes the base weights and only exposes the adapter weights.
+
+             return
+
+         num_decoder_layers = self.t5.config.num_decoder_layers
+
+         trainable_layers = params.get("trainable_decoder_layers")
+
+         if trainable_layers is not None:
+             num_frozen = num_decoder_layers - trainable_layers
+
+             if num_frozen > 0:
+                 print(f"[INFO] Partial tuning: freezing first {num_frozen} of {num_decoder_layers} decoder blocks.")
+
+             for i, block in enumerate(self.t5.decoder.block):
+                 if i < num_frozen:
+                     for p in block.parameters():
+                         p.requires_grad = False
+                     print(f"  > Block {i} frozen.")
+                 else:
+                     for p in block.parameters():
+                         p.requires_grad = True
+                     print(f"  > Block {i} trainable.")
+
+             if num_frozen > 0:
+                 for p in self.t5.decoder.embed_tokens.parameters():
+                     p.requires_grad = False
+                 print("  > Decoder embeddings frozen.")
+
+             return
+
+         if params.get("freeze_decoder", False):
+             print("[INFO] Freezing all T5 decoder parameters.")
+             for p in self.t5.decoder.parameters():
+                 p.requires_grad = False
+
+         if params.get("dropout_override") is not None:
+             self.t5.config.dropout_rate = params["dropout_override"]
+
+     def forward(
+         self,
+         pixel_values=None,
+         input_ids=None,
+         attention_mask=None,
+         labels=None
+     ):
+
+         vision_out = self.vision_encoder(pixel_values)
+         image_embeds = vision_out["image_embeds"]
+
+         if image_embeds.dim() == 2:
+             image_embeds = image_embeds.unsqueeze(1)  # (B, 1, D_enc)
+
+         projected = self.projector(image_embeds)  # (B, S, d_model)
+
+         B, S, _ = projected.shape
+         encoder_attention_mask = torch.ones(B, S, dtype=torch.long, device=projected.device)
+
+         # Wrap the projected image tokens so T5 treats them as encoder outputs
+         encoder_outputs = BaseModelOutput(last_hidden_state=projected)
+
+         decoder_attention_mask = attention_mask
+
+         # With encoder_outputs supplied, T5 ignores input_ids and builds
+         # decoder_input_ids by shifting labels to the right.
+         output = self.t5(
+             input_ids=input_ids,
+             decoder_attention_mask=decoder_attention_mask,
+             attention_mask=encoder_attention_mask,
+             encoder_outputs=encoder_outputs,
+             labels=labels,
+             return_dict=True,
+         )
+
+         return output
+
+     @torch.no_grad()
+     def generate(self, pixel_values, tokenizer, max_length=32, num_beams=3):
+
+         vision_out = self.vision_encoder(pixel_values)
+         image_embeds = vision_out["image_embeds"]
+
+         if image_embeds.dim() == 2:
+             image_embeds = image_embeds.unsqueeze(1)  # (B, 1, D)
+
+         projected = self.projector(image_embeds)  # (B, S, d_model)
+
+         encoder_outputs = BaseModelOutput(
+             last_hidden_state=projected
+         )
+
+         # With encoder_outputs supplied, generate() starts the decoder from
+         # decoder_start_token_id (the pad token for T5).
+         generated_ids = self.t5.generate(
+             encoder_outputs=encoder_outputs,
+             decoder_start_token_id=self.t5.config.decoder_start_token_id,
+             max_length=max_length,
+             num_beams=num_beams
+         )
+
+         # Decode the first sequence (assumes a batch size of 1 at inference)
+         return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+
+     @staticmethod
+     def get_t5_hidden_size(t5_name):
+         cfg = AutoConfig.from_pretrained(t5_name)
+         return cfg.d_model
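Put together, the five uploaded files are presumably wired up roughly as follows for a single training step and a caption; the tokenizer choice, dummy inputs, and label masking here are assumptions for illustration, not part of the commit:

import torch
from transformers import AutoTokenizer
from models.encoders import ViTEncoder
from models.encoder_projection_t5 import ImageProjection
from models.vision_t5 import VisionT5

t5_name = "t5-small"
encoder = ViTEncoder("google/vit-base-patch16-224", vision_mode="patch")
projector = ImageProjection(encoder.get_output_dim(), VisionT5.get_t5_hidden_size(t5_name))
model = VisionT5(encoder, projector, t5_name=t5_name, decoder_params={"use_lora": True})

tokenizer = AutoTokenizer.from_pretrained(t5_name)
captions = tokenizer(["a dog running on a beach"], return_tensors="pt", padding=True)
labels = captions.input_ids.clone()
labels[labels == tokenizer.pad_token_id] = -100   # ignore padding positions in the loss

out = model(pixel_values=torch.randn(1, 3, 224, 224),
            input_ids=captions.input_ids,
            attention_mask=captions.attention_mask,
            labels=labels)
print(out.loss)                                   # cross-entropy loss for one step
print(model.generate(torch.randn(1, 3, 224, 224), tokenizer))  # decoded caption string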