cwangrun committed on
Commit
57e8231
·
verified ·
1 Parent(s): e8ca894

Upload 3 files

Browse files
configuration_chexficient.py CHANGED
@@ -1,5 +1,6 @@
1
  from transformers import PretrainedConfig
2
 
 
3
  class CheXficientConfig(PretrainedConfig):
4
  model_type = "chexficient_clip"
5
  def __init__(
 
1
  from transformers import PretrainedConfig
2
 
3
+
4
  class CheXficientConfig(PretrainedConfig):
5
  model_type = "chexficient_clip"
6
  def __init__(
modeling_chexficient.py CHANGED
@@ -24,8 +24,6 @@ URL_DICT = {
24
  class TextEncoder(nn.Module):
25
  def __init__(self, model_name='emilyalsentzer/Bio_ClinicalBERT'):
26
  super().__init__()
27
- # self.model = AutoModel.from_pretrained(model_name, ignore_mismatched_sizes=False, cache_dir='./huggingface',)
28
- # self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./huggingface/tokenizers')
29
  self.model = AutoModel.from_pretrained(model_name, use_safetensors=True, ignore_mismatched_sizes=False, )
30
  self.tokenizer = AutoTokenizer.from_pretrained(model_name, )
31
  if self.tokenizer.bos_token_id is None:
@@ -72,14 +70,6 @@ class CheXficientModel(PreTrainedModel):
72
  super().__init__(config)
73
 
74
  # ===== Encoders =====
75
- # self.image_encoder = AutoModel.from_pretrained(
76
- # config.vision_model_name,
77
- # use_safetensors=True
78
- # )
79
- # self.text_encoder = AutoModel.from_pretrained(
80
- # config.text_model_name,
81
- # use_safetensors=True
82
- # )
83
  self.image_encoder = ImageEncoder(model_name=config.vision_model_name, image_size=config.image_size)
84
  self.text_encoder = TextEncoder(model_name=config.text_model_name)
85
 
@@ -97,30 +87,41 @@ class CheXficientModel(PreTrainedModel):
97
 
98
  self.post_init()
99
 
100
- def get_image_features(self, pixel_values):
101
- vision_outputs = self.image_encoder(pixel_values=pixel_values)
102
- pooled = vision_outputs.last_hidden_state[:, 0]
103
- projected = self.image_projection(pooled)
104
- return F.normalize(projected, dim=-1)
105
 
106
- def get_text_features(self, input_ids, attention_mask):
107
- text_outputs = self.text_encoder(
108
- input_ids=input_ids,
109
- attention_mask=attention_mask
110
- )
111
- pooled = text_outputs.last_hidden_state[:, 0]
112
- projected = self.text_projection(pooled)
113
- return F.normalize(projected, dim=-1)
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  def forward(
116
  self,
117
  pixel_values=None,
118
- input_ids=None,
119
- attention_mask=None,
120
  return_loss=False
121
  ):
122
- image_features = self.get_image_features(pixel_values)
123
- text_features = self.get_text_features(input_ids, attention_mask)
124
 
125
  logit_scale = self.logit_scale.exp()
126
 
 
24
  class TextEncoder(nn.Module):
25
  def __init__(self, model_name='emilyalsentzer/Bio_ClinicalBERT'):
26
  super().__init__()
 
 
27
  self.model = AutoModel.from_pretrained(model_name, use_safetensors=True, ignore_mismatched_sizes=False, )
28
  self.tokenizer = AutoTokenizer.from_pretrained(model_name, )
29
  if self.tokenizer.bos_token_id is None:
 
70
  super().__init__(config)
71
 
72
  # ===== Encoders =====
 
 
 
 
 
 
 
 
73
  self.image_encoder = ImageEncoder(model_name=config.vision_model_name, image_size=config.image_size)
74
  self.text_encoder = TextEncoder(model_name=config.text_model_name)
75
 
 
87
 
88
  self.post_init()
89
 
90
def encode_image(self, pixel_values):
    """Encode a batch of images into L2-normalized embedding vectors.

    Runs the vision backbone, projects its features into the shared
    image-text space, then scales each row to unit Euclidean length.
    """
    features = self.image_encoder(pixel_values)
    embeddings = self.image_projection(features)
    # Row-wise L2 norm; keepdim so the division broadcasts per sample.
    norms = embeddings.norm(dim=1, keepdim=True)
    return embeddings / norms
95
 
96
def encode_text(self, text_tokens):
    """Encode tokenized text into (optionally projected) L2-normalized embeddings.

    Pools the token-level features according to ``self.text_pooling``:
      - "eos":  feature at the last non-padding position per sequence
                (index = attention_mask.sum() - 1)
      - "bos":  feature at position 0 ([CLS] token)
      - "mean": attention-mask-weighted mean over the sequence

    Raises:
        NotImplementedError: for any other pooling method.
    """
    text_features = self.text_encoder(text_tokens)

    if self.text_pooling == "eos":
        # Take features from the EOT embedding: the last attended token
        # in each sequence, located via the attention mask.
        eos_token_indices = text_tokens["attention_mask"].sum(dim=-1) - 1
        text_features = text_features[torch.arange(text_features.shape[0]), eos_token_indices]
    elif self.text_pooling == "bos":  # [CLS] token
        text_features = text_features[:, 0]
    elif self.text_pooling == "mean":
        # Mask out padding positions, then average over real tokens only;
        # clamp avoids division by zero for all-padding rows.
        input_mask_expanded = text_tokens["attention_mask"].unsqueeze(dim=-1).expand(text_features.size()).float()
        text_features = torch.sum(text_features * input_mask_expanded, dim=1) / torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    else:
        # Bug fix: the original passed the value as a second exception arg
        # ("msg %s", x), which never formats the message.
        raise NotImplementedError("Not supported pooling method : %s" % self.text_pooling)

    text_embeddings = self.text_projection(text_features) if self.projection else text_features

    # Normalize each embedding row to unit L2 length.
    text_embeddings = text_embeddings / text_embeddings.norm(dim=1, keepdim=True)

    return text_embeddings
116
 
117
  def forward(
118
  self,
119
  pixel_values=None,
120
+ text_tokens=None,
 
121
  return_loss=False
122
  ):
123
+ image_features = self.encode_image(pixel_values)
124
+ text_features = self.encode_text(text_tokens)
125
 
126
  logit_scale = self.logit_scale.exp()
127