Upload Florence2ForConditionalGeneration

Files changed (3)

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "model_checkpoints/gigantic_fukuiraptor/epoch_9/",
   "architectures": [
     "Florence2ForConditionalGeneration"
   ],
@@ -160,7 +160,7 @@
     "length_penalty": 1.0,
     "max_length": 20,
     "min_length": 0,
-    "model_type": "davit",
     "no_repeat_ngram_size": 0,
     "num_beam_groups": 1,
     "num_beams": 1,

 {
+  "_name_or_path": "model_checkpoints/vqainstruct_no_lora/epoch_5",
   "architectures": [
     "Florence2ForConditionalGeneration"
   ],
     "length_penalty": 1.0,
     "max_length": 20,
     "min_length": 0,
+    "model_type": "",
     "no_repeat_ngram_size": 0,
     "num_beam_groups": 1,
     "num_beams": 1,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1b6f79f7fe43daf6285f057156a5110ff0724e40ab4f2c395823ac44856a15a2
 size 3291921348

 version https://git-lfs.github.com/spec/v1
+oid sha256:1d9a3bc6abcace5e9820630945fe26cfa961fe2577f8adeb48256acba876123e
 size 3291921348

modeling_florence2.py CHANGED Viewed

@@ -2288,8 +2288,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
             image_hidden_states of the model produced by the vision encoder
     """
-    loss: torch.FloatTensor = None
     logits: torch.FloatTensor = None
     last_hidden_state: torch.FloatTensor = None
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
@@ -2530,7 +2529,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
-        # del config.vision_config.model_type
         self.vision_tower = DaViT.from_config(config=config.vision_config)
         # remove unused layers
         del self.vision_tower.head
@@ -2734,7 +2732,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
                 image_features = self._encode_image(pixel_values)
                 inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
-        attention_mask = attention_mask.to(inputs_embeds.dtype)
         outputs = self.language_model(
             attention_mask=attention_mask,
             labels=labels,

             image_hidden_states of the model produced by the vision encoder
     """
+    loss: Optional[torch.FloatTensor] = None
     logits: torch.FloatTensor = None
     last_hidden_state: torch.FloatTensor = None
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
         self.vision_tower = DaViT.from_config(config=config.vision_config)
         # remove unused layers
         del self.vision_tower.head
                 image_features = self._encode_image(pixel_values)
                 inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
+        if inputs_embeds is not None:
+            attention_mask = attention_mask.to(inputs_embeds.dtype)
         outputs = self.language_model(
             attention_mask=attention_mask,
             labels=labels,