Update transformers to 4.48.0 (#17)

- Update transformers to 4.48.0 (db05f0330e8b132d3e8e098ff4d5c71f3dd6bdc9)
- Pin transformers to <4.49 (6b647f4a57c435178c7648e1733a9a688f40170d)

Files changed (6)

README.md +2 -5
config.json +2 -132
generation_config.json +1 -1
processing_maira2.py +3 -0
processor_config.json +1 -0
tokenizer_config.json +1 -0

README.md CHANGED Viewed

@@ -84,13 +84,10 @@ pillow
 protobuf
 sentencepiece
 torch
-transformers
 ```
-Note: You may temporarily need to install transformers from source since MAIRA-2 requires `transformers>=4.46.0.dev0`. Due to an [incompatible commit](https://github.com/huggingface/transformers/commit/0f49deacbff3e57cde45222842c0db6375e4fa43) in transformers main, the current fix is to install a transformers version from or after commit [88d960937c81a32bfb63356a2e8ecf7999619681](https://github.com/huggingface/transformers/commit/88d960937c81a32bfb63356a2e8ecf7999619681) but before commit [0f49deacbff3e57cde45222842c0db6375e4fa43](https://github.com/huggingface/transformers/commit/0f49deacbff3e57cde45222842c0db6375e4fa43).
-```
-pip install git+https://github.com/huggingface/transformers.git@88d960937c81a32bfb63356a2e8ecf7999619681
-```
 First, initialise the model and put it in eval mode.
 ```python

 protobuf
 sentencepiece
 torch
+transformers>=4.48.0,<4.49
 ```
+Note: MAIRA-2 was last tested with transformers v4.48.0.
 First, initialise the model and put it in eval mode.
 ```python

config.json CHANGED Viewed

@@ -12,171 +12,53 @@
   "image_seq_length": 576,
   "image_token_index": 32204,
   "model_type": "maira2",
   "pad_token_id": 0,
   "projector_hidden_act": "gelu",
   "projector_n_layers": 4,
   "text_config": {
     "_name_or_path": "lmsys/vicuna-7b-v1.5",
-    "add_cross_attention": false,
     "architectures": [
       "LlamaForCausalLM"
     ],
-    "attention_bias": false,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": 1,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "head_dim": 128,
-    "hidden_act": "silu",
-    "hidden_size": 4096,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_range": 0.02,
-    "intermediate_size": 11008,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "length_penalty": 1.0,
-    "max_length": 20,
     "max_position_embeddings": 4096,
-    "min_length": 0,
-    "mlp_bias": false,
     "model_type": "llama",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 32,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 32,
-    "num_key_value_heads": 32,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
     "pad_token_id": 0,
-    "prefix": null,
-    "pretraining_tp": 1,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
     "rms_norm_eps": 1e-05,
     "rope_scaling": {
       "factor": 1.5,
       "rope_type": "linear"
     },
-    "rope_theta": 10000.0,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": false,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "use_cache": true,
     "vocab_size": 32207
   },
   "torch_dtype": "float32",
-  "transformers_version": "4.46.0.dev0",
   "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
     "apply_layernorm": true,
     "architectures": [
       "Dinov2Model"
     ],
     "attention_probs_dropout_prob": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
     "drop_path_rate": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "hidden_act": "gelu",
     "hidden_dropout_prob": 0.0,
     "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 518,
-    "initializer_range": 0.02,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
     "layer_norm_eps": 1e-06,
     "layerscale_value": 1.0,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "mlp_ratio": 4,
     "model_type": "dinov2",
-    "no_repeat_ngram_size": 0,
     "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
     "num_hidden_layers": 12,
-    "num_return_sequences": 1,
     "out_features": [
       "stage12"
     ],
     "out_indices": [
       12
     ],
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
     "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
     "reshape_hidden_states": false,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
     "stage_names": [
       "stem",
       "stage1",
@@ -192,19 +74,7 @@
       "stage11",
       "stage12"
     ],
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "float32",
-    "torchscript": false,
-    "typical_p": 1.0,
-    "use_bfloat16": false,
     "use_swiglu_ffn": false
   },
   "vision_feature_layer": -1,

   "image_seq_length": 576,
   "image_token_index": 32204,
   "model_type": "maira2",
+  "multimodal_projector_bias": true,
   "pad_token_id": 0,
   "projector_hidden_act": "gelu",
   "projector_n_layers": 4,
   "text_config": {
     "_name_or_path": "lmsys/vicuna-7b-v1.5",
     "architectures": [
       "LlamaForCausalLM"
     ],
     "max_position_embeddings": 4096,
     "model_type": "llama",
     "pad_token_id": 0,
     "rms_norm_eps": 1e-05,
     "rope_scaling": {
       "factor": 1.5,
       "rope_type": "linear"
     },
     "torch_dtype": "bfloat16",
     "vocab_size": 32207
   },
   "torch_dtype": "float32",
+  "transformers_version": "4.48.0",
   "vision_config": {
     "apply_layernorm": true,
     "architectures": [
       "Dinov2Model"
     ],
     "attention_probs_dropout_prob": 0.0,
     "drop_path_rate": 0.0,
     "hidden_act": "gelu",
     "hidden_dropout_prob": 0.0,
     "hidden_size": 768,
     "image_size": 518,
     "layer_norm_eps": 1e-06,
     "layerscale_value": 1.0,
     "mlp_ratio": 4,
     "model_type": "dinov2",
     "num_attention_heads": 12,
     "num_hidden_layers": 12,
     "out_features": [
       "stage12"
     ],
     "out_indices": [
       12
     ],
     "qkv_bias": true,
     "reshape_hidden_states": false,
     "stage_names": [
       "stem",
       "stage1",
       "stage11",
       "stage12"
     ],
     "torch_dtype": "float32",
     "use_swiglu_ffn": false
   },
   "vision_feature_layer": -1,

generation_config.json CHANGED Viewed

@@ -5,5 +5,5 @@
   "max_length": 4096,
   "max_new_tokens": 450,
   "pad_token_id": 0,
-  "transformers_version": "4.46.0.dev0"
 }

   "max_length": 4096,
   "max_new_tokens": 450,
   "pad_token_id": 0,
+  "transformers_version": "4.48.0"
 }

processing_maira2.py CHANGED Viewed

@@ -40,6 +40,7 @@ class Maira2Processor(LlavaProcessor):
         "patch_size",
         "vision_feature_select_strategy",
         "image_token",
         "phrase_start_token",
         "phrase_end_token",
         "box_start_token",
@@ -55,6 +56,7 @@ class Maira2Processor(LlavaProcessor):
         vision_feature_select_strategy: str | None = None,
         chat_template: str | None = None,
         image_token: str = "<image>",
         phrase_start_token: str = "<obj>",
         phrase_end_token: str = "</obj>",
         box_start_token: str = "<box>",
@@ -69,6 +71,7 @@ class Maira2Processor(LlavaProcessor):
             vision_feature_select_strategy=vision_feature_select_strategy,
             chat_template=chat_template,
             image_token=image_token,
             **kwargs,
         )

         "patch_size",
         "vision_feature_select_strategy",
         "image_token",
+        "num_additional_image_tokens",
         "phrase_start_token",
         "phrase_end_token",
         "box_start_token",
         vision_feature_select_strategy: str | None = None,
         chat_template: str | None = None,
         image_token: str = "<image>",
+        num_additional_image_tokens: int = 1,
         phrase_start_token: str = "<obj>",
         phrase_end_token: str = "</obj>",
         box_start_token: str = "<box>",
             vision_feature_select_strategy=vision_feature_select_strategy,
             chat_template=chat_template,
             image_token=image_token,
+            num_additional_image_tokens=num_additional_image_tokens,
             **kwargs,
         )

processor_config.json CHANGED Viewed

@@ -5,6 +5,7 @@
   "box_end_token": "</box>",
   "box_start_token": "<box>",
   "image_token": "<image>",
   "num_box_coord_bins": 100,
   "patch_size": 14,
   "phrase_end_token": "</obj>",

   "box_end_token": "</box>",
   "box_start_token": "<box>",
   "image_token": "<image>",
+  "num_additional_image_tokens": 1,
   "num_box_coord_bins": 100,
   "patch_size": 14,
   "phrase_end_token": "</obj>",

tokenizer_config.json CHANGED Viewed

@@ -1688,6 +1688,7 @@
   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}You are an expert radiology assistant tasked with interpreting a chest X-ray study.  {% for message in messages %}{% if message[\"role\"] == \"user\" %}USER:  {% else %}ASSISTANT: {% endif %}{% for item in message[\"content\"] %}{% if item[\"type\"] == \"text\" %}{{ item[\"text\"] }}{% elif item[\"type\"] == \"image\" %}<image>{% endif %}{% endfor %}{% if message[\"role\"] == \"user\" %}  {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "<unk>",

   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}You are an expert radiology assistant tasked with interpreting a chest X-ray study.  {% for message in messages %}{% if message[\"role\"] == \"user\" %}USER:  {% else %}ASSISTANT: {% endif %}{% for item in message[\"content\"] %}{% if item[\"type\"] == \"text\" %}{{ item[\"text\"] }}{% elif item[\"type\"] == \"image\" %}<image>{% endif %}{% endfor %}{% if message[\"role\"] == \"user\" %}  {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
+  "extra_special_tokens": {},
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "<unk>",