diff for compatibility

Files changed:
- README.md              (+12 −127)
- chat_template.jinja    (+7 −0)
- config.json            (+5 −2)
- generation_config.json (+7 −1)
- tokenizer_config.json  (+2 −3)
README.md
CHANGED

@@ -11,138 +11,23 @@ library_name: transformers
license: apache-2.0
---

-```
-@inproceedings{li2025chemvlm,
-  title={Chemvlm: Exploring the power of multimodal large language models in chemistry area},
-  author={Li, Junxian and Zhang, Di and Wang, Xunzhi and Hao, Zeying and Lei, Jingdi and Tan, Qian and Zhou, Cai and Liu, Wei and Yang, Yaotian and Xiong, Xinrui and others},
-  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
-  volume={39},
-  number={1},
-  pages={415--423},
-  year={2025}
-}
-```
-
-```
-pip install sentencepiece
-pip install einops
-pip install timm
-pip install "accelerate>=0.26.0"
-```
-
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
-import torchvision.transforms as T
-from PIL import Image
-from torchvision.transforms.functional import InterpolationMode
-
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-
-
-def build_transform(input_size):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose([
-        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-        T.ToTensor(),
-        T.Normalize(mean=MEAN, std=STD)
-    ])
-    return transform
-
-
-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-    best_ratio_diff = float('inf')
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-
-def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images
-
-
-def load_image(image_file, input_size=448, max_num=6):
-    image = Image.open(image_file).convert('RGB')
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values)
-    return pixel_values
-
-
-tokenizer = AutoTokenizer.from_pretrained('AI4Chem/ChemVLM-26B-1-2', trust_remote_code=True)
-
-query = "Please describe the molecule in the image."
-image_path = "your image path"
-pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda()
-
-model = AutoModelForCausalLM.from_pretrained(
-    "AI4Chem/ChemVLM-26B-1-2",
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    trust_remote_code=True
-).eval().cuda()
-
-gen_kwargs = {"max_length": 1000, "do_sample": True, "temperature": 0.7, "top_p": 0.9}
-
-response = model.chat(tokenizer, pixel_values, query, gen_kwargs)
-```
+<!-- header start -->
+<p align="center">
+  <img src="https://huggingface.co/datasets/FriendliAI/documentation-images/resolve/main/model-card-assets/friendliai.png" width="100%" alt="FriendliAI Logo">
+</p>
+<!-- header end -->
+
+# AI4Chem/ChemVLM-26B-1-2
+
+* Model creator: [AI4Chem](https://huggingface.co/AI4Chem)
+* Original model: [ChemVLM-26B-1-2](https://huggingface.co/AI4Chem/ChemVLM-26B-1-2)
+
+## Differences
+
+* Added missing eos_token (`<|im_end|>`) to config.json
+
+## License
+
+Refer to the license of the original model card.
chat_template.jinja
ADDED

@@ -0,0 +1,7 @@
+{{- bos_token -}}
+{%- for message in messages -%}
+{{- "<|im_start|>" + message["role"] + "\n" + message["content"] + "<|im_end|>" + "\n" -}}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+{{- "<|im_start|>assistant\n" -}}
+{%- endif -%}
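For reference, a minimal sketch of how this template renders (the message content is illustrative, and a `transformers` release recent enough to read `chat_template.jinja` from the repo is assumed):

```python
# Minimal sketch: render the new chat_template.jinja through the tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "AI4Chem/ChemVLM-26B-1-2", trust_remote_code=True
)

# Illustrative message, not taken from the model card.
messages = [{"role": "user", "content": "Please describe the molecule in the image."}]

# add_generation_prompt=True triggers the template's final branch, appending
# "<|im_start|>assistant\n" so the model starts its reply inside the assistant turn.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# Expected shape:
# <s><|im_start|>user
# Please describe the molecule in the image.<|im_end|>
# <|im_start|>assistant
```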
config.json
CHANGED

@@ -35,7 +35,10 @@
  "do_sample": false,
  "early_stopping": false,
  "encoder_no_repeat_ngram_size": 0,
- "eos_token_id": 2,
+ "eos_token_id": [
+   2,
+   92542
+ ],
  "exponential_decay_length_penalty": null,
  "finetuning_task": null,
  "forced_bos_token_id": null,

@@ -197,4 +200,4 @@
  "use_bfloat16": true,
  "use_flash_attn": true
  }
-}
+}
generation_config.json
CHANGED

@@ -1,4 +1,10 @@
 {
  "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": [
+   2,
+   92542
+ ],
+ "pad_token_id": 2,
  "transformers_version": "4.44.2"
-}
+}
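A hedged sketch of what these defaults change in practice (illustrative, not from the model card): with `eos_token_id` as a list, `generate()` treats every id in it as a stop token, so decoding halts at whichever of `</s>` (2) or `<|im_end|>` (92542) comes first.

```python
# Illustrative check of the updated generation defaults.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("AI4Chem/ChemVLM-26B-1-2")
print(gen_config.bos_token_id)  # expected: 1
print(gen_config.eos_token_id)  # expected: [2, 92542]

# In a generation call the repo's config is picked up automatically, or it
# can be passed explicitly (model and inputs assumed to exist):
# output_ids = model.generate(**inputs, generation_config=gen_config)
```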
tokenizer_config.json
CHANGED

@@ -163,11 +163,10 @@
   ]
  },
  "bos_token": "<s>",
- "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "clean_up_tokenization_spaces": false,
- "eos_token": "</s>",
+ "eos_token": "<|im_end|>",
  "model_max_length": 2048,
  "pad_token": "</s>",
  "tokenizer_class": "InternLM2Tokenizer",
  "unk_token": "<unk>"
-}
+}
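A quick way to confirm the tokenizer-side fix (Hub access assumed):

```python
# Confirms the tokenizer now ends turns on "<|im_end|>" rather than "</s>".
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "AI4Chem/ChemVLM-26B-1-2", trust_remote_code=True
)
print(tokenizer.eos_token)                            # <|im_end|>
print(tokenizer.convert_tokens_to_ids("<|im_end|>"))  # 92542, matching config.json
```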