Update README.md
Browse files
README.md
CHANGED
|
@@ -40,8 +40,8 @@ from transformers import AutoModel, AutoTokenizer
|
|
| 40 |
import requests
|
| 41 |
from io import BytesIO
|
| 42 |
|
| 43 |
-
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
| 44 |
-
IMAGENET_STD = (0.229, 0.224, 0.225)
|
| 45 |
|
| 46 |
def build_transform(input_size):
|
| 47 |
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
|
|
@@ -128,7 +128,8 @@ response = requests.get(image_url)
|
|
| 128 |
image_data = BytesIO(response.content)
|
| 129 |
pixel_values = load_image(image_data, max_num=10).to(torch.bfloat16).cuda()
|
| 130 |
generation_config = dict(max_new_tokens=1024, do_sample=True)
|
| 131 |
-
|
|
|
|
| 132 |
num_image_token = 256
|
| 133 |
|
| 134 |
# pure-text conversation
|
|
|
|
| 40 |
import requests
|
| 41 |
from io import BytesIO
|
| 42 |
|
| 43 |
+
IMAGENET_MEAN = (0.5, 0.5, 0.5)
|
| 44 |
+
IMAGENET_STD = (0.5, 0.5, 0.5)
|
| 45 |
|
| 46 |
def build_transform(input_size):
|
| 47 |
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
|
|
|
|
| 128 |
image_data = BytesIO(response.content)
|
| 129 |
pixel_values = load_image(image_data, max_num=10).to(torch.bfloat16).cuda()
|
| 130 |
generation_config = dict(max_new_tokens=1024, do_sample=True)
|
| 131 |
+
|
| 132 |
+
# The recommended range for `num_image_token` is 64 to 576, and the value can be adjusted based on task requirements.
|
| 133 |
num_image_token = 256
|
| 134 |
|
| 135 |
# pure-text conversation
|