import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

# disable some warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')

# set device
device = 'cuda'  # or 'cpu'
torch.set_default_device(device)

model_name = 'BAAI/Bunny-v1_1-Llama-3-8B-V'  # or 'BAAI/Bunny-Llama-3-8B-V' or 'BAAI/Bunny-v1_1-4B' or 'BAAI/Bunny-v1_0-4B' or 'BAAI/Bunny-v1_0-3B' or 'BAAI/Bunny-v1_0-3B-zh' or 'BAAI/Bunny-v1_0-2B-zh'

# create model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # float32 for cpu
    device_map='auto',
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True)

# for batch inference: left-pad so all generations start from the same position
tokenizer.padding_side = "left"
tokenizer.pad_token_id = model.generation_config.pad_token_id
padding_max_length = 128  # customize for your circumstance

# register the <image> placeholder token so it can be located in the input ids
tokenizer.add_tokens(['<image>'])
image_token_id = tokenizer.convert_tokens_to_ids('<image>')

# text prompts
prompts = [
    'What is the astronaut holding in his hand?',
    'Why is the image funny?',
    'What is the occupation of the person in the picture?',
    'What animal is in the picture?'
]
texts = [
    f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
    for prompt in prompts]

input_ids = torch.tensor(
    [tokenizer(text, padding='max_length', max_length=padding_max_length).input_ids for text in texts],
    dtype=torch.long).to(device)
# replace the <image> placeholder with the special image-token index the model expects
input_ids[input_ids == image_token_id] = -200

# images, sample images can be found in https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V/tree/main/images
image_paths = [
    'example_1.png',
    'example_2.png',
    'example_1.png',
    'example_2.png'
]
images = [Image.open(image_path) for image_path in image_paths]
image_tensor = model.process_images(images, model.config).to(dtype=model.dtype, device=device)

# generate
output_ids = model.generate(
    input_ids,
    images=image_tensor,
    max_new_tokens=100,
    use_cache=True,
    repetition_penalty=1.0  # increase this to avoid chattering
)

# decode only the newly generated tokens, one answer per prompt
print([ans.strip() for ans in tokenizer.batch_decode(output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)])
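
# --- Optional: single-image inference ---
# A minimal sketch of the same pipeline without batch padding, reusing the
# `model` and `tokenizer` objects created above. It assumes the same Bunny
# conventions as the batch example: the prompt is split on the '<image>'
# placeholder and the special index -200 is spliced in at that position
# (so no max-length padding is needed for a single prompt). The prompt text
# and image path below are placeholders.
prompt = 'Why is the image funny?'
text = (f"A chat between a curious user and an artificial intelligence assistant. "
        f"The assistant gives helpful, detailed, and polite answers to the user's questions. "
        f"USER: <image>\n{prompt} ASSISTANT:")

# tokenize the text on either side of the placeholder; drop the second chunk's
# leading BOS token before splicing in the image-token index
text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
single_input_ids = torch.tensor(
    text_chunks[0] + [-200] + text_chunks[1][1:],
    dtype=torch.long).unsqueeze(0).to(device)

image = Image.open('example_2.png')
single_image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)

single_output_ids = model.generate(
    single_input_ids,
    images=single_image_tensor,
    max_new_tokens=100,
    use_cache=True,
    repetition_penalty=1.0
)[0]
print(tokenizer.decode(single_output_ids[single_input_ids.shape[1]:], skip_special_tokens=True).strip())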