"""Caption mask-highlighted objects in first-frame images with a LLaVA model.

For every record in a JSON annotation file, the first-frame image is loaded,
downscaled by 4, and each annotated object is highlighted via ``blend_mask``.
The model is asked the configured query about the blended image and the answer
is stored under the object's ``"text"`` key. All records are then written to a
new JSON file.
"""

import argparse
import json
import os
import re
from io import BytesIO

import cv2
import requests
import torch
from PIL import Image

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IMAGE_PLACEHOLDER,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
)

# NOTE(review): `decode` and `blend_mask` are called in `eval_model` but are
# neither imported nor defined in this file as seen. Presumably `decode` is an
# RLE mask decoder (e.g. pycocotools.mask.decode) and `blend_mask` overlays the
# mask on the frame -- confirm and import/define them before running.


def image_parser(args):
    """Split ``args.image_file`` on ``args.sep`` and return the pieces.

    The raw value, the separator and the resulting list are echoed to stdout.
    """
    print(args.image_file)
    out = args.image_file.split(args.sep)
    print(args.sep)
    print(out)
    return out


def load_image(image_file):
    """Load an image from a local path or an http(s) URL as an RGB PIL image.

    Args:
        image_file: Filesystem path, or a URL starting with ``http://`` or
            ``https://``.

    Returns:
        The decoded image converted to RGB.

    Raises:
        requests.HTTPError: If the URL answers with an error status.
    """
    if image_file.startswith(("http://", "https://")):
        # A timeout keeps a stalled server from hanging the whole run.
        response = requests.get(image_file, timeout=30)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    # `convert` returns a fully loaded copy, so the file handle can be closed.
    with Image.open(image_file) as img:
        return img.convert("RGB")


def load_images(image_files):
    """Load every entry of ``image_files`` with :func:`load_image`."""
    return [load_image(image_file) for image_file in image_files]


# NOTE(review): "coverd" looks like a typo for "covered"; the string is left
# untouched because it is sent to the model verbatim.
prompt = "Please describe the object coverd by the green mask."
model_path = "liuhaotian/llava-v1.5-7b"
# NOTE(review): `root_path` is under "work2-..." while the JSON paths below are
# under "work-..." -- confirm this difference is intentional.
root_path = '/data/work2-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap'
data_path = "/data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap/ExoQuery_piano.json"
save_path = "/data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap/ExoQuery_piano_withtext.json"


def _infer_conv_mode(model_name):
    """Pick the conversation template name from the model name."""
    lowered = model_name.lower()
    if "llama-2" in lowered:
        return "llava_llama_2"
    if "mistral" in lowered:
        return "mistral_instruct"
    if "v1.6-34b" in lowered:
        return "chatml_direct"
    if "v1" in lowered:
        return "llava_v1"
    if "mpt" in lowered:
        return "mpt"
    return "llava_v0"


def _build_prompt(args, model, model_name):
    """Insert the image token into ``args.query`` and render the chat prompt.

    Sets ``args.conv_mode`` to the inferred mode when the caller gave none.
    """
    qs = args.query
    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    image_token = image_token_se if model.config.mm_use_im_start_end else DEFAULT_IMAGE_TOKEN
    if IMAGE_PLACEHOLDER in qs:
        qs = re.sub(IMAGE_PLACEHOLDER, image_token, qs)
    else:
        qs = image_token + "\n" + qs

    conv_mode = _infer_conv_mode(model_name)
    if args.conv_mode is not None and conv_mode != args.conv_mode:
        # An explicit --conv-mode wins over the inferred one.
        print(
            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
                conv_mode, args.conv_mode, args.conv_mode
            )
        )
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    return conv.get_prompt()


def eval_model(args):
    """Describe every annotated object and write the enriched JSON file.

    Args:
        args: Object with attributes ``model_path``, ``model_base``, ``query``,
            ``conv_mode``, ``temperature``, ``top_p``, ``num_beams`` and
            ``max_new_tokens``. Optional ``root_path``, ``data_path`` and
            ``save_path`` attributes override the module-level defaults.

    Raises:
        FileNotFoundError: If a first-frame image cannot be read.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, _ = load_pretrained_model(
        args.model_path, args.model_base, model_name
    )

    prompt_text = _build_prompt(args, model, model_name)

    image_root = getattr(args, "root_path", root_path)
    source_json = getattr(args, "data_path", data_path)
    target_json = getattr(args, "save_path", save_path)

    # The prompt is the same for every object, so tokenise it once and keep
    # it on the same device as the model.
    input_ids = (
        tokenizer_image_token(prompt_text, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(model.device)
    )

    with open(source_json, "r", encoding="utf-8") as f:
        datas = json.load(f)

    new_data_list = []
    for data in datas:
        query_path = os.path.join(image_root, data["first_frame_image"])
        frame = cv2.imread(query_path)
        if frame is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError(f"cannot read image: {query_path}")

        # v1: directly use the scaled mask stored in the generated JSON file.
        # v2: get the take name, fetch the object dictionary, reverse-map to
        #     the object name, and use the mask from the ground truth.
        h, w = frame.shape[:2]
        # For the case where the query view is exo.
        frame = cv2.resize(frame, (w // 4, h // 4))

        for obj in data["first_frame_anns"]:
            mask = decode(obj["segmentation"])
            mask = cv2.resize(mask, (frame.shape[1], frame.shape[0]))
            out = blend_mask(frame, mask)
            # NOTE(review): cv2.imread yields BGR channel order while PIL
            # assumes RGB; unless blend_mask converts, red and blue are
            # swapped here -- confirm.
            images = [Image.fromarray(out).convert("RGB")]
            image_sizes = [x.size for x in images]
            images_tensor = process_images(
                images, image_processor, model.config
            ).to(model.device, dtype=torch.float16)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=images_tensor,
                    image_sizes=image_sizes,
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    max_new_tokens=args.max_new_tokens,
                    use_cache=True,
                )

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            obj["text"] = outputs

        new_data_list.append(data)

    with open(target_json, "w", encoding="utf-8") as f:
        json.dump(new_data_list, f)


def _parse_args(argv=None):
    """Parse CLI options; with no arguments the module-level defaults are used."""
    parser = argparse.ArgumentParser(
        description="Describe mask-highlighted objects with a LLaVA model."
    )
    parser.add_argument("--model-path", type=str, default=model_path)
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--query", type=str, default=prompt)
    parser.add_argument("--conv-mode", type=str, default=None)
    parser.add_argument("--sep", type=str, default=",")
    parser.add_argument("--temperature", type=float, default=0)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--max_new_tokens", type=int, default=512)
    parser.add_argument("--root-path", type=str, default=root_path)
    parser.add_argument("--data-path", type=str, default=data_path)
    parser.add_argument("--save-path", type=str, default=save_path)
    parsed = parser.parse_args(argv)
    parsed.model_name = get_model_name_from_path(parsed.model_path)
    return parsed


if __name__ == "__main__":
    eval_model(_parse_args())