ObjectRelator-plus / objectrelator /eval /eval_egoexo_output_text.py

Upload folder using huggingface_hub

36c1e62 verified about 2 months ago

18.4 kB

	import torch
	import os
	from enum import Enum
	from tqdm import tqdm
	import numpy as np
	from detectron2.structures import BitMasks
	from objectrelator.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \
	DEFAULT_IM_END_TOKEN, DEFAULT_SEG_TOKEN, SEG_TOKEN_INDEX
	from objectrelator.model.builder import load_pretrained_model
	from objectrelator.utils import disable_torch_init
	from objectrelator.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
	from objectrelator.mask_config.data_args import DataArguments
	import cv2
	from torch.utils.data import Dataset, DataLoader
	from objectrelator import conversation as conversation_lib
	from datasets.egoexo_dataset import EgoExo_Dataset_eval
	from pycocotools.mask import encode, decode, frPyObjects
	from detectron2.structures import BoxMode
	from detectron2.data import MetadataCatalog, DatasetCatalog
	from typing import Dict, Optional, Sequence, List
	from dataclasses import dataclass, field
	import torch.distributed as dist
	import transformers
	from pathlib import Path
	from segmentation_evaluation import openseg_classes
	COLOR_MAP = openseg_classes.ADE20K_150_CATEGORIES
	from detectron2.data import detection_utils as utils
	import pickle
	import math
	import json
	import utils_metric
	import os
	import re
	from natsort import natsorted
	from transformers import TextStreamer

	# Collate function used by the evaluation DataLoader.
	@dataclass
	class DataCollatorForCOCODatasetV2(object):
	    """Collate per-sample dicts into a single batch dict.

	    ``input_ids`` are padded with the tokenizer's pad id and ``labels`` with
	    ``IGNORE_INDEX``; both are then truncated to
	    ``tokenizer.model_max_length``. Optional per-sample keys are batched only
	    when they are present on the first sample.
	    """

	    tokenizer: transformers.PreTrainedTokenizer

	    @staticmethod
	    def _stack_or_list(tensors):
	        """Stack tensors sharing one shape; otherwise return the list as is."""
	        reference = tensors[0]
	        uniform = reference is not None and all(
	            t is not None and t.shape == reference.shape for t in tensors)
	        return torch.stack(tensors) if uniform else tensors

	    @staticmethod
	    def _pad_or_stack(tensors, padding_value):
	        """Stack same-shaped tensors; pad to a common length when shapes differ."""
	        if any(t.shape != tensors[0].shape for t in tensors):
	            return torch.nn.utils.rnn.pad_sequence(
	                tensors, batch_first=True, padding_value=padding_value)
	        return torch.stack(tensors, dim=0)

	    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
	        """Build the batch dict from a sequence of sample dicts.

	        Args:
	            instances: sample dicts; each must hold ``input_ids`` and
	                ``labels``. Other recognised keys are optional.

	        Returns:
	            The batch dict, or ``{}`` when there are no samples or the first
	            sample is empty.

	        Note:
	            ``input_ids``, ``labels`` and ``image`` are removed from every
	            sample dict in place; what remains is exposed as ``seg_info``.
	        """
	        if not instances or not instances[0]:
	            return {}

	        pad_sequence = torch.nn.utils.rnn.pad_sequence
	        pad_id = self.tokenizer.pad_token_id
	        max_len = self.tokenizer.model_max_length
	        first = instances[0]

	        input_ids = pad_sequence(
	            [inst["input_ids"] for inst in instances],
	            batch_first=True,
	            padding_value=pad_id)[:, :max_len]
	        labels = pad_sequence(
	            [inst["labels"] for inst in instances],
	            batch_first=True,
	            padding_value=IGNORE_INDEX)[:, :max_len]
	        batch = dict(
	            input_ids=input_ids,
	            labels=labels,
	            attention_mask=input_ids.ne(pad_id),
	        )

	        if 'image' in first:
	            batch['images'] = self._stack_or_list(
	                [inst['image'] for inst in instances])
	        if 'vp_image' in first:
	            batch['vp_images'] = self._stack_or_list(
	                [inst['vp_image'] for inst in instances])

	        # Strip the already-batched tensors so seg_info only carries the
	        # remaining metadata. pop() tolerates samples without an 'image',
	        # which the optional check above explicitly allows.
	        for inst in instances:
	            for key in ('input_ids', 'labels', 'image'):
	                inst.pop(key, None)
	        batch['seg_info'] = list(instances)

	        if 'dataset_type' in first:
	            batch['dataset_type'] = [inst['dataset_type'] for inst in instances]

	        if 'class_name_ids' in first:
	            batch['class_name_ids'] = self._pad_or_stack(
	                [inst['class_name_ids'] for inst in instances], -1)
	        if 'token_refer_id' in first:
	            # Kept as a plain list, one entry per sample.
	            batch['token_refer_id'] = [inst['token_refer_id'] for inst in instances]
	        if 'cls_indices' in first:
	            batch['cls_indices'] = self._pad_or_stack(
	                [inst['cls_indices'] for inst in instances], -1)
	        if 'random_idx' in first:
	            batch['random_idx'] = torch.stack(
	                [inst['random_idx'] for inst in instances], dim=0)
	        if 'class_name_embedding_indices' in first:
	            batch['class_name_embedding_indices'] = pad_sequence(
	                [inst['class_name_embedding_indices'] for inst in instances],
	                batch_first=True,
	                padding_value=0)
	        if 'refer_embedding_indices' in first:
	            batch['refer_embedding_indices'] = pad_sequence(
	                [inst['refer_embedding_indices'] for inst in instances],
	                batch_first=True,
	                padding_value=0)

	        return batch


	class Summary(Enum):
	    """Selects which statistic ``AverageMeter.summary`` reports."""

	    NONE = 0     # report nothing
	    AVERAGE = 1  # report the running mean
	    SUM = 2      # report the running sum
	    COUNT = 3    # report the number of samples


	class AverageMeter(object):
	    """Track the latest value, running sum, sample count and mean of a metric."""

	    def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
	        """Store the display name, the format spec and the summary mode."""
	        self.name, self.fmt, self.summary_type = name, fmt, summary_type
	        self.reset()

	    def reset(self):
	        """Zero every tracked statistic."""
	        self.val = self.avg = self.sum = self.count = 0

	    def update(self, val, n=1):
	        """Record ``val`` as the latest value, weighted by ``n`` samples."""
	        self.val = val
	        self.sum += val * n
	        self.count += n
	        self.avg = self.sum / self.count

	    def all_reduce(self):
	        """Sum ``sum`` and ``count`` across processes and refresh ``avg``."""
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        # Pack the sum (scalar or per-class array) followed by the count so a
	        # single collective call covers both.
	        if isinstance(self.sum, np.ndarray):
	            packed = self.sum.tolist() + [self.count]
	        else:
	            packed = [self.sum, self.count]
	        total = torch.tensor(packed, dtype=torch.float32, device=device)

	        dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)

	        if total.shape[0] > 2:
	            # More than two entries: everything but the last is the array sum.
	            self.sum = total[:-1].cpu().numpy()
	            self.count = total[-1].cpu().item()
	        else:
	            self.sum, self.count = total.tolist()
	        self.avg = self.sum / (self.count + 1e-5)

	    def __str__(self):
	        """Render ``name``, the latest value and the average using ``fmt``."""
	        template = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
	        return template.format(**self.__dict__)

	    def summary(self):
	        """Render the statistic selected by ``summary_type``.

	        Raises:
	            ValueError: if ``summary_type`` is not a ``Summary`` member.
	        """
	        templates = (
	            (Summary.NONE, ""),
	            (Summary.AVERAGE, "{name} {avg:.3f}"),
	            (Summary.SUM, "{name} {sum:.3f}"),
	            (Summary.COUNT, "{name} {count:.3f}"),
	        )
	        for kind, template in templates:
	            if self.summary_type is kind:
	                return template.format(**self.__dict__)
	        raise ValueError("invalid summary type %r" % self.summary_type)


	def intersectionAndUnionGPU(output, target, K, ignore_index=255):
	    """Return per-class intersection, union and target areas.

	    ``output`` and ``target`` must share one shape of 1 to 3 dimensions, with
	    class ids in ``0 .. K - 1``. Positions where ``target`` equals
	    ``ignore_index`` are excluded by overwriting them in ``output``; this
	    write is in place, so callers should pass a copy if they need ``output``
	    afterwards.

	    Returns:
	        Tuple ``(area_intersection, area_union, area_target)``, each a
	        ``K``-bin histogram tensor.
	    """
	    assert 1 <= output.dim() <= 3
	    assert output.shape == target.shape

	    flat_pred = output.view(-1)
	    flat_gt = target.view(-1)
	    # Ignored positions take the ignore value, which falls outside the
	    # histogram range below whenever ignore_index > K - 1.
	    flat_pred.masked_fill_(flat_gt == ignore_index, ignore_index)

	    def class_histogram(values):
	        return torch.histc(values, bins=K, min=0, max=K - 1)

	    area_intersection = class_histogram(flat_pred[flat_pred == flat_gt])
	    area_target = class_histogram(flat_gt)
	    area_union = class_histogram(flat_pred) + area_target - area_intersection
	    return area_intersection, area_union, area_target

	def parse_outputs(outputs, gt_mask):
	    """Convert model outputs into plain dicts of NumPy arrays.

	    Args:
	        outputs: iterable of dicts whose ``'instances'`` entry exposes
	            ``pred_masks`` and ``scores`` and, optionally, ``pred_classes``.
	        gt_mask: ground-truth mask, attached unchanged to every result.

	    Returns:
	        One dict per output with keys ``'pred'``, ``'gt'``, ``'scores'`` and
	        ``'pred_cls'``; ``'pred_cls'`` is ``None`` when the instances carry
	        no ``pred_classes`` field.
	    """
	    res_list = []
	    for output in outputs:
	        instances = output['instances']
	        try:
	            pred_cls = instances.pred_classes.cpu().numpy()
	        except (AttributeError, KeyError):
	            # Only a missing field is expected here; a bare except would
	            # also hide KeyboardInterrupt and genuine runtime failures.
	            pred_cls = None
	        res_list.append({
	            'pred': instances.pred_masks.cpu().numpy(),
	            'gt': gt_mask,
	            'scores': instances.scores.cpu().numpy(),
	            'pred_cls': pred_cls,
	        })
	    return res_list

	def compute_metric(intersection_meter, union_meter, acc_iou_meter, gt_cls, results_list):
	    """Score the highest-confidence predicted mask against the ground truth.

	    For every entry of ``results_list`` the mask with the top score is
	    compared with ``results['gt']`` as a two-class (background/foreground)
	    problem, and the three meters are updated with the resulting
	    intersection, union and per-class IoU.

	    Args:
	        intersection_meter: meter accumulating per-class intersections.
	        union_meter: meter accumulating per-class unions.
	        acc_iou_meter: meter accumulating per-sample IoU.
	        gt_cls: unused; kept so existing callers keep working.
	        results_list: iterable of dicts as produced by ``parse_outputs``.

	    Returns:
	        Tuple ``(pred_list, gt_list)`` holding the selected predicted mask
	        and the ground-truth mask of every entry.
	    """
	    use_cuda = torch.cuda.is_available()

	    def to_eval_tensor(mask_array):
	        # The CUDA path is unchanged (int32 on the GPU). On CPU-only hosts the
	        # mask is cast to float. NOTE(review): CPU torch.histc has historically
	        # rejected integer tensors, hence the cast; confirm on the target
	        # PyTorch version.
	        tensor = torch.tensor(mask_array).int()
	        if use_cuda:
	            return tensor.cuda().contiguous()
	        return tensor.float().contiguous()

	    pred_list = []
	    gt_list = []
	    for results in results_list:
	        gt = results['gt']
	        preds = results['pred'].astype(np.uint8)
	        # Keep only the mask with the highest score (top-1).
	        _, idx = torch.topk(torch.tensor(results['scores']), 1)
	        topk_preds = preds[idx.cpu().numpy(), :]

	        gt_tensor = to_eval_tensor(gt)
	        max_acc_iou = -1
	        max_iou = 0
	        max_intersection = 0
	        max_union = 0
	        max_i = 0
	        # With top-1 selection this loop runs exactly once.
	        for i, pred_ in enumerate(topk_preds):
	            # clone(): intersectionAndUnionGPU overwrites its first argument.
	            intersection, union, _ = intersectionAndUnionGPU(
	                to_eval_tensor(pred_).clone(), gt_tensor, 2, ignore_index=255
	            )
	            intersection, union = intersection.cpu().numpy(), union.cpu().numpy()
	            acc_iou = intersection / (union + 1e-5)
	            acc_iou[union == 0] = 1.0  # no-object target
	            fore_acc_iou = acc_iou[1]
	            if fore_acc_iou > max_acc_iou:
	                max_acc_iou = fore_acc_iou
	                max_iou = acc_iou
	                max_intersection = intersection
	                max_union = union
	                max_i = i
	        intersection_meter.update(max_intersection)
	        union_meter.update(max_union)
	        acc_iou_meter.update(max_iou, n=1)
	        pred_list.append(topk_preds[max_i])
	        gt_list.append(gt)

	    return pred_list, gt_list






	@dataclass
	class DataArguments:
	    """Command-line options for the evaluation run.

	    Parsed by ``transformers.HfArgumentParser`` in ``evaluation``.
	    """

	    # Dataset / input locations.
	    data_path: str = field(
	        default=None,
	        metadata={"help": "Path to the training data."},
	    )
	    lazy_preprocess: bool = field(default=False)
	    is_multimodal: bool = field(default=False)
	    image_folder: Optional[str] = '/path/to/val2017'
	    # Model checkpoint and mask-decoder configuration.
	    model_path: Optional[str] = "/path/to/model"
	    mask_config: Optional[str] = "./objectrelator/mask_config/maskformer2_swin_base_384_bs16_50ep.yaml"
	    image_aspect_ratio: str = field(default='square')
	    image_grid_pinpoints: Optional[str] = None
	    json_path: str = field(default='/path/to/coco')
	    model_map_name: str = field(default='psalm')
	    version: str = field(default='llava_phi')
	    output_dir: str = field(default='./output/panoptic_segmentation')
	    segmentation: bool = field(default=True)
	    # DataLoader settings.
	    eval_batch_size: int = field(default=1)
	    dataloader_num_workers: int = field(default=4)
	    seg_task: Optional[str] = "referring"




	def _decode_gt_masks(annotations, height, width):
	    """Rasterise each annotation's ``'segmentation'`` into a boolean mask.

	    Polygon lists are filled onto a ``(height, width)`` canvas; dict
	    segmentations are treated as RLE and decoded with pycocotools.
	    """
	    masks = []
	    for annotation in annotations:
	        segmentation = annotation['segmentation']
	        if isinstance(segmentation, list):
	            canvas = np.zeros((height, width), dtype=np.uint8)
	            for poly in segmentation:
	                points = np.array(poly, dtype=np.int32).reshape(-1, 2)
	                cv2.fillPoly(canvas, [points], 1)
	            masks.append(canvas.astype(np.bool_))
	        else:
	            # `decode` / `frPyObjects` are the names this file imports from
	            # pycocotools.mask; the former `mask.` prefix was never defined.
	            if isinstance(segmentation['counts'], list):
	                rle = frPyObjects(segmentation, *segmentation['size'])
	                segm = decode(rle)
	            else:
	                segm = decode(segmentation)
	            masks.append(segm.astype(np.bool_))
	    return masks


	def evaluation():
	    """Run segmentation and text generation over the EgoExo eval set.

	    Parses ``DataArguments`` from the command line, loads the model, and for
	    every batch runs ``model.eval_seg`` (masks) and ``model.generate``
	    (streamed text). The top-scoring mask is compared with the first
	    ground-truth annotation, and the final gIoU / cIoU are printed.

	    Note:
	        Ground truth is looked up as ``gt_data[idx]`` with the loader's batch
	        index, and only element ``[0]`` of each batch is scored, so the
	        results are only meaningful with ``eval_batch_size == 1`` and an
	        unshuffled loader.
	    """
	    parser = transformers.HfArgumentParser(DataArguments)
	    data_args = parser.parse_args_into_dataclasses()[0]
	    disable_torch_init()
	    model_path = os.path.expanduser(data_args.model_path)
	    print(f'current model is {model_path}')
	    model_name = 'psalm'
	    print('Loading model:', model_name)
	    tokenizer, model, image_processor, _context_len = load_pretrained_model(
	        model_path, None, model_name,
	        model_args=data_args, mask_config=data_args.mask_config, device='cuda')
	    print('Model loaded successfully!')
	    data_args.image_processor = image_processor
	    data_args.is_multimodal = True
	    # DataArguments declares `version`, not `version_val`; prefer the latter
	    # only if some other code attached it to the args object.
	    template_key = getattr(data_args, 'version_val', data_args.version)
	    conversation_lib.default_conversation = conversation_lib.conv_templates[template_key]

	    data_args.refcoco_image_folder = data_args.image_folder
	    eval_dataset = EgoExo_Dataset_eval(
	        json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args)
	    data_collator = DataCollatorForCOCODatasetV2(tokenizer=tokenizer)
	    eval_dataloader = DataLoader(
	        eval_dataset,
	        batch_size=data_args.eval_batch_size,
	        collate_fn=data_collator,
	        num_workers=data_args.dataloader_num_workers)

	    def load_ref_dataset():
	        # The original referenced RefCOCO_dataset, which this file never
	        # imports; use the dataset class that is actually in scope.
	        return EgoExo_Dataset_eval(
	            json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args)

	    DatasetCatalog.register('refcoco_dataset', load_ref_dataset)
	    MetadataCatalog.get('refcoco_dataset').set(stuff_classes=['object'],)
	    with open(data_args.json_path) as f:
	        gt_data = json.load(f)
	    # Label for the final report; it was previously undefined, which raised a
	    # NameError only after the whole evaluation had finished.
	    save_suffix = os.path.splitext(os.path.basename(data_args.json_path))[0]

	    device = 'cuda' if torch.cuda.is_available() else 'cpu'

	    model.to(device=device, dtype=torch.float).eval()
	    # Kept for optional dumping of predictions; not consumed below.
	    save_list = []
	    intersection_meter = AverageMeter("Intersec", ":6.3f", Summary.SUM)
	    union_meter = AverageMeter("Union", ":6.3f", Summary.SUM)
	    acc_iou_meter = AverageMeter("gIoU", ":6.3f", Summary.SUM)

	    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

	    with torch.no_grad():
	        for idx, inputs in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader)):
	            record = gt_data[idx]
	            h, w = record['image_info']['height'], record['image_info']['width']
	            # Build the ground-truth masks; only the first one is scored.
	            masks = _decode_gt_masks(record['anns'], h, w)
	            gt_mask = masks[0].astype(np.uint8)

	            inputs = {k: v.to(device) if torch.is_tensor(v) else v for k, v in inputs.items()}
	            inputs['token_refer_id'] = [ids.to(device) for ids in inputs['token_refer_id']]
	            # Both model calls receive exactly the same inputs.
	            model_inputs = dict(
	                input_ids=inputs['input_ids'],
	                attention_mask=inputs['attention_mask'],
	                images=inputs['images'].float(),
	                seg_info=inputs['seg_info'],
	                token_refer_id=inputs['token_refer_id'],
	                refer_embedding_indices=inputs['refer_embedding_indices'],
	                labels=inputs['labels'],
	            )
	            outputs = model.eval_seg(**model_inputs)
	            output_ids = model.generate(
	                **model_inputs,
	                do_sample=True,
	                temperature=0.2,
	                max_new_tokens=1024,
	                streamer=streamer,
	                use_cache=True,
	            )

	            # Decode the generated text (prompt tokens are sliced off first).
	            input_token_len = inputs['input_ids'].shape[1]
	            generated_tokens = output_ids[:, input_token_len:]
	            generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
	            print("output_text:", generated_text)  # debug

	            gt_cls = inputs['seg_info'][0]['instances'].gt_classes
	            if torch.cuda.is_available():
	                torch.cuda.synchronize()
	            cur_res = parse_outputs(outputs, gt_mask)
	            pred, gt_mask = compute_metric(
	                intersection_meter, union_meter, acc_iou_meter, gt_cls, cur_res)
	            save_list.append({
	                'pred': pred[0],
	                'gt': gt_mask[0],
	                'name': inputs['seg_info'][0]['file_name'],
	            })

	    # Index 1 is the foreground class of the two-class histograms.
	    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
	    ciou = iou_class[1]
	    giou = acc_iou_meter.avg[1]
	    msg = "benchmark: {}: giou: {:.4f}, ciou: {:.4f}".format(save_suffix, giou, ciou)
	    print(msg)







	# Script entry point: run the evaluation only when executed directly.
	if __name__ == "__main__":
	    evaluation()