ObjectRelator-plus / objectrelator /eval /referring_segmentation_outputText.py

Upload folder using huggingface_hub

36c1e62 verified 3 months ago

20.7 kB

	import argparse
	import torch
	import os
	from enum import Enum
	import json
	from tqdm import tqdm
	import shortuuid
	import numpy as np
	from objectrelator.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \
	DEFAULT_IM_END_TOKEN, DEFAULT_SEG_TOKEN, SEG_TOKEN_INDEX, CLS_TOKEN_INDEX
	from objectrelator.model.builder import load_pretrained_model
	from objectrelator.utils import disable_torch_init
	from objectrelator.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
	import cv2
	from torch.utils.data import Dataset, DataLoader

	from objectrelator import conversation as conversation_lib
	# from objectrelator.train.train_datasets import DataCollatorForCOCODatasetV2, RefCOCO_dataset
	from datasets.egoexo_dataset import EgoExo_Dataset_train

	from detectron2.data import MetadataCatalog, DatasetCatalog
	from pycocotools import mask
	from typing import Dict, Optional, Sequence, List
	from dataclasses import dataclass, field
	import torch.distributed as dist
	import transformers
	import pickle
	from pathlib import Path
	from transformers import TextStreamer

	# Collate function used by the evaluation DataLoader.
	@dataclass
	class DataCollatorForCOCODatasetV2(object):
	    """Collate per-sample dicts into a single batch dict.

	    ``input_ids`` and ``labels`` are padded to a common length (with the
	    tokenizer's pad id and ``IGNORE_INDEX`` respectively) and truncated to
	    ``tokenizer.model_max_length``.  Optional per-sample entries are stacked,
	    padded or passed through as plain lists depending on the key; see
	    ``__call__`` for the exact handling of each key.
	    """

	    tokenizer: transformers.PreTrainedTokenizer

	    @staticmethod
	    def _stack_or_list(tensors):
	        """Stack the tensors if none is None and all shapes agree, else return the list as is."""
	        if all(x is not None and x.shape == tensors[0].shape for x in tensors):
	            return torch.stack(tensors)
	        return tensors

	    @staticmethod
	    def _stack_or_pad(tensors, padding_value):
	        """Stack same-shaped tensors, otherwise pad them to a common length."""
	        if any(x.shape != tensors[0].shape for x in tensors):
	            return torch.nn.utils.rnn.pad_sequence(
	                tensors,
	                batch_first=True,
	                padding_value=padding_value,
	            )
	        return torch.stack(tensors, dim=0)

	    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
	        """Build the batch dict from a sequence of per-sample dicts.

	        Args:
	            instances: per-sample dicts; each must provide ``input_ids`` and
	                ``labels``.  The dicts are modified in place: ``input_ids``,
	                ``labels`` and ``image`` are removed before the dicts are
	                stored under ``batch['seg_info']``.

	        Returns:
	            The batch dict, or an empty dict when the first sample is empty.
	        """
	        if not instances[0]:
	            return {}

	        first = instances[0]
	        pad_sequence = torch.nn.utils.rnn.pad_sequence
	        pad_id = self.tokenizer.pad_token_id
	        max_len = self.tokenizer.model_max_length

	        input_ids = pad_sequence(
	            [instance["input_ids"] for instance in instances],
	            batch_first=True,
	            padding_value=pad_id,
	        )[:, :max_len]
	        labels = pad_sequence(
	            [instance["labels"] for instance in instances],
	            batch_first=True,
	            padding_value=IGNORE_INDEX,
	        )[:, :max_len]
	        batch = dict(
	            input_ids=input_ids,
	            labels=labels,
	            attention_mask=input_ids.ne(pad_id),
	        )

	        if 'image' in first:
	            batch['images'] = self._stack_or_list(
	                [instance['image'] for instance in instances])
	        if 'vp_image' in first:
	            batch['vp_images'] = self._stack_or_list(
	                [instance['vp_image'] for instance in instances])

	        # Whatever is left in each sample dict travels along as 'seg_info'.
	        # 'image' is optional (see the membership test above), so remove the
	        # keys with pop() instead of del to avoid a KeyError when it is absent.
	        for instance in instances:
	            for key in ('input_ids', 'labels', 'image'):
	                instance.pop(key, None)
	        batch['seg_info'] = list(instances)

	        if 'dataset_type' in first:
	            batch['dataset_type'] = [instance['dataset_type'] for instance in instances]

	        if 'class_name_ids' in first:
	            batch['class_name_ids'] = self._stack_or_pad(
	                [instance['class_name_ids'] for instance in instances], -1)
	        if 'token_refer_id' in first:
	            batch['token_refer_id'] = [instance['token_refer_id'] for instance in instances]
	        if 'cls_indices' in first:
	            batch['cls_indices'] = self._stack_or_pad(
	                [instance['cls_indices'] for instance in instances], -1)
	        if 'random_idx' in first:
	            batch['random_idx'] = torch.stack(
	                [instance['random_idx'] for instance in instances], dim=0)
	        if 'class_name_embedding_indices' in first:
	            batch['class_name_embedding_indices'] = pad_sequence(
	                [instance['class_name_embedding_indices'] for instance in instances],
	                batch_first=True,
	                padding_value=0)
	        if 'refer_embedding_indices' in first:
	            batch['refer_embedding_indices'] = pad_sequence(
	                [instance['refer_embedding_indices'] for instance in instances],
	                batch_first=True,
	                padding_value=0)

	        return batch

	class Summary(Enum):
	    """Selects which statistic ``AverageMeter.summary()`` reports."""

	    NONE = 0     # summary() yields an empty string
	    AVERAGE = 1  # summary() reports the running average
	    SUM = 2      # summary() reports the running sum
	    COUNT = 3    # summary() reports the number of updates


	class AverageMeter(object):
	    """Keep the latest value, running sum, update count and mean of a metric."""

	    def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
	        self.name = name
	        self.fmt = fmt
	        self.summary_type = summary_type
	        self.reset()

	    def reset(self):
	        """Zero every tracked statistic."""
	        self.val = self.avg = self.sum = self.count = 0

	    def update(self, val, n=1):
	        """Record ``val`` as the latest value, weighted ``n`` times in the sum."""
	        self.val = val
	        self.sum += val * n
	        self.count += n
	        self.avg = self.sum / self.count

	    def all_reduce(self):
	        """Sum ``sum`` and ``count`` across processes and refresh ``avg``."""
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        # Pack the sum (scalar or array) together with the count so a single
	        # collective call reduces both.
	        if isinstance(self.sum, np.ndarray):
	            packed = self.sum.tolist() + [self.count]
	        else:
	            packed = [self.sum, self.count]
	        total = torch.tensor(packed, dtype=torch.float32, device=device)

	        dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
	        if total.shape[0] > 2:
	            # More than two entries: everything but the last one is the sum.
	            self.sum = total[:-1].cpu().numpy()
	            self.count = total[-1].cpu().item()
	        else:
	            self.sum, self.count = total.tolist()
	        self.avg = self.sum / (self.count + 1e-5)

	    def __str__(self):
	        template = f"{{name}} {{val{self.fmt}}} ({{avg{self.fmt}}})"
	        return template.format(**self.__dict__)

	    def summary(self):
	        """Format the statistic selected by ``summary_type``."""
	        templates = (
	            (Summary.NONE, ""),
	            (Summary.AVERAGE, "{name} {avg:.3f}"),
	            (Summary.SUM, "{name} {sum:.3f}"),
	            (Summary.COUNT, "{name} {count:.3f}"),
	        )
	        for kind, template in templates:
	            if self.summary_type is kind:
	                return template.format(**self.__dict__)
	        raise ValueError("invalid summary type %r" % self.summary_type)


	def intersectionAndUnionGPU(output, target, K, ignore_index=255):
	    """Per-class intersection, union and target area of two label tensors.

	    Args:
	        output: predicted labels with 1, 2 or 3 dimensions, values expected in
	            ``0 .. K - 1``.  The tensor is not modified.
	        target: ground-truth labels, same shape as ``output``.  Positions equal
	            to ``ignore_index`` are excluded from every count.
	        K: number of classes (histogram bins).
	        ignore_index: label value marking positions to ignore.

	    Returns:
	        ``(area_intersection, area_union, area_target)``, each a tensor with
	        ``K`` entries.  Intersection and union carry ``output``'s dtype and the
	        target area carries ``target``'s dtype.
	    """
	    assert output.dim() in [1, 2, 3]
	    assert output.shape == target.shape
	    # reshape() also accepts non-contiguous inputs; clone() keeps the
	    # ignore-masking below from overwriting the caller's tensor.
	    output = output.reshape(-1).clone()
	    target = target.reshape(-1)
	    output[target == ignore_index] = ignore_index

	    def _class_hist(values):
	        # histc is not available for integer tensors on every backend, so
	        # integer inputs are counted in float64 (exact for any realistic
	        # pixel count) and cast back to their own dtype.
	        if values.is_floating_point():
	            return torch.histc(values, bins=K, min=0, max=K - 1)
	        counts = torch.histc(values.double(), bins=K, min=0, max=K - 1)
	        return counts.to(values.dtype)

	    # Ignored positions now hold ignore_index in both tensors; they survive
	    # the equality test but fall outside [0, K - 1] and are dropped by histc.
	    intersection = output[output == target]
	    area_intersection = _class_hist(intersection)
	    area_output = _class_hist(output)
	    area_target = _class_hist(target)
	    area_union = area_output + area_target - area_intersection
	    return area_intersection, area_union, area_target

	def parse_outputs(outputs,gt_mask):
	    """Convert model outputs into plain dicts of numpy arrays.

	    Args:
	        outputs: iterable of dicts whose ``'instances'`` entry exposes
	            ``pred_masks`` and ``scores`` (and optionally ``pred_classes``)
	            as tensors.
	        gt_mask: ground-truth mask stored unchanged in every result.

	    Returns:
	        One dict per output with keys ``'pred'``, ``'gt'``, ``'scores'`` and
	        ``'pred_cls'`` (``None`` when no predicted classes are available).
	    """
	    res_list = []
	    for output in outputs:
	        instances = output['instances']
	        pred_mask = instances.pred_masks.cpu().numpy()
	        scores = instances.scores.cpu().numpy()
	        try:
	            pred_cls = instances.pred_classes.cpu().numpy()
	        except (AttributeError, KeyError):
	            # Only a missing 'pred_classes' field is expected here; anything
	            # else (including KeyboardInterrupt) must propagate.
	            pred_cls = None
	        res_list.append({
	            'pred': pred_mask,
	            'gt': gt_mask,
	            'scores': scores,
	            'pred_cls': pred_cls,
	        })
	    return res_list

	# def create_generate_wrapper(model):
	# """Create a wrapper that handles argument-compatibility issues in the generate method."""
	# original_forward = model.forward

	# def filtered_forward(self, **kwargs):
	# # Filter out unsupported arguments
	# filtered_kwargs = {}
	# supported_params = {
	# 'input_ids', 'attention_mask', 'images', 'seg_info',
	# 'token_refer_id', 'refer_embedding_indices', 'labels',
	# 'past_key_values', 'use_cache'
	# }

	# for key, value in kwargs.items():
	# if key in supported_params:
	# filtered_kwargs[key] = value

	# return original_forward(**filtered_kwargs)

	# # Temporarily replace the forward method
	# import types
	# model.forward = types.MethodType(filtered_forward, model)
	# return model

	def compute_metric(intersection_meter,union_meter,acc_iou_meter, gt_cls, results_list):
	    """Score the highest-scoring predicted mask of each result against its GT.

	    Args:
	        intersection_meter: meter updated with the per-class intersection.
	        union_meter: meter updated with the per-class union.
	        acc_iou_meter: meter updated with the per-class IoU.
	        gt_cls: unused; kept so existing callers keep working.
	        results_list: iterable of dicts with keys ``'gt'``, ``'pred'`` and
	            ``'scores'`` (numpy arrays), as produced by ``parse_outputs``.

	    Returns:
	        ``(pred_list, gt_list)``: the selected prediction and the ground-truth
	        mask for every entry of ``results_list``.
	    """
	    # Follow the same CUDA-if-available rule as the rest of the script
	    # instead of hard-coding .cuda().
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    pred_list = []
	    gt_list = []
	    for results in results_list:
	        gt = results['gt']
	        print("gt:", gt.shape, type(gt)) # debug
	        preds = results['pred']
	        print("preds:", preds.shape, type(preds)) # debug
	        scores = results['scores']
	        print("scores:", scores.shape, type(scores)) # debug
	        preds = preds.astype(np.uint8)
	        # pick mask with maximum score
	        _, idx = torch.topk(torch.tensor(scores), 1)
	        idx = idx.cpu().numpy()
	        topk_preds = preds[idx, :]
	        print("topk_preds:", topk_preds.shape, type(topk_preds)) # debug

	        gt_tensor = torch.tensor(gt).int().to(device).contiguous()
	        max_acc_iou = -1
	        max_iou = 0
	        max_intersection = 0
	        max_union = 0
	        max_i = 0
	        # here topk=1, len(topk_preds)=1
	        for i, pred_ in enumerate(topk_preds):
	            intersection, union, _ = intersectionAndUnionGPU(
	                torch.tensor(pred_).int().to(device).contiguous().clone(),
	                gt_tensor,
	                2,
	                ignore_index=255,
	            )
	            # Widen to float64 before the values reach the running meters:
	            # summing raw int32 pixel counts over many images can overflow.
	            intersection = intersection.cpu().numpy().astype(np.float64)
	            union = union.cpu().numpy().astype(np.float64)
	            acc_iou = intersection / (union + 1e-5)
	            acc_iou[union == 0] = 1.0  # no-object target
	            fore_acc_iou = acc_iou[1]
	            if fore_acc_iou > max_acc_iou:
	                max_acc_iou = fore_acc_iou
	                max_iou = acc_iou
	                max_intersection = intersection
	                max_union = union
	                max_i = i
	        intersection_meter.update(max_intersection)
	        union_meter.update(max_union)
	        acc_iou_meter.update(max_iou, n=1)
	        pred_list.append(topk_preds[max_i])
	        gt_list.append(gt)

	    return pred_list,gt_list






	@dataclass
	class DataArguments:
	    """Command-line options for the evaluation script (parsed by HfArgumentParser)."""

	    data_path: str = field(
	        default=None,
	        metadata={"help": "Path to the training data."},
	    )
	    lazy_preprocess: bool = False
	    is_multimodal: bool = False
	    # Image root; evaluation() copies it to ``refcoco_image_folder``.
	    image_folder: Optional[str] = '/path/to/val2017'
	    # Checkpoint location handed to load_pretrained_model().
	    model_path: Optional[str] = "/path/to/model"
	    mask_config: Optional[str] = "./objectrelator/mask_config/maskformer2_swin_base_384_bs16_50ep.yaml"
	    image_aspect_ratio: str = 'square'
	    image_grid_pinpoints: Optional[str] = None
	    # Annotation JSON; evaluation() reads the ground truth from it.
	    json_path: str = '/path/to/coco'
	    # 'psalm' or 'psalm_output_text'
	    model_map_name: str = 'psalm_output_text'
	    # Key into conversation_lib.conv_templates.
	    version: str = 'llava_phi'
	    output_dir: str = './output/panoptic_segmentation'
	    segmentation: bool = True
	    eval_batch_size: int = 1
	    dataloader_num_workers: int = 4
	    seg_task: Optional[str] = "referring"




	def evaluation():
	    """Evaluate referring segmentation and print gIoU / cIoU.

	    Parses ``DataArguments`` from the command line, loads the model, runs
	    ``model.eval_seg`` on every sample of the dataset built from
	    ``json_path``, prints the decoded text returned alongside the masks, and
	    finally prints the aggregated gIoU and cIoU.

	    Raises:
	        ValueError: if ``eval_batch_size`` is not 1.
	    """
	    parser = transformers.HfArgumentParser(DataArguments)
	    data_args = parser.parse_args_into_dataclasses()[0]
	    # Ground truth is looked up by batch number and only sample 0 of each
	    # batch is scored, so any other batch size would silently pair
	    # predictions with the wrong masks.
	    if data_args.eval_batch_size != 1:
	        raise ValueError(
	            "evaluation() only supports eval_batch_size=1, got %r"
	            % data_args.eval_batch_size
	        )
	    disable_torch_init()
	    model_path = os.path.expanduser(data_args.model_path)
	    # model_name = get_model_name_from_path(model_path)
	    model_name = data_args.model_map_name
	    save_suffix = os.path.basename(data_args.json_path).split('.')[0]
	    print(f'save suffix is {save_suffix}')
	    print(f'current model is {model_path}')
	    tokenizer, model, image_processor, _ = load_pretrained_model(
	        model_path,
	        None,
	        model_name,
	        model_args=data_args,
	        mask_config=data_args.mask_config,
	        device='cuda',
	    )

	    data_args.image_processor = image_processor
	    data_args.is_multimodal = True
	    conversation_lib.default_conversation = conversation_lib.conv_templates[data_args.version]

	    data_args.refcoco_image_folder = data_args.image_folder
	    eval_dataset = EgoExo_Dataset_train(json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args)
	    data_collator = DataCollatorForCOCODatasetV2(tokenizer=tokenizer)
	    eval_dataloader = DataLoader(
	        eval_dataset,
	        batch_size=data_args.eval_batch_size,
	        collate_fn=data_collator,
	        num_workers=data_args.dataloader_num_workers,
	    )

	    def load_ref_dataset():
	        # RefCOCO_dataset is not imported in this file (its import is
	        # commented out), so build the dataset class that is actually used.
	        return EgoExo_Dataset_train(json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args)

	    DatasetCatalog.register('refcoco_dataset', load_ref_dataset)
	    MetadataCatalog.get('refcoco_dataset').set(stuff_classes=['object'],)
	    with open(data_args.json_path) as f:
	        gt_data = json.load(f)

	    device = 'cuda' if torch.cuda.is_available() else 'cpu'

	    model.to(device=device,dtype=torch.float).eval()
	    intersection_meter = AverageMeter("Intersec", ":6.3f", Summary.SUM)
	    union_meter = AverageMeter("Union", ":6.3f", Summary.SUM)
	    acc_iou_meter = AverageMeter("gIoU", ":6.3f", Summary.SUM)

	    with torch.no_grad():
	        for idx, inputs in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader)):
	            sample = gt_data[idx]
	            h, w = sample['image_info']['height'], sample['image_info']['width']
	            # Only the first annotation is used as ground truth, so it is
	            # the only one that needs decoding.
	            gt_mask = _annotation_to_mask(sample['anns'][0], h, w).astype(np.uint8)

	            inputs = {k: v.to(device) if torch.is_tensor(v) else v for k, v in inputs.items()}
	            inputs['token_refer_id'] = [ids.to(device) for ids in inputs['token_refer_id']]
	            outputs, next_token_ids = model.eval_seg(
	                input_ids=inputs['input_ids'],
	                attention_mask=inputs['attention_mask'],
	                images=inputs['images'].float(),
	                seg_info=inputs['seg_info'],
	                token_refer_id=inputs['token_refer_id'],
	                refer_embedding_indices=inputs['refer_embedding_indices'],
	                labels=inputs['labels'],
	            )

	            # ---- text generation part ----
	            print("next_token_ids:", next_token_ids) # debug
	            print("next_token_ids type:", type(next_token_ids), "shape:", next_token_ids.shape if hasattr(next_token_ids, 'shape') else 'no shape')

	            generated_text = _decode_generated_tokens(tokenizer, next_token_ids)
	            print("Generated text:", repr(generated_text))  # repr makes special characters visible
	            print("Generated text (clean):", generated_text.strip())  # text with surrounding whitespace removed

	            gt_cls = inputs['seg_info'][0]['instances'].gt_classes
	            if torch.cuda.is_available():
	                torch.cuda.synchronize()
	            cur_res = parse_outputs(outputs, gt_mask)
	            print("cur_res", len(cur_res)) # debug
	            pred, gt_masks = compute_metric(intersection_meter, union_meter, acc_iou_meter, gt_cls, cur_res)
	            print("pred_mask:", pred[0].shape, np.unique(pred[0]).tolist()) # debug
	            print("gt_mask:", gt_masks[0].shape, np.unique(gt_masks[0]).tolist()) # debug
	            print("=" * 50)  # separator between samples

	    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
	    ciou = iou_class[1]
	    giou = acc_iou_meter.avg[1]
	    msg = "benchmark: {}: giou: {:.4f}, ciou: {:.4f}".format(save_suffix, giou, ciou)
	    print(msg)


	def _annotation_to_mask(annotation, height, width):
	    """Decode one annotation's 'segmentation' (polygon list or RLE dict) into a boolean mask."""
	    segmentation = annotation['segmentation']
	    if isinstance(segmentation, list):
	        # Polygon format: rasterise every polygon into one canvas.
	        segm = np.zeros((height, width), dtype=np.uint8)
	        for poly in segmentation:
	            poly = np.array(poly, dtype=np.int32).reshape(-1, 2)
	            cv2.fillPoly(segm, [poly], 1)
	    elif isinstance(segmentation['counts'], list):
	        # RLE whose counts are a list: convert with frPyObjects before decoding.
	        rle = mask.frPyObjects(segmentation, *segmentation['size'])
	        segm = mask.decode(rle)
	    else:
	        segm = mask.decode(segmentation)
	    return segm.astype(np.bool_)


	def _decode_generated_tokens(tokenizer, token_ids):
	    """Decode the token ids returned by the model into text, whatever container they come in."""
	    if isinstance(token_ids, torch.Tensor):
	        # Covers 0-d, single-element and multi-token tensors alike.
	        return tokenizer.decode(token_ids.flatten().tolist(), skip_special_tokens=True)
	    try:
	        return tokenizer.decode(token_ids, skip_special_tokens=True)
	    except Exception:
	        # Best effort for unexpected containers: show the raw value.
	        return str(token_ids)







	# Run the evaluation only when this file is executed as a script.
	if __name__ == "__main__":
	    evaluation()