# ObjectRelator-plus / objectrelator/eval/eval_egoexo_output_text.py
# Hugging Face Hub page residue: uploaded by YuqianFu
# ("Upload folder using huggingface_hub", commit 36c1e62, verified).
import torch
import os
from enum import Enum
from tqdm import tqdm
import numpy as np
from detectron2.structures import BitMasks
from objectrelator.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \
DEFAULT_IM_END_TOKEN, DEFAULT_SEG_TOKEN, SEG_TOKEN_INDEX
from objectrelator.model.builder import load_pretrained_model
from objectrelator.utils import disable_torch_init
from objectrelator.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from objectrelator.mask_config.data_args import DataArguments
import cv2
from torch.utils.data import Dataset, DataLoader
from objectrelator import conversation as conversation_lib
from datasets.egoexo_dataset import EgoExo_Dataset_eval
from pycocotools.mask import encode, decode, frPyObjects
from detectron2.structures import BoxMode
from detectron2.data import MetadataCatalog, DatasetCatalog
from typing import Dict, Optional, Sequence, List
from dataclasses import dataclass, field
import torch.distributed as dist
import transformers
from pathlib import Path
from segmentation_evaluation import openseg_classes
COLOR_MAP = openseg_classes.ADE20K_150_CATEGORIES
from detectron2.data import detection_utils as utils
import pickle
import math
import json
import utils_metric
import os
import re
from natsort import natsorted
from transformers import TextStreamer
# collection func
@dataclass
class DataCollatorForCOCODatasetV2(object):
    """Collate per-sample dicts into a single batch dict.

    ``input_ids`` and ``labels`` are right-padded (with the tokenizer's pad
    token id and ``IGNORE_INDEX`` respectively) and truncated to
    ``tokenizer.model_max_length``. Optional per-sample keys (``image``,
    ``vp_image``, ``dataset_type``, ``class_name_ids``, ``token_refer_id``,
    ``cls_indices``, ``random_idx``, ``class_name_embedding_indices``,
    ``refer_embedding_indices``) are batched only when present in the first
    instance.

    Side effect: ``input_ids``, ``labels`` and ``image`` are removed from each
    instance dict, and the remaining dicts are returned under ``seg_info``.
    ``vp_image`` is not removed, so it stays inside the ``seg_info`` entries.
    """

    tokenizer: transformers.PreTrainedTokenizer

    @staticmethod
    def _stack_or_list(tensors):
        """Stack tensors that are all non-None and same-shaped; else return the list."""
        if all(t is not None and t.shape == tensors[0].shape for t in tensors):
            return torch.stack(tensors)
        return tensors

    @staticmethod
    def _stack_or_pad(tensors, padding_value):
        """Stack same-shaped tensors; otherwise right-pad them to a common length."""
        if any(t.shape != tensors[0].shape for t in tensors):
            return torch.nn.utils.rnn.pad_sequence(
                tensors,
                batch_first=True,
                padding_value=padding_value,
            )
        return torch.stack(tensors, dim=0)

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build the batch dict from ``instances``.

        Returns an empty dict when there are no instances or the first
        instance is an empty dict.
        """
        if not instances or not instances[0]:
            return {}

        first = instances[0]
        pad_id = self.tokenizer.pad_token_id
        max_len = self.tokenizer.model_max_length

        input_ids = torch.nn.utils.rnn.pad_sequence(
            [instance["input_ids"] for instance in instances],
            batch_first=True,
            padding_value=pad_id)
        labels = torch.nn.utils.rnn.pad_sequence(
            [instance["labels"] for instance in instances],
            batch_first=True,
            padding_value=IGNORE_INDEX)
        input_ids = input_ids[:, :max_len]
        labels = labels[:, :max_len]

        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(pad_id),
        )

        if 'image' in first:
            batch['images'] = self._stack_or_list(
                [instance['image'] for instance in instances])
        if 'vp_image' in first:
            batch['vp_images'] = self._stack_or_list(
                [instance['vp_image'] for instance in instances])

        # 'image' is optional (see the membership test above), so drop the
        # consumed keys with pop(..., None) instead of `del`, which raised
        # KeyError for samples that carry no image.
        for instance in instances:
            for key in ('input_ids', 'labels', 'image'):
                instance.pop(key, None)
        batch['seg_info'] = list(instances)

        if 'dataset_type' in first:
            batch['dataset_type'] = [instance['dataset_type'] for instance in instances]
        if 'class_name_ids' in first:
            batch['class_name_ids'] = self._stack_or_pad(
                [instance['class_name_ids'] for instance in instances], -1)
        if 'token_refer_id' in first:
            # Kept as a plain list; no padding or stacking is applied.
            batch['token_refer_id'] = [instance['token_refer_id'] for instance in instances]
        if 'cls_indices' in first:
            batch['cls_indices'] = self._stack_or_pad(
                [instance['cls_indices'] for instance in instances], -1)
        if 'random_idx' in first:
            batch['random_idx'] = torch.stack(
                [instance['random_idx'] for instance in instances], dim=0)
        if 'class_name_embedding_indices' in first:
            batch['class_name_embedding_indices'] = torch.nn.utils.rnn.pad_sequence(
                [instance['class_name_embedding_indices'] for instance in instances],
                batch_first=True,
                padding_value=0)
        if 'refer_embedding_indices' in first:
            batch['refer_embedding_indices'] = torch.nn.utils.rnn.pad_sequence(
                [instance['refer_embedding_indices'] for instance in instances],
                batch_first=True,
                padding_value=0)
        return batch
def __str__(self):
    # NOTE(review): this definition is token-for-token identical to
    # AverageMeter.__str__ further down and reads `self.fmt`, which the
    # collator dataclass above does not define. It looks like a stray copy;
    # confirm whether it is meant to exist here at all before relying on it.
    # Builds a format template from self.fmt and fills it from self.__dict__
    # (expects `name`, `val`, `avg` and `fmt` attributes on `self`).
    fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
    return fmtstr.format(**self.__dict__)
class Summary(Enum):
    """Which statistic ``AverageMeter.summary()`` reports."""

    # Values are 0..3 in declaration order: NONE=0, AVERAGE=1, SUM=2, COUNT=3.
    NONE, AVERAGE, SUM, COUNT = range(4)
class AverageMeter(object):
    """Running tracker holding the last value, the sum, the count and the mean."""

    def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
        self.name, self.fmt, self.summary_type = name, fmt, summary_type
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted ``n`` times in the sum."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def all_reduce(self):
        """Sum ``sum`` and ``count`` across processes via ``torch.distributed``."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Pack the sum (scalar or array) and the count into one flat tensor so
        # that a single collective call reduces both.
        if isinstance(self.sum, np.ndarray):
            packed = self.sum.tolist() + [self.count]
        else:
            packed = [self.sum, self.count]
        total = torch.tensor(packed, dtype=torch.float32, device=device)
        dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
        if total.shape[0] > 2:
            # More than two entries: everything but the last one is the sum array.
            self.sum = total[:-1].cpu().numpy()
            self.count = total[-1].cpu().item()
        else:
            self.sum, self.count = total.tolist()
        # Small epsilon keeps the division defined when the reduced count is zero.
        self.avg = self.sum / (self.count + 1e-5)

    def __str__(self):
        template = f"{{name}} {{val{self.fmt}}} ({{avg{self.fmt}}})"
        return template.format(**self.__dict__)

    def summary(self):
        """Return the one-line summary selected by ``summary_type``.

        Raises:
            ValueError: if ``summary_type`` is not a ``Summary`` member.
        """
        templates = (
            (Summary.NONE, ""),
            (Summary.AVERAGE, "{name} {avg:.3f}"),
            (Summary.SUM, "{name} {sum:.3f}"),
            (Summary.COUNT, "{name} {count:.3f}"),
        )
        for kind, template in templates:
            if self.summary_type is kind:
                return template.format(**self.__dict__)
        raise ValueError("invalid summary type %r" % self.summary_type)
def intersectionAndUnionGPU(output, target, K, ignore_index=255):
    """Per-class intersection, union and target areas for ``K`` classes.

    ``output`` and ``target`` must have the same shape (1 to 3 dims) with
    class ids in ``0 .. K - 1``. Positions where ``target == ignore_index``
    are excluded from all three histograms.

    Note: ``output`` is modified in place (ignored positions are overwritten
    with ``ignore_index``); pass a clone if the caller needs the original.

    Returns:
        Tuple ``(area_intersection, area_union, area_target)``, each a tensor
        with ``K`` bins.
    """
    assert 1 <= output.dim() <= 3
    assert output.shape == target.shape
    flat_pred = output.view(-1)
    flat_gt = target.view(-1)
    # Writing through the view propagates to the caller's tensor.
    flat_pred[flat_gt == ignore_index] = ignore_index

    def per_class_count(values):
        # Values outside [0, K - 1] (i.e. ignore_index) fall out of the histogram.
        return torch.histc(values, bins=K, min=0, max=K - 1)

    matched = flat_pred[flat_pred == flat_gt]
    area_intersection = per_class_count(matched)
    area_target = per_class_count(flat_gt)
    area_union = per_class_count(flat_pred) + area_target - area_intersection
    return area_intersection, area_union, area_target
def parse_outputs(outputs, gt_mask):
    """Convert raw model outputs into plain NumPy result dicts.

    Args:
        outputs: iterable of dicts whose ``'instances'`` entry exposes
            ``pred_masks`` and ``scores`` tensors and, optionally,
            ``pred_classes``.
        gt_mask: ground-truth mask attached unchanged to every result.

    Returns:
        A list with one dict per output, keyed ``'pred'``, ``'gt'``,
        ``'scores'`` and ``'pred_cls'`` (``None`` when the instances carry no
        ``pred_classes`` field).
    """
    res_list = []
    for output in outputs:
        instances = output['instances']
        pred_mask = instances.pred_masks.cpu().numpy()
        scores = instances.scores.cpu().numpy()
        # Only a missing field is tolerated. The previous bare `except:` also
        # hid unrelated failures (CUDA errors, KeyboardInterrupt, ...).
        try:
            pred_cls = instances.pred_classes.cpu().numpy()
        except AttributeError:
            pred_cls = None
        res_list.append({
            'pred': pred_mask,
            'gt': gt_mask,
            'scores': scores,
            'pred_cls': pred_cls,
        })
    return res_list
def compute_metric(intersection_meter, union_meter, acc_iou_meter, gt_cls, results_list):
    """Score the highest-confidence predicted mask of each result against its GT.

    For every entry of ``results_list`` (dicts with ``'gt'``, ``'pred'`` and
    ``'scores'``, as produced by ``parse_outputs``) the mask with the top
    score is selected, its two-class intersection/union is computed with
    ``intersectionAndUnionGPU``, and the three meters are updated.

    Args:
        intersection_meter: meter accumulating per-class intersection areas.
        union_meter: meter accumulating per-class union areas.
        acc_iou_meter: meter accumulating per-sample IoU.
        gt_cls: unused; kept so existing callers keep working.
        results_list: iterable of result dicts.

    Returns:
        ``(pred_list, gt_list)``: the selected prediction mask and the GT mask
        for every result, in order.
    """
    # Previously `.cuda()` was hard-coded, which crashed on CPU-only hosts.
    # The CUDA path is unchanged (int32 masks). On CPU the masks are float32
    # because torch.histc does not accept integer tensors there.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    mask_dtype = torch.int32 if use_cuda else torch.float32

    pred_list = []
    gt_list = []
    for results in results_list:
        gt = results['gt']
        preds = results['pred'].astype(np.uint8)
        # Pick the mask with the maximum score (top-1).
        _, idx = torch.topk(torch.tensor(results['scores']), 1)
        topk_preds = preds[idx.cpu().numpy(), :]

        gt_tensor = torch.tensor(gt).to(device=device, dtype=mask_dtype).contiguous()

        max_acc_iou = -1
        max_iou = 0
        max_intersection = 0
        max_union = 0
        max_i = 0
        # With top-1 selection this loop runs once; it is kept so that a
        # larger k only requires changing the topk call above.
        for i, pred_ in enumerate(topk_preds):
            # The helper overwrites its first argument in place, hence the clone.
            pred_tensor = torch.tensor(pred_).to(device=device, dtype=mask_dtype).contiguous().clone()
            intersection, union, _ = intersectionAndUnionGPU(
                pred_tensor, gt_tensor, 2, ignore_index=255
            )
            intersection, union = intersection.cpu().numpy(), union.cpu().numpy()
            acc_iou = intersection / (union + 1e-5)
            acc_iou[union == 0] = 1.0  # no-object target
            fore_acc_iou = acc_iou[1]
            if fore_acc_iou > max_acc_iou:
                max_acc_iou = fore_acc_iou
                max_iou = acc_iou
                max_intersection = intersection
                max_union = union
                max_i = i

        intersection_meter.update(max_intersection)
        union_meter.update(max_union)
        acc_iou_meter.update(max_iou, n=1)
        pred_list.append(topk_preds[max_i])
        gt_list.append(gt)
    return pred_list, gt_list
@dataclass
class DataArguments:
    """Command-line options for this evaluation script (parsed by HfArgumentParser)."""

    data_path: str = field(
        default=None,
        metadata={"help": "Path to the training data."},
    )
    lazy_preprocess: bool = field(default=False)
    # Forced to True by evaluation() after the model is loaded.
    is_multimodal: bool = field(default=False)
    # Copied to `refcoco_image_folder` by evaluation().
    image_folder: Optional[str] = '/path/to/val2017'
    # Checkpoint location handed to load_pretrained_model.
    model_path: Optional[str] = "/path/to/model"
    # Mask-head config file handed to load_pretrained_model.
    mask_config: Optional[str] = "./objectrelator/mask_config/maskformer2_swin_base_384_bs16_50ep.yaml"
    image_aspect_ratio: str = field(default='square')
    image_grid_pinpoints: Optional[str] = None
    # Annotation JSON: read by the dataset and again for ground-truth masks.
    json_path: str = field(default='/path/to/coco')
    model_map_name: str = field(default='psalm')
    # presumably the conversation-template key; verify against evaluation().
    version: str = field(default='llava_phi')
    output_dir: str = field(default='./output/panoptic_segmentation')
    segmentation: bool = field(default=True)
    # DataLoader batch size and worker count.
    eval_batch_size: int = field(default=1)
    dataloader_num_workers: int = field(default=4)
    seg_task: Optional[str] = "referring"
def _decode_gt_masks(annotations, height, width):
    """Rasterise every annotation's segmentation into a boolean mask.

    Polygon lists are filled with OpenCV onto a ``(height, width)`` canvas.
    Dict segmentations are decoded with pycocotools: uncompressed RLE
    (``counts`` is a list) is converted with ``frPyObjects`` first.
    """
    masks = []
    for annotation in annotations:
        segmentation = annotation['segmentation']
        if isinstance(segmentation, list):
            segm = np.zeros((height, width), dtype=np.uint8)
            for poly in segmentation:
                poly = np.array(poly, dtype=np.int32).reshape(-1, 2)
                cv2.fillPoly(segm, [poly], 1)
        elif isinstance(segmentation['counts'], list):
            # The file imports `decode` / `frPyObjects` directly; the old
            # `mask.` prefix referred to a name that was never defined.
            rle = frPyObjects(segmentation, *segmentation['size'])
            segm = decode(rle)
        else:
            segm = decode(segmentation)
        masks.append(segm.astype(np.bool_))
    return masks


def evaluation():
    """Run EgoExo segmentation evaluation and print gIoU / cIoU.

    Parses ``DataArguments`` from the command line, loads the model, then for
    every batch runs ``model.eval_seg`` for masks and ``model.generate`` for
    the text response (streamed and printed). The top-scoring predicted mask
    is compared with the first ground-truth mask of the matching JSON entry.

    NOTE(review): ground truth is looked up with the dataloader batch index
    and only element 0 of each batch is scored, so results are only
    meaningful with ``eval_batch_size == 1`` and an unshuffled loader.
    The model is loaded with ``device='cuda'``.
    """
    parser = transformers.HfArgumentParser(DataArguments)
    data_args = parser.parse_args_into_dataclasses()[0]
    disable_torch_init()
    model_path = os.path.expanduser(data_args.model_path)
    # model_path = get_latest_checkpoint_path(model_path)
    print(f'current model is {model_path}')
    model_name = 'psalm'
    print('Loading model:', model_name)
    tokenizer, model, image_processor, _ = load_pretrained_model(
        model_path, None, model_name, model_args=data_args,
        mask_config=data_args.mask_config, device='cuda')
    print('Model loaded successfully!')

    data_args.image_processor = image_processor
    data_args.is_multimodal = True
    # The DataArguments class in this file only defines `version`; reading
    # `version_val` unconditionally raised AttributeError. Prefer
    # `version_val` when an argument class provides it.
    template_key = getattr(data_args, 'version_val', data_args.version)
    conversation_lib.default_conversation = conversation_lib.conv_templates[template_key]
    data_args.refcoco_image_folder = data_args.image_folder

    eval_dataset = EgoExo_Dataset_eval(json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args)
    data_collator = DataCollatorForCOCODatasetV2(tokenizer=tokenizer)
    eval_dataloader = DataLoader(
        eval_dataset,
        batch_size=data_args.eval_batch_size,
        collate_fn=data_collator,
        num_workers=data_args.dataloader_num_workers,
    )

    def load_ref_dataset():
        # `RefCOCO_dataset` is neither imported nor defined in this file;
        # build the dataset class this script actually evaluates.
        return EgoExo_Dataset_eval(json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args)

    DatasetCatalog.register('refcoco_dataset', load_ref_dataset)
    MetadataCatalog.get('refcoco_dataset').set(stuff_classes=['object'],)

    gt_json_path = data_args.json_path
    with open(gt_json_path) as f:
        gt_data = json.load(f)
    # `save_suffix` was used in the final report without being defined
    # (NameError after the whole run); name the benchmark after the JSON file.
    save_suffix = Path(gt_json_path).stem

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device=device, dtype=torch.float).eval()

    save_list = []
    intersection_meter = AverageMeter("Intersec", ":6.3f", Summary.SUM)
    union_meter = AverageMeter("Union", ":6.3f", Summary.SUM)
    acc_iou_meter = AverageMeter("gIoU", ":6.3f", Summary.SUM)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.no_grad():
        for idx, inputs in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader)):
            gt = gt_data[idx]['anns']
            h, w = gt_data[idx]['image_info']['height'], gt_data[idx]['image_info']['width']
            # Generate the GT mask; only the first annotation is evaluated.
            masks = _decode_gt_masks(gt, h, w)
            # assert len(masks) == 1 #debug
            gt_mask = masks[0].astype(np.uint8)

            inputs = {k: v.to(device) if torch.is_tensor(v) else v for k, v in inputs.items()}
            # token_refer_id is a list of tensors, so it is moved separately.
            inputs['token_refer_id'] = [ids.to(device) for ids in inputs['token_refer_id']]

            outputs = model.eval_seg(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                images=inputs['images'].float(),
                seg_info=inputs['seg_info'],
                token_refer_id=inputs['token_refer_id'],
                refer_embedding_indices=inputs['refer_embedding_indices'],
                labels=inputs['labels']
            )
            output_ids = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                images=inputs['images'].float(),
                seg_info=inputs['seg_info'],
                token_refer_id=inputs['token_refer_id'],
                refer_embedding_indices=inputs['refer_embedding_indices'],
                labels=inputs['labels'],
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                streamer=streamer,
                use_cache=True,
            )
            # Decode the generated text (tokens after the prompt).
            input_token_len = inputs['input_ids'].shape[1]
            generated_tokens = output_ids[:, input_token_len:]
            generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
            print("output_text:", generated_text)  # debug

            gt_cls = inputs['seg_info'][0]['instances'].gt_classes
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            cur_res = parse_outputs(outputs, gt_mask)
            pred, gt_mask = compute_metric(intersection_meter, union_meter, acc_iou_meter, gt_cls, cur_res)
            save_list.append({'pred': pred[0], 'gt': gt_mask[0], 'name': inputs['seg_info'][0]['file_name']})

    # cIoU: ratio of accumulated intersection to accumulated union (class 1).
    # gIoU: mean of the per-sample IoU (class 1).
    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
    ciou = iou_class[1]
    giou = acc_iou_meter.avg[1]
    msg = "benchmark: {}: giou: {:.4f}, ciou: {:.4f}".format(save_suffix, giou, ciou)
    print(msg)
    # save_path = os.path.join(data_args.model_path,'pred_pkl')
    # Path(save_path).mkdir(parents=True,exist_ok=True)
    # with open(os.path.join(save_path,f'pred_{save_suffix}.txt'),'w') as f:
    #     f.write(msg)
    # save_path_pred = "/scratch/yuqian_fu/test_result/mask/1247a29c-9fda-47ac-8b9c-78b1e76e977e_ref/30_pred_complex_ego_watch.png"
    # save_path_gt = "/scratch/yuqian_fu/test_result/mask/1247a29c-9fda-47ac-8b9c-78b1e76e977e_ref/30_gt.png"
    # os.makedirs(os.path.dirname(save_path_pred), exist_ok=True)
    # cv2.imwrite(save_path_pred, save_list[0]['pred'].astype(np.uint8))
    # os.makedirs(os.path.dirname(save_path_gt), exist_ok=True)
    # cv2.imwrite(save_path_gt, save_list[0]['gt'].astype(np.uint8))
# Script entry point: run the evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    evaluation()