# Copyright 2025 The Scenic Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation utils for PixelLLM."""
# pylint: disable=g-explicit-length-test
import json
import os
from typing import Any, Dict, Optional, List
from absl import logging
from coco_caption.coco import COCO as COCOCaption
import cv2
# pylint: disable=g-import-not-at-top
try:
from coco_caption.eval import COCOEvalCap
from coco_caption.bleu import Bleu
from coco_caption.cider import Cider
from coco_caption.meteor import Meteor
from coco_caption.rouge import Rouge
from coco_caption.upp_tokenizer import tokenize
except ImportError:
COCOEvalCap = None
Bleu = None
Cider = None
Meteor = None
Rouge = None
tokenize = None
import numpy as np
from pycocotools import mask as mask_api
from scenic.model_lib.base_models import box_utils
from scenic.projects.pixel_llm import densecap_evaluator
# Evaluator without METEOR and SPICE
# This import raises an error on colab.
# pylint: disable=g-import-not-at-top
try:
from pix2seq.metrics.coco_caption_eval import COCOEvalCap as SimpleCOCOEvalCap
except ImportError:
SimpleCOCOEvalCap = None
import tensorflow as tf
class PointEvaluator(object):
"""Class that evaluate the point prediction."""
def __init__(
self, dataset_name: Optional[str] = '', step: Optional[int] = None
):
del dataset_name, step
self.results = []
self._num_examples_added = 0
def add_example(self, prediction: Any, target: Dict[str, np.ndarray]):
"""Compute MSE."""
self._num_examples_added += 1
# [num_caps, max_text_tokens, num_gt_points, 2]
gt_coords = target['points']
# [num_caps, max_text_tokens]
valid_token_mask = target.get(
'token_padding_mask', target['text_tokens'] > 0
)
valid_token_mask *= gt_coords.max(axis=(-2, -1)) > 0
# [num_caps, max_text_tokens, 2]
# or [num_caps, max_text_tokens, num_pred_points, 2]
pred_coords = prediction['point_coords']
if pred_coords.ndim == 3:
# [num_caps, max_text_tokens, 1, 2]
pred_coords = pred_coords.reshape(gt_coords.shape[:-2] + (1, 2))
# normalize coords
height, width = target['size']
gt_coords = gt_coords / np.array([width, height])
pred_coords = pred_coords / np.array([width, height])
# [num_caps, max_tokens, num_pred_points, num_gt_points]
dist = np.mean(
np.abs(
np.expand_dims(pred_coords, axis=3)
- np.expand_dims(gt_coords, axis=2)
),
axis=-1,
)
# only count the dist to the closest GT
# [num_caps, max_tokens, num_pred_points]
dist = np.min(dist, axis=-1).mean(axis=-1)
# [num_caps, max_tokens]
dist *= valid_token_mask
error = dist.sum() / (valid_token_mask.sum() + 1e-8)
self.results.append(error)
def __len__(self):
return self._num_examples_added
def clear(self):
self.results = []
self._num_examples_added = 0
def compute_metrics(
self,
save_dir: str,
clear_annotations: Optional[bool] = True,
skip_evaluate=False,
):
del save_dir, skip_evaluate
result = np.array(self.results).mean()
if clear_annotations:
self.clear()
return {'point_l1': result}
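# Example usage of PointEvaluator (a minimal sketch; shapes follow the
# comments in `add_example` above, and all values below are hypothetical):
#
#   evaluator = PointEvaluator()
#   target = {
#       'points': np.full((2, 8, 4, 2), 100.0),  # [num_caps, max_tokens, num_gt_points, 2]
#       'text_tokens': np.ones((2, 8), dtype=np.int32),
#       'size': (480, 640),                      # (height, width)
#   }
#   prediction = {'point_coords': np.full((2, 8, 2), 90.0)}
#   evaluator.add_example(prediction, target)
#   evaluator.compute_metrics(save_dir='')       # -> {'point_l1': ...}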
class CaptionEvaluator(object):
"""Class that feeds model outputs to COCO caption evaluation api."""
def __init__(
self, annotations_loc, eval_meteor_spice=False, step: Optional[int] = None
):
self.annotations_loc = annotations_loc
logging.info('Initializing evaluator.')
if self.annotations_loc:
logging.info('Loading annotations from %s.', self.annotations_loc)
self.coco = COCOCaption(self.annotations_loc)
self.annotations = {
'images': [],
'annotations': [],
'type': 'captions',
'info': {},
'licenses': [],
'categories': [{'id': 1, 'name': 'object'}],
}
self.predictions = []
self.pred_image_set = set()
self.gt_image_set = set()
self._num_examples_added = 0
self._num_captions_added = 0
self.eval_meteor_spice = eval_meteor_spice
self.step = step
def add_example(self, prediction: Any, target: Dict[str, np.ndarray]):
"""Add a single example to the evaluator.
Args:
prediction: Model prediction tuple of 3 arrays: boxes, scores, classes.
'boxes' is in shape of `[num_objects, 4]` and 'pred_boxes', 'classes'
are botoh in shape of `[num_objects, num_classes]`. Box coordinates are
absolute values in the input image coordinates. We need to scale them
back to the original image coordinates using information in target.
target: Target dictionary with keys and 'image/id'.
"""
if isinstance(prediction, dict):
pred_caption = prediction['caption']
else:
pred_caption = prediction
self._num_examples_added += 1
id_key = 'image_id'
empty_gt = False
if self.annotations_loc:
# we will use image_id that matches the annotation file
img_id = int(target['image/id'])
else:
# we will create image_id on the fly
img_id = self._num_examples_added
if img_id not in self.gt_image_set:
# avoid adding the same image twice due to repeated sampling.
self.annotations['images'].append({'id': img_id})
for x in target['captions']:
# NOTE: if an image has a prediction but no ground truth, the COCO API
# raises an error. We use `empty_gt` to mark such images and skip them.
if x: # remove empty captions from padding.
self._num_captions_added += 1
self.annotations['annotations'].append(
{'id': self._num_captions_added, id_key: img_id, 'caption': x}
)
# NOTE: `empty_gt` marks the case where img_id is added to gt_image_set
# even though it has no ground-truth captions (all were filtered out above).
empty_gt = sum(len(t) for t in target['captions']) == 0
self.gt_image_set.add(img_id)
single_prediction = {
id_key: img_id,
'caption': pred_caption,
}
if img_id not in self.pred_image_set:
if empty_gt:
logging.warning('Image %s does not have any ground truth caption', img_id)
else:
self.predictions.append(single_prediction)
else:
logging.warning('Duplicate image %s not being added again', img_id)
self.pred_image_set.add(img_id)
def compute_metrics(
self,
save_dir: str,
clear_annotations: Optional[bool] = True,
skip_evaluate=False,
):
"""Computes the metrics for all added predictions."""
json_file_path = self.write_pred_annotations_to_file(save_dir)
if skip_evaluate:
return {}
if not self.annotations_loc:
gt_file_path = self.write_pred_annotations_to_file(
save_dir, is_groundtruth=True
)
self.coco = COCOCaption(gt_file_path)
coco_res = self.coco.loadRes(json_file_path)
evaluator_class = (
COCOEvalCap if (self.eval_meteor_spice) else SimpleCOCOEvalCap
)
coco_eval = evaluator_class(self.coco, coco_res)
coco_eval.params['image_id'] = coco_res.getImgIds()
coco_eval.evaluate()
results = coco_eval.eval
if clear_annotations:
self.clear()
return results
def clear(self):
self.predictions = []
self.pred_image_set = set()
self._num_examples_added = 0
self._num_captions_added = 0
def __len__(self):
return self._num_examples_added
def write_pred_annotations_to_file(
self, path: str, is_groundtruth: bool = False
):
"""Writes predictions to file in JSON format.
Args:
path: Path to write the prediction annotation JSON file.
is_groundtruth: bool; if the file is ground truth or prediction.
Returns:
json_file_path: path to the saved json
"""
if not tf.io.gfile.exists(path):
tf.io.gfile.makedirs(path)
fname_app = 'predictions' if not is_groundtruth else 'annotations'
if self.step:
json_file_name = f'caption_{fname_app}_{self.step}.json'
else:
json_file_name = f'caption_{fname_app}.json'
json_file_path = os.path.join(path, json_file_name)
logging.info('Saving predictions to %s.', json_file_path)
def _convert_to_serializable(obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, np.float32):
return float(obj)
else:
raise TypeError(f'Unserializable object {obj} of type {type(obj)}')
with tf.io.gfile.GFile(json_file_path, 'w') as f:
f.write(
json.dumps(
self.predictions if not is_groundtruth else self.annotations,
default=_convert_to_serializable,
)
)
logging.info('Predicted annotations are stored in %s.', json_file_path)
return json_file_path
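# Example usage of CaptionEvaluator (a minimal sketch; the captions and save
# directory below are hypothetical, and `compute_metrics` assumes the caption
# scorers imported at the top of this file are available). With an empty
# `annotations_loc`, image ids are assigned on the fly:
#
#   evaluator = CaptionEvaluator(annotations_loc='')
#   target = {'captions': ['a dog on the grass', 'a brown dog']}
#   evaluator.add_example({'caption': 'a dog running on grass'}, target)
#   metrics = evaluator.compute_metrics(save_dir='/tmp/caption_eval')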
def rescale_and_convert_boxes_to_xywh(boxes, input_size, orig_size):
"""Rescale boxes, and convert format to xywh."""
h, w = orig_size
input_h, input_w = np.asarray(input_size)
scale_factor = np.array([w, h, w, h]) / np.array(
[input_w, input_h, input_w, input_h])
boxes = boxes * scale_factor[np.newaxis, :]
boxes = np.maximum(boxes, 0)
boxes[:, [0, 2]] = np.minimum(boxes[:, [0, 2]], w)
boxes[:, [1, 3]] = np.minimum(boxes[:, [1, 3]], h)
boxes[:, 2] -= boxes[:, 0]
boxes[:, 3] -= boxes[:, 1]
return boxes
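# Worked example (hypothetical numbers): an XYXY box [64, 32, 128, 96]
# predicted on a 512x512 input whose original image size is 256x256 is first
# rescaled to [32, 16, 64, 48] and then converted to XYWH:
#
#   boxes = np.array([[64., 32., 128., 96.]])
#   rescale_and_convert_boxes_to_xywh(boxes, input_size=(512, 512),
#                                     orig_size=(256, 256))
#   # -> array([[32., 16., 32., 32.]])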
def rescale_and_encode_masks(
masks, input_size, padded_size, orig_size, mask_threshold
):
"""Rescale masks, and encode into COCO format."""
input_h, input_w = input_size
padded_h, padded_w = padded_size
h, w = orig_size
out_masks = []
for mask in masks:
mask_h, mask_w = mask.shape
mask_input_h = int(input_h * (mask_h / padded_h))
mask_input_w = int(input_w * (mask_w / padded_w))
mask = (
cv2.resize(
mask[:mask_input_h, :mask_input_w],
(w, h),
interpolation=cv2.INTER_LINEAR,
)
> mask_threshold
)
out_masks.append(mask_api.encode(
np.asfortranarray(mask)
))
return out_masks
def polygons_to_bitmask(
polygons: List[np.ndarray], height: int, width: int
) -> np.ndarray:
"""Converts polygons to bitmask.
Reference:
https://github.com/facebookresearch/detectron2/blob/main/detectron2/structures/masks.py#L22
Args:
polygons(list[ndarray]): each array has shape (Nx2,)
height(int):
width(int):
Returns:
ndarray: a bool mask of shape (height, width)
"""
if not len(polygons):
# COCOAPI does not support empty polygons
return np.zeros((height, width)).astype(bool)
rles = mask_api.frPyObjects(polygons, height, width)
rle = mask_api.merge(rles)
return mask_api.decode(rle).astype(bool)
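# Example (hypothetical polygon): a single rectangular polygon given as a flat
# [x0, y0, x1, y1, ...] vertex array is rasterized into a (height, width)
# bool mask:
#
#   poly = [np.array([1., 1., 4., 1., 4., 3., 1., 3.])]  # 4 vertices, flattened
#   polygons_to_bitmask(poly, height=5, width=6).sum()   # number of foreground pixels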
def decode_to_mask(segm, image_size):
"""Converts segmentation to mask."""
if isinstance(segm, list):
# polygon
mask = polygons_to_bitmask(segm, *image_size)
elif isinstance(segm, dict):
# COCO RLE
mask = mask_api.decode(segm)
elif isinstance(segm, np.ndarray):
assert (
segm.ndim == 2
), 'Expect segmentation of 2 dimensions, got {}.'.format(segm.ndim)
# mask array
mask = segm
else:
raise ValueError(
"Cannot convert segmentation of type '{}' to BitMasks!"
'Supported types are: polygons as list[list[float] or ndarray],'
' COCO-style RLE as a dict, or a binary segmentation mask '
' in a 2D numpy array of shape HxW.'.format(type(segm))
)
return mask
def mask_to_box(mask):
"""Converts mask to box."""
boxes = np.zeros((4,), dtype=np.float32)
x_any = np.any(mask, axis=0)
y_any = np.any(mask, axis=1)
x = np.where(x_any)[0]
y = np.where(y_any)[0]
if len(x) and len(y):
boxes = np.array([x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=np.float32)
return boxes
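# Example (hypothetical mask): a 4x5 mask with foreground pixels in rows 1-2
# and columns 1-3 yields the XYXY box [1, 1, 4, 3]; a 2D ndarray passed to
# decode_to_mask is returned as-is:
#
#   mask = np.zeros((4, 5), dtype=bool)
#   mask[1:3, 1:4] = True
#   mask_to_box(mask)             # -> array([1., 1., 4., 3.], dtype=float32)
#   decode_to_mask(mask, (4, 5))  # -> same 4x5 bool array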
class RefCocoEvaluator(object):
"""Class that evaluates the RefCOCO.
Reference: https://github.com/ashkamath/mdetr/blob/main/datasets/refexp.py
"""
def __init__(
self,
dataset_name: str,
annotations_loc: str,
k=(1,),
iou_threshold=0.5,
step: Optional[int] = None,
):
self.dataset_name = dataset_name
self.annotations_loc = annotations_loc
if self.annotations_loc:
logging.info('Loading refer annotations from %s.', self.annotations_loc)
self.annotations = json.load(tf.io.gfile.GFile(self.annotations_loc))
else:
self.annotations = {
'images': [],
'annotations': [],
'type': 'refer',
'info': {},
'licenses': [],
'categories': [{'id': 1, 'name': 'object'}],
}
self.predictions = []
self.pred_image_set = set()
self.gt_image_set = set()
self.k = k
self.iou_threshold = iou_threshold
self.mask_threshold = 0.
# self.results = []
self._num_examples_added = 0
self.step = step
def add_example(self, prediction: Any, target: Dict[str, np.ndarray]):
"""Compute Precision."""
boxes = prediction['detection_boxes']
masks = prediction.get('detection_masks', None)
boxes = rescale_and_convert_boxes_to_xywh(
boxes, target['size'], target['orig_size']
)
boxes = np.asarray(boxes).tolist()
if masks is not None:
masks = rescale_and_encode_masks(
masks,
target['size'],
target['padded_size'],
target['orig_size'],
self.mask_threshold,
)
img_id = int(target['image/id'])
if img_id in self.pred_image_set:
logging.warning('Duplicate image %s not being added again', img_id)
return
self.pred_image_set.add(img_id)
for i in range(len(boxes)):
refexp_id = int(target['refexp_ids'][i])
# [4], in XYWH absolute format (converted above)
pred_box = boxes[i]
caption = target['captions'][i]
if not refexp_id > 0:
continue
self._num_examples_added += 1
single_pred = {
'id': refexp_id,
'image_id': img_id,
'bbox': pred_box,
'refexp': caption,
}
if masks is not None:
single_pred['segmentation'] = masks[i]
self.predictions.append(single_pred)
# create annotation json
if not self.annotations_loc and img_id not in self.gt_image_set:
# avoid adding the same image twice due to repeated sampling.
self.annotations['images'].append({'id': img_id})
gt_boxes = target['boxes']
gt_boxes = rescale_and_convert_boxes_to_xywh(
gt_boxes, target['size'], target['orig_size']
)
gt_boxes = np.asarray(gt_boxes).tolist()
for i in range(len(gt_boxes)):
gt_box = gt_boxes[i]
refexp_id = int(target['refexp_ids'][i])
if not refexp_id > 0:
continue
caption = target['captions'][i]
self.annotations['annotations'].append({
'id': refexp_id,
'image_id': img_id,
'bbox': gt_box,
'refexp': caption,
})
self.gt_image_set.add(img_id)
def __len__(self):
return self._num_examples_added
def clear(self):
self.predictions = []
self._num_examples_added = 0
self.pred_image_set = set()
self.gt_image_set = set()
def write_pred_annotations_to_file(
self, path: str, is_groundtruth: bool = False
):
"""Writes predictions to file in JSON format.
Args:
path: Path to write the prediction annotation JSON file.
is_groundtruth: bool; if the file is ground truth or prediction.
Returns:
json_file_path: path to the saved json
"""
if not tf.io.gfile.exists(path):
tf.io.gfile.makedirs(path)
fname_app = 'predictions' if not is_groundtruth else 'annotations'
if self.step:
json_file_name = f'{self.dataset_name}_{fname_app}_{self.step}.json'
else:
json_file_name = f'{self.dataset_name}_{fname_app}.json'
json_file_path = os.path.join(path, json_file_name)
logging.info('Saving predictions to %s.', json_file_path)
def _convert_to_serializable(obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, np.float32):
return float(obj)
else:
raise TypeError(f'Unserializable object {obj} of type {type(obj)}')
with tf.io.gfile.GFile(json_file_path, 'w') as f:
f.write(
json.dumps(
self.predictions if not is_groundtruth else self.annotations,
default=_convert_to_serializable,
)
)
logging.info('Predicted annotations are stored in %s.', json_file_path)
return json_file_path
def compute_metrics(
self,
save_dir: str,
clear_annotations: Optional[bool] = True,
skip_evaluate=False,
) -> Dict[str, Any]:
"""Computes the metrics for all added predictions."""
self.write_pred_annotations_to_file(save_dir)
if not self.annotations_loc:
self.write_pred_annotations_to_file(save_dir, is_groundtruth=True)
if skip_evaluate:
return {}
pred_map = {d['id']: idx for idx, d in enumerate(self.predictions)}
# NOTE(jiaruixu): handle COCO-style annotations
if 'refexp_id' in self.annotations['annotations'][0]:
gt_anno_map = {}
for idx, d in enumerate(self.annotations['annotations']):
refexp_ids = d['refexp_id']
for refexp_id in refexp_ids:
gt_anno_map[refexp_id] = idx
else:
gt_anno_map = {
d['id']: idx for idx, d in enumerate(self.annotations['annotations'])
}
gt_image_map = {
d['id']: idx for idx, d in enumerate(self.annotations['images'])
}
eval_seg = (
'segmentation' in self.predictions[0]
and 'segmentation' in self.annotations['annotations'][0]
)
box_tp_list = []
seg_inter_list = []
seg_union_list = []
seg_box_tp_list = []
for refexp_id in pred_map:
pred = self.predictions[pred_map[refexp_id]]
gt_anno = self.annotations['annotations'][gt_anno_map[refexp_id]]
# single box
pred_box = np.array(pred['bbox']).reshape(-1, 4)
gt_box = np.array(gt_anno['bbox']).reshape(-1, 4)
pred_box[:, 2:4] += pred_box[:, :2]
gt_box[:, 2:4] += gt_box[:, :2]
box_iou, _ = box_utils.box_iou(pred_box, gt_box, np_backbone=np)
for k in self.k:
box_tp_list.append(max(box_iou[:k]) > self.iou_threshold)
if eval_seg:
gt_image = self.annotations['images'][gt_image_map[gt_anno['image_id']]]
image_size = (gt_image['height'], gt_image['width'])
pred_mask = decode_to_mask(pred['segmentation'], image_size)
gt_mask = decode_to_mask(gt_anno['segmentation'], image_size)
cur_inter = (pred_mask & gt_mask).sum()
cur_union = (pred_mask | gt_mask).sum()
seg_inter_list.append(cur_inter)
seg_union_list.append(cur_union)
pred_seg_box = mask_to_box(pred_mask).reshape(-1, 4)
seg_box_iou, _ = box_utils.box_iou(pred_seg_box, gt_box, np_backbone=np)
for k in self.k:
seg_box_tp_list.append(max(seg_box_iou[:k]) > self.iou_threshold)
# compute mean over all refexp
box_tp = (
np.array(box_tp_list).reshape(len(pred_map), len(self.k)).mean(axis=0)
)
metrics = {
f'box_Precision@{k}': result for k, result in zip(self.k, box_tp)
}
if eval_seg:
# compute mean over all refexp
seg_box_tp = (
np.array(seg_box_tp_list)
.reshape(len(pred_map), len(self.k))
.mean(axis=0)
)
metrics.update(
{
f'seg_box_Precision@{k}': result
for k, result in zip(self.k, seg_box_tp)
}
)
seg_inter_list = np.array(seg_inter_list)
seg_union_list = np.array(seg_union_list)
metrics['seg_cIoU'] = seg_inter_list.mean() / (
seg_union_list.mean() + 1e-5
)
metrics['seg_gIoU'] = (seg_inter_list / (seg_union_list + 1e-5)).mean()
metrics['seg_AP'] = (
(seg_inter_list / (seg_union_list + 1e-5)) > self.iou_threshold
).mean()
if clear_annotations:
self.clear()
return metrics
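# Example usage of RefCocoEvaluator (a minimal sketch; ids, sizes and boxes
# below are hypothetical). With an empty `annotations_loc`, the ground truth
# is built on the fly from the targets:
#
#   evaluator = RefCocoEvaluator('refcoco_val', annotations_loc='', k=(1,))
#   target = {
#       'image/id': 7,
#       'size': (640, 640), 'orig_size': (480, 640), 'padded_size': (640, 640),
#       'refexp_ids': np.array([101]),
#       'captions': ['the dog on the left'],
#       'boxes': np.array([[50., 60., 200., 220.]]),   # XYXY, input coords
#   }
#   prediction = {'detection_boxes': np.array([[55., 65., 205., 215.]])}
#   evaluator.add_example(prediction, target)
#   evaluator.compute_metrics('/tmp/refcoco_eval')     # -> {'box_Precision@1': ...}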
class DensecapEvaluator(object):
"""DensecapEvaluator wrapper."""
def __init__(self, dataset_name: str, annotations_loc, eval_meteor=True,
ignore_empty_string=True,
step: Optional[int] = None):
self.dataset_name = dataset_name
self.step = step
self.evaluator = densecap_evaluator.DensecapEval(
annotations_loc, eval_meteor=eval_meteor,
ignore_empty_string=ignore_empty_string)
self.predictions = []
self._num_examples_added = 0
self.pred_image_set = set()
def add_example(self, prediction: Any, target: Dict[str, np.ndarray]):
"""Add prediction of a single image to the evaluator.
Args:
prediction: Model prediction dict with keys 'detection_boxes' (shape
`[num_objects, 4]`), 'detection_scores' (shape `[num_objects]`), and
'captions' (a list of strings). Box coordinates are absolute values in the
input image coordinates; they are rescaled back to the original image
coordinates using information in target.
target: Target dictionary with keys 'orig_size', 'size', and 'image/id'.
"""
boxes = prediction['detection_boxes']
scores = prediction['detection_scores']
captions = prediction['captions']
boxes = rescale_and_convert_boxes_to_xywh(
boxes, target['size'], target['orig_size']
)
boxes = np.asarray(boxes).tolist()
img_id = int(target['image/id'])
if img_id in self.pred_image_set:
logging.warning('Duplicate image %s not being added again', img_id)
return
self.pred_image_set.add(img_id)
for bbox, score, caption in zip(
boxes, scores, captions):
single_classification = {
'image_id': img_id,
'category_id': 0,
'bbox': bbox,
'score': score,
'caption': caption,
}
self.predictions.append(single_classification)
self._num_examples_added += 1
# pytype: disable=signature-mismatch
def compute_metrics(
self,
save_dir: str,
clear_annotations: Optional[bool] = True,
skip_evaluate=False,
) -> Dict[str, Any]:
# pytype: enable=signature-mismatch
"""Computes the metrics for all added predictions."""
if self.step:
fname_app = f'{self.dataset_name}_{self.step}.json'
else:
fname_app = f'{self.dataset_name}.json'
self.write_pred_annotations_to_file(save_dir, fname_app=fname_app)
if skip_evaluate:
return {}
results = self.evaluator.compute_metrics(self.predictions)
if clear_annotations:
self.clear()
return results
def clear(self):
self.predictions = []
self._num_examples_added = 0
self.pred_image_set = set()
def __len__(self):
return self._num_examples_added
def write_pred_annotations_to_file(self,
path: str,
fname_app: Optional[str] = None):
"""Writes predictions to file in JSON format.
Args:
path: Path to write the prediction annotation JSON file.
fname_app: Optional string to append to the file name.
"""
if not tf.io.gfile.exists(path):
tf.io.gfile.makedirs(path)
json_file_name = f"predictions{fname_app if fname_app else ''}.json"
json_file_path = os.path.join(path, json_file_name)
def _convert_to_serializable(obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, np.float32):
return float(obj)
else:
raise TypeError(f'Unserializable object {obj} of type {type(obj)}')
with tf.io.gfile.GFile(json_file_path, 'w') as f:
f.write(
json.dumps(
self.predictions,
default=_convert_to_serializable))
logging.info('Predicted annotations are stored in %s.', json_file_path)
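# Example usage of DensecapEvaluator (a minimal sketch; the annotation path
# and values below are hypothetical, and the ground-truth file is loaded by
# densecap_evaluator.DensecapEval):
#
#   evaluator = DensecapEvaluator('vg_densecap', '/path/to/vg_test.json')
#   target = {'image/id': 3, 'size': (640, 640), 'orig_size': (480, 640)}
#   prediction = {
#       'detection_boxes': np.array([[10., 20., 110., 220.]]),  # XYXY, input coords
#       'detection_scores': np.array([0.9], dtype=np.float32),
#       'captions': ['a red car parked on the street'],
#   }
#   evaluator.add_example(prediction, target)
#   metrics = evaluator.compute_metrics('/tmp/densecap_eval')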
class LocaEvaluator(object):
"""Location-conditioned Caption wrapper."""
merge_gt_boxes_iou = 0.7
def __init__(self, dataset_name: str,
step: Optional[int] = None,
merge_gt_boxes: Optional[bool] = False,
meteor_jar_path: Optional[str] = None,
java_jre_path: Optional[str] = None):
self.dataset_name = dataset_name
self.merge_gt_boxes = merge_gt_boxes
self.step = step
self.predictions = []
self._num_examples_added = 0
self._num_captions_added = 0
self.pred_image_set = set()
self.meteor_jar_path = meteor_jar_path
self.java_jre_path = java_jre_path
self.annotations = {
'images': [],
'annotations': [],
'type': 'captions',
'info': {},
'licenses': [],
'categories': [{'id': 1, 'name': 'object'}],
}
@staticmethod
def merge_gt_anno(gts, iou_thresh, is_gt=True):
"""VG ground truth are overlaping. We need to merge them before evaluating.
Original code:
github.com/jcjohnson/densecap/blob/master/densecap/box_utils.lua#L590
github.com/jcjohnson/densecap/blob/master/eval/eval_utils.lua#L105
Args:
gts: gts of a single image. list of dicts, each with the following keys:
'bbox': list of 4 floats in order (l, t, w, h)
'caption': a string.
...
iou_thresh: float
is_gt: bool
Returns:
new_gts: list of dicts. Might have different length from the input.
'bbox': list of 4 floats in order (l, t, w, h)
'captions': list of strings.
"""
new_gts = []
if not gts:
return new_gts
gt_boxes = np.asarray([x['bbox'] for x in gts], dtype=np.float32)
ious, _ = box_utils.box_iou(gt_boxes, gt_boxes, np_backbone=np) # N x N
while True:
can_merge = ious >= iou_thresh
# Find the largest cluster and merge it.
num_merges = can_merge.sum(axis=1) # N
ind = np.argmax(num_merges) # int
if num_merges[ind] == 0:
break
merge_inds = np.nonzero(can_merge[ind])[0]
new_box = gt_boxes[merge_inds].mean(axis=0)
all_captions = [gts[x]['caption'].replace('\n', '') for x in merge_inds]
for merge_ind in merge_inds:
if is_gt:
new_gt = {
'bbox': new_box,
'captions': all_captions,
'id': gts[merge_ind]['id'],
}
else:
new_gt = {
'bbox': new_box,
'caption': gts[merge_ind]['caption'],
'id': gts[merge_ind]['id'],
}
new_gts.append(new_gt)
ious[merge_inds, :] = 0.0
ious[:, merge_inds] = 0.0
return new_gts
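# Example of merge_gt_anno (hypothetical boxes/captions): two heavily
# overlapping ground-truth boxes are merged; each returned entry keeps its
# original id, takes the mean box, and collects both reference captions:
#
#   gts = [
#       {'id': 1, 'bbox': [10., 10., 100., 100.], 'caption': 'a dog'},
#       {'id': 2, 'bbox': [12., 11., 100., 100.], 'caption': 'a brown dog'},
#   ]
#   merged = LocaEvaluator.merge_gt_anno(gts, iou_thresh=0.7)
#   # -> two entries, each with 'bbox' set to the mean box and
#   #    'captions' == ['a dog', 'a brown dog']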
def add_example(self, prediction: Any, target: Dict[str, np.ndarray]):
"""Add prediction of a single image to the evaluator.
Args:
prediction: Model prediction dict with keys 'detection_boxes' (shape
`[num_objects, 4]`) and 'captions' (a list of strings). Box coordinates
are absolute values in the input image coordinates; they are rescaled back
to the original image coordinates using information in target.
target: Target dictionary with keys 'orig_size', 'size', and 'image/id'.
"""
captions = prediction['captions']
boxes = prediction['detection_boxes']
gt_captions = target['captions']
gt_boxes = target['boxes']
boxes = rescale_and_convert_boxes_to_xywh(
boxes, target['size'], target['orig_size']
)
boxes = np.asarray(boxes).tolist()
gt_boxes = rescale_and_convert_boxes_to_xywh(
gt_boxes, target['size'], target['orig_size']
)
gt_boxes = np.asarray(gt_boxes).tolist()
assert len(boxes) == len(captions)
assert len(gt_boxes) == len(boxes)
assert len(gt_captions) == len(captions)
img_id = int(target['image/id'])
if img_id in self.pred_image_set:
logging.warning('Duplicate image %s not being added again', img_id)
return
self.pred_image_set.add(img_id)
self.annotations['images'].append({'id': self._num_captions_added})
cur_preds = []
cur_annos = []
for caption, box, gt_caption, gt_box in zip(
captions, boxes, gt_captions, gt_boxes
):
if max(gt_box) <= 0:
continue
single_classification = {
'image_id': img_id,
'id': self._num_captions_added,
'category_id': 0,
'bbox': box,
'caption': caption,
}
single_annotation = {
'image_id': img_id,
'id': self._num_captions_added,
'category_id': 0,
'bbox': gt_box,
'caption': gt_caption,
}
# self.annotations['annotations'].append(single_annotation)
# self.predictions.append(single_classification)
cur_preds.append(single_classification)
cur_annos.append(single_annotation)
self._num_captions_added += 1
if self.merge_gt_boxes:
cur_preds = self.merge_gt_anno(
cur_preds, self.merge_gt_boxes_iou, is_gt=False
)
cur_annos = self.merge_gt_anno(cur_annos, self.merge_gt_boxes_iou)
self.predictions.extend(cur_preds)
self.annotations['annotations'].extend(cur_annos)
self._num_examples_added += 1
# pytype: disable=signature-mismatch
def compute_metrics(
self,
save_dir: str,
clear_annotations: Optional[bool] = True,
skip_evaluate=False,
) -> Dict[str, Any]:
# pytype: enable=signature-mismatch
"""Computes the metrics for all added predictions."""
if self.step:
fname_app = f'{self.dataset_name}_{self.step}.json'
else:
fname_app = f'{self.dataset_name}.json'
self.write_pred_annotations_to_file(save_dir, fname_app=fname_app)
if skip_evaluate:
return {}
res = {}
gts = {}
for pred in self.predictions:
if 'captions' in pred:
res[pred['id']] = [{'caption': c} for c in pred['captions']]
else:
res[pred['id']] = [pred]
for anno in self.annotations['annotations']:
if 'captions' in anno:
gts[anno['id']] = [{'caption': c} for c in anno['captions']]
else:
gts[anno['id']] = [anno]
res = tokenize(res)
gts = tokenize(gts)
scorers = [
(Rouge(), 'ROUGE_L'),
(Cider(), 'CIDEr'),
(Bleu(), 'BLEU-4'),
(Meteor(), 'Meteor'),
]
results = {}
for scorer, method in scorers:
logging.info('computing %s score...', scorer.method())
score, _ = scorer.compute_score(gts, res)
results[method] = score
if clear_annotations:
self.clear()
return results
def clear(self):
self.predictions = []
self._num_examples_added = 0
self._num_captions_added = 0
self.pred_image_set = set()
def __len__(self):
return self._num_examples_added
def write_pred_annotations_to_file(self,
path: str,
fname_app: Optional[str] = None):
"""Writes predictions to file in JSON format.
Args:
path: Path to write the prediction annotation JSON file.
fname_app: Optional string to append to the file name.
"""
if not tf.io.gfile.exists(path):
tf.io.gfile.makedirs(path)
json_file_name = f"predictions{fname_app if fname_app else ''}.json"
json_file_path = os.path.join(path, json_file_name)
def _convert_to_serializable(obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, np.float32):
return float(obj)
else:
raise TypeError(f'Unserializable object {obj} of type {type(obj)}')
with tf.io.gfile.GFile(json_file_path, 'w') as f:
f.write(
json.dumps(
self.predictions,
default=_convert_to_serializable))
logging.info('Predicted annotations are stored in %s.', json_file_path)
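# Example usage of LocaEvaluator (a minimal sketch; all values below are
# hypothetical, and `compute_metrics` assumes the coco_caption tokenizer and
# scorers imported at the top of this file are available):
#
#   evaluator = LocaEvaluator('vg_loca')
#   target = {
#       'image/id': 5, 'size': (640, 640), 'orig_size': (480, 640),
#       'captions': ['a man riding a bike'],
#       'boxes': np.array([[100., 50., 300., 400.]]),  # XYXY, input coords
#   }
#   prediction = {
#       'captions': ['a person on a bicycle'],
#       'detection_boxes': np.array([[100., 50., 300., 400.]]),
#   }
#   evaluator.add_example(prediction, target)
#   metrics = evaluator.compute_metrics('/tmp/loca_eval')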