"""This module calculates CLIPScore, a reference-free evaluation metric for image captioning.""" import evaluate import datasets from evaluate.utils.logging import get_logger from transformers import CLIPProcessor, CLIPModel, AutoTokenizer logger = get_logger(__name__) _CITATION = """\ @article{DBLP:journals/corr/abs-2104-08718, author = {Jack Hessel and Ari Holtzman and Maxwell Forbes and Ronan Le Bras and Yejin Choi}, title = {CLIPScore: {A} Reference-free Evaluation Metric for Image Captioning}, journal = {CoRR}, volume = {abs/2104.08718}, year = {2021}, url = {https://arxiv.org/abs/2104.08718}, eprinttype = {arXiv}, eprint = {2104.08718}, timestamp = {Sat, 29 Apr 2023 10:09:27 +0200}, biburl = {https://dblp.org/rec/journals/corr/abs-2104-08718.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} } """ _DESCRIPTION = """\ This new module is designed to calculate CLIPScore, a reference-free evaluation metric for image captioning. """ _KWARGS_DESCRIPTION = """ Computes CLIPScore to evaluate the alignment between an image and a text. Args: predictions: list of text predictions to score. Each prediction should be a string. images: list of images to score against. Each image should be a PIL image. Returns: clip_score: CLIPScore between the image and the text. Examples: >>> metric = evaluate.load("sunhill/clip_score") >>> results = metric.compute(predictions=["A cat sitting on a couch."], images=[PIL_image]) >>> print(results) {'clip_score': 0.2076} """ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class CLIPScore(evaluate.Metric): """CLIPScore metric.""" def _info(self): return evaluate.MetricInfo( # This is the description that will appear on the modules page. module_type="metric", description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, # This defines the format of each prediction and reference features=datasets.Features( { "predictions": datasets.Value("string"), "references": datasets.Image(), } ), # Homepage of the module for documentation homepage="https://huggingface.co/spaces/sunhill/clip_score", # Additional links to the codebase or references codebase_urls=["https://github.com/Taited/clip-score"], reference_urls=["https://arxiv.org/abs/2104.08718"], ) def _download_and_prepare(self, dl_manager): """Optional: download external resources useful to compute the scores""" logger.info("Downloading and preparing CLIP ViT-B/32 model...") self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") self.tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") def _compute(self, predictions, references): """Returns the scores""" refer = self.processor( text=None, images=references, return_tensors="pt", padding=True ) pred = self.tokenizer(predictions, return_tensors="pt", padding=True) refer_features = self.model.get_image_features(**refer) pred_features = self.model.get_text_features(**pred) refer_features = refer_features / refer_features.norm(dim=1, keepdim=True) pred_features = pred_features / pred_features.norm(dim=1, keepdim=True) clip_score = (refer_features * pred_features).sum().item() return {"clip_score": clip_score / refer_features.shape[0]}