Spaces:
Build error
Build error
| import argparse | |
| import json | |
| import os | |
| from typing import Any, Dict, List | |
| from loguru import logger | |
| import torch | |
| from torch.utils.data import DataLoader | |
| from virtex.config import Config | |
| from virtex.data import ImageDirectoryDataset | |
| from virtex.factories import TokenizerFactory, PretrainingModelFactory | |
| from virtex.utils.checkpointing import CheckpointManager | |
| from virtex.utils.common import common_parser | |
| from virtex.utils.metrics import CocoCaptionsEvaluator | |
| # fmt: off | |
| parser = common_parser( | |
| description="""Run image captioning inference on a pretrained model, and/or | |
| evaluate pretrained model on COCO Captions val2017 split.""" | |
| ) | |
| parser.add_argument( | |
| "--data-root", default=None, | |
| help="""Path to a directory containing image files to generate captions for. | |
| Default: COCO val2017 image directory as expected relative to project root.""" | |
| ) | |
| parser.add_argument( | |
| "--checkpoint-path", required=True, | |
| help="Path to load checkpoint and run captioning evaluation." | |
| ) | |
| parser.add_argument( | |
| "--output", default=None, | |
| help="Path to save predictions as a JSON file." | |
| ) | |
| parser.add_argument( | |
| "--calc-metrics", action="store_true", | |
| help="""Calculate CIDEr and SPICE metrics using ground truth COCO Captions. | |
| This flag should not be set when running inference on arbitrary images.""" | |
| ) | |
| # fmt: on | |
| def main(_A: argparse.Namespace): | |
| if _A.num_gpus_per_machine == 0: | |
| # Set device as CPU if num_gpus_per_machine = 0. | |
| device = torch.device("cpu") | |
| else: | |
| # Get the current device (this will be zero here by default). | |
| device = torch.cuda.current_device() | |
| _C = Config(_A.config, _A.config_override) | |
| tokenizer = TokenizerFactory.from_config(_C) | |
| if _A.data_root is None: | |
| _A.data_root = os.path.join(_C.DATA.ROOT, "val2017") | |
| val_dataloader = DataLoader( | |
| ImageDirectoryDataset(_A.data_root), | |
| batch_size=_C.OPTIM.BATCH_SIZE, | |
| num_workers=_A.cpu_workers, | |
| pin_memory=True, | |
| ) | |
| # Initialize model from a checkpoint. | |
| model = PretrainingModelFactory.from_config(_C).to(device) | |
| ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path) | |
| model.eval() | |
| # Make a list of predictions to evaluate. | |
| predictions: List[Dict[str, Any]] = [] | |
| for val_iteration, val_batch in enumerate(val_dataloader, start=1): | |
| val_batch["image"] = val_batch["image"].to(device) | |
| with torch.no_grad(): | |
| output_dict = model(val_batch) | |
| # Make a dictionary of predictions in COCO format. | |
| for image_id, caption in zip( | |
| val_batch["image_id"], output_dict["predictions"] | |
| ): | |
| predictions.append( | |
| { | |
| # Convert image id to int if possible (mainly for COCO eval). | |
| "image_id": int(image_id) if image_id.isdigit() else image_id, | |
| "caption": tokenizer.decode(caption.tolist()), | |
| } | |
| ) | |
| # Save predictions as a JSON file if specified. | |
| if _A.output is not None: | |
| os.makedirs(os.path.dirname(_A.output), exist_ok=True) | |
| json.dump(predictions, open(_A.output, "w")) | |
| logger.info(f"Saved predictions to {_A.output}") | |
| # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. This | |
| # should be skipped when running inference on arbitrary images. | |
| if _A.calc_metrics: | |
| # Assume ground truth (COCO val2017 annotations) exist. | |
| gt = os.path.join(_C.DATA.ROOT, "annotations", "captions_val2017.json") | |
| metrics = CocoCaptionsEvaluator(gt).evaluate(predictions) | |
| logger.info(f"Iter: {ITERATION} | Metrics: {metrics}") | |
| if __name__ == "__main__": | |
| _A = parser.parse_args() | |
| if _A.num_gpus_per_machine > 1: | |
| raise ValueError("Using multiple GPUs is not supported for this script.") | |
| # No distributed training here, just a single process. | |
| main(_A) | |