# evaluate_batched.py
import os
import json
from tqdm import tqdm
import torch
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from src.utils import count_encoder_decoder_params, load_experiment
from data.transforms import build_coco_transform

try:
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    HAS_COCOEVAL = True
except ImportError:
    print("WARNING: pycocoevalcap not installed → CIDEr/ROUGE disabled.")
    HAS_COCOEVAL = False

# Batched Evaluation (non-breaking addition)

@torch.no_grad()
def evaluate_batched(
    model,
    tokenizer,
    preprocess,
    image_size,
    data_dir="data/processed",
    save_dir="checkpoints",
    device="cuda",
    batch_size=16,
    num_beams=1,
):
    """
    Batched version of evaluate().
    """

    from src.inference import load_images_batch, generate_captions_batch
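
    # The two helpers above live in src.inference and are imported lazily so
    # the original single-image evaluate() path keeps working without them.
    # A minimal sketch of the contract they are assumed to satisfy
    # (hypothetical, not the project's actual implementation):
    #
    #   def load_images_batch(paths, preprocess, image_size):
    #       imgs = [preprocess(Image.open(p).convert("RGB")) for p in paths]
    #       return torch.stack(imgs)  # (B, 3, image_size, image_size)
    #
    #   def generate_captions_batch(model, tokenizer, images, **gen_kwargs):
    #       ids = model.generate(images, **gen_kwargs)
    #       return tokenizer.batch_decode(ids, skip_special_tokens=True)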

    captions_path = os.path.join(data_dir, "captions.json")
    splits_path = os.path.join(data_dir, "splits.json")

    with open(captions_path) as f:
        captions = json.load(f)
    with open(splits_path) as f:
        splits = json.load(f)
    val_ids = splits["val"]

    preds = []
    refs_tokenized = []
    refs_strings = []

    print(f"Running *batched* evaluation on {len(val_ids)} images… (batch={batch_size})\n")

    # Loop in batches
    for start in tqdm(range(0, len(val_ids), batch_size), desc="Evaluating (batched)"):
        end = min(start + batch_size, len(val_ids))
        batch_ids = val_ids[start:end]

        # Image paths
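        # (COCO images are stored under 12-digit zero-padded ids,
        # e.g. id 391895 -> 000000391895.jpg)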
        img_paths = [
            os.path.join(data_dir, "images", f"{int(i):012d}.jpg")
            for i in batch_ids
        ]

        # Load batch into tensor
        img_batch = load_images_batch(img_paths, preprocess, image_size).to(device)

        # Generate predictions for batch
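        # (generate_captions_batch is assumed to decode one caption string per
        # image; num_beams=1 falls back to greedy decoding, and generation is
        # capped at 32 new tokens)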
        batch_preds = generate_captions_batch(
            model,
            tokenizer,
            img_batch,
            device=device,
            num_beams=num_beams,
            max_new_tokens=32
        )

        # Collect references
        for i, img_id in enumerate(batch_ids):
            gt_caps = captions[str(img_id)]["captions"]

            refs_strings.append(gt_caps)
            refs_tokenized.append([c.split() for c in gt_caps])
            preds.append(batch_preds[i])

    # Print sample predictions (20 samples, same as evaluate())
    print("\nSample Predictions:\n")
    num_examples = 20
    for i in range(min(num_examples, len(preds))):
        img_id = val_ids[i]
        print(f"Image ID: {img_id}")
        print(f"Prediction: {preds[i]}")
        print("Ground Truths:")
        for ref in refs_strings[i]:
            print(f"  - {ref}")
        print("-" * 60)

    # Compute BLEU
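    # corpus_bleu expects references as list[list[list[str]]] (one list of
    # tokenized reference captions per hypothesis) and hypotheses as
    # list[list[str]]; method3 is NIST geometric-sequence smoothing, which
    # keeps scores nonzero when a higher-order n-gram has no match.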
    smoothie = SmoothingFunction().method3

    bleu1 = corpus_bleu(
        refs_tokenized, [p.split() for p in preds],
        weights=(1, 0, 0, 0),
        smoothing_function=smoothie
    )

    bleu4 = corpus_bleu(
        refs_tokenized, [p.split() for p in preds],
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothie
    )

    scores = {"BLEU-1": bleu1, "BLEU-4": bleu4}

    # CIDEr / ROUGE
    if HAS_COCOEVAL:
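        # pycocoevalcap scorers take two dicts keyed by image id, each mapping
        # to a list of caption strings; compute_score returns the corpus-level
        # score plus per-image scores (discarded here).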
        cider_refs = {}
        cider_preds = {}

        for i in range(len(preds)):
            # Key by image id; reuse refs_strings so CIDEr/ROUGE score against
            # exactly the same references as BLEU (and avoid a key mismatch if
            # captions.json ids are zero-padded strings).
            cid = str(int(val_ids[i]))
            cider_refs[cid] = refs_strings[i]
            cider_preds[cid] = [preds[i]]

        cider = Cider()
        cider_score, _ = cider.compute_score(cider_refs, cider_preds)
        scores["CIDEr"] = cider_score

        rouge = Rouge()
        rouge_score, _ = rouge.compute_score(cider_refs, cider_preds)
        scores["ROUGE-L"] = rouge_score

    # Save results
    samples_full = []
    for i in range(len(preds)):
        img_id = val_ids[i]
        samples_full.append({
            "id": int(img_id),
            "prediction": preds[i],
            "references": refs_strings[i],
            "image": f"{int(img_id):012d}.jpg",
        })

    samples_preview = samples_full[:20]
    param_info = count_encoder_decoder_params(model)

    out_path = os.path.join(save_dir, "eval_results.json")
    with open(out_path, "w") as f:
        json.dump({
            "scores": scores,
            "derived_params": param_info,
            "samples_preview": samples_preview,
            "samples_full": samples_full
        }, f, indent=2)
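
    # The saved JSON mirrors the non-batched evaluate() output:
    # {"scores": {...}, "derived_params": {...},
    #  "samples_preview": [first 20 samples], "samples_full": [all samples]}
    # Note: this overwrites any eval_results.json already in save_dir.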

    # Print final scores
    print("\nEvaluation Scores:")
    for k, v in scores.items():
        print(f"{k}: {v:.4f}")

    print(f"\nSaved batched results to: {out_path}")

    return scores


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument("--checkpoint", type=str, required=True,
                        help="Path to checkpoint directory")
    parser.add_argument("--data_dir", type=str, default="data/processed")
    parser.add_argument("--batch_size", type=int, default=16,
                        help="Batch size for batched evaluation")
    parser.add_argument("--num_beams", type=int, default=1,
                        help="For beam search")

    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")

    # Load model + tokenizer
    model, tokenizer, meta, config = load_experiment(args.checkpoint, device=device)
    image_size = config["model"].get("image_size", 224)
    preprocess = build_coco_transform(image_size=image_size)

    # Run batched evaluation
    evaluate_batched(
        model=model,
        tokenizer=tokenizer,
        preprocess=preprocess,
        image_size=image_size,
        data_dir=args.data_dir,
        save_dir=args.checkpoint,
        device=device,
        batch_size=args.batch_size,
        num_beams=args.num_beams
    )

    # Usage:
    # python evaluate_batched.py --checkpoint checkpoints/vision_t5/20251117_171912 --batch_size 16
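    # With beam search:
    # python evaluate_batched.py --checkpoint checkpoints/vision_t5/20251117_171912 --num_beams 3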