# evaluate_batched.py
import os
import json
from tqdm import tqdm
import torch
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from src.utils import count_encoder_decoder_params, load_experiment
from data.transforms import build_coco_transform

try:
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    HAS_COCOEVAL = True
except ImportError:
    print("WARNING: pycocoevalcap not installed → CIDEr/ROUGE disabled.")
    HAS_COCOEVAL = False

# Batched Evaluation (non-breaking addition)

@torch.no_grad()
def evaluate_batched(
    model,
    tokenizer,
    preprocess,
    image_size,
    data_dir="data/processed",
    save_dir="checkpoints",
    device="cuda",
    batch_size=16,
    num_beams=1,
):
    """
    Batched version of evaluate().
    """

    from src.inference import load_images_batch, generate_captions_batch
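
    # The two helpers above live in src.inference and are imported lazily so
    # the original single-image evaluate() path keeps working without them.
    # A minimal sketch of the contract they are assumed to satisfy
    # (hypothetical, not the project's actual implementation):
    #
    #   def load_images_batch(paths, preprocess, image_size):
    #       imgs = [preprocess(Image.open(p).convert("RGB")) for p in paths]
    #       return torch.stack(imgs)  # (B, 3, image_size, image_size)
    #
    #   def generate_captions_batch(model, tokenizer, images, **gen_kwargs):
    #       ids = model.generate(images, **gen_kwargs)
    #       return tokenizer.batch_decode(ids, skip_special_tokens=True)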

    captions_path = os.path.join(data_dir, "captions.json")
    splits_path = os.path.join(data_dir, "splits.json")

    with open(captions_path) as f:
        captions = json.load(f)
    with open(splits_path) as f:
        splits = json.load(f)
    val_ids = splits["val"]

    preds = []
    refs_tokenized = []
    refs_strings = []

    print(f"Running *batched* evaluation on {len(val_ids)} images… (batch={batch_size})\n")

    # Loop in batches
    for start in tqdm(range(0, len(val_ids), batch_size), desc="Evaluating (batched)"):
        end = min(start + batch_size, len(val_ids))
        batch_ids = val_ids[start:end]

        # Image paths
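        # (COCO images are stored under 12-digit zero-padded ids,
        # e.g. id 391895 -> 000000391895.jpg)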
        img_paths = [
            os.path.join(data_dir, "images", f"{int(i):012d}.jpg")
            for i in batch_ids
        ]

        # Load batch into tensor
        img_batch = load_images_batch(img_paths, preprocess, image_size).to(device)

        # Generate predictions for batch
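        # (generate_captions_batch is assumed to decode one caption string per
        # image; num_beams=1 falls back to greedy decoding, and generation is
        # capped at 32 new tokens)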
        batch_preds = generate_captions_batch(
            model,
            tokenizer,
            img_batch,
            device=device,
            num_beams=num_beams,
            max_new_tokens=32
        )

        # Collect references
        for i, img_id in enumerate(batch_ids):
            gt_caps = captions[str(img_id)]["captions"]

            refs_strings.append(gt_caps)
            refs_tokenized.append([c.split() for c in gt_caps])
            preds.append(batch_preds[i])

    # Print sample predictions (20 samples, same as evaluate())
    print("\nSample Predictions:\n")
    num_examples = 20
    for i in range(min(num_examples, len(preds))):
        img_id = val_ids[i]
        print(f"Image ID: {img_id}")
        print(f"Prediction: {preds[i]}")
        print("Ground Truths:")
        for ref in refs_strings[i]:
            print(f"  - {ref}")
        print("-" * 60)

    # Compute BLEU
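    # corpus_bleu expects references as list[list[list[str]]] (one list of
    # tokenized reference captions per hypothesis) and hypotheses as
    # list[list[str]]; method3 is NIST geometric-sequence smoothing, which
    # keeps scores nonzero when a higher-order n-gram has no match.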
    smoothie = SmoothingFunction().method3

    bleu1 = corpus_bleu(
        refs_tokenized, [p.split() for p in preds],
        weights=(1, 0, 0, 0),
        smoothing_function=smoothie
    )

    bleu4 = corpus_bleu(
        refs_tokenized, [p.split() for p in preds],
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothie
    )

    scores = {"BLEU-1": bleu1, "BLEU-4": bleu4}

    # CIDEr / ROUGE
    if HAS_COCOEVAL:
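        # pycocoevalcap scorers take two dicts keyed by image id, each mapping
        # to a list of caption strings; compute_score returns the corpus-level
        # score plus per-image scores (discarded here).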
        cider_refs = {}
        cider_preds = {}

        for i in range(len(preds)):
            # Key by image id; reuse refs_strings so CIDEr/ROUGE score against
            # exactly the same references as BLEU (and avoid a key mismatch if
            # captions.json ids are zero-padded strings).
            cid = str(int(val_ids[i]))
            cider_refs[cid] = refs_strings[i]
            cider_preds[cid] = [preds[i]]

        cider = Cider()
        cider_score, _ = cider.compute_score(cider_refs, cider_preds)
        scores["CIDEr"] = cider_score

        rouge = Rouge()
        rouge_score, _ = rouge.compute_score(cider_refs, cider_preds)
        scores["ROUGE-L"] = rouge_score

    # Save results
    samples_full = []
    for i in range(len(preds)):
        img_id = val_ids[i]
        samples_full.append({
            "id": int(img_id),
            "prediction": preds[i],
            "references": refs_strings[i],
            "image": f"{int(img_id):012d}.jpg",
        })

    samples_preview = samples_full[:20]
    param_info = count_encoder_decoder_params(model)

    out_path = os.path.join(save_dir, "eval_results.json")
    with open(out_path, "w") as f:
        json.dump({
            "scores": scores,
            "derived_params": param_info,
            "samples_preview": samples_preview,
            "samples_full": samples_full
        }, f, indent=2)
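
    # The saved JSON mirrors the non-batched evaluate() output:
    # {"scores": {...}, "derived_params": {...},
    #  "samples_preview": [first 20 samples], "samples_full": [all samples]}
    # Note: this overwrites any eval_results.json already in save_dir.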

    # Print final scores
    print("\nEvaluation Scores:")
    for k, v in scores.items():
        print(f"{k}: {v:.4f}")

    print(f"\nSaved batched results to: {out_path}")

    return scores


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument("--checkpoint", type=str, required=True,
                        help="Path to checkpoint directory")
    parser.add_argument("--data_dir", type=str, default="data/processed")
    parser.add_argument("--batch_size", type=int, default=16,
                        help="Batch size for batched evaluation")
    parser.add_argument("--num_beams", type=int, default=1,
                        help="For beam search")

    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")

    # Load model + tokenizer
    model, tokenizer, meta, config = load_experiment(args.checkpoint, device=device)
    image_size = config["model"].get("image_size", 224)
    preprocess = build_coco_transform(image_size=image_size)

    # Run batched evaluation
    evaluate_batched(
        model=model,
        tokenizer=tokenizer,
        preprocess=preprocess,
        image_size=image_size,
        data_dir=args.data_dir,
        save_dir=args.checkpoint,
        device=device,
        batch_size=args.batch_size,
        num_beams=args.num_beams
    )

    # Usage:
    # python evaluate_batched.py --checkpoint checkpoints/vision_t5/20251117_171912 --batch_size 16
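    # With beam search:
    # python evaluate_batched.py --checkpoint checkpoints/vision_t5/20251117_171912 --num_beams 3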