File size: 6,136 Bytes
1809762
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# eval.py
import os
import json
from tqdm import tqdm
import torch
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from src.utils import count_encoder_decoder_params, load_experiment
from src.inference import load_image, generate_caption
from PIL import Image


# Optional COCO caption metrics: CIDEr and ROUGE come from pycocoevalcap.
# If the package cannot be imported, a warning is printed and HAS_COCOEVAL is
# set to False; evaluate() checks this flag before computing CIDEr / ROUGE-L.
try:
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    HAS_COCOEVAL = True
except ImportError:
    print("WARNING: pycocoevalcap not installed → CIDEr/ROUGE disabled.")
    HAS_COCOEVAL = False


def evaluate(model, tokenizer, preprocess, image_size=None, data_dir="data/processed", save_dir="checkpoints", device="cuda"):
    """Caption every validation image and score the predictions.

    Reads ``captions.json`` and ``splits.json`` from ``data_dir``, generates
    one caption per id listed under ``splits["val"]``, computes BLEU-1 and
    BLEU-4 (plus CIDEr and ROUGE-L when pycocoevalcap is importable), prints
    a preview of the first predictions, and writes everything to
    ``<save_dir>/eval_results.json``.

    Args:
        model: Captioning model handed to ``generate_caption``.
        tokenizer: Tokenizer handed to ``generate_caption``.
        preprocess: Image transform handed to ``load_image``.
        image_size: Not used by this function; kept (now optional) so that
            existing positional and keyword callers keep working.
        data_dir: Directory holding ``captions.json``, ``splits.json`` and an
            ``images/`` folder with files named ``<id as 12 digits>.jpg``.
        save_dir: Directory that receives ``eval_results.json``; created if
            it does not exist.
        device: Device the image tensor is moved to and that is forwarded to
            ``generate_caption``.

    Returns:
        dict mapping metric name to score (the same mapping stored under
        ``"scores"`` in the JSON file).

    Raises:
        KeyError: If ``splits.json`` has no ``"val"`` entry or a validation
            id is missing from ``captions.json``.
    """
    num_preview = 20

    captions_path = os.path.join(data_dir, "captions.json")
    splits_path = os.path.join(data_dir, "splits.json")

    # Context managers close the files promptly; JSON is UTF-8 by spec.
    with open(captions_path, encoding="utf-8") as f:
        captions = json.load(f)
    with open(splits_path, encoding="utf-8") as f:
        splits = json.load(f)
    val_ids = splits["val"]

    preds = []          # one generated caption per validation image
    refs_strings = []   # list of ground-truth caption strings per image

    print(f"Running evaluation on {len(val_ids)} images…\n")
    # Only inference needs gradients disabled; metrics and I/O run outside.
    with torch.no_grad():
        for img_id in tqdm(val_ids, desc="Evaluating"):
            img_path = os.path.join(data_dir, "images", f"{int(img_id):012d}.jpg")
            img_tensor = load_image(img_path, preprocess).to(device)

            preds.append(generate_caption(model, tokenizer, img_tensor, device=device))
            refs_strings.append(captions[str(img_id)]["captions"])

    # Print a preview of the first predictions next to their references.
    print("\nSample Predictions:\n")
    for img_id, pred, refs in zip(val_ids[:num_preview], preds, refs_strings):
        print(f"Image ID: {img_id}")
        print(f"Prediction: {pred}")
        print("Ground Truths:")
        for ref in refs:
            print(f"  - {ref}")
        print("-" * 60)

    # BLEU works on whitespace-tokenized text; tokenize each side once.
    refs_tokenized = [[ref.split() for ref in refs] for refs in refs_strings]
    hyps_tokenized = [pred.split() for pred in preds]
    smoothie = SmoothingFunction().method3

    bleu1 = corpus_bleu(
        refs_tokenized, hyps_tokenized,
        weights=(1, 0, 0, 0),
        smoothing_function=smoothie,
    )
    bleu4 = corpus_bleu(
        refs_tokenized, hyps_tokenized,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothie,
    )
    scores = {"BLEU-1": bleu1, "BLEU-4": bleu4}

    # CIDEr / ROUGE-L (only when pycocoevalcap was importable).
    if HAS_COCOEVAL:
        cider_refs = {}
        cider_preds = {}
        for img_id, pred, refs in zip(val_ids, preds, refs_strings):
            cid = str(int(img_id))
            # Reuse the references collected above instead of a second
            # lookup under a differently formatted key.
            cider_refs[cid] = refs
            cider_preds[cid] = [pred]

        cider_score, _ = Cider().compute_score(cider_refs, cider_preds)
        # float() keeps the value JSON-serializable whatever numeric type
        # the scorer returns.
        scores["CIDEr"] = float(cider_score)

        rouge_score, _ = Rouge().compute_score(cider_refs, cider_preds)
        scores["ROUGE-L"] = float(rouge_score)

    # Per-image records for the JSON log.
    samples_full = [
        {
            "id": int(img_id),
            "prediction": pred,
            "references": refs,
            "image": f"{int(img_id):012d}.jpg",
        }
        for img_id, pred, refs in zip(val_ids, preds, refs_strings)
    ]
    samples_preview = samples_full[:num_preview]

    param_info = count_encoder_decoder_params(model)

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, "eval_results.json")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "scores": scores,
            "derived_params": param_info,
            "samples_preview": samples_preview,
            "samples_full": samples_full
        }, f, indent=2)

    # Print final scores
    print("\nEvaluation Scores:")
    for k, v in scores.items():
        print(f"{k}: {v:.4f}")

    print(f"\nSaved detailed results to: {out_path}")

    return scores


if __name__ == "__main__":
    # Script entry point: load a saved experiment and evaluate it on the
    # validation split, writing eval_results.json into the checkpoint folder.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", type=str, default="checkpoints/vision_t5",
                        help="experiment directory to load; results are saved here too")
    parser.add_argument("--data_dir", type=str, default="data/processed",
                        help="directory with captions.json, splits.json and images/")
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")

    # The third returned value (experiment metadata) is not needed here.
    model, tokenizer, _meta, config = load_experiment(args.checkpoint, device=device)
    image_size = config["model"].get("image_size", 224)

    # NOTE(review): build_coco_transform is not imported anywhere in the
    # visible file, so this line raises NameError as written. Import it from
    # the module that defines it (not identifiable from this file).
    preprocess = build_coco_transform(image_size=image_size)

    evaluate(
        model,
        tokenizer,
        preprocess=preprocess,
        # evaluate() declares image_size; omitting it raised TypeError.
        image_size=image_size,
        data_dir=args.data_dir,
        save_dir=args.checkpoint,
        device=device,
    )
    
    # Example usage (the file header names this script eval.py):
    #   python eval.py --checkpoint checkpoints/vision_t5/20251117_171912