Spaces:

evanec
/

coco-demo

Running

App Files Files Community

coco-demo / src /evaluate.py

evanec

Upload 12 files

1809762 verified 6 days ago

raw

history blame contribute delete

6.14 kB

	# evaluate.py
	import os
	import json
	from tqdm import tqdm
	import torch
	from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
	from src.utils import count_encoder_decoder_params, load_experiment
	from src.inference import load_image, generate_caption
	from PIL import Image


	# Optional COCO caption metrics: CIDEr and ROUGE-L come from pycocoevalcap.
	# HAS_COCOEVAL records whether that import succeeded so callers can skip
	# those metrics instead of failing at import time.
	try:
	    from pycocoevalcap.cider.cider import Cider
	    from pycocoevalcap.rouge.rouge import Rouge
	except ImportError:
	    HAS_COCOEVAL = False
	    print("WARNING: pycocoevalcap not installed → CIDEr/ROUGE disabled.")
	else:
	    HAS_COCOEVAL = True


	def evaluate(
	    model,
	    tokenizer,
	    preprocess,
	    image_size=None,
	    data_dir="data/processed",
	    save_dir="checkpoints",
	    device="cuda",
	):
	    """Caption the validation split, score the captions, and save a JSON report.

	    Reads ``captions.json`` and ``splits.json`` from ``data_dir``, generates one
	    caption per id in the ``"val"`` split, prints up to 20 sample predictions,
	    computes BLEU-1/BLEU-4 (plus CIDEr and ROUGE-L when ``pycocoevalcap``
	    imported successfully), and writes ``eval_results.json`` into ``save_dir``.

	    Args:
	        model: Captioning model; forwarded to ``generate_caption`` and
	            ``count_encoder_decoder_params``.
	        tokenizer: Tokenizer forwarded to ``generate_caption``.
	        preprocess: Image transform forwarded to ``load_image``.
	        image_size: Not used by this function; accepted (and optional) so
	            callers that pass it, positionally or by keyword, keep working.
	        data_dir: Directory holding ``captions.json``, ``splits.json`` and an
	            ``images/`` folder of ``<12-digit zero-padded id>.jpg`` files.
	        save_dir: Directory that receives ``eval_results.json``; created if
	            it does not exist yet.
	        device: Device the image tensor is moved to and that is passed on to
	            ``generate_caption``.

	    Raises:
	        KeyError: If ``splits.json`` has no ``"val"`` entry or a validation id
	            is missing from ``captions.json``.
	    """
	    captions_path = os.path.join(data_dir, "captions.json")
	    splits_path = os.path.join(data_dir, "splits.json")

	    # Context managers close the files deterministically instead of leaking
	    # the handles returned by a bare open().
	    with open(captions_path, encoding="utf-8") as f:
	        captions = json.load(f)
	    with open(splits_path, encoding="utf-8") as f:
	        splits = json.load(f)
	    val_ids = splits["val"]

	    preds = []           # one generated caption per validation image
	    refs_tokenized = []  # whitespace-tokenized references, for BLEU
	    refs_strings = []    # raw reference strings, for CIDEr/ROUGE and the log

	    print(f"Running evaluation on {len(val_ids)} images…\n")
	    with torch.no_grad():
	        for img_id in tqdm(val_ids, desc="Evaluating"):
	            img_path = os.path.join(data_dir, "images", f"{int(img_id):012d}.jpg")
	            img_tensor = load_image(img_path, preprocess).to(device)

	            pred_caption = generate_caption(model, tokenizer, img_tensor, device=device)
	            gt_caps = captions[str(img_id)]["captions"]

	            refs_tokenized.append([c.split() for c in gt_caps])
	            refs_strings.append(gt_caps)
	            preds.append(pred_caption)

	    # Print the first few predictions next to their references.
	    num_examples = 20
	    print("\nSample Predictions:\n")
	    for img_id, pred, refs in zip(val_ids[:num_examples], preds, refs_strings):
	        print(f"Image ID: {img_id}")
	        print(f"Prediction: {pred}")
	        print("Ground Truths:")
	        for ref in refs:
	            print(f" - {ref}")
	        print("-" * 60)

	    # BLEU: tokenize the hypotheses once and reuse them for both n-gram orders.
	    smoothie = SmoothingFunction().method3
	    hyps_tokenized = [p.split() for p in preds]

	    bleu1 = corpus_bleu(
	        refs_tokenized,
	        hyps_tokenized,
	        weights=(1, 0, 0, 0),
	        smoothing_function=smoothie,
	    )
	    bleu4 = corpus_bleu(
	        refs_tokenized,
	        hyps_tokenized,
	        weights=(0.25, 0.25, 0.25, 0.25),
	        smoothing_function=smoothie,
	    )

	    scores = {"BLEU-1": bleu1, "BLEU-4": bleu4}

	    # CIDEr / ROUGE-L (only when pycocoevalcap is available).
	    if HAS_COCOEVAL:
	        cider_refs = {}
	        cider_preds = {}
	        # Reuse the references gathered during inference so these metrics
	        # score against exactly the captions BLEU used, rather than looking
	        # them up again under a differently normalized key.
	        for img_id, pred, refs in zip(val_ids, preds, refs_strings):
	            cid = str(int(img_id))
	            cider_refs[cid] = refs
	            cider_preds[cid] = [pred]

	        cider_score, _ = Cider().compute_score(cider_refs, cider_preds)
	        rouge_score, _ = Rouge().compute_score(cider_refs, cider_preds)
	        # Store plain Python floats so the JSON report does not depend on
	        # whatever scalar type the scorers return.
	        scores["CIDEr"] = float(cider_score)
	        scores["ROUGE-L"] = float(rouge_score)

	    # One record per evaluated image, plus a short preview subset.
	    samples_full = [
	        {
	            "id": int(img_id),
	            "prediction": pred,
	            "references": refs,
	            "image": f"{int(img_id):012d}.jpg",
	        }
	        for img_id, pred, refs in zip(val_ids, preds, refs_strings)
	    ]
	    samples_preview = samples_full[:num_examples]

	    param_info = count_encoder_decoder_params(model)

	    # Make sure the target directory exists before writing the report.
	    os.makedirs(save_dir, exist_ok=True)
	    out_path = os.path.join(save_dir, "eval_results.json")

	    with open(out_path, "w", encoding="utf-8") as f:
	        json.dump(
	            {
	                "scores": scores,
	                "derived_params": param_info,
	                "samples_preview": samples_preview,
	                "samples_full": samples_full,
	            },
	            f,
	            indent=2,
	        )

	    print("\nEvaluation Scores:")
	    for name, value in scores.items():
	        print(f"{name}: {value:.4f}")

	    print(f"\nSaved detailed results to: {out_path}")


	if __name__ == "__main__":
	    # Command-line entry point: load a saved experiment and evaluate it on the
	    # validation split, writing eval_results.json into the checkpoint folder.
	    import argparse

	    parser = argparse.ArgumentParser()
	    parser.add_argument("--checkpoint", type=str, default="checkpoints/vision_t5")
	    parser.add_argument("--data_dir", type=str, default="data/processed")
	    args = parser.parse_args()

	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    print(f"Device: {device}")

	    model, tokenizer, meta, config = load_experiment(args.checkpoint, device=device)
	    image_size = config["model"].get("image_size", 224)
	    # NOTE(review): build_coco_transform is not imported anywhere in this file,
	    # so this line raises NameError as written — import it from the module
	    # that defines it.
	    preprocess = build_coco_transform(image_size=image_size)

	    # image_size must be passed explicitly: evaluate() declares it as a
	    # parameter and the call previously omitted it.
	    evaluate(
	        model,
	        tokenizer,
	        preprocess=preprocess,
	        image_size=image_size,
	        data_dir=args.data_dir,
	        save_dir=args.checkpoint,
	        device=device,
	    )

	# Example usage:
	#   python evaluate.py --checkpoint checkpoints/vision_t5/20251117_171912