Spaces:

pchandragrid
/

image_captioning

Running

App Files Files Community

image_captioning / src /data /coco_vit_gpt2_dataset.py

pchandragrid

Deploy Streamlit app

a745a5e 3 days ago

raw

history blame contribute delete

2.34 kB

	import json
	import os
	import random
	from typing import Any, Dict, List

	from PIL import Image
	from torch.utils.data import Dataset


	class COCODatasetViTGPT2(Dataset):
	"""
	COCO dataset tailored for ViT + GPT-2 style architectures with
	separate image processor and tokenizer.
	"""

	def __init__(
	self,
	annotation_path: str,
	image_folder: str,
	image_processor: Any,
	tokenizer: Any,
	mode: str = "short",
	max_length: int = 20,
	) -> None:
	self.image_folder = image_folder
	self.image_processor = image_processor
	self.tokenizer = tokenizer
	self.max_length = max_length
	self.mode = mode

	with open(annotation_path, "r") as f:
	raw_data = [json.loads(line) for line in f]

	self.annotations: List[Dict[str, Any]] = []

	for ann in raw_data:
	filtered: List[str] = []

	for cap in ann["captions"]:
	words = cap.split()
	wc = len(words)

	if mode == "short" and wc <= 8:
	filtered.append(cap)
	elif mode == "long" and wc > 15:
	filtered.append(cap)
	elif mode == "mixed":
	filtered.append(cap)

	if filtered:
	self.annotations.append(
	{
	"image": ann["image"],
	"captions": filtered,
	}
	)

	def __len__(self) -> int:
	return len(self.annotations)

	def __getitem__(self, idx: int) -> Dict[str, Any]:
	ann = self.annotations[idx]
	caption = random.choice(ann["captions"])

	image_path = os.path.join(self.image_folder, ann["image"])
	image = Image.open(image_path).convert("RGB")

	pixel_values = self.image_processor(
	images=image,
	return_tensors="pt",
	).pixel_values.squeeze(0)

	tokenized = self.tokenizer(
	caption,
	padding="max_length",
	truncation=True,
	max_length=self.max_length,
	return_tensors="pt",
	)

	input_ids = tokenized.input_ids.squeeze(0)

	return {
	"pixel_values": pixel_values,
	"labels": input_ids,
	}