Spaces:

pchandragrid
/

image_captioning

Running

App Files Files Community

image_captioning / dataset_advanced.py

pchandragrid

Deploy Streamlit app

a745a5e 3 days ago

raw

history blame contribute delete

2.68 kB

	import json
	import os
	import random
	import re
	from torch.utils.data import Dataset
	from PIL import Image


	class COCODatasetAdvanced(Dataset):
	def __init__(self,
	annotation_path,
	image_folder,
	processor,
	mode="mixed",
	max_length=40):

	self.image_folder = image_folder
	self.processor = processor
	self.max_length = max_length
	self.mode = mode

	with open(annotation_path, "r") as f:
	raw_data = [json.loads(line) for line in f]

	self.annotations = []

	for ann in raw_data:

	filtered_captions = []

	for cap in ann["captions"]:

	cap = cap.strip().lower()

	# ---------- QUALITY FILTERS ----------

	# Remove very short captions
	if len(cap.split()) < 3:
	continue

	# Remove repeated words
	words = cap.split()
	if len(set(words)) < len(words) * 0.6:
	continue

	# Remove non-alphabetic captions
	if not re.search(r"[a-z]", cap):
	continue

	word_count = len(words)

	# ---------- LENGTH FILTERS ----------

	if self.mode == "short" and word_count <= 8:
	filtered_captions.append(cap)

	elif self.mode == "long" and word_count > 15:
	filtered_captions.append(cap)

	elif self.mode == "mixed":
	filtered_captions.append(cap)

	if len(filtered_captions) > 0:
	self.annotations.append({
	"image": ann["image"],
	"captions": filtered_captions
	})

	def __len__(self):
	return len(self.annotations)

	def __getitem__(self, idx):

	ann = self.annotations[idx]
	file_name = ann["image"]
	caption = random.choice(ann["captions"])

	image_path = os.path.join(self.image_folder, file_name)
	image = Image.open(image_path).convert("RGB")

	encoding = self.processor(
	images=image,
	text=caption,
	padding="max_length",
	truncation=True,
	max_length=self.max_length,
	return_tensors="pt"
	)

	input_ids = encoding["input_ids"].squeeze(0)

	return {
	"pixel_values": encoding["pixel_values"].squeeze(0),
	"input_ids": input_ids,
	"attention_mask": encoding["attention_mask"].squeeze(0),
	"labels": input_ids.clone()
	}