import json
import os
import random
from typing import Any, Dict
from PIL import Image
from torch.utils.data import Dataset
class COCODataset384(Dataset):
    """
    COCO-style image-captioning dataset for BLIP-style processors.

    Each line of the annotation file is a JSON object with keys
    ``image`` (a filename relative to *image_folder*) and ``captions``
    (a non-empty list of caption strings). Images are resized to a
    square (default 384x384) and passed, together with one randomly
    chosen caption, through the processor for joint image-text encoding.
    """

    def __init__(
        self,
        annotation_path: str,
        image_folder: str,
        processor: Any,
        image_size: int = 384,
    ) -> None:
        """
        Args:
            annotation_path: Path to a JSON-lines annotation file.
            image_folder: Directory containing the referenced images.
            processor: BLIP-style processor called as
                ``processor(image, text, ...)`` returning a dict of tensors.
            image_size: Square side length images are resized to
                (default 384, matching the original hard-coded behavior).
        """
        self.image_folder = image_folder
        self.processor = processor
        self.image_size = image_size
        # Explicit UTF-8 so the dataset loads identically regardless of the
        # platform's default encoding; skip blank lines so a trailing
        # newline in the JSONL file does not crash json.loads.
        with open(annotation_path, "r", encoding="utf-8") as f:
            self.annotations = [json.loads(line) for line in f if line.strip()]

    def __len__(self) -> int:
        """Number of annotation records (one record per image)."""
        return len(self.annotations)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return one processed (image, caption) pair as squeezed tensors."""
        ann = self.annotations[idx]
        # Picking a random caption per access is cheap caption-level
        # augmentation; note it makes __getitem__ non-deterministic.
        caption = random.choice(ann["captions"])
        image_path = os.path.join(self.image_folder, ann["image"])
        image = Image.open(image_path).convert("RGB")
        # Square resize for the vision backbone (384px by default).
        image = image.resize((self.image_size, self.image_size))
        encoding = self.processor(
            image,
            caption,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop the batch dimension the processor adds; labels mirror
        # input_ids for the captioning language-modeling loss.
        input_ids = encoding["input_ids"].squeeze(0)
        return {
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "input_ids": input_ids,
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": input_ids.clone(),
        }