Spaces:
Running
Running
| import json | |
| import os | |
| import random | |
| from typing import Any, Dict | |
| from PIL import Image | |
| from torch.utils.data import Dataset | |
class COCODataset384(Dataset):
    """COCO-style captioning dataset with images resized to 384x384.

    The annotation file is read as JSON Lines: one JSON object per
    non-blank line. Each object is expected to provide:

    * ``"image"``    -- path of the image, relative to ``image_folder``.
    * ``"captions"`` -- non-empty sequence of caption strings; one is
      picked at random on every ``__getitem__`` call.

    ``processor`` is a BLIP-style callable that accepts
    ``(image, text, padding=..., truncation=..., return_tensors="pt")`` and
    returns a mapping containing ``"pixel_values"``, ``"input_ids"`` and
    ``"attention_mask"``.
    """

    # Target (width, height) handed to ``Image.resize`` for every sample.
    IMAGE_SIZE = (384, 384)

    def __init__(self, annotation_path: str, image_folder: str, processor: Any) -> None:
        """Load all annotations from ``annotation_path`` into memory.

        Args:
            annotation_path: Path to the JSON Lines annotation file.
            image_folder: Directory that the ``"image"`` entries are
                relative to.
            processor: Joint image/text processor applied to each sample.

        Raises:
            OSError: If the annotation file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        super().__init__()
        self.image_folder = image_folder
        self.processor = processor
        # Explicit UTF-8 keeps non-ASCII captions stable across platforms;
        # blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        with open(annotation_path, "r", encoding="utf-8") as f:
            self.annotations = [json.loads(line) for line in f if line.strip()]

    def __len__(self) -> int:
        """Return the number of annotation records."""
        return len(self.annotations)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Build one encoded sample.

        Args:
            idx: Index into the loaded annotation list.

        Returns:
            A dict with ``"pixel_values"``, ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``. The first three come from
            the processor output with dimension 0 squeezed away;
            ``"labels"`` is an independent copy of ``"input_ids"``.
        """
        ann = self.annotations[idx]
        # A different caption may be drawn each time the same index is read.
        caption = random.choice(ann["captions"])
        image_path = os.path.join(self.image_folder, ann["image"])

        # Image.open is lazy; the context manager guarantees the underlying
        # file handle is released. convert() returns a new, fully loaded
        # image, so it stays usable after the file is closed.
        with Image.open(image_path) as raw:
            image = raw.convert("RGB")

        # Fixed-size resize for the vision backbone.
        image = image.resize(self.IMAGE_SIZE)

        encoding = self.processor(
            image,
            caption,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading dimension for the single
        # sample; squeeze(0) drops it so a DataLoader can re-batch.
        input_ids = encoding["input_ids"].squeeze(0)
        return {
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "input_ids": input_ids,
            "attention_mask": encoding["attention_mask"].squeeze(0),
            # NOTE(review): labels include the max_length padding positions;
            # confirm whether the training loss masks them downstream.
            "labels": input_ids.clone(),
        }