Spaces:

abka03
/

stylsteer-vlm

Running

File size: 3,854 Bytes

e6f24ae

"""Track D — COCO val2017 dataset loader.

Used for custom rhetorical styles (poetic, scientific, narrative, factual,
natural, fictional). No ground-truth styled captions — LLM judge only.
"""

import json
import logging
import random
from pathlib import Path
from typing import Any, Dict, List, Optional

from src.data.base import StyleDataset

logger = logging.getLogger(__name__)


class COCODataset(StyleDataset):
    """COCO val2017 for custom rhetorical tracks (Track D).

    Expected directory structure:
        COCO_IMAGE_DIR/   (from .env, e.g. data/coco_val2017/images/)
            ├── 000000000139.jpg
            └── ...
        COCO_ANNOT_FILE/  (from .env)
            └── captions_val2017.json

    No ground-truth styled captions. Factual COCO captions used as neutral reference only.
    """

    RHETORICAL_STYLES = ["poetic", "scientific", "narrative", "factual", "natural", "fictional"]

    def __init__(
        self,
        data_dir: str = "",
        image_dir: str = "",
        annot_file: str = "",
        **kwargs,
    ):
        self.image_dir = Path(image_dir) if image_dir else None
        self.annot_file = Path(annot_file) if annot_file else None
        super().__init__(data_dir=data_dir, **kwargs)

    @property
    def track(self) -> str:
        return "D"

    @property
    def styles(self) -> List[str]:
        return self.RHETORICAL_STYLES

    @property
    def has_ground_truth(self) -> bool:
        return False

    def _load_data(self) -> List[Dict[str, Any]]:
        # Resolve image directory
        img_dir = self.image_dir or self.data_dir / "images"
        annot = self.annot_file or self.data_dir / "annotations" / "captions_val2017.json"

        if not Path(img_dir).exists():
            logger.warning(f"COCO images not found at {img_dir}. Using mock data.")
            return self._mock_data()

        # Load annotation file to get image list
        image_ids = []
        if Path(annot).exists():
            with open(annot) as f:
                coco_data = json.load(f)
            image_ids = [img["id"] for img in coco_data.get("images", [])]
        else:
            # Fall back to listing image files
            image_ids = [
                int(p.stem) for p in Path(img_dir).glob("*.jpg")
            ]

        # Deterministic subset
        rng = random.Random(self.seed)
        n = self.n_images or 500
        if len(image_ids) > n:
            image_ids = rng.sample(image_ids, n)
        image_ids.sort()

        # Create entries — one per image per style
        data = []
        for img_id in image_ids:
            filename = f"{img_id:012d}.jpg"
            image_path = str(Path(img_dir) / filename)
            for style in self.styles:
                data.append({
                    "image_id": str(img_id),
                    "image_path": image_path,
                    "style": style,
                    "caption_gt": None,  # No ground truth for Track D
                })

        logger.info(f"COCO Track D: {len(image_ids)} images × {len(self.styles)} styles = {len(data)} items")
        return data

    def get_images(self, style: str) -> List[Dict[str, Any]]:
        """Override to return unique images (not per-style duplicates)."""
        items = [d for d in self.data if d["style"] == style]
        # Already de-duplicated at image level since each image has one entry per style
        return items

    def _mock_data(self) -> List[Dict[str, Any]]:
        data = []
        for i in range(max(self.n_images or 5, 5)):
            for style in self.styles:
                data.append({
                    "image_id": f"mock_{i}",
                    "image_path": f"mock_image_{i}.jpg",
                    "style": style,
                    "caption_gt": None,
                })
        return data