"""
Dataset loader for training RF-DETR models.
Supports COCO format annotations.
"""

import json
import os
from pathlib import Path
from typing import Dict, List, Tuple


def load_coco_annotations(annotation_path: str) -> Dict:
    """Load COCO format annotation file."""
    if not os.path.exists(annotation_path):
        raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
    
    # COCO JSON files are UTF-8; be explicit so the platform default can't interfere.
    with open(annotation_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def get_image_paths(images_dir: str) -> List[str]:
    """Get all image file paths from directory."""
    image_extensions = ['.jpg', '.jpeg', '.png', '.bmp']
    # Collect into a set: on case-insensitive filesystems (Windows, macOS)
    # the lower- and upper-case globs match the same file twice.
    image_paths = set()
    
    for ext in image_extensions:
        image_paths.update(Path(images_dir).glob(f'*{ext}'))
        image_paths.update(Path(images_dir).glob(f'*{ext.upper()}'))
    
    return sorted(str(p) for p in image_paths)
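
# The double glob above (lower- and upper-case extension) can be exercised in
# isolation. A minimal sketch, using a throwaway temp directory and hypothetical
# file names, showing that mixed-case extensions are matched while other files
# are skipped:
#
# ```python
# import tempfile
# from pathlib import Path
#
# d = Path(tempfile.mkdtemp())
# for name in ("a.jpg", "b.PNG", "c.txt"):
#     (d / name).touch()
#
# # Mirror the glob logic of get_image_paths: try each extension in
# # lower and upper case, deduplicating via a set.
# exts = ['.jpg', '.jpeg', '.png', '.bmp']
# paths = set()
# for ext in exts:
#     paths.update(d.glob(f'*{ext}'))
#     paths.update(d.glob(f'*{ext.upper()}'))
#
# names = sorted(p.name for p in paths)
# print(names)  # -> ['a.jpg', 'b.PNG']; 'c.txt' is not an image extension
# ```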


def validate_dataset(dataset_path: str) -> Tuple[bool, List[str]]:
    """Validate dataset structure and return issues."""
    issues = []
    dataset_path = Path(dataset_path)
    
    if not dataset_path.exists():
        issues.append(f"Dataset directory does not exist: {dataset_path}")
        return False, issues
    
    images_dir = dataset_path / "images"
    annotations_file = dataset_path / "annotations" / "annotations.json"
    
    if not images_dir.exists():
        issues.append(f"Images directory missing: {images_dir}")
    
    if not annotations_file.exists():
        issues.append(f"Annotations file missing: {annotations_file}")
    else:
        try:
            coco_data = load_coco_annotations(str(annotations_file))
            if 'images' not in coco_data:
                issues.append("COCO format: missing 'images' key")
            if 'annotations' not in coco_data:
                issues.append("COCO format: missing 'annotations' key")
            if 'categories' not in coco_data:
                issues.append("COCO format: missing 'categories' key")
        except json.JSONDecodeError as e:
            issues.append(f"Invalid JSON in annotations: {e}")
    
    return len(issues) == 0, issues
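
# A dataset that passes the checks above needs the layout
# <root>/images/ plus <root>/annotations/annotations.json, with the JSON
# carrying the 'images', 'annotations', and 'categories' keys. A minimal
# sketch building such a dataset in a temp directory (file names and the
# 'widget' category are illustrative, not part of this module):
#
# ```python
# import json
# import tempfile
# from pathlib import Path
#
# root = Path(tempfile.mkdtemp())
# (root / "images").mkdir()
# (root / "annotations").mkdir()
#
# # A structurally valid COCO payload: one image, one box, one category.
# coco = {
#     "images": [{"id": 1, "file_name": "img_000.jpg",
#                 "width": 64, "height": 64}],
#     "annotations": [{"id": 1, "image_id": 1, "category_id": 1,
#                      "bbox": [10, 10, 20, 20], "area": 400, "iscrowd": 0}],
#     "categories": [{"id": 1, "name": "widget"}],
# }
# (root / "annotations" / "annotations.json").write_text(json.dumps(coco))
# (root / "images" / "img_000.jpg").touch()  # placeholder image file
#
# print(sorted(coco.keys()))  # -> ['annotations', 'categories', 'images']
# ```
#
# With this layout in place, validate_dataset(str(root)) should report no issues.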


def get_dataset_info(dataset_path: str) -> Dict:
    """Get summary information about the dataset."""
    dataset_path = Path(dataset_path)
    info = {
        'path': str(dataset_path),
        'images_count': 0,
        'annotations_count': 0,
        'categories_count': 0,
        'categories': []
    }
    
    images_dir = dataset_path / "images"
    annotations_file = dataset_path / "annotations" / "annotations.json"
    
    if images_dir.exists():
        info['images_count'] = len(get_image_paths(str(images_dir)))
    
    if annotations_file.exists():
        try:
            coco_data = load_coco_annotations(str(annotations_file))
            info['annotations_count'] = len(coco_data.get('annotations', []))
            info['categories_count'] = len(coco_data.get('categories', []))
            info['categories'] = [cat['name'] for cat in coco_data.get('categories', [])]
        except Exception as e:
            info['error'] = str(e)
    
    return info