# soccer-ball-detection / src/dataset_loader.py
# eeeeeeeeeeeeee3's picture
# Upload src/dataset_loader.py with huggingface_hub
# 6272613 verified
"""
Dataset loader for training RF-DETR models.
Supports COCO format annotations.
"""
import json
import os
from pathlib import Path
from typing import Dict, List, Tuple
def load_coco_annotations(annotation_path: str) -> Dict:
    """Load a COCO format annotation file and return its parsed JSON content.

    Args:
        annotation_path: Path to the JSON annotation file.

    Returns:
        The value produced by ``json.load`` for the file.

    Raises:
        FileNotFoundError: If ``annotation_path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not os.path.exists(annotation_path):
        raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # non-ASCII text (e.g. category names) loads the same on every system.
    with open(annotation_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def get_image_paths(images_dir: str) -> List[str]:
    """Get the sorted paths of all image files directly inside a directory.

    A file counts as an image when its name ends with ``.jpg``, ``.jpeg``,
    ``.png`` or ``.bmp``, compared case-insensitively. Subdirectories are not
    searched.

    Args:
        images_dir: Directory to scan.

    Returns:
        Sorted list of path strings, each formed as ``images_dir`` joined with
        the file name. Empty if ``images_dir`` is not an existing directory.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')
    directory = Path(images_dir)
    if not directory.is_dir():
        return []
    # A single directory scan with a case-insensitive suffix test replaces the
    # paired lower/upper-case globs: those list every file twice on
    # case-insensitive filesystems and miss mixed-case names such as "x.Jpg".
    # is_file() keeps directories that merely look like images out of the list.
    return sorted(
        str(entry)
        for entry in directory.iterdir()
        if entry.is_file() and entry.name.lower().endswith(image_extensions)
    )
def validate_dataset(dataset_path: str) -> Tuple[bool, List[str]]:
    """Validate dataset structure and return the issues found.

    The layout checked is ``<dataset_path>/images/`` and
    ``<dataset_path>/annotations/annotations.json``; the annotation file must
    be a JSON object with 'images', 'annotations' and 'categories' keys.

    Args:
        dataset_path: Root directory of the dataset.

    Returns:
        ``(is_valid, issues)`` where ``issues`` lists a human-readable message
        for every problem found and ``is_valid`` is True only when it is empty.
    """
    issues: List[str] = []
    root = Path(dataset_path)
    if not root.exists():
        issues.append(f"Dataset directory does not exist: {root}")
        return False, issues

    images_dir = root / "images"
    annotations_file = root / "annotations" / "annotations.json"

    # is_dir()/is_file() rather than exists(): a regular file called "images"
    # or a directory called "annotations.json" is not a usable dataset.
    if not images_dir.is_dir():
        issues.append(f"Images directory missing: {images_dir}")

    if not annotations_file.is_file():
        issues.append(f"Annotations file missing: {annotations_file}")
    else:
        try:
            coco_data = load_coco_annotations(str(annotations_file))
        except json.JSONDecodeError as e:
            issues.append(f"Invalid JSON in annotations: {e}")
        except (OSError, UnicodeDecodeError) as e:
            # Report unreadable or undecodable files as an issue instead of
            # letting the validator itself crash.
            issues.append(f"Could not read annotations file: {e}")
        else:
            if not isinstance(coco_data, dict):
                # The key checks below are only meaningful on a JSON object;
                # a scalar would raise TypeError on the membership test.
                issues.append("COCO format: top-level JSON value must be an object")
            else:
                for key in ('images', 'annotations', 'categories'):
                    if key not in coco_data:
                        issues.append(f"COCO format: missing '{key}' key")

    return not issues, issues
def get_dataset_info(dataset_path: str) -> Dict:
    """Build a summary dictionary describing the dataset.

    The result always has the keys 'path', 'images_count',
    'annotations_count', 'categories_count' and 'categories'. Counts stay at
    zero (and 'categories' stays empty) when the corresponding directory or
    file is absent. If reading the annotation file fails, the exception text
    is stored under an extra 'error' key instead of being raised.
    """
    root = Path(dataset_path)
    images_dir = root / "images"
    annotations_file = root / "annotations" / "annotations.json"

    summary = dict(
        path=str(root),
        images_count=0,
        annotations_count=0,
        categories_count=0,
        categories=[],
    )

    if images_dir.exists():
        found_images = get_image_paths(str(images_dir))
        summary['images_count'] = len(found_images)

    # Nothing more to report without an annotation file.
    if not annotations_file.exists():
        return summary

    try:
        coco = load_coco_annotations(str(annotations_file))
        summary['annotations_count'] = len(coco.get('annotations', []))
        category_entries = coco.get('categories', [])
        summary['categories_count'] = len(category_entries)
        summary['categories'] = [entry['name'] for entry in category_entries]
    except Exception as exc:
        # Best-effort summary: record the failure rather than propagate it.
        summary['error'] = str(exc)
    return summary