# /*---------------------------------------------------------------------------------------------
# * Copyright (c) 2022-2023 STMicroelectronics.
# * All rights reserved.
# *
# * This software is licensed under terms that can be found in the LICENSE file in
# * the root directory of this software component.
# * If no LICENSE file comes with this software, it is provided AS-IS.
# *--------------------------------------------------------------------------------------------*/
import os
import numpy as np
import tensorflow as tf
from pathlib import Path
from omegaconf import DictConfig
from object_detection.tf.src.datasets.utils import compute_labels_stats, compute_class_stats, \
convert_dataset_to_yolo, convert_val_dataset_to_yolo, \
add_tfs_files_to_dataset, load_subset_dataloaders
from munch import DefaultMunch
from hydra.core.hydra_config import HydraConfig
def load_coco_like(cfg: DictConfig,
                   image_size: tuple[int],
                   val_batch_size: int) -> dict:
    """
    Validate the dataset configuration, prepare a COCO-format dataset if needed,
    and return the dataloaders built by `load_subset_dataloaders`.

    The operation mode decides which checks and preparation steps run:

    * training / evaluation modes with ``dataset.format == "coco"``:
        - when training is enabled, the dataset is converted with
          `convert_dataset_to_yolo` and class/label statistics are computed
          on the raw dataset directory;
        - when only evaluation is enabled, ``dataset.test_path`` must exist,
          the validation set is converted with `convert_val_dataset_to_yolo`
          for the modes "evaluation", "chain_eqe" and "chain_eqeb", and
          `add_tfs_files_to_dataset` is run on ``dataset.test_path``.
    * training / evaluation modes with ``dataset.format == "tfs"``: the
      dataloaders are returned directly, without conversion or analysis.
    * prediction and quantization modes: the corresponding path must be set
      and must exist on disk.

    Args:
        cfg (DictConfig): Configuration object. Fields read by this function:
            - operation_mode (str): One of the supported modes or chains
              (e.g., chain_eqeb, training, evaluation, etc.). Required.
            - quantization.operating_mode: when equal to "full_auto", the modes
              "quantization", "chain_qd" and "chain_qb" are also treated as
              evaluation modes. Optional.
            - dataset.class_names: List of class names to use. Required.
            - dataset.format: Dataset format ("coco" or "tfs" are accepted in
              training/evaluation modes).
            - dataset.download_data: when truthy (training only), the paths
              below are derived from ``dataset.data_dir`` which must contain
              ``train2017``, ``val2017`` and ``annotations``.
            - dataset.data_dir: Root directory of the raw dataset.
            - dataset.train_images_path / dataset.train_annotations_path:
              required for a custom dataset in training mode.
            - dataset.val_images_path / dataset.val_annotations_path:
              optional; a warning is printed when they are missing.
            - dataset.training_path: required in training mode.
            - dataset.validation_path: created on disk if provided.
            - dataset.test_path: required (and must exist) when evaluating
              without training.
            - dataset.exclude_unlabeled, dataset.max_detections: forwarded to
              `add_tfs_files_to_dataset`.
            - dataset.dataset_name: forwarded to the statistics helpers.
            - dataset.prediction_path: required (and must exist) in
              prediction mode.
            - dataset.quantization_path: required (and must exist) in
              quantization mode.
        image_size (tuple[int]): Forwarded to `load_subset_dataloaders`.
        val_batch_size (int): Forwarded to `load_subset_dataloaders`.

    Returns:
        dict: Whatever `load_subset_dataloaders` returns for the enabled
        subsets.

    Raises:
        ValueError: If a required configuration field is missing or empty, a
            required path does not exist, or the dataset format is not
            supported in training/evaluation mode.
    """
    if not hasattr(cfg, 'operation_mode'):
        raise ValueError("cfg.operation_mode must be specified")
    mode_str = cfg.operation_mode.lower()

    # Operation modes (single services and chains) that enable each stage.
    mode_groups = DefaultMunch.fromDict({
        "training": ["training", "chain_tqeb", "chain_tqe"],
        "evaluation": ["evaluation", "chain_tqeb", "chain_tqe", "chain_eqe", "chain_eqeb"],
        "quantization": ["quantization", "chain_tqeb", "chain_tqe", "chain_eqe",
                         "chain_qb", "chain_eqeb", "chain_qd"],
        "benchmarking": ["benchmarking", "chain_tqeb", "chain_qb", "chain_eqeb"],
        "deployment": ["deployment", "chain_qd"],
        "prediction": ["prediction"],
        "compression": ["compression"]
    })

    # In "full_auto" quantization the quantization-only modes also count as
    # evaluation modes. The quantization section is optional, so read it
    # defensively instead of dereferencing cfg.quantization directly.
    quantization_cfg = getattr(cfg, "quantization", None)
    if getattr(quantization_cfg, "operating_mode", None) == "full_auto":
        for extra_mode in ("quantization", "chain_qd", "chain_qb"):
            if extra_mode not in mode_groups.evaluation:
                mode_groups.evaluation.append(extra_mode)

    def is_mode_in_group(group_name):
        """Return True if the current operation mode belongs to `group_name`."""
        return mode_str in mode_groups.get(group_name, [])

    is_training = is_mode_in_group("training")
    is_evaluation = is_mode_in_group("evaluation")
    is_quantization = is_mode_in_group("quantization")
    is_prediction = is_mode_in_group("prediction")

    # Verify required class names
    if not hasattr(cfg.dataset, 'class_names'):
        raise ValueError("Class names must be specified in cfg.dataset.class_names")

    # A format key that is present but set to None is treated like a missing one.
    dataset_format = (getattr(cfg.dataset, "format", "") or "").lower()

    if is_training or is_evaluation:
        if dataset_format == "coco":
            if is_training:
                # Using the downloaded COCO layout is only allowed when training is enabled.
                if hasattr(cfg.dataset, 'download_data') and cfg.dataset.download_data:
                    if not hasattr(cfg.dataset, 'data_dir'):
                        raise ValueError("data_dir must be specified in cfg.dataset when download_data=True")
                    data_dir = cfg.dataset.data_dir
                    expected_dirs = ('train2017', 'val2017', 'annotations')
                    if not all(os.path.exists(os.path.join(data_dir, name)) for name in expected_dirs):
                        raise ValueError(f"Downloaded COCO dataset structure not found in {data_dir}. Expected train2017, val2017, and annotations directories.")
                    # Derive the image/annotation paths from the standard COCO 2017 layout.
                    cfg.dataset.train_images_path = os.path.join(data_dir, 'train2017')
                    cfg.dataset.val_images_path = os.path.join(data_dir, 'val2017')
                    cfg.dataset.train_annotations_path = os.path.join(data_dir, 'annotations', 'instances_train2017.json')
                    cfg.dataset.val_annotations_path = os.path.join(data_dir, 'annotations', 'instances_val2017.json')
                else:
                    if not hasattr(cfg.dataset, 'data_dir'):
                        raise ValueError("data_dir must be specified in cfg.dataset. It will be used to store the .tfs format dataset before splitting.")
                    if not (hasattr(cfg.dataset, 'train_images_path') and cfg.dataset.train_images_path):
                        raise ValueError("For custom dataset (download_data set to False) in training mode, train_images_path must be specified in cfg.dataset")
                    if not (hasattr(cfg.dataset, 'train_annotations_path') and cfg.dataset.train_annotations_path):
                        raise ValueError("For custom dataset (download_data set to False) in training mode, train_annotations_path must be specified in cfg.dataset")
                    has_val_images = hasattr(cfg.dataset, 'val_images_path') and cfg.dataset.val_images_path
                    has_val_annotations = hasattr(cfg.dataset, 'val_annotations_path') and cfg.dataset.val_annotations_path
                    if not has_val_images or not has_val_annotations:
                        print("Warning: Validation data paths not provided. Will use the validation_split from training data.")

                if not hasattr(cfg.dataset, 'training_path'):
                    raise ValueError("cfg.dataset.training_path must be specified for processed training data storage in training mode")
                if hasattr(cfg.dataset, 'validation_path') and cfg.dataset.validation_path:
                    os.makedirs(cfg.dataset.validation_path, exist_ok=True)

                # Raw dataset root: data_dir when available, otherwise training_path.
                raw_dataset_path = getattr(cfg.dataset, 'data_dir', None)
                if raw_dataset_path is None:
                    raw_dataset_path = getattr(cfg.dataset, 'training_path', None)
                if raw_dataset_path is None:
                    raise ValueError("Could not determine dataset root path. Please specify either data_dir or training_path in cfg.dataset")
                os.makedirs(raw_dataset_path, exist_ok=True)

                print("Starting dataset conversion to YOLO Darknet format...")
                convert_dataset_to_yolo(cfg)
                print("Dataset conversion completed.\n")

                print("Starting dataset analysis...")
                dataset_name = getattr(cfg.dataset, 'dataset_name', None)
                histogram_dir = HydraConfig.get().runtime.output_dir
                compute_class_stats(dataset_path=raw_dataset_path,
                                    dataset_name=dataset_name,
                                    histogram_dir=histogram_dir)
                compute_labels_stats(dataset_path=raw_dataset_path,
                                     dataset_name=dataset_name,
                                     histogram_dir=histogram_dir)
                print("Dataset analysis completed.\n")
            else:
                # Evaluation mode (no training stage in the current operation mode).
                test_path = getattr(cfg.dataset, 'test_path', None)
                if not test_path:
                    # Also covers a key that is present but None/empty, which
                    # would otherwise fail inside os.path.exists.
                    raise ValueError("cfg.dataset.test_path must be specified in evaluation mode")
                if not os.path.exists(test_path):
                    raise ValueError(f"Test path {test_path} does not exist")
                tfs_dataset_path = test_path
                os.makedirs(tfs_dataset_path, exist_ok=True)

                # Checking the case where running evaluation without training on COCO dataset
                # In this case, we require the validation dataset path variables in the cfg
                # and convert only the validation dataset
                if mode_str in ["evaluation", "chain_eqe", "chain_eqeb"]:
                    convert_val_dataset_to_yolo(cfg)

                # NOTE(review): tfs_dataset_path is only bound in this
                # evaluation-only branch, so the .tfs generation is kept here.
                # Confirm whether the training flow is also expected to run
                # add_tfs_files_to_dataset (and on which directory).
                print(f"Creating .tfs files for the {'training' if is_training else 'evaluation'} dataset...")
                # NOTE(review): these are read from cfg.dataset; when the key is
                # missing the forwarded value is False (not None) - confirm the
                # helper accepts that.
                exclude_unlabeled = (hasattr(cfg.dataset, 'exclude_unlabeled') and cfg.dataset.exclude_unlabeled)
                max_detections = (hasattr(cfg.dataset, 'max_detections') and cfg.dataset.max_detections)
                add_tfs_files_to_dataset(dataset_path=tfs_dataset_path,
                                         exclude_unlabeled_images=exclude_unlabeled,
                                         padded_labels_size=max_detections)
                print(".tfs files creation completed.")
        elif dataset_format == "tfs":
            # If format is tfs, directly load darknet-like without conversion or analysis
            print("Dataset format is 'tfs'. Skipping conversion and analysis steps.")
            return load_subset_dataloaders(cfg, is_training, is_evaluation,
                                           is_prediction, is_quantization,
                                           image_size=image_size, val_batch_size=val_batch_size)
        else:
            raise ValueError(f"Unsupported dataset format '{dataset_format}' for training/evaluation mode.")

    if is_prediction:
        _require_existing_dataset_path(cfg.dataset, 'prediction_path', 'prediction', 'Prediction')

    if is_quantization:
        _require_existing_dataset_path(cfg.dataset, 'quantization_path', 'quantization', 'Quantization')

    print("Loading datasets in darknet format...")
    return load_subset_dataloaders(cfg, is_training, is_evaluation,
                                   is_prediction, is_quantization,
                                   image_size=image_size, val_batch_size=val_batch_size)


def _require_existing_dataset_path(dataset_cfg, attr_name: str, mode_name: str, label: str) -> None:
    """
    Ensure `dataset_cfg.<attr_name>` is set to a path that exists on disk.

    Args:
        dataset_cfg: The ``cfg.dataset`` section of the configuration.
        attr_name (str): Name of the attribute holding the path.
        mode_name (str): Operation mode name used in the "must be specified" message.
        label (str): Capitalized label used in the "does not exist" message.

    Raises:
        ValueError: If the attribute is missing, None or empty, or if the
            path it points to does not exist.
    """
    path = getattr(dataset_cfg, attr_name, None)
    # Missing OR empty must both be rejected before touching the filesystem.
    if not path:
        raise ValueError(f"cfg.dataset.{attr_name} must be specified in {mode_name} mode")
    if not os.path.exists(path):
        raise ValueError(f"{label} path {path} does not exist")