| | """ |
| | Image dataset for Electrical Outlets. |
| | FINAL v5: Direct folder_to_class mapping — no pattern matching, no ambiguity. |
| | """ |
| | from pathlib import Path |
| | import json |
| | import logging |
| | from collections import defaultdict |
| | from typing import Optional, Callable, List, Tuple |
| |
|
| | import torch |
| | from torch.utils.data import Dataset |
| | from PIL import Image |
| |
|
| | logger = logging.getLogger(__name__) |
| | logging.basicConfig(level=logging.INFO, format="%(message)s") |
| |
|
| |
|
class ElectricalOutletsImageDataset(Dataset):
    """Image classification dataset for electrical-outlet defect photos.

    Samples are discovered by scanning ``root`` for subfolders whose names
    appear verbatim in the ``folder_to_class`` mapping of the label-mapping
    JSON (direct lookup — no pattern matching). Every image file inside a
    matched folder is labeled with that folder's class index. A deterministic,
    per-class (stratified) train/val/test split is derived from ``seed``, so
    the same seed always yields the same split.

    Args:
        root: Directory containing one subfolder per class.
        label_mapping_path: Path to the label-mapping JSON. Must contain
            ``image.folder_to_class``, ``image.class_to_idx``,
            ``image.idx_to_issue_type`` and ``image.idx_to_severity``.
        split: One of ``"train"``, ``"val"`` or ``"test"``.
        train_ratio: Fraction of each class assigned to the train split.
        val_ratio: Fraction of each class assigned to the val split;
            the remainder goes to the test split.
        seed: RNG seed for the per-class shuffles.
        transform: Optional callable applied to each loaded PIL image.
        extensions: Accepted (lower-cased) file suffixes.

    Raises:
        ValueError: If ``split`` is not one of the three known names, or if
            no images are found under ``root``.
    """

    def __init__(
        self,
        root: Path,
        label_mapping_path: Path,
        split: str = "train",
        train_ratio: float = 0.7,
        val_ratio: float = 0.15,
        seed: int = 42,
        transform: Optional[Callable] = None,
        extensions: Tuple[str, ...] = (".jpg", ".jpeg", ".png"),
    ):
        # FIX: an unrecognized split name used to fall through silently to the
        # test split (the final `else`); fail fast on typos instead.
        if split not in ("train", "val", "test"):
            raise ValueError(f"split must be 'train', 'val' or 'test', got {split!r}")

        self.root = Path(root)
        self.transform = transform
        self.extensions = extensions
        self.split = split

        with open(label_mapping_path) as f:
            lm = json.load(f)

        img_cfg = lm["image"]
        self.folder_to_class = img_cfg["folder_to_class"]
        self.class_to_idx = img_cfg["class_to_idx"]
        # FIX: JSON object keys are always strings, so indexing these maps
        # with an int class index (as get_issue_type/get_severity do) raised
        # KeyError. Normalize dict keys to int; JSON lists already index by
        # int and are passed through unchanged.
        self.idx_to_issue_type = self._int_keys(img_cfg["idx_to_issue_type"])
        self.idx_to_severity = self._int_keys(img_cfg["idx_to_severity"])
        self.num_classes = len(self.class_to_idx)

        # ---- discover samples by direct folder-name lookup ----
        self.samples: List[Tuple[Path, int]] = []
        class_counts = defaultdict(int)
        matched_folders = []
        unmatched_folders = []

        for folder in sorted(self.root.iterdir()):
            if not folder.is_dir():
                continue
            class_key = self.folder_to_class.get(folder.name)
            if class_key is None:
                unmatched_folders.append(folder.name)
                continue
            cls_idx = self.class_to_idx[class_key]
            count = 0
            for f in folder.iterdir():
                if f.suffix.lower() in self.extensions:
                    self.samples.append((f, cls_idx))
                    count += 1
            class_counts[cls_idx] += count
            matched_folders.append(f" ✓ {folder.name} → {class_key} (idx={cls_idx}): {count} images")

        # ---- report what was matched/skipped so misconfigured roots are obvious ----
        logger.info(f"\n{'='*60}")
        logger.info(f"Dataset loading from: {self.root}")
        logger.info(f"{'='*60}")
        for line in matched_folders:
            logger.info(line)
        for uf in unmatched_folders:
            logger.warning(f" ✗ SKIPPED: '{uf}' (not in folder_to_class)")
        logger.info("\nClass distribution:")
        # Build the reverse map once instead of an O(C) scan per class.
        idx_to_class = {v: k for k, v in self.class_to_idx.items()}
        for idx in sorted(class_counts.keys()):
            logger.info(f" Class {idx} ({idx_to_class[idx]}): {class_counts[idx]} images")
        logger.info(f"Total: {len(self.samples)} images in {self.num_classes} classes")

        if len(self.samples) == 0:
            logger.error("NO SAMPLES FOUND! Check that data_root points to the folder containing your class subfolders.")
            raise ValueError(f"No images found in {self.root}. Check folder names match label_mapping.json folder_to_class keys.")

        # ---- deterministic stratified split: shuffle each class independently ----
        by_class = defaultdict(list)
        for i, (_, cls) in enumerate(self.samples):
            by_class[cls].append(i)

        train_idx, val_idx, test_idx = [], [], []
        for cls in sorted(by_class.keys()):
            indices = by_class[cls]
            # Fresh generator per class so each class's shuffle is independent
            # of iteration order and reproducible from `seed` alone.
            g = torch.Generator().manual_seed(seed)
            perm = torch.randperm(len(indices), generator=g).tolist()
            n_cls = len(indices)
            n_tr = int(n_cls * train_ratio)
            n_va = int(n_cls * val_ratio)
            train_idx.extend([indices[p] for p in perm[:n_tr]])
            val_idx.extend([indices[p] for p in perm[n_tr:n_tr + n_va]])
            test_idx.extend([indices[p] for p in perm[n_tr + n_va:]])

        if split == "train":
            self.indices = train_idx
        elif split == "val":
            self.indices = val_idx
        else:
            self.indices = test_idx

        logger.info(f"Split '{split}': {len(self.indices)} samples\n")

    @staticmethod
    def _int_keys(mapping):
        """Return `mapping` with str keys coerced to int (JSON artifact); lists pass through."""
        if isinstance(mapping, dict):
            return {int(k): v for k, v in mapping.items()}
        return mapping

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        i = self.indices[idx]
        path, cls = self.samples[i]
        img = Image.open(path).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, cls

    def get_issue_type(self, class_idx: int) -> str:
        """Return the issue-type string for an integer class index."""
        return self.idx_to_issue_type[class_idx]

    def get_severity(self, class_idx: int) -> str:
        """Return the severity string for an integer class index."""
        return self.idx_to_severity[class_idx]
| |
|
| |
|
def get_image_class_weights(
    label_mapping_path: Path,
    root: Path,
    extensions: Tuple[str, ...] = (".jpg", ".jpeg", ".png"),
) -> torch.Tensor:
    """Compute inverse-frequency class weights for a class-weighted loss.

    Counts images per class by scanning ``root`` with the same
    folder-name → class lookup the dataset uses, then weights each class as
    ``total / (num_classes * count)`` so rarer classes get larger weights.

    Args:
        label_mapping_path: JSON file containing ``image.folder_to_class``
            and ``image.class_to_idx``.
        root: Directory containing one subfolder per class.
        extensions: Accepted file suffixes (case-insensitive). Previously
            hard-coded; parameterized so it can stay in sync with the
            dataset's ``extensions`` argument.

    Returns:
        1-D float32 tensor of length ``num_classes``. Classes with zero
        images get weight 1.0; if no images exist at all, a uniform
        all-ones tensor is returned (avoids division by zero).
    """
    with open(label_mapping_path) as f:
        lm = json.load(f)
    folder_to_class = lm["image"]["folder_to_class"]
    class_to_idx = lm["image"]["class_to_idx"]
    num_classes = len(class_to_idx)
    counts = [0] * num_classes

    # Normalize once, outside the loop, so mixed-case suffixes still match.
    exts = tuple(e.lower() for e in extensions)

    root = Path(root)
    for folder in root.iterdir():
        if not folder.is_dir():
            continue
        class_key = folder_to_class.get(folder.name)
        if class_key is None:
            # Folder not in the mapping — same skip behavior as the dataset.
            continue
        cls_idx = class_to_idx[class_key]
        counts[cls_idx] += sum(1 for f in folder.iterdir() if f.suffix.lower() in exts)

    total = sum(counts)
    if total == 0:
        return torch.ones(num_classes)
    weights = [total / (num_classes * c) if c else 1.0 for c in counts]
    return torch.tensor(weights, dtype=torch.float32)
| |
|