| """ |
| predictor.py — Student inference file for hidden evaluation. |
| |
| ╔═════════════════════════════════════════════════════════╗ |
| ║  DO NOT RENAME ANY FUNCTION.                            ║ |
| ║  DO NOT CHANGE FUNCTION SIGNATURES.                     ║ |
| ║  DO NOT REMOVE ANY FUNCTION.                            ║ |
| ║  DO NOT RENAME CLS_CLASS_MAPPING or SEG_CLASS_MAPPING.  ║ |
| ║  You may add helper functions / imports as needed.      ║ |
| ╚═════════════════════════════════════════════════════════╝ |
| |
| Tasks |
| ----- |
| Task 3.1 — Multi-label image-level classification (5 classes). |
| Task 3.2 — Object detection + instance segmentation (5 classes). |
| |
| You must implement ALL FOUR functions below. |
| |
| Class Mappings |
| -------------- |
| Fill in the two dictionaries below (CLS_CLASS_MAPPING, SEG_CLASS_MAPPING) |
| to map your model's output indices to the canonical category names. |
| |
| The canonical 5 categories (from the DeepFashion2 subset) are: |
| short sleeve top, long sleeve top, trousers, shorts, skirt |
| |
| Your indices can be in any order, but the category name strings |
| must match exactly (case-insensitive). Background class is optional |
| but recommended for detection/segmentation models — the evaluator |
| will automatically ignore it. |
| |
| Important: Masks must be at the ORIGINAL image resolution. |
| If your model internally resizes images, resize the masks back |
| to the input image dimensions before returning them. |
| |
| Model Weights |
| ------------- |
| Place your trained weights inside model_files/ as: |
| model_files/cls.pt (or cls.pth) — classification model |
| model_files/seg.pt (or seg.pth) — detection + segmentation model |
| |
| Evaluation Metrics |
| ------------------ |
| Classification : Macro F1-score + Per-label macro accuracy |
| Detection : mAP @ [0.5 : 0.05 : 0.95] |
| Segmentation : Per-class mIoU (macro-averaged) |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
| from typing import Any, Dict, List |
|
|
| import numpy as np |
| import torch |
| import torch.nn as nn |
| import torchvision.models as models |
| import torchvision.transforms as T |
| from PIL import Image |
| from ultralytics import YOLO |
| import cv2 |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# Classification output index -> canonical category name.
# NOTE(review): the index order presumably mirrors the label order used when
# the classification weights were trained (note that "shorts" precedes
# "trousers", unlike the canonical listing) - confirm before reordering.
CLS_CLASS_MAPPING: Dict[int, str] = dict(
    enumerate(
        (
            "short sleeve top",  # 0
            "long sleeve top",  # 1
            "shorts",  # 2
            "trousers",  # 3
            "skirt",  # 4
        )
    )
)
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# Detection/segmentation class index -> canonical category name.
# NOTE(review): the index order presumably mirrors the class order the
# detection weights were trained with - confirm before reordering.
SEG_CLASS_MAPPING: Dict[int, str] = dict(
    enumerate(
        (
            "short sleeve top",  # 0
            "long sleeve top",  # 1
            "shorts",  # 2
            "trousers",  # 3
            "skirt",  # 4
        )
    )
)
|
|
|
|
| |
| |
| |
|
|
def _find_weights(folder: Path, stem: str) -> Path:
    """Locate the weights file for *stem* inside ``<folder>/model_files``.

    The ``.pt`` extension is tried before ``.pth``; the first candidate that
    exists on disk is returned.

    Raises
    ------
    FileNotFoundError
        If neither ``<stem>.pt`` nor ``<stem>.pth`` exists in that directory.
    """
    weights_dir = folder / "model_files"
    # Lazy generator: ".pth" is only probed when ".pt" is missing.
    candidates = (weights_dir / (stem + suffix) for suffix in (".pt", ".pth"))
    match = next((path for path in candidates if path.exists()), None)
    if match is None:
        raise FileNotFoundError(
            f"No weights file found for '{stem}' in {weights_dir}"
        )
    return match
|
|
|
|
def _load_json(path: Path) -> Dict[str, Any]:
    """Read the UTF-8 encoded file at *path* and return its decoded JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
|
|
|
| |
| |
| |
|
|
def load_classification_model(folder: str, device: str) -> Any:
    """
    Load the trained multi-label classification model.

    Builds a torchvision EfficientNet-B0 without pretrained weights, replaces
    the last classifier layer with a linear head that has one logit per entry
    of CLS_CLASS_MAPPING, restores the state dict saved in
    <folder>/model_files/cls.pt (or cls.pth), and returns the network on
    `device` in eval mode.

    Parameters
    ----------
    folder : str
        Absolute path to your submission folder (the one containing
        this predictor.py, model_files/, class_mapping_cls.json, etc.).
    device : str
        PyTorch device string, e.g. "cuda", "mps", or "cpu".

    Returns
    -------
    model : Any
        The nn.Module that is passed directly as the first argument to
        predict_classification().

    Raises
    ------
    FileNotFoundError
        If neither cls.pt nor cls.pth exists in <folder>/model_files.

    Notes
    -----
    - The weights file must contain a plain state dict: it is handed
      straight to load_state_dict() with its default strict key matching.
    - The head width follows len(CLS_CLASS_MAPPING), so a mapping that does
      not match the saved weights fails at load time instead of silently
      producing label vectors of the wrong length.
    """
    model_path = _find_weights(Path(folder), "cls")

    model = models.efficientnet_b0(weights=None)
    head_in_features = model.classifier[1].in_features
    model.classifier[1] = nn.Linear(head_in_features, len(CLS_CLASS_MAPPING))

    # weights_only=True limits unpickling to tensors and plain containers, so
    # loading the checkpoint cannot execute arbitrary pickled code. A plain
    # state dict (the only thing load_state_dict accepts here) loads unchanged.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)

    model.to(device)
    model.eval()

    return model
|
|
|
|
def predict_classification(model: Any, images: List[Image.Image]) -> List[Dict]:
    """
    Predict the multi-label class vector for every image in `images`.

    Each image is converted to RGB when necessary, resized to 256x256,
    centre-cropped to 224x224, normalised, and pushed through `model` on its
    own (batch size 1). A class is reported as present when its sigmoid
    probability is strictly greater than 0.4.

    Parameters
    ----------
    model : Any
        The object returned by load_classification_model().
    images : list of PIL.Image.Image
        The images to classify.

    Returns
    -------
    results : list of dict
        One dict per image, in input order, with the single key "labels":

            [
                {"labels": [int, int, int, int, int]},
                ...
            ]

        Each "labels" list holds one 0/1 entry per model output, in the
        index order of CLS_CLASS_MAPPING.

    Example
    -------
    >>> results = predict_classification(model, [img1, img2])
    >>> results[0]
    {"labels": [1, 0, 0, 1, 0]}
    """
    # NOTE(review): presumably the same preprocessing as used for validation
    # during training (mean/std are the standard ImageNet statistics) - confirm.
    preprocess = T.Compose([
        T.Resize((256, 256)),
        T.CenterCrop((224, 224)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Inputs must live on the same device as the model's parameters.
    target_device = next(model.parameters()).device

    def _classify(image):
        """Return the {"labels": [...]} dict for a single image."""
        rgb = image if image.mode == "RGB" else image.convert("RGB")
        batch = preprocess(rgb).unsqueeze(0).to(target_device)
        probabilities = torch.sigmoid(model(batch)).squeeze(0)
        return {"labels": (probabilities > 0.4).int().tolist()}

    with torch.no_grad():
        return [_classify(image) for image in images]
|
|
|
|
| |
| |
| |
|
|
def load_detection_model(folder: str, device: str) -> Any:
    """
    Load the trained detection + segmentation model.

    Wraps the weights found at <folder>/model_files/seg.pt (or seg.pth) in an
    ultralytics YOLO object and moves it to `device`.

    Parameters
    ----------
    folder : str
        Absolute path to your submission folder.
    device : str
        PyTorch device string, e.g. "cuda", "mps", or "cpu".

    Returns
    -------
    model : Any
        The YOLO object that is passed directly as the first argument to
        predict_detection_segmentation().

    Raises
    ------
    FileNotFoundError
        If neither seg.pt nor seg.pth exists in <folder>/model_files.
    """
    detector = YOLO(_find_weights(Path(folder), "seg"))
    detector.to(device)
    return detector
|
|
|
|
def predict_detection_segmentation(
    model: Any,
    images: List[Image.Image],
) -> List[Dict]:
    """
    Detect and segment garment instances in every image of `images`.

    Each image is converted to RGB when necessary and passed on its own to
    model.predict() with imgsz=640, conf=0.25 and retina_masks=True.

    Parameters
    ----------
    model : Any
        The object returned by load_detection_model().
    images : list of PIL.Image.Image
        The images to process.

    Returns
    -------
    results : list of dict
        One dict per image, in input order, with the keys "boxes", "scores",
        "labels" and "masks":

            [
                {
                    "boxes":  [[x1, y1, x2, y2], ...],  # list of float coords
                    "scores": [float, ...],             # confidence in [0, 1]
                    "labels": [int, ...],               # class indices (see mapping)
                    "masks":  [np.ndarray, ...]         # binary masks, H×W, uint8
                },
                ...
            ]

    Output contract
    ---------------
    - boxes / scores / labels / masks must all have the same length
      (= number of detected instances in that image).
    - Each box is [x1, y1, x2, y2] with x1 < x2, y1 < y2.
    - Coordinates must be within image bounds (0 ≤ x ≤ width, 0 ≤ y ≤ height).
    - Each score is a float in [0, 1].
    - Each label is an int index matching SEG_CLASS_MAPPING.
    - Each mask is a 2-D numpy array of shape (image_height, image_width)
      with dtype uint8, containing only 0 and 1.
    - If no objects are detected, all four lists are empty.

    Notes
    -----
    Boxes, scores and labels are returned as reported by the model; this
    function does not clip or re-validate them. When the number of masks
    does not equal the number of boxes, every mask for that image is
    replaced by an all-zero mask so the four lists keep the same length.

    Example
    -------
    >>> results = predict_detection_segmentation(model, [img])
    >>> results[0]["boxes"]
    [[100.0, 40.0, 300.0, 420.0], [50.0, 200.0, 250.0, 600.0]]
    >>> results[0]["masks"][0].shape
    (height, width)
    """

    def _binary_mask(raw, height, width):
        """Return `raw` as a uint8 0/1 mask of shape (height, width)."""
        if raw.shape != (height, width):
            # Nearest-neighbour keeps the values un-blended while resizing.
            raw = cv2.resize(raw, (width, height), interpolation=cv2.INTER_NEAREST)
        return (raw > 0.5).astype(np.uint8)

    outputs = []

    for image in images:
        rgb = image if image.mode == "RGB" else image.convert("RGB")
        width, height = rgb.size  # PIL reports (width, height)

        prediction = model.predict(
            source=rgb,
            imgsz=640,
            conf=0.25,
            verbose=False,
            retina_masks=True,
        )[0]

        boxes, scores, labels = [], [], []
        detections = prediction.boxes
        if detections is not None and len(detections) > 0:
            boxes = detections.xyxy.cpu().numpy().tolist()
            scores = detections.conf.cpu().numpy().tolist()
            labels = detections.cls.cpu().numpy().astype(int).tolist()

        instance_masks = prediction.masks
        if instance_masks is None or len(instance_masks) == 0:
            masks = []
        else:
            masks = [
                _binary_mask(raw, height, width)
                for raw in instance_masks.data.cpu().numpy()
            ]

        # Keep the four lists aligned: fall back to empty masks on a mismatch.
        if len(masks) != len(boxes):
            masks = [np.zeros((height, width), dtype=np.uint8) for _ in boxes]

        outputs.append({
            "boxes": boxes,
            "scores": scores,
            "labels": labels,
            "masks": masks
        })

    return outputs
|
|