File size: 13,045 Bytes
0243bd1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | """
predictor.py — Student inference file for hidden evaluation.
╔══════════════════════════════════════════════════════════════════╗
║  DO NOT RENAME ANY FUNCTION.                                     ║
║  DO NOT CHANGE FUNCTION SIGNATURES.                              ║
║  DO NOT REMOVE ANY FUNCTION.                                     ║
║  DO NOT RENAME CLS_CLASS_MAPPING or SEG_CLASS_MAPPING.           ║
║  You may add helper functions / imports as needed.               ║
╚══════════════════════════════════════════════════════════════════╝
Tasks
-----
Task 3.1 — Multi-label image-level classification (5 classes).
Task 3.2 — Object detection + instance segmentation (5 classes).
You must implement ALL FOUR functions below.
Class Mappings
--------------
Fill in the two dictionaries below (CLS_CLASS_MAPPING, SEG_CLASS_MAPPING)
to map your model's output indices to the canonical category names.
The canonical 5 categories (from the DeepFashion2 subset) are:
short sleeve top, long sleeve top, trousers, shorts, skirt
Your indices can be in any order, but the category name strings
must match exactly (case-insensitive). Background class is optional
but recommended for detection/segmentation models — the evaluator
will automatically ignore it.
Important: Masks must be at the ORIGINAL image resolution.
If your model internally resizes images, resize the masks back
to the input image dimensions before returning them.
Model Weights
-------------
Place your trained weights inside model_files/ as:
model_files/cls.pt (or cls.pth) ← classification model
model_files/seg.pt (or seg.pth) ← detection + segmentation model
Evaluation Metrics
------------------
Classification : Macro F1-score + Per-label macro accuracy
Detection : mAP @ [0.5 : 0.05 : 0.95]
Segmentation : Per-class mIoU (macro-averaged)
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Dict, List
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
from ultralytics import YOLO
import cv2
# ───────────────────────────────────────────────────────────────────
# CLASS MAPPINGS — FILL THESE IN
# ───────────────────────────────────────────────────────────────────
# Classification: maps your model's output index → canonical class name.
# Must have exactly 5 entries (one per clothing class, NO background).
# Example:
# CLS_CLASS_MAPPING = {
# 0: "short sleeve top",
# 1: "long sleeve top",
# 2: "trousers",
# 3: "shorts",
# 4: "skirt",
# }
# Classifier output index -> canonical category name (five entries, no
# background). predict_classification() returns its "labels" list in this
# index order.
# NOTE(review): indices 2 and 3 are "shorts" / "trousers", the reverse of the
# template example; presumably this mirrors the label order used when the
# classifier was trained — confirm against the training label encoding.
CLS_CLASS_MAPPING: Dict[int, str] = {
    0: "short sleeve top",
    1: "long sleeve top",
    2: "shorts",
    3: "trousers",
    4: "skirt",
}
# Detection + Segmentation: maps your model's output index → class name.
# Include background if your model outputs it (evaluator will ignore it).
# Example:
# SEG_CLASS_MAPPING = {
# 0: "background",
# 1: "short sleeve top",
# 2: "long sleeve top",
# 3: "trousers",
# 4: "shorts",
# 5: "skirt",
# }
# Detector/segmenter class index -> canonical category name.
# No background entry is declared here, so index 0 is a real clothing class.
# NOTE(review): presumably these ids match the class ids the segmentation
# model was trained with — confirm against the training dataset config.
SEG_CLASS_MAPPING: Dict[int, str] = {
    0: "short sleeve top",
    1: "long sleeve top",
    2: "shorts",
    3: "trousers",
    4: "skirt",
}
# ───────────────────────────────────────────────────────────────────
# Helper utilities (you may modify or add more)
# ───────────────────────────────────────────────────────────────────
def _find_weights(folder: Path, stem: str) -> Path:
    """Locate the weights file for *stem* inside ``<folder>/model_files``.

    The ``.pt`` extension is tried before ``.pth``; the first file that
    exists on disk is returned.

    Raises
    ------
    FileNotFoundError
        If neither ``stem.pt`` nor ``stem.pth`` exists.
    """
    weights_dir = folder / "model_files"
    # Lazy generator: stops probing the filesystem at the first match.
    options = (weights_dir / (stem + suffix) for suffix in (".pt", ".pth"))
    match = next((path for path in options if path.exists()), None)
    if match is None:
        raise FileNotFoundError(
            f"No weights file found for '{stem}' in {folder / 'model_files'}"
        )
    return match
def _load_json(path: Path) -> Dict[str, Any]:
    """Read the UTF-8 encoded file at *path* and return its parsed JSON content."""
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
# ───────────────────────────────────────────────────────────────────
# TASK 3.1 — CLASSIFICATION
# ───────────────────────────────────────────────────────────────────
def load_classification_model(folder: str, device: str) -> Any:
    """
    Build the classifier and restore its trained weights.

    Parameters
    ----------
    folder : str
        Path to the submission folder; the weights are read from
        ``<folder>/model_files/cls.pt`` (or ``cls.pth``).
    device : str
        PyTorch device string, e.g. "cuda", "mps", or "cpu".

    Returns
    -------
    model : Any
        An EfficientNet-B0 ``nn.Module`` with a 5-output linear head,
        moved to ``device`` and switched to eval mode. It is meant to be
        passed as the first argument of predict_classification().

    Raises
    ------
    FileNotFoundError
        If no ``cls`` weights file exists under ``model_files/``.
    """
    weights_file = _find_weights(Path(folder), "cls")

    # Untrained backbone; the checkpoint supplies every parameter.
    net = models.efficientnet_b0(weights=None)

    # Swap the stock classification layer for a 5-way head (one logit per class).
    head_inputs = net.classifier[1].in_features
    net.classifier[1] = nn.Linear(head_inputs, 5)

    # The checkpoint is loaded as a plain state_dict matching this architecture.
    net.load_state_dict(torch.load(weights_file, map_location=device))

    net.to(device)
    net.eval()
    return net
def predict_classification(model: Any, images: List[Image.Image]) -> List[Dict]:
    """
    Predict multi-label class membership for each image.

    Parameters
    ----------
    model : Any
        The object returned by load_classification_model().
    images : list of PIL.Image.Image
        Input images; any image whose mode is not "RGB" is converted first.

    Returns
    -------
    results : list of dict
        One ``{"labels": [int, ...]}`` dict per input image, in input order.
        Each entry is 0 or 1, one per model output, indexed as in
        CLS_CLASS_MAPPING.

    Example
    -------
    >>> results = predict_classification(model, [img1, img2])
    >>> results[0]
    {"labels": [1, 0, 0, 1, 0]}
    """
    # Resize to 256x256, centre-crop to 224x224, then apply ImageNet
    # mean/std normalisation (intended to mirror the training-time
    # validation transform).
    preprocess = T.Compose([
        T.Resize((256, 256)),
        T.CenterCrop((224, 224)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Run on whatever device the model's parameters live on.
    target_device = next(model.parameters()).device

    outputs = []
    with torch.no_grad():
        for picture in images:
            rgb = picture if picture.mode == "RGB" else picture.convert("RGB")
            batch = preprocess(rgb).unsqueeze(0).to(target_device)
            probabilities = torch.sigmoid(model(batch)).squeeze(0)
            # A class is reported present when its probability exceeds 0.4.
            outputs.append({"labels": (probabilities > 0.4).int().tolist()})
    return outputs
# ───────────────────────────────────────────────────────────────────
# TASK 3.2 — DETECTION + INSTANCE SEGMENTATION
# ───────────────────────────────────────────────────────────────────
def load_detection_model(folder: str, device: str) -> Any:
    """
    Load the trained detection + segmentation model.

    Parameters
    ----------
    folder : str
        Path to the submission folder; the weights are read from
        ``<folder>/model_files/seg.pt`` (or ``seg.pth``).
    device : str
        PyTorch device string, e.g. "cuda", "mps", or "cpu".

    Returns
    -------
    model : Any
        A ``YOLO`` object moved to ``device``, meant to be passed as the
        first argument of predict_detection_segmentation().

    Raises
    ------
    FileNotFoundError
        If no ``seg`` weights file exists under ``model_files/``.
    """
    detector = YOLO(_find_weights(Path(folder), "seg"))
    detector.to(device)
    return detector
def _fit_mask_to_image(mask: np.ndarray, height: int, width: int) -> np.ndarray:
    """Return *mask* as a strict 0/1 uint8 array of shape (height, width)."""
    # Binarise first so the resize below always receives uint8, regardless of
    # the dtype the model emitted (float, bool, ...). With nearest-neighbour
    # interpolation, thresholding before or after the resize is equivalent.
    binary = (np.asarray(mask) > 0.5).astype(np.uint8)
    if binary.shape != (height, width):
        binary = cv2.resize(binary, (width, height), interpolation=cv2.INTER_NEAREST)
    return binary


def predict_detection_segmentation(
    model: Any,
    images: List[Image.Image],
) -> List[Dict]:
    """
    Run detection + instance segmentation on a list of images.

    Parameters
    ----------
    model : Any
        The object returned by load_detection_model().
    images : list of PIL.Image.Image
        Input images; any image whose mode is not "RGB" is converted first.

    Returns
    -------
    results : list of dict
        One dict per image with keys "boxes", "scores", "labels", "masks":
        [
            {
                "boxes":  [[x1, y1, x2, y2], ...],  # list of float coords
                "scores": [float, ...],             # confidence values
                "labels": [int, ...],               # class indices (see mapping)
                "masks":  [np.ndarray, ...]         # binary masks, H x W, uint8
            },
            ...
        ]

    Output contract
    ---------------
    - boxes / scores / labels / masks all have the same length
      (= number of instances kept for that image).
    - Each box is [x1, y1, x2, y2] with x1 < x2 and y1 < y2. Coordinates are
      clamped to the image (0 <= x <= width, 0 <= y <= height); an instance
      whose box is degenerate after clamping is dropped together with its
      score, label and mask.
    - Each label is an int index matching SEG_CLASS_MAPPING.
    - Each mask is a 2-D numpy array of shape (image_height, image_width)
      with dtype uint8, containing only 0 and 1.
    - If the model returns boxes but a different number of masks (or none),
      every kept instance receives an all-zero mask.
    - If no objects are detected, all four lists are empty.

    Example
    -------
    >>> results = predict_detection_segmentation(model, [img])
    >>> results[0]["boxes"]
    [[100.0, 40.0, 300.0, 420.0], [50.0, 200.0, 250.0, 600.0]]
    >>> results[0]["masks"][0].shape
    (height, width)
    """
    results = []
    for img in images:
        if img.mode != "RGB":
            img = img.convert("RGB")
        width, height = img.size

        # retina_masks=True asks the model for masks at the input resolution.
        pred = model.predict(
            source=img, imgsz=640, conf=0.25, verbose=False, retina_masks=True
        )[0]

        entry: Dict[str, list] = {"boxes": [], "scores": [], "labels": [], "masks": []}
        if pred.boxes is None or len(pred.boxes) == 0:
            results.append(entry)
            continue

        # astype() copies, so the in-place clamp never touches model-owned memory.
        xyxy = pred.boxes.xyxy.cpu().numpy().astype(float)
        confidences = pred.boxes.conf.cpu().numpy().tolist()
        class_ids = pred.boxes.cls.cpu().numpy().astype(int).tolist()

        # Enforce the "within image bounds" part of the output contract.
        xyxy[:, [0, 2]] = np.clip(xyxy[:, [0, 2]], 0.0, float(width))
        xyxy[:, [1, 3]] = np.clip(xyxy[:, [1, 3]], 0.0, float(height))

        masks: List[np.ndarray] = []
        if pred.masks is not None and len(pred.masks) > 0:
            masks = [
                _fit_mask_to_image(raw, height, width)
                for raw in pred.masks.data.cpu().numpy()
            ]
        # Fallback: keep the four lists aligned even when masks are missing.
        if len(masks) != len(xyxy):
            masks = [np.zeros((height, width), dtype=np.uint8) for _ in range(len(xyxy))]

        for box, score, label, mask in zip(xyxy.tolist(), confidences, class_ids, masks):
            x1, y1, x2, y2 = box
            # A zero-area box violates x1 < x2 / y1 < y2, so drop the instance.
            if x1 >= x2 or y1 >= y2:
                continue
            entry["boxes"].append(box)
            entry["scores"].append(score)
            entry["labels"].append(label)
            entry["masks"].append(mask)

        results.append(entry)
    return results
|