| """ |
SCHP (Self-Correction Human Parsing) — Transformers-compatible implementation.
| |
| Architecture inlined from https://github.com/GoGoDuck912/Self-Correction-Human-Parsing |
| (networks/AugmentCE2P.py) with the CUDA-only InPlaceABNSync replaced by a pure-PyTorch |
| drop-in, making the model fully runnable on CPU. |
| """ |
|
|
| import functools |
| from dataclasses import dataclass |
| from typing import Optional, Tuple, Union |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from schp.configuration_schp import SCHPConfig |
| from transformers import PreTrainedModel |
| from transformers.utils import ModelOutput |
|
|
|
|
| |
class InPlaceABNSync(nn.BatchNorm2d):
    """CPU-compatible drop-in replacement for the CUDA-only InPlaceABNSync.

    Inherits directly from ``nn.BatchNorm2d`` so that the state-dict keys
    (weight, bias, running_mean, running_var) line up with the original SCHP
    checkpoints without any extra module nesting. An optional activation
    (``"leaky_relu"`` or ``"elu"``; anything else means identity) is applied
    right after the normalization.
    """

    # Only these kwargs are meaningful to nn.BatchNorm2d; everything else
    # (e.g. ABN-specific options) is silently dropped.
    _BN_KWARGS = ("eps", "momentum", "affine", "track_running_stats")

    def __init__(self, num_features, activation="leaky_relu", slope=0.01, **kwargs):
        filtered = {key: kwargs[key] for key in self._BN_KWARGS if key in kwargs}
        super().__init__(num_features, **filtered)
        self.activation = activation
        self.slope = slope

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # Normalize first, then fuse in the configured activation.
        out = super().forward(input)
        if self.activation == "leaky_relu":
            out = F.leaky_relu(out, negative_slope=self.slope, inplace=True)
        elif self.activation == "elu":
            out = F.elu(out, inplace=True)
        return out
|
|
|
|
| |
# "none" makes InPlaceABNSync's forward skip the activation, i.e. this alias
# behaves as a plain BatchNorm2d while keeping checkpoint-compatible keys.
BatchNorm2d = functools.partial(InPlaceABNSync, activation="none")
# Carried over from the original SCHP code: downsample BNs use affine params.
affine_par = True
|
|
|
|
| |
| def _conv3x3(in_planes, out_planes, stride=1): |
| return nn.Conv2d( |
| in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False |
| ) |
|
|
|
|
class _Bottleneck(nn.Module):
    """ResNet bottleneck (1x1 -> 3x3 -> 1x1) with dilation / multi-grid support.

    Attribute names (conv1/bn1/...) are kept as-is: they are the state-dict
    keys of the published SCHP checkpoints.
    """

    expansion = 4

    def __init__(
        self, inplanes, planes, stride=1, dilation=1, downsample=None, multi_grid=1
    ):
        super().__init__()
        # The 3x3 conv carries both the stride and the (possibly multi-grid
        # scaled) dilation; padding equals dilation so H/W shrink only by stride.
        eff_dilation = dilation * multi_grid
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=stride,
            padding=eff_dilation,
            dilation=eff_dilation,
            bias=False,
        )
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=False)
        self.relu_inplace = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def forward(self, x):
        # Shortcut path: identity, or 1x1 projection when shape changes.
        shortcut = x if self.downsample is None else self.downsample(x)
        y = self.relu(self.bn1(self.conv1(x)))
        y = self.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        return self.relu_inplace(y + shortcut)
|
|
|
|
class _PSPModule(nn.Module):
    """Pyramid pooling (PSPNet-style): multi-scale pooled context + input, fused by a 3x3 conv."""

    def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
        super().__init__()
        # One branch per pyramid bin size: pool -> 1x1 conv -> BN(+act).
        branches = []
        for bin_size in sizes:
            branches.append(
                nn.Sequential(
                    nn.AdaptiveAvgPool2d(bin_size),
                    nn.Conv2d(features, out_features, kernel_size=1, bias=False),
                    InPlaceABNSync(out_features),
                )
            )
        self.stages = nn.ModuleList(branches)
        # Fuses [branches..., input] -> out_features channels.
        self.bottleneck = nn.Sequential(
            nn.Conv2d(
                features + len(sizes) * out_features,
                out_features,
                kernel_size=3,
                padding=1,
                dilation=1,
                bias=False,
            ),
            InPlaceABNSync(out_features),
        )

    def forward(self, feats):
        target = (feats.size(2), feats.size(3))
        # Upsample every pooled branch back to the input resolution, then
        # append the untouched input as the last concat member (order matters
        # for checkpoint compatibility of the bottleneck conv).
        pyramid = []
        for stage in self.stages:
            pyramid.append(
                F.interpolate(
                    stage(feats), size=target, mode="bilinear", align_corners=True
                )
            )
        pyramid.append(feats)
        return self.bottleneck(torch.cat(pyramid, dim=1))
|
|
|
|
class _Edge_Module(nn.Module):
    """Edge-detection branch operating on three backbone stages.

    Returns both the fused 2-class edge logits and the concatenated
    per-stage edge features (consumed by the fusion head).
    """

    def __init__(self, in_fea=(256, 512, 1024), mid_fea=256, out_fea=2):
        super().__init__()
        # One 1x1 projection per input stage, all down to mid_fea channels.
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, bias=False),
            InPlaceABNSync(mid_fea),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, bias=False),
            InPlaceABNSync(mid_fea),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, bias=False),
            InPlaceABNSync(mid_fea),
        )
        # conv4 scores edges and is shared by all three scales.
        self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, bias=True)
        # conv5 merges the three per-scale edge maps into one.
        self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, bias=True)

    def forward(self, x1, x2, x3):
        target = (x1.size(2), x1.size(3))

        fea1 = self.conv1(x1)
        fea2 = self.conv2(x2)
        fea3 = self.conv3(x3)

        # Score each scale with the shared conv4; upsample the two coarser
        # edge maps to x1's resolution before fusing.
        edge1 = self.conv4(fea1)
        edge2 = F.interpolate(
            self.conv4(fea2), size=target, mode="bilinear", align_corners=True
        )
        edge3 = F.interpolate(
            self.conv4(fea3), size=target, mode="bilinear", align_corners=True
        )

        fea2 = F.interpolate(fea2, size=target, mode="bilinear", align_corners=True)
        fea3 = F.interpolate(fea3, size=target, mode="bilinear", align_corners=True)

        edge = self.conv5(torch.cat([edge1, edge2, edge3], dim=1))
        edge_fea = torch.cat([fea1, fea2, fea3], dim=1)
        return edge, edge_fea
|
|
|
|
class _Decoder_Module(nn.Module):
    """Decoder: merges PSP context features with low-level backbone features."""

    def __init__(self, num_classes):
        super().__init__()
        # Project the 512-ch context to 256 channels.
        self.conv1 = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=1, bias=False),
            InPlaceABNSync(256),
        )
        # Compress the low-level features to 48 channels.
        self.conv2 = nn.Sequential(
            nn.Conv2d(256, 48, kernel_size=1, bias=False),
            InPlaceABNSync(48),
        )
        # Fuse the concatenation: 256 (context) + 48 (low-level) = 304 in.
        self.conv3 = nn.Sequential(
            nn.Conv2d(304, 256, kernel_size=1, bias=False),
            InPlaceABNSync(256),
            nn.Conv2d(256, 256, kernel_size=1, bias=False),
            InPlaceABNSync(256),
        )
        # Per-pixel classifier on the fused features.
        self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, bias=True)

    def forward(self, xt, xl):
        """Return ``(class_logits, fused_features)`` at ``xl``'s resolution."""
        low_size = (xl.size(2), xl.size(3))
        context = F.interpolate(
            self.conv1(xt), size=low_size, mode="bilinear", align_corners=True
        )
        low_level = self.conv2(xl)
        fused = self.conv3(torch.cat([context, low_level], dim=1))
        return self.conv4(fused), fused
|
|
|
|
class _SCHPResNet(nn.Module):
    """SCHP ResNet-101 backbone + decoder (reproduced from AugmentCE2P.py).

    Pipeline: deep 3-conv stem -> four bottleneck stages (layer4 dilated,
    stride 1) -> PSP context module -> decoder (merged with low-level
    ``layer1`` features) and edge branch -> ``fushion`` head over both.

    NOTE: attribute names — including the misspelled ``fushion`` — are the
    state-dict keys of the published SCHP checkpoints and must not be renamed.
    """

    def __init__(self, num_classes: int):
        # _make_layer reads and updates self.inplanes. Assigned before
        # super().__init__() as in the original code; nn.Module tolerates
        # plain (non-module/non-parameter) attributes set this early.
        self.inplanes = 128
        super().__init__()

        # Deep stem: three 3x3 convs instead of ResNet's single 7x7 conv.
        # conv1 (stride 2) plus maxpool (stride 2) give an overall stem
        # stride of 4.
        self.conv1 = _conv3x3(3, 64, stride=2)
        self.bn1 = BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=False)
        self.conv2 = _conv3x3(64, 64)
        self.bn2 = BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=False)
        self.conv3 = _conv3x3(64, 128)
        self.bn3 = BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=False)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # ResNet-101 stage layout: 3 / 4 / 23 / 3 bottlenecks. layer4 uses
        # stride 1 with dilation 2, so the output stride stays at 16.
        self.layer1 = self._make_layer(_Bottleneck, 64, 3)
        self.layer2 = self._make_layer(_Bottleneck, 128, 4, stride=2)
        self.layer3 = self._make_layer(_Bottleneck, 256, 23, stride=2)
        self.layer4 = self._make_layer(
            _Bottleneck, 512, 3, stride=1, dilation=2, multi_grid=(1, 1, 1)
        )

        self.context_encoding = _PSPModule(2048, 512)
        self.edge = _Edge_Module()
        self.decoder = _Decoder_Module(num_classes)
        # Fusion head over concatenated decoder (256ch) + edge (3*256ch)
        # features = 1024 input channels. Spelling kept for checkpoint keys.
        self.fushion = nn.Sequential(
            nn.Conv2d(1024, 256, kernel_size=1, bias=False),
            InPlaceABNSync(256),
            nn.Dropout2d(0.1),
            nn.Conv2d(256, num_classes, kernel_size=1, bias=True),
        )

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
        """Build one stage of ``blocks`` bottlenecks.

        A 1x1-conv + BN downsample path is attached to the first block when
        the stride or channel count changes. ``multi_grid`` may be a tuple of
        per-block dilation multipliers (used for layer4); any non-tuple value
        means a multiplier of 1 for every block.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                BatchNorm2d(planes * block.expansion, affine=affine_par),
            )

        def _grid(i, g):
            # Per-block multi-grid multiplier; cycles through the tuple.
            return g[i % len(g)] if isinstance(g, tuple) else 1

        # First block carries the stride and the downsample path; the rest
        # operate at the expanded channel count with stride 1.
        layers = [
            block(
                self.inplanes,
                planes,
                stride,
                dilation=dilation,
                downsample=downsample,
                multi_grid=_grid(0, multi_grid),
            )
        ]
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    dilation=dilation,
                    multi_grid=_grid(i, multi_grid),
                )
            )
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``[[parsing_logits, fusion_logits], [edge_logits]]``.

        The decoder and edge branches operate at layer1 resolution (1/4 of
        the input); callers are expected to upsample the outputs.
        """
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)
        x2 = self.layer1(x)   # 256 ch, 1/4 resolution (low-level features)
        x3 = self.layer2(x2)  # 512 ch, 1/8
        x4 = self.layer3(x3)  # 1024 ch, 1/16
        x5 = self.layer4(x4)  # 2048 ch, 1/16 (dilated, no further stride)
        context = self.context_encoding(x5)
        parsing_result, parsing_fea = self.decoder(context, x2)
        edge_result, edge_fea = self.edge(x2, x3, x4)
        fusion_result = self.fushion(torch.cat([parsing_fea, edge_fea], dim=1))

        return [[parsing_result, fusion_result], [edge_result]]
|
|
|
|
| |
@dataclass
class SCHPSemanticSegmenterOutput(ModelOutput):
    """
    Output type for :class:`SCHPForSemanticSegmentation`.

    Args:
        loss: Cross-entropy loss (only when ``labels`` is provided).
        logits: Final fusion logits, shape ``(batch, num_labels, H, W)``,
            upsampled to the input image resolution.
        parsing_logits: Decoder-branch logits before fusion,
            shape ``(batch, num_labels, H, W)``.
        edge_logits: Edge-branch logits, shape ``(batch, 2, H, W)``.
    """

    # Declaration order matters: ModelOutput's tuple/index access follows
    # field order, with None-valued fields (e.g. loss at inference) dropped.
    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None
    parsing_logits: Optional[torch.Tensor] = None
    edge_logits: Optional[torch.Tensor] = None
|
|
|
|
| |
class SCHPForSemanticSegmentation(PreTrainedModel):
    """
    SCHP ResNet-101 for human parsing / semantic segmentation.

    Usage — loading from an original SCHP ``.pth`` checkpoint::

        model = SCHPForSemanticSegmentation.from_schp_checkpoint(
            "checkpoints/schp/exp-schp-201908301523-atr.pth"
        )

    Usage — loading after :meth:`save_pretrained`::

        model = SCHPForSemanticSegmentation.from_pretrained(
            "./my-schp-model", trust_remote_code=True
        )
    """

    config_class = SCHPConfig

    # num_batches_tracked buffers are absent from the original checkpoints;
    # leaving them at their defaults is harmless.
    _keys_to_ignore_on_load_missing = [r"\.num_batches_tracked$"]

    def __init__(self, config: SCHPConfig):
        super().__init__(config)
        self.model = _SCHPResNet(num_classes=config.num_labels)
        self.post_init()

    def forward(
        self,
        pixel_values: torch.Tensor,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SCHPSemanticSegmenterOutput, Tuple]:
        """
        Args:
            pixel_values: ``(batch, 3, H, W)`` — normalised with SCHP BGR-indexed means.
            labels: ``(batch, H, W)`` integer class map for computing CE loss.
            return_dict: Override ``config.use_return_dict``.
        """
        # FIX: honour config.use_return_dict as the docstring promises
        # (previously hard-coded to True when return_dict was None).
        if return_dict is None:
            return_dict = getattr(self.config, "use_return_dict", True)

        h, w = pixel_values.shape[-2:]
        # raw layout: [[parsing_logits, fusion_logits], [edge_logits]],
        # each at 1/4 of the input resolution.
        raw = self.model(pixel_values)

        def _upsample(t: torch.Tensor) -> torch.Tensor:
            # Bring branch logits back to the input image resolution.
            return F.interpolate(t, size=(h, w), mode="bilinear", align_corners=True)

        logits = _upsample(raw[0][1])          # fusion branch (primary prediction)
        parsing_logits = _upsample(raw[0][0])  # decoder branch before fusion
        edge_logits = _upsample(raw[1][0])     # 2-class edge branch

        loss = None
        if labels is not None:
            # NOTE(review): no ignore_index is set, so a "void" label
            # (commonly 255 in parsing datasets) would contribute to the
            # loss — confirm against the training labels before relying
            # on this for fine-tuning.
            loss = F.cross_entropy(logits, labels.long())

        if not return_dict:
            return (loss, logits) if loss is not None else (logits,)

        return SCHPSemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            parsing_logits=parsing_logits,
            edge_logits=edge_logits,
        )

    @classmethod
    def from_schp_checkpoint(
        cls,
        checkpoint_path: str,
        config: Optional[SCHPConfig] = None,
        map_location: str = "cpu",
    ) -> "SCHPForSemanticSegmentation":
        """
        Load from an original SCHP ``.pth`` checkpoint.

        Handles the ``module.`` prefix added by ``DataParallel`` training and
        remaps keys to the ``model.*`` namespace used by this wrapper.

        Args:
            checkpoint_path: Path to the ``.pth`` file.
            config: :class:`SCHPConfig` instance. Defaults to ATR-18 config.
            map_location: PyTorch device string (``"cpu"`` or ``"cuda"``).

        Raises:
            RuntimeError: If, after remapping, keys are missing (other than
                ``num_batches_tracked`` buffers) or unexpected.
        """
        if config is None:
            config = SCHPConfig()

        model = cls(config)

        # SECURITY: torch.load unpickles arbitrary objects — only load
        # checkpoints from trusted sources (consider weights_only=True on
        # torch >= 1.13 if the checkpoint format permits it).
        raw = torch.load(checkpoint_path, map_location=map_location)
        # A checkpoint may either wrap the weights under "state_dict" or
        # *be* the state dict itself.
        state_dict = raw.get("state_dict", raw) if isinstance(raw, dict) else raw

        # Strip the DataParallel "module." prefix when every key carries it
        # (the extra non-empty check avoids treating an empty dict as prefixed).
        if state_dict and all(k.startswith("module.") for k in state_dict):
            state_dict = {k[len("module.") :]: v for k, v in state_dict.items()}

        # The backbone lives under self.model, so prefix every key.
        state_dict = {f"model.{k}": v for k, v in state_dict.items()}

        missing, unexpected = model.load_state_dict(state_dict, strict=False)
        real_missing = [k for k in missing if "num_batches_tracked" not in k]
        if real_missing:
            raise RuntimeError(
                f"Missing keys when loading SCHP checkpoint ({len(real_missing)} total): "
                f"{real_missing[:5]}"
            )
        if unexpected:
            raise RuntimeError(
                f"Unexpected keys when loading SCHP checkpoint ({len(unexpected)} total): "
                f"{unexpected[:5]}"
            )

        return model
|
|