| """PyTorch MLE (Mnaga Line Extraction) model""" |
|
|
from dataclasses import dataclass
|
|
import torch
import torch.nn as nn
|
|
from transformers import PreTrainedModel
from transformers.modeling_outputs import ModelOutput
from transformers.activations import ACT2FN
|
|
from .configuration_mle import MLEConfig
|
|
|
|
@dataclass
class MLEModelOutput(ModelOutput):
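    """
    Output type of [`MLEModel`].

    Args:
        last_hidden_state (`torch.FloatTensor`, *optional*):
            Raw feature map produced by the decoder.
    """
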
    last_hidden_state: torch.FloatTensor | None = None
|
|
|
|
@dataclass
class MLEForAnimeLineExtractionOutput(ModelOutput):
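    """
    Output type of [`MLEForAnimeLineExtraction`].

    Args:
        last_hidden_state (`torch.FloatTensor`, *optional*):
            Raw output of the underlying [`MLEModel`].
        pixel_values (`torch.Tensor`, *optional*):
            Extracted line art, clipped to [0, 255] and cropped back to the input size.
    """
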
    last_hidden_state: torch.FloatTensor | None = None
    pixel_values: torch.Tensor | None = None
|
|
|
|
class MLEBatchNorm(nn.Module):
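    """BatchNorm2d followed by the activation selected by `config.hidden_act`
    (a `leaky_relu` activation additionally uses `config.negative_slope`)."""
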
    def __init__(
        self,
        config: MLEConfig,
        in_features: int,
    ):
        super().__init__()

        self.norm = nn.BatchNorm2d(in_features, eps=config.batch_norm_eps)

        if config.hidden_act == "leaky_relu":
            self.act_fn = nn.LeakyReLU(negative_slope=config.negative_slope)
        else:
            self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.norm(hidden_states)
        hidden_states = self.act_fn(hidden_states)

        return hidden_states
|
|
|
|
class MLEResBlock(nn.Module):
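    """Pre-activation residual block: two BatchNorm -> activation -> conv
    stages, with a 1x1-conv shortcut whenever the channel count or stride
    changes."""
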
    def __init__(
        self,
        config: MLEConfig,
        in_channels: int,
        out_channels: int,
        stride_size: int,
    ):
        super().__init__()

        self.norm1 = MLEBatchNorm(config, in_channels)
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            config.block_kernel_size,
            stride=stride_size,
            padding=config.block_kernel_size // 2,
        )

        self.norm2 = MLEBatchNorm(config, out_channels)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            config.block_kernel_size,
            stride=1,
            padding=config.block_kernel_size // 2,
        )

        if in_channels != out_channels or stride_size != 1:
            self.resize = nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride_size,
            )
        else:
            self.resize = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        output = self.norm1(hidden_states)
        output = self.conv1(output)
        output = self.norm2(output)
        output = self.conv2(output)

        if self.resize is not None:
            resized_input = self.resize(hidden_states)
            output += resized_input
        else:
            output += hidden_states

        return output
|
|
|
|
class MLEEncoderLayer(nn.Module):
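    """One encoder stage: `num_layers` residual blocks, where the first block
    maps `in_features` to `out_features` and the rest preserve the width."""
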
    def __init__(
        self,
        config: MLEConfig,
        in_features: int,
        out_features: int,
        num_layers: int,
        stride_sizes: list[int],
    ):
        super().__init__()

        self.blocks = nn.ModuleList(
            [
                MLEResBlock(
                    config,
                    in_channels=in_features if i == 0 else out_features,
                    out_channels=out_features,
                    stride_size=stride_sizes[i],
                )
                for i in range(num_layers)
            ]
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for block in self.blocks:
            hidden_states = block(hidden_states)
        return hidden_states
|
|
|
|
class MLEEncoder(nn.Module):
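    """Encoder: a sequence of stages whose channel width grows by a factor of
    `config.upsample_ratio` per stage, while every stage after the first
    halves the spatial resolution with a final stride-2 block."""
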
    def __init__(
        self,
        config: MLEConfig,
    ):
        super().__init__()

        self.layers = nn.ModuleList(
            [
                MLEEncoderLayer(
                    config,
                    in_features=(
                        config.in_channels
                        if i == 0
                        else config.in_channels
                        * config.block_patch_size
                        * (config.upsample_ratio ** (i - 1))
                    ),
                    out_features=config.in_channels
                    * config.block_patch_size
                    * (config.upsample_ratio**i),
                    num_layers=num_layers,
                    # Every stage after the first downsamples once, via a
                    # stride-2 block placed at the end of the stage.
                    stride_sizes=(
                        [
                            1 if i_layer < num_layers - 1 else 2
                            for i_layer in range(num_layers)
                        ]
                        if i > 0
                        else [1 for _ in range(num_layers)]
                    ),
                )
                for i, num_layers in enumerate(config.num_encoder_layers)
            ]
        )

    def forward(
        self, hidden_states: torch.Tensor
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, ...]]:
        all_hidden_states: tuple[torch.Tensor, ...] = ()
        for layer in self.layers:
            hidden_states = layer(hidden_states)
            all_hidden_states += (hidden_states,)
        return hidden_states, all_hidden_states
|
|
|
|
class MLEUpsampleBlock(nn.Module):
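    """BatchNorm -> activation -> conv, followed by nearest-neighbour
    upsampling by `config.upsample_ratio`."""
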
    def __init__(self, config: MLEConfig, in_features: int, out_features: int):
        super().__init__()

        self.norm = MLEBatchNorm(config, in_features=in_features)
        self.conv = nn.Conv2d(
            in_features,
            out_features,
            config.block_kernel_size,
            stride=1,
            padding=config.block_kernel_size // 2,
        )
        self.upsample = nn.Upsample(scale_factor=config.upsample_ratio)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        output = self.norm(hidden_states)
        output = self.conv(output)
        output = self.upsample(output)

        return output
|
|
|
|
class MLEUpsampleResBlock(nn.Module):
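    """Residual upsampling block: an [`MLEUpsampleBlock`] followed by
    BatchNorm -> activation -> conv, with a 1x1-conv + upsample shortcut when
    the channel count changes (no shortcut is added otherwise)."""
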
    def __init__(self, config: MLEConfig, in_features: int, out_features: int):
        super().__init__()

        self.upsample = MLEUpsampleBlock(
            config, in_features=in_features, out_features=out_features
        )

        self.norm = MLEBatchNorm(config, in_features=out_features)
        self.conv = nn.Conv2d(
            out_features,
            out_features,
            config.block_kernel_size,
            stride=1,
            padding=config.block_kernel_size // 2,
        )

        if in_features != out_features:
            self.resize = nn.Sequential(
                nn.Conv2d(
                    in_features,
                    out_features,
                    kernel_size=1,
                    stride=1,
                ),
                nn.Upsample(scale_factor=config.upsample_ratio),
            )
        else:
            self.resize = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        output = self.upsample(hidden_states)
        output = self.norm(output)
        output = self.conv(output)

        if self.resize is not None:
            output += self.resize(hidden_states)

        return output
|
|
|
|
class MLEDecoderLayer(nn.Module):
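    """One decoder stage: an upsampling residual block followed by stride-1
    residual blocks, with the matching encoder feature map added as a skip
    connection at the end."""
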
    def __init__(
        self,
        config: MLEConfig,
        in_features: int,
        out_features: int,
        num_layers: int,
    ):
        super().__init__()

        self.blocks = nn.ModuleList(
            [
                (
                    MLEResBlock(
                        config,
                        in_channels=out_features,
                        out_channels=out_features,
                        stride_size=1,
                    )
                    if i > 0
                    else MLEUpsampleResBlock(
                        config,
                        in_features=in_features,
                        out_features=out_features,
                    )
                )
                for i in range(num_layers)
            ]
        )

    def forward(
        self, hidden_states: torch.Tensor, shortcut_states: torch.Tensor
    ) -> torch.Tensor:
        for block in self.blocks:
            hidden_states = block(hidden_states)

        hidden_states += shortcut_states

        return hidden_states
|
|
|
|
class MLEDecoderHead(nn.Module):
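    """Final decoder stage: a stack of stride-1 residual blocks, then
    BatchNorm -> activation and a 1x1 convolution down to a single output
    channel."""
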
    def __init__(self, config: MLEConfig, num_layers: int):
        super().__init__()

        self.layer = MLEEncoderLayer(
            config,
            in_features=config.block_patch_size,
            out_features=config.last_hidden_channels,
            stride_sizes=[1 for _ in range(num_layers)],
            num_layers=num_layers,
        )
        self.norm = MLEBatchNorm(config, in_features=config.last_hidden_channels)
        self.conv = nn.Conv2d(
            config.last_hidden_channels,
            out_channels=1,
            kernel_size=1,
            stride=1,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.layer(hidden_states)
        hidden_states = self.norm(hidden_states)
        pixel_values = self.conv(hidden_states)
        return pixel_values
|
|
|
|
class MLEDecoder(nn.Module):
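    """Decoder: mirrors the encoder by dividing the channel width by
    `config.upsample_ratio` and upsampling spatially at each stage, consuming
    the encoder's intermediate feature maps as skip connections; the last
    stage is an [`MLEDecoderHead`]."""
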
    def __init__(
        self,
        config: MLEConfig,
    ):
        super().__init__()

        encoder_output_channels = (
            config.in_channels
            * config.block_patch_size
            * (config.upsample_ratio ** (len(config.num_encoder_layers) - 1))
        )
        upsample_ratio = config.upsample_ratio
        num_decoder_layers = config.num_decoder_layers

        self.layers = nn.ModuleList(
            [
                (
                    MLEDecoderLayer(
                        config,
                        in_features=encoder_output_channels // (upsample_ratio**i),
                        out_features=encoder_output_channels
                        // (upsample_ratio ** (i + 1)),
                        num_layers=num_layers,
                    )
                    if i < len(num_decoder_layers) - 1
                    else MLEDecoderHead(
                        config,
                        num_layers=num_layers,
                    )
                )
                for i, num_layers in enumerate(num_decoder_layers)
            ]
        )
|
|
    def forward(
        self,
        last_hidden_states: torch.Tensor,
        encoder_hidden_states: tuple[torch.Tensor, ...],
    ) -> torch.Tensor:
        hidden_states = last_hidden_states
        num_encoder_hidden_states = len(encoder_hidden_states)

        for i, layer in enumerate(self.layers):
            if i < len(self.layers) - 1:
                hidden_states = layer(
                    hidden_states,
                    # Skip connection from the matching encoder stage, walking
                    # the encoder outputs in reverse (the last one equals
                    # `last_hidden_states` and is skipped).
                    encoder_hidden_states[num_encoder_hidden_states - 2 - i],
                )
            else:
                # The final layer is the head and takes no skip connection.
                hidden_states = layer(hidden_states)

        return hidden_states
|
|
|
|
class MLEPretrainedModel(PreTrainedModel):
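    """
    An abstract class to handle weights initialization and a simple interface for
    downloading and loading pretrained models.
    """
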
    config_class = MLEConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
|
|
|
|
class MLEModel(MLEPretrainedModel):
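    """The bare MLE encoder-decoder model, returning the decoder's raw
    single-channel feature map."""
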
    def __init__(self, config: MLEConfig):
        super().__init__(config)
        self.config = config

        self.encoder = MLEEncoder(config)
        self.decoder = MLEDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        encoder_output, all_hidden_states = self.encoder(pixel_values)
        decoder_output = self.decoder(encoder_output, all_hidden_states)

        return decoder_output
|
|
|
|
class MLEForAnimeLineExtraction(MLEPretrainedModel):
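    """MLE model for anime/manga line extraction: runs [`MLEModel`] and
    post-processes its output into pixel values."""
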
    def __init__(self, config: MLEConfig):
        super().__init__(config)

        self.model = MLEModel(config)

        # Initialize weights and apply final processing
        self.post_init()

    def postprocess(
        self, output_tensor: torch.Tensor, input_shape: tuple[int, int]
    ) -> torch.Tensor:
        pixel_values = output_tensor[:, 0, :, :]
        pixel_values = torch.clip(pixel_values, 0, 255)

        # Crop back to the caller's original height and width.
        pixel_values = pixel_values[:, 0 : input_shape[0], 0 : input_shape[1]]
        return pixel_values

    def forward(
        self, pixel_values: torch.Tensor, return_dict: bool = True
    ) -> tuple[torch.Tensor, ...] | MLEForAnimeLineExtractionOutput:
        # Remember the input resolution (H, W) for cropping in postprocess.
        input_image_size = (pixel_values.shape[2], pixel_values.shape[3])

        model_output = self.model(pixel_values)

        if not return_dict:
            return (model_output, self.postprocess(model_output, input_image_size))
        else:
            return MLEForAnimeLineExtractionOutput(
                last_hidden_state=model_output,
                pixel_values=self.postprocess(model_output, input_image_size),
            )
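

# Minimal smoke-test sketch, not part of the model definition. It assumes that
# MLEConfig's defaults describe a valid architecture and that a 256x256 input
# divides evenly through the encoder's stride-2 stages; adjust as needed.
if __name__ == "__main__":
    config = MLEConfig()
    model = MLEForAnimeLineExtraction(config)
    model.eval()

    # A random batch of one image in [0, 255], shaped (batch, channels, H, W).
    pixel_values = torch.rand(1, config.in_channels, 256, 256) * 255

    with torch.no_grad():
        output = model(pixel_values)

    # `pixel_values` holds the extracted line art, cropped to the input size.
    print(output.pixel_values.shape)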
|
|