diff --git a/.gitattributes b/.gitattributes index 01af77dc76a68de64c71a17a166681c6e15c414d..b43131c82cd21f9ed4be4cef3bbc6c7583623af6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -48,3 +48,6 @@ wandb/offline-run-20250624_115955-iye05c18/run-iye05c18.wandb filter=lfs diff=lf wandb/offline-run-20250721_000454-up3efnok/run-up3efnok.wandb filter=lfs diff=lfs merge=lfs -text wandb/offline-run-20250722_003110-femxkckf/run-femxkckf.wandb filter=lfs diff=lfs merge=lfs -text seamless_interaction/assets/banner.gif filter=lfs diff=lfs merge=lfs -text +docs/resources/grpo_countdown.png filter=lfs diff=lfs merge=lfs -text +docs/resources/grpo_geoqa.png filter=lfs diff=lfs merge=lfs -text +docs/resources/grpo_openr1_multimodal.png filter=lfs diff=lfs merge=lfs -text diff --git a/docs/resources/grpo_countdown.png b/docs/resources/grpo_countdown.png new file mode 100644 index 0000000000000000000000000000000000000000..af2ce0c0ce08cb3d8b152f6bafe2c15c056dcd72 --- /dev/null +++ b/docs/resources/grpo_countdown.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b55fe6864e0c92549940d6989d92b3ab22be38a035cff3694525252737fc91e +size 2226402 diff --git a/docs/resources/grpo_geoqa.png b/docs/resources/grpo_geoqa.png new file mode 100644 index 0000000000000000000000000000000000000000..071d9b8eacb301bd96e30c2eff1471e68a7632a8 --- /dev/null +++ b/docs/resources/grpo_geoqa.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71246376b16f2ff288542dca2ff31532b16ef99f5e862797463d548e447e1f8d +size 2238084 diff --git a/docs/resources/grpo_openr1_multimodal.png b/docs/resources/grpo_openr1_multimodal.png new file mode 100644 index 0000000000000000000000000000000000000000..5d91a327b58283acf43de08cbdbdae59a8bca7c3 --- /dev/null +++ b/docs/resources/grpo_openr1_multimodal.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:050f56792468a4c9797a90314e322c16dd916bde3be24a7ce7c7b96381e70d9e +size 2302511 diff --git a/docs/transformers/build/lib/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py b/docs/transformers/build/lib/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..47cec7afac1a7222fbb2779ff51f840fa8c54e1d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Distill Any Depth checkpoints from the original repository. URL: +https://github.com/Westlake-AGI-Lab/Distill-Any-Depth""" + +import argparse +import re +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image +from safetensors.torch import load_file + +from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"(backbone|pretrained)\.cls_token": r"backbone.embeddings.cls_token", + r"(backbone|pretrained)\.mask_token": r"backbone.embeddings.mask_token", + r"(backbone|pretrained)\.pos_embed": r"backbone.embeddings.position_embeddings", + r"(backbone|pretrained)\.patch_embed\.proj\.(weight|bias)": r"backbone.embeddings.patch_embeddings.projection.\2", + r"(backbone|pretrained)\.norm\.(weight|bias)": r"backbone.layernorm.\2", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.proj\.(weight|bias)": r"backbone.encoder.layer.\4.attention.output.dense.\5", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.ls(1|2)\.gamma": r"backbone.encoder.layer.\4.layer_scale\5.lambda1", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.mlp\.fc(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.mlp.fc\5.\6", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.norm\5.\6", + r"depth_head\.projects\.(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.projection.\2", + r"depth_head\.resize_layers\.(?!2)(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.resize.\2", + r"depth_head\.scratch\.layer(\d+)_rn\.weight": lambda m: f"neck.convs.{int(m[1]) - 1}.weight", + r"depth_head\.scratch\.output_conv(\d+)(?:\.(\d+))?\.(weight|bias)": lambda m: ( + f"head.conv{int(m[1]) + (int(m[2]) // 2 if m[2] else 0)}.{m[3]}" if m[1] == "2" else f"head.conv{m[1]}.{m[3]}" + ), + r"depth_head\.scratch\.refinenet(\d+)\.out_conv\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.projection.{m[2]}", + r"depth_head\.scratch\.refinenet(\d+)\.resConfUnit(\d+)\.conv(\d+)\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.residual_layer{m[2]}.convolution{m[3]}.{m[4]}", +} + + +def get_dpt_config(model_name): + if "small" in model_name: + out_indices = [3, 6, 9, 12] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 64 + neck_hidden_sizes = [48, 96, 192, 384] + elif "base" in model_name: + out_indices = [3, 6, 9, 12] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 128 + neck_hidden_sizes = [96, 192, 384, 768] + elif "large" in model_name: + out_indices = [5, 12, 18, 24] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 256 + neck_hidden_sizes = [256, 512, 1024, 1024] + else: + raise NotImplementedError(f"Model not supported: {model_name}") + + depth_estimation_type = "relative" + max_depth = None + + config = DepthAnythingConfig( + reassemble_hidden_size=backbone_config.hidden_size, + patch_size=backbone_config.patch_size, + backbone_config=backbone_config, + fusion_hidden_size=fusion_hidden_size, + neck_hidden_sizes=neck_hidden_sizes, + depth_estimation_type=depth_estimation_type, + max_depth=max_depth, + ) + + return config + + +def convert_key_pattern(key, mapping): + for pattern, replacement in mapping.items(): + match = re.fullmatch(pattern, key) + if match: + if callable(replacement): + return replacement(match) + return re.sub(pattern, replacement, key) + return None + + +def convert_keys(state_dict, config): + new_state_dict = {} + qkv_pattern = r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.qkv\.(weight|bias)" + qkv_keys = [k for k in list(state_dict.keys()) if re.match(qkv_pattern, k)] + for old_key in qkv_keys: + value = state_dict.pop(old_key) + match = re.match(qkv_pattern, old_key) + _, _, _, layer, attr = match.groups() + hidden_size = config.backbone_config.hidden_size + q = value[:hidden_size] + k = value[hidden_size : hidden_size * 2] + v = value[-hidden_size:] + + for proj, tensor in zip(["query", "key", "value"], [q, k, v]): + new_key = f"backbone.encoder.layer.{layer}.attention.attention.{proj}.{attr}" + new_state_dict[new_key] = tensor + + for old_key in list(state_dict.keys()): + value = state_dict.pop(old_key) + new_key = convert_key_pattern(old_key, ORIGINAL_TO_CONVERTED_KEY_MAPPING) + + new_state_dict[new_key] = value + + return new_state_dict + + +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + return Image.open(requests.get(url, stream=True).raw) + + +name_to_checkpoint = { + "distill-any-depth-small": "small/model.safetensors", + "distill-any-depth-base": "base/model.safetensors", + "distill-any-depth-large": "large/model.safetensors", +} + + +@torch.no_grad() +def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): + config = get_dpt_config(model_name) + + repo_id = "xingyang1/Distill-Any-Depth" + filepath = hf_hub_download(repo_id=repo_id, filename=name_to_checkpoint[model_name]) + state_dict = load_file(filepath) + + converted_state_dict = convert_keys(state_dict, config) + + model = DepthAnythingForDepthEstimation(config) + model.load_state_dict(converted_state_dict) + model.eval() + + processor = DPTImageProcessor( + do_resize=True, + size={"height": 518, "width": 518}, + ensure_multiple_of=14, + keep_aspect_ratio=True, + do_rescale=True, + do_normalize=True, + image_mean=[0.485, 0.456, 0.406], + image_std=[0.229, 0.224, 0.225], + ) + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + pixel_values = processor(image, return_tensors="pt").pixel_values + + with torch.no_grad(): + outputs = model(pixel_values) + predicted_depth = outputs.predicted_depth + + print("Shape of predicted depth:", predicted_depth.shape) + print("First values:", predicted_depth[0, :3, :3]) + + if verify_logits: + print("Verifying logits...") + expected_shape = torch.Size([1, 518, 686]) + + if model_name == "distill-any-depth-small": + expected_slice = torch.tensor( + [[2.5653, 2.5249, 2.5570], [2.4897, 2.5235, 2.5355], [2.5255, 2.5261, 2.5422]] + ) + elif model_name == "distill-any-depth-base": + expected_slice = torch.tensor( + [[4.8976, 4.9075, 4.9403], [4.8872, 4.8906, 4.9448], [4.8712, 4.8898, 4.8838]] + ) + elif model_name == "distill-any-depth-large": + expected_slice = torch.tensor( + [[55.1067, 51.1828, 51.6803], [51.9098, 50.7529, 51.4494], [50.1745, 50.5491, 50.8818]] + ) + else: + raise ValueError("Not supported") + + assert predicted_depth.shape == torch.Size(expected_shape) + assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model and processor to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print("Pushing model and processor to hub...") + model.push_to_hub(repo_id=f"{model_name.title()}-hf") + processor.push_to_hub(repo_id=f"{model_name.title()}-hf") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + default="distill-any-depth-small", + type=str, + choices=name_to_checkpoint.keys(), + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model to the hub after conversion.", + ) + parser.add_argument( + "--verify_logits", + action="store_true", + required=False, + help="Whether to verify the logits after conversion.", + ) + + args = parser.parse_args() + convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/docs/transformers/build/lib/transformers/models/depth_anything/modeling_depth_anything.py b/docs/transformers/build/lib/transformers/models/depth_anything/modeling_depth_anything.py new file mode 100644 index 0000000000000000000000000000000000000000..17b79134adbe6f02c1e1c48caac3d84cd83bef7e --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/depth_anything/modeling_depth_anything.py @@ -0,0 +1,469 @@ +# coding=utf-8 +# Copyright 2024 TikTok and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Depth Anything model.""" + +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...file_utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from ...utils.backbone_utils import load_backbone +from .configuration_depth_anything import DepthAnythingConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "DepthAnythingConfig" + + +DEPTH_ANYTHING_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthAnythingConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_ANYTHING_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +class DepthAnythingReassembleLayer(nn.Module): + def __init__(self, config, channels, factor): + super().__init__() + self.projection = nn.Conv2d(in_channels=config.reassemble_hidden_size, out_channels=channels, kernel_size=1) + + # up/down sampling depending on factor + if factor > 1: + self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) + elif factor == 1: + self.resize = nn.Identity() + elif factor < 1: + # so should downsample + self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + + # Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward + def forward(self, hidden_state): + hidden_state = self.projection(hidden_state) + hidden_state = self.resize(hidden_state) + + return hidden_state + + +class DepthAnythingReassembleStage(nn.Module): + """ + This class reassembles the hidden states of the backbone into image-like feature representations at various + resolutions. + + This happens in 3 stages: + 1. Take the patch embeddings and reshape them to image-like feature representations. + 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. + 3. Resizing the spatial dimensions (height, width). + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.config = config + self.layers = nn.ModuleList() + for channels, factor in zip(config.neck_hidden_sizes, config.reassemble_factors): + self.layers.append(DepthAnythingReassembleLayer(config, channels=channels, factor=factor)) + + def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]: + """ + Args: + hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): + List of hidden states from the backbone. + """ + out = [] + + for i, hidden_state in enumerate(hidden_states): + # reshape to (batch_size, num_channels, height, width) + hidden_state = hidden_state[:, 1:] + batch_size, _, num_channels = hidden_state.shape + hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + hidden_state = self.layers[i](hidden_state) + out.append(hidden_state) + + return out + + +class DepthAnythingPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=True, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=True, + ) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + hidden_state = self.convolution1(hidden_state) + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + return hidden_state + residual + + +class DepthAnythingFeatureFusionLayer(nn.Module): + """Feature fusion layer, merges feature maps from different stages. + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + self.residual_layer1 = DepthAnythingPreActResidualLayer(config) + self.residual_layer2 = DepthAnythingPreActResidualLayer(config) + + def forward(self, hidden_state, residual=None, size=None): + if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) + hidden_state = hidden_state + self.residual_layer1(residual) + + hidden_state = self.residual_layer2(hidden_state) + + modifier = {"scale_factor": 2} if size is None else {"size": size} + + hidden_state = nn.functional.interpolate( + hidden_state, + **modifier, + mode="bilinear", + align_corners=True, + ) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +class DepthAnythingFeatureFusionStage(nn.Module): + # Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage.__init__ with DPT->DepthAnything + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList() + for _ in range(len(config.neck_hidden_sizes)): + self.layers.append(DepthAnythingFeatureFusionLayer(config)) + + def forward(self, hidden_states, size=None): + # reversing the hidden_states, we start from the last + hidden_states = hidden_states[::-1] + + fused_hidden_states = [] + fused_hidden_state = None + + for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)): + size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None + + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state, size=size) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size) + + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +# Modified from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->DepthAnything,dpt->depth_anything +# avoiding sdpa and flash_attn_2 support, it's done in the backend +class DepthAnythingPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DepthAnythingConfig + base_model_prefix = "depth_anything" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class DepthAnythingNeck(nn.Module): + """ + DepthAnythingNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as + input and produces another list of tensors as output. For DepthAnything, it includes 2 stages: + + * DepthAnythingReassembleStage + * DepthAnythingFeatureFusionStage. + + Args: + config (dict): config dict. + """ + + def __init__(self, config): + super().__init__() + self.config = config + + self.reassemble_stage = DepthAnythingReassembleStage(config) + + self.convs = nn.ModuleList() + for channel in config.neck_hidden_sizes: + self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) + + # fusion + self.fusion_stage = DepthAnythingFeatureFusionStage(config) + + def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]: + """ + Args: + hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`): + List of hidden states from the backbone. + """ + if not isinstance(hidden_states, (tuple, list)): + raise TypeError("hidden_states should be a tuple or list of tensors") + + if len(hidden_states) != len(self.config.neck_hidden_sizes): + raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") + + # postprocess hidden states + hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width) + + features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)] + + # fusion blocks + output = self.fusion_stage(features) + + return output + + +class DepthAnythingDepthEstimationHead(nn.Module): + """ + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's + supplementary material). The final activation function is either ReLU or Sigmoid, depending on the depth estimation + type (relative or metric). For metric depth estimation, the output is scaled by the maximum depth used during pretraining. + """ + + def __init__(self, config): + super().__init__() + + self.head_in_index = config.head_in_index + self.patch_size = config.patch_size + + features = config.fusion_hidden_size + self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1) + self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1) + self.activation1 = nn.ReLU() + self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0) + if config.depth_estimation_type == "relative": + self.activation2 = nn.ReLU() + elif config.depth_estimation_type == "metric": + self.activation2 = nn.Sigmoid() + else: + raise ValueError(f"Unknown depth estimation type: {config.depth_estimation_type}") + self.max_depth = config.max_depth + + def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> torch.Tensor: + hidden_states = hidden_states[self.head_in_index] + + predicted_depth = self.conv1(hidden_states) + predicted_depth = nn.functional.interpolate( + predicted_depth, + (int(patch_height * self.patch_size), int(patch_width * self.patch_size)), + mode="bilinear", + align_corners=True, + ) + predicted_depth = self.conv2(predicted_depth) + predicted_depth = self.activation1(predicted_depth) + predicted_depth = self.conv3(predicted_depth) + predicted_depth = self.activation2(predicted_depth) * self.max_depth + predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width) + + return predicted_depth + + +@add_start_docstrings( + """ + Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. + """, + DEPTH_ANYTHING_START_DOCSTRING, +) +class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel): + _no_split_modules = ["DPTViTEmbeddings"] + + def __init__(self, config): + super().__init__(config) + + self.backbone = load_backbone(config) + self.neck = DepthAnythingNeck(config) + self.head = DepthAnythingDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEPTH_ANYTHING_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") + >>> model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], + ... ) + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = self.backbone.forward_with_filtered_kwargs( + pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + ) + hidden_states = outputs.feature_maps + + _, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + patch_height = height // patch_size + patch_width = width // patch_size + + hidden_states = self.neck(hidden_states, patch_height, patch_width) + + predicted_depth = self.head(hidden_states, patch_height, patch_width) + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["DepthAnythingForDepthEstimation", "DepthAnythingPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/depth_pro/configuration_depth_pro.py b/docs/transformers/build/lib/transformers/models/depth_pro/configuration_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..64b58d552067a4827e1d7aeaa892fd90398c6614 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/depth_pro/configuration_depth_pro.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DepthPro model configuration""" + +from copy import deepcopy + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class DepthProConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a + DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DepthPro + [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. + patch_size (`int`, *optional*, defaults to 384): + The size (resolution) of each patch. This is also the image_size for backbone model. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): + Indices of the intermediate hidden states from the patch encoder to use for fusion. + intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): + Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`. + scaled_images_ratios (`List[float]`, *optional*, defaults to `[0.25, 0.5, 1]`): + Ratios of scaled images to be used by the patch encoder. + scaled_images_overlap_ratios (`List[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`): + Overlap ratios between patches for each scaled image in `scaled_images_ratios`. + scaled_images_feature_dims (`List[int]`, *optional*, defaults to `[1024, 1024, 512]`): + Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. + merge_padding_value (`int`, *optional*, defaults to 3): + When merging smaller patches back to the image size, overlapping sections of this size are removed. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. + use_fov_model (`bool`, *optional*, defaults to `False`): + Whether to use `DepthProFovModel` to generate the field of view. + num_fov_head_layers (`int`, *optional*, defaults to 2): + Number of convolution layers in the head of `DepthProFovModel`. + image_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the image encoder model, which is loaded using the [`AutoModel`] API. + By default, Dinov2 model is used as backbone. + patch_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the patch encoder model, which is loaded using the [`AutoModel`] API. + By default, Dinov2 model is used as backbone. + fov_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the fov encoder model, which is loaded using the [`AutoModel`] API. + By default, Dinov2 model is used as backbone. + + Example: + + ```python + >>> from transformers import DepthProConfig, DepthProModel + + >>> # Initializing a DepthPro apple/DepthPro style configuration + >>> configuration = DepthProConfig() + + >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration + >>> model = DepthProModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_pro" + sub_configs = {"image_model_config": AutoConfig, "patch_model_config": AutoConfig, "fov_model_config": AutoConfig} + + def __init__( + self, + fusion_hidden_size=256, + patch_size=384, + initializer_range=0.02, + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], + merge_padding_value=3, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, + use_fov_model=False, + num_fov_head_layers=2, + image_model_config=None, + patch_model_config=None, + fov_model_config=None, + **kwargs, + ): + super().__init__(**kwargs) + + # scaled_images_ratios is sorted + if scaled_images_ratios != sorted(scaled_images_ratios): + raise ValueError( + f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high" + ) + + # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent + if not (len(scaled_images_ratios) == len(scaled_images_overlap_ratios) == len(scaled_images_feature_dims)): + raise ValueError( + f"len(scaled_images_ratios)={len(scaled_images_ratios)} and " + f"len(scaled_images_overlap_ratios)={len(scaled_images_overlap_ratios)} and " + f"len(scaled_images_feature_dims)={len(scaled_images_feature_dims)}, " + f"should match in config." + ) + + # intermediate_hook_ids, intermediate_feature_dims should be consistent + if not (len(intermediate_hook_ids) == len(intermediate_feature_dims)): + raise ValueError( + f"len(intermediate_hook_ids)={len(intermediate_hook_ids)} and " + f"len(intermediate_feature_dims)={len(intermediate_feature_dims)}, " + f"should match in config." + ) + + # fusion_hidden_size should be consistent with num_fov_head_layers + if fusion_hidden_size // 2**num_fov_head_layers == 0: + raise ValueError( + f"fusion_hidden_size={fusion_hidden_size} should be consistent with num_fov_head_layers={num_fov_head_layers} " + "i.e fusion_hidden_size // 2**num_fov_head_layers > 0" + ) + + self.fusion_hidden_size = fusion_hidden_size + self.patch_size = patch_size + self.initializer_range = initializer_range + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual + self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers + self.intermediate_hook_ids = intermediate_hook_ids + self.intermediate_feature_dims = intermediate_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims + self.merge_padding_value = merge_padding_value + self.image_model_config = image_model_config + self.patch_model_config = patch_model_config + self.fov_model_config = fov_model_config + + for sub_config_key in self.sub_configs.keys(): + sub_config = getattr(self, sub_config_key) + + if sub_config is None: + sub_config = CONFIG_MAPPING["dinov2"](image_size=patch_size) + logger.info( + f"`{sub_config_key}` is `None`. Initializing `{sub_config_key}` with the `Dinov2Config` " + f"with default values except `{sub_config_key}.image_size` is set to `config.patch_size`." + ) + elif isinstance(sub_config, dict): + sub_config = deepcopy(sub_config) + if "model_type" not in sub_config: + raise KeyError( + f"The `model_type` key is missing in the `{sub_config_key}` dictionary. Please provide the model type." + ) + elif sub_config["model_type"] not in CONFIG_MAPPING: + raise ValueError( + f"The model type `{sub_config['model_type']}` in `{sub_config_key}` is not supported. Please provide a valid model type." + ) + image_size = sub_config.get("image_size") + if image_size != patch_size: + logger.info( + f"The `image_size` in `{sub_config_key}` is set to `{image_size}`, " + f"but it does not match the required `patch_size` of `{patch_size}`. " + f"Updating `image_size` to `{patch_size}` for consistency. " + f"Ensure that `image_size` aligns with `patch_size` in the configuration." + ) + sub_config.update({"image_size": patch_size}) + sub_config = CONFIG_MAPPING[sub_config["model_type"]](**sub_config) + elif isinstance(sub_config, PretrainedConfig): + sub_config = sub_config + image_size = getattr(sub_config, "image_size", None) + if image_size != patch_size: + raise ValueError( + f"`config.{sub_config_key}.image_size={image_size}` should match `config.patch_size={patch_size}`." + ) + else: + raise TypeError( + f"Invalid type for `sub_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(sub_config)}." + ) + + setattr(self, sub_config_key, sub_config) + + +__all__ = ["DepthProConfig"] diff --git a/docs/transformers/build/lib/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/docs/transformers/build/lib/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..b24c6a5174f06159f21e9fb5a8c6d12cf2bafbd2 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -0,0 +1,254 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import gc +import os + +import regex as re +import torch +from huggingface_hub import hf_hub_download + +from transformers import ( + DepthProConfig, + DepthProForDepthEstimation, + DepthProImageProcessorFast, +) + + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + + # encoder + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2", + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1", + + # fov + r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.fov_encoder.model.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.fov_encoder.model.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1", + r"fov.downsample.0.(weight|bias)": r"fov_model.conv.\1", + r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1", + r"fov.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", + + # head + r"head.(\d+).(weight|bias)": r"head.layers.\1.\2", + + # upsamples + r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1", + r"encoder.upsample_latent(\d+).(\d+).(weight|bias)": lambda match: ( + f"depth_pro.neck.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" + ), + r"encoder.upsample(\d+).(\d+).(weight|bias)": lambda match: ( + f"depth_pro.neck.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" + ), + + # projections between encoder and fusion + r"decoder.convs.(\d+).weight": lambda match: ( + f"depth_pro.neck.feature_projection.projections.{4-int(match.group(1))}.weight" + ), + + # fusion stage + r"decoder.fusions.([1234]).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.intermediate.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" + ), + r"decoder.fusions.0.resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.final.residual_layer{match.group(1)}.convolution{(int(match.group(2))+1)//2}.{match.group(3)}" + ), + r"decoder.fusions.([1234]).out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.intermediate.{4-int(match.group(1))}.projection.{match.group(2)}" + ), + r"decoder.fusions.0.out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.final.projection.{match.group(1)}" + ), + r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( + f"fusion_stage.intermediate.{4-int(match.group(1))}.deconv.{match.group(2)}" + ), +} +# fmt: on + + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + + +def get_qkv_state_dict(key, parameter): + """ + new key which looks like this + xxxx.(q|k|v).xxx (m, n) + + is converted to + xxxx.q.xxxx (m//3, n) + xxxx.k.xxxx (m//3, n) + xxxx.v.xxxx (m//3, n) + """ + qkv_state_dict = {} + placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] + replacements_vals = torch.split( + parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 + ) + for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): + qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val + return qkv_state_dict + + +def write_model( + hf_repo_id: str, + output_dir: str, + safe_serialization: bool = True, +): + os.makedirs(output_dir, exist_ok=True) + + # ------------------------------------------------------------ + # Create and save config + # ------------------------------------------------------------ + + # create config + backbone_config = { + "model_type": "dinov2", + "num_hidden_layers": 24, + "patch_size": 16, + "hidden_size": 1024, + "num_attention_heads": 16, + "image_size": 384, + "use_mask_token": False, + } + config = DepthProConfig( + # original implementation uses same config for all 3 models + image_model_config=backbone_config, + patch_model_config=backbone_config, + fov_model_config=backbone_config, + use_fov_model=True, + ) + + # save config + config.save_pretrained(output_dir) + print("Model config saved successfully...") + + # ------------------------------------------------------------ + # Convert weights + # ------------------------------------------------------------ + + # download and load state_dict from hf repo + file_path = hf_hub_download(hf_repo_id, "depth_pro.pt") + loaded = torch.load(file_path, weights_only=True) + + print("Converting model...") + all_keys = list(loaded.keys()) + new_keys = convert_old_keys_to_new_keys(all_keys) + + state_dict = {} + for key in all_keys: + new_key = new_keys[key] + current_parameter = loaded.pop(key) + + if "qkv" in key: + qkv_state_dict = get_qkv_state_dict(new_key, current_parameter) + state_dict.update(qkv_state_dict) + else: + state_dict[new_key] = current_parameter + + print("Loading the checkpoint in a DepthPro model.") + model = DepthProForDepthEstimation(config) + model.load_state_dict(state_dict, strict=True, assign=True) + print("Checkpoint loaded successfully.") + + print("Saving the model.") + model.save_pretrained(output_dir, safe_serialization=safe_serialization) + del state_dict, model + + # Safety check: reload the converted model + gc.collect() + print("Reloading the model to check if it's saved correctly.") + model = DepthProForDepthEstimation.from_pretrained(output_dir, device_map="auto") + print("Model reloaded successfully.") + return model + + +def write_image_processor(output_dir: str): + image_processor = DepthProImageProcessorFast() + image_processor.save_pretrained(output_dir) + return image_processor + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--hf_repo_id", + default="apple/DepthPro", + help="Location of official weights from apple on HF", + ) + parser.add_argument( + "--output_dir", + default="apple_DepthPro", + help="Location to write the converted model and processor", + ) + parser.add_argument( + "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." + ) + parser.add_argument( + "--push_to_hub", + action=argparse.BooleanOptionalAction, + help="Whether or not to push the converted model to the huggingface hub.", + ) + parser.add_argument( + "--hub_repo_id", + default="apple/DepthPro-hf", + help="Huggingface hub repo to write the converted model and processor", + ) + args = parser.parse_args() + + model = write_model( + hf_repo_id=args.hf_repo_id, + output_dir=args.output_dir, + safe_serialization=args.safe_serialization, + ) + + image_processor = write_image_processor( + output_dir=args.output_dir, + ) + + if args.push_to_hub: + print("Pushing to hub...") + model.push_to_hub(args.hub_repo_id) + image_processor.push_to_hub(args.hub_repo_id) + + +if __name__ == "__main__": + main() diff --git a/docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro.py b/docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..978a32da7e911aaf3756a918e1b1fc5bf30a3183 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro.py @@ -0,0 +1,392 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DepthPro.""" + +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +import numpy as np + +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from .modeling_depth_pro import DepthProDepthEstimatorOutput + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + make_list_of_images, + pil_torch_interpolation_mapping, + to_numpy_array, + valid_images, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + logging, + requires_backends, +) + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +@requires(backends=("torchvision", "torch")) +class DepthProImageProcessor(BaseImageProcessor): + r""" + Constructs a DepthPro image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ): + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized images. + """ + requires_backends(self, "torch") + + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + + # we use torch interpolation instead of image.resize because DepthProImageProcessor + # rescales, then normalizes, which may cause some values to become negative, before resizing the image. + # image.resize expects all values to be in range [0, 1] or [0, 255] and throws an exception otherwise, + # however pytorch interpolation works with negative values. + # relevant issue here: https://github.com/huggingface/transformers/issues/34920 + # input should be (B, C, H, W) + image_tensor = torch.from_numpy(image).unsqueeze(0) + resized_image = torch.nn.functional.interpolate( + input=image_tensor, + size=output_size, + mode=pil_torch_interpolation_mapping[resample].value, + ) + resized_image = resized_image.squeeze(0).numpy() + return resized_image + + def _validate_input_arguments( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + ): + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + # depth-pro rescales and normalizes the image before resizing it + # uses torch interpolation which requires ChannelDimension.FIRST + if do_resize: + image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) + image = self.resize(image=image, size=size, resample=resample) + image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) + else: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + + all_images.append(image) + + data = {"pixel_values": all_images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "DepthProDepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: + """ + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. + + Args: + outputs ([`DepthProDepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + fov = outputs.field_of_view + + batch_size = len(predicted_depth) + + if target_sizes is not None and batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + results = [] + fov = [None] * batch_size if fov is None else fov + target_sizes = [None] * batch_size if target_sizes is None else target_sizes + for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None + if target_size is not None: + # scale image w.r.t fov + if fov_value is not None: + width = target_size[1] + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length + + # interpolate + depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + ).squeeze() + + # inverse the depth + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + + results.append( + { + "predicted_depth": depth, + "field_of_view": fov_value, + "focal_length": focal_length, + } + ) + + return results + + +__all__ = ["DepthProImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..a0a4cd96b861ae286c5af7555c891e87a12ca86b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for DepthPro.""" + +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +from ...image_processing_base import BatchFeature +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BaseImageProcessorFast, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + PILImageResampling, + SizeDict, +) +from ...utils import ( + TensorType, + add_start_docstrings, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + logging, + requires_backends, +) +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from .modeling_depth_pro import DepthProDepthEstimatorOutput + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from ...image_utils import pil_torch_interpolation_mapping + + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +@add_start_docstrings( + "Constructs a fast DepthPro image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, +) +@requires(backends=("torchvision", "torch")) +class DepthProImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 1536, "width": 1536} + do_resize = True + do_rescale = True + do_normalize = True + + # DepthPro resizes image after rescaling and normalizing, + # which makes it different from BaseImageProcessorFast._preprocess + def _preprocess( + self, + images: List["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + return_tensors: Optional[Union[str, TensorType]], + ) -> BatchFeature: + # Group images by size for batched scaling + grouped_images, grouped_images_index = group_images_by_shape(images) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + if do_resize: + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=False, + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + # Copied from transformers.models.depth_pro.image_processing_depth_pro.DepthProImageProcessor.post_process_depth_estimation + def post_process_depth_estimation( + self, + outputs: "DepthProDepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: + """ + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. + + Args: + outputs ([`DepthProDepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + fov = outputs.field_of_view + + batch_size = len(predicted_depth) + + if target_sizes is not None and batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + results = [] + fov = [None] * batch_size if fov is None else fov + target_sizes = [None] * batch_size if target_sizes is None else target_sizes + for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None + if target_size is not None: + # scale image w.r.t fov + if fov_value is not None: + width = target_size[1] + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length + + # interpolate + depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + ).squeeze() + + # inverse the depth + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + + results.append( + { + "predicted_depth": depth, + "field_of_view": fov_value, + "focal_length": focal_length, + } + ) + + return results + + +__all__ = ["DepthProImageProcessorFast"] diff --git a/docs/transformers/build/lib/transformers/models/depth_pro/modeling_depth_pro.py b/docs/transformers/build/lib/transformers/models/depth_pro/modeling_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..c26bf484f50a5ee3eb8ad6bd710ac14c131c4ce8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/depth_pro/modeling_depth_pro.py @@ -0,0 +1,1218 @@ +# coding=utf-8 +# Copyright 2024 The Apple Research Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DepthPro model.""" + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ..auto import AutoModel +from .configuration_depth_pro import DepthProConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +class DepthProOutput(ModelOutput): + """ + Base class for DepthPro's outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*): + Features from encoders. Can be a single feature or a list of features. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer and the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + features: Union[torch.FloatTensor, List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class DepthProDepthEstimatorOutput(ModelOutput): + """ + Base class for DepthProForDepthEstimation's output. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): + Field of View Scaler. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer and the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + predicted_depth: Optional[torch.FloatTensor] = None + field_of_view: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor: + """Creates Patches from Batch.""" + batch_size, num_channels, height, width = pixel_values.shape + + if height == width == patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + + stride = torch_int(patch_size * (1 - overlap_ratio)) + + patches = F.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)) + patches = patches.permute(2, 0, 1) + patches = patches.reshape(-1, num_channels, patch_size, patch_size) + + return patches + + +def reshape_features(hidden_states: torch.Tensor) -> torch.Tensor: + """Discard class token and reshape 1D feature map to a 2D grid.""" + n_samples, seq_len, hidden_size = hidden_states.shape + size = torch_int(seq_len**0.5) + + hidden_states = hidden_states[:, -(size**2) :, :] # remove special tokens if there are any + hidden_states = hidden_states.reshape(n_samples, size, size, hidden_size) + hidden_states = hidden_states.permute(0, 3, 1, 2) + + return hidden_states + + +def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch.Tensor: + """Merges smaller patches into image-like feature map.""" + n_patches, hidden_size, out_size, out_size = patches.shape + n_patches_per_batch = n_patches // batch_size + sqrt_n_patches_per_batch = torch_int(n_patches_per_batch**0.5) + new_out_size = sqrt_n_patches_per_batch * out_size + + if n_patches == batch_size: + # merge only if the patches were created from scaled image + # patches are not created when scaled image size is equal to patch size + return patches + + if n_patches_per_batch < 4: + # for each batch, atleast 4 small patches are required to + # recreate a large square patch from merging them and later padding is applied + # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding) + # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later) + # 5 x (8x8) patches becomes 1 x (16x16) patch (extra patch ignored, padding later) + # 9 x (8x8) patches becomes 1 x (24x24) patch (padding later) + # thus the following code only rearranges the patches and removes extra ones + padding = 0 + + # make sure padding is not large enough to remove more than half of the patch + padding = min(out_size // 4, padding) + + if padding == 0: + # faster when no padding is required + merged = patches.reshape(n_patches_per_batch, batch_size, hidden_size, out_size, out_size) + merged = merged.permute(1, 2, 0, 3, 4) + merged = merged[:, :, : sqrt_n_patches_per_batch**2, :, :] + merged = merged.reshape( + batch_size, hidden_size, sqrt_n_patches_per_batch, sqrt_n_patches_per_batch, out_size, out_size + ) + merged = merged.permute(0, 1, 2, 4, 3, 5) + merged = merged.reshape(batch_size, hidden_size, new_out_size, new_out_size) + else: + # padding example: + # let out_size = 8, new_out_size = 32, padding = 2 + # each patch is separated by "|" + # and padding is applied to the merging edges of each patch + # 00 01 02 03 04 05 06 07 | 08 09 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 + # 00 01 02 03 04 05 -- -- | -- -- 10 11 12 13 -- -- | -- -- 18 19 20 21 -- -- | -- -- 26 27 28 29 30 31 + i = 0 + boxes = [] + for h in range(sqrt_n_patches_per_batch): + boxes_in_row = [] + for w in range(sqrt_n_patches_per_batch): + box = patches[batch_size * i : batch_size * (i + 1)] + + # collect paddings + paddings = [0, 0, 0, 0] + if h != 0: + # remove pad from height if box is not at top border + paddings[0] = padding + if w != 0: + # remove pad from width if box is not at left border + paddings[2] = padding + if h != sqrt_n_patches_per_batch - 1: + # remove pad from height if box is not at bottom border + paddings[1] = padding + if w != sqrt_n_patches_per_batch - 1: + # remove pad from width if box is not at right border + paddings[3] = padding + + # remove paddings + _, _, box_h, box_w = box.shape + pad_top, pad_bottom, pad_left, pad_right = paddings + box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right] + + boxes_in_row.append(box) + i += 1 + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + merged = torch.cat(boxes, dim=-2) + + return merged + + +def reconstruct_feature_maps( + hidden_state: torch.Tensor, batch_size: int, padding: int, output_size: Tuple[float, float] +) -> torch.Tensor: + """ + Reconstructs feature maps from the hidden state produced by any of the encoder. Converts the hidden state of shape + `(n_patches_per_batch * batch_size, seq_len, hidden_size)` to feature maps of shape + `(batch_size, hidden_size, output_size[0], output_size[1])`. + + Args: + hidden_state (torch.Tensor): Input tensor of shape `(n_patches_per_batch * batch_size, seq_len, hidden_size)` + representing the encoded patches. + batch_size (int): The number of samples in a batch. + padding (int): The amount of padding to be removed when merging patches. + output_size (Tuple[float, float]): The desired output size for the feature maps, specified as `(height, width)`. + + Returns: + torch.Tensor: Reconstructed feature maps of shape `(batch_size, hidden_size, output_size[0], output_size[1])`. + """ + # reshape back to image like + features = reshape_features(hidden_state) + + # merge all patches in a batch to create one large patch per batch + features = merge_patches( + features, + batch_size=batch_size, + padding=padding, + ) + + # interpolate patches to base size + features = F.interpolate( + features, + size=output_size, + mode="bilinear", + align_corners=False, + ) + + return features + + +class DepthProPatchEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value + + self.n_scaled_images = len(config.scaled_images_ratios) + self.n_intermediate_hooks = len(config.intermediate_hook_ids) + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.patch_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> List[torch.Tensor]: + batch_size, num_channels, height, width = pixel_values.shape + + if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size: + raise ValueError( + f"Image size {height}x{width} is too small to be scaled " + f"with scaled_images_ratios={self.scaled_images_ratios} " + f"when patch_size={self.config.patch_size}." + ) + + # STEP 1: create 3-level image + + scaled_images = [] + for ratio in self.scaled_images_ratios: + scaled_images.append( + F.interpolate( + pixel_values, + scale_factor=ratio, + mode="bilinear", + align_corners=False, + ) + ) + + # STEP 2: create patches + + for i in range(self.n_scaled_images): + scaled_images[i] = split_to_patches( + scaled_images[i], + patch_size=self.config.patch_size, + overlap_ratio=self.scaled_images_overlap_ratios[i], + ) + n_patches_per_scaled_image = [len(i) for i in scaled_images] + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + + # STEP 3: apply patch encoder + + encodings = self.model( + # each patch is processed as a separate batch + patches, + head_mask=head_mask, + # required for intermediate features + output_hidden_states=self.n_intermediate_hooks > 0, + ) + + scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1]) + # -1 (reverse list) as patch encoder returns high res patches first, we need low res first + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram + + scaled_images_features = [] + for i in range(self.n_scaled_images): + hidden_state = scaled_images_last_hidden_state[i] + batch_size = batch_size + padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[i])) + output_height = base_height * 2**i + output_width = base_width * 2**i + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=padding, + output_size=(output_height, output_width), + ) + scaled_images_features.append(features) + + # STEP 5: get intermediate features - (1-2) in diagram + + intermediate_features = [] + for i in range(self.n_intermediate_hooks): + # +1 to correct index position as hidden_states contain embedding output as well + hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1] + padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1])) + output_height = base_height * 2 ** (self.n_scaled_images - 1) + output_width = base_width * 2 ** (self.n_scaled_images - 1) + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=padding, + output_size=(output_height, output_width), + ) + intermediate_features.append(features) + + # STEP 7: combine all features + features = [*scaled_images_features, *intermediate_features] + + return features + + +class DepthProImageEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.image_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image for image_encoder + size = self.config.image_model_config.image_size + pixel_values = F.interpolate( + pixel_values, + size=(size, size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + encodings[0], + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + if not return_dict: + return (encodings[0], features) + encodings[2:] # ignore last_hidden_state and poooler output + + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) + + +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value + + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) + + self.patch_encoder = DepthProPatchEncoder(config) + self.image_encoder = DepthProImageEncoder(config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + patch_features = self.patch_encoder( + pixel_values, + head_mask=head_mask, + ) + image_encodings = self.image_encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = image_encodings[1] # index 1 contains features + + features = [image_features, *patch_features] + + if not return_dict: + return (image_encodings[0], features) + image_encodings[2:] + + return DepthProOutput( + last_hidden_state=image_encodings.last_hidden_state, + features=features, + hidden_states=image_encodings.hidden_states, + attentions=image_encodings.attentions, + ) + + +class DepthProFeatureUpsampleBlock(nn.Module): + def __init__( + self, + config: DepthProConfig, + input_dims: int, + intermediate_dims: int, + output_dims: int, + n_upsample_layers: int, + use_proj: bool = True, + bias: bool = False, + ): + super().__init__() + self.config = config + self.layers = nn.ModuleList() + + # create first projection layer + if use_proj: + proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.layers.append(proj) + + # create following upsample layers + for i in range(n_upsample_layers): + in_channels = intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.layers.append(layer) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFeatureUpsample(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.n_scaled_images = len(self.config.scaled_images_ratios) + self.n_intermediate_hooks = len(self.config.intermediate_hook_ids) + + # for image_features + self.image_block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.image_model_config.hidden_size, + intermediate_dims=config.image_model_config.hidden_size, + output_dims=config.scaled_images_feature_dims[0], + n_upsample_layers=1, + use_proj=False, + bias=True, + ) + + # for scaled_images_features + self.scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(config.scaled_images_feature_dims): + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.scaled_images.append(block) + + # for intermediate_features + self.intermediate = nn.ModuleList() + for i, feature_dims in enumerate(config.intermediate_feature_dims): + intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=2 + i, + ) + self.intermediate.append(block) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + features[0] = self.image_block(features[0]) + + for i in range(self.n_scaled_images): + features[i + 1] = self.scaled_images[i](features[i + 1]) + + for i in range(self.n_intermediate_hooks): + features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1]) + + return features + + +class DepthProFeatureProjection(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels in enumerate(combined_feature_dims): + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + projected_features = [] + for i, projection in enumerate(self.projections): + upsampled_feature = projection(features[i]) + projected_features.append(upsampled_feature) + return projected_features + + +class DepthProNeck(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.feature_upsample = DepthProFeatureUpsample(config) + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0] * 2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.feature_projection = DepthProFeatureProjection(config) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + features = self.feature_upsample(features) + # global features = low res features + image features + global_features = torch.cat((features[1], features[0]), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + features = [global_features, *features[2:]] + features = self.feature_projection(features) + return features + + +# General docstring +_CONFIG_FOR_DOC = "DepthProConfig" + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + +DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFovModel` to generate the field of view. +""" + + +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _supports_sdpa = True + _no_split_modules = ["DepthProPreActResidualLayer"] + _keys_to_ignore_on_load_unexpected = ["fov_model.*"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + module.bias.data.zero_() + + +@add_start_docstrings( + "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", + DEPTH_PRO_START_DOCSTRING, +) +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.encoder = DepthProEncoder(config) + self.neck = DepthProNeck(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.encoder.image_encoder.model.get_input_embeddings() + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, DepthProOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, DepthProModel + + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> checkpoint = "apple/DepthPro-hf" + >>> processor = AutoProcessor.from_pretrained(checkpoint) + >>> model = DepthProModel.from_pretrained(checkpoint) + + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... output = model(**inputs) + + >>> output.last_hidden_state.shape + torch.Size([1, 35, 577, 1024]) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encodings = self.encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + features = encodings[1] # index 1 contains features + features = self.neck(features) + + if not return_dict: + return (encodings[0], features) + encodings[2:] + + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro +class DepthProPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthProConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# except it uses deconv and skip_add and needs no interpolation +class DepthProFeatureFusionLayer(nn.Module): + def __init__(self, config: DepthProConfig, use_deconv: bool = True): + super().__init__() + self.config = config + self.use_deconv = use_deconv + + self.residual_layer1 = DepthProPreActResidualLayer(config) + self.residual_layer2 = DepthProPreActResidualLayer(config) + + if self.use_deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=config.fusion_hidden_size, + out_channels=config.fusion_hidden_size, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor: + if residual is not None: + residual = self.residual_layer1(residual) + hidden_state = hidden_state + residual + + hidden_state = self.residual_layer2(hidden_state) + if self.use_deconv: + hidden_state = self.deconv(hidden_state) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro +# with deconv and reversed layers +class DepthProFeatureFusionStage(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.intermediate = nn.ModuleList() + for _ in range(self.num_layers - 1): + self.intermediate.append(DepthProFeatureFusionLayer(config)) + + # final layer doesnot require deconvolution + self.final = DepthProFeatureFusionLayer(config, use_deconv=False) + + def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: + if self.num_layers != len(hidden_states): + raise ValueError( + f"num_layers={self.num_layers} in DepthProFeatureFusionStage" + f"doesnot match len(hidden_states)={len(hidden_states)}" + ) + + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states[:-1], self.intermediate): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + hidden_state = hidden_states[-1] + fused_hidden_state = self.final(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +class DepthProFovEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.fov_model_config) + self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image for fov_encoder + size = self.config.fov_model_config.image_size + pixel_values = F.interpolate( + pixel_values, + size=(size, size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + ) + hidden_state = encodings[0] + hidden_state = self.neck(hidden_state) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + return features + + +class DepthProFovHead(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + # create initial head layers + self.layers = nn.ModuleList() + for i in range(config.num_fov_head_layers): + self.layers.append( + nn.Conv2d( + math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), + math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), + kernel_size=3, + stride=2, + padding=1, + ) + ) + self.layers.append(nn.ReLU(True)) + # calculate expected shapes to finally generate a scalar output from final head layer + final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) + final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + self.layers.append( + nn.Conv2d( + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernel_size, stride=1, padding=0 + ) + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + features = F.interpolate( + features, + size=(self.out_size, self.out_size), + mode="bilinear", + align_corners=False, + ) + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFovModel(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + + self.fov_encoder = DepthProFovEncoder(config) + self.conv = nn.Conv2d( + self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1 + ) + self.activation = nn.ReLU(inplace=True) + self.head = DepthProFovHead(config) + + def forward( + self, + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + fov_features = self.fov_encoder(pixel_values, head_mask) + + global_features = self.conv(global_features) + global_features = self.activation(global_features) + + fov_features = fov_features + global_features + fov_output = self.head(fov_features) + fov_output = fov_output.flatten() + + return fov_output + + +class DepthProDepthEstimationHead(nn.Module): + """ + The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. + This module comprises a sequence of convolutional and transposed convolutional layers + that process the feature map from the fusion to produce a single-channel depth map. + Key operations include dimensionality reduction and upsampling to match the input resolution. + """ + + def __init__(self, config): + super().__init__() + self.config = config + + features = config.fusion_hidden_size + self.layers = nn.ModuleList( + [ + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features // 2, + out_channels=features // 2, + kernel_size=2, + stride=2, + padding=0, + bias=True, + ), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ] + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + hidden_states = layer(hidden_states) + + predicted_depth = hidden_states.squeeze(dim=1) + return predicted_depth + + +@add_start_docstrings( + """ + DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). + """, + DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING, +) +class DepthProForDepthEstimation(DepthProPreTrainedModel): + def __init__(self, config, use_fov_model=None): + super().__init__(config) + self.config = config + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model + + # dinov2 (vit) like encoders + self.depth_pro = DepthProModel(config) + + # dpt (vit) like fusion stage + self.fusion_stage = DepthProFeatureFusionStage(config) + + # depth estimation head + self.head = DepthProDepthEstimationHead(config) + + # dinov2 (vit) like encoder + self.fov_model = DepthProFovModel(config) if self.use_fov_model else None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], DepthProDepthEstimatorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> checkpoint = "apple/DepthPro-hf" + >>> processor = AutoImageProcessor.from_pretrained(checkpoint) + >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) + + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt").to(device) + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = processor.post_process_depth_estimation( + ... outputs, target_sizes=[(image.height, image.width)], + ... ) + + >>> # get the field of view (fov) predictions + >>> field_of_view = post_processed_output[0]["field_of_view"] + >>> focal_length = post_processed_output[0]["focal_length"] + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + depth_pro_outputs = self.depth_pro( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + features = depth_pro_outputs.features + fused_hidden_states = self.fusion_stage(features) + predicted_depth = self.head(fused_hidden_states[-1]) + + if self.use_fov_model: + # frozen features from encoder are used + features_for_fov = features[0].detach() + fov = self.fov_model( + pixel_values=pixel_values, + global_features=features_for_fov, + head_mask=head_mask, + ) + else: + fov = None + + if not return_dict: + outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] + return tuple(v for v in outputs if v is not None) + + return DepthProDepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + field_of_view=fov, + hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, + ) + + +__all__ = ["DepthProPreTrainedModel", "DepthProModel", "DepthProForDepthEstimation"] diff --git a/docs/transformers/build/lib/transformers/models/detr/__init__.py b/docs/transformers/build/lib/transformers/models/detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b8aae5e70381b21772bdec395693c007a9c02b7 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/detr/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_detr import * + from .feature_extraction_detr import * + from .image_processing_detr import * + from .image_processing_detr_fast import * + from .modeling_detr import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/detr/configuration_detr.py b/docs/transformers/build/lib/transformers/models/detr/configuration_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..3dd37c36a4ae843b93b5ae165c7d94937b5f3449 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/detr/configuration_detr.py @@ -0,0 +1,289 @@ +# coding=utf-8 +# Copyright 2021 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DETR model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class DetrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DetrModel`]. It is used to instantiate a DETR + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DETR + [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 100): + Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetrModel`] can + detect in a single image. For COCO, we recommend 100 queries. + d_model (`int`, *optional*, defaults to 256): + This parameter is a general dimension parameter, defining dimensions for components such as the encoder layer and projection parameters in the decoder layer, among others. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, `True`): + Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + + Examples: + + ```python + >>> from transformers import DetrConfig, DetrModel + + >>> # Initializing a DETR facebook/detr-resnet-50 style configuration + >>> configuration = DetrConfig() + + >>> # Initializing a model (with random weights) from the facebook/detr-resnet-50 style configuration + >>> model = DetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "detr" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + use_timm_backbone=True, + backbone_config=None, + num_channels=3, + num_queries=100, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + backbone_kwargs=None, + dilation=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + **kwargs, + ): + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): + if backbone_config is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + backbone = None + # set timm attributes to None + dilation = None + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels + self.num_queries = num_queries + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.num_hidden_layers = encoder_layers + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.backbone_kwargs = backbone_kwargs + self.dilation = dilation + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @classmethod + def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): + """Instantiate a [`DetrConfig`] (or a derived class) from a pre-trained backbone model configuration. + + Args: + backbone_config ([`PretrainedConfig`]): + The backbone configuration. + Returns: + [`DetrConfig`]: An instance of a configuration object + """ + return cls(backbone_config=backbone_config, **kwargs) + + +class DetrOnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("pixel_mask", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 + + +__all__ = ["DetrConfig", "DetrOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..ba985145014c50d7f1b56b383652b794f1cac8e6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DETR checkpoints with timm backbone.""" + +import argparse +import json +from collections import OrderedDict +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# here we list all keys to be renamed (original name on the left, our name on the right) +rename_keys = [] +for i in range(6): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + +# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads +rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), + ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ] +) + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def rename_backbone_keys(state_dict): + new_state_dict = OrderedDict() + for key, value in state_dict.items(): + if "backbone.0.body" in key: + new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + + return new_state_dict + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "detr." + + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # next: transformer decoder (which is a bit more complex because it also includes cross-attention) + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # read in weights + bias of input projection layer of cross-attention + in_proj_weight_cross_attn = state_dict.pop( + f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" + ) + in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") + # next, add query, keys and values (in that order) of cross-attention to the state dict + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_detr_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our DETR structure. + """ + + # load default config + config = DetrConfig() + # set backbone and dilation attributes + if "resnet101" in model_name: + config.backbone = "resnet101" + if "dc5" in model_name: + config.dilation = True + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load image processor + format = "coco_panoptic" if is_panoptic else "coco_detection" + image_processor = DetrImageProcessor(format=format) + + # prepare image + img = prepare_img() + encoding = image_processor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info(f"Converting model {model_name}...") + + # load original model from torch hub + detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval() + state_dict = detr.state_dict() + # rename keys + for src, dest in rename_keys: + if is_panoptic: + src = "detr." + src + rename_key(state_dict, src, dest) + state_dict = rename_backbone_keys(state_dict) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "detr.model." if is_panoptic else "model." + for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("detr") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["detr.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["detr." + key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + # verify our conversion + original_outputs = detr(pixel_values) + outputs = model(pixel_values) + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + + # Save model and image processor + logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/docs/transformers/build/lib/transformers/models/detr/convert_detr_to_pytorch.py b/docs/transformers/build/lib/transformers/models/detr/convert_detr_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..6ba6a0e2920aa08718f2fe3fe5eec0ce6fcecb9b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/detr/convert_detr_to_pytorch.py @@ -0,0 +1,385 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DETR checkpoints with native (Transformers) backbone.""" + +import argparse +import json +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_detr_config(model_name): + # initialize config + if "resnet-50" in model_name: + backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") + elif "resnet-101" in model_name: + backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101") + else: + raise ValueError("Model name should include either resnet50 or resnet101") + + config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config) + + # set label attributes + is_panoptic = "panoptic" in model_name + if is_panoptic: + config.num_labels = 250 + else: + config.num_labels = 91 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + return config, is_panoptic + + +def create_rename_keys(config): + # here we list all keys to be renamed (original name on the left, our name on the right) + rename_keys = [] + + # stem + # fmt: off + rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight")) + rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight")) + rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias")) + rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean")) + rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var")) + # stages + for stage_idx in range(len(config.backbone_config.depths)): + for layer_idx in range(config.backbone_config.depths[stage_idx]): + # shortcut + if layer_idx == 0: + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", + ) + ) + # 3 convs + for i in range(3): + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", + ) + ) + rename_keys.append( + ( + f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", + f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", + ) + ) + # fmt: on + + for i in range(config.encoder_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append( + ( + f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", + f"encoder.layers.{i}.self_attn.out_proj.weight", + ) + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias") + ) + rename_keys.append( + (f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight") + ) + rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", + f"decoder.layers.{i}.self_attn.out_proj.weight", + ) + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", + f"decoder.layers.{i}.encoder_attn.out_proj.weight", + ) + ) + rename_keys.append( + ( + f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", + f"decoder.layers.{i}.encoder_attn.out_proj.bias", + ) + ) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) + rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") + ) + rename_keys.append( + (f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight") + ) + rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) + + # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads + rename_keys.extend( + [ + ("input_proj.weight", "input_projection.weight"), + ("input_proj.bias", "input_projection.bias"), + ("query_embed.weight", "query_position_embeddings.weight"), + ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), + ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), + ("class_embed.weight", "class_labels_classifier.weight"), + ("class_embed.bias", "class_labels_classifier.bias"), + ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), + ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), + ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), + ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), + ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), + ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), + ] + ) + + return rename_keys + + +def rename_key(state_dict, old, new): + val = state_dict.pop(old) + state_dict[new] = val + + +def read_in_q_k_v(state_dict, is_panoptic=False): + prefix = "" + if is_panoptic: + prefix = "detr." + + # first: transformer encoder + for i in range(6): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # next: transformer decoder (which is a bit more complex because it also includes cross-attention) + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + # read in weights + bias of input projection layer of cross-attention + in_proj_weight_cross_attn = state_dict.pop( + f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" + ) + in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") + # next, add query, keys and values (in that order) of cross-attention to the state dict + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] + state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] + state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] + state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our DETR structure. + """ + + # load default config + config, is_panoptic = get_detr_config(model_name) + + # load original model from torch hub + model_name_to_original_name = { + "detr-resnet-50": "detr_resnet50", + "detr-resnet-101": "detr_resnet101", + } + logger.info(f"Converting model {model_name}...") + detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval() + state_dict = detr.state_dict() + # rename keys + for src, dest in create_rename_keys(config): + if is_panoptic: + src = "detr." + src + rename_key(state_dict, src, dest) + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, is_panoptic=is_panoptic) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "detr.model." if is_panoptic else "model." + for key in state_dict.copy().keys(): + if is_panoptic: + if ( + key.startswith("detr") + and not key.startswith("class_labels_classifier") + and not key.startswith("bbox_predictor") + ): + val = state_dict.pop(key) + state_dict["detr.model" + key[4:]] = val + elif "class_labels_classifier" in key or "bbox_predictor" in key: + val = state_dict.pop(key) + state_dict["detr." + key] = val + elif key.startswith("bbox_attention") or key.startswith("mask_head"): + continue + else: + val = state_dict.pop(key) + state_dict[prefix + key] = val + else: + if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + + # finally, create HuggingFace model and load state dict + model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + + # verify our conversion on an image + format = "coco_panoptic" if is_panoptic else "coco_detection" + processor = DetrImageProcessor(format=format) + + encoding = processor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + + original_outputs = detr(pixel_values) + outputs = model(pixel_values) + + assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3) + assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3) + if is_panoptic: + assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + # Save model and image processor + logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + # Upload model and image processor to the hub + logger.info("Uploading PyTorch model and image processor to the hub...") + model.push_to_hub(f"nielsr/{model_name}") + processor.push_to_hub(f"nielsr/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", + default="detr-resnet-50", + type=str, + choices=["detr-resnet-50", "detr-resnet-101"], + help="Name of the DETR model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") + args = parser.parse_args() + convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/docs/transformers/build/lib/transformers/models/detr/feature_extraction_detr.py b/docs/transformers/build/lib/transformers/models/detr/feature_extraction_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..a81f83c8c313bdb8a904f0b359360c0e100a83d9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/detr/feature_extraction_detr.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for DETR.""" + +import warnings + +from ...image_transforms import rgb_to_id as _rgb_to_id +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_detr import DetrImageProcessor + + +logger = logging.get_logger(__name__) + + +def rgb_to_id(x): + warnings.warn( + "rgb_to_id has moved and will not be importable from this module from v5. " + "Please import from transformers.image_transforms instead.", + FutureWarning, + ) + return _rgb_to_id(x) + + +@requires(backends=("vision",)) +class DetrFeatureExtractor(DetrImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class DetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers." + " Please use DetrImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["DetrFeatureExtractor"] diff --git a/docs/transformers/build/lib/transformers/models/detr/image_processing_detr_fast.py b/docs/transformers/build/lib/transformers/models/detr/image_processing_detr_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..dc14ec61f0659a316c366484655cac9eee367ec5 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/detr/image_processing_detr_fast.py @@ -0,0 +1,1312 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +from ...image_processing_utils import BatchFeature, get_size_dict +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) +from ...image_transforms import ( + center_to_corners_format, + corners_to_center_format, + id_to_rgb, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + validate_annotations, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + add_start_docstrings, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + is_vision_available, + logging, +) +from ...utils.import_utils import requires +from .image_processing_detr import ( + compute_segments, + convert_segmentation_to_rle, + get_size_with_aspect_ratio, + remove_low_and_no_objects, +) + + +if is_torch_available(): + import torch + from torch import nn + +if is_vision_available(): + import PIL + + +if is_torchvision_v2_available(): + from torchvision.io import read_image + from torchvision.transforms.v2 import functional as F +elif is_torchvision_available(): + from torchvision.io import read_image + from torchvision.transforms import functional as F + + +logger = logging.get_logger(__name__) + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 +def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`List[List[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8, device=device) + mask = torch.any(mask, axis=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, axis=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device) + + return masks + + +# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DETR. + """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) + + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } + + if keypoints: + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device) + new_target["masks"] = masks[keep] + + return new_target + + +def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + y = torch.arange(0, h, dtype=torch.float32, device=masks.device) + x = torch.arange(0, w, dtype=torch.float32, device=masks.device) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = torch.meshgrid(y, x, indexing="ij") + + x_mask = masks * torch.unsqueeze(x, 0) + x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0] + x_min = ( + torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + y_mask = masks * torch.unsqueeze(y, 0) + y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0] + y_min = ( + torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + return torch.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. + """ + if isinstance(color, torch.Tensor) and len(color.shape) == 3: + if color.dtype == torch.uint8: + color = color.to(torch.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def prepare_coco_panoptic_annotation( + image: torch.Tensor, + target: Dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> Dict: + """ + Prepare a coco panoptic annotation for DETR. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = torch.as_tensor( + [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device + ) + new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + + if "segments_info" in target: + masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device) + masks = rgb_to_id(masks) + + ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device) + masks = masks == ids[:, None, None] + masks = masks.to(torch.bool) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = torch.as_tensor( + [segment_info["category_id"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["iscrowd"] = torch.as_tensor( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["area"] = torch.as_tensor( + [segment_info["area"] for segment_info in target["segments_info"]], + dtype=torch.float32, + device=image.device, + ) + + return new_target + + +class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + do_pad: Optional[bool] + pad_size: Optional[Dict[str, int]] + return_segmentation_masks: Optional[bool] + + +@add_start_docstrings( + "Constructs a fast Detr image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + """ + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + """, +) +@requires(backends=("torchvision", "torch")) +class DetrImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_DEFAULT_MEAN + image_std = IMAGENET_DEFAULT_STD + format = AnnotationFormat.COCO_DETECTION + do_resize = True + do_rescale = True + do_normalize = True + do_pad = True + size = {"shortest_edge": 800, "longest_edge": 1333} + default_to_square = False + model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = DetrFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None: + if "pad_and_return_pixel_mask" in kwargs: + kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + + size = kwargs.pop("size", None) + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + self.size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + do_convert_annotations = kwargs.get("do_convert_annotations", None) + do_normalize = kwargs.get("do_normalize", None) + if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None: + self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize + + super().__init__(**kwargs) + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `DetrImageProcessorFast.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: torch.Tensor, + target: Dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into DETR model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: "F.InterpolationMode" = None, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + def resize_annotation( + self, + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + interpolation: "F.InterpolationMode" = None, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): + The resampling filter to use when resizing the masks. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + def _update_annotation_for_padded_image( + self, + annotation: Dict, + input_image_size: Tuple[int, int], + output_image_size: Tuple[int, int], + padding, + update_bboxes, + ) -> Dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size)) + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = F.pad( + masks, + padding, + fill=0, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + def pad( + self, + image: torch.Tensor, + padded_size: Tuple[int, int], + annotation: Optional[Dict[str, Any]] = None, + update_bboxes: bool = True, + fill: int = 0, + ): + original_size = image.size()[-2:] + padding_bottom = padded_size[0] - original_size[0] + padding_right = padded_size[1] - original_size[1] + if padding_bottom < 0 or padding_right < 0: + raise ValueError( + f"Padding dimensions are negative. Please make sure that the padded size is larger than the " + f"original size. Got padded size: {padded_size}, original size: {original_size}." + ) + if original_size != padded_size: + padding = [0, 0, padding_right, padding_bottom] + image = F.pad(image, padding, fill=fill) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, original_size, padded_size, padding, update_bboxes + ) + + # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device) + pixel_mask[: original_size[0], : original_size[1]] = 1 + + return image, pixel_mask, annotation + + @add_start_docstrings( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + """ + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """, + ) + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[DetrFastImageProcessorKwargs], + ) -> BatchFeature: + if "pad_and_return_pixel_mask" in kwargs: + kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + kwargs["size"] = kwargs.pop("max_size") + + return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs) + + def _preprocess( + self, + images: List["torch.Tensor"], + annotations: Optional[Union[AnnotationType, List[AnnotationType]]], + return_segmentation_masks: bool, + masks_path: Optional[Union[str, pathlib.Path]], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + do_convert_annotations: bool, + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + do_pad: bool, + pad_size: Optional[Dict[str, int]], + format: Optional[Union[str, AnnotationFormat]], + return_tensors: Optional[Union[str, TensorType]], + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + """ + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + data = {} + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( + image, + annotation, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=ChannelDimension.FIRST, + ) + + if do_resize: + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], + ) + image = resized_image + # Fused rescale and normalize + image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std) + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None + + if do_pad: + # depends on all resized image shapes so we need another loop + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images) + + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if padded_size == image.size()[-2:]: + padded_images.append(image) + pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) + padded_annotations.append(annotation) + continue + image, pixel_mask, annotation = self.pad( + image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations + ) + padded_images.append(image) + padded_annotations.append(annotation) + pixel_masks.append(pixel_mask) + images = padded_images + annotations = padded_annotations if annotations is not None else None + data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) + + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + return encoded_inputs + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = nn.functional.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_segmentation + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch. + + Args: + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. + threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_semantic_segmentation`.", + ) + out_logits, raw_masks = outputs.logits, outputs.pred_masks + empty_label = out_logits.shape[-1] - 1 + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) + keep = cur_labels.ne(empty_label) & (cur_scores > threshold) + cur_scores = cur_scores[keep] + cur_labels = cur_labels[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks} + preds.append(predictions) + return preds + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports + PyTorch. + + Args: + results (`List[Dict]`): + Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added. + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_instance_segmentation`.", + ) + + if len(orig_target_sizes) != len(max_target_sizes): + raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = nn.functional.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. + + Args: + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*): + Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction. + If left to None, it will default to the `processed_sizes`. + is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the `is_thing_map` of COCO panoptic. + threshold (`float`, *optional*, defaults to 0.85): + Threshold to use to filter out queries. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for + an image in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_panoptic_segmentation`.", + ) + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + empty_label = out_logits.shape[-1] - 1 + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) + keep = cur_labels.ne(empty_label) & (cur_scores > threshold) + cur_scores = cur_scores[keep] + cur_labels = cur_labels[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_labels): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_labels): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_labels.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_labels)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_labels = cur_labels[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_labels[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_labels + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_object_detection + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None + ): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = nn.functional.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(out_bbox) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_semantic_segmentation + def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None): + """ + Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + Raw outputs of the model. + target_sizes (`List[Tuple[int, int]]`, *optional*): + A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the + batch. If unset, predictions will not be resized. + Returns: + `List[torch.Tensor]`: + A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width) + corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each + `torch.Tensor` correspond to a semantic class id. + """ + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + # Remove the null class `[..., :-1]` + masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] + masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Semantic segmentation logits of shape (batch_size, num_classes, height, width) + segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) + batch_size = class_queries_logits.shape[0] + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + semantic_segmentation = [] + for idx in range(batch_size): + resized_logits = nn.functional.interpolate( + segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = segmentation.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance_segmentation + def post_process_instance_segmentation( + self, + outputs, + threshold: float = 0.5, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + target_sizes: Optional[List[Tuple[int, int]]] = None, + return_coco_annotation: Optional[bool] = False, + ) -> List[Dict]: + """ + Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.5): + The probability score threshold to keep predicted instance masks. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8): + The overlap mask area threshold to merge or discard small disconnected parts within each binary + instance mask. + target_sizes (`List[Tuple]`, *optional*): + List of length (batch_size), where each list item (`Tuple[int, int]]`) corresponds to the requested + final size (height, width) of each prediction. If unset, predictions will not be resized. + return_coco_annotation (`bool`, *optional*): + Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE) + format. + Returns: + `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: + - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or + `List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to + `True`. Set to `None` if no mask if found above `threshold`. + - **segments_info** -- A dictionary that contains additional information on each segment. + - **id** -- An integer representing the `segment_id`. + - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`. + - **score** -- Prediction score of segment with `segment_id`. + """ + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Predicted label and score of each query (batch_size, num_queries) + pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1) + + # Loop over items in batch size + results: List[Dict[str, TensorType]] = [] + + for i in range(batch_size): + mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects( + mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels + ) + + # No mask found + if mask_probs_item.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + # Get segmentation map and segment information of batch item + target_size = target_sizes[i] if target_sizes is not None else None + segmentation, segments = compute_segments( + mask_probs=mask_probs_item, + pred_scores=pred_scores_item, + pred_labels=pred_labels_item, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + label_ids_to_fuse=[], + target_size=target_size, + ) + + # Return segmentation map in run-length encoding (RLE) format + if return_coco_annotation: + segmentation = convert_segmentation_to_rle(segmentation) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic_segmentation + def post_process_panoptic_segmentation( + self, + outputs, + threshold: float = 0.5, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[Set[int]] = None, + target_sizes: Optional[List[Tuple[int, int]]] = None, + ) -> List[Dict]: + """ + Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports + PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + The outputs from [`DetrForSegmentation`]. + threshold (`float`, *optional*, defaults to 0.5): + The probability score threshold to keep predicted instance masks. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8): + The overlap mask area threshold to merge or discard small disconnected parts within each binary + instance mask. + label_ids_to_fuse (`Set[int]`, *optional*): + The labels in this state will have all their instances be fused together. For instance we could say + there can only be one sky in an image, but several persons, so the label ID for sky would be in that + set, but not the one for person. + target_sizes (`List[Tuple]`, *optional*): + List of length (batch_size), where each list item (`Tuple[int, int]]`) corresponds to the requested + final size (height, width) of each prediction in batch. If unset, predictions will not be resized. + Returns: + `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: + - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or + `None` if no mask if found above `threshold`. If `target_sizes` is specified, segmentation is resized to + the corresponding `target_sizes` entry. + - **segments_info** -- A dictionary that contains additional information on each segment. + - **id** -- an integer representing the `segment_id`. + - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`. + - **was_fused** -- a boolean, `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise. + Multiple instances of the same class / label were fused and assigned a single `segment_id`. + - **score** -- Prediction score of segment with `segment_id`. + """ + + if label_ids_to_fuse is None: + logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.") + label_ids_to_fuse = set() + + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Predicted label and score of each query (batch_size, num_queries) + pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1) + + # Loop over items in batch size + results: List[Dict[str, TensorType]] = [] + + for i in range(batch_size): + mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects( + mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels + ) + + # No mask found + if mask_probs_item.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + # Get segmentation map and segment information of batch item + target_size = target_sizes[i] if target_sizes is not None else None + segmentation, segments = compute_segments( + mask_probs=mask_probs_item, + pred_scores=pred_scores_item, + pred_labels=pred_labels_item, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + label_ids_to_fuse=label_ids_to_fuse, + target_size=target_size, + ) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + +__all__ = ["DetrImageProcessorFast"] diff --git a/docs/transformers/build/lib/transformers/models/detr/modeling_detr.py b/docs/transformers/build/lib/transformers/models/detr/modeling_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..cb47f58bda00160e18f1dbdcf3fc6ae78211bd53 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/detr/modeling_detr.py @@ -0,0 +1,1815 @@ +# coding=utf-8 +# Copyright 2021 Facebook AI Research The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DETR model.""" + +import math +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +import torch +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_timm_available, + logging, + replace_return_docstrings, + requires_backends, +) +from ...utils.backbone_utils import load_backbone +from .configuration_detr import DetrConfig + + +if is_timm_available(): + from timm import create_model + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DetrConfig" +_CHECKPOINT_FOR_DOC = "facebook/detr-resnet-50" + + +@dataclass +class DetrDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +class DetrModelOutput(Seq2SeqModelOutput): + """ + Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +class DetrObjectDetectionOutput(ModelOutput): + """ + Output type of [`DetrForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class DetrSegmentationOutput(ModelOutput): + """ + Output type of [`DetrForSegmentation`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): + Segmentation masks logits for all queries. See also + [`~DetrImageProcessor.post_process_semantic_segmentation`] or + [`~DetrImageProcessor.post_process_instance_segmentation`] + [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic + segmentation masks respectively. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each + layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + pred_masks: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# BELOW: utilities copied from +# https://github.com/facebookresearch/detr/blob/master/backbone.py +class DetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = DetrFrozenBatchNorm2d(module.num_features) + + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class DetrConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +class DetrConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +class DetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float() + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class DetrLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = DetrLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]): + return tensor if object_queries is None else tensor + object_queries + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + object_queries: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + spatial_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size, target_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if object_queries is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, object_queries) + + # add key-value position embeddings to the key value states + if spatial_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class DetrEncoderLayer(nn.Module): + def __init__(self, config: DetrConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + object_queries: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + object_queries (`torch.FloatTensor`, *optional*): + Object queries (also called content embeddings), to be added to the hidden states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + object_queries=object_queries, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class DetrDecoderLayer(nn.Module): + def __init__(self, config: DetrConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = DetrAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + object_queries: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + object_queries (`torch.FloatTensor`, *optional*): + object_queries that are added to the hidden states + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + object_queries=query_position_embeddings, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + object_queries=query_position_embeddings, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + spatial_position_embeddings=object_queries, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class DetrPreTrainedModel(PreTrainedModel): + config_class = DetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"] + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, DetrMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + elif isinstance(module, DetrLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +DETR_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`DetrConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DetrImageProcessor.__call__`] for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class DetrEncoder(DetrPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`DetrEncoderLayer`]. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for DETR: + + - object_queries are added to the forward pass. + + Args: + config: DetrConfig + """ + + def __init__(self, config: DetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + object_queries=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Object queries that are added to the queries in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + # we add object_queries as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + object_queries=object_queries, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class DetrDecoder(DetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for DETR: + + - object_queries and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. + + Args: + config: DetrConfig + """ + + def __init__(self, config: DetrConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in DETR, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + object_queries=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`: + + - 1 for queries that are **not masked**, + - 0 for queries that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Object queries that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + , *optional*): Position embeddings that are added to the values and keys in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + combined_attention_mask = combined_attention_mask + _prepare_4d_attention_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + object_queries=object_queries, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate] + if v is not None + ) + return DetrDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + ) + + +@add_start_docstrings( + """ + The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """, + DETR_START_DOCSTRING, +) +class DetrModel(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = DetrConvEncoder(config) + object_queries = build_position_encoding(config) + self.backbone = DetrConvModel(backbone, object_queries) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = DetrEncoder(config) + self.decoder = DetrDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetrModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], DetrModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DetrModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") + >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> # the last hidden states are the final query embeddings of the Transformer decoder + >>> # these are of shape (batch_size, num_queries, hidden_size) + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 100, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # pixel_values should be of shape (batch_size, num_channels, height, width) + # pixel_mask should be of shape (batch_size, height, width) + features, object_queries_list = self.backbone(pixel_values, pixel_mask) + + # get final feature map and downsampled mask + feature_map, mask = features[-1] + + if mask is None: + raise ValueError("Backbone does not return downsampled pixel mask") + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + projected_feature_map = self.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + object_queries=object_queries, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + object_queries through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + inputs_embeds=queries, + attention_mask=None, + object_queries=object_queries, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return DetrModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + ) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class DetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@add_start_docstrings( + """ + DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks + such as COCO detection. + """, + DETR_START_DOCSTRING, +) +class DetrForObjectDetection(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # DETR encoder-decoder model + self.model = DetrModel(config) + + # Object detection heads + self.class_labels_classifier = nn.Linear( + config.d_model, config.num_labels + 1 + ) # We add one for the "no object" class + self.bbox_predictor = DetrMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[List[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], DetrObjectDetectionOutput]: + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DetrForObjectDetection + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") + >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ + ... 0 + ... ] + + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... ) + Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98] + Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66] + Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76] + Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93] + Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # class logits + predicted bounding boxes + logits = self.class_labels_classifier(sequence_output) + pred_boxes = self.bbox_predictor(sequence_output).sigmoid() + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + outputs_class, outputs_coord = None, None + if self.config.auxiliary_loss: + intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return DetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks + such as COCO panoptic. + + """, + DETR_START_DOCSTRING, +) +class DetrForSegmentation(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # object detection model + self.detr = DetrForObjectDetection(config) + + # segmentation head + hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads + intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes + + self.mask_head = DetrMaskHeadSmallConv( + hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size + ) + + self.bbox_attention = DetrMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[List[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], DetrSegmentationOutput]: + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each + dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels, + bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves + should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`. + + Returns: + + Examples: + + ```python + >>> import io + >>> import requests + >>> from PIL import Image + >>> import torch + >>> import numpy + + >>> from transformers import AutoImageProcessor, DetrForSegmentation + >>> from transformers.image_transforms import rgb_to_id + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic") + >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps + >>> # Segmentation results are returned as a list of dictionaries + >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)]) + + >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found + >>> panoptic_seg = result[0]["segmentation"] + >>> # Get prediction score and segment_id to class_id mapping of each segment + >>> panoptic_segments_info = result[0]["segments_info"] + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones((batch_size, height, width), device=device) + + # First, get list of feature maps and position embeddings + features, object_queries_list = self.detr.model.backbone(pixel_values, pixel_mask=pixel_mask) + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_map, mask = features[-1] + batch_size, num_channels, height, width = feature_map.shape + projected_feature_map = self.detr.model.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, heigth*width) + if encoder_outputs is None: + encoder_outputs = self.detr.model.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + object_queries=object_queries, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.detr.model.query_position_embeddings.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.detr.model.decoder( + inputs_embeds=queries, + attention_mask=None, + object_queries=object_queries, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Sixth, compute logits, pred_boxes and pred_masks + logits = self.detr.class_labels_classifier(sequence_output) + pred_boxes = self.detr.bbox_predictor(sequence_output).sigmoid() + + memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width) + mask = flattened_mask.view(batch_size, height, width) + + # FIXME h_boxes takes the last one computed, keep this in mind + # important: we need to reverse the mask, since in the original implementation the mask works reversed + # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32) + bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask) + + seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) + + pred_masks = seg_masks.view(batch_size, self.detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + outputs_class, outputs_coord = None, None + if self.config.auxiliary_loss: + intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] + outputs_class = self.detr.class_labels_classifier(intermediate) + outputs_coord = self.detr.bbox_predictor(intermediate).sigmoid() + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, device, pred_boxes, pred_masks, self.config, outputs_class, outputs_coord + ) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs + else: + output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return DetrSegmentationOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + pred_masks=pred_masks, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py +class DetrMaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + if dim % 8 != 0: + raise ValueError( + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in" + " GroupNorm is set to 8" + ) + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + + self.lay1 = nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = nn.GroupNorm(8, dim) + self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1]) + self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2]) + self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3]) + self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4]) + self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). + # We expand the projected feature map to match the number of heads. + x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = nn.functional.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = nn.functional.relu(x) + + x = self.out_lay(x) + return x + + +class DetrMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +__all__ = [ + "DetrForObjectDetection", + "DetrForSegmentation", + "DetrModel", + "DetrPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/dialogpt/__init__.py b/docs/transformers/build/lib/transformers/models/dialogpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/build/lib/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..03f38084cfbf7428678e0ec7b25d0fe2ae9dace1 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,46 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import torch + +from transformers.utils import WEIGHTS_NAME + + +DIALOGPT_MODELS = ["small", "medium", "large"] + +OLD_KEY = "lm_head.decoder.weight" +NEW_KEY = "lm_head.weight" + + +def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): + d = torch.load(checkpoint_path, weights_only=True) + d[NEW_KEY] = d.pop(OLD_KEY) + os.makedirs(pytorch_dump_folder_path, exist_ok=True) + torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--dialogpt_path", default=".", type=str) + args = parser.parse_args() + for MODEL in DIALOGPT_MODELS: + checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") + pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" + convert_dialogpt_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + ) diff --git a/docs/transformers/build/lib/transformers/models/diffllama/__init__.py b/docs/transformers/build/lib/transformers/models/diffllama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c162fce0a48bd164bd0e0a615b942ee4805a12aa --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/diffllama/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_diffllama import * + from .modeling_diffllama import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/diffllama/configuration_diffllama.py b/docs/transformers/build/lib/transformers/models/diffllama/configuration_diffllama.py new file mode 100644 index 0000000000000000000000000000000000000000..1b38f55d3903f83d7fbaee9cc798723f3de80497 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/diffllama/configuration_diffllama.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on Llama implementations in this library and Microsoft's +# Differential Transformer implementations. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DiffLlama model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class DiffLlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DiffLlamaModel`]. It is used to instantiate an DiffLlama + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults + will yield a similar configuration to that of the [kajuma/DiffLlama-0.3B-handcut](https://huggingface.co/kajuma/DiffLlama-0.3B-handcut). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the DiffLlama model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DiffLlamaModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 16): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'diffllama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'diffllama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'diffllama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'diffllama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + lambda_std_dev (`float`, *optional*, defaults to 0.1): + The standard deviation for initialization of parameter lambda in attention layer. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + + ```python + >>> from transformers import DiffLlamaModel, DiffLlamaConfig + + >>> # Initializing a DiffLlama diffllama-7b style configuration + >>> configuration = DiffLlamaConfig() + + >>> # Initializing a model from the diffllama-7b style configuration + >>> model = DiffLlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "diffllama" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=16, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + lambda_std_dev=0.1, + head_dim=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.lambda_std_dev = lambda_std_dev + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["DiffLlamaConfig"] diff --git a/docs/transformers/build/lib/transformers/models/esm/openfold_utils/rigid_utils.py b/docs/transformers/build/lib/transformers/models/esm/openfold_utils/rigid_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d0f2f69b350ce7589a1c82e896cb08ec5808f41 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/esm/openfold_utils/rigid_utils.py @@ -0,0 +1,1242 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from functools import lru_cache +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple + +import numpy as np +import torch + + +def rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting. + + Args: + a: [*, 3, 3] left multiplicand + b: [*, 3, 3] right multiplicand + Returns: + The product ab + """ + + def row_mul(i: int) -> torch.Tensor: + return torch.stack( + [ + a[..., i, 0] * b[..., 0, 0] + a[..., i, 1] * b[..., 1, 0] + a[..., i, 2] * b[..., 2, 0], + a[..., i, 0] * b[..., 0, 1] + a[..., i, 1] * b[..., 1, 1] + a[..., i, 2] * b[..., 2, 1], + a[..., i, 0] * b[..., 0, 2] + a[..., i, 1] * b[..., 1, 2] + a[..., i, 2] * b[..., 2, 2], + ], + dim=-1, + ) + + return torch.stack( + [ + row_mul(0), + row_mul(1), + row_mul(2), + ], + dim=-2, + ) + + +def rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: + """ + Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting. + + Args: + r: [*, 3, 3] rotation matrices + t: [*, 3] coordinate tensors + Returns: + [*, 3] rotated coordinates + """ + x, y, z = torch.unbind(t, dim=-1) + return torch.stack( + [ + r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z, + r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z, + r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z, + ], + dim=-1, + ) + + +@lru_cache(maxsize=None) +def identity_rot_mats( + batch_dims: Tuple[int, ...], + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = True, +) -> torch.Tensor: + rots = torch.eye(3, dtype=dtype, device=device, requires_grad=requires_grad) + rots = rots.view(*((1,) * len(batch_dims)), 3, 3) + rots = rots.expand(*batch_dims, -1, -1) + rots = rots.contiguous() + + return rots + + +@lru_cache(maxsize=None) +def identity_trans( + batch_dims: Tuple[int, ...], + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = True, +) -> torch.Tensor: + trans = torch.zeros((*batch_dims, 3), dtype=dtype, device=device, requires_grad=requires_grad) + return trans + + +@lru_cache(maxsize=None) +def identity_quats( + batch_dims: Tuple[int, ...], + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = True, +) -> torch.Tensor: + quat = torch.zeros((*batch_dims, 4), dtype=dtype, device=device, requires_grad=requires_grad) + + with torch.no_grad(): + quat[..., 0] = 1 + + return quat + + +_quat_elements: List[str] = ["a", "b", "c", "d"] +_qtr_keys: List[str] = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements] +_qtr_ind_dict: Dict[str, int] = {key: ind for ind, key in enumerate(_qtr_keys)} + + +def _to_mat(pairs: List[Tuple[str, int]]) -> np.ndarray: + mat = np.zeros((4, 4)) + for key, value in pairs: + ind = _qtr_ind_dict[key] + mat[ind // 4][ind % 4] = value + + return mat + + +_QTR_MAT = np.zeros((4, 4, 3, 3)) +_QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)]) +_QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)]) +_QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)]) +_QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)]) +_QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)]) +_QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)]) +_QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)]) +_QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)]) +_QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)]) + + +def quat_to_rot(quat: torch.Tensor) -> torch.Tensor: + """ + Converts a quaternion to a rotation matrix. + + Args: + quat: [*, 4] quaternions + Returns: + [*, 3, 3] rotation matrices + """ + # [*, 4, 4] + quat = quat[..., None] * quat[..., None, :] + + # [4, 4, 3, 3] + mat = _get_quat("_QTR_MAT", dtype=quat.dtype, device=quat.device) + + # [*, 4, 4, 3, 3] + shaped_qtr_mat = mat.view((1,) * len(quat.shape[:-2]) + mat.shape) + quat = quat[..., None, None] * shaped_qtr_mat + + # [*, 3, 3] + return torch.sum(quat, dim=(-3, -4)) + + +def rot_to_quat(rot: torch.Tensor) -> torch.Tensor: + if rot.shape[-2:] != (3, 3): + raise ValueError("Input rotation is incorrectly shaped") + + [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = [[rot[..., i, j] for j in range(3)] for i in range(3)] + + k = [ + [ + xx + yy + zz, + zy - yz, + xz - zx, + yx - xy, + ], + [ + zy - yz, + xx - yy - zz, + xy + yx, + xz + zx, + ], + [ + xz - zx, + xy + yx, + yy - xx - zz, + yz + zy, + ], + [ + yx - xy, + xz + zx, + yz + zy, + zz - xx - yy, + ], + ] + + _, vectors = torch.linalg.eigh((1.0 / 3.0) * torch.stack([torch.stack(t, dim=-1) for t in k], dim=-2)) + return vectors[..., -1] + + +_QUAT_MULTIPLY = np.zeros((4, 4, 4)) +_QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]] + +_QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]] + +_QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]] + +_QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]] + +_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :] + +_CACHED_QUATS: Dict[str, np.ndarray] = { + "_QTR_MAT": _QTR_MAT, + "_QUAT_MULTIPLY": _QUAT_MULTIPLY, + "_QUAT_MULTIPLY_BY_VEC": _QUAT_MULTIPLY_BY_VEC, +} + + +@lru_cache(maxsize=None) +def _get_quat(quat_key: str, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device) + + +def quat_multiply(quat1: torch.Tensor, quat2: torch.Tensor) -> torch.Tensor: + """Multiply a quaternion by another quaternion.""" + mat = _get_quat("_QUAT_MULTIPLY", dtype=quat1.dtype, device=quat1.device) + reshaped_mat = mat.view((1,) * len(quat1.shape[:-1]) + mat.shape) + return torch.sum(reshaped_mat * quat1[..., :, None, None] * quat2[..., None, :, None], dim=(-3, -2)) + + +def quat_multiply_by_vec(quat: torch.Tensor, vec: torch.Tensor) -> torch.Tensor: + """Multiply a quaternion by a pure-vector quaternion.""" + mat = _get_quat("_QUAT_MULTIPLY_BY_VEC", dtype=quat.dtype, device=quat.device) + reshaped_mat = mat.view((1,) * len(quat.shape[:-1]) + mat.shape) + return torch.sum(reshaped_mat * quat[..., :, None, None] * vec[..., None, :, None], dim=(-3, -2)) + + +def invert_rot_mat(rot_mat: torch.Tensor) -> torch.Tensor: + return rot_mat.transpose(-1, -2) + + +def invert_quat(quat: torch.Tensor) -> torch.Tensor: + quat_prime = quat.clone() + quat_prime[..., 1:] *= -1 + inv = quat_prime / torch.sum(quat**2, dim=-1, keepdim=True) + return inv + + +class Rotation: + """ + A 3D rotation. Depending on how the object is initialized, the rotation is represented by either a rotation matrix + or a quaternion, though both formats are made available by helper functions. To simplify gradient computation, the + underlying format of the rotation cannot be changed in-place. Like Rigid, the class is designed to mimic the + behavior of a torch Tensor, almost as if each Rotation object were a tensor of rotations, in one format or another. + """ + + def __init__( + self, + rot_mats: Optional[torch.Tensor] = None, + quats: Optional[torch.Tensor] = None, + normalize_quats: bool = True, + ): + """ + Args: + rot_mats: + A [*, 3, 3] rotation matrix tensor. Mutually exclusive with quats + quats: + A [*, 4] quaternion. Mutually exclusive with rot_mats. If normalize_quats is not True, must be a unit + quaternion + normalize_quats: + If quats is specified, whether to normalize quats + """ + if (rot_mats is None and quats is None) or (rot_mats is not None and quats is not None): + raise ValueError("Exactly one input argument must be specified") + + if (rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or (quats is not None and quats.shape[-1] != 4): + raise ValueError("Incorrectly shaped rotation matrix or quaternion") + + # Force full-precision + if quats is not None: + quats = quats.to(dtype=torch.float32) + if rot_mats is not None: + rot_mats = rot_mats.to(dtype=torch.float32) + + if quats is not None and normalize_quats: + quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True) + + self._rot_mats = rot_mats + self._quats = quats + + @staticmethod + def identity( + shape, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = True, + fmt: str = "quat", + ) -> Rotation: + """ + Returns an identity Rotation. + + Args: + shape: + The "shape" of the resulting Rotation object. See documentation for the shape property + dtype: + The torch dtype for the rotation + device: + The torch device for the new rotation + requires_grad: + Whether the underlying tensors in the new rotation object should require gradient computation + fmt: + One of "quat" or "rot_mat". Determines the underlying format of the new object's rotation + Returns: + A new identity rotation + """ + if fmt == "rot_mat": + rot_mats = identity_rot_mats( + shape, + dtype, + device, + requires_grad, + ) + return Rotation(rot_mats=rot_mats, quats=None) + elif fmt == "quat": + quats = identity_quats(shape, dtype, device, requires_grad) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError(f"Invalid format: f{fmt}") + + # Magic methods + + def __getitem__(self, index: Any) -> Rotation: + """ + Allows torch-style indexing over the virtual shape of the rotation object. See documentation for the shape + property. + + Args: + index: + A torch index. E.g. (1, 3, 2), or (slice(None,)) + Returns: + The indexed rotation + """ + if type(index) is not tuple: + index = (index,) + + if self._rot_mats is not None: + rot_mats = self._rot_mats[index + (slice(None), slice(None))] + return Rotation(rot_mats=rot_mats) + elif self._quats is not None: + quats = self._quats[index + (slice(None),)] + return Rotation(quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def __mul__(self, right: torch.Tensor) -> Rotation: + """ + Pointwise left multiplication of the rotation with a tensor. Can be used to e.g. mask the Rotation. + + Args: + right: + The tensor multiplicand + Returns: + The product + """ + if not (isinstance(right, torch.Tensor)): + raise TypeError("The other multiplicand must be a Tensor") + + if self._rot_mats is not None: + rot_mats = self._rot_mats * right[..., None, None] + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = self._quats * right[..., None] + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def __rmul__(self, left: torch.Tensor) -> Rotation: + """ + Reverse pointwise multiplication of the rotation with a tensor. + + Args: + left: + The left multiplicand + Returns: + The product + """ + return self.__mul__(left) + + # Properties + + @property + def shape(self) -> torch.Size: + """ + Returns the virtual shape of the rotation object. This shape is defined as the batch dimensions of the + underlying rotation matrix or quaternion. If the Rotation was initialized with a [10, 3, 3] rotation matrix + tensor, for example, the resulting shape would be [10]. + + Returns: + The virtual shape of the rotation object + """ + if self._rot_mats is not None: + return self._rot_mats.shape[:-2] + elif self._quats is not None: + return self._quats.shape[:-1] + else: + raise ValueError("Both rotations are None") + + @property + def dtype(self) -> torch.dtype: + """ + Returns the dtype of the underlying rotation. + + Returns: + The dtype of the underlying rotation + """ + if self._rot_mats is not None: + return self._rot_mats.dtype + elif self._quats is not None: + return self._quats.dtype + else: + raise ValueError("Both rotations are None") + + @property + def device(self) -> torch.device: + """ + The device of the underlying rotation + + Returns: + The device of the underlying rotation + """ + if self._rot_mats is not None: + return self._rot_mats.device + elif self._quats is not None: + return self._quats.device + else: + raise ValueError("Both rotations are None") + + @property + def requires_grad(self) -> bool: + """ + Returns the requires_grad property of the underlying rotation + + Returns: + The requires_grad property of the underlying tensor + """ + if self._rot_mats is not None: + return self._rot_mats.requires_grad + elif self._quats is not None: + return self._quats.requires_grad + else: + raise ValueError("Both rotations are None") + + def get_rot_mats(self) -> torch.Tensor: + """ + Returns the underlying rotation as a rotation matrix tensor. + + Returns: + The rotation as a rotation matrix tensor + """ + if self._rot_mats is not None: + return self._rot_mats + elif self._quats is not None: + return quat_to_rot(self._quats) + else: + raise ValueError("Both rotations are None") + + def get_quats(self) -> torch.Tensor: + """ + Returns the underlying rotation as a quaternion tensor. + + Depending on whether the Rotation was initialized with a quaternion, this function may call torch.linalg.eigh. + + Returns: + The rotation as a quaternion tensor. + """ + if self._rot_mats is not None: + return rot_to_quat(self._rot_mats) + elif self._quats is not None: + return self._quats + else: + raise ValueError("Both rotations are None") + + def get_cur_rot(self) -> torch.Tensor: + """ + Return the underlying rotation in its current form + + Returns: + The stored rotation + """ + if self._rot_mats is not None: + return self._rot_mats + elif self._quats is not None: + return self._quats + else: + raise ValueError("Both rotations are None") + + # Rotation functions + + def compose_q_update_vec(self, q_update_vec: torch.Tensor, normalize_quats: bool = True) -> Rotation: + """ + Returns a new quaternion Rotation after updating the current object's underlying rotation with a quaternion + update, formatted as a [*, 3] tensor whose final three columns represent x, y, z such that (1, x, y, z) is the + desired (not necessarily unit) quaternion update. + + Args: + q_update_vec: + A [*, 3] quaternion update tensor + normalize_quats: + Whether to normalize the output quaternion + Returns: + An updated Rotation + """ + quats = self.get_quats() + new_quats = quats + quat_multiply_by_vec(quats, q_update_vec) + return Rotation( + rot_mats=None, + quats=new_quats, + normalize_quats=normalize_quats, + ) + + def compose_r(self, r: Rotation) -> Rotation: + """ + Compose the rotation matrices of the current Rotation object with those of another. + + Args: + r: + An update rotation object + Returns: + An updated rotation object + """ + r1 = self.get_rot_mats() + r2 = r.get_rot_mats() + new_rot_mats = rot_matmul(r1, r2) + return Rotation(rot_mats=new_rot_mats, quats=None) + + def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation: + """ + Compose the quaternions of the current Rotation object with those of another. + + Depending on whether either Rotation was initialized with quaternions, this function may call + torch.linalg.eigh. + + Args: + r: + An update rotation object + Returns: + An updated rotation object + """ + q1 = self.get_quats() + q2 = r.get_quats() + new_quats = quat_multiply(q1, q2) + return Rotation(rot_mats=None, quats=new_quats, normalize_quats=normalize_quats) + + def apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Apply the current Rotation as a rotation matrix to a set of 3D coordinates. + + Args: + pts: + A [*, 3] set of points + Returns: + [*, 3] rotated points + """ + rot_mats = self.get_rot_mats() + return rot_vec_mul(rot_mats, pts) + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + The inverse of the apply() method. + + Args: + pts: + A [*, 3] set of points + Returns: + [*, 3] inverse-rotated points + """ + rot_mats = self.get_rot_mats() + inv_rot_mats = invert_rot_mat(rot_mats) + return rot_vec_mul(inv_rot_mats, pts) + + def invert(self) -> Rotation: + """ + Returns the inverse of the current Rotation. + + Returns: + The inverse of the current Rotation + """ + if self._rot_mats is not None: + return Rotation(rot_mats=invert_rot_mat(self._rot_mats), quats=None) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=invert_quat(self._quats), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + # "Tensor" stuff + + def unsqueeze(self, dim: int) -> Rotation: + """ + Analogous to torch.unsqueeze. The dimension is relative to the shape of the Rotation object. + + Args: + dim: A positive or negative dimension index. + Returns: + The unsqueezed Rotation. + """ + if dim >= len(self.shape): + raise ValueError("Invalid dimension") + + if self._rot_mats is not None: + rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2) + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + @staticmethod + def cat(rs: Sequence[Rotation], dim: int) -> Rotation: + """ + Concatenates rotations along one of the batch dimensions. Analogous to torch.cat(). + + Note that the output of this operation is always a rotation matrix, regardless of the format of input + rotations. + + Args: + rs: + A list of rotation objects + dim: + The dimension along which the rotations should be concatenated + Returns: + A concatenated Rotation object in rotation matrix format + """ + rot_mats = torch.cat( + [r.get_rot_mats() for r in rs], + dim=dim if dim >= 0 else dim - 2, + ) + + return Rotation(rot_mats=rot_mats, quats=None) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rotation: + """ + Apply a Tensor -> Tensor function to underlying rotation tensors, mapping over the rotation dimension(s). Can + be used e.g. to sum out a one-hot batch dimension. + + Args: + fn: + A Tensor -> Tensor function to be mapped over the Rotation + Returns: + The transformed Rotation object + """ + if self._rot_mats is not None: + rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,)) + rot_mats = torch.stack(list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1) + rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3)) + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = torch.stack(list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def cuda(self) -> Rotation: + """ + Analogous to the cuda() method of torch Tensors + + Returns: + A copy of the Rotation in CUDA memory + """ + if self._rot_mats is not None: + return Rotation(rot_mats=self._rot_mats.cuda(), quats=None) + elif self._quats is not None: + return Rotation(rot_mats=None, quats=self._quats.cuda(), normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def to(self, device: Optional[torch.device], dtype: Optional[torch.dtype]) -> Rotation: + """ + Analogous to the to() method of torch Tensors + + Args: + device: + A torch device + dtype: + A torch dtype + Returns: + A copy of the Rotation using the new device and dtype + """ + if self._rot_mats is not None: + return Rotation( + rot_mats=self._rot_mats.to(device=device, dtype=dtype), + quats=None, + ) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=self._quats.to(device=device, dtype=dtype), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + def detach(self) -> Rotation: + """ + Returns a copy of the Rotation whose underlying Tensor has been detached from its torch graph. + + Returns: + A copy of the Rotation whose underlying Tensor has been detached from its torch graph + """ + if self._rot_mats is not None: + return Rotation(rot_mats=self._rot_mats.detach(), quats=None) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=self._quats.detach(), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + +class Rigid: + """ + A class representing a rigid transformation. Little more than a wrapper around two objects: a Rotation object and a + [*, 3] translation Designed to behave approximately like a single torch tensor with the shape of the shared batch + dimensions of its component parts. + """ + + def __init__(self, rots: Optional[Rotation], trans: Optional[torch.Tensor]): + """ + Args: + rots: A [*, 3, 3] rotation tensor + trans: A corresponding [*, 3] translation tensor + """ + # (we need device, dtype, etc. from at least one input) + + batch_dims, dtype, device, requires_grad = None, None, None, None + if trans is not None: + batch_dims = trans.shape[:-1] + dtype = trans.dtype + device = trans.device + requires_grad = trans.requires_grad + elif rots is not None: + batch_dims = rots.shape + dtype = rots.dtype + device = rots.device + requires_grad = rots.requires_grad + else: + raise ValueError("At least one input argument must be specified") + + if rots is None: + rots = Rotation.identity( + batch_dims, + dtype, + device, + requires_grad, + ) + elif trans is None: + trans = identity_trans( + batch_dims, + dtype, + device, + requires_grad, + ) + + assert rots is not None + assert trans is not None + + if (rots.shape != trans.shape[:-1]) or (rots.device != trans.device): + raise ValueError("Rots and trans incompatible") + + # Force full precision. Happens to the rotations automatically. + trans = trans.to(dtype=torch.float32) + + self._rots = rots + self._trans = trans + + @staticmethod + def identity( + shape: Tuple[int, ...], + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = True, + fmt: str = "quat", + ) -> Rigid: + """ + Constructs an identity transformation. + + Args: + shape: + The desired shape + dtype: + The dtype of both internal tensors + device: + The device of both internal tensors + requires_grad: + Whether grad should be enabled for the internal tensors + Returns: + The identity transformation + """ + return Rigid( + Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt), + identity_trans(shape, dtype, device, requires_grad), + ) + + def __getitem__(self, index: Any) -> Rigid: + """ + Indexes the affine transformation with PyTorch-style indices. The index is applied to the shared dimensions of + both the rotation and the translation. + + E.g.:: + + r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None) t = Rigid(r, torch.rand(10, 10, 3)) indexed = + t[3, 4:6] assert(indexed.shape == (2,)) assert(indexed.get_rots().shape == (2,)) + assert(indexed.get_trans().shape == (2, 3)) + + Args: + index: A standard torch tensor index. E.g. 8, (10, None, 3), + or (3, slice(0, 1, None)) + Returns: + The indexed tensor + """ + if type(index) is not tuple: + index = (index,) + + return Rigid( + self._rots[index], + self._trans[index + (slice(None),)], + ) + + def __mul__(self, right: torch.Tensor) -> Rigid: + """ + Pointwise left multiplication of the transformation with a tensor. Can be used to e.g. mask the Rigid. + + Args: + right: + The tensor multiplicand + Returns: + The product + """ + if not (isinstance(right, torch.Tensor)): + raise TypeError("The other multiplicand must be a Tensor") + + new_rots = self._rots * right + new_trans = self._trans * right[..., None] + + return Rigid(new_rots, new_trans) + + def __rmul__(self, left: torch.Tensor) -> Rigid: + """ + Reverse pointwise multiplication of the transformation with a tensor. + + Args: + left: + The left multiplicand + Returns: + The product + """ + return self.__mul__(left) + + @property + def shape(self) -> torch.Size: + """ + Returns the shape of the shared dimensions of the rotation and the translation. + + Returns: + The shape of the transformation + """ + return self._trans.shape[:-1] + + @property + def device(self) -> torch.device: + """ + Returns the device on which the Rigid's tensors are located. + + Returns: + The device on which the Rigid's tensors are located + """ + return self._trans.device + + def get_rots(self) -> Rotation: + """ + Getter for the rotation. + + Returns: + The rotation object + """ + return self._rots + + def get_trans(self) -> torch.Tensor: + """ + Getter for the translation. + + Returns: + The stored translation + """ + return self._trans + + def compose_q_update_vec(self, q_update_vec: torch.Tensor) -> Rigid: + """ + Composes the transformation with a quaternion update vector of shape [*, 6], where the final 6 columns + represent the x, y, and z values of a quaternion of form (1, x, y, z) followed by a 3D translation. + + Args: + q_vec: The quaternion update vector. + Returns: + The composed transformation. + """ + q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:] + new_rots = self._rots.compose_q_update_vec(q_vec) + + trans_update = self._rots.apply(t_vec) + new_translation = self._trans + trans_update + + return Rigid(new_rots, new_translation) + + def compose(self, r: Rigid) -> Rigid: + """ + Composes the current rigid object with another. + + Args: + r: + Another Rigid object + Returns: + The composition of the two transformations + """ + new_rot = self._rots.compose_r(r._rots) + new_trans = self._rots.apply(r._trans) + self._trans + return Rigid(new_rot, new_trans) + + def apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Applies the transformation to a coordinate tensor. + + Args: + pts: A [*, 3] coordinate tensor. + Returns: + The transformed points. + """ + rotated = self._rots.apply(pts) + return rotated + self._trans + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Applies the inverse of the transformation to a coordinate tensor. + + Args: + pts: A [*, 3] coordinate tensor + Returns: + The transformed points. + """ + pts = pts - self._trans + return self._rots.invert_apply(pts) + + def invert(self) -> Rigid: + """ + Inverts the transformation. + + Returns: + The inverse transformation. + """ + rot_inv = self._rots.invert() + trn_inv = rot_inv.apply(self._trans) + + return Rigid(rot_inv, -1 * trn_inv) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid: + """ + Apply a Tensor -> Tensor function to underlying translation and rotation tensors, mapping over the + translation/rotation dimensions respectively. + + Args: + fn: + A Tensor -> Tensor function to be mapped over the Rigid + Returns: + The transformed Rigid object + """ + new_rots = self._rots.map_tensor_fn(fn) + new_trans = torch.stack(list(map(fn, torch.unbind(self._trans, dim=-1))), dim=-1) + + return Rigid(new_rots, new_trans) + + def to_tensor_4x4(self) -> torch.Tensor: + """ + Converts a transformation to a homogeneous transformation tensor. + + Returns: + A [*, 4, 4] homogeneous transformation tensor + """ + tensor = self._trans.new_zeros((*self.shape, 4, 4)) + tensor[..., :3, :3] = self._rots.get_rot_mats() + tensor[..., :3, 3] = self._trans + tensor[..., 3, 3] = 1 + return tensor + + @staticmethod + def from_tensor_4x4(t: torch.Tensor) -> Rigid: + """ + Constructs a transformation from a homogeneous transformation tensor. + + Args: + t: [*, 4, 4] homogeneous transformation tensor + Returns: + T object with shape [*] + """ + if t.shape[-2:] != (4, 4): + raise ValueError("Incorrectly shaped input tensor") + + rots = Rotation(rot_mats=t[..., :3, :3], quats=None) + trans = t[..., :3, 3] + + return Rigid(rots, trans) + + def to_tensor_7(self) -> torch.Tensor: + """ + Converts a transformation to a tensor with 7 final columns, four for the quaternion followed by three for the + translation. + + Returns: + A [*, 7] tensor representation of the transformation + """ + tensor = self._trans.new_zeros((*self.shape, 7)) + tensor[..., :4] = self._rots.get_quats() + tensor[..., 4:] = self._trans + + return tensor + + @staticmethod + def from_tensor_7(t: torch.Tensor, normalize_quats: bool = False) -> Rigid: + if t.shape[-1] != 7: + raise ValueError("Incorrectly shaped input tensor") + + quats, trans = t[..., :4], t[..., 4:] + + rots = Rotation(rot_mats=None, quats=quats, normalize_quats=normalize_quats) + + return Rigid(rots, trans) + + @staticmethod + def from_3_points( + p_neg_x_axis: torch.Tensor, origin: torch.Tensor, p_xy_plane: torch.Tensor, eps: float = 1e-8 + ) -> Rigid: + """ + Implements algorithm 21. Constructs transformations from sets of 3 points using the Gram-Schmidt algorithm. + + Args: + p_neg_x_axis: [*, 3] coordinates + origin: [*, 3] coordinates used as frame origins + p_xy_plane: [*, 3] coordinates + eps: Small epsilon value + Returns: + A transformation object of shape [*] + """ + p_neg_x_axis_unbound = torch.unbind(p_neg_x_axis, dim=-1) + origin_unbound = torch.unbind(origin, dim=-1) + p_xy_plane_unbound = torch.unbind(p_xy_plane, dim=-1) + + e0 = [c1 - c2 for c1, c2 in zip(origin_unbound, p_neg_x_axis_unbound)] + e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane_unbound, origin_unbound)] + + denom = torch.sqrt(sum(c * c for c in e0) + eps * torch.ones_like(e0[0])) + e0 = [c / denom for c in e0] + dot = sum((c1 * c2 for c1, c2 in zip(e0, e1))) + e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)] + denom = torch.sqrt(sum((c * c for c in e1)) + eps * torch.ones_like(e1[0])) + e1 = [c / denom for c in e1] + e2 = [ + e0[1] * e1[2] - e0[2] * e1[1], + e0[2] * e1[0] - e0[0] * e1[2], + e0[0] * e1[1] - e0[1] * e1[0], + ] + + rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1) + rots = rots.reshape(rots.shape[:-1] + (3, 3)) + + rot_obj = Rotation(rot_mats=rots, quats=None) + + return Rigid(rot_obj, torch.stack(origin_unbound, dim=-1)) + + def unsqueeze(self, dim: int) -> Rigid: + """ + Analogous to torch.unsqueeze. The dimension is relative to the shared dimensions of the rotation/translation. + + Args: + dim: A positive or negative dimension index. + Returns: + The unsqueezed transformation. + """ + if dim >= len(self.shape): + raise ValueError("Invalid dimension") + rots = self._rots.unsqueeze(dim) + trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1) + + return Rigid(rots, trans) + + @staticmethod + def cat(ts: Sequence[Rigid], dim: int) -> Rigid: + """ + Concatenates transformations along a new dimension. + + Args: + ts: + A list of T objects + dim: + The dimension along which the transformations should be concatenated + Returns: + A concatenated transformation object + """ + rots = Rotation.cat([t._rots for t in ts], dim) + trans = torch.cat([t._trans for t in ts], dim=dim if dim >= 0 else dim - 1) + + return Rigid(rots, trans) + + def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Rigid: + """ + Applies a Rotation -> Rotation function to the stored rotation object. + + Args: + fn: A function of type Rotation -> Rotation + Returns: + A transformation object with a transformed rotation. + """ + return Rigid(fn(self._rots), self._trans) + + def apply_trans_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid: + """ + Applies a Tensor -> Tensor function to the stored translation. + + Args: + fn: + A function of type Tensor -> Tensor to be applied to the translation + Returns: + A transformation object with a transformed translation. + """ + return Rigid(self._rots, fn(self._trans)) + + def scale_translation(self, trans_scale_factor: float) -> Rigid: + """ + Scales the translation by a constant factor. + + Args: + trans_scale_factor: + The constant factor + Returns: + A transformation object with a scaled translation. + """ + return self.apply_trans_fn(lambda t: t * trans_scale_factor) + + def stop_rot_gradient(self) -> Rigid: + """ + Detaches the underlying rotation object + + Returns: + A transformation object with detached rotations + """ + return self.apply_rot_fn(lambda r: r.detach()) + + @staticmethod + def make_transform_from_reference( + n_xyz: torch.Tensor, ca_xyz: torch.Tensor, c_xyz: torch.Tensor, eps: float = 1e-20 + ) -> Rigid: + """ + Returns a transformation object from reference coordinates. + + Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard + way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You + need to take care of such cases in your code. + + Args: + n_xyz: A [*, 3] tensor of nitrogen xyz coordinates. + ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates. + c_xyz: A [*, 3] tensor of carbon xyz coordinates. + Returns: + A transformation object. After applying the translation and rotation to the reference backbone, the + coordinates will approximately equal to the input coordinates. + """ + translation = -1 * ca_xyz + n_xyz = n_xyz + translation + c_xyz = c_xyz + translation + + c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + c_x**2 + c_y**2) + sin_c1 = -c_y / norm + cos_c1 = c_x / norm + + c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3)) + c1_rots[..., 0, 0] = cos_c1 + c1_rots[..., 0, 1] = -1 * sin_c1 + c1_rots[..., 1, 0] = sin_c1 + c1_rots[..., 1, 1] = cos_c1 + c1_rots[..., 2, 2] = 1 + + norm = torch.sqrt(eps + c_x**2 + c_y**2 + c_z**2) + sin_c2 = c_z / norm + cos_c2 = torch.sqrt(c_x**2 + c_y**2) / norm + + c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + c2_rots[..., 0, 0] = cos_c2 + c2_rots[..., 0, 2] = sin_c2 + c2_rots[..., 1, 1] = 1 + c2_rots[..., 2, 0] = -1 * sin_c2 + c2_rots[..., 2, 2] = cos_c2 + + c_rots = rot_matmul(c2_rots, c1_rots) + n_xyz = rot_vec_mul(c_rots, n_xyz) + + _, n_y, n_z = [n_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + n_y**2 + n_z**2) + sin_n = -n_z / norm + cos_n = n_y / norm + + n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + n_rots[..., 0, 0] = 1 + n_rots[..., 1, 1] = cos_n + n_rots[..., 1, 2] = -1 * sin_n + n_rots[..., 2, 1] = sin_n + n_rots[..., 2, 2] = cos_n + + rots = rot_matmul(n_rots, c_rots) + + rots = rots.transpose(-1, -2) + translation = -1 * translation + + rot_obj = Rotation(rot_mats=rots, quats=None) + + return Rigid(rot_obj, translation) + + def cuda(self) -> Rigid: + """ + Moves the transformation object to GPU memory + + Returns: + A version of the transformation on GPU + """ + return Rigid(self._rots.cuda(), self._trans.cuda()) diff --git a/docs/transformers/build/lib/transformers/models/falcon/configuration_falcon.py b/docs/transformers/build/lib/transformers/models/falcon/configuration_falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..2d072b14f79001ff94b7f3de969cd75e26346c4b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/falcon/configuration_falcon.py @@ -0,0 +1,211 @@ +# coding=utf-8 +# Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Falcon configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FalconConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 65024): + Vocabulary size of the Falcon model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`FalconModel`] + hidden_size (`int`, *optional*, defaults to 4544): + Dimension of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 71): + Number of attention heads for each attention layer in the Transformer encoder. + num_ln_in_parallel_attn (`int`, *optional*): + Set to 2 if separate layer norms are to be used for the MLP and the attention output when using parallel + attention, otherwise, 1. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether the model should return the last key/values attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for MLP layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for attention layers. + num_kv_heads (`int`, *optional*): + Number of key-value heads to use per attention layer. If unset, defaults to the same value as + `num_attention_heads`. + alibi (`bool`, *optional*, defaults to `False`): + Whether to use ALiBi positional biases during self-attention. + new_decoder_architecture (`bool`, *optional*, defaults to `False`): + Whether to use the new (Falcon-40B) decoder architecture. If `True`, the `multi_query` and `parallel_attn` + arguments are ignored, as the new decoder always uses parallel attention. + multi_query (`bool`, *optional*, defaults to `True`): + Whether to use multi-query attention in the decoder. Ignored when `new_decoder_architecture` is `True`. + parallel_attn (`bool`, *optional*, defaults to `True`): + Whether to compute attention in parallel with the feedforward layer. If False, they are consecutive + instead, as in the original Transformer architecture. Ignored when `new_decoder_architecture` is `True`. + bias (`bool`, *optional*, defaults to `False`): + Whether to use bias on Linear layers. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with, when `alibi` is `False`. Pretrained + Falcon models with RoPE support up to 2048 tokens. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + bos_token_id (`int`, *optional*, defaults to 11): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 11): + The id of the "end-of-sequence" token. + ffn_hidden_size (`int`, *optional*): + The hidden size of the feedforward layer in the Transformer decoder. + defaults to 4x hidden dim + activation (`str`, *optional*, defaults to `"gelu"`): + The activation function used in the feedforward layer. + + Example: + + ```python + >>> from transformers import FalconModel, FalconConfig + + >>> # Initializing a small (2-layer) Falcon configuration + >>> configuration = FalconConfig(num_hidden_layers=2) + + >>> # Initializing a model from the small configuration + >>> model = FalconModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "falcon" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=65024, + hidden_size=4544, + num_hidden_layers=32, + num_attention_heads=71, + num_ln_in_parallel_attn=None, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + hidden_dropout=0.0, + attention_dropout=0.0, + num_kv_heads=None, + alibi=False, + new_decoder_architecture=False, + multi_query=True, + parallel_attn=True, + bias=False, + max_position_embeddings=2048, + rope_theta=10000.0, + rope_scaling=None, + bos_token_id=11, + eos_token_id=11, + ffn_hidden_size=None, + activation="gelu", + **kwargs, + ): + self.vocab_size = vocab_size + # Backward compatibility with n_embed kwarg + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads + self.alibi = alibi + self.new_decoder_architecture = new_decoder_architecture + self.multi_query = multi_query # Ignored when new_decoder_architecture is True + self.parallel_attn = parallel_attn + self.bias = bias + self.num_ln_in_parallel_attn = num_ln_in_parallel_attn + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.activation = activation + if ffn_hidden_size is None: + self.ffn_hidden_size = hidden_size * 4 + else: + self.ffn_hidden_size = ffn_hidden_size + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + @property + def head_dim(self): + return self.hidden_size // self.num_attention_heads + + @property + def rotary(self): + return not self.alibi + + +__all__ = ["FalconConfig"] diff --git a/docs/transformers/build/lib/transformers/models/falcon/convert_custom_code_checkpoint.py b/docs/transformers/build/lib/transformers/models/falcon/convert_custom_code_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..0da817c3ffa73907c0215be12377f08fb5729a85 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/falcon/convert_custom_code_checkpoint.py @@ -0,0 +1,74 @@ +import json +from argparse import ArgumentParser +from pathlib import Path + + +""" +This script converts Falcon custom code checkpoints to modern Falcon checkpoints that use code in the Transformers +library. After conversion, performance (especially for generation) should improve and the checkpoint can be loaded +without needing trust_remote_code=True. +""" + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--checkpoint_dir", + type=Path, + required=True, + help="Directory containing a custom code checkpoint to convert to a modern Falcon checkpoint.", + ) + args = parser.parse_args() + + if not args.checkpoint_dir.is_dir(): + raise ValueError("--checkpoint_dir argument should be a directory!") + + if ( + not (args.checkpoint_dir / "configuration_RW.py").is_file() + or not (args.checkpoint_dir / "modelling_RW.py").is_file() + ): + raise ValueError( + "The model directory should contain configuration_RW.py and modelling_RW.py files! Are you sure this is a custom code checkpoint?" + ) + (args.checkpoint_dir / "configuration_RW.py").unlink() + (args.checkpoint_dir / "modelling_RW.py").unlink() + + config = args.checkpoint_dir / "config.json" + text = config.read_text() + text = text.replace("RWForCausalLM", "FalconForCausalLM") + text = text.replace("RefinedWebModel", "falcon") + text = text.replace("RefinedWeb", "falcon") + json_config = json.loads(text) + del json_config["auto_map"] + + if "n_head" in json_config: + json_config["num_attention_heads"] = json_config.pop("n_head") + if "n_layer" in json_config: + json_config["num_hidden_layers"] = json_config.pop("n_layer") + if "n_head_kv" in json_config: + json_config["num_kv_heads"] = json_config.pop("n_head_kv") + json_config["new_decoder_architecture"] = True + else: + json_config["new_decoder_architecture"] = False + bos_token_id = json_config.get("bos_token_id", 1) + eos_token_id = json_config.get("eos_token_id", 2) + config.unlink() + config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) + + tokenizer_config = args.checkpoint_dir / "tokenizer_config.json" + if tokenizer_config.is_file(): + text = tokenizer_config.read_text() + json_config = json.loads(text) + if json_config["tokenizer_class"] == "PreTrainedTokenizerFast": + json_config["model_input_names"] = ["input_ids", "attention_mask"] + tokenizer_config.unlink() + tokenizer_config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) + + generation_config_path = args.checkpoint_dir / "generation_config.json" + generation_dict = { + "_from_model_config": True, + "bos_token_id": bos_token_id, + "eos_token_id": eos_token_id, + "transformers_version": "4.33.0.dev0", + } + generation_config_path.write_text(json.dumps(generation_dict, indent=2, sort_keys=True)) + print("Done! Please double-check that the new checkpoint works as expected.") diff --git a/docs/transformers/build/lib/transformers/models/falcon/modeling_falcon.py b/docs/transformers/build/lib/transformers/models/falcon/modeling_falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..86b974570d68cd9ac3735a27e3d834ae5fadb425 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/falcon/modeling_falcon.py @@ -0,0 +1,1566 @@ +# coding=utf-8 +# Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Falcon model.""" + +import math +from typing import TYPE_CHECKING, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from torch.nn import functional as F + +from ...activations import get_activation +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + AttentionMaskConverter, +) +from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + QuestionAnsweringModelOutput, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_falcon import FalconConfig + + +if TYPE_CHECKING: + from ...configuration_utils import PretrainedConfig + +if is_flash_attn_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Rocketknight1/falcon-rw-1b" +_CONFIG_FOR_DOC = "FalconConfig" + + +# NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations. +# In order not to degrade the quality of our HF-port, we keep these characteristics in the final model. +class FalconLinear(nn.Linear): + def forward(self, input: torch.Tensor) -> torch.Tensor: + hidden_states = input @ self.weight.T + if self.bias is None: + return hidden_states + return hidden_states + self.bias + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Falcon +class FalconRotaryEmbedding(nn.Module): + def __init__(self, config: FalconConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) + base = torch.tensor( + 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = torch.tensor( + 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) + + # Note: alibi will added to the attention bias that will be applied to the query, key product of attention + # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # => the query_length dimension will then be broadcasted correctly + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None].bfloat16() * arange_tensor + return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) + + +# Copied from transformers.models.bloom.modeling_bloom.dropout_add +def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: + """ + Dropout add function + + Args: + x (`torch.tensor`): + input tensor + residual (`torch.tensor`): + residual tensor + prob (`float`): + dropout probability + training (`bool`): + training mode + """ + out = F.dropout(x, p=prob, training=training) + out = residual + out + return out + + +class FalconAttention(nn.Module): + def __init__(self, config: FalconConfig, layer_idx=None): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.split_size = self.hidden_size + self.hidden_dropout = config.hidden_dropout + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self._use_sdpa = config._attn_implementation == "sdpa" + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" + f" {self.num_heads})." + ) + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.beta = self.inv_norm_factor + if config.new_decoder_architecture: + qkv_out_dim = (config.num_kv_heads * 2 + config.num_attention_heads) * self.head_dim + elif config.multi_query: + qkv_out_dim = self.hidden_size + 2 * self.head_dim + else: + qkv_out_dim = 3 * self.hidden_size + self.query_key_value = FalconLinear(self.hidden_size, qkv_out_dim, bias=config.bias) + self.new_decoder_architecture = config.new_decoder_architecture + self.multi_query = config.multi_query + self.dense = FalconLinear(self.hidden_size, self.hidden_size, bias=config.bias) + self.attention_dropout = nn.Dropout(config.attention_dropout) + self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 + + # TODO (raushan): remove in v4.46 (RoPE is computed in the model, not in the decoder layers) + if config.rotary: + self.rotary_emb = FalconRotaryEmbedding(config=self.config) + + def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv` + + Args: + fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim] + + Returns: + query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] + value: [batch_size, seq_length, num_heads, head_dim] + """ + if self.new_decoder_architecture: + batch, seq_len, _ = fused_qkv.shape + qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv_heads + 2, self.head_dim) + query = qkv[:, :, :, :-2] + key = qkv[:, :, :, [-2]] + value = qkv[:, :, :, [-1]] + key = torch.broadcast_to(key, query.shape) + value = torch.broadcast_to(value, query.shape) + + query, key, value = [x.flatten(2, 3) for x in (query, key, value)] + return query, key, value + elif not self.multi_query: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) + return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] + else: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) + return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] + + # Copied from transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads + def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: + """ + Merge heads together over the last dimension + + Args: + x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim] + + Returns: + torch.tensor: [batch_size, seq_length, num_heads * head_dim] + """ + # What we want to achieve is: + # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim + batch_size_and_num_heads, seq_length, _ = x.shape + batch_size = batch_size_and_num_heads // self.num_heads + + # First view to decompose the batch size + # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim + x = x.view(batch_size, self.num_heads, seq_length, self.head_dim) + + # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim + x = x.permute(0, 2, 1, 3) + + # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim + return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Cache] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + ): + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, query_length, _, _ = query_layer.shape + + query_layer = query_layer.transpose(1, 2).reshape(batch_size, self.num_heads, query_length, self.head_dim) + key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) + + if alibi is None: + cos, sin = position_embeddings + query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin) + + if layer_past is not None: + cache_kwargs = {"cache_position": cache_position} + if alibi is None: + cache_kwargs.update({"sin": sin, "cos": cos}) + key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs) + + kv_length = key_layer.shape[-2] + if self._use_sdpa and query_layer.device.type == "cuda" and attention_mask is not None: + # For torch<=2.1.2, SDPA with memory-efficient backend is bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + if attention_mask is not None: + attention_mask = attention_mask[:, :, :, : key_layer.shape[-2]] + + if alibi is None: + if self._use_sdpa and not output_attentions: + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not + # create a causal mask in case query_length == 1. + is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=0.0, + is_causal=is_causal, + ) + attention_scores = None + else: + attention_scores = query_layer @ key_layer.transpose(-1, -2) + attention_scores /= math.sqrt(self.head_dim) + + attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) + # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). + attn_output = attention_scores @ value_layer + + attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) + attn_output = attn_output.permute(0, 2, 1, 3) + attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + + attn_output = self.dense(attn_output) + + if output_attentions: + return attn_output, layer_past, attention_scores + else: + return attn_output, layer_past + + else: + if self._use_sdpa and not output_attentions and head_mask is None: + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.attention_dropout.p if self.training else 0.0, + is_causal=is_causal, + ) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + + attn_output = self.dense(attn_output) + else: + matmul_result = query_layer @ key_layer.transpose(-1, -2) + + # change view to [batch_size, num_heads, q_length, kv_length] + attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype == torch.float16 or input_dtype == torch.bfloat16: + attention_scores = attention_scores.to(torch.float32) + + attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) + attention_logits *= self.inv_norm_factor + attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # change view [batch_size, num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) + + # matmul: [batch_size * num_heads, q_length, head_dim] + attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) + + # change view [batch_size, q_length, num_heads * head_dim] + attn_output = self._merge_heads(attn_output) + + attn_output = self.dense(attn_output) + + if output_attentions: + return attn_output, layer_past, attention_probs + else: + return attn_output, layer_past + + +class FalconFlashAttention2(FalconAttention): + """ + Falcon flash attention module. This module inherits from `FalconAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + def forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Cache] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + ): + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, query_length, _, _ = query_layer.shape + + query_layer = query_layer.transpose(1, 2).reshape(batch_size, self.num_heads, query_length, self.head_dim) + key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) + + if alibi is None: + cos, sin = position_embeddings + query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin) + + if layer_past is not None: + cache_kwargs = {"cache_position": cache_position} + if alibi is None: + cache_kwargs.update({"sin": sin, "cos": cos}) + key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_layer = query_layer.transpose(1, 2) + key_layer = key_layer.transpose(1, 2) + value_layer = value_layer.transpose(1, 2) + + if alibi is not None: + raise ValueError("`alibi` is not supported when `use_flash_attn` is True") + + attn_dropout = self.config.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_layer.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.query_key_value.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_layer = query_layer.to(target_dtype) + key_layer = key_layer.to(target_dtype) + value_layer = value_layer.to(target_dtype) + + attn_output = _flash_attention_forward( + query_layer, + key_layer, + value_layer, + attention_mask, + query_length, + position_ids=position_ids, + dropout=attn_dropout, + is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + attn_output = self.dense(attn_weights) + + if not output_attentions: + attn_weights = None + + return attn_output, layer_past, attn_weights + + +class FalconMLP(nn.Module): + def __init__(self, config: FalconConfig): + super().__init__() + hidden_size = config.hidden_size + + self.dense_h_to_4h = FalconLinear(hidden_size, config.ffn_hidden_size, bias=config.bias) + self.act = get_activation(config.activation) + self.dense_4h_to_h = FalconLinear(config.ffn_hidden_size, hidden_size, bias=config.bias) + self.hidden_dropout = config.hidden_dropout + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.act(self.dense_h_to_4h(x)) + x = self.dense_4h_to_h(x) + return x + + +FALCON_ATTENTION_CLASSES = { + "eager": FalconAttention, + "sdpa": FalconAttention, # FalconAttention originally implemented both a forward with & without SDPA + "flash_attention_2": FalconFlashAttention2, +} + + +class FalconDecoderLayer(nn.Module): + def __init__(self, config: FalconConfig, layer_idx=None): + super().__init__() + hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.self_attention = FALCON_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.mlp = FalconMLP(config) + self.hidden_dropout = config.hidden_dropout + self.config = config + + if config.num_ln_in_parallel_attn is None and config.new_decoder_architecture: + config.num_ln_in_parallel_attn = 2 + + if not config.parallel_attn: + self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + else: + if config.num_ln_in_parallel_attn == 2: + # The layer norm before self-attention + self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + # The layer norm before the MLP + self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + else: + self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + def forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Union[Cache, Tuple[torch.Tensor, torch.Tensor]]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs, + ): + residual = hidden_states + + if self.config.new_decoder_architecture and self.config.num_ln_in_parallel_attn == 2: + attention_layernorm_out = self.ln_attn(hidden_states) + mlp_layernorm_out = self.ln_mlp(hidden_states) + else: + attention_layernorm_out = self.input_layernorm(hidden_states) + + # Self attention. + attn_outputs = self.self_attention( + attention_layernorm_out, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + attention_output = attn_outputs[0] + + if not self.config.new_decoder_architecture: + if self.config.parallel_attn: + mlp_layernorm_out = attention_layernorm_out + else: + residual = dropout_add( + attention_output, residual, self.config.attention_dropout, training=self.training + ) + mlp_layernorm_out = self.post_attention_layernorm(residual) + + if ( + self.config.new_decoder_architecture + and self.config.parallel_attn + and self.config.num_ln_in_parallel_attn == 1 + ): + mlp_layernorm_out = attention_layernorm_out + + outputs = attn_outputs[1:] + + # MLP. + mlp_output = self.mlp(mlp_layernorm_out) + + if self.config.new_decoder_architecture or self.config.parallel_attn: + mlp_output += attention_output + + output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, past_kv, attentions + + +FALCON_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`FalconConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +FALCON_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see + `past_key_values`). + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +class FalconPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = FalconConfig + base_model_prefix = "transformer" + supports_gradient_checkpointing = True + _no_split_modules = ["FalconDecoderLayer"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + if isinstance(module, nn.Linear) or isinstance(module, FalconLinear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa + @classmethod + def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> "PretrainedConfig": + _is_bettertransformer = getattr(cls, "use_bettertransformer", False) + if _is_bettertransformer: + return config + + if not hard_check_only: + config._attn_implementation = "sdpa" + return config + + +@add_start_docstrings( + "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.", + FALCON_START_DOCSTRING, +) +class FalconModel(FalconPreTrainedModel): + def __init__(self, config: FalconConfig): + super().__init__(config) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.use_alibi = config.alibi + + # Embedding + LN Embedding + self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) + + # Transformer blocks + self.h = nn.ModuleList([FalconDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_sdpa = config._attn_implementation == "sdpa" + + # Final Layer Norm + self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + self.rotary_emb = FalconRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, new_embeddings: torch.Tensor): + self.word_embeddings = new_embeddings + + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + # Compute alibi tensor: check build_alibi_tensor documentation + alibi = None + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + batch_size, seq_length, _ = inputs_embeds.shape + if self.use_alibi: + mask = ( + torch.ones( + (batch_size, seq_length + past_key_values_length), device=inputs_embeds.device, dtype=torch.long + ) + if attention_mask is None + else attention_mask + ) + alibi = build_alibi_tensor(mask, self.num_heads, dtype=inputs_embeds.dtype) + + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions, head_mask, alibi + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + next_decoder_cache = None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, block in enumerate(self.h): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + alibi, + causal_mask, + position_ids, + head_mask[i], + past_key_values, + use_cache, + output_attentions, + cache_position, + position_embeddings, + ) + else: + outputs = block( + hidden_states, + layer_past=past_key_values, + attention_mask=causal_mask, + position_ids=position_ids, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = outputs[0] + if use_cache is True: + next_decoder_cache = outputs[1] + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple( + v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + head_mask: torch.Tensor, + alibi: torch.Tensor, + ): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if ( + self.config._attn_implementation == "sdpa" + and not using_static_cache + and not output_attentions + and head_mask is None + and alibi is None + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + batch_size, sequence_length, _ = input_tensor.shape + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + # We take care to integrate alibi bias in the causal_mask here + if head_mask is None and alibi is not None: + alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) + causal_mask = torch.masked_fill( + alibi / math.sqrt(self.config.hidden_size // self.num_heads), + causal_mask < -1, + min_dtype, + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to place the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +@add_start_docstrings( + "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).", + FALCON_START_DOCSTRING, +) +class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: FalconConfig): + super().__init__(config) + self.transformer = FalconModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings: torch.Tensor): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = transformer_outputs[0] + + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + lm_logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + lm_logits, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def _reorder_cache( + self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. + """ + + # Get a copy of `beam_idx` on all the devices where we need those indices. + device_to_beam_idx = { + past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past + } + reordered_past = tuple( + ( + layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]), + layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]), + ) + for layer_past in past + ) + return reordered_past + + +@add_start_docstrings( + """ + The Falcon Model transformer with a sequence classification head on top (linear layer). + + [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + FALCON_START_DOCSTRING, +) +class FalconForSequenceClassification(FalconPreTrainedModel): + def __init__(self, config: FalconConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = FalconModel(config) + self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + Falcon Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + FALCON_START_DOCSTRING, +) +class FalconForTokenClassification(FalconPreTrainedModel): + def __init__(self, config: FalconConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = FalconModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + The Falcon Model transformer with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + FALCON_START_DOCSTRING, +) +class FalconForQuestionAnswering(FalconPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.transformer = FalconModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "FalconForCausalLM", + "FalconModel", + "FalconPreTrainedModel", + "FalconForSequenceClassification", + "FalconForTokenClassification", + "FalconForQuestionAnswering", +] diff --git a/docs/transformers/build/lib/transformers/models/falcon_mamba/__init__.py b/docs/transformers/build/lib/transformers/models/falcon_mamba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..202147c938465dd7dfcb7e79ecbeeb93ce632dbf --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/falcon_mamba/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_falcon_mamba import * + from .modeling_falcon_mamba import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/docs/transformers/build/lib/transformers/models/falcon_mamba/configuration_falcon_mamba.py new file mode 100644 index 0000000000000000000000000000000000000000..4099920f4028909c1ae69bb5bc0b6e33a3881612 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/falcon_mamba/configuration_falcon_mamba.py @@ -0,0 +1,162 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FALCONMAMBA configuration""" + +import math + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FalconMambaConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`FalconMambaModel`]. It is used to instantiate a FALCON_MAMBA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the FALCON_MAMBA + [tiiuae/falcon-mamba-7b](https://huggingface.co/tiiuae/falcon-mamba-7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50280): + Vocabulary size of the FALCON_MAMBA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`FalconMambaModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the embeddings and hidden states. + state_size (`int`, *optional*, defaults to 16): shape of the state space latents. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the model. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon to use in the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 0): + The id of the beginning of sentence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 0): + The id of the end of sentence token in the vocabulary. + expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size. + conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel. + use_bias (`bool`, *optional*, defaults to `False`): + Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block + use_conv_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use bias in the convolution layer of the mixer block. + hidden_act (`str`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.1): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + residual_in_fp32 (`bool`, *optional*, defaults to `True`): + Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model + time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): + Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` + time_step_scale (`float`, *optional*, defaults to 1.0): + Scale used used to scale `dt_proj.bias`. + time_step_min (`float`, *optional*, defaults to 0.001): + Minimum `time_step` used to bound `dt_proj.bias`. + time_step_max (`float`, *optional*, defaults to 0.1): + Maximum `time_step` used to bound `dt_proj.bias`. + time_step_init_scheme (`float`, *optional*, defaults to `"random"`): + Init scheme used for `dt_proj.weight`. Should be one of `["random","uniform"]` + time_step_floor (`float`, *optional*, defaults to 0.0001): + Minimum clamping value of the `dt_proj.bias` layer initialization. + rescale_prenorm_residual (`bool`, *optional*, defaults to `False`): + Whether or not to rescale `out_proj` weights when initializing. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the cache should be used. + use_mambapy (`bool`, *optional*, defaults to `False`): + Determines the fallback strategy during training if the CUDA-based official implementation of FalconMamba is not available. If `True`, the falcon_mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. + mixer_rms_eps (`float`, *optional*, defaults to 1e-06): + The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states. + Example: + + ```python + >>> from transformers import FalconMambaConfig, FalconMambaModel + + >>> # Initializing a FalconMamba configuration + >>> configuration = FalconMambaConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = FalconMambaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "falcon_mamba" + + def __init__( + self, + vocab_size=50280, + hidden_size=768, + state_size=16, + num_hidden_layers=32, + layer_norm_epsilon=1e-5, + pad_token_id=0, + bos_token_id=0, + eos_token_id=0, + expand=2, + conv_kernel=4, + use_bias=False, + use_conv_bias=True, + hidden_act="silu", + initializer_range=0.1, + residual_in_fp32=True, + time_step_rank="auto", + time_step_scale=1.0, + time_step_min=0.001, + time_step_max=0.1, + time_step_init_scheme="random", + time_step_floor=1e-4, + rescale_prenorm_residual=False, + use_cache=True, + use_mambapy=False, + mixer_rms_eps=1e-6, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.state_size = state_size + self.num_hidden_layers = num_hidden_layers + self.layer_norm_epsilon = layer_norm_epsilon + self.conv_kernel = conv_kernel + self.expand = expand + self.intermediate_size = int(expand * self.hidden_size) + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.use_bias = use_bias + self.use_conv_bias = use_conv_bias + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank + self.time_step_scale = time_step_scale + self.time_step_min = time_step_min + self.time_step_max = time_step_max + self.time_step_init_scheme = time_step_init_scheme + self.time_step_floor = time_step_floor + self.rescale_prenorm_residual = rescale_prenorm_residual + self.residual_in_fp32 = residual_in_fp32 + self.use_cache = use_cache + self.use_mambapy = use_mambapy + self.mixer_rms_eps = mixer_rms_eps + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) + + +__all__ = ["FalconMambaConfig"] diff --git a/docs/transformers/build/lib/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/docs/transformers/build/lib/transformers/models/falcon_mamba/modeling_falcon_mamba.py new file mode 100644 index 0000000000000000000000000000000000000000..bf8695d2a9820b9e648689a2a1b6cd5f51998fad --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -0,0 +1,873 @@ +# coding=utf-8 +# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch FALCONMAMBA model.""" + +import math +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import MambaCache +from ...generation import GenerationMixin +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available +from .configuration_falcon_mamba import FalconMambaConfig + + +logger = logging.get_logger(__name__) + +if is_mambapy_available(): + from mambapy.pscan import pscan +else: + pscan = None + +if is_mamba_ssm_available(): + from mamba_ssm.ops.selective_scan_interface import selective_scan_fn + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + + from ...kernels.falcon_mamba import mamba_inner_fn +else: + selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + +is_fast_path_available = all( + (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) +) + +_CHECKPOINT_FOR_DOC = "tiiuae/falcon-mamba-7b" +_CONFIG_FOR_DOC = "FalconMambaConfig" + + +def rms_forward(hidden_states, variance_epsilon=1e-6): + """ + Calculates simple RMSNorm with no learnable weights. `MambaRMSNorm` will + leverage this in order to multiply the final result with the RMSNorm weight + + Args: + hidden_states (`torch.Tensor`): + Hidden states to normalize + variance_epsilon (`float`): + The eps value to add in the square root scaling factor + """ + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon) + return hidden_states.to(input_dtype) + + +class FalconMambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see FalconMamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between FalconMamba and the linear time invariant S4, + and is why FalconMamba is called **selective** state spaces) + """ + + def __init__(self, config: FalconMambaConfig, layer_idx: int): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.ssm_state_size = config.state_size + self.conv_kernel_size = config.conv_kernel + self.intermediate_size = config.intermediate_size + self.time_step_rank = int(config.time_step_rank) + self.layer_idx = layer_idx + self.use_conv_bias = config.use_conv_bias + self.conv1d = nn.Conv1d( + in_channels=self.intermediate_size, + out_channels=self.intermediate_size, + bias=config.use_conv_bias, + kernel_size=config.conv_kernel, + groups=self.intermediate_size, + padding=config.conv_kernel - 1, + ) + + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + + self.use_mambapy = config.use_mambapy + + # projection of the input hidden states + self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias) + # selective projection used to make dt, B and C input dependent + self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False) + # time step projection (discretization) + self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :] + A = A.expand(self.intermediate_size, -1).contiguous() + + self.A_log = nn.Parameter(torch.log(A)) + self.D = nn.Parameter(torch.ones(self.intermediate_size)) + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) + self.use_bias = config.use_bias + + # Triton expects to pass RMS weights even if they are non learnable, thus we need to create these weights here + self.register_buffer( + "b_c_rms", torch.nn.Parameter(torch.ones(self.ssm_state_size), requires_grad=False), persistent=False + ) + self.register_buffer( + "dt_rms", torch.nn.Parameter(torch.ones(self.intermediate_size), requires_grad=False), persistent=False + ) + self.rms_eps = config.mixer_rms_eps + + if not is_fast_path_available: + if self.use_mambapy: + if is_mambapy_available(): + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + raise ImportError( + "use_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py." + ) + else: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py." + ) + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[MambaCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states).transpose(1, 2) + + if self.training and cache_params is None: # Doesn't support outputting the states -> used for training + contextualized_states = mamba_inner_fn( + projected_states, + self.conv1d.weight, + self.conv1d.bias if self.use_conv_bias else None, + self.x_proj.weight, + self.dt_proj.weight, + self.out_proj.weight, + self.out_proj.bias.float() if self.use_bias else None, + -torch.exp(self.A_log.float()), + None, # input-dependent B + None, # input-dependent C + self.D.float(), + delta_bias=self.dt_proj.bias.float(), + delta_softplus=True, + b_rms_weight=self.b_c_rms, + c_rms_weight=self.b_c_rms, + dt_rms_weight=self.dt_rms, + b_c_dt_rms_eps=self.rms_eps, + ) + + else: + hidden_states, gate = projected_states.chunk(2, dim=1) + + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 2. Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) + if cache_params is not None and cache_position[0] > 0: + hidden_states = causal_conv1d_update( + hidden_states.squeeze(-1), + cache_params.conv_states[self.layer_idx], + conv_weights, + self.conv1d.bias, + self.activation, + ) + hidden_states = hidden_states.unsqueeze(-1) + else: + if cache_params is not None: + conv_states = nn.functional.pad( + hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0) + ) + cache_params.update_conv_state(self.layer_idx, conv_states, cache_position) + hidden_states = causal_conv1d_fn( + hidden_states, conv_weights, self.conv1d.bias, activation=self.activation + ) + + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 3. State Space Model sequence transformation + # 3.a. input varying initialization of time_step, B and C + ssm_parameters = self.x_proj(hidden_states.transpose(1, 2)) + time_step, B, C = torch.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1 + ) + + B = rms_forward(B, variance_epsilon=self.rms_eps) + C = rms_forward(C, variance_epsilon=self.rms_eps) + time_step = rms_forward(time_step, variance_epsilon=self.rms_eps) + + # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module + # at the price of a small overhead. + if hasattr(self.config, "_pre_quantization_dtype"): + discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2) + else: + discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2) + + A = -torch.exp(self.A_log.float()) + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = self.dt_proj.bias.float() if hasattr(self.dt_proj, "bias") else None + if cache_params is not None and cache_position[0] > 0: + scan_outputs = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states[..., 0], + discrete_time_step[..., 0], + A, + B[:, 0], + C[:, 0], + self.D, + gate[..., 0], + time_proj_bias, + dt_softplus=True, + ).unsqueeze(-1) + else: + scan_outputs, ssm_state = selective_scan_fn( + hidden_states, + discrete_time_step, + A, + B.transpose(1, 2), + C.transpose(1, 2), + self.D.float(), + gate, + time_proj_bias, + delta_softplus=True, + return_last_state=True, + ) + if ssm_state is not None and cache_params is not None: + cache_params.update_ssm_state(self.layer_idx, ssm_state) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose(1, 2)) + return contextualized_states + + def slow_forward( + self, + input_states, + cache_params: Optional[MambaCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + # 1. Gated MLP's linear projection + projected_states = self.in_proj(input_states).transpose(1, 2) # [batch, 2 * intermediate_size, seq_len] + hidden_states, gate = projected_states.chunk(2, dim=1) + + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 2. Convolution sequence transformation + if cache_params is not None: + ssm_state = cache_params.ssm_states[self.layer_idx].clone() + ssm_state = ssm_state.to(hidden_states.device) + # use `cache_position.shape[0]` to check whether we are in prefill + # stage, it's equivalent to check `cache_position[0] == 0`, which + # breaks dynamo fullgraph constraints + if cache_position is not None and cache_position.shape[0] == self.conv_kernel_size: + conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)) + + cache_params.update_conv_state(self.layer_idx, conv_state, cache_position) + hidden_states = self.act( + self.conv1d(hidden_states)[..., :seq_len] + ) # [batch, intermediate_size, seq_len] + else: + conv_state = cache_params.update_conv_state(self.layer_idx, hidden_states, cache_position) + conv_state = conv_state.to(self.conv1d.weight.device) + hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1) + if self.use_conv_bias: + hidden_states += self.conv1d.bias + hidden_states = ( + self.act(hidden_states).to(dtype).unsqueeze(-1) + ) # [batch, intermediate_size, 1] : decoding + else: + ssm_state = torch.zeros( + (batch_size, self.intermediate_size, self.ssm_state_size), device=hidden_states.device, dtype=dtype + ) + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len] + + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 3. State Space Model sequence transformation + # 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2] + ssm_parameters = self.x_proj(hidden_states.transpose(1, 2)) + time_step, B, C = torch.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1 + ) + + B = rms_forward(B, variance_epsilon=self.rms_eps) + C = rms_forward(C, variance_epsilon=self.rms_eps) + time_step = rms_forward(time_step, variance_epsilon=self.rms_eps) + + discrete_time_step = self.dt_proj(time_step) # [batch, seq_len, intermediate_size] + discrete_time_step = nn.functional.softplus(discrete_time_step).transpose( + 1, 2 + ) # [batch, intermediate_size, seq_len] + + # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM) + A = -torch.exp(self.A_log.float()) # [intermediate_size, ssm_state_size] + discrete_A = torch.exp( + A[None, :, None, :] * discrete_time_step[:, :, :, None] + ) # [batch, intermediate_size, seq_len, ssm_state_size] + discrete_B = ( + discrete_time_step[:, :, :, None] * B[:, None, :, :].float() + ) # [batch, intermediate_size, seq_len, ssm_state_size] + deltaB_u = discrete_B * hidden_states[:, :, :, None].float() + + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + if self.use_mambapy and self.training and cache_params is None: + hs = pscan( + discrete_A.transpose(1, 2), deltaB_u.transpose(1, 2) + ) # [batch, seq_len, intermediate_size, ssm_state_size] + scan_output = (hs @ C.unsqueeze(-1)).squeeze(3).transpose(1, 2) # [batch, intermediate_size, seq_len] + scan_output = scan_output + hidden_states * self.D[None, :, None] + scan_output = scan_output * self.act(gate) + else: + scan_outputs = [] + for i in range(seq_len): + ssm_state = ( + discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] + ) # [batch, intermediate_size, ssm_state] + scan_output = torch.matmul( + ssm_state.to(dtype), C[:, i, :].unsqueeze(-1) + ) # [batch, intermediate_size, 1] + scan_outputs.append(scan_output[:, :, 0]) + scan_output = torch.stack(scan_outputs, dim=-1) # [batch, intermediate_size, seq_len] + scan_output = scan_output + (hidden_states * self.D[None, :, None]) + scan_output = scan_output * self.act(gate) + + if cache_params is not None: + cache_params.update_ssm_state(self.layer_idx, ssm_state) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.transpose(1, 2)) # [batch, seq_len, hidden_size] + return contextualized_states + + # Copied from transformers.models.mamba.modeling_mamba.MambaMixer.forward + def forward( + self, + hidden_states, + cache_params: Optional[MambaCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + if is_fast_path_available and "cuda" in self.x_proj.weight.device.type and not torch._dynamo.is_compiling(): + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask) + + +# Copied from transformers.models.mamba.modeling_mamba.MambaRMSNorm with Mamba->FalconMamba +class FalconMambaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + FalconMambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def extra_repr(self): + return f"{self.weight.shape[0]}, eps={self.variance_epsilon}" + + # Ignore copy + def forward(self, hidden_states): + return self.weight.to(hidden_states.device) * rms_forward( + hidden_states, variance_epsilon=self.variance_epsilon + ) + + +# Copied from transformers.models.mamba.modeling_mamba.MambaBlock with Mamba->FalconMamba,FalconMambaCache->MambaCache +class FalconMambaBlock(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.residual_in_fp32 = config.residual_in_fp32 + self.norm = FalconMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.mixer = FalconMambaMixer(config, layer_idx=layer_idx) + + def forward( + self, + hidden_states, + cache_params: Optional[MambaCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + residual = hidden_states + hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + + hidden_states = self.mixer( + hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask + ) + hidden_states = residual + hidden_states + return hidden_states + + +# Copied from transformers.models.mamba.modeling_mamba.MambaPreTrainedModel with Mamba->FalconMamba +class FalconMambaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = FalconMambaConfig + base_model_prefix = "backbone" + _no_split_modules = ["FalconMambaBlock", "FalconMambaMixer"] + supports_gradient_checkpointing = True + _is_stateful = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, FalconMambaMixer): + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale + if self.config.time_step_init_scheme == "constant": + nn.init.constant_(module.dt_proj.weight, dt_init_std) + elif self.config.time_step_init_scheme == "random": + nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std) + + dt = torch.exp( + torch.rand(self.config.intermediate_size) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + module.dt_proj.bias.copy_(inv_dt) + module.dt_proj.bias._no_reinit = True + + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=self.config.initializer_range) + + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(self.config.num_hidden_layers) + + +@dataclass +# Copied from transformers.models.mamba.modeling_mamba.MambaOutput with MAMBA->FALCONMAMBA,Mamba->FalconMamba,FalconMambaCache->MambaCache +class FalconMambaOutput(ModelOutput): + """ + Class for the FALCONMAMBA model outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + cache_params (`MambaCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + cache_params: Optional[MambaCache] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.mamba.modeling_mamba.MambaCausalLMOutput with Mamba->FalconMamba,FalconMambaCache->MambaCache +class FalconMambaCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + cache_params (`MambaCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + cache_params: Optional[MambaCache] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +FALCONMAMBA_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`FalconMambaConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +FALCONMAMBA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + Indices of input sequence tokens in the vocabulary. + + If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + cache_params (`MambaCache`, *optional*): + If passed along, the model uses the previous state in all the blocks (which will give the output for the + `input_ids` provided as if the model add `state_input_ids + input_ids` as context). + use_cache (`bool`, *optional*): + If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare FALCONMAMBA Model transformer outputting raw hidden-states without any specific head on top.", + FALCONMAMBA_START_DOCSTRING, +) +class FalconMambaModel(FalconMambaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList( + [FalconMambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)] + ) + + self.gradient_checkpointing = False + self.norm_f = FalconMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + @add_start_docstrings_to_model_forward(FALCONMAMBA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=FalconMambaOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + cache_params: Optional[MambaCache] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, FalconMambaOutput]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if self.gradient_checkpointing and self.training and use_cache: + use_cache = False + + if use_cache: + if cache_params is None: + cache_params = MambaCache( + self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype + ) + cache_position = torch.arange(0, self.config.conv_kernel, device=inputs_embeds.device) + elif cache_position is None: + # cases when we do manual forward instead of using `model.generate` which will initiate + # `cache_position` and makes sure it is not None, throw error here instead of doing some + # hack to conjecture the current cache position + raise ValueError( + "You have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, " + "you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will " + "be initialized for you automatically" + ) + else: + cache_params = None + hidden_states = inputs_embeds + all_hidden_states = () if output_hidden_states else None + for mixer_block in self.layers: + if self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + mixer_block.__call__, hidden_states, cache_params, cache_position, attention_mask + ) + else: + hidden_states = mixer_block( + hidden_states, + cache_params=cache_params, + cache_position=cache_position, + attention_mask=attention_mask, + ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) + + return FalconMambaOutput( + last_hidden_state=hidden_states, + cache_params=cache_params if use_cache else None, + hidden_states=all_hidden_states, + ) + + +@add_start_docstrings( + """ + The FALCONMAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + FALCONMAMBA_START_DOCSTRING, +) +# Copied from transformers.models.mamba.modeling_mamba.MambaForCausalLM with MAMBA->FALCONMAMBA,Mamba->FalconMamba,mamba->falcon_mamba,FalconMambaCache->MambaCache +class FalconMambaForCausalLM(FalconMambaPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.backbone = FalconMambaModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_input_embeddings(self): + return self.backbone.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + return self.backbone.set_input_embeddings(new_embeddings) + + def _update_model_kwargs_for_generation( + self, outputs: ModelOutput, model_kwargs: Dict[str, Any], num_new_tokens: int = 1, **kwargs + ) -> Dict[str, Any]: + model_kwargs["cache_params"] = outputs.get("cache_params", None) + if ( + model_kwargs.get("use_cache", True) + and "cache_position" in model_kwargs + and model_kwargs["cache_position"] is not None + ): + model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens + + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids, + inputs_embeds=None, + use_cache=None, + cache_params: Optional[MambaCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + **kwargs, + ): + # Overwritten -- uses `cache_params` as opposed to `past_key_values` + + if use_cache: + # `cache_position` should have been initialized in `generate` + if cache_position is None: + raise ValueError( + "`cache_position` should not be None as it should have been initialized in " + "`model.generate`, you are responsible for passing in a valid `cache_position` if " + "you are calling `prepare_inputs_for_generation` directly with `use_cache=True`" + ) + if cache_position[0] > 0: + input_ids = input_ids[:, -1].unsqueeze(-1) + + if attention_mask is not None: + attention_mask = None + + else: + # we initialize the `cache_position` to full size of `conv_states` at prefill stage + # considering padding will be applied when input length is shorter, and truncation + # will be applied when it is longer, so it will be equivalent to always have it match + # the length of `cache_params.conv_states`, which is `config.conv_kernel` + cache_position = torch.arange(0, self.config.conv_kernel, device=input_ids.device) + + if inputs_embeds is not None and cache_params is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} + + model_inputs.update( + { + "cache_params": cache_params, + "use_cache": use_cache, + "cache_position": cache_position, + "attention_mask": attention_mask, + } + ) + return model_inputs + + @add_start_docstrings_to_model_forward(FALCONMAMBA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=FalconMambaCausalLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_params: Optional[MambaCache] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, # for now we need this for generation + ) -> Union[Tuple, FalconMambaCausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + falcon_mamba_outputs = self.backbone( + input_ids, + cache_params=cache_params, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + use_cache=use_cache, + cache_position=cache_position, + attention_mask=attention_mask, + ) + hidden_states = falcon_mamba_outputs[0] + + logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + falcon_mamba_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return FalconMambaCausalLMOutput( + loss=loss, + logits=logits, + cache_params=falcon_mamba_outputs.cache_params, + hidden_states=falcon_mamba_outputs.hidden_states, + ) + + +__all__ = ["FalconMambaForCausalLM", "FalconMambaModel", "FalconMambaPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/__init__.py b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..44d1ec7236310774ed6b1379683c144d7f93ecce --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_fastspeech2_conformer import * + from .modeling_fastspeech2_conformer import * + from .tokenization_fastspeech2_conformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e9a061ffd3733c831190b9efb14316987674b853 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -0,0 +1,480 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FastSpeech2Conformer model configuration""" + +from typing import Dict + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FastSpeech2ConformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to + instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 384): + The dimensionality of the hidden layers. + vocab_size (`int`, *optional*, defaults to 78): + The size of the vocabulary. + num_mel_bins (`int`, *optional*, defaults to 80): + The number of mel filters used in the filter bank. + encoder_num_attention_heads (`int`, *optional*, defaults to 2): + The number of attention heads in the encoder. + encoder_layers (`int`, *optional*, defaults to 4): + The number of layers in the encoder. + encoder_linear_units (`int`, *optional*, defaults to 1536): + The number of units in the linear layer of the encoder. + decoder_layers (`int`, *optional*, defaults to 4): + The number of layers in the decoder. + decoder_num_attention_heads (`int`, *optional*, defaults to 2): + The number of attention heads in the decoder. + decoder_linear_units (`int`, *optional*, defaults to 1536): + The number of units in the linear layer of the decoder. + speech_decoder_postnet_layers (`int`, *optional*, defaults to 5): + The number of layers in the post-net of the speech decoder. + speech_decoder_postnet_units (`int`, *optional*, defaults to 256): + The number of units in the post-net layers of the speech decoder. + speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5): + The kernel size in the post-net of the speech decoder. + positionwise_conv_kernel_size (`int`, *optional*, defaults to 3): + The size of the convolution kernel used in the position-wise layer. + encoder_normalize_before (`bool`, *optional*, defaults to `False`): + Specifies whether to normalize before encoder layers. + decoder_normalize_before (`bool`, *optional*, defaults to `False`): + Specifies whether to normalize before decoder layers. + encoder_concat_after (`bool`, *optional*, defaults to `False`): + Specifies whether to concatenate after encoder layers. + decoder_concat_after (`bool`, *optional*, defaults to `False`): + Specifies whether to concatenate after decoder layers. + reduction_factor (`int`, *optional*, defaults to 1): + The factor by which the speech frame rate is reduced. + speaking_speed (`float`, *optional*, defaults to 1.0): + The speed of the speech produced. + use_macaron_style_in_conformer (`bool`, *optional*, defaults to `True`): + Specifies whether to use macaron style in the conformer. + use_cnn_in_conformer (`bool`, *optional*, defaults to `True`): + Specifies whether to use convolutional neural networks in the conformer. + encoder_kernel_size (`int`, *optional*, defaults to 7): + The kernel size used in the encoder. + decoder_kernel_size (`int`, *optional*, defaults to 31): + The kernel size used in the decoder. + duration_predictor_layers (`int`, *optional*, defaults to 2): + The number of layers in the duration predictor. + duration_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the duration predictor. + duration_predictor_kernel_size (`int`, *optional*, defaults to 3): + The kernel size used in the duration predictor. + energy_predictor_layers (`int`, *optional*, defaults to 2): + The number of layers in the energy predictor. + energy_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the energy predictor. + energy_predictor_kernel_size (`int`, *optional*, defaults to 3): + The kernel size used in the energy predictor. + energy_predictor_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the energy predictor. + energy_embed_kernel_size (`int`, *optional*, defaults to 1): + The kernel size used in the energy embed layer. + energy_embed_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate in the energy embed layer. + stop_gradient_from_energy_predictor (`bool`, *optional*, defaults to `False`): + Specifies whether to stop gradients from the energy predictor. + pitch_predictor_layers (`int`, *optional*, defaults to 5): + The number of layers in the pitch predictor. + pitch_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the pitch predictor. + pitch_predictor_kernel_size (`int`, *optional*, defaults to 5): + The kernel size used in the pitch predictor. + pitch_predictor_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the pitch predictor. + pitch_embed_kernel_size (`int`, *optional*, defaults to 1): + The kernel size used in the pitch embed layer. + pitch_embed_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate in the pitch embed layer. + stop_gradient_from_pitch_predictor (`bool`, *optional*, defaults to `True`): + Specifies whether to stop gradients from the pitch predictor. + encoder_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the encoder. + encoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2): + The positional dropout rate in the encoder. + encoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2): + The attention dropout rate in the encoder. + decoder_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the decoder. + decoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2): + The positional dropout rate in the decoder. + decoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2): + The attention dropout rate in the decoder. + duration_predictor_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the duration predictor. + speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the speech decoder postnet. + max_source_positions (`int`, *optional*, defaults to 5000): + if `"relative"` position embeddings are used, defines the maximum source input positions. + use_masking (`bool`, *optional*, defaults to `True`): + Specifies whether to use masking in the model. + use_weighted_masking (`bool`, *optional*, defaults to `False`): + Specifies whether to use weighted masking in the model. + num_speakers (`int`, *optional*): + Number of speakers. If set to > 1, assume that the speaker ids will be provided as the input and use + speaker id embedding layer. + num_languages (`int`, *optional*): + Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the + languge id embedding layer. + speaker_embed_dim (`int`, *optional*): + Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Specifies whether the model is an encoder-decoder. + + Example: + + ```python + >>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig + + >>> # Initializing a FastSpeech2Conformer style configuration + >>> configuration = FastSpeech2ConformerConfig() + + >>> # Initializing a model from the FastSpeech2Conformer style configuration + >>> model = FastSpeech2ConformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "fastspeech2_conformer" + base_config_key = "model_config" + attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} + + def __init__( + self, + hidden_size=384, + vocab_size=78, + num_mel_bins=80, + encoder_num_attention_heads=2, + encoder_layers=4, + encoder_linear_units=1536, + decoder_layers=4, + decoder_num_attention_heads=2, + decoder_linear_units=1536, + speech_decoder_postnet_layers=5, + speech_decoder_postnet_units=256, + speech_decoder_postnet_kernel=5, + positionwise_conv_kernel_size=3, + encoder_normalize_before=False, + decoder_normalize_before=False, + encoder_concat_after=False, + decoder_concat_after=False, + reduction_factor=1, + speaking_speed=1.0, + use_macaron_style_in_conformer=True, + use_cnn_in_conformer=True, + encoder_kernel_size=7, + decoder_kernel_size=31, + duration_predictor_layers=2, + duration_predictor_channels=256, + duration_predictor_kernel_size=3, + energy_predictor_layers=2, + energy_predictor_channels=256, + energy_predictor_kernel_size=3, + energy_predictor_dropout=0.5, + energy_embed_kernel_size=1, + energy_embed_dropout=0.0, + stop_gradient_from_energy_predictor=False, + pitch_predictor_layers=5, + pitch_predictor_channels=256, + pitch_predictor_kernel_size=5, + pitch_predictor_dropout=0.5, + pitch_embed_kernel_size=1, + pitch_embed_dropout=0.0, + stop_gradient_from_pitch_predictor=True, + encoder_dropout_rate=0.2, + encoder_positional_dropout_rate=0.2, + encoder_attention_dropout_rate=0.2, + decoder_dropout_rate=0.2, + decoder_positional_dropout_rate=0.2, + decoder_attention_dropout_rate=0.2, + duration_predictor_dropout_rate=0.2, + speech_decoder_postnet_dropout=0.5, + max_source_positions=5000, + use_masking=True, + use_weighted_masking=False, + num_speakers=None, + num_languages=None, + speaker_embed_dim=None, + is_encoder_decoder=True, + **kwargs, + ): + if positionwise_conv_kernel_size % 2 == 0: + raise ValueError( + f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead." + ) + if encoder_kernel_size % 2 == 0: + raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.") + if decoder_kernel_size % 2 == 0: + raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.") + if duration_predictor_kernel_size % 2 == 0: + raise ValueError( + f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead." + ) + if energy_predictor_kernel_size % 2 == 0: + raise ValueError( + f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead." + ) + if energy_embed_kernel_size % 2 == 0: + raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.") + if pitch_predictor_kernel_size % 2 == 0: + raise ValueError( + f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead." + ) + if pitch_embed_kernel_size % 2 == 0: + raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.") + if hidden_size % encoder_num_attention_heads != 0: + raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.") + if hidden_size % decoder_num_attention_heads != 0: + raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.") + if use_masking and use_weighted_masking: + raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.") + + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.num_mel_bins = num_mel_bins + self.encoder_config = { + "num_attention_heads": encoder_num_attention_heads, + "layers": encoder_layers, + "kernel_size": encoder_kernel_size, + "attention_dropout_rate": encoder_attention_dropout_rate, + "dropout_rate": encoder_dropout_rate, + "positional_dropout_rate": encoder_positional_dropout_rate, + "linear_units": encoder_linear_units, + "normalize_before": encoder_normalize_before, + "concat_after": encoder_concat_after, + } + self.decoder_config = { + "num_attention_heads": decoder_num_attention_heads, + "layers": decoder_layers, + "kernel_size": decoder_kernel_size, + "attention_dropout_rate": decoder_attention_dropout_rate, + "dropout_rate": decoder_dropout_rate, + "positional_dropout_rate": decoder_positional_dropout_rate, + "linear_units": decoder_linear_units, + "normalize_before": decoder_normalize_before, + "concat_after": decoder_concat_after, + } + self.encoder_num_attention_heads = encoder_num_attention_heads + self.encoder_layers = encoder_layers + self.duration_predictor_channels = duration_predictor_channels + self.duration_predictor_kernel_size = duration_predictor_kernel_size + self.duration_predictor_layers = duration_predictor_layers + self.energy_embed_dropout = energy_embed_dropout + self.energy_embed_kernel_size = energy_embed_kernel_size + self.energy_predictor_channels = energy_predictor_channels + self.energy_predictor_dropout = energy_predictor_dropout + self.energy_predictor_kernel_size = energy_predictor_kernel_size + self.energy_predictor_layers = energy_predictor_layers + self.pitch_embed_dropout = pitch_embed_dropout + self.pitch_embed_kernel_size = pitch_embed_kernel_size + self.pitch_predictor_channels = pitch_predictor_channels + self.pitch_predictor_dropout = pitch_predictor_dropout + self.pitch_predictor_kernel_size = pitch_predictor_kernel_size + self.pitch_predictor_layers = pitch_predictor_layers + self.positionwise_conv_kernel_size = positionwise_conv_kernel_size + self.speech_decoder_postnet_units = speech_decoder_postnet_units + self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout + self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel + self.speech_decoder_postnet_layers = speech_decoder_postnet_layers + self.reduction_factor = reduction_factor + self.speaking_speed = speaking_speed + self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor + self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor + self.max_source_positions = max_source_positions + self.use_cnn_in_conformer = use_cnn_in_conformer + self.use_macaron_style_in_conformer = use_macaron_style_in_conformer + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + self.num_speakers = num_speakers + self.num_languages = num_languages + self.speaker_embed_dim = speaker_embed_dim + self.duration_predictor_dropout_rate = duration_predictor_dropout_rate + self.is_encoder_decoder = is_encoder_decoder + + super().__init__( + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + +class FastSpeech2ConformerHifiGanConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGanModel`]. It is used to + instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2Conformer + [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_in_dim (`int`, *optional*, defaults to 80): + The number of frequency bins in the input log-mel spectrogram. + upsample_initial_channel (`int`, *optional*, defaults to 512): + The number of input channels into the upsampling network. + upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`): + A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The + length of *upsample_rates* defines the number of convolutional layers and has to match the length of + *upsample_kernel_sizes*. + upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The + length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of + *upsample_rates*. + resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`): + A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field + fusion (MRF) module. + resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`): + A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the + multi-receptive field fusion (MRF) module. + initializer_range (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + leaky_relu_slope (`float`, *optional*, defaults to 0.1): + The angle of the negative slope used by the leaky ReLU activation. + normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance. + + Example: + + ```python + >>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig + + >>> # Initializing a FastSpeech2ConformerHifiGan configuration + >>> configuration = FastSpeech2ConformerHifiGanConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = FastSpeech2ConformerHifiGan(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "hifigan" + base_config_key = "vocoder_config" + + def __init__( + self, + model_in_dim=80, + upsample_initial_channel=512, + upsample_rates=[8, 8, 2, 2], + upsample_kernel_sizes=[16, 16, 4, 4], + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + initializer_range=0.01, + leaky_relu_slope=0.1, + normalize_before=True, + **kwargs, + ): + self.model_in_dim = model_in_dim + self.upsample_initial_channel = upsample_initial_channel + self.upsample_rates = upsample_rates + self.upsample_kernel_sizes = upsample_kernel_sizes + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.initializer_range = initializer_range + self.leaky_relu_slope = leaky_relu_slope + self.normalize_before = normalize_before + super().__init__(**kwargs) + + +class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to + instantiate a `FastSpeech2ConformerWithHifiGanModel` model according to the specified sub-models configurations, + defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and + FastSpeech2ConformerHifiGan + [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_config (`typing.Dict`, *optional*): + Configuration of the text-to-speech model. + vocoder_config (`typing.Dict`, *optional*): + Configuration of the vocoder model. + model_config ([`FastSpeech2ConformerConfig`], *optional*): + Configuration of the text-to-speech model. + vocoder_config ([`FastSpeech2ConformerHiFiGanConfig`], *optional*): + Configuration of the vocoder model. + + Example: + + ```python + >>> from transformers import ( + ... FastSpeech2ConformerConfig, + ... FastSpeech2ConformerHifiGanConfig, + ... FastSpeech2ConformerWithHifiGanConfig, + ... FastSpeech2ConformerWithHifiGan, + ... ) + + >>> # Initializing FastSpeech2ConformerWithHifiGan sub-modules configurations. + >>> model_config = FastSpeech2ConformerConfig() + >>> vocoder_config = FastSpeech2ConformerHifiGanConfig() + + >>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration + >>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict()) + + >>> # Initializing a model (with random weights) + >>> model = FastSpeech2ConformerWithHifiGan(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "fastspeech2_conformer_with_hifigan" + sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig} + + def __init__( + self, + model_config: Dict = None, + vocoder_config: Dict = None, + **kwargs, + ): + if model_config is None: + model_config = {} + logger.info("model_config is None. initializing the model with default values.") + + if vocoder_config is None: + vocoder_config = {} + logger.info("vocoder_config is None. initializing the coarse model with default values.") + + self.model_config = FastSpeech2ConformerConfig(**model_config) + self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config) + + super().__init__(**kwargs) + + +__all__ = ["FastSpeech2ConformerConfig", "FastSpeech2ConformerHifiGanConfig", "FastSpeech2ConformerWithHifiGanConfig"] diff --git a/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5bb2d2e2e92481739184d30846c5ee1c987f40 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,210 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert FastSpeech2Conformer checkpoint.""" + +import argparse +import json +import re +from pathlib import Path +from tempfile import TemporaryDirectory + +import torch +import yaml + +from transformers import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerModel, + FastSpeech2ConformerTokenizer, + logging, +) + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + +CONFIG_MAPPING = { + "adim": "hidden_size", + "aheads": "num_attention_heads", + "conformer_dec_kernel_size": "decoder_kernel_size", + "conformer_enc_kernel_size": "encoder_kernel_size", + "decoder_normalize_before": "decoder_normalize_before", + "dlayers": "decoder_layers", + "dunits": "decoder_linear_units", + "duration_predictor_chans": "duration_predictor_channels", + "duration_predictor_kernel_size": "duration_predictor_kernel_size", + "duration_predictor_layers": "duration_predictor_layers", + "elayers": "encoder_layers", + "encoder_normalize_before": "encoder_normalize_before", + "energy_embed_dropout": "energy_embed_dropout", + "energy_embed_kernel_size": "energy_embed_kernel_size", + "energy_predictor_chans": "energy_predictor_channels", + "energy_predictor_dropout": "energy_predictor_dropout", + "energy_predictor_kernel_size": "energy_predictor_kernel_size", + "energy_predictor_layers": "energy_predictor_layers", + "eunits": "encoder_linear_units", + "pitch_embed_dropout": "pitch_embed_dropout", + "pitch_embed_kernel_size": "pitch_embed_kernel_size", + "pitch_predictor_chans": "pitch_predictor_channels", + "pitch_predictor_dropout": "pitch_predictor_dropout", + "pitch_predictor_kernel_size": "pitch_predictor_kernel_size", + "pitch_predictor_layers": "pitch_predictor_layers", + "positionwise_conv_kernel_size": "positionwise_conv_kernel_size", + "postnet_chans": "speech_decoder_postnet_units", + "postnet_filts": "speech_decoder_postnet_kernel", + "postnet_layers": "speech_decoder_postnet_layers", + "reduction_factor": "reduction_factor", + "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor", + "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor", + "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate", + "transformer_dec_dropout_rate": "decoder_dropout_rate", + "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate", + "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate", + "transformer_enc_dropout_rate": "encoder_dropout_rate", + "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate", + "use_cnn_in_conformer": "use_cnn_in_conformer", + "use_macaron_style_in_conformer": "use_macaron_style_in_conformer", + "use_masking": "use_masking", + "use_weighted_masking": "use_weighted_masking", + "idim": "input_dim", + "odim": "num_mel_bins", + "spk_embed_dim": "speaker_embed_dim", + "langs": "num_languages", + "spks": "num_speakers", +} + + +def remap_model_yaml_config(yaml_config_path): + with Path(yaml_config_path).open("r", encoding="utf-8") as f: + args = yaml.safe_load(f) + args = argparse.Namespace(**args) + + remapped_config = {} + + model_params = args.tts_conf["text2mel_params"] + # espnet_config_key -> hf_config_key, any keys not included are ignored + for espnet_config_key, hf_config_key in CONFIG_MAPPING.items(): + if espnet_config_key in model_params: + remapped_config[hf_config_key] = model_params[espnet_config_key] + + return remapped_config, args.g2p, args.token_list + + +def convert_espnet_state_dict_to_hf(state_dict): + new_state_dict = {} + for key in state_dict: + if "tts.generator.text2mel." in key: + new_key = key.replace("tts.generator.text2mel.", "") + if "postnet" in key: + new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers") + new_key = new_key.replace(".0.weight", ".conv.weight") + new_key = new_key.replace(".1.weight", ".batch_norm.weight") + new_key = new_key.replace(".1.bias", ".batch_norm.bias") + new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean") + new_key = new_key.replace(".1.running_var", ".batch_norm.running_var") + new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked") + if "feat_out" in key: + if "weight" in key: + new_key = "speech_decoder_postnet.feat_out.weight" + if "bias" in key: + new_key = "speech_decoder_postnet.feat_out.bias" + if "encoder.embed.0.weight" in key: + new_key = new_key.replace("0.", "") + if "w_1" in key: + new_key = new_key.replace("w_1", "conv1") + if "w_2" in key: + new_key = new_key.replace("w_2", "conv2") + if "predictor.conv" in key: + new_key = new_key.replace(".conv", ".conv_layers") + pattern = r"(\d)\.(\d)" + replacement = ( + r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm" + ) + new_key = re.sub(pattern, replacement, new_key) + if "pitch_embed" in key or "energy_embed" in key: + new_key = new_key.replace("0", "conv") + if "encoders" in key: + new_key = new_key.replace("encoders", "conformer_layers") + new_key = new_key.replace("norm_final", "final_layer_norm") + new_key = new_key.replace("norm_mha", "self_attn_layer_norm") + new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm") + new_key = new_key.replace("norm_ff", "ff_layer_norm") + new_key = new_key.replace("norm_conv", "conv_layer_norm") + if "lid_emb" in key: + new_key = new_key.replace("lid_emb", "language_id_embedding") + if "sid_emb" in key: + new_key = new_key.replace("sid_emb", "speaker_id_embedding") + + new_state_dict[new_key] = state_dict[key] + + return new_state_dict + + +@torch.no_grad() +def convert_FastSpeech2ConformerModel_checkpoint( + checkpoint_path, + yaml_config_path, + pytorch_dump_folder_path, + repo_id=None, +): + model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path) + config = FastSpeech2ConformerConfig(**model_params) + + # Prepare the model + model = FastSpeech2ConformerModel(config) + + espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) + hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) + + model.load_state_dict(hf_compatible_state_dict) + + model.save_pretrained(pytorch_dump_folder_path) + + # Prepare the tokenizer + with TemporaryDirectory() as tempdir: + vocab = {token: id for id, token in enumerate(vocab)} + vocab_file = Path(tempdir) / "vocab.json" + with open(vocab_file, "w") as f: + json.dump(vocab, f) + should_strip_spaces = "no_space" in tokenizer_name + tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces) + + tokenizer.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + model.push_to_hub(repo_id) + tokenizer.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument( + "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" + ) + parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + convert_FastSpeech2ConformerModel_checkpoint( + args.checkpoint_path, + args.yaml_config_path, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_hifigan.py b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..70aada84bd5b49a28bbdcee605dfa035a20a20b9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_hifigan.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert FastSpeech2Conformer HiFi-GAN checkpoint.""" + +import argparse +from pathlib import Path + +import torch +import yaml + +from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + + +def load_weights(checkpoint, hf_model, config): + vocoder_key_prefix = "tts.generator.vocoder." + checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k} + + hf_model.apply_weight_norm() + + hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"] + hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"] + hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"] + + for i in range(len(config.upsample_rates)): + hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"] + hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"] + hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"] + + for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)): + for j in range(len(config.resblock_dilation_sizes)): + hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"] + hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"] + hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"] + + hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"] + hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"] + hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"] + + hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"] + hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"] + hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"] + + hf_model.remove_weight_norm() + + +def remap_hifigan_yaml_config(yaml_config_path): + with Path(yaml_config_path).open("r", encoding="utf-8") as f: + args = yaml.safe_load(f) + args = argparse.Namespace(**args) + + vocoder_type = args.tts_conf["vocoder_type"] + if vocoder_type != "hifigan_generator": + raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}") + + remapped_dict = {} + vocoder_params = args.tts_conf["vocoder_params"] + + # espnet_config_key -> hf_config_key + key_mappings = { + "channels": "upsample_initial_channel", + "in_channels": "model_in_dim", + "resblock_dilations": "resblock_dilation_sizes", + "resblock_kernel_sizes": "resblock_kernel_sizes", + "upsample_kernel_sizes": "upsample_kernel_sizes", + "upsample_scales": "upsample_rates", + } + for espnet_config_key, hf_config_key in key_mappings.items(): + remapped_dict[hf_config_key] = vocoder_params[espnet_config_key] + remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"] + remapped_dict["normalize_before"] = False + remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"] + + return remapped_dict + + +@torch.no_grad() +def convert_hifigan_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + yaml_config_path=None, + repo_id=None, +): + if yaml_config_path is not None: + config_kwargs = remap_hifigan_yaml_config(yaml_config_path) + config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) + else: + config = FastSpeech2ConformerHifiGanConfig() + + model = FastSpeech2ConformerHifiGan(config) + + orig_checkpoint = torch.load(checkpoint_path, weights_only=True) + load_weights(orig_checkpoint, model, config) + + model.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + model.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert") + parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + convert_hifigan_checkpoint( + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.yaml_config_path, + args.push_to_hub, + ) diff --git a/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..6f840438dcaea99ab9ac51e8373bb1b5f753169c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py @@ -0,0 +1,102 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert FastSpeech2Conformer checkpoint.""" + +import argparse + +import torch + +from transformers import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGan, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerModel, + FastSpeech2ConformerWithHifiGan, + FastSpeech2ConformerWithHifiGanConfig, + logging, +) + +from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import ( + convert_espnet_state_dict_to_hf, + remap_model_yaml_config, +) +from .convert_hifigan import load_weights, remap_hifigan_yaml_config + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + + +def convert_FastSpeech2ConformerWithHifiGan_checkpoint( + checkpoint_path, + yaml_config_path, + pytorch_dump_folder_path, + repo_id=None, +): + # Prepare the model + model_params, *_ = remap_model_yaml_config(yaml_config_path) + model_config = FastSpeech2ConformerConfig(**model_params) + + model = FastSpeech2ConformerModel(model_config) + + espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) + hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) + model.load_state_dict(hf_compatible_state_dict) + + # Prepare the vocoder + config_kwargs = remap_hifigan_yaml_config(yaml_config_path) + vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) + + vocoder = FastSpeech2ConformerHifiGan(vocoder_config) + load_weights(espnet_checkpoint, vocoder, vocoder_config) + + # Prepare the model + vocoder + config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config) + with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) + with_hifigan_model.model = model + with_hifigan_model.vocoder = vocoder + + with_hifigan_model.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + with_hifigan_model.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument( + "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" + ) + parser.add_argument( + "--pytorch_dump_folder_path", + required=True, + default=None, + type=str, + help="Path to the output `FastSpeech2ConformerModel` PyTorch model.", + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + + convert_FastSpeech2ConformerWithHifiGan_checkpoint( + args.checkpoint_path, + args.yaml_config_path, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..590786b195d725a0063c272ce0b143227cc17873 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -0,0 +1,1697 @@ +# coding=utf-8 +# Copyright 2023 The Espnet authors, IMS Toucan authors, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch FastSpeech2Conformer model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +from torch import nn + +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, add_start_docstrings, logging, replace_return_docstrings +from .configuration_fastspeech2_conformer import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerWithHifiGanConfig, +) + + +logger = logging.get_logger(__name__) + + +@dataclass +class FastSpeech2ConformerModelOutput(ModelOutput): + """ + Output type of [`FastSpeech2ConformerModel`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Spectrogram generation loss. + spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`): + The predicted spectrogram. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*): + Outputs of the duration predictor. + pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the pitch predictor. + energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the energy predictor. + + """ + + loss: Optional[torch.FloatTensor] = None + spectrogram: Optional[torch.FloatTensor] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + duration_outputs: Optional[torch.LongTensor] = None + pitch_outputs: Optional[torch.FloatTensor] = None + energy_outputs: Optional[torch.FloatTensor] = None + + +@dataclass +class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput): + """ + Output type of [`FastSpeech2ConformerWithHifiGan`]. + + Args: + waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`): + Speech output as a result of passing the predicted mel spectrogram through the vocoder. + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Spectrogram generation loss. + spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`): + The predicted spectrogram. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*): + Outputs of the duration predictor. + pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the pitch predictor. + energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the energy predictor. + """ + + waveform: Optional[torch.FloatTensor] = None + + +_CONFIG_FOR_DOC = "FastSpeech2ConformerConfig" + +FASTSPEECH2_CONFORMER_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`FastSpeech2ConformerConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +HIFIGAN_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`FastSpeech2ConformerConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`FastSpeech2ConformerWithHifiGanConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0): + """ + Length regulator for feed-forward Transformer. + + This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech` + https://arxiv.org/pdf/1905.09263.pdf. The length regulator expands char or phoneme-level embedding features to + frame-level by repeating each feature based on the corresponding predicted durations. + + Args: + encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`): + Batch of sequences of char or phoneme embeddings. + duration_labels (`torch.LongTensor` of shape `(batch_size, time)`): + Batch of durations of each frame. + speaking_speed (`float`, *optional*, defaults to 1.0): + Value to control speed of speech. + + Returns: + `torch.Tensor`: + Replicated input tensor based on durations (batch_size, time*, embedding_dim). + """ + + if speaking_speed <= 0: + raise ValueError("`speaking_speed` must be greater than 0.") + elif speaking_speed != 1.0: + duration_labels = torch.round(duration_labels.float() * speaking_speed).long() + + if duration_labels.sum() == 0: + duration_labels[duration_labels.sum(dim=1).eq(0)] = 1 + + # Calculate the maximum length needed + max_len = torch.sum(duration_labels, dim=1).max() + + # Create a padded tensor to hold the results + hidden_states = torch.zeros( + (encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)), + dtype=torch.float, + device=encoded_embeddings.device, + ) + + # Loop through the batch and fill in the data + for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)): + repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0) + hidden_states[i, : repeated.size(0)] = repeated + + return hidden_states + + +class FastSpeech2ConformerDurationPredictor(nn.Module): + """ + Duration predictor module. + + This is a module of duration predictor described in the paper 'FastSpeech: Fast, Robust and Controllable Text to + Speech' https://arxiv.org/pdf/1905.09263.pdf The duration predictor predicts a duration of each frame in log domain + from the hidden embeddings of encoder. + + Note: + The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`, the + outputs are calculated in log domain but in `inference`, those are calculated in linear domain. + + """ + + def __init__(self, config: FastSpeech2ConformerConfig): + super().__init__() + + self.conv_layers = nn.ModuleList() + self.log_domain_offset = 1.0 + + for layer_idx in range(config.duration_predictor_layers): + num_chans = config.duration_predictor_channels + input_channels = config.hidden_size if layer_idx == 0 else num_chans + layer = FastSpeech2ConformerPredictorLayer( + input_channels, + num_chans, + config.duration_predictor_kernel_size, + config.duration_predictor_dropout_rate, + ) + self.conv_layers.append(layer) + self.linear = nn.Linear(config.duration_predictor_channels, 1) + + def forward(self, encoder_hidden_states): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`): + Batch of input sequences. + padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*): + Batch of masks indicating padded part. + + Returns: + `torch.Tensor`: Batch of predicted durations in log domain `(batch_size, max_text_length)`. + + """ + # (batch_size, input_dim, max_text_length) + hidden_states = encoder_hidden_states.transpose(1, -1) + for layer in self.conv_layers: + hidden_states = layer(hidden_states) + + # NOTE: calculate in log domain, (batch_size, max_text_length) + hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1) + + if not self.training: + # NOTE: calculate in linear domain + hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long() + + return hidden_states + + +# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5BatchNormConvLayer +class FastSpeech2ConformerBatchNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + + if layer_id == 0: + in_conv_dim = config.num_mel_bins + else: + in_conv_dim = config.speech_decoder_postnet_units + + if layer_id == config.speech_decoder_postnet_layers - 1: + out_conv_dim = config.num_mel_bins + else: + out_conv_dim = config.speech_decoder_postnet_units + + self.conv = nn.Conv1d( + in_conv_dim, + out_conv_dim, + kernel_size=config.speech_decoder_postnet_kernel, + stride=1, + padding=(config.speech_decoder_postnet_kernel - 1) // 2, + bias=False, + ) + self.batch_norm = nn.BatchNorm1d(out_conv_dim) + + if layer_id < config.speech_decoder_postnet_layers - 1: + self.activation = nn.Tanh() + else: + self.activation = None + + self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + if self.activation is not None: + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor) + self.layers = nn.ModuleList( + [FastSpeech2ConformerBatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)] + ) + + def forward(self, hidden_states: torch.Tensor): + outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins) + layer_output = outputs_before_postnet.transpose(1, 2) + for layer in self.layers: + layer_output = layer(layer_output) + outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2) + return outputs_before_postnet, outputs_after_postnet + + +class FastSpeech2ConformerPredictorLayer(nn.Module): + def __init__(self, input_channels, num_chans, kernel_size, dropout_rate): + super().__init__() + self.conv = nn.Conv1d( + input_channels, + num_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + ) + self.activation = nn.ReLU() + self.layer_norm = nn.LayerNorm(num_chans) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + + # Perform layer norm on dimension 1 + hidden_states = hidden_states.transpose(1, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(1, -1) + + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class FastSpeech2ConformerVariancePredictor(nn.Module): + def __init__( + self, + config: FastSpeech2ConformerConfig, + num_layers=2, + num_chans=384, + kernel_size=3, + dropout_rate=0.5, + ): + """ + Initilize variance predictor module. + + Args: + input_dim (`int`): Input dimension. + num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers. + num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers. + kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers. + dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate. + """ + super().__init__() + self.conv_layers = nn.ModuleList() + for idx in range(num_layers): + input_channels = config.hidden_size if idx == 0 else num_chans + layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate) + self.conv_layers.append(layer) + self.linear = nn.Linear(num_chans, 1) + + def forward(self, encoder_hidden_states, padding_masks=None): + """ + Calculate forward propagation. + + Args: + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`): + Batch of input sequences. + padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*): + Batch of masks indicating padded part. + + Returns: + Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`. + """ + # (batch_size, input_dim, max_text_length) + hidden_states = encoder_hidden_states.transpose(1, -1) + for layer in self.conv_layers: + hidden_states = layer(hidden_states) + + hidden_states = self.linear(hidden_states.transpose(1, 2)) + + if padding_masks is not None: + hidden_states = hidden_states.masked_fill(padding_masks, 0.0) + + return hidden_states + + +class FastSpeech2ConformerVarianceEmbedding(nn.Module): + def __init__( + self, + in_channels=1, + out_channels=384, + kernel_size=1, + padding=0, + dropout_rate=0.0, + ): + super().__init__() + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + padding=padding, + ) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.conv(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class FastSpeech2ConformerAttention(nn.Module): + """ + Multi-Head attention layer with relative position encoding. Details can be found in + https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860. + """ + + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + """Construct an FastSpeech2ConformerAttention object.""" + super().__init__() + # We assume d_v always equals dim_key + self.num_heads = module_config["num_attention_heads"] + self.hidden_size = config.hidden_size + self.dim_key = self.hidden_size // self.num_heads + self.head_dim = self.hidden_size // self.num_heads + self.linear_q = nn.Linear(self.hidden_size, self.hidden_size) + self.linear_k = nn.Linear(self.hidden_size, self.hidden_size) + self.linear_v = nn.Linear(self.hidden_size, self.hidden_size) + self.linear_out = nn.Linear(self.hidden_size, self.hidden_size) + self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"]) + + # linear transformation for positional encoding + self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim)) + + def shift_relative_position_tensor(self, pos_tensor): + """ + Args: + pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor. + """ + zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype) + pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1) + + pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2)) + # only keep the positions from 0 to time2 + pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1] + + return pos_tensor + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + pos_emb: Optional[torch.Tensor] = None, + output_attentions: Optional[torch.Tensor] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Compute 'Scaled Dot Product Attention' with rel. positional encoding. + + Args: + hidden_states (`torch.Tensor` of shape `(batch, time2, size)`): Values of the hidden states + attention_mask (`torch.Tensor` of shape `(batch, time1, time2)`): Mask tensor. + pos_emb (`torch.Tensor` of shape `(batch, 2*time1-1, size)`): Positional embedding tensor. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + Returns: + `torch.Tensor`: Output tensor of shape `(batch, time1, d_model)`. + """ + bsz, q_len, _ = hidden_states.size() + query_states = self.linear_q(hidden_states).view(bsz, -1, self.num_heads, self.head_dim) + key_states = self.linear_k(hidden_states).view(bsz, -1, self.num_heads, self.head_dim) + value_states = self.linear_v(hidden_states).view(bsz, -1, self.num_heads, self.head_dim) + + bsz_pos = pos_emb.size(0) + pos_encoding = self.linear_pos(pos_emb).view(bsz_pos, -1, self.num_heads, self.head_dim) + + # (batch_size, head, time1, dim_key) + query_with_bias_u = (query_states + self.pos_bias_u).transpose(1, 2) + # (batch_size, head, time1, dim_key) + query_with_bias_v = (query_states + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch_size, head, time1, time2) + matrix_ac = torch.matmul(query_with_bias_u, key_states.permute(0, 2, 3, 1)) + + # compute matrix b and matrix d + # (batch_size, head, time1, 2*time1-1) + matrix_bd = torch.matmul(query_with_bias_v, pos_encoding.permute(0, 2, 3, 1)) + matrix_bd = self.shift_relative_position_tensor(matrix_bd) + + # (batch_size, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.dim_key) + + # Forward attention + if attention_mask is not None: + expected_size = (bsz, 1, q_len) + if attention_mask.size() != expected_size: + raise ValueError(f"Attention mask should be of size {expected_size}, but is {attention_mask.size()}") + attention_mask = attention_mask.unsqueeze(1).eq(0) + min_value = float(torch.finfo(scores.dtype).min) + scores = scores.masked_fill(attention_mask, min_value) + attn_weights = torch.softmax(scores, dim=-1).masked_fill(attention_mask, 0.0) + else: + attn_weights = torch.softmax(scores, dim=-1) + + attn_weights = self.dropout(attn_weights) + attn_output = torch.matmul(attn_weights, value_states.transpose(1, 2)) + attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1) + + attn_output = self.linear_out(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +class FastSpeech2ConformerConvolutionModule(nn.Module): + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + super().__init__() + # kernel_size should be an odd number for 'SAME' padding + channels = config.hidden_size + kernel_size = module_config["kernel_size"] + self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True) + self.depthwise_conv = nn.Conv1d( + channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True + ) + self.norm = nn.BatchNorm1d(channels) + self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True) + + def forward(self, hidden_states): + """ + Compute convolution module. + + Args: + hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor. + + Returns: + `torch.Tensor`: Output tensor of shape `(batch, time, channels)`. + + """ + # exchange the temporal dimension and the feature dimension + hidden_states = hidden_states.transpose(1, 2) + + # GLU mechanism, (batch_size, 2*channel, dim) + hidden_states = self.pointwise_conv1(hidden_states) + # (batch_size, channel, dim) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + # 1D Depthwise Conv + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.norm(hidden_states) + + hidden_states = hidden_states * torch.sigmoid(hidden_states) + + hidden_states = self.pointwise_conv2(hidden_states) + + return hidden_states.transpose(1, 2) + + +class FastSpeech2ConformerEncoderLayer(nn.Module): + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + super().__init__() + + # self-attention module definition + self.self_attn = FastSpeech2ConformerAttention(config, module_config) + + # feed-forward module definition + self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config) + + self.macaron_style = config.use_macaron_style_in_conformer + if self.macaron_style: + self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config) + self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + + # convolution module definition + self.use_cnn_module = config.use_cnn_in_conformer + if self.use_cnn_module: + self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config) + self.conv_layer_norm = nn.LayerNorm(config.hidden_size) + self.final_layer_norm = nn.LayerNorm(config.hidden_size) + + self.ff_layer_norm = nn.LayerNorm(config.hidden_size) + + self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size) + + self.dropout = nn.Dropout(module_config["dropout_rate"]) + self.size = config.hidden_size + self.normalize_before = module_config["normalize_before"] + self.concat_after = module_config["concat_after"] + if self.concat_after: + self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + pos_emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[torch.Tensor] = False, + ): + """ + Compute encoded features. + + Args: + hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Input tensor. + pos_emb (`torch.Tensor` of shape `(1, time, size)`): Positional embeddings tensor. + attention_mask (`torch.Tensor` of shape `(batch, time)`): Attention mask tensor for the input. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + Returns: + `torch.Tensor`: Output tensor of shape `(batch, time, size)`. + + """ + # whether to use macaron style + if self.macaron_style: + residual = hidden_states + if self.normalize_before: + hidden_states = self.ff_macaron_layer_norm(hidden_states) + hidden_states = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(hidden_states)) + if not self.normalize_before: + hidden_states = self.ff_macaron_layer_norm(hidden_states) + + # multi-headed self-attention module + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + attention_output, attention_scores = self.self_attn( + hidden_states, attention_mask=attention_mask, pos_emb=pos_emb, output_attentions=output_attentions + ) + + if self.concat_after: + x_concat = torch.cat((hidden_states, attention_output), dim=-1) + hidden_states = self.concat_linear(x_concat) + hidden_states = residual + hidden_states + else: + hidden_states = self.dropout(attention_output) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # convolution module + if self.use_cnn_module: + residual = hidden_states + if self.normalize_before: + hidden_states = self.conv_layer_norm(hidden_states) + hidden_states = self.conv_module(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.conv_layer_norm(hidden_states) + + # feed forward module + residual = hidden_states + if self.normalize_before: + hidden_states = self.ff_layer_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + self.ff_scale * hidden_states + if not self.normalize_before: + hidden_states = self.ff_layer_norm(hidden_states) + + if self.conv_module is not None: + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_scores,) + + return outputs + + +class FastSpeech2ConformerMultiLayeredConv1d(nn.Module): + """ + Multi-layered conv1d for Transformer block. + + This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer + block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech' + https://arxiv.org/pdf/1905.09263.pdf + """ + + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + """ + Initialize FastSpeech2ConformerMultiLayeredConv1d module. + + Args: + input_channels (`int`): Number of input channels. + hidden_channels (`int`): Number of hidden channels. + kernel_size (`int`): Kernel size of conv1d. + dropout_rate (`float`): Dropout rate. + """ + super().__init__() + input_channels = config.hidden_size + hidden_channels = module_config["linear_units"] + kernel_size = config.positionwise_conv_kernel_size + self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.dropout = nn.Dropout(module_config["dropout_rate"]) + + def forward(self, hidden_states): + """ + Calculate forward propagation. + + Args: + hidden_states (torch.Tensor): Batch of input tensors (batch_size, time, input_channels). + + Returns: + torch.Tensor: Batch of output tensors (batch_size, time, hidden_channels). + """ + hidden_states = hidden_states.transpose(-1, 1) + hidden_states = self.conv1(hidden_states) + hidden_states = torch.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = hidden_states.transpose(-1, 1) + return hidden_states + + +class FastSpeech2ConformerRelPositionalEncoding(nn.Module): + """ + Args: + Relative positional encoding module (new implementation). Details can be found in + https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://arxiv.org/abs/1901.02860 + config (`FastSpeech2ConformerConfig`): + FastSpeech2ConformerConfig instance. + module_config (`dict`): + Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`. + """ + + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + """ + Construct an PositionalEncoding object. + """ + super().__init__() + self.embed_dim = config.hidden_size + self.input_scale = math.sqrt(self.embed_dim) + self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"]) + self.pos_enc = None + self.max_len = 5000 + self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pos_enc(self, x): + """Reset the positional encodings.""" + if self.pos_enc is not None: + # self.pos_enc contains both positive and negative parts + # the length of self.pos_enc is 2 * input_len - 1 + if self.pos_enc.size(1) >= x.size(1) * 2 - 1: + if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device: + self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vector and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i