koichi12 commited on
Commit
82ed4ab
·
verified ·
1 Parent(s): 0c36bb3

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete list.
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_refs/__pycache__/fft.cpython-311.pyc +0 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-311.pyc +0 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-311.pyc +0 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-311.pyc +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_matcher.py +460 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_passes.py +950 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_safeguard.py +42 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_tree_utils.py +64 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/__init__.cpython-311.pyc +0 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/common_types.cpython-311.pyc +0 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/cpp.cpython-311.pyc +0 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/init.cpython-311.pyc +0 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/parameter.cpython-311.pyc +0 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__init__.py +0 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/__init__.cpython-311.pyc +0 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/thnn.cpython-311.pyc +0 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/thnn.py +4 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__init__.py +35 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__pycache__/__init__.cpython-311.pyc +0 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/modules/fused.py +30 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-311.pyc +0 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-311.pyc +0 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/__init__.py +13 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-311.pyc +0 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py +5 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__init__.py +12 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-311.pyc +0 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-311.pyc +0 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__init__.py +68 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/__init__.cpython-311.pyc +0 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/_functions.cpython-311.pyc +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/container.cpython-311.pyc +0 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/conv.cpython-311.pyc +0 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/flatten.cpython-311.pyc +0 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/fold.cpython-311.pyc +0 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/lazy.cpython-311.pyc +0 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/activation.py +1624 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/batchnorm.py +849 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/channelshuffle.py +57 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/container.py +911 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/dropout.py +294 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/flatten.py +144 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/normalization.py +297 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/padding.py +801 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pixelshuffle.py +113 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pooling.py +1306 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/transformer.py +975 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/__init__.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_refs/__pycache__/fft.cpython-311.pyc ADDED
Binary file (29.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-311.pyc ADDED
Binary file (33 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-311.pyc ADDED
Binary file (1.41 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-311.pyc ADDED
Binary file (8.03 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-311.pyc ADDED
Binary file (10.4 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-311.pyc ADDED
Binary file (23.2 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_matcher.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import enum
3
+
4
+ import torch
5
+ toq = torch.ops.quantized
6
+
7
+ from torch.fx import GraphModule
8
+ from torch.fx.graph import Graph, Node
9
+
10
+ from torch.ao.quantization.utils import getattr_from_fqn
11
+ from .ns_types import NSSubgraph, NSNodeTargetType
12
+ from .mappings import (
13
+ get_base_name_to_sets_of_related_ops,
14
+ get_unmatchable_types_map,
15
+ )
16
+ from .pattern_utils import (
17
+ get_type_a_related_to_b,
18
+ get_reversed_fusions,
19
+ end_node_matches_reversed_fusion,
20
+ )
21
+ from torch.ao.quantization import (
22
+ ObserverBase,
23
+ FakeQuantizeBase,
24
+ )
25
+
26
+ from typing import Dict, Tuple, List, Optional, Set, Any
27
+
28
+ def _get_output_nodes(g: Graph) -> List[Node]:
29
+ return [n for n in g.nodes if n.op == 'output']
30
+
31
class _NSGraphMatchableSubgraphsIterator:
    """
    Iterates through the graph of gm, starting with the output nodes
    and continuing backwards.
    1. Returns matchable subgraphs, in order. A subgraph is defined by
       (start_node, end_node).
    2. Skips over non-matchable subgraphs.
    """
    def __init__(
        self,
        gm: GraphModule,
        non_matchable_functions: Set[NSNodeTargetType],
        non_matchable_modules: Set[NSNodeTargetType],
        non_matchable_methods: Set[NSNodeTargetType],
    ):
        self.gm: GraphModule = gm
        self.non_matchable_functions: Set[NSNodeTargetType] = non_matchable_functions
        self.non_matchable_modules: Set[NSNodeTargetType] = non_matchable_modules
        self.non_matchable_methods: Set[NSNodeTargetType] = non_matchable_methods
        # nodes already consumed by a previously emitted subgraph; prevents
        # re-visiting shared producers during the backwards traversal
        self.seen_nodes: Set[Node] = set()
        # DFS worklist; seeded with the graph outputs so that iteration
        # proceeds from the outputs back towards the inputs
        self.stack: List[Node] = []
        for start_node in _get_output_nodes(self.gm.graph):
            self.stack.append(start_node)

    def __iter__(self):
        return self

    def __next__(self) -> NSSubgraph:
        """
        Returns the next matchable subgraph.

        Raises StopIteration once the worklist is exhausted.
        """
        while len(self.stack) > 0:
            cur_end_node = self.stack.pop()
            if cur_end_node in self.seen_nodes:
                continue

            # for subgraphs which are single nodes, start_node == end_node
            # for subgraphs with more than one node, start node != end_node
            cur_start_node = cur_end_node
            # Subgraphs like linear-relu have the base node as the start node.
            # Subgraphs like dequantize-linear-relu-to(torch.float16) have the
            # base node as the second node.
            # The cur_base_op_node var will move to the actual node during
            # the fusion matching later in this code block.
            cur_base_op_node = cur_end_node

            # Check for potential fusions. For now, we are greedy
            # and always skip all non-base nodes of a fusion. For example,
            # if we match linear-relu backwards, we will always skip the
            # relu node and attempt to match the linear node. This can
            # be made configurable later if needed.
            for _reverse_fusion_ops, base_op_idx in get_reversed_fusions():
                is_match = end_node_matches_reversed_fusion(
                    cur_end_node, _reverse_fusion_ops, self.gm, self.seen_nodes)
                if is_match:
                    # navigate to the base node by walking first args backwards
                    for rev_fusion_idx in range(len(_reverse_fusion_ops) - 1):
                        self.seen_nodes.add(cur_start_node)
                        # for now, assume that there are no other nodes
                        # which need to be added to the stack
                        cur_start_node = cur_start_node.args[0]  # type: ignore[assignment]
                        # if the base op index matches the current node, set it
                        rev_base_op_idx = \
                            len(_reverse_fusion_ops) - 2 - base_op_idx
                        if rev_fusion_idx == rev_base_op_idx:
                            cur_base_op_node = cur_start_node
                    # greedy: stop at the first fusion pattern that matches
                    break

            self.seen_nodes.add(cur_start_node)
            # add args of previous nodes to stack
            for arg in cur_start_node.all_input_nodes:
                self._recursively_add_node_arg_to_stack(arg)

            # skip unmatchable nodes
            # note: this check is done on the start_node, i.e.
            # if we are matching linear-relu in reverse, this would do the matchable
            # check on the linear
            if not self._is_matchable(cur_base_op_node):
                continue

            # If an observer or a fake_quant was not matched as a part of
            # a pattern of multiple nodes, ignore it. One case where this is
            # relevant is an observer on a graph input, which was added because
            # it is necessary for the next node.
            if cur_end_node.op == 'call_module' and cur_start_node is cur_end_node:
                maybe_obs = getattr_from_fqn(self.gm, cur_end_node.target)  # type: ignore[arg-type]
                if isinstance(maybe_obs, (ObserverBase, FakeQuantizeBase)):
                    continue

            return NSSubgraph(
                start_node=cur_start_node, end_node=cur_end_node,
                base_op_node=cur_base_op_node)

        raise StopIteration

    def _recursively_add_node_arg_to_stack(self, arg: Any) -> None:
        """
        Adds all of the nodes in this arg to the stack, properly navigating
        through list, dicts and tuples.
        """
        if isinstance(arg, Node):
            self.stack.append(arg)
        elif isinstance(arg, torch.fx.immutable_collections.immutable_list) or type(arg) is tuple:
            for inner_arg in arg:
                self._recursively_add_node_arg_to_stack(inner_arg)
        elif isinstance(arg, torch.fx.immutable_collections.immutable_dict):
            for value in arg.values():
                self._recursively_add_node_arg_to_stack(value)

    def _is_matchable(self, node: Node) -> bool:
        # A node is matchable when its target is not listed in the
        # corresponding unmatchable set for its op kind; all other op kinds
        # (placeholder, get_attr, output) are never matchable.
        if node.op == 'call_function':
            return node.target not in self.non_matchable_functions
        elif node.op == 'call_module':
            assert isinstance(node.target, str)
            target_mod = getattr_from_fqn(self.gm, node.target)
            return not \
                any(isinstance(target_mod, t)  # type: ignore[arg-type]
                    for t in self.non_matchable_modules)
        elif node.op == 'call_method':
            return node.target not in self.non_matchable_methods
        else:
            return False
153
+
154
class GraphMatchingException(Exception):
    """Raised when the matchable subgraphs of two graphs cannot be paired up."""
159
+
160
class SubgraphTypeRelationship(enum.Enum):
    """How the base-op types of two candidate subgraphs relate."""

    # Identical type, and the type is known to Numeric Suite.
    # Example: F.linear and F.linear, or nn.Conv2d and nn.Conv2d.
    EQUAL = enum.auto()
    # Identical type, but unknown to Numeric Suite (user defined type, etc).
    # NOTE: member name keeps the historical misspelling for compatibility.
    EQUAL_BUT_UKNOWN = enum.auto()
    # Both types are known and live in the same relationship set, but they
    # are not the same type. Example: F.linear and toq.linear.
    RELATED_BUT_NOT_EQUAL = enum.auto()
    # No known relationship between the two types.
    NOT_RELATED = enum.auto()
172
+
173
def _get_subgraph_relationship_type(
    subgraph_a: NSSubgraph,
    subgraph_b: NSSubgraph,
    gm_a: GraphModule,
    gm_b: GraphModule,
    type_a_related_to_b: Set[Tuple[NSNodeTargetType, NSNodeTargetType]],
) -> SubgraphTypeRelationship:
    """Classify how the base ops of two subgraphs relate to each other.

    The comparison is done on each subgraph's ``base_op_node`` only.
    ``type_a_related_to_b`` is a set of (type_a, type_b) pairs that are
    considered the same mathematical op across quantization flavors.
    """
    node_a = subgraph_a.base_op_node
    node_b = subgraph_b.base_op_node

    # TODO(next): make this code handle matching by what is before the base op
    if node_a.op != node_b.op:
        # Mixed op kinds are only tolerated when both are call_function /
        # call_method; any other mismatch cannot be related.
        if not (
            node_a.op in ('call_function', 'call_method') and
            node_b.op in ('call_function', 'call_method')
        ):
            return SubgraphTypeRelationship.NOT_RELATED

    if node_a.op in ('call_function', 'call_method'):
        key = (node_a.target, node_b.target)

        if key not in type_a_related_to_b:
            if node_a.target == node_b.target:
                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
            else:
                return SubgraphTypeRelationship.NOT_RELATED
        # after this point, we are dealing with known types

        if node_a.target == node_b.target:
            # NOTE(review): despite the "has_prev" naming, this compares
            # base_op_node == start_node; the result is symmetric either way:
            # only a mixed status between a and b downgrades to
            # RELATED_BUT_NOT_EQUAL, matching statuses yield EQUAL.
            node_a_has_prev = subgraph_a.base_op_node == subgraph_a.start_node
            node_b_has_prev = subgraph_b.base_op_node == subgraph_b.start_node
            if node_a_has_prev and (not node_b_has_prev):
                return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
            elif (not node_a_has_prev) and node_b_has_prev:
                return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
            elif (not node_a_has_prev) and (not node_b_has_prev):
                return SubgraphTypeRelationship.EQUAL
            else:
                # TODO(future PR): check for matches start_op_node and base_op_node
                return SubgraphTypeRelationship.EQUAL

        # NOTE(review): key is already known to be in type_a_related_to_b at
        # this point (the not-in case returned above), so this branch always
        # yields RELATED_BUT_NOT_EQUAL; the check is kept for clarity.
        if key in type_a_related_to_b:
            return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
        else:
            return SubgraphTypeRelationship.NOT_RELATED
    elif node_a.op == 'call_module':
        assert (subgraph_a.base_op_node == subgraph_a.start_node and
                subgraph_b.base_op_node == subgraph_b.start_node), \
            "Matching call_module patterns where base_op_node != start_node is not supported yet"
        # for call_module, we need to look up the modules to do the type check
        assert isinstance(node_a.target, str)
        mod_a = getattr_from_fqn(gm_a, node_a.target)
        assert isinstance(node_b.target, str)
        mod_b = getattr_from_fqn(gm_b, node_b.target)

        key = (type(mod_a), type(mod_b))

        if key not in type_a_related_to_b:
            if type(mod_a) == type(mod_b):
                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
            else:
                return SubgraphTypeRelationship.NOT_RELATED
        elif type(mod_a) == type(mod_b):
            return SubgraphTypeRelationship.EQUAL
        else:
            return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL

    # any remaining op kind (e.g. get_attr) is not matchable
    return SubgraphTypeRelationship.NOT_RELATED
241
+
242
def _get_name_for_subgraph(
    subgraph_a: NSSubgraph,
    gm_a: GraphModule,
    base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]],
    existing_names: Set[str],
) -> str:
    """
    Returns a unique name for a subgraph, of the form
    ``base_op_<related-set-name>_<counter>``.

    The name is derived from (1) the name of the relationship set containing
    the base op's underlying type, and (2) how many prior subgraphs already
    used that set name. Node names are deliberately not used, because the
    same scheme must produce identical names across two models which are
    iterated separately (fusions may collapse several nodes into one, and
    some Numeric Suite APIs run without both models in memory). As long as
    both graphs are walked in the same order, corresponding subgraphs in
    each model receive the same name without seeing each other.

    Note: ``existing_names`` is mutated — the chosen name is added to it.
    """
    target_type = _get_node_target_type(subgraph_a.base_op_node, gm_a)
    # Find the relationship-set name for this type; when several sets
    # contain it, the last one iterated wins. Falls back to the string
    # 'None' when the type belongs to no known set.
    target_base_type = None
    for base_name, related_ops in base_name_to_sets_of_related_ops.items():
        if target_type in related_ops:
            target_base_type = base_name
    prefix = 'base_op_' + str(target_base_type)
    # Pick the smallest counter that yields an unused name.
    counter = 0
    while prefix + '_' + str(counter) in existing_names:
        counter += 1
    proposed_name = prefix + '_' + str(counter)
    existing_names.add(proposed_name)
    return proposed_name
293
+
294
def _get_node_target_type(node: Node, gm: GraphModule) -> Optional[NSNodeTargetType]:
    """Return the underlying type of *node*'s op: the target itself for
    function/method calls, the module's Python type for module calls, and
    None for any other op kind."""
    if node.op in ('call_function', 'call_method'):
        return node.target
    if node.op == 'call_module':
        assert isinstance(node.target, str)
        # resolve the module so we can report its class
        return type(getattr_from_fqn(gm, node.target))
    return None
302
+
303
def get_matching_subgraph_pairs(
    gm_a: GraphModule,
    gm_b: GraphModule,
    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
) -> Dict[str, Tuple[NSSubgraph, NSSubgraph]]:
    """
    Matches matchable subgraphs of graph_a to graph_b.

    For a node, "matchable" is defined as a node which is not an observer,
    fake_quants, quant or dequant.

    A subgraph can contain one or more nodes.  A subgraph is matchable if
    at least one node inside of it is matchable.  Currently, all nodes in
    a subgraph must be matchable (because we assume no observers will be
    inserted in the middle of a fusion).

    A subgraph is defined by (start_node, end_node).  We assume that only
    start_node and end_node are linked with the surrounding graph, all other
    nodes in a subgraph are self-contained.

    A pair of nodes is "related" if both nodes represent the same mathematical
    operation across different quantization flavors. For example,
    `F.linear` and `torch.ops.quantized.linear` are related, and
    `F.linear` and `torch.nn.Conv` are not related.

    For each matchable pair of nodes node_a and node_b, they will match
    if node_a and node_b are related.

    For graphs A and B, they will match iff:
    1. the number of matchable subgraphs in A and B is equivalent
    2. when iterating through the matchable subgraphs of A and B in the same order, each
       corresponding pair of base nodes is related.

    This enables us to find the corresponding subgraphs between
    graphs of related models.  For example, if we had two graphs such as:

    graph_a: x0 -> conv_0 (type: nn.Conv2d) -> obs_0 -> x1
             w -/
             b -/

    graph_b: x0 -> quant_0 -> qconv_0 (type: nnq.Conv2d) -> dequant_0 -> x1
           packed_params_0 -/

    This function will return the following result:
    {
        'conv_0': (  # the name of the node in graph_b
          (conv_0, conv_0),  # (start_node_a, end_node_a)
          (qconv_0, qconv_0),  # (start_node_b, end_node_b)
        ),
    }

    Or, if we have a fusion pattern,

    graph_a: x0 -> linear_0 -> relu_0 -> obs_0 -> x1
             w -/
             b -/

    graph_b: x0 -> quant_0 -> linear_relu_0 -> dequant_0 -> x1
           packed_params_0 -/

    This function will return the following result:
    {
        'linear_relu_0': (  # the name of the node in graph_b
          (linear_0, relu_0),  # (start_node_a, end_node_a)
          (linear_relu_0, linear_relu_0),  # (start_node_b, end_node_b)
        ),
    }
    """
    if unmatchable_types_map is None:
        unmatchable_types_map = get_unmatchable_types_map()
    non_matchable_functions = unmatchable_types_map['funs_unmatchable']
    non_matchable_modules = unmatchable_types_map['mods_unmatchable']
    non_matchable_methods = unmatchable_types_map['meths_unmatchable']

    # Both graphs are walked from outputs to inputs, in lockstep.
    graph_a_iterator = _NSGraphMatchableSubgraphsIterator(
        gm_a, non_matchable_functions, non_matchable_modules,
        non_matchable_methods)
    graph_b_iterator = _NSGraphMatchableSubgraphsIterator(
        gm_b, non_matchable_functions, non_matchable_modules,
        non_matchable_methods)
    results = collections.OrderedDict()
    if base_name_to_sets_of_related_ops is None:
        base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops()
    type_a_related_to_b = \
        get_type_a_related_to_b(base_name_to_sets_of_related_ops)

    # per-model name books; _get_name_for_subgraph mutates these so that
    # the same counter sequence is produced independently for a and b
    existing_names_a: Set[str] = set()
    existing_names_b: Set[str] = set()

    while True:
        # fetch the next subgraphs from a and b
        cur_subgraph_a, cur_subgraph_b = None, None
        try:
            cur_subgraph_a = next(graph_a_iterator)
        except StopIteration:
            pass
        try:
            cur_subgraph_b = next(graph_b_iterator)
        except StopIteration:
            pass

        # look up types of a and b for useful error messages
        type_start_a, type_start_b = None, None
        if cur_subgraph_a is not None:
            type_start_a = _get_node_target_type(cur_subgraph_a.start_node, gm_a)
        if cur_subgraph_b is not None:
            type_start_b = _get_node_target_type(cur_subgraph_b.start_node, gm_b)

        # check for results and determine what to do next
        if cur_subgraph_a is not None and cur_subgraph_b is not None:
            # both nodes were fetched, check for subgraph_relationship
            # note: subgraph_relationship is checked on the start node, i.e.
            # if a linear-relu pattern is checked, we would check for subgraph_relationship
            # of the linear
            subgraph_relationship = _get_subgraph_relationship_type(
                cur_subgraph_a, cur_subgraph_b,
                gm_a, gm_b, type_a_related_to_b)
            if subgraph_relationship == SubgraphTypeRelationship.NOT_RELATED:
                msg = f"""
The subgraphs
({cur_subgraph_a}, {type_start_a}) and
({cur_subgraph_b}, {type_start_b})
are not related. Please ensure that the two models you pass in have the same number
of subgraphs, and each pair of subgraphs is related to each other."""
                raise GraphMatchingException(msg)
            elif subgraph_relationship == SubgraphTypeRelationship.EQUAL_BUT_UKNOWN:
                # skip matching but unknown types
                continue
            key_name_a = _get_name_for_subgraph(
                cur_subgraph_a, gm_a, base_name_to_sets_of_related_ops,
                existing_names_a)
            key_name_b = _get_name_for_subgraph(
                cur_subgraph_b, gm_b, base_name_to_sets_of_related_ops,
                existing_names_b)
            assert key_name_a == key_name_b, \
                f"Subgraph names {key_name_a} and {key_name_b} do not match"
            results[key_name_a] = (cur_subgraph_a, cur_subgraph_b)
            continue
        elif cur_subgraph_a is None and cur_subgraph_b is None:
            # we reached the end of both graphs
            break
        else:
            # only one node was fetched, no match possible, throw error
            msg = f"""
Attempting to match
({cur_subgraph_a}, {type_start_a}) and
({cur_subgraph_b}, {type_start_b}),
one of which is empty. Please ensure that the two models you pass in have the same number
of subgraphs."""
            raise GraphMatchingException(msg)

    # The subgraph pairs are originally created by traversing the two graphs
    # from the outputs to the inputs. Reverse the results to return the
    # subgraphs in their order of execution.
    results = collections.OrderedDict(reversed(list(results.items())))

    return results
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_passes.py ADDED
@@ -0,0 +1,950 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.fx import GraphModule, map_arg
3
+ from torch.fx.graph import Graph, Node
4
+ from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
5
+
6
+ from .utils import (
7
+ get_node_first_input_and_output_type,
8
+ getattr_from_fqn,
9
+ NodeInputOrOutputType,
10
+ return_first_non_observer_node,
11
+ get_number_of_non_param_args,
12
+ get_target_type_str,
13
+ get_arg_indices_of_inputs_to_log,
14
+ get_node_input_qparams,
15
+ op_type_supports_shadowing,
16
+ get_normalized_nth_input,
17
+ )
18
+
19
+ from .ns_types import (
20
+ NSSingleResultValuesType,
21
+ NSSubgraph,
22
+ NSNodeTargetType,
23
+ )
24
+ from torch.ao.ns.fx.mappings import (
25
+ get_node_type_to_io_type_map,
26
+ )
27
+ from torch.ao.quantization.observer import _is_activation_post_process
28
+
29
+ from typing import Dict, Tuple, Callable, List, Any, Union, Optional, Set
30
+
31
def _maybe_get_fqn(node: Node, gm: GraphModule) -> Optional[str]:
    """Look up the fully qualified name recorded for *node* during tracing.

    Returns None when *gm* carries no ``_node_name_to_scope`` mapping.
    Observers / fake-quants have no fqn of their own (they are inserted
    after tracing), so for those modules the fqn of the observed node is
    reported instead.
    """
    if not hasattr(gm, '_node_name_to_scope'):
        return None
    lookup_node = node
    if node.op == 'call_module':
        assert isinstance(node.target, str)
        target_mod = getattr_from_fqn(gm, node.target)
        if _is_activation_post_process(target_mod):
            # report the fqn of the node being observed, not the observer
            lookup_node = get_normalized_nth_input(node, gm, 0)
    return gm._node_name_to_scope[lookup_node.name][0]  # type: ignore[index, return-value]
45
+
46
def _insert_logger_after_node(
    node: Node,
    gm: GraphModule,
    logger_cls: Callable,
    logger_node_name_suffix: str,
    ref_node_name: str,
    model_name: str,
    ref_name: str,
    ref_node_target_type: str,
    results_type: str,
    index_within_arg: int,
    index_of_arg: int,
    fqn: Optional[str],
) -> Node:
    """Attach a new ``logger_cls`` instance directly after *node*.

    Transforms ``prev_node -> node -> next_node`` into
    ``prev_node -> node -> logger -> next_node`` and returns the freshly
    created logger node. The logger object is registered on *gm* under a
    unique attribute name derived from ``node.name`` and the suffix.
    """
    # derive a unique attribute name for the logger on gm
    attr_name = \
        get_new_attr_name_with_prefix(node.name + logger_node_name_suffix)(gm)
    target_type = get_target_type_str(node, gm)
    # instantiate the logger with full bookkeeping metadata
    logger_obj = logger_cls(
        ref_node_name, node.name, model_name, ref_name, target_type,
        ref_node_target_type,
        results_type, index_within_arg, index_of_arg, fqn)
    # register it on the owning module, then splice it into the graph
    setattr(gm, attr_name, logger_obj)
    return node.graph.create_node('call_module', attr_name, (node,), {})
84
+
85
def add_loggers_to_model(
    gm: GraphModule,
    node_to_instrument_inputs_to_ref_node_name: Dict[Node, Tuple[str, str]],
    node_to_instrument_outputs_to_ref_node_name: Dict[Node, Tuple[str, str]],
    logger_cls: Callable,
    model_name: str,
) -> GraphModule:
    """
    Takes the graph of gm, adds loggers to the output
    of each node in nodes_to_instrument. Returns a GraphModule with the new
    graph.

    Args:
        gm: the GraphModule to instrument (a new graph is built from its nodes).
        node_to_instrument_inputs_to_ref_node_name: maps nodes whose *inputs*
            should be logged to (ref_name, ref_node_type).
        node_to_instrument_outputs_to_ref_node_name: maps nodes whose *output*
            should be logged to (ref_name, ref_node_type).
        logger_cls: callable used to construct each logger module.
        model_name: name recorded on every logger instance.
    """
    # fix: removed unused local `modules = dict(gm.named_modules())`

    new_graph = Graph()
    # env maps original node name -> corresponding node in new_graph
    env: Dict[str, Any] = {}

    def load_arg(a):
        return map_arg(a, lambda node: env[node.name])

    for node in gm.graph.nodes:
        if node.op == 'output':
            new_graph.output(map_arg(get_normalized_nth_input(node, gm, 0), load_arg))
            continue

        if (
            (node in node_to_instrument_inputs_to_ref_node_name) or
            (node in node_to_instrument_outputs_to_ref_node_name)
        ):
            fqn = _maybe_get_fqn(node, gm)

            if node in node_to_instrument_inputs_to_ref_node_name:
                ref_name, ref_node_type = node_to_instrument_inputs_to_ref_node_name[node]
                # Ops such add and mul are special because either
                # one or two of the first two arguments can be tensors,
                # and if one argument is a tensor it can be first or
                # second (x + 1 versus 1 + x).
                arg_indices_to_log = get_arg_indices_of_inputs_to_log(node)
                for node_arg_idx in arg_indices_to_log:
                    node_arg = get_normalized_nth_input(node, gm, node_arg_idx)
                    if type(node_arg) == Node:
                        # create a single input logger
                        prev_node = env[node_arg.name]
                        env[node_arg.name] = _insert_logger_after_node(
                            prev_node, gm, logger_cls, '_ns_logger_', node.name,
                            model_name, ref_name, ref_node_type,
                            NSSingleResultValuesType.NODE_INPUT.value,
                            index_within_arg=0, index_of_arg=node_arg_idx,
                            fqn=fqn)
                    elif type(node_arg) == torch.fx.immutable_collections.immutable_list:
                        # create N input loggers, one for each node
                        for arg_idx, arg in enumerate(node_arg):  # type: ignore[var-annotated, arg-type]
                            prev_node = env[arg.name]
                            env[prev_node.name] = _insert_logger_after_node(
                                prev_node, gm, logger_cls, '_ns_logger_', node.name,
                                model_name, ref_name, ref_node_type,
                                NSSingleResultValuesType.NODE_INPUT.value,
                                index_within_arg=arg_idx, index_of_arg=node_arg_idx,
                                fqn=fqn)
                    else:
                        # non-node args (constants, etc.) are not logged
                        pass

            # ensure env is populated with base node
            # Note: runs for both inputs and outputs
            env[node.name] = new_graph.node_copy(node, load_arg)

            if node in node_to_instrument_outputs_to_ref_node_name:
                ref_name, ref_node_type = node_to_instrument_outputs_to_ref_node_name[node]
                # add the logger after the base node
                env[node.name] = _insert_logger_after_node(
                    env[node.name], gm, logger_cls, '_ns_logger_', node.name,
                    model_name, ref_name, ref_node_type,
                    NSSingleResultValuesType.NODE_OUTPUT.value,
                    index_within_arg=0, index_of_arg=0, fqn=fqn)

        else:
            env[node.name] = new_graph.node_copy(node, load_arg)

    new_gm = GraphModule(gm, new_graph)
    return new_gm
165
+
166
def _insert_quantize_per_tensor_node(
    prev_node_c: Node,
    node_a: Node,
    gm_b: GraphModule,
    graph_c: Graph,
    scale: Union[torch.Tensor, float],
    zero_point: Union[torch.Tensor, int],
    dtype_cast_name: str,
) -> Node:
    """
    Copy `scale` and `zero_point` onto `gm_b` as attributes and emit a
    `torch.quantize_per_tensor` call in `graph_c` that quantizes
    `prev_node_c` with those qparams. Returns the new call node.
    """

    def _attach_qparam(value, suffix):
        # store the qparam on gm_b under a fresh unique name and surface
        # it inside graph_c as a get_attr node
        attr_name = get_new_attr_name_with_prefix(node_a.name + suffix)(gm_b)
        setattr(gm_b, attr_name, value)
        return graph_c.create_node('get_attr', attr_name, (), {}, attr_name)

    # copy scale, then zero_point (same order as attribute creation)
    scale_node = _attach_qparam(scale, '_input_scale_')
    zero_point_node = _attach_qparam(zero_point, '_input_zero_point_')
    # create the quantize_per_tensor call
    return graph_c.create_node(
        'call_function', torch.quantize_per_tensor,
        (prev_node_c, scale_node, zero_point_node, torch.quint8), {},
        dtype_cast_name)
194
+
195
def _insert_dtype_cast_after_node(
    node_a: Node,
    node_c: Node,
    prev_node_c: Union[Node, List[Node]],
    gm_a: GraphModule,
    gm_b: GraphModule,
    graph_c: Graph,
    node_name_prefix: str,
    logger_cls: Callable,
    node_type_to_io_type_map: Dict[str, Set[NSNodeTargetType]],
) -> Union[Node, List[Node]]:
    """
    Given a starting graph C (derived from graph B) of

    ... -> prev_node_c -> node_c -> ...

    And a corresponding related node_a, inserts the correct dtype
    cast node after prev_node_c to cast into the dtype expected
    by node_a, resulting in:

            dtype_cast
          /
    ... -> prev_node_c -> node_c -> ...

    For example, if node_c is an int8 op and node_a is an fp32 op, this function
    will insert a dequant.

    Returns a single cast node when prev_node_c is a Node, or a list of cast
    nodes (one per element) when prev_node_c is a list.
    """
    # Exactly one of op / mod class / method is selected below, depending on
    # the (input dtype of a, input dtype of c) pair.
    dtype_cast_op = None
    dtype_cast_mod_cls = None
    dtype_cast_method = None
    dtype_cast_method_dtype = None
    dtype_cast_scale = None
    dtype_cast_zero_point = None
    node_input_type_a, _node_output_type_a = \
        get_node_first_input_and_output_type(
            node_a, gm_a, logger_cls, node_type_to_io_type_map)
    node_input_type_c, _node_output_type_c = \
        get_node_first_input_and_output_type(
            node_c, gm_b, logger_cls, node_type_to_io_type_map)

    if (
        (node_input_type_a == NodeInputOrOutputType.FP32 and
         node_input_type_c == NodeInputOrOutputType.INT8) or
        (node_input_type_a == NodeInputOrOutputType.FP32 and
         node_input_type_c == NodeInputOrOutputType.FP16) or
        # TODO(future PR): determine the actual dtype of node_c,
        # the current code only works because dequantize works with
        # multiple input dtypes.
        (node_input_type_a == NodeInputOrOutputType.FP32 and
         node_input_type_c == NodeInputOrOutputType.FP32_OR_INT8)
    ):
        # fp32 shadows a quantized/fp16 op: dequantize before feeding a's copy
        dtype_cast_op = torch.dequantize
    elif (
        node_input_type_a == node_input_type_c and
        node_input_type_a != NodeInputOrOutputType.UNKNOWN
    ):
        # same known dtype on both sides: no-op cast
        dtype_cast_mod_cls = torch.nn.Identity
    elif (
        node_input_type_a == NodeInputOrOutputType.INT8 and
        node_input_type_c == NodeInputOrOutputType.FP32
    ):
        # int8 shadows fp32, the dtype cast needs to quantize to int8
        # with the right qparams.
        node_a_input_qparams = get_node_input_qparams(
            node_a, gm_a, node_type_to_io_type_map)
        if node_a_input_qparams is not None:
            dtype_cast_op = torch.quantize_per_tensor  # type: ignore[assignment]
            dtype_cast_scale, dtype_cast_zero_point = node_a_input_qparams
        # NOTE(review): if qparams are None, all selectors stay None and the
        # Identity assert below will fire; callers are expected to pre-check
        # qparams availability — confirm against create_a_shadows_b.
    elif (
        node_input_type_a == NodeInputOrOutputType.FP16 and
        node_input_type_c == NodeInputOrOutputType.FP32
    ):
        dtype_cast_method = 'to'
        dtype_cast_method_dtype = torch.float16
    else:
        raise AssertionError(
            f"dtype cast from {node_input_type_c} {node_c.format_node()} to " +
            f"{node_input_type_a} {node_a.format_node()} needs to be implemented")

    if isinstance(prev_node_c, Node):
        new_dtype_cast_name = \
            get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
        if dtype_cast_op:
            if dtype_cast_scale is not None and dtype_cast_zero_point is not None:
                # quantize_per_tensor path: qparams become get_attr nodes
                return _insert_quantize_per_tensor_node(
                    prev_node_c, node_a, gm_b, graph_c, dtype_cast_scale,
                    dtype_cast_zero_point, new_dtype_cast_name)
            else:
                return graph_c.create_node(
                    'call_function', dtype_cast_op, (prev_node_c,), {},
                    new_dtype_cast_name)
        elif dtype_cast_method:
            return graph_c.create_node(
                'call_method', dtype_cast_method,
                (prev_node_c, dtype_cast_method_dtype), {}, new_dtype_cast_name)
        else:
            assert dtype_cast_mod_cls
            dtype_cast_mod = dtype_cast_mod_cls()
            setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
            return graph_c.create_node(
                'call_module', new_dtype_cast_name, (prev_node_c,), {},
                new_dtype_cast_name)
    elif isinstance(prev_node_c, list):
        # multi-input case: emit one cast node per element
        results = []
        for prev_node_c_inner in prev_node_c:
            new_dtype_cast_name = \
                get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
            if dtype_cast_op:
                # TODO(future PR): add handling for quantize_per_tensor
                new_dtype_cast_node = graph_c.create_node(
                    'call_function', dtype_cast_op, (prev_node_c_inner,), {},
                    new_dtype_cast_name)
                results.append(new_dtype_cast_node)
            else:
                assert dtype_cast_mod_cls
                dtype_cast_mod = dtype_cast_mod_cls()
                setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
                new_dtype_cast_node = graph_c.create_node(
                    'call_module', new_dtype_cast_name, (prev_node_c_inner,), {},
                    new_dtype_cast_name)
                results.append(new_dtype_cast_node)
        return results
    else:
        # NOTE(review): message contains a stray 'f' ("type f<...>"); fixing
        # it would change runtime output, so it is left as-is here.
        raise AssertionError(f"type f{type(prev_node_c)} is not handled")
319
+
320
+ # TODO(future PR): look into using copy_node API instead
321
def _copy_node_from_a_to_c(
    node_a: Node,
    gm_a: GraphModule,
    gm_b: GraphModule,
    graph_c: Graph,
) -> Node:
    """
    Simple copy of node_a to graph_c.

    Handles only three node kinds:
    * get_attr: the attribute value is copied onto gm_b (tensors detached)
      under a fresh '<name>_shadow_copy_N' attribute
    * call_method 'dequantize': the single input is copied recursively
    * call_method 'to': the input is copied recursively; the second arg
      (the dtype) is passed through as-is

    Raises AssertionError for any other op/target.
    """
    if node_a.op == 'get_attr':
        node_a_copy_name = \
            get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b)
        node_a_obj = getattr_from_fqn(gm_a, node_a.target)  # type: ignore[arg-type]
        # detach so the shadow copy does not participate in autograd
        if torch.is_tensor(node_a_obj):
            node_a_obj = node_a_obj.detach()
        setattr(gm_b, node_a_copy_name, node_a_obj)
        node_a_copy = graph_c.create_node(
            node_a.op, node_a_copy_name, (), {}, node_a_copy_name)
        return node_a_copy
    elif node_a.op == 'call_method':
        assert node_a.target in ('dequantize', 'to'), \
            f"target {node_a.target} is not implemented"
        if node_a.target == 'dequantize':
            # recursively copy the dequantize input first
            arg_copy = _copy_node_from_a_to_c(
                get_normalized_nth_input(node_a, gm_a, 0),
                gm_a, gm_b, graph_c)  # type: ignore[arg-type]
            node_a_copy_name = \
                get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b)
            node_a_copy = graph_c.create_node(
                node_a.op, node_a.target, (arg_copy,), {}, node_a_copy_name)
            return node_a_copy
        else:  # to
            arg_copy = _copy_node_from_a_to_c(
                get_normalized_nth_input(node_a, gm_a, 0), gm_a, gm_b, graph_c)  # type: ignore[arg-type]
            node_a_copy_name = \
                get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b)
            # second normalized input is the target dtype of the `to` call
            node_a_copy = graph_c.create_node(
                node_a.op, node_a.target,
                (arg_copy, get_normalized_nth_input(node_a, gm_a, 1)),
                {}, node_a_copy_name)
            return node_a_copy

    else:
        raise AssertionError(
            f"handling of node {node_a.format_node()} with op {node_a.op} is not implemented")
366
+
367
def _can_insert_copy_of_subgraph_a(
    subgraph_a: NSSubgraph,
    gm_a: GraphModule,
    num_non_param_args_node_a: int,
) -> bool:
    """
    This function returns `False` if the input subgraph cannot be copied by
    `_insert_copy_of_subgraph_a_after_input_node_c`. This usually means
    that there is a corner case logic for which copy is not yet implemented.

    Args:
        subgraph_a: the candidate subgraph (start_node .. end_node chain).
        gm_a: parent GraphModule of subgraph_a.
        num_non_param_args_node_a: number of non-parameter inputs of the
            subgraph's first node (1 or 2); those inputs are stitched from
            the base graph and therefore skipped here.
    """
    # populate the list of nodes we need to check, walking first-input edges
    # from end_node back to start_node
    nodes = []
    cur_node = subgraph_a.end_node
    while cur_node != subgraph_a.start_node:
        nodes.append(cur_node)
        cur_node = get_normalized_nth_input(cur_node, gm_a, 0)  # type: ignore[assignment]
    nodes.append(cur_node)
    nodes.reverse()

    def _can_insert(node_a_arg, gm_a):
        # mirrors what `_copy_node_from_a_to_c` knows how to copy
        if isinstance(node_a_arg, Node):
            arg_a = return_first_non_observer_node(node_a_arg, gm_a)
            if arg_a.op == 'call_method':
                return arg_a.target in ('dequantize', 'to')
            elif arg_a.op == 'get_attr':
                return True
            else:
                return False
        elif isinstance(node_a_arg, (list, tuple)):
            for el in node_a_arg:
                if not isinstance(el, Node):
                    return False
            return True
        # fix: previously fell off the end returning None for scalar args;
        # make the falsy result an explicit bool (same truthiness, clearer
        # contract for a function annotated as returning bool)
        return False

    # For each node, check if we handle the copy behavior. This follows the
    # logic in `_insert_copy_of_subgraph_a_after_input_node_c`.
    for node_a in nodes:

        # only the first node of the subgraph has stitched inputs beyond idx 0
        local_num_non_param_args_node_a = num_non_param_args_node_a \
            if node_a is nodes[0] else 1

        norm_args_kwargs = node_a.normalized_arguments(
            gm_a, normalize_to_only_use_kwargs=True)
        if norm_args_kwargs is not None:
            norm_args, norm_kwargs = norm_args_kwargs
        else:
            norm_args, norm_kwargs = node_a.args, node_a.kwargs

        cur_idx = 0

        while cur_idx < len(norm_args):
            if cur_idx == 0:
                pass  # first input is stitched from the base graph
            elif cur_idx == 1 and local_num_non_param_args_node_a == 2:
                pass  # second non-param input is also stitched
            else:
                if not _can_insert(norm_args[cur_idx], gm_a):
                    return False
            cur_idx += 1

        for kwarg_val in norm_kwargs.values():
            # stitch the inputs from base graph
            if cur_idx == 0:
                pass
            elif cur_idx == 1 and local_num_non_param_args_node_a == 2:
                pass
            else:
                if not _can_insert(kwarg_val, gm_a):
                    return False
            cur_idx += 1

    return True
439
+
440
def _insert_copy_of_subgraph_a_after_input_node_c(
    input_node_c: Union[Node, List[Node]],
    input_node_c_2: Optional[Union[Node, List[Node]]],
    subgraph_a: NSSubgraph,
    gm_a: GraphModule,
    gm_b: GraphModule,
    node_name_prefix: str,
) -> Node:
    """
    Copy the nodes of `subgraph_a` (start to end) into the graph owning
    `input_node_c`, wiring the first copied node to `input_node_c` (and,
    when provided, `input_node_c_2`). Returns the copy of the subgraph's
    end node.
    """
    if isinstance(input_node_c, Node):
        graph_c = input_node_c.graph
    else:
        assert isinstance(input_node_c, list)
        graph_c = input_node_c[0].graph

    # Walk the first-input edges from end_node back to start_node so the
    # chain can be replayed into graph C in execution order.
    nodes_of_a = [subgraph_a.end_node]
    walker = subgraph_a.end_node
    while walker != subgraph_a.start_node:
        walker = get_normalized_nth_input(walker, gm_a, 0)  # type: ignore[assignment]
        nodes_of_a.insert(0, walker)

    # The first copy consumes the caller-provided inputs; each later copy
    # consumes the copy made just before it.
    prev_copy = _insert_copy_of_node_a_after_input_node_c(
        input_node_c,
        input_node_c_2,
        nodes_of_a[0],
        gm_a,
        gm_b,
        node_name_prefix)
    for node_a in nodes_of_a[1:]:
        prev_copy = _insert_copy_of_node_a_after_input_node_c(
            prev_copy,
            # TODO(future PR): enable multiple inputs for nodes which are not at start of subgraph
            None,
            node_a,
            gm_a,
            gm_b,
            node_name_prefix)
    # the last inserted node shadows the subgraph's end node
    return prev_copy
488
+
489
+
490
def _insert_copy_of_node_a_after_input_node_c(
    input_node_c: Union[Node, List[Node]],
    input_node_c_2: Optional[Union[Node, List[Node]]],
    node_a: Node,
    gm_a: GraphModule,
    gm_b: GraphModule,
    node_name_prefix: str,
) -> Node:
    """
    Assume that node_a from graph_a has
      args (input, (input2)?, arg1, ...), and
      kwargs {kw0: kwarg0, ...}

    Note: input2 is optional. If it equals to None, we assume that the op
    has a single non-param input. If it is specified, we assume that the op
    has two non-param inputs.

    Copies the underlying values of arg1..argn and kwarg0..kwargn into gm_b,
    and creates the corresponding nodes in graph_c. Note: observers are ignored,
    so if an arg is an observer we navigate up until we find a non-observer parent.

    If node_a is a call_module, points the module pointed to by node_a to gm_b.

    Creates the copy of node_a in graph_c, with input as the first arg,
    and all other args and kwargs pointing to the copies of the objects
    in gm_b created above.

    An example in pictures:

    graph A:
    ========

    input -------------> node_a
                         / / /
    (input_2)?----------/ / /
                         / /
    weight -> weight_obs  /
                         /
    bias ----------------

    graph C (derived from B):
    =========================

    input_node_c --> node_a_copy
                     / / /
    (input_node_c_2)? / /
                     / /
    weight_copy ----/ /
                     /
    bias_copy ------/
    """
    if isinstance(input_node_c, Node):
        graph_c = input_node_c.graph
    else:
        assert isinstance(input_node_c, list)
        graph_c = input_node_c[0].graph

    norm_args_kwargs = node_a.normalized_arguments(
        gm_a, normalize_to_only_use_kwargs=True)
    if norm_args_kwargs is not None:
        norm_args, norm_kwargs = norm_args_kwargs
    else:
        norm_args, norm_kwargs = node_a.args, node_a.kwargs

    new_args = []
    new_kwargs = {}

    def _copy_arg(arg):
        # copy the other inputs from the other graph
        if isinstance(arg, Node):
            arg = return_first_non_observer_node(arg, gm_a)
            arg = _copy_node_from_a_to_c(arg, gm_a, gm_b, graph_c)
            return arg
        elif isinstance(arg, (int, float, torch.dtype)):
            return arg
        # bugfix: the two branches below previously referenced `kwarg_val`,
        # a name only bound by the kwargs loop further down, instead of this
        # function's own `arg`. Passing a list/tuple positional arg raised
        # NameError (or inspected the wrong value).
        elif isinstance(arg, (list, tuple)):
            for el in arg:
                assert not isinstance(el, Node), \
                    "handling of Node inside list is not implemented"
            return arg
        else:
            raise AssertionError(
                f"handling for arg of type {type(arg)} is not implemented")

    cur_idx = 0

    # positional args: slot 0 (and optionally slot 1) come from graph C,
    # the rest are deep-copied from graph A
    while cur_idx < len(norm_args):
        if cur_idx == 0:
            new_arg = input_node_c
        elif cur_idx == 1 and input_node_c_2 is not None:
            new_arg = input_node_c_2
        else:
            new_arg = _copy_arg(norm_args[cur_idx])
        new_args.append(new_arg)
        cur_idx += 1

    for kwarg_name, kwarg_val in norm_kwargs.items():
        # stitch the inputs from base graph
        if cur_idx == 0:
            new_kwargs[kwarg_name] = input_node_c
        elif cur_idx == 1 and input_node_c_2 is not None:
            new_kwargs[kwarg_name] = input_node_c_2
        else:
            new_kwargs[kwarg_name] = _copy_arg(kwarg_val)
        cur_idx += 1

    new_args = tuple(new_args)  # type: ignore[assignment]

    node_a_shadows_c_name = \
        get_new_attr_name_with_prefix(node_name_prefix)(gm_b)

    if node_a.op == 'call_module':
        # if target is a module, we point to the module from gm_b
        new_mod_copy_name = \
            get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
        # fetch the corresponding module from gm_a
        assert isinstance(node_a.target, str)
        mod_a = getattr_from_fqn(gm_a, node_a.target)
        setattr(gm_b, new_mod_copy_name, mod_a)
        node_a_shadows_c = graph_c.create_node(
            node_a.op, new_mod_copy_name, new_args,
            new_kwargs, node_a_shadows_c_name)
        return node_a_shadows_c
    else:
        assert node_a.op in ('call_function', 'call_method')
        node_a_shadows_c = graph_c.create_node(
            node_a.op, node_a.target, new_args,
            new_kwargs, node_a_shadows_c_name)
        return node_a_shadows_c
619
+
620
def create_a_shadows_b(
    name_a: str,
    gm_a: GraphModule,
    name_b: str,
    gm_b: GraphModule,
    matched_subgraph_pairs: Dict[str, Tuple[NSSubgraph, NSSubgraph]],
    logger_cls: Callable,
    should_log_inputs: bool,
    node_type_to_io_type_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
) -> GraphModule:
    """
    Creates a new GraphModule consisting of the graph of C, with the meaningful
    nodes of A shadowing the corresponding nodes of B. For example,

    Graph A:
    a0 -> op0_fp32 -> a1 -> op1_fp32 -> a2

    Graph B:
    b0 -> op0_int8 -> b1 -> op1_int8 -> b2

    matched_node_pairs: {'op0': (op0_fp32, op0_int8), 'op1': (op1_fp32, op1_int8)}

    Graph C (A shadows B):

        / dequant0 -> op0_fp32 -> logger_a_0  / dequant_1 -> op1_fp32 -> logger_a_1
       /                                     /
    b0 -------------> op0_int8 -> logger_b_0 --------------> op1_int8 -> logger_b_1

    In a nutshell, this function does the following for each node pair:
    * copies the necessary attributes and modules from gm_a to gm_b,
      keeping names unique
    * adds a dtype cast op (dequant, quant, etc)
    * adds a copy of node_a in gm_b's graph
    * adds loggers to the outputs of node_a and node_b
    """

    if node_type_to_io_type_map is None:
        node_type_to_io_type_map = get_node_type_to_io_type_map()

    # graph_c is the graph created from copying the nodes of graph_b and inserting
    # the shadows with the nodes copied from graph_a
    graph_c = Graph()
    env_c: Dict[str, Any] = {}
    # NOTE(review): `modules` appears unused below — candidate for removal
    modules = dict(gm_b.named_modules())

    def load_arg(a):
        return map_arg(a, lambda node: env_c[node.name])

    # index the B-side subgraph boundaries so the main loop can detect them
    start_node_b_to_matched_subgraph_a_and_name = {}
    end_node_b_to_matched_subgraph_a_and_name = {}
    for match_name, match in matched_subgraph_pairs.items():
        subgraph_a, subgraph_b = match
        ref_node_type_a = get_target_type_str(subgraph_a.base_op_node, gm_a)
        ref_node_type_b = get_target_type_str(subgraph_b.base_op_node, gm_b)
        start_node_b_to_matched_subgraph_a_and_name[subgraph_b.start_node] = \
            (subgraph_a, match_name, ref_node_type_a, ref_node_type_b)
        end_node_b_to_matched_subgraph_a_and_name[subgraph_b.end_node] = \
            (subgraph_a, match_name, ref_node_type_a, ref_node_type_b)

    for node_b in gm_b.graph.nodes:
        if node_b.op == 'output':
            graph_c.output(map_arg(node_b.args[0], load_arg))
            continue

        # calculate the flags to determine what to do with this node
        node_b_is_start_node = node_b in start_node_b_to_matched_subgraph_a_and_name
        node_b_is_end_node = node_b in end_node_b_to_matched_subgraph_a_and_name

        if (node_b_is_start_node or node_b_is_end_node):

            if node_b_is_start_node:
                subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \
                    start_node_b_to_matched_subgraph_a_and_name[node_b]
            else:
                assert node_b_is_end_node
                subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \
                    end_node_b_to_matched_subgraph_a_and_name[node_b]

            all_op_types_support_shadowing = (
                op_type_supports_shadowing(subgraph_a.start_node) and
                op_type_supports_shadowing(node_b)
            )
            if not all_op_types_support_shadowing:
                # fall back to a plain copy of node_b, without shadow loggers
                print(
                    f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
                    f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
                    ', unsupported')
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                continue

            # For both start_node and end_node verify that we know how to do
            # the dtype cast. If we do not, skip.
            node_input_type_a, node_output_type_a = \
                get_node_first_input_and_output_type(
                    subgraph_a.start_node, gm_a, logger_cls,
                    node_type_to_io_type_map)
            node_input_type_b, node_output_type_b = \
                get_node_first_input_and_output_type(
                    node_b, gm_b, logger_cls,
                    node_type_to_io_type_map)
            node_io_types_known_a_and_b = (
                node_input_type_a != NodeInputOrOutputType.UNKNOWN and
                node_output_type_a != NodeInputOrOutputType.UNKNOWN and
                node_input_type_b != NodeInputOrOutputType.UNKNOWN and
                node_output_type_b != NodeInputOrOutputType.UNKNOWN
            )
            if not node_io_types_known_a_and_b:
                print(
                    f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
                    f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
                    ', unknown dtype cast')
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                continue

            # If we are shadowing from fp32 to int8, we need to insert
            # quantize_per_tensor call with qparams from the previous node.
            # Only do this if we are able to infer these qparams from the graph.
            if (
                node_input_type_a == NodeInputOrOutputType.INT8 and
                node_input_type_b == NodeInputOrOutputType.FP32
            ):
                node_a_input_qparams = get_node_input_qparams(
                    subgraph_a.start_node, gm_a, node_type_to_io_type_map)
                if not node_a_input_qparams:
                    print(
                        f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
                        f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
                        ', unknown input qparams')
                    env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                    continue

            num_non_param_args_node_a = \
                get_number_of_non_param_args(subgraph_a.start_node, gm_a)
            if not _can_insert_copy_of_subgraph_a(subgraph_a, gm_a, num_non_param_args_node_a):
                print(
                    f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
                    f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
                    ', unhandled logic in subgraph copy')
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                continue

            fqn_base_a = _maybe_get_fqn(subgraph_a.base_op_node, gm_a)
            fqn_base_b = _maybe_get_fqn(subgraph_b.base_op_node, gm_b)  # type: ignore[possibly-undefined]

            if node_b_is_start_node:

                # if necessary, log the input of node_c
                if should_log_inputs:
                    prev_node_b = get_normalized_nth_input(node_b, gm_b, 0)
                    if isinstance(prev_node_b, Node):
                        prev_node_c = env_c[prev_node_b.name]
                        env_c[prev_node_c.name] = _insert_logger_after_node(
                            prev_node_c, gm_b, logger_cls, '_ns_logger_b_inp_',
                            node_b.name, name_b, ref_name, ref_node_type_b,
                            NSSingleResultValuesType.NODE_INPUT.value,
                            index_within_arg=0, index_of_arg=0,
                            fqn=fqn_base_b)
                    elif isinstance(prev_node_b, list):
                        # first, save the prev_node instances, because they
                        # will be overwritten in the env after the first logger
                        # is added
                        prev_node_c_list = [env_c[arg.name] for arg in prev_node_b]

                        for arg_idx, arg in enumerate(prev_node_b):
                            prev_node_c = prev_node_c_list[arg_idx]
                            env_c[prev_node_c.name] = _insert_logger_after_node(
                                prev_node_c, gm_b, logger_cls, '_ns_logger_b_inp_',
                                node_b.name, name_b, ref_name, ref_node_type_b,
                                NSSingleResultValuesType.NODE_INPUT.value,
                                index_within_arg=arg_idx, index_of_arg=0,
                                fqn=fqn_base_b)
                    else:
                        # logging of inputs which are not lists is not supported yet
                        raise AssertionError(f"type {type(prev_node_b)} is not handled yet")
                # subgraph so far:
                #
                # (prev_node_c)+ -> (logger_c_input)?

            # Note: this if statement is always True, spelling it out to clarify code
            # intent.
            if node_b_is_start_node or node_b_is_end_node:
                # ensure env_c is populated with base node
                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
                node_c = env_c[node_b.name]

                # after this point,
                #
                # node_a is the original node from graph_a, with parent module gm_a
                # node_b is the original node from graph_b, with parent module gm_b
                # node_c is the copy of node_b in graph_c
                #
                # subgraph so far:
                #
                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

                if node_b_is_start_node:

                    # cast dtype from the dtype of node_c's input to the dtype of
                    # node_a's input (dequant, etc)
                    # prev_node_c = node_c.args[0]
                    prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)  # type: ignore[possibly-undefined]
                    if should_log_inputs:
                        # skip the input logger when inserting a dtype cast
                        if isinstance(prev_node_c, Node):
                            prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)
                        elif isinstance(prev_node_c, list):
                            prev_node_c = [get_normalized_nth_input(arg, gm_b, 0) for arg in prev_node_c]
                    dtype_cast_node = _insert_dtype_cast_after_node(
                        subgraph_a.start_node, node_c, prev_node_c, gm_a, gm_b, graph_c,
                        node_b.name + '_dtype_cast_', logger_cls,
                        node_type_to_io_type_map)
                    # note: not inserting to env_c because all nodes which use the dtype
                    # casts are copied from graph_a
                    #
                    # subgraph so far:
                    #
                    #           (dtype_cast_node)+
                    #                  /
                    # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

                    # if input logging is enabled, log the input to the subgraph
                    if should_log_inputs:
                        # TODO: explain this
                        ref_node_name = ''
                        if isinstance(dtype_cast_node, Node):
                            dtype_cast_node = _insert_logger_after_node(
                                dtype_cast_node, gm_b, logger_cls, '_ns_logger_a_inp_',
                                ref_node_name, name_a, ref_name, ref_node_type_a,
                                NSSingleResultValuesType.NODE_INPUT.value,
                                index_within_arg=0, index_of_arg=0,
                                fqn=fqn_base_a)
                            input_logger: Union[Node, List[Node]] = dtype_cast_node
                        else:
                            assert isinstance(dtype_cast_node, list)
                            new_loggers = []
                            for dtype_cast_idx, dtype_cast_node_inner in enumerate(dtype_cast_node):
                                dtype_cast_logger = _insert_logger_after_node(
                                    dtype_cast_node_inner, gm_b, logger_cls, '_ns_logger_a_inp_',
                                    ref_node_name, name_a, ref_name, ref_node_type_a,
                                    NSSingleResultValuesType.NODE_INPUT.value,
                                    index_within_arg=dtype_cast_idx,
                                    index_of_arg=0,
                                    fqn=fqn_base_a)
                                new_loggers.append(dtype_cast_logger)
                            dtype_cast_node = new_loggers
                            input_logger = dtype_cast_node
                        # subgraph so far:
                        #
                        #       (dtype_cast_node)+ -> (logger_a_input)?
                        #                  /
                        # prev_node_c -> (logger_c_input)? -> node_start_c

                    # hook up the new mod_a copy to be in the graph, receiving the
                    # same inputs as mod_b does, with dtype cast to match a
                    # Some ops, such as LSTMs, have two non-param inputs. If we have
                    # such an op, pass the second param as well. Note: dtype casting
                    # for the second param is not implemented yet, it can be added
                    # later if there is a use case.
                    node_c_second_non_param_arg = None
                    num_non_param_args_node_a = get_number_of_non_param_args(subgraph_a.start_node, gm_a)
                    if num_non_param_args_node_a == 2:
                        # node_c_second_non_param_arg = node_c.args[1]
                        node_c_second_non_param_arg = get_normalized_nth_input(node_c, gm_b, 1)
                    node_a_shadows_c = _insert_copy_of_subgraph_a_after_input_node_c(
                        dtype_cast_node, node_c_second_non_param_arg,
                        subgraph_a, gm_a, gm_b, node_c.name + '_shadow_copy_')
                    env_c[node_a_shadows_c.name] = node_a_shadows_c
                    # subgraph so far:
                    #
                    #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy(args/kwargs not shown)
                    #                  /
                    # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

                    if should_log_inputs:
                        # When we created the input logger, we left the ref_node_name
                        # as an empty string, because the subgraph copy did not exist
                        # yet. Now that the subgraph copy exists, we modify this name
                        # to its true value.
                        # Note: the alternative to this is to create the input logger
                        # after creating the subgraph, which is slightly more
                        # complicated. This is the lesser of two evils.
                        # input_logger = env_c[dtype_cast_node.name]
                        # Find the first node in the subgraph
                        cur_node = node_a_shadows_c
                        while get_normalized_nth_input(cur_node, gm_b, 0) != input_logger:  # type: ignore[possibly-undefined]
                            cur_node = get_normalized_nth_input(cur_node, gm_b, 0)  # type: ignore[assignment]
                        if isinstance(input_logger, Node):
                            input_logger_mod = getattr(gm_b, input_logger.name)
                            input_logger_mod.ref_node_name = cur_node.name
                        else:
                            assert isinstance(input_logger, list)
                            for input_logger_inner in input_logger:
                                input_logger_mod = getattr(gm_b, input_logger_inner.name)
                                input_logger_mod.ref_node_name = cur_node.name

                    # hook up a logger to the mod_a copy
                    env_c[node_a_shadows_c.name] = _insert_logger_after_node(
                        env_c[node_a_shadows_c.name], gm_b, logger_cls, '_ns_logger_a_',
                        node_a_shadows_c.name, name_a, ref_name, ref_node_type_a,
                        NSSingleResultValuesType.NODE_OUTPUT.value,
                        index_within_arg=0, index_of_arg=0,
                        fqn=fqn_base_a)
                    # subgraph so far:
                    #
                    #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
                    #                  /
                    # (prev_node_c)+ -> (logger_c_input)? -> node_start_c

                if node_b_is_end_node:

                    # hook up a logger to the mod_b copy
                    env_c[node_b.name] = _insert_logger_after_node(
                        env_c[node_b.name], gm_b, logger_cls, '_ns_logger_b_',
                        node_b.name, name_b, ref_name, ref_node_type_b,
                        NSSingleResultValuesType.NODE_OUTPUT.value,
                        index_within_arg=0, index_of_arg=0,
                        fqn=fqn_base_b)
                    # subgraph so far:
                    #
                    #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
                    #                  /
                    # (prev_node_c+) -> (logger_c_input)? -> node_start_c -> ... -> node_end_c -> logger_c
                    #
                    # Note: node_start_c may be the same node as node_end_c, or they
                    # may have nodes inbetween.

        else:
            env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)

    gm_c = GraphModule(gm_b, graph_c)
    return gm_c
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_safeguard.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode
3
+ from torch.overrides import TorchFunctionMode
4
+
5
+
6
class AutogradStateOpsFailSafeguard(TorchFunctionMode):
    """
    Detect grad state ops during exporting the graph and fail the process by
    raising an error, to avoid unexpected behavior. Those grad mode ops could be:
    `torch.no_grad`
    `torch.enable_grad`
    `torch.set_grad_enabled`

    Export with predispatch mode is exempted.
    """

    def __torch_function__(self, func, types, args=(), kwargs=None):
        # Ops that flip the global autograd state. torch.export does not
        # capture these, so letting one through silently would be unsound.
        _grad_toggle_ops = (torch._C._set_grad_enabled,)

        if func in _grad_toggle_ops:
            assert len(args) == 1
            requested_state = args[0]
            proxy_mode = torch._C._get_dispatch_mode(
                torch._C._TorchDispatchModeKey.PROXY
            )
            # Only guard while actually tracing, i.e. an active PROXY torch
            # dispatch mode. pre_dispatch tracing is exempt: autograd ops such
            # as `torch.no_grad` are legal there. A no-op toggle (requested
            # state equals the current state) is also harmless.
            tracing_without_predispatch = (
                proxy_mode is not None
                and isinstance(proxy_mode, ProxyTorchDispatchMode)
                and not proxy_mode.pre_dispatch
            )
            if tracing_without_predispatch and requested_state != torch._C.is_grad_enabled():
                raise RuntimeError(
                    f"Encountered autograd state manager op {func} trying to change global autograd state "
                    "while exporting. This is unsafe because we don't capture this op in torch.export "
                    "today, hence we can't reflect the user intention soundly."
                )
        return func(*args, **(kwargs or {}))
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_tree_utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Callable, Dict, Optional
2
+
3
+ from torch.utils._pytree import Context, TreeSpec
4
+
5
+
6
+ def reorder_kwargs(user_kwargs: Dict[str, Any], spec: TreeSpec) -> Dict[str, Any]:
7
+ """Reorder user-provided kwargs to match the order in `spec`. `spec` is
8
+ expected to be the in_spec of an exported program, i.e. the spec that
9
+ results from flattening `(args, kwargs)`.
10
+
11
+ We need this to provide consistent input ordering, such so that users can
12
+ pass in foo(a=a, b=b) OR foo(b=b, a=a) and receive the same result.
13
+ """
14
+ # Make sure that the spec is actually shaped like (args, kwargs)
15
+ assert spec.type is tuple
16
+ assert spec.num_children == 2
17
+ kwargs_spec = spec.children_specs[1]
18
+ assert kwargs_spec.type is dict
19
+
20
+ if set(user_kwargs) != set(kwargs_spec.context):
21
+ raise ValueError(
22
+ f"kwarg key mismatch: "
23
+ f"Got {list(user_kwargs)} but expected {kwargs_spec.context}"
24
+ )
25
+
26
+ reordered_kwargs = {}
27
+ for kw in kwargs_spec.context:
28
+ reordered_kwargs[kw] = user_kwargs[kw]
29
+
30
+ return reordered_kwargs
31
+
32
+
33
def is_equivalent(
    spec1: TreeSpec,
    spec2: TreeSpec,
    equivalence_fn: Callable[[Optional[type], Context, Optional[type], Context], bool],
) -> bool:
    """Customizable equivalence check for two TreeSpecs.

    Arguments:
        spec1: The first TreeSpec to compare
        spec2: The second TreeSpec to compare
        equivalence_fn: A function to determine the equivalence of two
            TreeSpecs by examining their types and contexts. It will be called like:

                equivalence_fn(spec1.type, spec1.context, spec2.type, spec2.context)

            This function will be applied recursively to all children.

    Returns:
        True if the two TreeSpecs are equivalent, False otherwise.
    """
    # Compare the roots first; a mismatch short-circuits the whole subtree.
    if not equivalence_fn(spec1.type, spec1.context, spec2.type, spec2.context):
        return False

    children1 = spec1.children_specs
    children2 = spec2.children_specs
    if len(children1) != len(children2):
        return False

    # Recurse pairwise over the children.
    return all(
        is_equivalent(child1, child2, equivalence_fn)
        for child1, child2 in zip(children1, children2)
    )
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.69 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/common_types.cpython-311.pyc ADDED
Binary file (1.7 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/cpp.cpython-311.pyc ADDED
Binary file (5.42 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/init.cpython-311.pyc ADDED
Binary file (28.7 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/parameter.cpython-311.pyc ADDED
Binary file (12.7 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/thnn.cpython-311.pyc ADDED
Binary file (346 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/thnn.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # this is for historical pickle deserialization, it is not used otherwise
2
+
3
+ def _get_thnn_function_backend():
4
+ pass
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.ao.nn.intrinsic import ConvBn1d
2
+ from torch.ao.nn.intrinsic import ConvBn2d
3
+ from torch.ao.nn.intrinsic import ConvBn3d
4
+ from torch.ao.nn.intrinsic import ConvBnReLU1d
5
+ from torch.ao.nn.intrinsic import ConvBnReLU2d
6
+ from torch.ao.nn.intrinsic import ConvBnReLU3d
7
+ from torch.ao.nn.intrinsic import ConvReLU1d
8
+ from torch.ao.nn.intrinsic import ConvReLU2d
9
+ from torch.ao.nn.intrinsic import ConvReLU3d
10
+ from torch.ao.nn.intrinsic import LinearReLU
11
+ from torch.ao.nn.intrinsic import BNReLU2d
12
+ from torch.ao.nn.intrinsic import BNReLU3d
13
+ from torch.ao.nn.intrinsic import LinearBn1d
14
+ from torch.ao.nn.intrinsic.modules.fused import _FusedModule # noqa: F401
15
+
16
+ # Include the subpackages in case user imports from it directly
17
+ from . import modules # noqa: F401
18
+ from . import qat # noqa: F401
19
+ from . import quantized # noqa: F401
20
+
21
+ __all__ = [
22
+ 'ConvBn1d',
23
+ 'ConvBn2d',
24
+ 'ConvBn3d',
25
+ 'ConvBnReLU1d',
26
+ 'ConvBnReLU2d',
27
+ 'ConvBnReLU3d',
28
+ 'ConvReLU1d',
29
+ 'ConvReLU2d',
30
+ 'ConvReLU3d',
31
+ 'LinearReLU',
32
+ 'BNReLU2d',
33
+ 'BNReLU3d',
34
+ 'LinearBn1d',
35
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.21 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/modules/fused.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.ao.nn.intrinsic import BNReLU2d
2
+ from torch.ao.nn.intrinsic import BNReLU3d
3
+ from torch.ao.nn.intrinsic import ConvBn1d
4
+ from torch.ao.nn.intrinsic import ConvBn2d
5
+ from torch.ao.nn.intrinsic import ConvBn3d
6
+ from torch.ao.nn.intrinsic import ConvBnReLU1d
7
+ from torch.ao.nn.intrinsic import ConvBnReLU2d
8
+ from torch.ao.nn.intrinsic import ConvBnReLU3d
9
+ from torch.ao.nn.intrinsic import ConvReLU1d
10
+ from torch.ao.nn.intrinsic import ConvReLU2d
11
+ from torch.ao.nn.intrinsic import ConvReLU3d
12
+ from torch.ao.nn.intrinsic import LinearBn1d
13
+ from torch.ao.nn.intrinsic import LinearReLU
14
+ from torch.ao.nn.intrinsic.modules.fused import _FusedModule # noqa: F401
15
+
16
+ __all__ = [
17
+ 'BNReLU2d',
18
+ 'BNReLU3d',
19
+ 'ConvBn1d',
20
+ 'ConvBn2d',
21
+ 'ConvBn3d',
22
+ 'ConvBnReLU1d',
23
+ 'ConvBnReLU2d',
24
+ 'ConvBnReLU3d',
25
+ 'ConvReLU1d',
26
+ 'ConvReLU2d',
27
+ 'ConvReLU3d',
28
+ 'LinearBn1d',
29
+ 'LinearReLU',
30
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-311.pyc ADDED
Binary file (1.27 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-311.pyc ADDED
Binary file (712 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .modules import * # noqa: F403
2
+ # to ensure customers can use the module below
3
+ # without importing it directly
4
+ import torch.nn.intrinsic.quantized.dynamic
5
+
6
+ __all__ = [
7
+ 'BNReLU2d',
8
+ 'BNReLU3d',
9
+ 'ConvReLU1d',
10
+ 'ConvReLU2d',
11
+ 'ConvReLU3d',
12
+ 'LinearReLU',
13
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (335 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from torch.ao.nn.intrinsic.quantized.dynamic import LinearReLU
2
+
3
+ __all__ = [
4
+ 'LinearReLU',
5
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .linear_relu import LinearReLU
2
+ from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d
3
+ from .bn_relu import BNReLU2d, BNReLU3d
4
+
5
+ __all__ = [
6
+ 'LinearReLU',
7
+ 'ConvReLU1d',
8
+ 'ConvReLU2d',
9
+ 'ConvReLU3d',
10
+ 'BNReLU2d',
11
+ 'BNReLU3d',
12
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (556 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-311.pyc ADDED
Binary file (350 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__init__.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .module import Module
2
+ from .linear import Identity, Linear, Bilinear, LazyLinear
3
+ from .conv import Conv1d, Conv2d, Conv3d, \
4
+ ConvTranspose1d, ConvTranspose2d, ConvTranspose3d, \
5
+ LazyConv1d, LazyConv2d, LazyConv3d, LazyConvTranspose1d, LazyConvTranspose2d, LazyConvTranspose3d
6
+ from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \
7
+ Softmax, Softmax2d, LogSoftmax, ELU, SELU, CELU, GELU, Hardshrink, LeakyReLU, LogSigmoid, \
8
+ Softplus, Softshrink, MultiheadAttention, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU, \
9
+ Hardsigmoid, Hardswish, SiLU, Mish
10
+ from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \
11
+ CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \
12
+ MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, SmoothL1Loss, HuberLoss, \
13
+ SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, TripletMarginWithDistanceLoss, PoissonNLLLoss, GaussianNLLLoss
14
+ from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict
15
+ from .pooling import AvgPool1d, AvgPool2d, AvgPool3d, MaxPool1d, MaxPool2d, MaxPool3d, \
16
+ MaxUnpool1d, MaxUnpool2d, MaxUnpool3d, FractionalMaxPool2d, FractionalMaxPool3d, LPPool1d, LPPool2d, LPPool3d, \
17
+ AdaptiveMaxPool1d, AdaptiveMaxPool2d, AdaptiveMaxPool3d, AdaptiveAvgPool1d, AdaptiveAvgPool2d, AdaptiveAvgPool3d
18
+ from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d, SyncBatchNorm, \
19
+ LazyBatchNorm1d, LazyBatchNorm2d, LazyBatchNorm3d
20
+ from .instancenorm import InstanceNorm1d, InstanceNorm2d, InstanceNorm3d, \
21
+ LazyInstanceNorm1d, LazyInstanceNorm2d, LazyInstanceNorm3d
22
+ from .normalization import LocalResponseNorm, CrossMapLRN2d, LayerNorm, GroupNorm
23
+ from .dropout import Dropout, Dropout1d, Dropout2d, Dropout3d, AlphaDropout, FeatureAlphaDropout
24
+ from .padding import ReflectionPad1d, ReflectionPad2d, ReflectionPad3d, ReplicationPad1d, ReplicationPad2d, \
25
+ ReplicationPad3d, ZeroPad1d, ZeroPad2d, ZeroPad3d, ConstantPad1d, ConstantPad2d, ConstantPad3d, \
26
+ CircularPad1d, CircularPad2d, CircularPad3d
27
+ from .sparse import Embedding, EmbeddingBag
28
+ from .rnn import RNNBase, RNN, LSTM, GRU, \
29
+ RNNCellBase, RNNCell, LSTMCell, GRUCell
30
+ from .pixelshuffle import PixelShuffle, PixelUnshuffle
31
+ from .upsampling import UpsamplingNearest2d, UpsamplingBilinear2d, Upsample
32
+ from .distance import PairwiseDistance, CosineSimilarity
33
+ from .fold import Fold, Unfold
34
+ from .adaptive import AdaptiveLogSoftmaxWithLoss
35
+ from .transformer import TransformerEncoder, TransformerDecoder, \
36
+ TransformerEncoderLayer, TransformerDecoderLayer, Transformer
37
+ from .flatten import Flatten, Unflatten
38
+ from .channelshuffle import ChannelShuffle
39
+
40
+ __all__ = [
41
+ 'Module', 'Identity', 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d',
42
+ 'ConvTranspose2d', 'ConvTranspose3d', 'Threshold', 'ReLU', 'Hardtanh', 'ReLU6',
43
+ 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'CELU', 'GLU', 'GELU', 'Hardshrink',
44
+ 'LeakyReLU', 'LogSigmoid', 'Softplus', 'Softshrink', 'MultiheadAttention', 'PReLU', 'Softsign', 'Softmin',
45
+ 'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss',
46
+ 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss',
47
+ 'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss', 'GaussianNLLLoss',
48
+ 'HuberLoss', 'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict',
49
+ 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d',
50
+ 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'FractionalMaxPool2d', "FractionalMaxPool3d",
51
+ 'LPPool1d', 'LPPool2d', 'LPPool3d', 'LocalResponseNorm', 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d',
52
+ 'InstanceNorm1d', 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'SyncBatchNorm',
53
+ 'Dropout', 'Dropout1d', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout',
54
+ 'ReflectionPad1d', 'ReflectionPad2d', 'ReflectionPad3d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d',
55
+ 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell',
56
+ 'LSTMCell', 'GRUCell', 'PixelShuffle', 'PixelUnshuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d',
57
+ 'PairwiseDistance', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d',
58
+ 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad1d', 'ZeroPad2d', 'ZeroPad3d',
59
+ 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold',
60
+ 'AdaptiveLogSoftmaxWithLoss', 'TransformerEncoder', 'TransformerDecoder',
61
+ 'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Transformer',
62
+ 'LazyLinear', 'LazyConv1d', 'LazyConv2d', 'LazyConv3d',
63
+ 'LazyConvTranspose1d', 'LazyConvTranspose2d', 'LazyConvTranspose3d',
64
+ 'LazyBatchNorm1d', 'LazyBatchNorm2d', 'LazyBatchNorm3d',
65
+ 'LazyInstanceNorm1d', 'LazyInstanceNorm2d', 'LazyInstanceNorm3d',
66
+ 'Flatten', 'Unflatten', 'Hardsigmoid', 'Hardswish', 'SiLU', 'Mish', 'TripletMarginWithDistanceLoss', 'ChannelShuffle',
67
+ 'CircularPad1d', 'CircularPad2d', 'CircularPad3d'
68
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (7.04 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/_functions.cpython-311.pyc ADDED
Binary file (13.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/container.cpython-311.pyc ADDED
Binary file (55.7 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/conv.cpython-311.pyc ADDED
Binary file (76.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/flatten.cpython-311.pyc ADDED
Binary file (8.13 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/fold.cpython-311.pyc ADDED
Binary file (14.4 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/lazy.cpython-311.pyc ADDED
Binary file (14.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/activation.py ADDED
@@ -0,0 +1,1624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ from torch import Tensor
6
+ from .linear import NonDynamicallyQuantizableLinear
7
+ from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
8
+ from torch.nn.parameter import Parameter
9
+ from .module import Module
10
+ from .. import functional as F
11
+
12
+ __all__ = ['Threshold', 'ReLU', 'RReLU', 'Hardtanh', 'ReLU6', 'Sigmoid', 'Hardsigmoid', 'Tanh',
13
+ 'SiLU', 'Mish', 'Hardswish', 'ELU', 'CELU', 'SELU', 'GLU', 'GELU', 'Hardshrink', 'LeakyReLU',
14
+ 'LogSigmoid', 'Softplus', 'Softshrink', 'MultiheadAttention', 'PReLU', 'Softsign', 'Tanhshrink',
15
+ 'Softmin', 'Softmax', 'Softmax2d', 'LogSoftmax']
16
+
17
+
18
class Threshold(Module):
    r"""Thresholds each element of the input Tensor.

    Elements strictly greater than :attr:`threshold` pass through unchanged;
    every other element is replaced by :attr:`value`:

    .. math::
        y =
        \begin{cases}
        x, &\text{ if } x > \text{threshold} \\
        \text{value}, &\text{ otherwise }
        \end{cases}

    Args:
        threshold: The value to threshold at
        value: The value to replace with
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Threshold(0.1, 20)
        >>> output = m(torch.randn(2))
    """

    __constants__ = ['threshold', 'value', 'inplace']

    threshold: float
    value: float
    inplace: bool

    def __init__(self, threshold: float, value: float, inplace: bool = False) -> None:
        super().__init__()
        self.threshold = threshold
        self.value = value
        self.inplace = inplace
        # TODO: check in THNN (if inplace == True, then assert value <= threshold)

    def forward(self, input: Tensor) -> Tensor:
        # Delegate to the functional implementation.
        return F.threshold(input, self.threshold, self.value, self.inplace)

    def extra_repr(self):
        suffix = ', inplace=True' if self.inplace else ''
        return f'threshold={self.threshold}, value={self.value}{suffix}'
65
+
66
+
67
class ReLU(Module):
    r"""Applies the rectified linear unit function element-wise:
    :math:`\text{ReLU}(x) = (x)^+ = \max(0, x)`.

    Args:
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    .. image:: ../scripts/activation_images/ReLU.png

    Examples::

        >>> m = nn.ReLU()
        >>> output = m(torch.randn(2))

        An implementation of CReLU - https://arxiv.org/abs/1603.05201

        >>> m = nn.ReLU()
        >>> input = torch.randn(2).unsqueeze(0)
        >>> output = torch.cat((m(input), m(-input)))
    """

    __constants__ = ['inplace']
    inplace: bool

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.relu(input, inplace=self.inplace)

    def extra_repr(self) -> str:
        # Only surface the flag when it deviates from the default.
        return 'inplace=True' if self.inplace else ''
108
+
109
+
110
class RReLU(Module):
    r"""Applies the randomized leaky rectified linear unit function, element-wise.

    Method described in the paper:
    `Empirical Evaluation of Rectified Activations in Convolutional Network <https://arxiv.org/abs/1505.00853>`_.

    .. math::
        \text{RReLU}(x) =
        \begin{cases}
            x & \text{if } x \geq 0 \\
            ax & \text{ otherwise }
        \end{cases}

    During training the negative slope :math:`a` is sampled from the uniform
    distribution :math:`\mathcal{U}(\text{lower}, \text{upper})`; in
    evaluation mode it is fixed to
    :math:`a = \frac{\text{lower} + \text{upper}}{2}`.

    Args:
        lower: lower bound of the uniform distribution. Default: :math:`\frac{1}{8}`
        upper: upper bound of the uniform distribution. Default: :math:`\frac{1}{3}`
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.RReLU(0.1, 0.3)
        >>> output = m(torch.randn(2))
    """

    __constants__ = ['lower', 'upper', 'inplace']

    lower: float
    upper: float
    inplace: bool

    def __init__(self, lower: float = 1. / 8, upper: float = 1. / 3,
                 inplace: bool = False):
        super().__init__()
        self.lower = lower
        self.upper = upper
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        # `self.training` selects between sampled and averaged slope.
        return F.rrelu(input, self.lower, self.upper, self.training, self.inplace)

    def extra_repr(self):
        suffix = ', inplace=True' if self.inplace else ''
        return f'lower={self.lower}, upper={self.upper}{suffix}'
171
+
172
+
173
class Hardtanh(Module):
    r"""Applies the HardTanh function element-wise.

    Input values are clamped to the linear region ``[min_val, max_val]``:

    .. math::
        \text{HardTanh}(x) = \begin{cases}
            \text{max\_val} & \text{ if } x > \text{ max\_val } \\
            \text{min\_val} & \text{ if } x < \text{ min\_val } \\
            x & \text{ otherwise } \\
        \end{cases}

    Args:
        min_val: minimum value of the linear region range. Default: -1
        max_val: maximum value of the linear region range. Default: 1
        inplace: can optionally do the operation in-place. Default: ``False``

    Keyword arguments :attr:`min_value` and :attr:`max_value`
    have been deprecated in favor of :attr:`min_val` and :attr:`max_val`.

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Hardtanh(-2, 2)
        >>> output = m(torch.randn(2))
    """

    __constants__ = ['min_val', 'max_val', 'inplace']

    min_val: float
    max_val: float
    inplace: bool

    def __init__(self, min_val: float = -1., max_val: float = 1.,
                 inplace: bool = False, min_value: Optional[float] = None,
                 max_value: Optional[float] = None) -> None:
        super().__init__()
        # Honor (and warn about) the deprecated keyword spellings.
        if min_value is not None:
            warnings.warn("keyword argument min_value is deprecated and rename to min_val")
            min_val = min_value
        if max_value is not None:
            warnings.warn("keyword argument max_value is deprecated and rename to max_val")
            max_val = max_value

        self.min_val = min_val
        self.max_val = max_val
        self.inplace = inplace
        # NOTE(review): assert is stripped under `python -O`; kept for
        # byte-compatible behavior with the original implementation.
        assert self.max_val > self.min_val

    def forward(self, input: Tensor) -> Tensor:
        return F.hardtanh(input, self.min_val, self.max_val, self.inplace)

    def extra_repr(self) -> str:
        suffix = ', inplace=True' if self.inplace else ''
        return f'min_val={self.min_val}, max_val={self.max_val}{suffix}'
239
+
240
+
241
class ReLU6(Hardtanh):
    r"""Applies the ReLU6 function element-wise:
    :math:`\text{ReLU6}(x) = \min(\max(0,x), 6)`.

    Args:
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.ReLU6()
        >>> output = m(torch.randn(2))
    """

    def __init__(self, inplace: bool = False):
        # ReLU6 is simply Hardtanh clamped to [0, 6].
        super().__init__(0., 6., inplace)

    def extra_repr(self) -> str:
        return 'inplace=True' if self.inplace else ''
269
+
270
+
271
class Sigmoid(Module):
    r"""Applies the Sigmoid function element-wise:

    .. math::
        \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Sigmoid()
        >>> output = m(torch.randn(2))
    """

    def forward(self, input: Tensor) -> Tensor:
        # Stateless module: just the elementwise sigmoid.
        return torch.sigmoid(input)
293
+
294
+
295
class Hardsigmoid(Module):
    r"""Applies the Hardsigmoid function element-wise:

    .. math::
        \text{Hardsigmoid}(x) = \begin{cases}
            0 & \text{if~} x \le -3, \\
            1 & \text{if~} x \ge +3, \\
            x / 6 + 1 / 2 & \text{otherwise}
        \end{cases}

    Args:
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Hardsigmoid()
        >>> output = m(torch.randn(2))
    """

    __constants__ = ['inplace']

    inplace: bool

    def __init__(self, inplace: bool = False) -> None:
        super().__init__()
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.hardsigmoid(input, self.inplace)
333
+
334
+
335
class Tanh(Module):
    r"""Applies the Hyperbolic Tangent (Tanh) function element-wise:

    .. math::
        \text{Tanh}(x) = \tanh(x) = \frac{\exp(x) - \exp(-x)} {\exp(x) + \exp(-x)}

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Tanh()
        >>> output = m(torch.randn(2))
    """

    def forward(self, input: Tensor) -> Tensor:
        # Stateless module: just the elementwise tanh.
        return torch.tanh(input)
358
+
359
class SiLU(Module):
    r"""Applies the Sigmoid Linear Unit (SiLU) function, element-wise.

    Also known as the swish function:

    .. math::
        \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.}

    .. note::
        See `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`_
        where the SiLU (Sigmoid Linear Unit) was originally coined, and see
        `Sigmoid-Weighted Linear Units for Neural Network Function Approximation
        in Reinforcement Learning <https://arxiv.org/abs/1702.03118>`_ and `Swish:
        a Self-Gated Activation Function <https://arxiv.org/abs/1710.05941v1>`_
        where the SiLU was experimented with later.

    Args:
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.SiLU()
        >>> output = m(torch.randn(2))
    """

    __constants__ = ['inplace']
    inplace: bool

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.silu(input, inplace=self.inplace)

    def extra_repr(self) -> str:
        return 'inplace=True' if self.inplace else ''
401
+
402
class Mish(Module):
    r"""Element-wise Mish activation (a self-regularized, non-monotonic function).

    .. math::
        \text{Mish}(x) = x * \text{Tanh}(\text{Softplus}(x))

    .. note::
        See `Mish: A Self Regularized Non-Monotonic Neural Activation Function <https://arxiv.org/abs/1908.08681>`_

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Mish()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['inplace']
    inplace: bool

    def __init__(self, inplace: bool = False):
        super().__init__()
        # Forwarded to F.mish; in-place mode avoids an output allocation.
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.mish(input, inplace=self.inplace)

    def extra_repr(self) -> str:
        if self.inplace:
            return 'inplace=True'
        return ''
439
+
440
class Hardswish(Module):
    r"""Element-wise Hardswish activation.

    Introduced in `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`_.

    .. math::
        \text{Hardswish}(x) = \begin{cases}
            0 & \text{if~} x \le -3, \\
            x & \text{if~} x \ge +3, \\
            x \cdot (x + 3) /6 & \text{otherwise}
        \end{cases}

    Args:
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Hardswish()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['inplace']

    inplace: bool

    def __init__(self, inplace: bool = False) -> None:
        super().__init__()
        # Flag is passed straight through to the functional implementation.
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.hardswish(input, self.inplace)
480
+
481
+
482
class ELU(Module):
    r"""Element-wise Exponential Linear Unit (ELU) activation.

    Described in `Fast and Accurate Deep Network Learning by Exponential Linear
    Units (ELUs) <https://arxiv.org/abs/1511.07289>`__.

    .. math::
        \text{ELU}(x) = \begin{cases}
        x, & \text{ if } x > 0\\
        \alpha * (\exp(x) - 1), & \text{ if } x \leq 0
        \end{cases}

    Args:
        alpha: the :math:`\alpha` value for the ELU formulation. Default: 1.0
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.ELU()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['alpha', 'inplace']
    alpha: float
    inplace: bool

    def __init__(self, alpha: float = 1., inplace: bool = False) -> None:
        super().__init__()
        self.alpha = alpha
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.elu(input, self.alpha, self.inplace)

    def extra_repr(self) -> str:
        # Always report alpha; append inplace only when enabled.
        suffix = ', inplace=True' if self.inplace else ''
        return f'alpha={self.alpha}{suffix}'
528
+
529
+
530
class CELU(Module):
    r"""Element-wise CELU activation.

    .. math::
        \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))

    More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ .

    Args:
        alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.CELU()
        >>> input = torch.randn(2)
        >>> output = m(input)

    .. _`Continuously Differentiable Exponential Linear Units`:
        https://arxiv.org/abs/1704.07483
    """

    __constants__ = ['alpha', 'inplace']
    alpha: float
    inplace: bool

    def __init__(self, alpha: float = 1., inplace: bool = False) -> None:
        super().__init__()
        self.alpha = alpha
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.celu(input, self.alpha, self.inplace)

    def extra_repr(self) -> str:
        suffix = ', inplace=True' if self.inplace else ''
        return f'alpha={self.alpha}{suffix}'
573
+
574
+
575
class SELU(Module):
    r"""Element-wise SELU (scaled exponential linear unit) activation.

    .. math::
        \text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1)))

    with :math:`\alpha = 1.6732632423543772848170429916717` and
    :math:`\text{scale} = 1.0507009873554804934193349852946`.

    .. warning::
        When using ``kaiming_normal`` or ``kaiming_normal_`` for initialisation,
        ``nonlinearity='linear'`` should be used instead of ``nonlinearity='selu'``
        in order to get `Self-Normalizing Neural Networks`_.
        See :func:`torch.nn.init.calculate_gain` for more information.

    More details can be found in the paper `Self-Normalizing Neural Networks`_ .

    Args:
        inplace (bool, optional): can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.SELU()
        >>> input = torch.randn(2)
        >>> output = m(input)

    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
    """

    __constants__ = ['inplace']
    inplace: bool

    def __init__(self, inplace: bool = False) -> None:
        super().__init__()
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.selu(input, self.inplace)

    def extra_repr(self) -> str:
        if self.inplace:
            return 'inplace=True'
        return ''
623
+
624
+
625
class GLU(Module):
    r"""Gated linear unit.

    :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half
    of the input matrices and :math:`b` is the second half.

    Args:
        dim (int): the dimension on which to split the input. Default: -1

    Shape:
        - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional
          dimensions
        - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`

    Examples::

        >>> m = nn.GLU()
        >>> input = torch.randn(4, 2)
        >>> output = m(input)
    """

    __constants__ = ['dim']
    dim: int

    def __init__(self, dim: int = -1) -> None:
        super().__init__()
        # Dimension halved by the gate; must have even size at call time.
        self.dim = dim

    def forward(self, input: Tensor) -> Tensor:
        return F.glu(input, self.dim)

    def extra_repr(self) -> str:
        return f'dim={self.dim}'
658
+
659
+
660
class GELU(Module):
    r"""Gaussian Error Linear Unit activation.

    .. math:: \text{GELU}(x) = x * \Phi(x)

    where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution.

    When the approximate argument is 'tanh', Gelu is estimated with:

    .. math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt{2 / \pi} * (x + 0.044715 * x^3)))

    Args:
        approximate (str, optional): the gelu approximation algorithm to use:
            ``'none'`` | ``'tanh'``. Default: ``'none'``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.GELU()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['approximate']
    approximate: str

    def __init__(self, approximate: str = 'none') -> None:
        super().__init__()
        # Selects the exact ('none') or tanh-based approximation at call time.
        self.approximate = approximate

    def forward(self, input: Tensor) -> Tensor:
        return F.gelu(input, approximate=self.approximate)

    def extra_repr(self) -> str:
        return f'approximate={repr(self.approximate)}'
700
+
701
+
702
class Hardshrink(Module):
    r"""Element-wise hard shrinkage.

    .. math::
        \text{HardShrink}(x) =
        \begin{cases}
        x, & \text{ if } x > \lambda \\
        x, & \text{ if } x < -\lambda \\
        0, & \text{ otherwise }
        \end{cases}

    Args:
        lambd: the :math:`\lambda` value for the Hardshrink formulation. Default: 0.5

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Hardshrink()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['lambd']
    lambd: float

    def __init__(self, lambd: float = 0.5) -> None:
        super().__init__()
        # Values with |x| <= lambd are zeroed by the functional op.
        self.lambd = lambd

    def forward(self, input: Tensor) -> Tensor:
        return F.hardshrink(input, self.lambd)

    def extra_repr(self) -> str:
        return f'{self.lambd}'
743
+
744
+
745
class LeakyReLU(Module):
    r"""Element-wise leaky rectified linear unit.

    .. math::
        \text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)

    or

    .. math::
        \text{LeakyReLU}(x) =
        \begin{cases}
        x, & \text{ if } x \geq 0 \\
        \text{negative\_slope} \times x, & \text{ otherwise }
        \end{cases}

    Args:
        negative_slope: Controls the angle of the negative slope (which is used for
          negative input values). Default: 1e-2
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)` where `*` means, any number of additional
          dimensions
        - Output: :math:`(*)`, same shape as the input

    Examples::

        >>> m = nn.LeakyReLU(0.1)
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['inplace', 'negative_slope']
    inplace: bool
    negative_slope: float

    def __init__(self, negative_slope: float = 1e-2, inplace: bool = False) -> None:
        super().__init__()
        self.negative_slope = negative_slope
        self.inplace = inplace

    def forward(self, input: Tensor) -> Tensor:
        return F.leaky_relu(input, self.negative_slope, self.inplace)

    def extra_repr(self) -> str:
        suffix = ', inplace=True' if self.inplace else ''
        return f'negative_slope={self.negative_slope}{suffix}'
795
+
796
+
797
class LogSigmoid(Module):
    r"""Element-wise log-sigmoid activation.

    .. math::
        \text{LogSigmoid}(x) = \log\left(\frac{ 1 }{ 1 + \exp(-x)}\right)

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.LogSigmoid()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    def forward(self, input: Tensor) -> Tensor:
        # Numerically stable log(sigmoid(x)) provided by the functional API.
        return F.logsigmoid(input)
818
+
819
+
820
class Softplus(Module):
    r"""Element-wise softplus activation.

    .. math::
        \text{Softplus}(x) = \frac{1}{\beta} * \log(1 + \exp(\beta * x))

    SoftPlus is a smooth approximation to the ReLU function and can be used
    to constrain the output of a machine to always be positive.

    For numerical stability the implementation reverts to the linear function
    when :math:`input \times \beta > threshold`.

    Args:
        beta: the :math:`\beta` value for the Softplus formulation. Default: 1
        threshold: values above this revert to a linear function. Default: 20

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Softplus()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['beta', 'threshold']
    beta: float
    threshold: float

    def __init__(self, beta: float = 1.0, threshold: float = 20.0) -> None:
        super().__init__()
        self.beta = beta
        self.threshold = threshold

    def forward(self, input: Tensor) -> Tensor:
        return F.softplus(input, self.beta, self.threshold)

    def extra_repr(self) -> str:
        return f'beta={self.beta}, threshold={self.threshold}'
863
+
864
+
865
class Softshrink(Module):
    r"""Element-wise soft shrinkage.

    .. math::
        \text{SoftShrinkage}(x) =
        \begin{cases}
        x - \lambda, & \text{ if } x > \lambda \\
        x + \lambda, & \text{ if } x < -\lambda \\
        0, & \text{ otherwise }
        \end{cases}

    Args:
        lambd: the :math:`\lambda` (must be no less than zero) value for the Softshrink formulation. Default: 0.5

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Softshrink()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    __constants__ = ['lambd']
    lambd: float

    def __init__(self, lambd: float = 0.5) -> None:
        super().__init__()
        # Shrinkage threshold; expected to be non-negative.
        self.lambd = lambd

    def forward(self, input: Tensor) -> Tensor:
        return F.softshrink(input, self.lambd)

    def extra_repr(self) -> str:
        return str(self.lambd)
904
+
905
+
906
+ def _check_arg_device(x: Optional[torch.Tensor]) -> bool:
907
+ if x is not None:
908
+ return x.device.type in ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name]
909
+ return True
910
+
911
+
912
+ def _arg_requires_grad(x: Optional[torch.Tensor]) -> bool:
913
+ if x is not None:
914
+ return x.requires_grad
915
+ return False
916
+
917
+
918
+ def _is_make_fx_tracing():
919
+ if not torch.jit.is_scripting():
920
+ torch_dispatch_mode_stack = torch.utils._python_dispatch._get_current_dispatch_mode_stack()
921
+ return any(type(x) == torch.fx.experimental.proxy_tensor.ProxyTorchDispatchMode for x in torch_dispatch_mode_stack)
922
+ else:
923
+ return False
924
+
925
+
926
+ class MultiheadAttention(Module):
927
+ r"""Allows the model to jointly attend to information from different representation subspaces.
928
+
929
+ Method described in the paper:
930
+ `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
931
+
932
+ Multi-Head Attention is defined as:
933
+
934
+ .. math::
935
+ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
936
+
937
+ where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
938
+
939
+ ``nn.MultiHeadAttention`` will use the optimized implementations of
940
+ ``scaled_dot_product_attention()`` when possible.
941
+
942
+ In addition to support for the new ``scaled_dot_product_attention()``
943
+ function, for speeding up Inference, MHA will use
944
+ fastpath inference with support for Nested Tensors, iff:
945
+
946
+ - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor).
947
+ - inputs are batched (3D) with ``batch_first==True``
948
+ - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
949
+ - training is disabled (using ``.eval()``)
950
+ - ``add_bias_kv`` is ``False``
951
+ - ``add_zero_attn`` is ``False``
952
+ - ``kdim`` and ``vdim`` are equal to ``embed_dim``
953
+ - if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
954
+ nor ``attn_mask`` is passed
955
+ - autocast is disabled
956
+
957
+ If the optimized inference fastpath implementation is in use, a
958
+ `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
959
+ ``query``/``key``/``value`` to represent padding more efficiently than using a
960
+ padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_
961
+ will be returned, and an additional speedup proportional to the fraction of the input
962
+ that is padding can be expected.
963
+
964
+ Args:
965
+ embed_dim: Total dimension of the model.
966
+ num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
967
+ across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
968
+ dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
969
+ bias: If specified, adds bias to input / output projection layers. Default: ``True``.
970
+ add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
971
+ add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
972
+ Default: ``False``.
973
+ kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
974
+ vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
975
+ batch_first: If ``True``, then the input and output tensors are provided
976
+ as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
977
+
978
+ Examples::
979
+
980
+ >>> # xdoctest: +SKIP
981
+ >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
982
+ >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
983
+
984
+ .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`:
985
+ https://arxiv.org/abs/2205.14135
986
+
987
+ """
988
+
989
+ __constants__ = ['batch_first']
990
+ bias_k: Optional[torch.Tensor]
991
+ bias_v: Optional[torch.Tensor]
992
+
993
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False,
994
+ kdim=None, vdim=None, batch_first=False, device=None, dtype=None) -> None:
995
+ if embed_dim <= 0 or num_heads <= 0:
996
+ raise ValueError(
997
+ f"embed_dim and num_heads must be greater than 0,"
998
+ f" got embed_dim={embed_dim} and num_heads={num_heads} instead"
999
+ )
1000
+ factory_kwargs = {'device': device, 'dtype': dtype}
1001
+ super().__init__()
1002
+ self.embed_dim = embed_dim
1003
+ self.kdim = kdim if kdim is not None else embed_dim
1004
+ self.vdim = vdim if vdim is not None else embed_dim
1005
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
1006
+
1007
+ self.num_heads = num_heads
1008
+ self.dropout = dropout
1009
+ self.batch_first = batch_first
1010
+ self.head_dim = embed_dim // num_heads
1011
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
1012
+
1013
+ if not self._qkv_same_embed_dim:
1014
+ self.q_proj_weight = Parameter(torch.empty((embed_dim, embed_dim), **factory_kwargs))
1015
+ self.k_proj_weight = Parameter(torch.empty((embed_dim, self.kdim), **factory_kwargs))
1016
+ self.v_proj_weight = Parameter(torch.empty((embed_dim, self.vdim), **factory_kwargs))
1017
+ self.register_parameter('in_proj_weight', None)
1018
+ else:
1019
+ self.in_proj_weight = Parameter(torch.empty((3 * embed_dim, embed_dim), **factory_kwargs))
1020
+ self.register_parameter('q_proj_weight', None)
1021
+ self.register_parameter('k_proj_weight', None)
1022
+ self.register_parameter('v_proj_weight', None)
1023
+
1024
+ if bias:
1025
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs))
1026
+ else:
1027
+ self.register_parameter('in_proj_bias', None)
1028
+ self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
1029
+
1030
+ if add_bias_kv:
1031
+ self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
1032
+ self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
1033
+ else:
1034
+ self.bias_k = self.bias_v = None
1035
+
1036
+ self.add_zero_attn = add_zero_attn
1037
+
1038
+ self._reset_parameters()
1039
+
1040
+ def _reset_parameters(self):
1041
+ if self._qkv_same_embed_dim:
1042
+ xavier_uniform_(self.in_proj_weight)
1043
+ else:
1044
+ xavier_uniform_(self.q_proj_weight)
1045
+ xavier_uniform_(self.k_proj_weight)
1046
+ xavier_uniform_(self.v_proj_weight)
1047
+
1048
+ if self.in_proj_bias is not None:
1049
+ constant_(self.in_proj_bias, 0.)
1050
+ constant_(self.out_proj.bias, 0.)
1051
+ if self.bias_k is not None:
1052
+ xavier_normal_(self.bias_k)
1053
+ if self.bias_v is not None:
1054
+ xavier_normal_(self.bias_v)
1055
+
1056
+ def __setstate__(self, state):
1057
+ # Support loading old MultiheadAttention checkpoints generated by v1.1.0
1058
+ if '_qkv_same_embed_dim' not in state:
1059
+ state['_qkv_same_embed_dim'] = True
1060
+
1061
+ super().__setstate__(state)
1062
+
1063
+ def forward(
1064
+ self,
1065
+ query: Tensor,
1066
+ key: Tensor,
1067
+ value: Tensor,
1068
+ key_padding_mask: Optional[Tensor] = None,
1069
+ need_weights: bool = True,
1070
+ attn_mask: Optional[Tensor] = None,
1071
+ average_attn_weights: bool = True,
1072
+ is_causal : bool = False) -> Tuple[Tensor, Optional[Tensor]]:
1073
+ r"""Compute attention outputs using query, key, and value embeddings.
1074
+
1075
+ Supports optional parameters for padding, masks and attention weights.
1076
+
1077
+ Args:
1078
+ query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False``
1079
+ or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length,
1080
+ :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``.
1081
+ Queries are compared against key-value pairs to produce the output.
1082
+ See "Attention Is All You Need" for more details.
1083
+ key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False``
1084
+ or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length,
1085
+ :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``.
1086
+ See "Attention Is All You Need" for more details.
1087
+ value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when
1088
+ ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source
1089
+ sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``.
1090
+ See "Attention Is All You Need" for more details.
1091
+ key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
1092
+ to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
1093
+ Binary and float masks are supported.
1094
+ For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
1095
+ the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
1096
+ need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
1097
+ Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention``
1098
+ and achieve the best performance for MHA.
1099
+ Default: ``True``.
1100
+ attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
1101
+ :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
1102
+ :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
1103
+ broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
1104
+ Binary and float masks are supported. For a binary mask, a ``True`` value indicates that the
1105
+ corresponding position is not allowed to attend. For a float mask, the mask values will be added to
1106
+ the attention weight.
1107
+ If both attn_mask and key_padding_mask are supplied, their types should match.
1108
+ average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
1109
+ heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
1110
+ effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)
1111
+ is_causal: If specified, applies a causal mask as attention mask.
1112
+ Default: ``False``.
1113
+ Warning:
1114
+ ``is_causal`` provides a hint that ``attn_mask`` is the
1115
+ causal mask. Providing incorrect hints can result in
1116
+ incorrect execution, including forward and backward
1117
+ compatibility.
1118
+
1119
+ Outputs:
1120
+ - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched,
1121
+ :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``,
1122
+ where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the
1123
+ embedding dimension ``embed_dim``.
1124
+ - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``,
1125
+ returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
1126
+ :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
1127
+ :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
1128
+ head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`.
1129
+
1130
+ .. note::
1131
+ `batch_first` argument is ignored for unbatched inputs.
1132
+ """
1133
+ why_not_fast_path = ''
1134
+ if ((attn_mask is not None and torch.is_floating_point(attn_mask))
1135
+ or (key_padding_mask is not None) and torch.is_floating_point(key_padding_mask)):
1136
+ why_not_fast_path = "floating-point masks are not supported for fast path."
1137
+
1138
+ is_batched = query.dim() == 3
1139
+
1140
+ key_padding_mask = F._canonical_mask(
1141
+ mask=key_padding_mask,
1142
+ mask_name="key_padding_mask",
1143
+ other_type=F._none_or_dtype(attn_mask),
1144
+ other_name="attn_mask",
1145
+ target_type=query.dtype
1146
+ )
1147
+
1148
+ attn_mask = F._canonical_mask(
1149
+ mask=attn_mask,
1150
+ mask_name="attn_mask",
1151
+ other_type=None,
1152
+ other_name="",
1153
+ target_type=query.dtype,
1154
+ check_other=False,
1155
+ )
1156
+
1157
+ is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()
1158
+
1159
+ if not is_fastpath_enabled:
1160
+ why_not_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True"
1161
+ elif not is_batched:
1162
+ why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
1163
+ elif query is not key or key is not value:
1164
+ # When lifting this restriction, don't forget to either
1165
+ # enforce that the dtypes all match or test cases where
1166
+ # they don't!
1167
+ why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
1168
+ elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
1169
+ why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
1170
+ elif self.in_proj_weight is None:
1171
+ why_not_fast_path = "in_proj_weight was None"
1172
+ elif query.dtype != self.in_proj_weight.dtype:
1173
+ # this case will fail anyway, but at least they'll get a useful error message.
1174
+ why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
1175
+ elif self.training:
1176
+ why_not_fast_path = "training is enabled"
1177
+ elif (self.num_heads % 2) != 0:
1178
+ why_not_fast_path = "self.num_heads is not even"
1179
+ elif not self.batch_first:
1180
+ why_not_fast_path = "batch_first was not True"
1181
+ elif self.bias_k is not None:
1182
+ why_not_fast_path = "self.bias_k was not None"
1183
+ elif self.bias_v is not None:
1184
+ why_not_fast_path = "self.bias_v was not None"
1185
+ elif self.add_zero_attn:
1186
+ why_not_fast_path = "add_zero_attn was enabled"
1187
+ elif not self._qkv_same_embed_dim:
1188
+ why_not_fast_path = "_qkv_same_embed_dim was not True"
1189
+ elif query.is_nested and (key_padding_mask is not None or attn_mask is not None):
1190
+ why_not_fast_path = "supplying both src_key_padding_mask and src_mask at the same time \
1191
+ is not supported with NestedTensor input"
1192
+ elif torch.is_autocast_enabled():
1193
+ why_not_fast_path = "autocast is enabled"
1194
+
1195
+ if not why_not_fast_path:
1196
+ tensor_args = (
1197
+ query,
1198
+ key,
1199
+ value,
1200
+ self.in_proj_weight,
1201
+ self.in_proj_bias,
1202
+ self.out_proj.weight,
1203
+ self.out_proj.bias,
1204
+ )
1205
+ # We have to use list comprehensions below because TorchScript does not support
1206
+ # generator expressions.
1207
+ if torch.overrides.has_torch_function(tensor_args):
1208
+ why_not_fast_path = "some Tensor argument has_torch_function"
1209
+ elif _is_make_fx_tracing():
1210
+ why_not_fast_path = "we are running make_fx tracing"
1211
+ elif not all(_check_arg_device(x) for x in tensor_args):
1212
+ why_not_fast_path = ("some Tensor argument's device is neither one of "
1213
+ f"cpu, cuda or {torch.utils.backend_registration._privateuse1_backend_name}")
1214
+ elif torch.is_grad_enabled() and any(_arg_requires_grad(x) for x in tensor_args):
1215
+ why_not_fast_path = ("grad is enabled and at least one of query or the "
1216
+ "input/output projection weights or biases requires_grad")
1217
+ if not why_not_fast_path:
1218
+ merged_mask, mask_type = self.merge_masks(attn_mask, key_padding_mask, query)
1219
+
1220
+ if self.in_proj_bias is not None and self.in_proj_weight is not None:
1221
+ return torch._native_multi_head_attention(
1222
+ query,
1223
+ key,
1224
+ value,
1225
+ self.embed_dim,
1226
+ self.num_heads,
1227
+ self.in_proj_weight,
1228
+ self.in_proj_bias,
1229
+ self.out_proj.weight,
1230
+ self.out_proj.bias,
1231
+ merged_mask,
1232
+ need_weights,
1233
+ average_attn_weights,
1234
+ mask_type)
1235
+
1236
+ any_nested = query.is_nested or key.is_nested or value.is_nested
1237
+ assert not any_nested, ("MultiheadAttention does not support NestedTensor outside of its fast path. " +
1238
+ f"The fast path was not hit because {why_not_fast_path}")
1239
+
1240
+ if self.batch_first and is_batched:
1241
+ # make sure that the transpose op does not affect the "is" property
1242
+ if key is value:
1243
+ if query is key:
1244
+ query = key = value = query.transpose(1, 0)
1245
+ else:
1246
+ query, key = (x.transpose(1, 0) for x in (query, key))
1247
+ value = key
1248
+ else:
1249
+ query, key, value = (x.transpose(1, 0) for x in (query, key, value))
1250
+
1251
+ if not self._qkv_same_embed_dim:
1252
+ attn_output, attn_output_weights = F.multi_head_attention_forward(
1253
+ query, key, value, self.embed_dim, self.num_heads,
1254
+ self.in_proj_weight, self.in_proj_bias,
1255
+ self.bias_k, self.bias_v, self.add_zero_attn,
1256
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
1257
+ training=self.training,
1258
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
1259
+ attn_mask=attn_mask,
1260
+ use_separate_proj_weight=True,
1261
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
1262
+ v_proj_weight=self.v_proj_weight,
1263
+ average_attn_weights=average_attn_weights,
1264
+ is_causal=is_causal)
1265
+ else:
1266
+ attn_output, attn_output_weights = F.multi_head_attention_forward(
1267
+ query, key, value, self.embed_dim, self.num_heads,
1268
+ self.in_proj_weight, self.in_proj_bias,
1269
+ self.bias_k, self.bias_v, self.add_zero_attn,
1270
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
1271
+ training=self.training,
1272
+ key_padding_mask=key_padding_mask,
1273
+ need_weights=need_weights,
1274
+ attn_mask=attn_mask,
1275
+ average_attn_weights=average_attn_weights,
1276
+ is_causal=is_causal)
1277
+ if self.batch_first and is_batched:
1278
+ return attn_output.transpose(1, 0), attn_output_weights
1279
+ else:
1280
+ return attn_output, attn_output_weights
1281
+
1282
+ def merge_masks(self, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor],
1283
+ query: Tensor) -> Tuple[Optional[Tensor], Optional[int]]:
1284
+ r"""Determine mask type and combine masks if necessary.
1285
+
1286
+ If only one mask is provided, that mask
1287
+ and the corresponding mask type will be returned. If both masks are provided, they will be both
1288
+ expanded to shape ``(batch_size, num_heads, seq_len, seq_len)``, combined with logical ``or``
1289
+ and mask type 2 will be returned
1290
+ Args:
1291
+ attn_mask: attention mask of shape ``(seq_len, seq_len)``, mask type 0
1292
+ key_padding_mask: padding mask of shape ``(batch_size, seq_len)``, mask type 1
1293
+ query: query embeddings of shape ``(batch_size, seq_len, embed_dim)``
1294
+ Returns:
1295
+ merged_mask: merged mask
1296
+ mask_type: merged mask type (0, 1, or 2)
1297
+ """
1298
+ mask_type: Optional[int] = None
1299
+ merged_mask: Optional[Tensor] = None
1300
+
1301
+ if key_padding_mask is not None:
1302
+ mask_type = 1
1303
+ merged_mask = key_padding_mask
1304
+
1305
+ if attn_mask is not None:
1306
+ # In this branch query can't be a nested tensor, so it has a shape
1307
+ batch_size, seq_len, _ = query.shape
1308
+ mask_type = 2
1309
+
1310
+ # Always expands attn_mask to 4D
1311
+ if attn_mask.dim() == 3:
1312
+ attn_mask_expanded = attn_mask.view(batch_size, -1, seq_len, seq_len)
1313
+ else: # attn_mask.dim() == 2:
1314
+ attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand(batch_size, self.num_heads, -1, -1)
1315
+ merged_mask = attn_mask_expanded
1316
+
1317
+ if key_padding_mask is not None:
1318
+ key_padding_mask_expanded = key_padding_mask.view(batch_size, 1, 1, seq_len).expand(-1, self.num_heads, -1, -1)
1319
+ merged_mask = attn_mask_expanded + key_padding_mask_expanded
1320
+
1321
+ # no attn_mask and no key_padding_mask, returns None, None
1322
+ return merged_mask, mask_type
1323
+
1324
+
1325
+ class PReLU(Module):
1326
+ r"""Applies the element-wise PReLU function.
1327
+
1328
+ .. math::
1329
+ \text{PReLU}(x) = \max(0,x) + a * \min(0,x)
1330
+
1331
+ or
1332
+
1333
+ .. math::
1334
+ \text{PReLU}(x) =
1335
+ \begin{cases}
1336
+ x, & \text{ if } x \geq 0 \\
1337
+ ax, & \text{ otherwise }
1338
+ \end{cases}
1339
+
1340
+ Here :math:`a` is a learnable parameter. When called without arguments, `nn.PReLU()` uses a single
1341
+ parameter :math:`a` across all input channels. If called with `nn.PReLU(nChannels)`,
1342
+ a separate :math:`a` is used for each input channel.
1343
+
1344
+
1345
+ .. note::
1346
+ weight decay should not be used when learning :math:`a` for good performance.
1347
+
1348
+ .. note::
1349
+ Channel dim is the 2nd dim of input. When input has dims < 2, then there is
1350
+ no channel dim and the number of channels = 1.
1351
+
1352
+ Args:
1353
+ num_parameters (int): number of :math:`a` to learn.
1354
+ Although it takes an int as input, there is only two values are legitimate:
1355
+ 1, or the number of channels at input. Default: 1
1356
+ init (float): the initial value of :math:`a`. Default: 0.25
1357
+
1358
+ Shape:
1359
+ - Input: :math:`( *)` where `*` means, any number of additional
1360
+ dimensions.
1361
+ - Output: :math:`(*)`, same shape as the input.
1362
+
1363
+ Attributes:
1364
+ weight (Tensor): the learnable weights of shape (:attr:`num_parameters`).
1365
+
1366
+ .. image:: ../scripts/activation_images/PReLU.png
1367
+
1368
+ Examples::
1369
+
1370
+ >>> m = nn.PReLU()
1371
+ >>> input = torch.randn(2)
1372
+ >>> output = m(input)
1373
+ """
1374
+
1375
+ __constants__ = ['num_parameters']
1376
+ num_parameters: int
1377
+
1378
+ def __init__(self, num_parameters: int = 1, init: float = 0.25,
1379
+ device=None, dtype=None) -> None:
1380
+ factory_kwargs = {'device': device, 'dtype': dtype}
1381
+ self.num_parameters = num_parameters
1382
+ super().__init__()
1383
+ self.init = init
1384
+ self.weight = Parameter(torch.empty(num_parameters, **factory_kwargs))
1385
+ self.reset_parameters()
1386
+
1387
+ def reset_parameters(self):
1388
+ torch.nn.init.constant_(self.weight, self.init)
1389
+
1390
+ def forward(self, input: Tensor) -> Tensor:
1391
+ return F.prelu(input, self.weight)
1392
+
1393
+ def extra_repr(self) -> str:
1394
+ return f'num_parameters={self.num_parameters}'
1395
+
1396
+
1397
+ class Softsign(Module):
1398
+ r"""Applies the element-wise Softsign function.
1399
+
1400
+ .. math::
1401
+ \text{SoftSign}(x) = \frac{x}{ 1 + |x|}
1402
+
1403
+ Shape:
1404
+ - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
1405
+ - Output: :math:`(*)`, same shape as the input.
1406
+
1407
+ .. image:: ../scripts/activation_images/Softsign.png
1408
+
1409
+ Examples::
1410
+
1411
+ >>> m = nn.Softsign()
1412
+ >>> input = torch.randn(2)
1413
+ >>> output = m(input)
1414
+ """
1415
+
1416
+ def forward(self, input: Tensor) -> Tensor:
1417
+ return F.softsign(input)
1418
+
1419
+
1420
+ class Tanhshrink(Module):
1421
+ r"""Applies the element-wise Tanhshrink function.
1422
+
1423
+ .. math::
1424
+ \text{Tanhshrink}(x) = x - \tanh(x)
1425
+
1426
+ Shape:
1427
+ - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
1428
+ - Output: :math:`(*)`, same shape as the input.
1429
+
1430
+ .. image:: ../scripts/activation_images/Tanhshrink.png
1431
+
1432
+ Examples::
1433
+
1434
+ >>> m = nn.Tanhshrink()
1435
+ >>> input = torch.randn(2)
1436
+ >>> output = m(input)
1437
+ """
1438
+
1439
+ def forward(self, input: Tensor) -> Tensor:
1440
+ return F.tanhshrink(input)
1441
+
1442
+
1443
+ class Softmin(Module):
1444
+ r"""Applies the Softmin function to an n-dimensional input Tensor.
1445
+
1446
+ Rescales them so that the elements of the n-dimensional output Tensor
1447
+ lie in the range `[0, 1]` and sum to 1.
1448
+
1449
+ Softmin is defined as:
1450
+
1451
+ .. math::
1452
+ \text{Softmin}(x_{i}) = \frac{\exp(-x_i)}{\sum_j \exp(-x_j)}
1453
+
1454
+ Shape:
1455
+ - Input: :math:`(*)` where `*` means, any number of additional
1456
+ dimensions
1457
+ - Output: :math:`(*)`, same shape as the input
1458
+
1459
+ Args:
1460
+ dim (int): A dimension along which Softmin will be computed (so every slice
1461
+ along dim will sum to 1).
1462
+
1463
+ Returns:
1464
+ a Tensor of the same dimension and shape as the input, with
1465
+ values in the range [0, 1]
1466
+
1467
+ Examples::
1468
+
1469
+ >>> m = nn.Softmin(dim=1)
1470
+ >>> input = torch.randn(2, 3)
1471
+ >>> output = m(input)
1472
+ """
1473
+
1474
+ __constants__ = ['dim']
1475
+ dim: Optional[int]
1476
+
1477
+ def __init__(self, dim: Optional[int] = None) -> None:
1478
+ super().__init__()
1479
+ self.dim = dim
1480
+
1481
+ def __setstate__(self, state):
1482
+ super().__setstate__(state)
1483
+ if not hasattr(self, 'dim'):
1484
+ self.dim = None
1485
+
1486
+ def forward(self, input: Tensor) -> Tensor:
1487
+ return F.softmin(input, self.dim, _stacklevel=5)
1488
+
1489
+ def extra_repr(self):
1490
+ return f'dim={self.dim}'
1491
+
1492
+ class Softmax(Module):
1493
+ r"""Applies the Softmax function to an n-dimensional input Tensor.
1494
+
1495
+ Rescales them so that the elements of the n-dimensional output Tensor
1496
+ lie in the range [0,1] and sum to 1.
1497
+
1498
+ Softmax is defined as:
1499
+
1500
+ .. math::
1501
+ \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
1502
+
1503
+ When the input Tensor is a sparse tensor then the unspecified
1504
+ values are treated as ``-inf``.
1505
+
1506
+ Shape:
1507
+ - Input: :math:`(*)` where `*` means, any number of additional
1508
+ dimensions
1509
+ - Output: :math:`(*)`, same shape as the input
1510
+
1511
+ Returns:
1512
+ a Tensor of the same dimension and shape as the input with
1513
+ values in the range [0, 1]
1514
+
1515
+ Args:
1516
+ dim (int): A dimension along which Softmax will be computed (so every slice
1517
+ along dim will sum to 1).
1518
+
1519
+ .. note::
1520
+ This module doesn't work directly with NLLLoss,
1521
+ which expects the Log to be computed between the Softmax and itself.
1522
+ Use `LogSoftmax` instead (it's faster and has better numerical properties).
1523
+
1524
+ Examples::
1525
+
1526
+ >>> m = nn.Softmax(dim=1)
1527
+ >>> input = torch.randn(2, 3)
1528
+ >>> output = m(input)
1529
+
1530
+ """
1531
+
1532
+ __constants__ = ['dim']
1533
+ dim: Optional[int]
1534
+
1535
+ def __init__(self, dim: Optional[int] = None) -> None:
1536
+ super().__init__()
1537
+ self.dim = dim
1538
+
1539
+ def __setstate__(self, state):
1540
+ super().__setstate__(state)
1541
+ if not hasattr(self, 'dim'):
1542
+ self.dim = None
1543
+
1544
+ def forward(self, input: Tensor) -> Tensor:
1545
+ return F.softmax(input, self.dim, _stacklevel=5)
1546
+
1547
+ def extra_repr(self) -> str:
1548
+ return f'dim={self.dim}'
1549
+
1550
+
1551
+ class Softmax2d(Module):
1552
+ r"""Applies SoftMax over features to each spatial location.
1553
+
1554
+ When given an image of ``Channels x Height x Width``, it will
1555
+ apply `Softmax` to each location :math:`(Channels, h_i, w_j)`
1556
+
1557
+ Shape:
1558
+ - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`.
1559
+ - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input)
1560
+
1561
+ Returns:
1562
+ a Tensor of the same dimension and shape as the input with
1563
+ values in the range [0, 1]
1564
+
1565
+ Examples::
1566
+
1567
+ >>> m = nn.Softmax2d()
1568
+ >>> # you softmax over the 2nd dimension
1569
+ >>> input = torch.randn(2, 3, 12, 13)
1570
+ >>> output = m(input)
1571
+ """
1572
+
1573
+ def forward(self, input: Tensor) -> Tensor:
1574
+ if input.dim() not in (3, 4):
1575
+ raise ValueError(
1576
+ f"Softmax2d: expected input to be 3D or 4D, got {input.dim()}D instead"
1577
+ )
1578
+ return F.softmax(input, -3, _stacklevel=5)
1579
+
1580
+
1581
+ class LogSoftmax(Module):
1582
+ r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional input Tensor.
1583
+
1584
+ The LogSoftmax formulation can be simplified as:
1585
+
1586
+ .. math::
1587
+ \text{LogSoftmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right)
1588
+
1589
+ Shape:
1590
+ - Input: :math:`(*)` where `*` means, any number of additional
1591
+ dimensions
1592
+ - Output: :math:`(*)`, same shape as the input
1593
+
1594
+ Args:
1595
+ dim (int): A dimension along which LogSoftmax will be computed.
1596
+
1597
+ Returns:
1598
+ a Tensor of the same dimension and shape as the input with
1599
+ values in the range [-inf, 0)
1600
+
1601
+ Examples::
1602
+
1603
+ >>> m = nn.LogSoftmax(dim=1)
1604
+ >>> input = torch.randn(2, 3)
1605
+ >>> output = m(input)
1606
+ """
1607
+
1608
+ __constants__ = ['dim']
1609
+ dim: Optional[int]
1610
+
1611
+ def __init__(self, dim: Optional[int] = None) -> None:
1612
+ super().__init__()
1613
+ self.dim = dim
1614
+
1615
+ def __setstate__(self, state):
1616
+ super().__setstate__(state)
1617
+ if not hasattr(self, 'dim'):
1618
+ self.dim = None
1619
+
1620
+ def forward(self, input: Tensor) -> Tensor:
1621
+ return F.log_softmax(input, self.dim, _stacklevel=5)
1622
+
1623
+ def extra_repr(self):
1624
+ return f'dim={self.dim}'
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/batchnorm.py ADDED
@@ -0,0 +1,849 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Any
2
+
3
+ import torch
4
+ from torch import Tensor
5
+ from torch.nn.parameter import Parameter, UninitializedParameter, UninitializedBuffer
6
+
7
+ from .. import functional as F
8
+ from .. import init
9
+ from ._functions import SyncBatchNorm as sync_batch_norm
10
+ from .lazy import LazyModuleMixin
11
+ from .module import Module
12
+
13
+ __all__ = ['BatchNorm1d', 'LazyBatchNorm1d', 'BatchNorm2d', 'LazyBatchNorm2d', 'BatchNorm3d',
14
+ 'LazyBatchNorm3d', 'SyncBatchNorm']
15
+
16
+
17
+ class _NormBase(Module):
18
+ """Common base of _InstanceNorm and _BatchNorm."""
19
+
20
+ _version = 2
21
+ __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
22
+ num_features: int
23
+ eps: float
24
+ momentum: float
25
+ affine: bool
26
+ track_running_stats: bool
27
+ # WARNING: weight and bias purposely not defined here.
28
+ # See https://github.com/pytorch/pytorch/issues/39670
29
+
30
+ def __init__(
31
+ self,
32
+ num_features: int,
33
+ eps: float = 1e-5,
34
+ momentum: float = 0.1,
35
+ affine: bool = True,
36
+ track_running_stats: bool = True,
37
+ device=None,
38
+ dtype=None
39
+ ) -> None:
40
+ factory_kwargs = {'device': device, 'dtype': dtype}
41
+ super().__init__()
42
+ self.num_features = num_features
43
+ self.eps = eps
44
+ self.momentum = momentum
45
+ self.affine = affine
46
+ self.track_running_stats = track_running_stats
47
+ if self.affine:
48
+ self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
49
+ self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
50
+ else:
51
+ self.register_parameter("weight", None)
52
+ self.register_parameter("bias", None)
53
+ if self.track_running_stats:
54
+ self.register_buffer('running_mean', torch.zeros(num_features, **factory_kwargs))
55
+ self.register_buffer('running_var', torch.ones(num_features, **factory_kwargs))
56
+ self.running_mean: Optional[Tensor]
57
+ self.running_var: Optional[Tensor]
58
+ self.register_buffer('num_batches_tracked',
59
+ torch.tensor(0, dtype=torch.long,
60
+ **{k: v for k, v in factory_kwargs.items() if k != 'dtype'}))
61
+ self.num_batches_tracked: Optional[Tensor]
62
+ else:
63
+ self.register_buffer("running_mean", None)
64
+ self.register_buffer("running_var", None)
65
+ self.register_buffer("num_batches_tracked", None)
66
+ self.reset_parameters()
67
+
68
+ def reset_running_stats(self) -> None:
69
+ if self.track_running_stats:
70
+ # running_mean/running_var/num_batches... are registered at runtime depending
71
+ # if self.track_running_stats is on
72
+ self.running_mean.zero_() # type: ignore[union-attr]
73
+ self.running_var.fill_(1) # type: ignore[union-attr]
74
+ self.num_batches_tracked.zero_() # type: ignore[union-attr,operator]
75
+
76
+ def reset_parameters(self) -> None:
77
+ self.reset_running_stats()
78
+ if self.affine:
79
+ init.ones_(self.weight)
80
+ init.zeros_(self.bias)
81
+
82
+ def _check_input_dim(self, input):
83
+ raise NotImplementedError
84
+
85
+ def extra_repr(self):
86
+ return (
87
+ "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
88
+ "track_running_stats={track_running_stats}".format(**self.__dict__)
89
+ )
90
+
91
+ def _load_from_state_dict(
92
+ self,
93
+ state_dict,
94
+ prefix,
95
+ local_metadata,
96
+ strict,
97
+ missing_keys,
98
+ unexpected_keys,
99
+ error_msgs,
100
+ ):
101
+ version = local_metadata.get("version", None)
102
+
103
+ if (version is None or version < 2) and self.track_running_stats:
104
+ # at version 2: added num_batches_tracked buffer
105
+ # this should have a default value of 0
106
+ num_batches_tracked_key = prefix + "num_batches_tracked"
107
+ if num_batches_tracked_key not in state_dict:
108
+ state_dict[num_batches_tracked_key] = (
109
+ self.num_batches_tracked
110
+ if self.num_batches_tracked is not None and self.num_batches_tracked.device != torch.device('meta')
111
+ else torch.tensor(0, dtype=torch.long)
112
+ )
113
+
114
+ super()._load_from_state_dict(
115
+ state_dict,
116
+ prefix,
117
+ local_metadata,
118
+ strict,
119
+ missing_keys,
120
+ unexpected_keys,
121
+ error_msgs,
122
+ )
123
+
124
+
125
+ class _BatchNorm(_NormBase):
126
+ def __init__(
127
+ self,
128
+ num_features: int,
129
+ eps: float = 1e-5,
130
+ momentum: float = 0.1,
131
+ affine: bool = True,
132
+ track_running_stats: bool = True,
133
+ device=None,
134
+ dtype=None
135
+ ) -> None:
136
+ factory_kwargs = {'device': device, 'dtype': dtype}
137
+ super().__init__(
138
+ num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
139
+ )
140
+
141
+ def forward(self, input: Tensor) -> Tensor:
142
+ self._check_input_dim(input)
143
+
144
+ # exponential_average_factor is set to self.momentum
145
+ # (when it is available) only so that it gets updated
146
+ # in ONNX graph when this node is exported to ONNX.
147
+ if self.momentum is None:
148
+ exponential_average_factor = 0.0
149
+ else:
150
+ exponential_average_factor = self.momentum
151
+
152
+ if self.training and self.track_running_stats:
153
+ # TODO: if statement only here to tell the jit to skip emitting this when it is None
154
+ if self.num_batches_tracked is not None: # type: ignore[has-type]
155
+ self.num_batches_tracked.add_(1) # type: ignore[has-type]
156
+ if self.momentum is None: # use cumulative moving average
157
+ exponential_average_factor = 1.0 / float(self.num_batches_tracked)
158
+ else: # use exponential moving average
159
+ exponential_average_factor = self.momentum
160
+
161
+ r"""
162
+ Decide whether the mini-batch stats should be used for normalization rather than the buffers.
163
+ Mini-batch stats are used in training mode, and in eval mode when buffers are None.
164
+ """
165
+ if self.training:
166
+ bn_training = True
167
+ else:
168
+ bn_training = (self.running_mean is None) and (self.running_var is None)
169
+
170
+ r"""
171
+ Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
172
+ passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
173
+ used for normalization (i.e. in eval mode when buffers are not None).
174
+ """
175
+ return F.batch_norm(
176
+ input,
177
+ # If buffers are not to be tracked, ensure that they won't be updated
178
+ self.running_mean
179
+ if not self.training or self.track_running_stats
180
+ else None,
181
+ self.running_var if not self.training or self.track_running_stats else None,
182
+ self.weight,
183
+ self.bias,
184
+ bn_training,
185
+ exponential_average_factor,
186
+ self.eps,
187
+ )
188
+
189
+
190
+ class _LazyNormBase(LazyModuleMixin, _NormBase):
191
+
192
+ weight: UninitializedParameter # type: ignore[assignment]
193
+ bias: UninitializedParameter # type: ignore[assignment]
194
+
195
+ def __init__(self, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True,
196
+ device=None, dtype=None) -> None:
197
+ factory_kwargs = {'device': device, 'dtype': dtype}
198
+ super().__init__(
199
+ # affine and track_running_stats are hardcoded to False to
200
+ # avoid creating tensors that will soon be overwritten.
201
+ 0,
202
+ eps,
203
+ momentum,
204
+ False,
205
+ False,
206
+ **factory_kwargs,
207
+ )
208
+ self.affine = affine
209
+ self.track_running_stats = track_running_stats
210
+ if self.affine:
211
+ self.weight = UninitializedParameter(**factory_kwargs)
212
+ self.bias = UninitializedParameter(**factory_kwargs)
213
+ if self.track_running_stats:
214
+ self.running_mean = UninitializedBuffer(**factory_kwargs)
215
+ self.running_var = UninitializedBuffer(**factory_kwargs)
216
+ self.num_batches_tracked = torch.tensor(
217
+ 0, dtype=torch.long, **{k: v for k, v in factory_kwargs.items() if k != 'dtype'})
218
+
219
+ def reset_parameters(self) -> None:
220
+ if not self.has_uninitialized_params() and self.num_features != 0:
221
+ super().reset_parameters()
222
+
223
+ def initialize_parameters(self, input) -> None: # type: ignore[override]
224
+ if self.has_uninitialized_params():
225
+ self.num_features = input.shape[1]
226
+ if self.affine:
227
+ assert isinstance(self.weight, UninitializedParameter)
228
+ assert isinstance(self.bias, UninitializedParameter)
229
+ self.weight.materialize((self.num_features,))
230
+ self.bias.materialize((self.num_features,))
231
+ if self.track_running_stats:
232
+ self.running_mean.materialize((self.num_features,)) # type:ignore[union-attr]
233
+ self.running_var.materialize((self.num_features,)) # type:ignore[union-attr]
234
+ self.reset_parameters()
235
+
236
+
237
+ class BatchNorm1d(_BatchNorm):
238
+ r"""Applies Batch Normalization over a 2D or 3D input.
239
+
240
+ Method described in the paper
241
+ `Batch Normalization: Accelerating Deep Network Training by Reducing
242
+ Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
243
+
244
+ .. math::
245
+
246
+ y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
247
+
248
+ The mean and standard-deviation are calculated per-dimension over
249
+ the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
250
+ of size `C` (where `C` is the number of features or channels of the input). By default, the
251
+ elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
252
+ At train time in the forward pass, the standard-deviation is calculated via the biased estimator,
253
+ equivalent to ``torch.var(input, unbiased=False)``. However, the value stored in the
254
+ moving average of the standard-deviation is calculated via the unbiased estimator, equivalent to
255
+ ``torch.var(input, unbiased=True)``.
256
+
257
+ Also by default, during training this layer keeps running estimates of its
258
+ computed mean and variance, which are then used for normalization during
259
+ evaluation. The running estimates are kept with a default :attr:`momentum`
260
+ of 0.1.
261
+
262
+ If :attr:`track_running_stats` is set to ``False``, this layer then does not
263
+ keep running estimates, and batch statistics are instead used during
264
+ evaluation time as well.
265
+
266
+ .. note::
267
+ This :attr:`momentum` argument is different from one used in optimizer
268
+ classes and the conventional notion of momentum. Mathematically, the
269
+ update rule for running statistics here is
270
+ :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
271
+ where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
272
+ new observed value.
273
+
274
+ Because the Batch Normalization is done over the `C` dimension, computing statistics
275
+ on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.
276
+
277
+ Args:
278
+ num_features: number of features or channels :math:`C` of the input
279
+ eps: a value added to the denominator for numerical stability.
280
+ Default: 1e-5
281
+ momentum: the value used for the running_mean and running_var
282
+ computation. Can be set to ``None`` for cumulative moving average
283
+ (i.e. simple average). Default: 0.1
284
+ affine: a boolean value that when set to ``True``, this module has
285
+ learnable affine parameters. Default: ``True``
286
+ track_running_stats: a boolean value that when set to ``True``, this
287
+ module tracks the running mean and variance, and when set to ``False``,
288
+ this module does not track such statistics, and initializes statistics
289
+ buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
290
+ When these buffers are ``None``, this module always uses batch statistics.
291
+ in both training and eval modes. Default: ``True``
292
+
293
+ Shape:
294
+ - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
295
+ :math:`C` is the number of features or channels, and :math:`L` is the sequence length
296
+ - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
297
+
298
+ Examples::
299
+
300
+ >>> # With Learnable Parameters
301
+ >>> m = nn.BatchNorm1d(100)
302
+ >>> # Without Learnable Parameters
303
+ >>> m = nn.BatchNorm1d(100, affine=False)
304
+ >>> input = torch.randn(20, 100)
305
+ >>> output = m(input)
306
+ """
307
+
308
+ def _check_input_dim(self, input):
309
+ if input.dim() != 2 and input.dim() != 3:
310
+ raise ValueError(
311
+ f"expected 2D or 3D input (got {input.dim()}D input)"
312
+ )
313
+
314
+
315
+ class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
316
+ r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization.
317
+
318
+ Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
319
+ from the ``input.size(1)``.
320
+ The attributes that will be lazily initialized are `weight`, `bias`,
321
+ `running_mean` and `running_var`.
322
+
323
+ Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
324
+ on lazy modules and their limitations.
325
+
326
+ Args:
327
+ eps: a value added to the denominator for numerical stability.
328
+ Default: 1e-5
329
+ momentum: the value used for the running_mean and running_var
330
+ computation. Can be set to ``None`` for cumulative moving average
331
+ (i.e. simple average). Default: 0.1
332
+ affine: a boolean value that when set to ``True``, this module has
333
+ learnable affine parameters. Default: ``True``
334
+ track_running_stats: a boolean value that when set to ``True``, this
335
+ module tracks the running mean and variance, and when set to ``False``,
336
+ this module does not track such statistics, and initializes statistics
337
+ buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
338
+ When these buffers are ``None``, this module always uses batch statistics.
339
+ in both training and eval modes. Default: ``True``
340
+ """
341
+
342
+ cls_to_become = BatchNorm1d # type: ignore[assignment]
343
+
344
+ def _check_input_dim(self, input):
345
+ if input.dim() != 2 and input.dim() != 3:
346
+ raise ValueError(
347
+ f"expected 2D or 3D input (got {input.dim()}D input)"
348
+ )
349
+
350
+
351
+ class BatchNorm2d(_BatchNorm):
352
+ r"""Applies Batch Normalization over a 4D input.
353
+
354
+ 4D is a mini-batch of 2D inputs
355
+ with additional channel dimension. Method described in the paper
356
+ `Batch Normalization: Accelerating Deep Network Training by Reducing
357
+ Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
358
+
359
+ .. math::
360
+
361
+ y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
362
+
363
+ The mean and standard-deviation are calculated per-dimension over
364
+ the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
365
+ of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
366
+ to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
367
+ standard-deviation is calculated via the biased estimator, equivalent to
368
+ ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
369
+ standard-deviation is calculated via the unbiased estimator, equivalent to
370
+ ``torch.var(input, unbiased=True)``.
371
+
372
+ Also by default, during training this layer keeps running estimates of its
373
+ computed mean and variance, which are then used for normalization during
374
+ evaluation. The running estimates are kept with a default :attr:`momentum`
375
+ of 0.1.
376
+
377
+ If :attr:`track_running_stats` is set to ``False``, this layer then does not
378
+ keep running estimates, and batch statistics are instead used during
379
+ evaluation time as well.
380
+
381
+ .. note::
382
+ This :attr:`momentum` argument is different from one used in optimizer
383
+ classes and the conventional notion of momentum. Mathematically, the
384
+ update rule for running statistics here is
385
+ :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
386
+ where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
387
+ new observed value.
388
+
389
+ Because the Batch Normalization is done over the `C` dimension, computing statistics
390
+ on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.
391
+
392
+ Args:
393
+ num_features: :math:`C` from an expected input of size
394
+ :math:`(N, C, H, W)`
395
+ eps: a value added to the denominator for numerical stability.
396
+ Default: 1e-5
397
+ momentum: the value used for the running_mean and running_var
398
+ computation. Can be set to ``None`` for cumulative moving average
399
+ (i.e. simple average). Default: 0.1
400
+ affine: a boolean value that when set to ``True``, this module has
401
+ learnable affine parameters. Default: ``True``
402
+ track_running_stats: a boolean value that when set to ``True``, this
403
+ module tracks the running mean and variance, and when set to ``False``,
404
+ this module does not track such statistics, and initializes statistics
405
+ buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
406
+ When these buffers are ``None``, this module always uses batch statistics.
407
+ in both training and eval modes. Default: ``True``
408
+
409
+ Shape:
410
+ - Input: :math:`(N, C, H, W)`
411
+ - Output: :math:`(N, C, H, W)` (same shape as input)
412
+
413
+ Examples::
414
+
415
+ >>> # With Learnable Parameters
416
+ >>> m = nn.BatchNorm2d(100)
417
+ >>> # Without Learnable Parameters
418
+ >>> m = nn.BatchNorm2d(100, affine=False)
419
+ >>> input = torch.randn(20, 100, 35, 45)
420
+ >>> output = m(input)
421
+ """
422
+
423
+ def _check_input_dim(self, input):
424
+ if input.dim() != 4:
425
+ raise ValueError(f"expected 4D input (got {input.dim()}D input)")
426
+
427
+
428
class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    # After ``num_features`` is inferred from the first input, the lazy
    # machinery converts this instance into a regular BatchNorm2d.
    cls_to_become = BatchNorm2d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        # Same contract as BatchNorm2d: input must be (N, C, H, W).
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")
460
+
461
+
462
class BatchNorm3d(_BatchNorm):
    r"""Applies Batch Normalization over a 5D input.

    5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        # BatchNorm3d only accepts (N, C, D, H, W) inputs.
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")
537
+
538
+
539
class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    # After ``num_features`` is inferred from the first input, the lazy
    # machinery converts this instance into a regular BatchNorm3d.
    cls_to_become = BatchNorm3d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        # Same contract as BatchNorm3d: input must be (N, C, D, H, W).
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")
571
+
572
+
573
class SyncBatchNorm(_BatchNorm):
    r"""Applies Batch Normalization over a N-Dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.SyncBatchNorm(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: float = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        process_group: Optional[Any] = None,
        device=None,
        dtype=None
    ) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )
        # Stats are synchronized only within this group; ``None`` means the
        # default group (the whole world) is used in forward().
        self.process_group = process_group

    def _check_input_dim(self, input):
        # Unlike BatchNorm*d, any rank >= 2 is accepted: (N, C, +).
        if input.dim() < 2:
            raise ValueError(
                f"expected at least 2D input (got {input.dim()}D input)"
            )

    def _check_non_zero_input_channels(self, input):
        if input.size(1) == 0:
            raise ValueError(
                "SyncBatchNorm number of input channels should be non-zero"
            )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)
        self._check_non_zero_input_channels(input)

        # exponential_average_factor is set to self.momentum
        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            assert self.num_batches_tracked is not None
            self.num_batches_tracked.add_(1)
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / self.num_batches_tracked.item()
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

        r"""
        Decide whether the mini-batch stats should be used for normalization rather than the buffers.
        Mini-batch stats are used in training mode, and in eval mode when buffers are None.
        """
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        r"""
        Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
        passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
        used for normalization (i.e. in eval mode when buffers are not None).
        """
        # If buffers are not to be tracked, ensure that they won't be updated
        running_mean = (
            self.running_mean if not self.training or self.track_running_stats else None
        )
        running_var = (
            self.running_var if not self.training or self.track_running_stats else None
        )

        # Don't sync batchnorm stats in inference mode (model.eval()).
        need_sync = (bn_training and self.training and
                     torch.distributed.is_available() and torch.distributed.is_initialized())
        if need_sync:
            # currently only GPU/PrivateUse1 input is supported
            if input.device.type not in ["cuda", torch._C._get_privateuse1_backend_name()]:
                raise ValueError("SyncBatchNorm expected input tensor to be on GPU or "
                                 f"{torch._C._get_privateuse1_backend_name()}")

            process_group = torch.distributed.group.WORLD
            if self.process_group:
                process_group = self.process_group
            world_size = torch.distributed.get_world_size(process_group)
            # A single-rank group has nothing to synchronize with.
            need_sync = world_size > 1

        # fallback to framework BN when synchronization is not necessary
        if not need_sync:
            return F.batch_norm(
                input,
                running_mean,
                running_var,
                self.weight,
                self.bias,
                bn_training,
                exponential_average_factor,
                self.eps,
            )
        else:
            assert bn_training
            return sync_batch_norm.apply(
                input,
                self.weight,
                self.bias,
                running_mean,
                running_var,
                self.eps,
                exponential_average_factor,
                process_group,  # type: ignore[possibly-undefined]
                world_size,  # type: ignore[possibly-undefined]
            )

    @classmethod
    def convert_sync_batchnorm(cls, module, process_group=None):
        r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        """
        module_output = module
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            # Replace this layer in place, copying parameters and buffers
            # by reference so existing optimizers keep working.
            module_output = torch.nn.SyncBatchNorm(
                module.num_features,
                module.eps,
                module.momentum,
                module.affine,
                module.track_running_stats,
                process_group,
            )
            if module.affine:
                with torch.no_grad():
                    module_output.weight = module.weight
                    module_output.bias = module.bias
            module_output.running_mean = module.running_mean
            module_output.running_var = module.running_var
            module_output.num_batches_tracked = module.num_batches_tracked
            module_output.training = module.training
            if hasattr(module, "qconfig"):
                module_output.qconfig = module.qconfig
        # Recurse into children so nested BatchNorm layers are converted too.
        for name, child in module.named_children():
            module_output.add_module(
                name, cls.convert_sync_batchnorm(child, process_group)
            )
        del module
        return module_output
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/channelshuffle.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .module import Module
2
+ from .. import functional as F
3
+
4
+ from torch import Tensor
5
+
6
+ __all__ = ['ChannelShuffle']
7
+
8
class ChannelShuffle(Module):
    r"""Shuffle the channels of a tensor across groups.

    Given an input of shape :math:`(*, C, H, W)` and ``groups`` :math:`g`,
    the channels are viewed as :math:`(*, g, \frac{C}{g}, H, W)`, the group
    and per-group dimensions are transposed, and the result is flattened
    back to the original shape. This interleaves channels from different
    groups, as used by architectures such as ShuffleNet.

    Args:
        groups (int): number of groups to divide channels in.

    Examples::

        >>> # xdoctest: +IGNORE_WANT("FIXME: incorrect want")
        >>> channel_shuffle = nn.ChannelShuffle(2)
        >>> input = torch.randn(1, 4, 2, 2)
        >>> print(input)
        [[[[1, 2],
           [3, 4]],
          [[5, 6],
           [7, 8]],
          [[9, 10],
           [11, 12]],
          [[13, 14],
           [15, 16]],
         ]]
        >>> output = channel_shuffle(input)
        >>> print(output)
        [[[[1, 2],
           [3, 4]],
          [[9, 10],
           [11, 12]],
          [[5, 6],
           [7, 8]],
          [[13, 14],
           [15, 16]],
         ]]
    """

    __constants__ = ['groups']
    # Number of channel groups; channels must be divisible by this.
    groups: int

    def __init__(self, groups: int) -> None:
        super().__init__()
        self.groups = groups

    def forward(self, input: Tensor) -> Tensor:
        # Delegate the reshuffle to the functional implementation.
        shuffled = F.channel_shuffle(input, self.groups)
        return shuffled

    def extra_repr(self) -> str:
        return f'groups={self.groups}'
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/container.py ADDED
@@ -0,0 +1,911 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from collections import OrderedDict, abc as container_abcs
3
+ from itertools import chain, islice
4
+ import operator
5
+
6
+ import torch
7
+ from .module import Module
8
+ from ..parameter import Parameter
9
+ from torch._jit_internal import _copy_to_script_wrapper
10
+
11
+ from typing import Any, Dict, Iterable, Iterator, Mapping, Optional, overload, Tuple, TypeVar, Union
12
+ from typing_extensions import Self
13
+
14
+ __all__ = ['Container', 'Sequential', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict']
15
+
16
+ T = TypeVar('T', bound=Module)
17
+
18
+
19
+ # Copied from torch.nn.modules.module, required for a custom __repr__ for ModuleList
20
# Copied from torch.nn.modules.module, required for a custom __repr__ for ModuleList
def _addindent(s_, numSpaces):
    """Indent every line of ``s_`` except the first by ``numSpaces`` spaces."""
    lines = s_.split('\n')
    # Single-line strings are returned untouched.
    if len(lines) == 1:
        return s_
    head, *rest = lines
    pad = numSpaces * ' '
    return head + '\n' + '\n'.join(pad + line for line in rest)
30
+
31
+
32
class Container(Module):
    """Deprecated container base class.

    All of its functionality now lives in :class:`~torch.nn.Module`;
    subclass that instead. Each keyword argument is registered as a child
    module under its keyword name.
    """

    def __init__(self, **kwargs: Any) -> None:
        super().__init__()
        # DeprecationWarning is ignored by default <sigh>
        # Fixed grammar in the user-facing message: "it's" -> "its".
        warnings.warn("nn.Container is deprecated. All of its functionality "
                      "is now implemented in nn.Module. Subclass that instead.")
        for key, value in kwargs.items():
            self.add_module(key, value)
41
+
42
+
43
class Sequential(Module):
    r"""A sequential container.

    Modules will be added to it in the order they are passed in the
    constructor. Alternatively, an ``OrderedDict`` of modules can be
    passed in. The ``forward()`` method of ``Sequential`` accepts any
    input and forwards it to the first module it contains. It then
    "chains" outputs to inputs sequentially for each subsequent module,
    finally returning the output of the last module.

    The value a ``Sequential`` provides over manually calling a sequence
    of modules is that it allows treating the whole container as a
    single module, such that performing a transformation on the
    ``Sequential`` applies to each of the modules it stores (which are
    each a registered submodule of the ``Sequential``).

    What's the difference between a ``Sequential`` and a
    :class:`torch.nn.ModuleList`? A ``ModuleList`` is exactly what it
    sounds like--a list for storing ``Module`` s! On the other hand,
    the layers in a ``Sequential`` are connected in a cascading way.

    Example::

        # Using Sequential to create a small model. When `model` is run,
        # input will first be passed to `Conv2d(1,20,5)`. The output of
        # `Conv2d(1,20,5)` will be used as the input to the first
        # `ReLU`; the output of the first `ReLU` will become the input
        # for `Conv2d(20,64,5)`. Finally, the output of
        # `Conv2d(20,64,5)` will be used as input to the second `ReLU`
        model = nn.Sequential(
                  nn.Conv2d(1,20,5),
                  nn.ReLU(),
                  nn.Conv2d(20,64,5),
                  nn.ReLU()
                )

        # Using Sequential with OrderedDict. This is functionally the
        # same as the above code
        model = nn.Sequential(OrderedDict([
                  ('conv1', nn.Conv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', nn.Conv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))
    """

    # Submodules are stored under string keys ("0", "1", ... for the
    # positional constructor) in insertion order.
    _modules: Dict[str, Module]  # type: ignore[assignment]

    @overload
    def __init__(self, *args: Module) -> None:
        ...

    @overload
    def __init__(self, arg: 'OrderedDict[str, Module]') -> None:
        ...

    def __init__(self, *args):
        super().__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            # Named-module form: keys become submodule names.
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            # Positional form: modules are named by their index.
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def _get_item_by_idx(self, iterator, idx) -> T:  # type: ignore[misc, type-var]
        """Get the idx-th item of the iterator."""
        size = len(self)
        idx = operator.index(idx)
        if not -size <= idx < size:
            raise IndexError(f'index {idx} is out of range')
        # Normalize negative indices, then advance the iterator to position idx.
        idx %= size
        return next(islice(iterator, idx, None))

    @_copy_to_script_wrapper
    def __getitem__(self, idx: Union[slice, int]) -> Union['Sequential', T]:
        if isinstance(idx, slice):
            # Slicing returns a new Sequential holding the selected modules.
            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
        else:
            return self._get_item_by_idx(self._modules.values(), idx)

    def __setitem__(self, idx: int, module: Module) -> None:
        # Resolve the positional index to the existing string key, then
        # reassign via setattr so Module bookkeeping stays consistent.
        key: str = self._get_item_by_idx(self._modules.keys(), idx)
        return setattr(self, key, module)

    def __delitem__(self, idx: Union[slice, int]) -> None:
        if isinstance(idx, slice):
            for key in list(self._modules.keys())[idx]:
                delattr(self, key)
        else:
            key = self._get_item_by_idx(self._modules.keys(), idx)
            delattr(self, key)
        # To preserve numbering
        str_indices = [str(i) for i in range(len(self._modules))]
        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))

    @_copy_to_script_wrapper
    def __len__(self) -> int:
        return len(self._modules)

    def __add__(self, other) -> 'Sequential':
        # Concatenation produces a new Sequential; operands are unchanged.
        if isinstance(other, Sequential):
            ret = Sequential()
            for layer in self:
                ret.append(layer)
            for layer in other:
                ret.append(layer)
            return ret
        else:
            raise ValueError('add operator supports only objects '
                             f'of Sequential class, but {str(type(other))} is given.')

    def pop(self, key: Union[int, slice]) -> Module:
        # Return the removed module(s), mirroring list.pop semantics.
        v = self[key]
        del self[key]
        return v

    def __iadd__(self, other) -> Self:
        # In-place concatenation: append other's modules after our own.
        if isinstance(other, Sequential):
            offset = len(self)
            for i, module in enumerate(other):
                self.add_module(str(i + offset), module)
            return self
        else:
            raise ValueError('add operator supports only objects '
                             f'of Sequential class, but {str(type(other))} is given.')

    def __mul__(self, other: int) -> 'Sequential':
        # Repetition registers the SAME module objects multiple times
        # (parameters are shared, not copied).
        if not isinstance(other, int):
            raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}")
        elif (other <= 0):
            raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}")
        else:
            combined = Sequential()
            offset = 0
            for _ in range(other):
                for module in self:
                    combined.add_module(str(offset), module)
                    offset += 1
            return combined

    def __rmul__(self, other: int) -> 'Sequential':
        return self.__mul__(other)

    def __imul__(self, other: int) -> Self:
        if not isinstance(other, int):
            raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}")
        elif (other <= 0):
            raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}")
        else:
            # Append (other - 1) additional references to the original modules.
            len_original = len(self)
            offset = len(self)
            for _ in range(other - 1):
                for i in range(len_original):
                    self.add_module(str(i + offset), self._modules[str(i)])
                offset += len_original
            return self

    @_copy_to_script_wrapper
    def __dir__(self):
        # Hide the numeric submodule keys from dir() output.
        keys = super().__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    @_copy_to_script_wrapper
    def __iter__(self) -> Iterator[Module]:
        return iter(self._modules.values())

    # NB: We can't really type check this function as the type of input
    # may change dynamically (as is tested in
    # TestScript.test_sequential_intermediary_types). Cannot annotate
    # with Any as TorchScript expects a more precise type
    def forward(self, input):
        # Chain each module's output into the next module's input.
        for module in self:
            input = module(input)
        return input

    def append(self, module: Module) -> 'Sequential':
        r"""Append a given module to the end.

        Args:
            module (nn.Module): module to append
        """
        self.add_module(str(len(self)), module)
        return self

    def insert(self, index: int, module: Module) -> 'Sequential':
        # Insert at ``index``, shifting subsequent numeric keys up by one.
        if not isinstance(module, Module):
            raise AssertionError(
                f'module should be of type: {Module}')
        n = len(self._modules)
        if not (-n <= index <= n):
            raise IndexError(
                f'Index out of range: {index}')
        if index < 0:
            index += n
        for i in range(n, index, -1):
            self._modules[str(i)] = self._modules[str(i - 1)]
        self._modules[str(index)] = module
        return self

    def extend(self, sequential) -> 'Sequential':
        # Append every layer of ``sequential`` (any iterable of modules).
        for layer in sequential:
            self.append(layer)
        return self
248
+
249
+
250
class ModuleList(Module):
    r"""Holds submodules in a list.

    :class:`~torch.nn.ModuleList` can be indexed like a regular Python list, but
    modules it contains are properly registered, and will be visible by all
    :class:`~torch.nn.Module` methods.

    Args:
        modules (iterable, optional): an iterable of modules to add

    Example::

        class MyModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

            def forward(self, x):
                # ModuleList can act as an iterable, or be indexed using ints
                for i, l in enumerate(self.linears):
                    x = self.linears[i // 2](x) + l(x)
                return x
    """

    # Keys are stringified consecutive indices "0" .. "len-1"; see append()
    # and the renumbering in __delitem__.
    _modules: Dict[str, Module]  # type: ignore[assignment]

    def __init__(self, modules: Optional[Iterable[Module]] = None) -> None:
        super().__init__()
        if modules is not None:
            # `+=` routes through __iadd__ -> extend(), registering each module.
            self += modules

    def _get_abs_string_index(self, idx):
        """Get the absolute index for the list of modules."""
        idx = operator.index(idx)
        if not (-len(self) <= idx < len(self)):
            raise IndexError(f'index {idx} is out of range')
        if idx < 0:
            idx += len(self)
        return str(idx)

    @_copy_to_script_wrapper
    def __getitem__(self, idx: Union[int, slice]) -> Union[Module, 'ModuleList']:
        # Slicing returns a new container of the same (sub)class.
        if isinstance(idx, slice):
            return self.__class__(list(self._modules.values())[idx])
        else:
            return self._modules[self._get_abs_string_index(idx)]

    def __setitem__(self, idx: int, module: Module) -> None:
        # _get_abs_string_index already returns a string key; str() is a no-op.
        idx = self._get_abs_string_index(idx)
        return setattr(self, str(idx), module)

    def __delitem__(self, idx: Union[int, slice]) -> None:
        if isinstance(idx, slice):
            for k in range(len(self._modules))[idx]:
                delattr(self, str(k))
        else:
            delattr(self, self._get_abs_string_index(idx))
        # To preserve numbering, self._modules is being reconstructed with modules after deletion
        str_indices = [str(i) for i in range(len(self._modules))]
        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))

    @_copy_to_script_wrapper
    def __len__(self) -> int:
        return len(self._modules)

    @_copy_to_script_wrapper
    def __iter__(self) -> Iterator[Module]:
        return iter(self._modules.values())

    def __iadd__(self, modules: Iterable[Module]) -> Self:
        return self.extend(modules)

    def __add__(self, other: Iterable[Module]) -> 'ModuleList':
        # Non-mutating concatenation: builds a fresh ModuleList.
        combined = ModuleList()
        for i, module in enumerate(chain(self, other)):
            combined.add_module(str(i), module)
        return combined

    def __repr__(self):
        """Return a custom repr for ModuleList that compresses repeated module representations."""
        list_of_reprs = [repr(item) for item in self]
        if len(list_of_reprs) == 0:
            return self._get_name() + '()'

        # Collapse runs of identical reprs into "(start-end): N x <repr>" lines.
        start_end_indices = [[0, 0]]
        repeated_blocks = [list_of_reprs[0]]
        for i, r in enumerate(list_of_reprs[1:], 1):
            if r == repeated_blocks[-1]:
                start_end_indices[-1][1] += 1
                continue

            start_end_indices.append([i, i])
            repeated_blocks.append(r)

        lines = []
        main_str = self._get_name() + '('
        for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
            local_repr = f"({start_id}): {b}"  # default repr

            if start_id != end_id:
                n = end_id - start_id + 1
                local_repr = f"({start_id}-{end_id}): {n} x {b}"

            local_repr = _addindent(local_repr, 2)
            lines.append(local_repr)

        main_str += '\n  ' + '\n  '.join(lines) + '\n'
        main_str += ')'
        return main_str

    @_copy_to_script_wrapper
    def __dir__(self):
        keys = super().__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def insert(self, index: int, module: Module) -> None:
        r"""Insert a given module before a given index in the list.

        Args:
            index (int): index to insert.
            module (nn.Module): module to insert
        """
        # NOTE(review): unlike Sequential.insert, there is no bounds/negative
        # index validation here — an out-of-range index silently creates
        # non-contiguous keys; confirm this is intended upstream behavior.
        for i in range(len(self._modules), index, -1):
            self._modules[str(i)] = self._modules[str(i - 1)]
        self._modules[str(index)] = module

    def append(self, module: Module) -> 'ModuleList':
        r"""Append a given module to the end of the list.

        Args:
            module (nn.Module): module to append
        """
        self.add_module(str(len(self)), module)
        return self

    def pop(self, key: Union[int, slice]) -> Module:
        """Remove and return the module(s) at ``key``."""
        v = self[key]
        del self[key]
        return v

    def extend(self, modules: Iterable[Module]) -> Self:
        r"""Append modules from a Python iterable to the end of the list.

        Args:
            modules (iterable): iterable of modules to append
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError("ModuleList.extend should be called with an "
                            "iterable, but got " + type(modules).__name__)
        offset = len(self)
        for i, module in enumerate(modules):
            self.add_module(str(offset + i), module)
        return self

    # remove forward altogether to fallback on Module's _forward_unimplemented
406
+
407
+
408
class ModuleDict(Module):
    r"""Holds submodules in a dictionary.

    :class:`~torch.nn.ModuleDict` can be indexed like a regular Python dictionary,
    but modules it contains are properly registered, and will be visible by all
    :class:`~torch.nn.Module` methods.

    :class:`~torch.nn.ModuleDict` is an **ordered** dictionary that respects

    * the order of insertion, and

    * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged
      ``OrderedDict``, ``dict`` (started from Python 3.6) or another
      :class:`~torch.nn.ModuleDict` (the argument to
      :meth:`~torch.nn.ModuleDict.update`).

    Note that :meth:`~torch.nn.ModuleDict.update` with other unordered mapping
    types (e.g., Python's plain ``dict`` before Python version 3.6) does not
    preserve the order of the merged mapping.

    Args:
        modules (iterable, optional): a mapping (dictionary) of (string: module)
            or an iterable of key-value pairs of type (string, module)

    Example::

        class MyModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.choices = nn.ModuleDict({
                    'conv': nn.Conv2d(10, 10, 3),
                    'pool': nn.MaxPool2d(3)
                })
                self.activations = nn.ModuleDict([
                    ['lrelu', nn.LeakyReLU()],
                    ['prelu', nn.PReLU()]
                ])

            def forward(self, x, choice, act):
                x = self.choices[choice](x)
                x = self.activations[act](x)
                return x
    """

    _modules: Dict[str, Module]  # type: ignore[assignment]

    def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
        super().__init__()
        if modules is not None:
            self.update(modules)

    @_copy_to_script_wrapper
    def __getitem__(self, key: str) -> Module:
        return self._modules[key]

    def __setitem__(self, key: str, module: Module) -> None:
        # add_module registers the module so it is visible to Module methods
        # (parameters(), state_dict(), etc.).
        self.add_module(key, module)

    def __delitem__(self, key: str) -> None:
        del self._modules[key]

    @_copy_to_script_wrapper
    def __len__(self) -> int:
        return len(self._modules)

    @_copy_to_script_wrapper
    def __iter__(self) -> Iterator[str]:
        # Iterates keys, matching plain-dict semantics.
        return iter(self._modules)

    @_copy_to_script_wrapper
    def __contains__(self, key: str) -> bool:
        return key in self._modules

    def clear(self) -> None:
        """Remove all items from the ModuleDict."""
        self._modules.clear()

    def pop(self, key: str) -> Module:
        r"""Remove key from the ModuleDict and return its module.

        Args:
            key (str): key to pop from the ModuleDict
        """
        v = self[key]
        del self[key]
        return v

    @_copy_to_script_wrapper
    def keys(self) -> Iterable[str]:
        r"""Return an iterable of the ModuleDict keys."""
        return self._modules.keys()

    @_copy_to_script_wrapper
    def items(self) -> Iterable[Tuple[str, Module]]:
        r"""Return an iterable of the ModuleDict key/value pairs."""
        return self._modules.items()

    @_copy_to_script_wrapper
    def values(self) -> Iterable[Module]:
        r"""Return an iterable of the ModuleDict values."""
        return self._modules.values()

    def update(self, modules: Mapping[str, Module]) -> None:
        r"""Update the :class:`~torch.nn.ModuleDict` with key-value pairs from a mapping, overwriting existing keys.

        .. note::
            If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or
            an iterable of key-value pairs, the order of new elements in it is preserved.

        Args:
            modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`,
                or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`)
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError("ModuleDict.update should be called with an "
                            "iterable of key/value pairs, but got " +
                            type(modules).__name__)

        if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)):
            for key, module in modules.items():
                self[key] = module
        else:
            # modules here can be a list with two items
            for j, m in enumerate(modules):
                if not isinstance(m, container_abcs.Iterable):
                    # NOTE(review): message reads "...; is<TypeName>" — missing a
                    # space after "is"; matches upstream text, flagging only.
                    raise TypeError("ModuleDict update sequence element "
                                    "#" + str(j) + " should be Iterable; is" +
                                    type(m).__name__)
                if not len(m) == 2:
                    raise ValueError("ModuleDict update sequence element "
                                     "#" + str(j) + " has length " + str(len(m)) +
                                     "; 2 is required")
                # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
                # that's too cumbersome to type correctly with overloads, so we add an ignore here
                self[m[0]] = m[1]  # type: ignore[assignment]

    # remove forward altogether to fallback on Module's _forward_unimplemented
545
+
546
+
547
class ParameterList(Module):
    r"""Holds parameters in a list.

    :class:`~torch.nn.ParameterList` can be used like a regular Python
    list, but Tensors that are :class:`~torch.nn.Parameter` are properly registered,
    and will be visible by all :class:`~torch.nn.Module` methods.

    Note that the constructor, assigning an element of the list, the
    :meth:`~torch.nn.ParameterDict.append` method and the :meth:`~torch.nn.ParameterDict.extend`
    method will convert any :class:`~torch.Tensor` into :class:`~torch.nn.Parameter`.

    Args:
        parameters (iterable, optional): an iterable of elements to add to the list.

    Example::

        class MyModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])

            def forward(self, x):
                # ParameterList can act as an iterable, or be indexed using ints
                for i, p in enumerate(self.params):
                    x = self.params[i // 2].mm(x) + p.mm(x)
                return x
    """

    def __init__(self, values: Optional[Iterable[Any]] = None) -> None:
        super().__init__()
        # Number of entries in the list part; entries are stored as attributes
        # named "0" .. "_size - 1" (see append/__setitem__).
        self._size = 0
        if values is not None:
            # `+=` routes through __iadd__ -> extend().
            self += values

    def _get_abs_string_index(self, idx):
        """Get the absolute index for the list of modules."""
        idx = operator.index(idx)
        if not (-len(self) <= idx < len(self)):
            raise IndexError(f'index {idx} is out of range')
        if idx < 0:
            idx += len(self)
        return str(idx)

    @overload
    def __getitem__(self, idx: int) -> Any:
        ...

    @overload
    def __getitem__(self: T, idx: slice) -> T:
        ...

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            # Build a fresh container of the same class from the sliced range.
            start, stop, step = idx.indices(len(self))
            out = self.__class__()
            for i in range(start, stop, step):
                out.append(self[i])
            return out
        else:
            idx = self._get_abs_string_index(idx)
            return getattr(self, str(idx))

    def __setitem__(self, idx: int, param: Any) -> None:
        # Note that all other function that add an entry to the list part of
        # the ParameterList end up here. So this is the only place where we need
        # to wrap things into Parameter if needed.
        # Objects added via setattr() are not in the list part and thus won't
        # call into this function.
        idx = self._get_abs_string_index(idx)
        if isinstance(param, torch.Tensor) and not isinstance(param, Parameter):
            param = Parameter(param)
        return setattr(self, str(idx), param)

    def __len__(self) -> int:
        return self._size

    def __iter__(self) -> Iterator[Any]:
        return iter(self[i] for i in range(len(self)))

    def __iadd__(self, parameters: Iterable[Any]) -> Self:
        return self.extend(parameters)

    def __dir__(self):
        keys = super().__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def append(self, value: Any) -> 'ParameterList':
        """Append a given value at the end of the list.

        Args:
            value (Any): value to append
        """
        # Grow _size first so the __setitem__ bounds check accepts the new index.
        new_idx = len(self)
        self._size += 1
        self[new_idx] = value
        return self

    def extend(self, values: Iterable[Any]) -> Self:
        """Append values from a Python iterable to the end of the list.

        Args:
            values (iterable): iterable of values to append
        """
        # Tensor is an iterable but we never want to unpack it here
        if not isinstance(values, container_abcs.Iterable) or isinstance(values, torch.Tensor):
            raise TypeError("ParameterList.extend should be called with an "
                            "iterable, but got " + type(values).__name__)
        for value in values:
            self.append(value)
        return self

    def extra_repr(self) -> str:
        # NOTE(review): prints `p.dtype` here while ParameterDict.extra_repr
        # uses `torch.typename(p)` — confirm the divergence is intentional.
        child_lines = []
        for k, p in enumerate(self):
            if isinstance(p, torch.Tensor):
                size_str = 'x'.join(str(size) for size in p.size())
                if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]:
                    device_str = f' ({p.device})'
                else:
                    device_str = ''
                parastr = '{} containing: [{} of size {}{}]'.format(
                    "Parameter" if isinstance(p, Parameter) else "Tensor",
                    p.dtype, size_str, device_str)
                child_lines.append('  (' + str(k) + '): ' + parastr)
            else:
                child_lines.append('  (' + str(k) + '): Object of type: ' + type(p).__name__)

        tmpstr = '\n'.join(child_lines)
        return tmpstr

    def __call__(self, *args, **kwargs):
        # A ParameterList is a container, not a computation; forbid call syntax.
        raise RuntimeError('ParameterList should not be called.')
680
+
681
+
682
class ParameterDict(Module):
    r"""Holds parameters in a dictionary.

    ParameterDict can be indexed like a regular Python dictionary, but Parameters it
    contains are properly registered, and will be visible by all Module methods.
    Other objects are treated as would be done by a regular Python dictionary

    :class:`~torch.nn.ParameterDict` is an **ordered** dictionary.
    :meth:`~torch.nn.ParameterDict.update` with other unordered mapping
    types (e.g., Python's plain ``dict``) does not preserve the order of the
    merged mapping. On the other hand, ``OrderedDict`` or another :class:`~torch.nn.ParameterDict`
    will preserve their ordering.

    Note that the constructor, assigning an element of the dictionary and the
    :meth:`~torch.nn.ParameterDict.update` method will convert any :class:`~torch.Tensor` into
    :class:`~torch.nn.Parameter`.

    Args:
        values (iterable, optional): a mapping (dictionary) of
            (string : Any) or an iterable of key-value pairs
            of type (string, Any)

    Example::

        class MyModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.params = nn.ParameterDict({
                    'left': nn.Parameter(torch.randn(5, 10)),
                    'right': nn.Parameter(torch.randn(5, 10))
                })

            def forward(self, x, choice):
                x = self.params[choice].mm(x)
                return x
    """

    def __init__(self, parameters: Any = None) -> None:
        super().__init__()
        # Insertion-ordered registry of keys; the values themselves live as
        # attributes on the module (so Parameters are registered normally).
        self._keys: Dict[str, None] = {}
        if parameters is not None:
            self.update(parameters)

    def _key_to_attr(self, key: str) -> str:
        if not isinstance(key, str):
            raise TypeError("Index given to ParameterDict cannot be used as a key as it is "
                            f"not a string (type is '{type(key).__name__}'). Open an issue on "
                            "github if you need non-string keys.")
        else:
            # Use the key as-is so that `.named_parameters()` returns the right thing
            return key

    def __getitem__(self, key: str) -> Any:
        attr = self._key_to_attr(key)
        return getattr(self, attr)

    def __setitem__(self, key: str, value: Any) -> None:
        # Note that all other function that add an entry to the dictionary part of
        # the ParameterDict end up here. So this is the only place where we need
        # to wrap things into Parameter if needed.
        # Objects added via setattr() are not in the dictionary part and thus won't
        # call into this function.
        self._keys[key] = None
        attr = self._key_to_attr(key)
        if isinstance(value, torch.Tensor) and not isinstance(value, Parameter):
            value = Parameter(value)
        setattr(self, attr, value)

    def __delitem__(self, key: str) -> None:
        del self._keys[key]
        attr = self._key_to_attr(key)
        delattr(self, attr)

    def __len__(self) -> int:
        return len(self._keys)

    def __iter__(self) -> Iterator[str]:
        return iter(self._keys)

    def __reversed__(self) -> Iterator[str]:
        return reversed(list(self._keys))

    def copy(self) -> 'ParameterDict':
        """Return a copy of this :class:`~torch.nn.ParameterDict` instance."""
        # We have to use an OrderedDict because the ParameterDict constructor
        # behaves differently on plain dict vs OrderedDict
        return ParameterDict(OrderedDict((k, self[k]) for k in self._keys))

    def __contains__(self, key: str) -> bool:
        return key in self._keys

    def setdefault(self, key: str, default: Optional[Any] = None) -> Any:
        """Set the default for a key in the Parameterdict.

        If key is in the ParameterDict, return its value.
        If not, insert `key` with a parameter `default` and return `default`.
        `default` defaults to `None`.

        Args:
            key (str): key to set default for
            default (Any): the parameter set to the key
        """
        if key not in self:
            self[key] = default
        return self[key]

    def clear(self) -> None:
        """Remove all items from the ParameterDict."""
        # Iterate a copy: deleting mutates self._keys.
        for k in self._keys.copy():
            del self[k]

    def pop(self, key: str) -> Any:
        r"""Remove key from the ParameterDict and return its parameter.

        Args:
            key (str): key to pop from the ParameterDict
        """
        v = self[key]
        del self[key]
        return v

    def popitem(self) -> Tuple[str, Any]:
        """Remove and return the last inserted `(key, parameter)` pair from the ParameterDict."""
        k, _ = self._keys.popitem()
        # We need the key in the _keys to be able to access/del
        self._keys[k] = None
        val = self[k]
        del self[k]
        return k, val

    def get(self, key: str, default: Optional[Any] = None) -> Any:
        r"""Return the parameter associated with key if present. Otherwise return default if provided, None if not.

        Args:
            key (str): key to get from the ParameterDict
            default (Parameter, optional): value to return if key not present
        """
        return self[key] if key in self else default

    def fromkeys(self, keys: Iterable[str], default: Optional[Any] = None) -> 'ParameterDict':
        r"""Return a new ParameterDict with the keys provided.

        Args:
            keys (iterable, string): keys to make the new ParameterDict from
            default (Parameter, optional): value to set for all keys
        """
        # NOTE(review): instance method that ignores self, unlike
        # dict.fromkeys (a classmethod) — matches upstream signature.
        return ParameterDict((k, default) for k in keys)

    def keys(self) -> Iterable[str]:
        r"""Return an iterable of the ParameterDict keys."""
        return self._keys.keys()

    def items(self) -> Iterable[Tuple[str, Any]]:
        r"""Return an iterable of the ParameterDict key/value pairs."""
        return ((k, self[k]) for k in self._keys)

    def values(self) -> Iterable[Any]:
        r"""Return an iterable of the ParameterDict values."""
        return (self[k] for k in self._keys)

    def update(self, parameters: Union[Mapping[str, Any], 'ParameterDict']) -> None:
        r"""Update the :class:`~torch.nn.ParameterDict` with key-value pairs from ``parameters``, overwriting existing keys.

        .. note::
            If :attr:`parameters` is an ``OrderedDict``, a :class:`~torch.nn.ParameterDict`, or
            an iterable of key-value pairs, the order of new elements in it is preserved.

        Args:
            parameters (iterable): a mapping (dictionary) from string to
                :class:`~torch.nn.Parameter`, or an iterable of
                key-value pairs of type (string, :class:`~torch.nn.Parameter`)
        """
        if not isinstance(parameters, container_abcs.Iterable):
            raise TypeError("ParametersDict.update should be called with an "
                            "iterable of key/value pairs, but got " +
                            type(parameters).__name__)

        if isinstance(parameters, (OrderedDict, ParameterDict)):
            for key, parameter in parameters.items():
                self[key] = parameter
        elif isinstance(parameters, container_abcs.Mapping):
            # Plain (unordered) mappings are merged in sorted-key order.
            for key, parameter in sorted(parameters.items()):
                self[key] = parameter
        else:
            for j, p in enumerate(parameters):
                if not isinstance(p, container_abcs.Iterable):
                    raise TypeError("ParameterDict update sequence element "
                                    "#" + str(j) + " should be Iterable; is" +
                                    type(p).__name__)
                if not len(p) == 2:
                    raise ValueError("ParameterDict update sequence element "
                                     "#" + str(j) + " has length " + str(len(p)) +
                                     "; 2 is required")
                # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment
                self[p[0]] = p[1]  # type: ignore[assignment]

    def extra_repr(self) -> str:
        child_lines = []
        for k, p in self.items():
            if isinstance(p, torch.Tensor):
                size_str = 'x'.join(str(size) for size in p.size())
                if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]:
                    device_str = f' ({p.device})'
                else:
                    device_str = ''
                parastr = '{} containing: [{} of size {}{}]'.format(
                    "Parameter" if isinstance(p, Parameter) else "Tensor",
                    torch.typename(p), size_str, device_str)
                child_lines.append('  (' + str(k) + '): ' + parastr)
            else:
                child_lines.append('  (' + str(k) + '): Object of type: ' + type(p).__name__)
        tmpstr = '\n'.join(child_lines)
        return tmpstr

    def __call__(self, input):
        # A ParameterDict is a container, not a computation; forbid call syntax.
        raise RuntimeError('ParameterDict should not be called.')

    def __or__(self, other: 'ParameterDict') -> 'ParameterDict':
        copy = self.copy()
        copy.update(other)
        return copy

    def __ror__(self, other: 'ParameterDict') -> 'ParameterDict':
        copy = other.copy()
        copy.update(self)
        return copy

    def __ior__(self, other: 'ParameterDict') -> Self:
        self.update(other)
        return self
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/dropout.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .module import Module
2
+ from .. import functional as F
3
+
4
+ from torch import Tensor
5
+
6
+ __all__ = ['Dropout', 'Dropout1d', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout']
7
+
8
class _DropoutNd(Module):
    """Shared base for the dropout modules: validates and stores ``p``/``inplace``."""

    __constants__ = ['p', 'inplace']
    p: float
    inplace: bool

    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
        super().__init__()
        # Keep the two-sided comparison form (not `not 0 <= p <= 1`) so a NaN
        # probability is not rejected differently from the original behavior.
        if p < 0 or p > 1:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p
        self.inplace = inplace

    def extra_repr(self) -> str:
        """Render the configuration shown inside the module's repr()."""
        return 'p={}, inplace={}'.format(self.p, self.inplace)
22
+
23
+
24
class Dropout(_DropoutNd):
    r"""During training, randomly zeroes some of the elements of the input tensor with probability :attr:`p`.

    The zeroed elements are chosen independently for each forward call and are sampled from a Bernoulli distribution.

    Each channel will be zeroed out independently on every forward call.

    This has proven to be an effective technique for regularization and
    preventing the co-adaptation of neurons as described in the paper
    `Improving neural networks by preventing co-adaptation of feature
    detectors`_ .

    Furthermore, the outputs are scaled by a factor of :math:`\frac{1}{1-p}` during
    training. This means that during evaluation the module simply computes an
    identity function.

    Args:
        p: probability of an element to be zeroed. Default: 0.5
        inplace: If set to ``True``, will do this operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`. Input can be of any shape
        - Output: :math:`(*)`. Output is of the same shape as input

    Examples::

        >>> m = nn.Dropout(p=0.2)
        >>> input = torch.randn(20, 16)
        >>> output = m(input)

    .. _Improving neural networks by preventing co-adaptation of feature
        detectors: https://arxiv.org/abs/1207.0580
    """

    def forward(self, input: Tensor) -> Tensor:
        # self.training gates the behavior: dropout is only applied in train mode.
        return F.dropout(input, self.p, self.training, self.inplace)
60
+
61
+
62
class Dropout1d(_DropoutNd):
    r"""Randomly zero out entire channels.

    A channel is a 1D feature map,
    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
    batched input is a 1D tensor :math:`\text{input}[i, j]`.

    Each channel will be zeroed out independently on every forward call with
    probability :attr:`p` using samples from a Bernoulli distribution.

    Usually the input comes from :class:`nn.Conv1d` modules.

    As described in the paper
    `Efficient Object Localization Using Convolutional Networks`_ ,
    if adjacent pixels within feature maps are strongly correlated
    (as is normally the case in early convolution layers) then i.i.d. dropout
    will not regularize the activations and will otherwise just result
    in an effective learning rate decrease.

    In this case, :func:`nn.Dropout1d` will help promote independence between
    feature maps and should be used instead.

    Args:
        p (float, optional): probability of an element to be zero-ed.
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(N, C, L)` or :math:`(C, L)`.
        - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input).

    Examples::

        >>> m = nn.Dropout1d(p=0.2)
        >>> input = torch.randn(20, 16, 32)
        >>> output = m(input)

    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        # Channel-wise (not element-wise) zeroing; only active when self.training.
        return F.dropout1d(input, self.p, self.training, self.inplace)
105
+
106
+
107
class Dropout2d(_DropoutNd):
    r"""Randomly zero out entire channels.

    A channel is a 2D feature map,
    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
    batched input is a 2D tensor :math:`\text{input}[i, j]`.

    Each channel will be zeroed out independently on every forward call with
    probability :attr:`p` using samples from a Bernoulli distribution.

    Usually the input comes from :class:`nn.Conv2d` modules.

    As described in the paper
    `Efficient Object Localization Using Convolutional Networks`_ ,
    if adjacent pixels within feature maps are strongly correlated
    (as is normally the case in early convolution layers) then i.i.d. dropout
    will not regularize the activations and will otherwise just result
    in an effective learning rate decrease.

    In this case, :func:`nn.Dropout2d` will help promote independence between
    feature maps and should be used instead.

    Args:
        p (float, optional): probability of an element to be zero-ed.
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    .. warning ::
        Due to historical reasons, this class will perform 1D channel-wise dropout
        for 3D inputs (as done by :class:`nn.Dropout1d`). Thus, it currently does NOT
        support inputs without a batch dimension of shape :math:`(C, H, W)`. This
        behavior will change in a future release to interpret 3D inputs as no-batch-dim
        inputs. To maintain the old behavior, switch to :class:`nn.Dropout1d`.

    Shape:
        - Input: :math:`(N, C, H, W)` or :math:`(N, C, L)`.
        - Output: :math:`(N, C, H, W)` or :math:`(N, C, L)` (same shape as input).

    Examples::

        >>> m = nn.Dropout2d(p=0.2)
        >>> input = torch.randn(20, 16, 32, 32)
        >>> output = m(input)

    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        # NOTE: for 3D inputs this performs 1D channel-wise dropout — see the
        # warning in the class docstring.
        return F.dropout2d(input, self.p, self.training, self.inplace)
157
+
158
+
159
class Dropout3d(_DropoutNd):
    r"""Randomly zero out entire channels.

    A channel is a 3D feature map,
    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
    batched input is a 3D tensor :math:`\text{input}[i, j]`.

    Each channel will be zeroed out independently on every forward call with
    probability :attr:`p` using samples from a Bernoulli distribution.

    Usually the input comes from :class:`nn.Conv3d` modules.

    As described in the paper
    `Efficient Object Localization Using Convolutional Networks`_ ,
    if adjacent pixels within feature maps are strongly correlated
    (as is normally the case in early convolution layers) then i.i.d. dropout
    will not regularize the activations and will otherwise just result
    in an effective learning rate decrease.

    In this case, :func:`nn.Dropout3d` will help promote independence between
    feature maps and should be used instead.

    Args:
        p (float, optional): probability of an element to be zeroed.
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`.
        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input).

    Examples::

        >>> m = nn.Dropout3d(p=0.2)
        >>> input = torch.randn(20, 16, 4, 32, 32)
        >>> output = m(input)

    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        # Channel-wise zeroing over 3D feature maps; only active when self.training.
        return F.dropout3d(input, self.p, self.training, self.inplace)
202
+
203
+
204
class AlphaDropout(_DropoutNd):
    r"""Applies Alpha Dropout over the input.

    Alpha Dropout preserves the self-normalizing property: for an input with
    zero mean and unit standard deviation, the output keeps the original
    mean and standard deviation. It goes hand-in-hand with the SELU
    activation function, which ensures outputs with zero mean and unit
    standard deviation.

    During training, elements of the input tensor are masked with
    probability *p* using samples from a Bernoulli distribution; the masked
    elements are re-randomized on every forward call, and the result is
    scaled and shifted so the zero-mean / unit-std property is maintained.
    During evaluation the module simply computes an identity function.

    More details can be found in the paper
    `Self-Normalizing Neural Networks`_ .

    Args:
        p (float): probability of an element to be dropped. Default: 0.5
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(*)`. Input can be of any shape
        - Output: :math:`(*)`. Output is of the same shape as input

    Examples::

        >>> m = nn.AlphaDropout(p=0.2)
        >>> input = torch.randn(20, 16)
        >>> output = m(input)

    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
    """

    def forward(self, input: Tensor) -> Tensor:
        # NOTE: unlike Dropout2d/3d, the functional alpha_dropout call here
        # does not receive `self.inplace`.
        return F.alpha_dropout(input, self.p, self.training)
244
+
245
+
246
class FeatureAlphaDropout(_DropoutNd):
    r"""Randomly masks out entire channels.

    A channel is a feature map: e.g. the :math:`j`-th channel of the
    :math:`i`-th sample of a batched input is the tensor
    :math:`\text{input}[i, j]`. Instead of zeroing activations as in regular
    Dropout, masked activations are set to the negative saturation value of
    the SELU activation function. More details can be found in the paper
    `Self-Normalizing Neural Networks`_ .

    Each channel is masked independently for each sample on every forward
    call with probability :attr:`p`, using samples from a Bernoulli
    distribution. The masked elements are re-randomized per forward call,
    and the output is scaled and shifted to maintain zero mean and unit
    variance.

    The input usually comes from :class:`nn.AlphaDropout` modules. As argued
    in `Efficient Object Localization Using Convolutional Networks`_ ,
    i.i.d. dropout does not regularize strongly-correlated adjacent
    activations (typical of early convolution layers) and merely lowers the
    effective learning rate; in that case channel-wise :func:`nn.AlphaDropout`
    helps promote independence between feature maps and should be used
    instead.

    Args:
        p (float, optional): probability of an element to be zeroed. Default: 0.5
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`.
        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input).

    Examples::

        >>> m = nn.FeatureAlphaDropout(p=0.2)
        >>> input = torch.randn(20, 16, 4, 32, 32)
        >>> output = m(input)

    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        # `self.inplace` is intentionally not forwarded (matches functional API use).
        return F.feature_alpha_dropout(input, self.p, self.training)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/flatten.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .module import Module
2
+
3
+ from typing import Tuple, Union
4
+ from torch import Tensor
5
+ from torch.types import _size
6
+
7
+ __all__ = ['Flatten', 'Unflatten']
8
+
9
class Flatten(Module):
    r"""
    Flattens a contiguous range of dims into a tensor.

    For use with :class:`~nn.Sequential`; see :meth:`torch.flatten` for details.

    Shape:
        - Input: :math:`(*, S_{\text{start}},..., S_{i}, ..., S_{\text{end}}, *)`,
          where :math:`S_{i}` is the size at dimension :math:`i` and :math:`*` means any
          number of dimensions including none.
        - Output: :math:`(*, \prod_{i=\text{start}}^{\text{end}} S_{i}, *)`.

    Args:
        start_dim: first dim to flatten (default = 1).
        end_dim: last dim to flatten (default = -1).

    Examples::
        >>> input = torch.randn(32, 1, 5, 5)
        >>> m = nn.Flatten()           # default: collapse dims 1..-1
        >>> m(input).size()
        torch.Size([32, 25])
        >>> m = nn.Flatten(0, 2)       # collapse dims 0..2
        >>> m(input).size()
        torch.Size([160, 5])
    """

    __constants__ = ['start_dim', 'end_dim']
    start_dim: int
    end_dim: int

    def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
        super().__init__()
        self.start_dim = start_dim
        self.end_dim = end_dim

    def forward(self, input: Tensor) -> Tensor:
        # Collapse the [start_dim, end_dim] range into a single dimension.
        return input.flatten(self.start_dim, self.end_dim)

    def extra_repr(self) -> str:
        return 'start_dim={}, end_dim={}'.format(self.start_dim, self.end_dim)
53
+
54
+
55
class Unflatten(Module):
    r"""
    Unflattens a tensor dim expanding it to a desired shape. For use with :class:`~nn.Sequential`.

    * :attr:`dim` specifies the dimension of the input tensor to be unflattened, and it can
      be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively.

    * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be
      a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input;  a `NamedShape`
      (tuple of `(name, size)` tuples) for `NamedTensor` input.

    Shape:
        - Input: :math:`(*, S_{\text{dim}}, *)`, where :math:`S_{\text{dim}}` is the size at
          dimension :attr:`dim` and :math:`*` means any number of dimensions including none.
        - Output: :math:`(*, U_1, ..., U_n, *)`, where :math:`U` = :attr:`unflattened_size` and
          :math:`\prod_{i=1}^n U_i = S_{\text{dim}}`.

    Args:
        dim (Union[int, str]): Dimension to be unflattened
        unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension

    Examples:
        >>> input = torch.randn(2, 50)
        >>> # With tuple of ints
        >>> m = nn.Sequential(
        >>>     nn.Linear(50, 50),
        >>>     nn.Unflatten(1, (2, 5, 5))
        >>> )
        >>> output = m(input)
        >>> output.size()
        torch.Size([2, 2, 5, 5])
        >>> # With torch.Size
        >>> m = nn.Sequential(
        >>>     nn.Linear(50, 50),
        >>>     nn.Unflatten(1, torch.Size([2, 5, 5]))
        >>> )
        >>> output = m(input)
        >>> output.size()
        torch.Size([2, 2, 5, 5])
        >>> # With namedshape (tuple of tuples)
        >>> input = torch.randn(2, 50, names=('N', 'features'))
        >>> unflatten = nn.Unflatten('features', (('C', 2), ('H', 5), ('W', 5)))
        >>> output = unflatten(input)
        >>> output.size()
        torch.Size([2, 2, 5, 5])
    """

    # BUG FIX: this alias used to be ``Tuple[Tuple[str, int]]``, which denotes
    # a *one*-element tuple; a named shape holds arbitrarily many
    # ``(name, size)`` pairs (see the docstring example), so the variadic
    # ``...`` form is required.
    NamedShape = Tuple[Tuple[str, int], ...]

    __constants__ = ['dim', 'unflattened_size']
    dim: Union[int, str]
    unflattened_size: Union[_size, NamedShape]

    def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]) -> None:
        super().__init__()

        # Validate that the size specification matches the kind of `dim`:
        # an int dim takes plain sizes, a str (named) dim takes (name, size) pairs.
        if isinstance(dim, int):
            self._require_tuple_int(unflattened_size)
        elif isinstance(dim, str):
            self._require_tuple_tuple(unflattened_size)
        else:
            raise TypeError("invalid argument type for dim parameter")

        self.dim = dim
        self.unflattened_size = unflattened_size

    def _require_tuple_tuple(self, input):
        """Raise TypeError unless `input` is a tuple of (name, size) tuples."""
        if (isinstance(input, tuple)):
            for idx, elem in enumerate(input):
                if not isinstance(elem, tuple):
                    raise TypeError("unflattened_size must be tuple of tuples, " +
                                    f"but found element of type {type(elem).__name__} at pos {idx}")
            return
        raise TypeError("unflattened_size must be a tuple of tuples, " +
                        f"but found type {type(input).__name__}")

    def _require_tuple_int(self, input):
        """Raise TypeError unless `input` is a tuple (or list) of ints."""
        if (isinstance(input, (tuple, list))):
            for idx, elem in enumerate(input):
                if not isinstance(elem, int):
                    raise TypeError("unflattened_size must be tuple of ints, " +
                                    f"but found element of type {type(elem).__name__} at pos {idx}")
            return
        raise TypeError(f"unflattened_size must be a tuple of ints, but found type {type(input).__name__}")

    def forward(self, input: Tensor) -> Tensor:
        # Expand dimension `dim` of `input` into `unflattened_size`.
        return input.unflatten(self.dim, self.unflattened_size)

    def extra_repr(self) -> str:
        return f'dim={self.dim}, unflattened_size={self.unflattened_size}'
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/normalization.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numbers
3
+ from torch.nn.parameter import Parameter
4
+ from .module import Module
5
+ from ._functions import CrossMapLRN2d as _cross_map_lrn2d
6
+ from .. import functional as F
7
+ from .. import init
8
+
9
+ from torch import Tensor, Size
10
+ from typing import Union, List, Tuple
11
+
12
+ __all__ = ['LocalResponseNorm', 'CrossMapLRN2d', 'LayerNorm', 'GroupNorm']
13
+
14
class LocalResponseNorm(Module):
    r"""Applies local response normalization over an input signal.

    The input signal consists of several planes, with channels occupying the
    second dimension; normalization is applied across channels:

    .. math::
        b_{c} = a_{c}\left(k + \frac{\alpha}{n}
        \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta}

    Args:
        size: amount of neighbouring channels used for normalization
        alpha: multiplicative factor. Default: 0.0001
        beta: exponent. Default: 0.75
        k: additive factor. Default: 1

    Shape:
        - Input: :math:`(N, C, *)`
        - Output: :math:`(N, C, *)` (same shape as input)

    Examples::

        >>> lrn = nn.LocalResponseNorm(2)
        >>> signal_2d = torch.randn(32, 5, 24, 24)
        >>> signal_4d = torch.randn(16, 5, 7, 7, 7, 7)
        >>> output_2d = lrn(signal_2d)
        >>> output_4d = lrn(signal_4d)

    """

    __constants__ = ['size', 'alpha', 'beta', 'k']
    size: int
    alpha: float
    beta: float
    k: float

    def __init__(self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.) -> None:
        super().__init__()
        self.size = size
        self.alpha = alpha
        self.beta = beta
        self.k = k

    def forward(self, input: Tensor) -> Tensor:
        # Delegate to the functional implementation.
        return F.local_response_norm(
            input, self.size, self.alpha, self.beta, self.k)

    def extra_repr(self):
        return f'{self.size}, alpha={self.alpha}, beta={self.beta}, k={self.k}'
63
+
64
+
65
class CrossMapLRN2d(Module):
    """Cross-map local response normalization, backed by the autograd
    function imported as ``_cross_map_lrn2d``."""

    size: int
    alpha: float
    beta: float
    k: float

    def __init__(self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1) -> None:
        super().__init__()
        self.size = size
        self.alpha = alpha
        self.beta = beta
        self.k = k

    def forward(self, input: Tensor) -> Tensor:
        # Invoke the custom autograd Function directly.
        return _cross_map_lrn2d.apply(
            input, self.size, self.alpha, self.beta, self.k)

    def extra_repr(self) -> str:
        return f'{self.size}, alpha={self.alpha}, beta={self.beta}, k={self.k}'
84
+
85
+
86
# Shape spec accepted by LayerNorm's `normalized_shape` argument below:
# a single int, a list of ints, or a torch.Size.
_shape_t = Union[int, List[int], Size]
87
+
88
+
89
class LayerNorm(Module):
    r"""Applies Layer Normalization over a mini-batch of inputs.

    Implements the operation described in the paper
    `Layer Normalization <https://arxiv.org/abs/1607.06450>`__:

    .. math::
        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are computed over the last `D`
    dimensions, where `D` is the dimension of :attr:`normalized_shape`; e.g.
    for a 2-dimensional ``normalized_shape`` of ``(3, 5)``, statistics are
    taken over the last two input dimensions (``input.mean((-2, -1))``).
    :math:`\gamma` and :math:`\beta` are learnable affine parameters of shape
    :attr:`normalized_shape` when :attr:`elementwise_affine` is ``True``.
    The standard-deviation uses the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    .. note::
        Unlike Batch Normalization and Instance Normalization, which apply a
        scalar scale and bias per whole channel/plane via the :attr:`affine`
        option, Layer Normalization applies per-element scale and bias via
        :attr:`elementwise_affine`.

    Statistics are computed from the input data in both training and
    evaluation modes.

    Args:
        normalized_shape (int or list or torch.Size): input shape from an expected input
            of size

            .. math::
                [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1]
                    \times \ldots \times \text{normalized\_shape}[-1]]

            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        elementwise_affine: a boolean value that when set to ``True``, this module
            has learnable per-element affine parameters initialized to ones (for weights)
            and zeros (for biases). Default: ``True``.
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`elementwise_affine` is ``True``). Default: ``True``.

    Attributes:
        weight: learnable weight of shape :math:`\text{normalized\_shape}`,
            initialized to 1 (present only when :attr:`elementwise_affine` is ``True``).
        bias: learnable bias of shape :math:`\text{normalized\_shape}`,
            initialized to 0 (present only when :attr:`elementwise_affine` is ``True``).

    Shape:
        - Input: :math:`(N, *)`
        - Output: :math:`(N, *)` (same shape as input)

    Examples::

        >>> # NLP Example
        >>> batch, sentence_length, embedding_dim = 20, 5, 10
        >>> embedding = torch.randn(batch, sentence_length, embedding_dim)
        >>> layer_norm = nn.LayerNorm(embedding_dim)
        >>> layer_norm(embedding)
        >>>
        >>> # Image Example
        >>> N, C, H, W = 20, 5, 10, 10
        >>> input = torch.randn(N, C, H, W)
        >>> # Normalize over the last three (channel and spatial) dimensions
        >>> layer_norm = nn.LayerNorm([C, H, W])
        >>> output = layer_norm(input)
    """

    __constants__ = ['normalized_shape', 'eps', 'elementwise_affine']
    normalized_shape: Tuple[int, ...]
    eps: float
    elementwise_affine: bool

    def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True,
                 bias: bool = True, device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        # A bare int means "normalize over one trailing dimension of that size".
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if not self.elementwise_affine:
            # No learnable parameters at all.
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        else:
            self.weight = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
            if bias:
                self.bias = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
            else:
                self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Reinitialize the affine parameters (weight -> 1, bias -> 0)."""
        if not self.elementwise_affine:
            return
        init.ones_(self.weight)
        if self.bias is not None:
            init.zeros_(self.bias)

    def forward(self, input: Tensor) -> Tensor:
        return F.layer_norm(
            input, self.normalized_shape, self.weight, self.bias, self.eps)

    def extra_repr(self) -> str:
        return (f'{self.normalized_shape}, eps={self.eps}, '
                f'elementwise_affine={self.elementwise_affine}')
207
+
208
+
209
class GroupNorm(Module):
    r"""Applies Group Normalization over a mini-batch of inputs.

    Implements the operation described in the paper
    `Group Normalization <https://arxiv.org/abs/1803.08494>`__:

    .. math::
        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The input channels are split into :attr:`num_groups` groups of
    ``num_channels / num_groups`` channels each, so :attr:`num_channels` must
    be divisible by :attr:`num_groups`. Mean and standard-deviation are
    computed separately over each group. :math:`\gamma` and :math:`\beta` are
    learnable per-channel affine parameter vectors of size
    :attr:`num_channels` when :attr:`affine` is ``True``. The
    standard-deviation uses the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    Statistics are computed from the input data in both training and
    evaluation modes.

    Args:
        num_groups (int): number of groups to separate the channels into
        num_channels (int): number of channels expected in input
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        affine: a boolean value that when set to ``True``, this module
            has learnable per-channel affine parameters initialized to ones (for weights)
            and zeros (for biases). Default: ``True``.

    Shape:
        - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}`
        - Output: :math:`(N, C, *)` (same shape as input)

    Examples::

        >>> input = torch.randn(20, 6, 10, 10)
        >>> m = nn.GroupNorm(3, 6)   # 3 groups of 2 channels
        >>> m = nn.GroupNorm(6, 6)   # equivalent with InstanceNorm
        >>> m = nn.GroupNorm(1, 6)   # equivalent with LayerNorm
        >>> output = m(input)
    """

    __constants__ = ['num_groups', 'num_channels', 'eps', 'affine']
    num_groups: int
    num_channels: int
    eps: float
    affine: bool

    def __init__(self, num_groups: int, num_channels: int, eps: float = 1e-5, affine: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        # Guard clause: groups must evenly partition the channels.
        if num_channels % num_groups != 0:
            raise ValueError('num_channels must be divisible by num_groups')

        self.num_groups = num_groups
        self.num_channels = num_channels
        self.eps = eps
        self.affine = affine
        if not self.affine:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        else:
            self.weight = Parameter(torch.empty(num_channels, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_channels, **factory_kwargs))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Reinitialize the affine parameters (weight -> 1, bias -> 0)."""
        if not self.affine:
            return
        init.ones_(self.weight)
        init.zeros_(self.bias)

    def forward(self, input: Tensor) -> Tensor:
        return F.group_norm(
            input, self.num_groups, self.weight, self.bias, self.eps)

    def extra_repr(self) -> str:
        return (f'{self.num_groups}, {self.num_channels}, eps={self.eps}, '
                f'affine={self.affine}')
293
+
294
+
295
+ # TODO: ContrastiveNorm2d
296
+ # TODO: DivisiveNorm2d
297
+ # TODO: SubtractiveNorm2d
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/padding.py ADDED
@@ -0,0 +1,801 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .module import Module
2
+ from .utils import _pair, _quadruple, _ntuple
3
+ from .. import functional as F
4
+
5
+ from torch import Tensor
6
+ from ..common_types import _size_2_t, _size_4_t, _size_6_t
7
+ from typing import Sequence, Tuple
8
+
9
+
10
+ # TODO: grad_output size asserts in THNN
11
+
12
+ __all__ = ['CircularPad1d', 'CircularPad2d', 'CircularPad3d', 'ConstantPad1d', 'ConstantPad2d',
13
+ 'ConstantPad3d', 'ReflectionPad1d', 'ReflectionPad2d', 'ReflectionPad3d',
14
+ 'ReplicationPad1d', 'ReplicationPad2d', 'ReplicationPad3d', 'ZeroPad1d', 'ZeroPad2d', 'ZeroPad3d']
15
+
16
+
17
+ class _CircularPadNd(Module):
18
+ __constants__ = ['padding']
19
+ padding: Sequence[int]
20
+
21
+ def _check_input_dim(self, input):
22
+ raise NotImplementedError
23
+
24
+ def forward(self, input: Tensor) -> Tensor:
25
+ self._check_input_dim(input)
26
+ return F.pad(input, self.padding, 'circular')
27
+
28
+ def extra_repr(self) -> str:
29
+ return f'{self.padding}'
30
+
31
+
32
class CircularPad1d(_CircularPadNd):
    r"""Pads the input tensor using circular padding of the input boundary.

    Values at the beginning of a dimension are used to pad the end, and
    values at the end pad the beginning; with negative padding, the ends of
    the tensor are trimmed instead.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 2-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`)

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
        >>> m = nn.CircularPad1d(2)
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
        >>> m(input)
        tensor([[[2., 3., 0., 1., 2., 3., 0., 1.],
                 [6., 7., 4., 5., 6., 7., 4., 5.]]])
        >>> # using different paddings for different sides
        >>> m = nn.CircularPad1d((3, 1))
        >>> m(input)
        tensor([[[1., 2., 3., 0., 1., 2., 3., 0.],
                 [5., 6., 7., 4., 5., 6., 7., 4.]]])
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        super().__init__()
        # A scalar expands to symmetric (left, right) padding.
        self.padding = _pair(padding)

    def _check_input_dim(self, input):
        ndim = input.dim()
        if ndim not in (2, 3):
            raise ValueError(
                f"expected 2D or 3D input (got {ndim}D input)"
            )
81
+
82
+
83
class CircularPad2d(_CircularPadNd):
    r"""Pads the input tensor using circular padding of the input boundary.

    Values at the beginning of a dimension are used to pad the end, and
    values at the end pad the beginning; with negative padding, the ends of
    the tensor are trimmed instead.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`,
            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`)

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> m = nn.CircularPad2d(2)
        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.CircularPad2d((1, 1, 2, 0))
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t) -> None:
        super().__init__()
        # A scalar expands to (left, right, top, bottom) padding.
        self.padding = _quadruple(padding)

    def _check_input_dim(self, input):
        ndim = input.dim()
        if ndim not in (3, 4):
            raise ValueError(
                f"expected 3D or 4D input (got {ndim}D input)"
            )
142
+
143
+
144
class CircularPad3d(_CircularPadNd):
    r"""Pads the input tensor using circular padding of the input boundary.

    Values at the beginning of a dimension are used to pad the end, and
    values at the end pad the beginning; with negative padding, the ends of
    the tensor are trimmed instead.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 6-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`)

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
          where

          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> m = nn.CircularPad3d(3)
        >>> input = torch.randn(16, 3, 8, 320, 480)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.CircularPad3d((3, 3, 6, 6, 1, 1))
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t) -> None:
        super().__init__()
        # A scalar expands to the same padding on all six boundaries.
        self.padding = _ntuple(6)(padding)

    def _check_input_dim(self, input):
        ndim = input.dim()
        if ndim not in (4, 5):
            raise ValueError(
                f"expected 4D or 5D input (got {ndim}D input)"
            )
193
+
194
+
195
+ class _ConstantPadNd(Module):
196
+ __constants__ = ['padding', 'value']
197
+ value: float
198
+ padding: Sequence[int]
199
+
200
+ def __init__(self, value: float) -> None:
201
+ super().__init__()
202
+ self.value = value
203
+
204
+ def forward(self, input: Tensor) -> Tensor:
205
+ return F.pad(input, self.padding, 'constant', self.value)
206
+
207
+ def extra_repr(self) -> str:
208
+ return f'padding={self.padding}, value={self.value}'
209
+
210
+
211
class ConstantPad1d(_ConstantPadNd):
    r"""Pads the input tensor boundaries with a constant value.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in both boundaries. If a 2-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`)

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> m = nn.ConstantPad1d(2, 3.5)
        >>> input = torch.randn(1, 2, 4)
        >>> output = m(input)           # two 3.5-valued columns on each side
        >>> # using different paddings for different sides
        >>> m = nn.ConstantPad1d((3, 1), 3.5)
        >>> output = m(input)           # three columns left, one right
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t, value: float):
        # The fill value lives on the base class; only the padding is local.
        super().__init__(value)
        self.padding = _pair(padding)
260
+
261
+
262
class ConstantPad2d(_ConstantPadNd):
    r"""Pads the input tensor boundaries with a constant value.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`,
            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`)

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> m = nn.ConstantPad2d(2, 3.5)
        >>> input = torch.randn(1, 2, 2)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.ConstantPad2d((3, 0, 2, 1), 3.5)
        >>> output = m(input)
    """

    # NOTE: the redundant ``__constants__ = ['padding', 'value']`` redefinition
    # was dropped — _ConstantPadNd already declares the identical list.
    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t, value: float) -> None:
        super().__init__(value)
        self.padding = _quadruple(padding)
311
+
312
+
313
class ConstantPad3d(_ConstantPadNd):
    r"""Constant-value padding over the last three dimensions of the input.

    An ``int`` pads every boundary equally; a 6-`tuple` is interpreted as
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
    :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
    :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: same layout with each spatial size grown by the two
          corresponding pad amounts.

    Examples::

        >>> m = nn.ConstantPad3d(3, 3.5)
        >>> output = m(torch.randn(16, 3, 10, 20, 30))
        >>> m = nn.ConstantPad3d((3, 3, 6, 6, 0, 1), 3.5)
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t, value: float) -> None:
        super().__init__(value)
        expanded = _ntuple(6)(padding)
        self.padding = expanded
351
+
352
+
353
+ class _ReflectionPadNd(Module):
354
+ __constants__ = ['padding']
355
+ padding: Sequence[int]
356
+
357
+ def forward(self, input: Tensor) -> Tensor:
358
+ return F.pad(input, self.padding, 'reflect')
359
+
360
+ def extra_repr(self) -> str:
361
+ return f'{self.padding}'
362
+
363
+
364
class ReflectionPad1d(_ReflectionPadNd):
    r"""Pads the last dimension by reflecting the input about its boundary.

    An ``int`` pads both sides equally; a 2-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: width grows to
          :math:`W_{in} + \text{padding\_left} + \text{padding\_right}`.

    Examples::

        >>> m = nn.ReflectionPad1d(2)
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
        >>> m(input)
        tensor([[[2., 1., 0., 1., 2., 3., 2., 1.],
                 [6., 5., 4., 5., 6., 7., 6., 5.]]])
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        super().__init__()
        expanded = _pair(padding)
        self.padding = expanded
403
+
404
+
405
class ReflectionPad2d(_ReflectionPadNd):
    r"""Pads the last two dimensions by reflecting the input about its boundary.

    An ``int`` pads every side equally; a 4-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
    :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
    Note that padding size should be less than the corresponding input dimension.
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: height/width each grow by their two pad amounts.

    Examples::

        >>> m = nn.ReflectionPad2d(2)
        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.ReflectionPad2d((1, 1, 2, 0))
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t) -> None:
        super().__init__()
        expanded = _quadruple(padding)
        self.padding = expanded
456
+
457
+
458
class ReflectionPad3d(_ReflectionPadNd):
    r"""Pads the last three dimensions by reflecting the input about its boundary.

    An ``int`` pads every side equally; a 6-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
    :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
    :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: each of D, H, W grows by its two pad amounts.

    Examples::

        >>> m = nn.ReflectionPad3d(1)
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 1, 2, 2, 2)
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t) -> None:
        super().__init__()
        expanded = _ntuple(6)(padding)
        self.padding = expanded
510
+
511
+
512
+ class _ReplicationPadNd(Module):
513
+ __constants__ = ['padding']
514
+ padding: Sequence[int]
515
+
516
+ def forward(self, input: Tensor) -> Tensor:
517
+ return F.pad(input, self.padding, 'replicate')
518
+
519
+ def extra_repr(self) -> str:
520
+ return f'{self.padding}'
521
+
522
+
523
class ReplicationPad1d(_ReplicationPadNd):
    r"""Pads the last dimension by repeating the edge values of the input.

    An ``int`` pads both sides equally; a 2-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: width grows to
          :math:`W_{in} + \text{padding\_left} + \text{padding\_right}`.

    Examples::

        >>> m = nn.ReplicationPad1d(2)
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
        >>> m(input)
        tensor([[[0., 0., 0., 1., 2., 3., 3., 3.],
                 [4., 4., 4., 5., 6., 7., 7., 7.]]])
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        super().__init__()
        expanded = _pair(padding)
        self.padding = expanded
562
+
563
+
564
class ReplicationPad2d(_ReplicationPadNd):
    r"""Pads the last two dimensions by repeating the edge values of the input.

    An ``int`` pads every side equally; a 4-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
    :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: height/width each grow by their two pad amounts.

    Examples::

        >>> m = nn.ReplicationPad2d(2)
        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.ReplicationPad2d((1, 1, 2, 0))
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t) -> None:
        super().__init__()
        expanded = _quadruple(padding)
        self.padding = expanded
614
+
615
+
616
class ReplicationPad3d(_ReplicationPadNd):
    r"""Pads the last three dimensions by repeating the edge values of the input.

    An ``int`` pads every side equally; a 6-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
    :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
    :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: each of D, H, W grows by its two pad amounts.

    Examples::

        >>> m = nn.ReplicationPad3d(3)
        >>> output = m(torch.randn(16, 3, 8, 320, 480))
        >>> m = nn.ReplicationPad3d((3, 3, 6, 6, 1, 1))
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t) -> None:
        super().__init__()
        expanded = _ntuple(6)(padding)
        self.padding = expanded
655
+
656
+
657
class ZeroPad1d(ConstantPad1d):
    r"""Pads the last dimension of the input with zeros.

    Thin wrapper over :class:`ConstantPad1d` with the fill value fixed at 0.
    An ``int`` pads both sides equally; a 2-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: width grows to
          :math:`W_{in} + \text{padding\_left} + \text{padding\_right}`.

    Examples::

        >>> m = nn.ZeroPad1d(2)
        >>> output = m(torch.randn(1, 2, 4))
        >>> m = nn.ZeroPad1d((3, 1))
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        super().__init__(padding, 0.)

    def extra_repr(self) -> str:
        # Hide the constant value (always 0) from the repr.
        return f'{self.padding}'
708
+
709
class ZeroPad2d(ConstantPad2d):
    r"""Pads the last two dimensions of the input with zeros.

    Thin wrapper over :class:`ConstantPad2d` with the fill value fixed at 0.
    An ``int`` pads every side equally; a 4-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
    :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: height/width each grow by their two pad amounts.

    Examples::

        >>> m = nn.ZeroPad2d(2)
        >>> output = m(torch.randn(1, 1, 3, 3))
        >>> m = nn.ZeroPad2d((1, 1, 2, 0))
    """

    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t) -> None:
        super().__init__(padding, 0.)

    def extra_repr(self) -> str:
        # Hide the constant value (always 0) from the repr.
        return f'{self.padding}'
761
+
762
class ZeroPad3d(ConstantPad3d):
    r"""Pads the last three dimensions of the input with zeros.

    Thin wrapper over :class:`ConstantPad3d` with the fill value fixed at 0.
    An ``int`` pads every side equally; a 6-`tuple` is
    (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
    :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
    :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: each of D, H, W grows by its two pad amounts.

    Examples::

        >>> m = nn.ZeroPad3d(3)
        >>> output = m(torch.randn(16, 3, 10, 20, 30))
        >>> m = nn.ZeroPad3d((3, 3, 6, 6, 0, 1))
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t) -> None:
        super().__init__(padding, 0.)

    def extra_repr(self) -> str:
        # Hide the constant value (always 0) from the repr.
        return f'{self.padding}'
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pixelshuffle.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .module import Module
2
+ from .. import functional as F
3
+
4
+ from torch import Tensor
5
+
6
+ __all__ = ['PixelShuffle', 'PixelUnshuffle']
7
+
8
class PixelShuffle(Module):
    r"""Rearrange channel blocks of a tensor into higher-resolution spatial maps.

    Maps :math:`(*, C \times r^2, H, W)` to :math:`(*, C, H \times r, W \times r)`
    for upscale factor :math:`r` — the sub-pixel convolution trick from
    `Real-Time Single Image and Video Super-Resolution Using an Efficient
    Sub-Pixel Convolutional Neural Network`_ (Shi et al., 2016).

    Args:
        upscale_factor (int): factor to increase spatial resolution by

    Examples::

        >>> pixel_shuffle = nn.PixelShuffle(3)
        >>> pixel_shuffle(torch.randn(1, 9, 4, 4)).size()
        torch.Size([1, 1, 12, 12])

    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
        https://arxiv.org/abs/1609.05158
    """

    __constants__ = ['upscale_factor']
    upscale_factor: int

    def __init__(self, upscale_factor: int) -> None:
        super().__init__()
        self.upscale_factor = upscale_factor

    def forward(self, input: Tensor) -> Tensor:
        return F.pixel_shuffle(input, self.upscale_factor)

    def extra_repr(self) -> str:
        return f'upscale_factor={self.upscale_factor}'
61
+
62
+
63
class PixelUnshuffle(Module):
    r"""Inverse of :class:`~torch.nn.PixelShuffle`.

    Maps :math:`(*, C, H \times r, W \times r)` to :math:`(*, C \times r^2, H, W)`
    for downscale factor :math:`r`. See `Real-Time Single Image and Video
    Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural
    Network`_ (Shi et al., 2016).

    Args:
        downscale_factor (int): factor to decrease spatial resolution by

    Examples::

        >>> pixel_unshuffle = nn.PixelUnshuffle(3)
        >>> pixel_unshuffle(torch.randn(1, 1, 12, 12)).size()
        torch.Size([1, 9, 4, 4])

    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
        https://arxiv.org/abs/1609.05158
    """

    __constants__ = ['downscale_factor']
    downscale_factor: int

    def __init__(self, downscale_factor: int) -> None:
        super().__init__()
        self.downscale_factor = downscale_factor

    def forward(self, input: Tensor) -> Tensor:
        return F.pixel_unshuffle(input, self.downscale_factor)

    def extra_repr(self) -> str:
        return f'downscale_factor={self.downscale_factor}'
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pooling.py ADDED
@@ -0,0 +1,1306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+
3
+ from torch import Tensor
4
+ from .module import Module
5
+ from .utils import _single, _pair, _triple
6
+ from .. import functional as F
7
+
8
+ from ..common_types import (_size_any_t, _size_1_t, _size_2_t, _size_3_t,
9
+ _ratio_3_t, _ratio_2_t, _size_any_opt_t, _size_2_opt_t, _size_3_opt_t)
10
+
11
+ __all__ = ['MaxPool1d', 'MaxPool2d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d',
12
+ 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'FractionalMaxPool2d', 'FractionalMaxPool3d', 'LPPool1d',
13
+ 'LPPool2d', 'LPPool3d', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d',
14
+ 'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d']
15
+
16
+ class _MaxPoolNd(Module):
17
+ __constants__ = ['kernel_size', 'stride', 'padding', 'dilation',
18
+ 'return_indices', 'ceil_mode']
19
+ return_indices: bool
20
+ ceil_mode: bool
21
+
22
+ def __init__(self, kernel_size: _size_any_t, stride: Optional[_size_any_t] = None,
23
+ padding: _size_any_t = 0, dilation: _size_any_t = 1,
24
+ return_indices: bool = False, ceil_mode: bool = False) -> None:
25
+ super().__init__()
26
+ self.kernel_size = kernel_size
27
+ self.stride = stride if (stride is not None) else kernel_size
28
+ self.padding = padding
29
+ self.dilation = dilation
30
+ self.return_indices = return_indices
31
+ self.ceil_mode = ceil_mode
32
+
33
+ def extra_repr(self) -> str:
34
+ return 'kernel_size={kernel_size}, stride={stride}, padding={padding}' \
35
+ ', dilation={dilation}, ceil_mode={ceil_mode}'.format(**self.__dict__)
36
+
37
+
38
class MaxPool1d(_MaxPoolNd):
    r"""Applies a 1D max pooling over an input signal composed of several input planes.

    For input :math:`(N, C, L)` each output element is the maximum over a
    sliding window:

    .. math::
        out(N_i, C_j, k) = \max_{m=0, \ldots, \text{kernel\_size} - 1}
                input(N_i, C_j, stride \times k + m)

    Non-zero :attr:`padding` implicitly pads both sides with negative
    infinity; :attr:`dilation` is the stride between elements within the
    window. See this `link`_ for a visualization of the parameters.

    Note:
        When ceil_mode=True, sliding windows are allowed to go off-bounds if
        they start within the left padding or the input. Sliding windows that
        would start in the right padded region are ignored.

    Args:
        kernel_size: The size of the sliding window, must be > 0.
        stride: The stride of the sliding window, must be > 0. Default value is :attr:`kernel_size`.
        padding: Implicit negative infinity padding to be added on both sides, must be >= 0 and <= kernel_size / 2.
        dilation: The stride between elements within a sliding window, must be > 0.
        return_indices: If ``True``, will return the argmax along with the max values.
            Useful for :class:`torch.nn.MaxUnpool1d` later
        ceil_mode: If ``True``, will use `ceil` instead of `floor` to compute the output shape.

    Shape:
        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where

          .. math::
              L_{out} = \left\lfloor \frac{L_{in} + 2 \times \text{padding} - \text{dilation}
                    \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor

    Examples::

        >>> m = nn.MaxPool1d(3, stride=2)
        >>> output = m(torch.randn(20, 16, 50))

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
    """

    kernel_size: _size_1_t
    stride: _size_1_t
    padding: _size_1_t
    dilation: _size_1_t

    def forward(self, input: Tensor):
        # All hyper-parameters are held by _MaxPoolNd; just dispatch.
        return F.max_pool1d(input, self.kernel_size, self.stride,
                            self.padding, self.dilation, ceil_mode=self.ceil_mode,
                            return_indices=self.return_indices)
+ return_indices=self.return_indices)
94
+
95
+
96
class MaxPool2d(_MaxPoolNd):
    r"""Applies a 2D max pooling over an input signal composed of several input planes.

    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
    output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)`
    can be precisely described as:

    .. math::
        \begin{aligned}
            out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\
                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
                                                   \text{stride[1]} \times w + n)
        \end{aligned}

    If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides
    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.

    Note:
        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
        or the input. Sliding windows that would start in the right padded region are ignored.

    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:

        - a single ``int`` -- in which case the same value is used for the height and width dimension
        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
          and the second `int` for the width dimension

    Args:
        kernel_size: the size of the window to take a max over
        stride: the stride of the window. Default value is :attr:`kernel_size`
        padding: Implicit negative infinity padding to be added on both sides
        dilation: a parameter that controls the stride of elements in the window
        return_indices: if ``True``, will return the max indices along with the outputs.
                        Useful for :class:`torch.nn.MaxUnpool2d` later
        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          .. math::
              H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding[0]} - \text{dilation[0]}
                    \times (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor

          .. math::
              W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding[1]} - \text{dilation[1]}
                    \times (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor

    Examples::

        >>> # pool of square window of size=3, stride=2
        >>> m = nn.MaxPool2d(3, stride=2)
        >>> # pool of non-square window
        >>> m = nn.MaxPool2d((3, 2), stride=(2, 1))
        >>> input = torch.randn(20, 16, 50, 32)
        >>> output = m(input)

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
    """

    # Pooling hyperparameters; each may be an int or an (H, W) pair.
    # Stored and validated by the shared _MaxPoolNd.__init__ (defined above
    # this view), which also holds return_indices and ceil_mode.
    kernel_size: _size_2_t
    stride: _size_2_t
    padding: _size_2_t
    dilation: _size_2_t

    def forward(self, input: Tensor):
        # No return annotation on purpose: returns a single Tensor, or a
        # (values, indices) pair when ``self.return_indices`` is True.
        return F.max_pool2d(input, self.kernel_size, self.stride,
                            self.padding, self.dilation, ceil_mode=self.ceil_mode,
                            return_indices=self.return_indices)
167
+
168
+
169
class MaxPool3d(_MaxPoolNd):
    r"""Applies a 3D max pooling over an input signal composed of several input planes.

    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)`
    can be precisely described as:

    .. math::
        \begin{aligned}
            \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, kD-1} \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\
                                              & \text{input}(N_i, C_j, \text{stride[0]} \times d + k,
                                                             \text{stride[1]} \times h + m, \text{stride[2]} \times w + n)
        \end{aligned}

    If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides
    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.

    Note:
        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
        or the input. Sliding windows that would start in the right padded region are ignored.

    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:

        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
          the second `int` for the height dimension and the third `int` for the width dimension

    Args:
        kernel_size: the size of the window to take a max over
        stride: the stride of the window. Default value is :attr:`kernel_size`
        padding: Implicit negative infinity padding to be added on all three sides
        dilation: a parameter that controls the stride of elements in the window
        return_indices: if ``True``, will return the max indices along with the outputs.
                        Useful for :class:`torch.nn.MaxUnpool3d` later
        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where

          .. math::
              D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times
                (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor

          .. math::
              H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times
                (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor

          .. math::
              W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times
                (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor

    Examples::

        >>> # pool of square window of size=3, stride=2
        >>> m = nn.MaxPool3d(3, stride=2)
        >>> # pool of non-square window
        >>> m = nn.MaxPool3d((3, 2, 2), stride=(2, 1, 2))
        >>> input = torch.randn(20, 16, 50, 44, 31)
        >>> output = m(input)

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
    """  # noqa: E501

    # Pooling hyperparameters; each may be an int or a (D, H, W) triple.
    # Stored by the shared _MaxPoolNd.__init__ (defined above this view),
    # which also holds return_indices and ceil_mode.
    kernel_size: _size_3_t
    stride: _size_3_t
    padding: _size_3_t
    dilation: _size_3_t

    def forward(self, input: Tensor):
        # No return annotation on purpose: returns a single Tensor, or a
        # (values, indices) pair when ``self.return_indices`` is True.
        return F.max_pool3d(input, self.kernel_size, self.stride,
                            self.padding, self.dilation, ceil_mode=self.ceil_mode,
                            return_indices=self.return_indices)
244
+
245
+
246
+ class _MaxUnpoolNd(Module):
247
+
248
+ def extra_repr(self) -> str:
249
+ return f'kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}'
250
+
251
+
252
class MaxUnpool1d(_MaxUnpoolNd):
    r"""Computes a partial inverse of :class:`MaxPool1d`.

    :class:`MaxPool1d` is not fully invertible, since the non-maximal values are lost.

    :class:`MaxUnpool1d` takes in as input the output of :class:`MaxPool1d`
    including the indices of the maximal values and computes a partial inverse
    in which all non-maximal values are set to zero.

    Note:
        This operation may behave nondeterministically when the input indices has repeat values.
        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.

    .. note:: :class:`MaxPool1d` can map several input sizes to the same output
              sizes. Hence, the inversion process can get ambiguous.
              To accommodate this, you can provide the needed output size
              as an additional argument :attr:`output_size` in the forward call.
              See the Inputs and Example below.

    Args:
        kernel_size (int or tuple): Size of the max pooling window.
        stride (int or tuple): Stride of the max pooling window.
            It is set to :attr:`kernel_size` by default.
        padding (int or tuple): Padding that was added to the input

    Inputs:
        - `input`: the input Tensor to invert
        - `indices`: the indices given out by :class:`~torch.nn.MaxPool1d`
        - `output_size` (optional): the targeted output size

    Shape:
        - Input: :math:`(N, C, H_{in})` or :math:`(C, H_{in})`.
        - Output: :math:`(N, C, H_{out})` or :math:`(C, H_{out})`, where

          .. math::
              H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0]

          or as given by :attr:`output_size` in the call operator

    Example::

        >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?")
        >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True)
        >>> unpool = nn.MaxUnpool1d(2, stride=2)
        >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8]]])
        >>> output, indices = pool(input)
        >>> unpool(output, indices)
        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.]]])

        >>> # Example showcasing the use of output_size
        >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8, 9]]])
        >>> output, indices = pool(input)
        >>> unpool(output, indices, output_size=input.size())
        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.,  0.]]])

        >>> unpool(output, indices)
        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.]]])
    """

    # Hyperparameters, normalized to one-element tuples by __init__.
    kernel_size: _size_1_t
    stride: _size_1_t
    padding: _size_1_t

    def __init__(self, kernel_size: _size_1_t, stride: Optional[_size_1_t] = None, padding: _size_1_t = 0) -> None:
        super().__init__()
        # _single() normalizes an int or 1-tuple to a canonical 1-tuple.
        self.kernel_size = _single(kernel_size)
        # Mirror the pooling convention: stride defaults to kernel_size.
        self.stride = _single(stride if (stride is not None) else kernel_size)
        self.padding = _single(padding)

    def forward(self, input: Tensor, indices: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
        return F.max_unpool1d(input, indices, self.kernel_size, self.stride,
                              self.padding, output_size)
324
+
325
+
326
class MaxUnpool2d(_MaxUnpoolNd):
    r"""Computes a partial inverse of :class:`MaxPool2d`.

    :class:`MaxPool2d` is not fully invertible, since the non-maximal values are lost.

    :class:`MaxUnpool2d` takes in as input the output of :class:`MaxPool2d`
    including the indices of the maximal values and computes a partial inverse
    in which all non-maximal values are set to zero.

    Note:
        This operation may behave nondeterministically when the input indices has repeat values.
        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.

    .. note:: :class:`MaxPool2d` can map several input sizes to the same output
              sizes. Hence, the inversion process can get ambiguous.
              To accommodate this, you can provide the needed output size
              as an additional argument :attr:`output_size` in the forward call.
              See the Inputs and Example below.

    Args:
        kernel_size (int or tuple): Size of the max pooling window.
        stride (int or tuple): Stride of the max pooling window.
            It is set to :attr:`kernel_size` by default.
        padding (int or tuple): Padding that was added to the input

    Inputs:
        - `input`: the input Tensor to invert
        - `indices`: the indices given out by :class:`~torch.nn.MaxPool2d`
        - `output_size` (optional): the targeted output size

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          .. math::
            H_{out} = (H_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]}

          .. math::
            W_{out} = (W_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]}

          or as given by :attr:`output_size` in the call operator

    Example::

        >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True)
        >>> unpool = nn.MaxUnpool2d(2, stride=2)
        >>> input = torch.tensor([[[[ 1.,  2.,  3.,  4.],
                                    [ 5.,  6.,  7.,  8.],
                                    [ 9., 10., 11., 12.],
                                    [13., 14., 15., 16.]]]])
        >>> output, indices = pool(input)
        >>> unpool(output, indices)
        tensor([[[[  0.,   0.,   0.,   0.],
                  [  0.,   6.,   0.,   8.],
                  [  0.,   0.,   0.,   0.],
                  [  0.,  14.,   0.,  16.]]]])
        >>> # Now using output_size to resolve an ambiguous size for the inverse
        >>> input = torch.tensor([[[[ 1.,  2.,  3.,  4.,  5.],
                                    [ 6.,  7.,  8.,  9., 10.],
                                    [11., 12., 13., 14., 15.],
                                    [16., 17., 18., 19., 20.]]]])
        >>> output, indices = pool(input)
        >>> # This call will not work without specifying output_size
        >>> unpool(output, indices, output_size=input.size())
        tensor([[[[ 0.,  0.,  0.,  0.,  0.],
                  [ 0.,  7.,  0.,  9.,  0.],
                  [ 0.,  0.,  0.,  0.,  0.],
                  [ 0., 17.,  0., 19.,  0.]]]])


    """

    # Hyperparameters, normalized to 2-tuples by __init__.
    kernel_size: _size_2_t
    stride: _size_2_t
    padding: _size_2_t

    def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, padding: _size_2_t = 0) -> None:
        super().__init__()
        # _pair() normalizes an int or 2-tuple to a canonical 2-tuple.
        self.kernel_size = _pair(kernel_size)
        # Mirror the pooling convention: stride defaults to kernel_size.
        self.stride = _pair(stride if (stride is not None) else kernel_size)
        self.padding = _pair(padding)

    def forward(self, input: Tensor, indices: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
        return F.max_unpool2d(input, indices, self.kernel_size, self.stride,
                              self.padding, output_size)
411
+
412
+
413
class MaxUnpool3d(_MaxUnpoolNd):
    r"""Computes a partial inverse of :class:`MaxPool3d`.

    :class:`MaxPool3d` is not fully invertible, since the non-maximal values are lost.
    :class:`MaxUnpool3d` takes in as input the output of :class:`MaxPool3d`
    including the indices of the maximal values and computes a partial inverse
    in which all non-maximal values are set to zero.

    Note:
        This operation may behave nondeterministically when the input indices has repeat values.
        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.

    .. note:: :class:`MaxPool3d` can map several input sizes to the same output
              sizes. Hence, the inversion process can get ambiguous.
              To accommodate this, you can provide the needed output size
              as an additional argument :attr:`output_size` in the forward call.
              See the Inputs section below.

    Args:
        kernel_size (int or tuple): Size of the max pooling window.
        stride (int or tuple): Stride of the max pooling window.
            It is set to :attr:`kernel_size` by default.
        padding (int or tuple): Padding that was added to the input

    Inputs:
        - `input`: the input Tensor to invert
        - `indices`: the indices given out by :class:`~torch.nn.MaxPool3d`
        - `output_size` (optional): the targeted output size

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where

          .. math::
              D_{out} = (D_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]}

          .. math::
              H_{out} = (H_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]}

          .. math::
              W_{out} = (W_{in} - 1) \times \text{stride[2]} - 2 \times \text{padding[2]} + \text{kernel\_size[2]}

          or as given by :attr:`output_size` in the call operator

    Example::

        >>> # pool of square window of size=3, stride=2
        >>> pool = nn.MaxPool3d(3, stride=2, return_indices=True)
        >>> unpool = nn.MaxUnpool3d(3, stride=2)
        >>> output, indices = pool(torch.randn(20, 16, 51, 33, 15))
        >>> unpooled_output = unpool(output, indices)
        >>> unpooled_output.size()
        torch.Size([20, 16, 51, 33, 15])
    """

    # Hyperparameters, normalized to 3-tuples by __init__.
    kernel_size: _size_3_t
    stride: _size_3_t
    padding: _size_3_t

    def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, padding: _size_3_t = 0) -> None:
        super().__init__()
        # _triple() normalizes an int or 3-tuple to a canonical 3-tuple.
        self.kernel_size = _triple(kernel_size)
        # Mirror the pooling convention: stride defaults to kernel_size.
        self.stride = _triple(stride if (stride is not None) else kernel_size)
        self.padding = _triple(padding)

    def forward(self, input: Tensor, indices: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
        return F.max_unpool3d(input, indices, self.kernel_size, self.stride,
                              self.padding, output_size)
481
+
482
+
483
+ class _AvgPoolNd(Module):
484
+ __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad']
485
+
486
+ def extra_repr(self) -> str:
487
+ return f'kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}'
488
+
489
+
490
class AvgPool1d(_AvgPoolNd):
    r"""Applies a 1D average pooling over an input signal composed of several input planes.

    In the simplest case, the output value of the layer with input size :math:`(N, C, L)`,
    output :math:`(N, C, L_{out})` and :attr:`kernel_size` :math:`k`
    can be precisely described as:

    .. math::

        \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k-1}
                               \text{input}(N_i, C_j, \text{stride} \times l + m)

    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
    for :attr:`padding` number of points.

    Note:
        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
        or the input. Sliding windows that would start in the right padded region are ignored.

    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can each be
    an ``int`` or a one-element tuple.

    Args:
        kernel_size: the size of the window
        stride: the stride of the window. Default value is :attr:`kernel_size`
        padding: implicit zero padding to be added on both sides
        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
        count_include_pad: when True, will include the zero-padding in the averaging calculation

    Shape:
        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where

          .. math::
              L_{out} = \left\lfloor \frac{L_{in} +
              2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor

          Per the note above, if ``ceil_mode`` is True and :math:`(L_{out} - 1) \times \text{stride} \geq L_{in}
          + \text{padding}`, we skip the last window as it would start in the right padded region, resulting in
          :math:`L_{out}` being reduced by one.

    Examples::

        >>> # pool with window of size=3, stride=2
        >>> m = nn.AvgPool1d(3, stride=2)
        >>> m(torch.tensor([[[1., 2, 3, 4, 5, 6, 7]]]))
        tensor([[[2., 4., 6.]]])
    """

    # Hyperparameters, normalized to one-element tuples by __init__.
    kernel_size: _size_1_t
    stride: _size_1_t
    padding: _size_1_t
    ceil_mode: bool
    count_include_pad: bool

    # Fix: `stride` accepts None (meaning "use kernel_size"), so its annotation
    # must be Optional[_size_1_t] — consistent with AvgPool2d/AvgPool3d and the
    # MaxUnpool modules in this file. Runtime behavior is unchanged.
    def __init__(self, kernel_size: _size_1_t, stride: Optional[_size_1_t] = None, padding: _size_1_t = 0,
                 ceil_mode: bool = False, count_include_pad: bool = True) -> None:
        super().__init__()
        # _single() normalizes an int or 1-tuple to a canonical 1-tuple.
        self.kernel_size = _single(kernel_size)
        # Stride defaults to the window size, i.e. non-overlapping windows.
        self.stride = _single(stride if stride is not None else kernel_size)
        self.padding = _single(padding)
        self.ceil_mode = ceil_mode
        self.count_include_pad = count_include_pad

    def forward(self, input: Tensor) -> Tensor:
        return F.avg_pool1d(
            input, self.kernel_size, self.stride, self.padding, self.ceil_mode,
            self.count_include_pad)
558
+
559
+
560
class AvgPool2d(_AvgPoolNd):
    r"""Applies a 2D average pooling over an input signal composed of several input planes.

    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
    output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)`
    can be precisely described as:

    .. math::

        out(N_i, C_j, h, w)  = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)

    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
    for :attr:`padding` number of points.

    Note:
        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
        or the input. Sliding windows that would start in the right padded region are ignored.

    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be:

        - a single ``int`` -- in which case the same value is used for the height and width dimension
        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
          and the second `int` for the width dimension

    Args:
        kernel_size: the size of the window
        stride: the stride of the window. Default value is :attr:`kernel_size`
        padding: implicit zero padding to be added on both sides
        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
        count_include_pad: when True, will include the zero-padding in the averaging calculation
        divisor_override: if specified, it will be used as divisor, otherwise size of the pooling region will be used.


    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          .. math::
              H_{out} = \left\lfloor\frac{H_{in}  + 2 \times \text{padding}[0] -
                \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor

          .. math::
              W_{out} = \left\lfloor\frac{W_{in}  + 2 \times \text{padding}[1] -
                \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor

          Per the note above, if ``ceil_mode`` is True and :math:`(H_{out} - 1)\times \text{stride}[0]\geq H_{in}
          + \text{padding}[0]`, we skip the last window as it would start in the bottom padded region,
          resulting in :math:`H_{out}` being reduced by one.

          The same applies for :math:`W_{out}`.

    Examples::

        >>> # pool of square window of size=3, stride=2
        >>> m = nn.AvgPool2d(3, stride=2)
        >>> # pool of non-square window
        >>> m = nn.AvgPool2d((3, 2), stride=(2, 1))
        >>> input = torch.randn(20, 16, 50, 32)
        >>> output = m(input)
    """

    # Redeclared (vs. _AvgPoolNd) to add divisor_override for TorchScript.
    __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad', 'divisor_override']

    kernel_size: _size_2_t
    stride: _size_2_t
    padding: _size_2_t
    ceil_mode: bool
    count_include_pad: bool

    def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, padding: _size_2_t = 0,
                 ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None:
        super().__init__()
        # Note: unlike the unpool modules, values are stored as given (int or
        # tuple) without _pair() normalization; F.avg_pool2d accepts both.
        self.kernel_size = kernel_size
        # Stride defaults to the window size, i.e. non-overlapping windows.
        self.stride = stride if (stride is not None) else kernel_size
        self.padding = padding
        self.ceil_mode = ceil_mode
        self.count_include_pad = count_include_pad
        self.divisor_override = divisor_override

    def forward(self, input: Tensor) -> Tensor:
        return F.avg_pool2d(input, self.kernel_size, self.stride,
                            self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override)
643
+
644
+
645
class AvgPool3d(_AvgPoolNd):
    r"""Applies a 3D average pooling over an input signal composed of several input planes.

    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)`
    can be precisely described as:

    .. math::
        \begin{aligned}
            \text{out}(N_i, C_j, d, h, w) ={} & \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} \\
                                              & \frac{\text{input}(N_i, C_j, \text{stride}[0] \times d + k,
                                                      \text{stride}[1] \times h + m, \text{stride}[2] \times w + n)}
                                                     {kD \times kH \times kW}
        \end{aligned}

    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on all three sides
    for :attr:`padding` number of points.

    Note:
        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
        or the input. Sliding windows that would start in the right padded region are ignored.

    The parameters :attr:`kernel_size`, :attr:`stride` can either be:

        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
          the second `int` for the height dimension and the third `int` for the width dimension

    Args:
        kernel_size: the size of the window
        stride: the stride of the window. Default value is :attr:`kernel_size`
        padding: implicit zero padding to be added on all three sides
        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
        count_include_pad: when True, will include the zero-padding in the averaging calculation
        divisor_override: if specified, it will be used as divisor, otherwise :attr:`kernel_size` will be used

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
          :math:`(C, D_{out}, H_{out}, W_{out})`, where

          .. math::
              D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] -
                    \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor

          .. math::
              H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] -
                    \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor

          .. math::
              W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] -
                    \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor

          Per the note above, if ``ceil_mode`` is True and :math:`(D_{out} - 1)\times \text{stride}[0]\geq D_{in}
          + \text{padding}[0]`, we skip the last window as it would start in the padded region,
          resulting in :math:`D_{out}` being reduced by one.

          The same applies for :math:`W_{out}` and :math:`H_{out}`.

    Examples::

        >>> # pool of square window of size=3, stride=2
        >>> m = nn.AvgPool3d(3, stride=2)
        >>> # pool of non-square window
        >>> m = nn.AvgPool3d((3, 2, 2), stride=(2, 1, 2))
        >>> input = torch.randn(20, 16, 50, 44, 31)
        >>> output = m(input)
    """

    # Redeclared (vs. _AvgPoolNd) to add divisor_override for TorchScript.
    __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad', 'divisor_override']

    kernel_size: _size_3_t
    stride: _size_3_t
    padding: _size_3_t
    ceil_mode: bool
    count_include_pad: bool

    def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, padding: _size_3_t = 0,
                 ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None:
        super().__init__()
        # Values are stored as given (int or tuple) without _triple()
        # normalization; F.avg_pool3d accepts both forms.
        self.kernel_size = kernel_size
        # Stride defaults to the window size, i.e. non-overlapping windows.
        self.stride = stride if (stride is not None) else kernel_size
        self.padding = padding
        self.ceil_mode = ceil_mode
        self.count_include_pad = count_include_pad
        self.divisor_override = divisor_override

    def forward(self, input: Tensor) -> Tensor:
        return F.avg_pool3d(input, self.kernel_size, self.stride,
                            self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override)

    def __setstate__(self, d):
        # Backward compatibility: presumably fills in attributes missing from
        # modules serialized by older versions that lacked these fields.
        super().__setstate__(d)
        self.__dict__.setdefault('padding', 0)
        self.__dict__.setdefault('ceil_mode', False)
        self.__dict__.setdefault('count_include_pad', True)
741
+
742
+
743
class FractionalMaxPool2d(Module):
    r"""Applies a 2D fractional max pooling over an input signal composed of several input planes.

    Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham

    The max-pooling operation is applied in :math:`kH \times kW` regions by a stochastic
    step size determined by the target output size.
    The number of output features is equal to the number of input planes.

    .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined.

    Args:
        kernel_size: the size of the window to take a max over.
                     Can be a single number k (for a square kernel of k x k) or a tuple `(kh, kw)`
        output_size: the target output size of the image of the form `oH x oW`.
                     Can be a tuple `(oH, oW)` or a single number oH for a square image `oH x oH`.
                     Note that we must have :math:`kH + oH - 1 <= H_{in}` and :math:`kW + oW - 1 <= W_{in}`
        output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given.
                      This has to be a number or tuple in the range (0, 1).
                      Note that we must have :math:`kH + (output\_ratio\_H * H_{in}) - 1 <= H_{in}`
                      and :math:`kW + (output\_ratio\_W * W_{in}) - 1 <= W_{in}`
        return_indices: if ``True``, will return the indices along with the outputs.
                        Useful to pass to :meth:`nn.MaxUnpool2d`. Default: ``False``

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
          :math:`(H_{out}, W_{out})=\text{output\_size}` or
          :math:`(H_{out}, W_{out})=\text{output\_ratio} \times (H_{in}, W_{in})`.

    Examples:
        >>> # pool of square window of size=3, and target output size 13x12
        >>> m = nn.FractionalMaxPool2d(3, output_size=(13, 12))
        >>> # pool of square window and target output size being half of input image size
        >>> m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5))
        >>> input = torch.randn(20, 16, 50, 32)
        >>> output = m(input)

    .. _Fractional MaxPooling:
        https://arxiv.org/abs/1412.6071
    """

    __constants__ = ['kernel_size', 'return_indices', 'output_size',
                     'output_ratio']

    kernel_size: _size_2_t
    return_indices: bool
    output_size: _size_2_t
    output_ratio: _ratio_2_t

    def __init__(self, kernel_size: _size_2_t, output_size: Optional[_size_2_t] = None,
                 output_ratio: Optional[_ratio_2_t] = None,
                 return_indices: bool = False, _random_samples=None) -> None:
        super().__init__()
        self.kernel_size = _pair(kernel_size)
        self.return_indices = return_indices
        # Registered as a buffer so user-supplied samples (used for
        # deterministic testing) move with the module across devices.
        self.register_buffer('_random_samples', _random_samples)
        # Exactly one of output_size / output_ratio may be given; the other
        # stays None and F.fractional_max_pool2d dispatches on that.
        self.output_size = _pair(output_size) if output_size is not None else None
        self.output_ratio = _pair(output_ratio) if output_ratio is not None else None
        if output_size is None and output_ratio is None:
            raise ValueError("FractionalMaxPool2d requires specifying either "
                             "an output size, or a pooling ratio")
        if output_size is not None and output_ratio is not None:
            raise ValueError("only one of output_size and output_ratio may be specified")
        if self.output_ratio is not None:
            # Ratios must lie strictly inside (0, 1) in both dimensions.
            if not (0 < self.output_ratio[0] < 1 and 0 < self.output_ratio[1] < 1):
                raise ValueError(f"output_ratio must be between 0 and 1 (got {output_ratio})")

    def forward(self, input: Tensor):
        # Returns a Tensor, or (values, indices) when return_indices is True.
        return F.fractional_max_pool2d(
            input, self.kernel_size, self.output_size, self.output_ratio,
            self.return_indices,
            _random_samples=self._random_samples)
816
+
817
+
818
class FractionalMaxPool3d(Module):
    r"""Applies 3D fractional max pooling over an input of several input planes.

    Fractional MaxPooling is described in `Fractional MaxPooling`_ by Ben Graham.
    Max pooling is applied in :math:`kT \times kH \times kW` regions with a
    stochastic step size determined by the target output size; the number of
    output features equals the number of input planes.

    .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined.

    Args:
        kernel_size: size of the pooling window — a single number ``k`` for a
            ``k x k x k`` window or a tuple ``(kt, kh, kw)``.
        output_size: target output size ``oT x oH x oW`` — a tuple
            ``(oT, oH, oW)`` or a single number for a cubic output.
        output_ratio: target output size given as a ratio of the input size;
            a number or tuple with components in (0, 1).
        return_indices: if ``True``, also return the pooling indices
            (useful for :meth:`nn.MaxUnpool3d`). Default: ``False``.

    Shape:
        - Input: :math:`(N, C, T_{in}, H_{in}, W_{in})` or :math:`(C, T_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, T_{out}, H_{out}, W_{out})` or
          :math:`(C, T_{out}, H_{out}, W_{out})`, where the output size is
          ``output_size`` or ``output_ratio * input_size``.

    Examples:
        >>> # pool of cubic window of size=3, and target output size 13x12x11
        >>> m = nn.FractionalMaxPool3d(3, output_size=(13, 12, 11))
        >>> # pool of cubic window and target output size being half of input size
        >>> m = nn.FractionalMaxPool3d(3, output_ratio=(0.5, 0.5, 0.5))
        >>> output = m(torch.randn(20, 16, 50, 32, 16))

    .. _Fractional MaxPooling:
        https://arxiv.org/abs/1412.6071
    """

    __constants__ = ['kernel_size', 'return_indices', 'output_size',
                     'output_ratio']
    kernel_size: _size_3_t
    return_indices: bool
    output_size: _size_3_t
    output_ratio: _ratio_3_t

    def __init__(self, kernel_size: _size_3_t, output_size: Optional[_size_3_t] = None,
                 output_ratio: Optional[_ratio_3_t] = None,
                 return_indices: bool = False, _random_samples=None) -> None:
        super().__init__()
        # Exactly one of the two target specifications must be provided.
        if output_size is None and output_ratio is None:
            raise ValueError("FractionalMaxPool3d requires specifying either "
                             "an output size, or a pooling ratio")
        if output_size is not None and output_ratio is not None:
            raise ValueError("only one of output_size and output_ratio may be specified")
        self.kernel_size = _triple(kernel_size)
        self.return_indices = return_indices
        # Buffer so the samples follow device moves and serialization.
        self.register_buffer('_random_samples', _random_samples)
        self.output_size = None if output_size is None else _triple(output_size)
        self.output_ratio = None if output_ratio is None else _triple(output_ratio)
        if self.output_ratio is not None:
            if not all(0 < ratio < 1 for ratio in self.output_ratio):
                raise ValueError(f"output_ratio must be between 0 and 1 (got {output_ratio})")

    def forward(self, input: Tensor):
        """Apply 3D fractional max pooling; also returns indices when
        ``self.return_indices`` is set."""
        samples = self._random_samples
        return F.fractional_max_pool3d(
            input, self.kernel_size, self.output_size, self.output_ratio,
            self.return_indices, _random_samples=samples)
887
+
888
+
889
+ class _LPPoolNd(Module):
890
+ __constants__ = ['norm_type', 'kernel_size', 'stride', 'ceil_mode']
891
+
892
+ norm_type: float
893
+ ceil_mode: bool
894
+
895
+ def __init__(self, norm_type: float, kernel_size: _size_any_t, stride: Optional[_size_any_t] = None,
896
+ ceil_mode: bool = False) -> None:
897
+ super().__init__()
898
+ self.norm_type = norm_type
899
+ self.kernel_size = kernel_size
900
+ self.stride = stride
901
+ self.ceil_mode = ceil_mode
902
+
903
+ def extra_repr(self) -> str:
904
+ return 'norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, ' \
905
+ 'ceil_mode={ceil_mode}'.format(**self.__dict__)
906
+
907
+
908
class LPPool1d(_LPPoolNd):
    r"""Applies 1D power-average (LP) pooling over an input signal.

    Each window is reduced to

    .. math::
        f(X) = \sqrt[p]{\sum_{x \in X} x^{p}}

    with :math:`p` = :attr:`norm_type`. :math:`p = \infty` is equivalent to max
    pooling and :math:`p = 1` to sum pooling (proportional to average pooling).

    .. note:: If the sum to the power of `p` is zero, the gradient of this
        function is not defined; this implementation sets it to zero there.

    Args:
        kernel_size: a single int, the size of the window.
        stride: a single int, the stride of the window. Default: :attr:`kernel_size`.
        ceil_mode: when True, use `ceil` instead of `floor` for the output shape.

    Shape:
        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where

        .. math::
            L_{out} = \left\lfloor\frac{L_{in} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor

    Examples::
        >>> # power-2 pool of window of length 3, with stride 2.
        >>> m = nn.LPPool1d(2, 3, stride=2)
        >>> output = m(torch.randn(20, 16, 50))
    """

    kernel_size: _size_1_t
    stride: _size_1_t

    def forward(self, input: Tensor) -> Tensor:
        # norm_type may be stored as an int; the functional expects a float.
        p = float(self.norm_type)
        return F.lp_pool1d(input, p, self.kernel_size, self.stride, self.ceil_mode)
947
+
948
+
949
class LPPool2d(_LPPoolNd):
    r"""Applies 2D power-average (LP) pooling over an input signal.

    Each window is reduced to

    .. math::
        f(X) = \sqrt[p]{\sum_{x \in X} x^{p}}

    with :math:`p` = :attr:`norm_type`. :math:`p = \infty` is equivalent to max
    pooling and :math:`p = 1` to sum pooling (proportional to average pooling).

    :attr:`kernel_size` and :attr:`stride` may each be a single ``int`` (used
    for both height and width) or a tuple ``(height, width)``.

    .. note:: If the sum to the power of `p` is zero, the gradient of this
        function is not defined; this implementation sets it to zero there.

    Args:
        kernel_size: the size of the window.
        stride: the stride of the window. Default: :attr:`kernel_size`.
        ceil_mode: when True, use `ceil` instead of `floor` for the output shape.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

        .. math::
            H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor

        .. math::
            W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor

    Examples::

        >>> # power-2 pool of square window of size=3, stride=2
        >>> m = nn.LPPool2d(2, 3, stride=2)
        >>> # pool of non-square window of power 1.2
        >>> m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1))
        >>> output = m(torch.randn(20, 16, 50, 32))
    """

    kernel_size: _size_2_t
    stride: _size_2_t

    def forward(self, input: Tensor) -> Tensor:
        # norm_type may be stored as an int; the functional expects a float.
        p = float(self.norm_type)
        return F.lp_pool2d(input, p, self.kernel_size, self.stride, self.ceil_mode)
1001
+
1002
+
1003
+ class LPPool3d(_LPPoolNd):
1004
+ r"""Applies a 3D power-average pooling over an input signal composed of several input planes.
1005
+
1006
+ On each window, the function computed is:
1007
+
1008
+ .. math::
1009
+ f(X) = \sqrt[p]{\sum_{x \in X} x^{p}}
1010
+
1011
+ - At p = :math:`\infty`, one gets Max Pooling
1012
+ - At p = 1, one gets Sum Pooling (which is proportional to average pooling)
1013
+
1014
+ The parameters :attr:`kernel_size`, :attr:`stride` can either be:
1015
+
1016
+ - a single ``int`` -- in which case the same value is used for the height, width and depth dimension
1017
+ - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
1018
+ the second `int` for the height dimension and the third `int` for the width dimension
1019
+
1020
+ .. note:: If the sum to the power of `p` is zero, the gradient of this function is
1021
+ not defined. This implementation will set the gradient to zero in this case.
1022
+
1023
+ Args:
1024
+ kernel_size: the size of the window
1025
+ stride: the stride of the window. Default value is :attr:`kernel_size`
1026
+ ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
1027
+
1028
+ Shape:
1029
+ - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
1030
+ - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
1031
+ :math:`(C, D_{out}, H_{out}, W_{out})`, where
1032
+
1033
+ .. math::
1034
+ D_{out} = \left\lfloor\frac{D_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor
1035
+
1036
+ .. math::
1037
+ H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor
1038
+
1039
+ .. math::
1040
+ W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor
1041
+
1042
+ Examples::
1043
+
1044
+ >>> # power-2 pool of square window of size=3, stride=2
1045
+ >>> m = nn.LPPool3d(2, 3, stride=2)
1046
+ >>> # pool of non-square window of power 1.2
1047
+ >>> m = nn.LPPool3d(1.2, (3, 2, 2), stride=(2, 1, 2))
1048
+ >>> input = torch.randn(20, 16, 50, 44, 31)
1049
+ >>> output = m(input)
1050
+
1051
+ """
1052
+
1053
+ kernel_size: _size_3_t
1054
+ stride: _size_3_t
1055
+
1056
+ def forward(self, input: Tensor) -> Tensor:
1057
+ return F.lp_pool3d(input, float(self.norm_type), self.kernel_size,
1058
+ self.stride, self.ceil_mode)
1059
+
1060
+
1061
+ class _AdaptiveMaxPoolNd(Module):
1062
+ __constants__ = ['output_size', 'return_indices']
1063
+ return_indices: bool
1064
+
1065
+ def __init__(self, output_size: _size_any_opt_t, return_indices: bool = False) -> None:
1066
+ super().__init__()
1067
+ self.output_size = output_size
1068
+ self.return_indices = return_indices
1069
+
1070
+ def extra_repr(self) -> str:
1071
+ return f'output_size={self.output_size}'
1072
+
1073
+ # FIXME (by @ssnl): Improve adaptive pooling docs: specify what the input and
1074
+ # output shapes are, and how the operation computes output.
1075
+
1076
+
1077
class AdaptiveMaxPool1d(_AdaptiveMaxPoolNd):
    r"""Applies 1D adaptive max pooling over an input signal.

    The output size is :math:`L_{out}` for any input size; the number of
    output features equals the number of input planes.

    Args:
        output_size: the target output size :math:`L_{out}`.
        return_indices: if ``True``, also return the pooling indices
            (useful for nn.MaxUnpool1d). Default: ``False``.

    Shape:
        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
          :math:`L_{out}=\text{output\_size}`.

    Examples:
        >>> # target output size of 5
        >>> m = nn.AdaptiveMaxPool1d(5)
        >>> output = m(torch.randn(1, 64, 8))
    """

    output_size: _size_1_t

    def forward(self, input: Tensor):
        size = self.output_size
        return F.adaptive_max_pool1d(input, size, self.return_indices)
1105
+
1106
+
1107
class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd):
    r"""Applies 2D adaptive max pooling over an input signal.

    The output is of size :math:`H_{out} \times W_{out}` for any input size;
    the number of output features equals the number of input planes.

    Args:
        output_size: the target output size :math:`H_{out} \times W_{out}` — a
            tuple :math:`(H_{out}, W_{out})` or a single :math:`H_{out}` for a
            square output. Each component may be an ``int`` or ``None`` (keep
            the corresponding input size).
        return_indices: if ``True``, also return the pooling indices
            (useful for nn.MaxUnpool2d). Default: ``False``.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`,
          where :math:`(H_{out}, W_{out})=\text{output\_size}`.

    Examples:
        >>> # target output size of 5x7
        >>> m = nn.AdaptiveMaxPool2d((5, 7))
        >>> output = m(torch.randn(1, 64, 8, 9))
        >>> # target output size of 7x7 (square)
        >>> m = nn.AdaptiveMaxPool2d(7)
        >>> output = m(torch.randn(1, 64, 10, 9))
        >>> # target output size of 10x7
        >>> m = nn.AdaptiveMaxPool2d((None, 7))
        >>> output = m(torch.randn(1, 64, 10, 9))
    """

    output_size: _size_2_opt_t

    def forward(self, input: Tensor):
        size = self.output_size
        return F.adaptive_max_pool2d(input, size, self.return_indices)
1147
+
1148
+
1149
class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd):
    r"""Applies 3D adaptive max pooling over an input signal.

    The output is of size :math:`D_{out} \times H_{out} \times W_{out}` for any
    input size; the number of output features equals the number of input planes.

    Args:
        output_size: the target output size :math:`D_{out} \times H_{out} \times W_{out}`
            — a tuple :math:`(D_{out}, H_{out}, W_{out})` or a single
            :math:`D_{out}` for a cubic output. Each component may be an ``int``
            or ``None`` (keep the corresponding input size).
        return_indices: if ``True``, also return the pooling indices
            (useful for nn.MaxUnpool3d). Default: ``False``.

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
          :math:`(C, D_{out}, H_{out}, W_{out})`, where
          :math:`(D_{out}, H_{out}, W_{out})=\text{output\_size}`.

    Examples:
        >>> # target output size of 5x7x9
        >>> m = nn.AdaptiveMaxPool3d((5, 7, 9))
        >>> output = m(torch.randn(1, 64, 8, 9, 10))
        >>> # target output size of 7x7x7 (cube)
        >>> m = nn.AdaptiveMaxPool3d(7)
        >>> output = m(torch.randn(1, 64, 10, 9, 8))
        >>> # target output size of 7x9x8
        >>> m = nn.AdaptiveMaxPool3d((7, None, None))
        >>> output = m(torch.randn(1, 64, 10, 9, 8))
    """

    output_size: _size_3_opt_t

    def forward(self, input: Tensor):
        size = self.output_size
        return F.adaptive_max_pool3d(input, size, self.return_indices)
1190
+
1191
+
1192
+ class _AdaptiveAvgPoolNd(Module):
1193
+ __constants__ = ['output_size']
1194
+
1195
+ def __init__(self, output_size: _size_any_opt_t) -> None:
1196
+ super().__init__()
1197
+ self.output_size = output_size
1198
+
1199
+ def extra_repr(self) -> str:
1200
+ return f'output_size={self.output_size}'
1201
+
1202
+
1203
class AdaptiveAvgPool1d(_AdaptiveAvgPoolNd):
    r"""Applies 1D adaptive average pooling over an input signal.

    The output size is :math:`L_{out}` for any input size; the number of
    output features equals the number of input planes.

    Args:
        output_size: the target output size :math:`L_{out}`.

    Shape:
        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
          :math:`L_{out}=\text{output\_size}`.

    Examples:
        >>> # target output size of 5
        >>> m = nn.AdaptiveAvgPool1d(5)
        >>> output = m(torch.randn(1, 64, 8))
    """

    output_size: _size_1_t

    def forward(self, input: Tensor) -> Tensor:
        size = self.output_size
        return F.adaptive_avg_pool1d(input, size)
1229
+
1230
+
1231
class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd):
    r"""Applies 2D adaptive average pooling over an input signal.

    The output is of size H x W for any input size; the number of output
    features equals the number of input planes.

    Args:
        output_size: the target output size H x W — a tuple (H, W) or a single
            H for a square output. H and W may each be an ``int`` or ``None``
            (keep the corresponding input size).

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, S_{0}, S_{1})` or :math:`(C, S_{0}, S_{1})`,
          where :math:`S=\text{output\_size}`.

    Examples:
        >>> # target output size of 5x7
        >>> m = nn.AdaptiveAvgPool2d((5, 7))
        >>> output = m(torch.randn(1, 64, 8, 9))
        >>> # target output size of 7x7 (square)
        >>> m = nn.AdaptiveAvgPool2d(7)
        >>> output = m(torch.randn(1, 64, 10, 9))
        >>> # target output size of 10x7
        >>> m = nn.AdaptiveAvgPool2d((None, 7))
        >>> output = m(torch.randn(1, 64, 10, 9))
    """

    output_size: _size_2_opt_t

    def forward(self, input: Tensor) -> Tensor:
        size = self.output_size
        return F.adaptive_avg_pool2d(input, size)
1268
+
1269
+
1270
class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd):
    r"""Applies 3D adaptive average pooling over an input signal.

    The output is of size D x H x W for any input size; the number of output
    features equals the number of input planes.

    Args:
        output_size: the target output size D x H x W — a tuple (D, H, W) or a
            single D for a cubic output. D, H and W may each be an ``int`` or
            ``None`` (keep the corresponding input size).

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, S_{0}, S_{1}, S_{2})` or :math:`(C, S_{0}, S_{1}, S_{2})`,
          where :math:`S=\text{output\_size}`.

    Examples:
        >>> # target output size of 5x7x9
        >>> m = nn.AdaptiveAvgPool3d((5, 7, 9))
        >>> output = m(torch.randn(1, 64, 8, 9, 10))
        >>> # target output size of 7x7x7 (cube)
        >>> m = nn.AdaptiveAvgPool3d(7)
        >>> output = m(torch.randn(1, 64, 10, 9, 8))
        >>> # target output size of 7x9x8
        >>> m = nn.AdaptiveAvgPool3d((7, None, None))
        >>> output = m(torch.randn(1, 64, 10, 9, 8))
    """

    output_size: _size_3_opt_t

    def forward(self, input: Tensor) -> Tensor:
        size = self.output_size
        return F.adaptive_avg_pool3d(input, size)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/transformer.py ADDED
@@ -0,0 +1,975 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import Optional, Any, Union, Callable
3
+
4
+ import torch
5
+ import warnings
6
+ from torch import Tensor
7
+ from .. import functional as F
8
+ from .module import Module
9
+ from .activation import MultiheadAttention
10
+ from .container import ModuleList
11
+ from ..init import xavier_uniform_
12
+ from .dropout import Dropout
13
+ from .linear import Linear
14
+ from .normalization import LayerNorm
15
+
16
+ __all__ = ['Transformer', 'TransformerEncoder', 'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer']
17
+
18
+ def _generate_square_subsequent_mask(
19
+ sz: int,
20
+ device: Optional[torch.device] = None,
21
+ dtype: Optional[torch.dtype] = None,
22
+ ) -> Tensor:
23
+ r"""Generate a square causal mask for the sequence.
24
+
25
+ The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0).
26
+ """
27
+ if device is None:
28
+ device = torch.device('cpu')
29
+ if dtype is None:
30
+ dtype = torch.float32
31
+ return torch.triu(
32
+ torch.full((sz, sz), float('-inf'), dtype=dtype, device=device),
33
+ diagonal=1,
34
+ )
35
+
36
+
37
+ def _get_seq_len(
38
+ src: Tensor,
39
+ batch_first: bool
40
+ ) -> Optional[int]:
41
+
42
+ if src.is_nested:
43
+ return None
44
+ else:
45
+ src_size = src.size()
46
+ if len(src_size) == 2:
47
+ # unbatched: S, E
48
+ return src_size[0]
49
+ else:
50
+ # batched: B, S, E if batch_first else S, B, E
51
+ seq_len_pos = 1 if batch_first else 0
52
+ return src_size[seq_len_pos]
53
+
54
+
55
+ class Transformer(Module):
56
+ r"""A transformer model.
57
+
58
+ User is able to modify the attributes as needed. The architecture
59
+ is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
60
+ Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
61
+ Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
62
+ Processing Systems, pages 6000-6010.
63
+
64
+ Args:
65
+ d_model: the number of expected features in the encoder/decoder inputs (default=512).
66
+ nhead: the number of heads in the multiheadattention models (default=8).
67
+ num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6).
68
+ num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6).
69
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
70
+ dropout: the dropout value (default=0.1).
71
+ activation: the activation function of encoder/decoder intermediate layer, can be a string
72
+ ("relu" or "gelu") or a unary callable. Default: relu
73
+ custom_encoder: custom encoder (default=None).
74
+ custom_decoder: custom decoder (default=None).
75
+ layer_norm_eps: the eps value in layer normalization components (default=1e-5).
76
+ batch_first: If ``True``, then the input and output tensors are provided
77
+ as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
78
+ norm_first: if ``True``, encoder and decoder layers will perform LayerNorms before
79
+ other attention and feedforward operations, otherwise after. Default: ``False`` (after).
80
+ bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
81
+ bias. Default: ``True``.
82
+
83
+ Examples::
84
+ >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
85
+ >>> src = torch.rand((10, 32, 512))
86
+ >>> tgt = torch.rand((20, 32, 512))
87
+ >>> out = transformer_model(src, tgt)
88
+
89
+ Note: A full example to apply nn.Transformer module for the word language model is available in
90
+ https://github.com/pytorch/examples/tree/master/word_language_model
91
+ """
92
+
93
+ def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6,
94
+ num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1,
95
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
96
+ custom_encoder: Optional[Any] = None, custom_decoder: Optional[Any] = None,
97
+ layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
98
+ bias: bool = True, device=None, dtype=None) -> None:
99
+ factory_kwargs = {'device': device, 'dtype': dtype}
100
+ super().__init__()
101
+ torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
102
+
103
+ if custom_encoder is not None:
104
+ self.encoder = custom_encoder
105
+ else:
106
+ encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
107
+ activation, layer_norm_eps, batch_first, norm_first,
108
+ bias, **factory_kwargs)
109
+ encoder_norm = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
110
+ self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
111
+
112
+ if custom_decoder is not None:
113
+ self.decoder = custom_decoder
114
+ else:
115
+ decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
116
+ activation, layer_norm_eps, batch_first, norm_first,
117
+ bias, **factory_kwargs)
118
+ decoder_norm = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
119
+ self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
120
+
121
+ self._reset_parameters()
122
+
123
+ self.d_model = d_model
124
+ self.nhead = nhead
125
+
126
+ self.batch_first = batch_first
127
+
128
+ def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None,
129
+ memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None,
130
+ tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None,
131
+ src_is_causal: Optional[bool] = None, tgt_is_causal: Optional[bool] = None,
132
+ memory_is_causal: bool = False) -> Tensor:
133
+ r"""Take in and process masked source/target sequences.
134
+
135
+ .. note::
136
+
137
+ If a boolean tensor is provided for any of the [src/tgt/memory]_mask arguments, positions with a ``True`` value are
138
+ not allowed to participate in the attention,
139
+ which is the opposite of the definition for :attr:`attn_mask`
140
+ in :func:`torch.nn.functional.scaled_dot_product_attention`.
141
+
142
+ Args:
143
+ src: the sequence to the encoder (required).
144
+ tgt: the sequence to the decoder (required).
145
+ src_mask: the additive mask for the src sequence (optional).
146
+ tgt_mask: the additive mask for the tgt sequence (optional).
147
+ memory_mask: the additive mask for the encoder output (optional).
148
+ src_key_padding_mask: the Tensor mask for src keys per batch (optional).
149
+ tgt_key_padding_mask: the Tensor mask for tgt keys per batch (optional).
150
+ memory_key_padding_mask: the Tensor mask for memory keys per batch (optional).
151
+ src_is_causal: If specified, applies a causal mask as ``src_mask``.
152
+ Default: ``None``; try to detect a causal mask.
153
+ Warning:
154
+ ``src_is_causal`` provides a hint that ``src_mask`` is
155
+ the causal mask. Providing incorrect hints can result in
156
+ incorrect execution, including forward and backward
157
+ compatibility.
158
+ tgt_is_causal: If specified, applies a causal mask as ``tgt_mask``.
159
+ Default: ``None``; try to detect a causal mask.
160
+ Warning:
161
+ ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
162
+ the causal mask. Providing incorrect hints can result in
163
+ incorrect execution, including forward and backward
164
+ compatibility.
165
+ memory_is_causal: If specified, applies a causal mask as
166
+ ``memory_mask``.
167
+ Default: ``False``.
168
+ Warning:
169
+ ``memory_is_causal`` provides a hint that
170
+ ``memory_mask`` is the causal mask. Providing incorrect
171
+ hints can result in incorrect execution, including
172
+ forward and backward compatibility.
173
+
174
+ Shape:
175
+ - src: :math:`(S, E)` for unbatched input, :math:`(S, N, E)` if `batch_first=False` or
176
+ `(N, S, E)` if `batch_first=True`.
177
+ - tgt: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or
178
+ `(N, T, E)` if `batch_first=True`.
179
+ - src_mask: :math:`(S, S)` or :math:`(N\cdot\text{num\_heads}, S, S)`.
180
+ - tgt_mask: :math:`(T, T)` or :math:`(N\cdot\text{num\_heads}, T, T)`.
181
+ - memory_mask: :math:`(T, S)`.
182
+ - src_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`.
183
+ - tgt_key_padding_mask: :math:`(T)` for unbatched input otherwise :math:`(N, T)`.
184
+ - memory_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`.
185
+
186
+ Note: [src/tgt/memory]_mask ensures that position :math:`i` is allowed to attend the unmasked
187
+ positions. If a BoolTensor is provided, positions with ``True``
188
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
189
+ is provided, it will be added to the attention weight.
190
+ [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by
191
+ the attention. If a BoolTensor is provided, the positions with the
192
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
193
+
194
+ - output: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or
195
+ `(N, T, E)` if `batch_first=True`.
196
+
197
+ Note: Due to the multi-head attention architecture in the transformer model,
198
+ the output sequence length of a transformer is same as the input sequence
199
+ (i.e. target) length of the decoder.
200
+
201
+ where :math:`S` is the source sequence length, :math:`T` is the target sequence length, :math:`N` is the
202
+ batch size, :math:`E` is the feature number
203
+
204
+ Examples:
205
+ >>> # xdoctest: +SKIP
206
+ >>> output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
207
+ """
208
+ is_batched = src.dim() == 3
209
+ if not self.batch_first and src.size(1) != tgt.size(1) and is_batched:
210
+ raise RuntimeError("the batch number of src and tgt must be equal")
211
+ elif self.batch_first and src.size(0) != tgt.size(0) and is_batched:
212
+ raise RuntimeError("the batch number of src and tgt must be equal")
213
+
214
+ if src.size(-1) != self.d_model or tgt.size(-1) != self.d_model:
215
+ raise RuntimeError("the feature number of src and tgt must be equal to d_model")
216
+
217
+ memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask,
218
+ is_causal=src_is_causal)
219
+ output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
220
+ tgt_key_padding_mask=tgt_key_padding_mask,
221
+ memory_key_padding_mask=memory_key_padding_mask,
222
+ tgt_is_causal=tgt_is_causal, memory_is_causal=memory_is_causal)
223
+ return output
224
+
225
+ @staticmethod
226
+ def generate_square_subsequent_mask(
227
+ sz: int,
228
+ device: Optional[torch.device] = None,
229
+ dtype: Optional[torch.dtype] = None,
230
+ ) -> Tensor:
231
+ r"""Generate a square causal mask for the sequence.
232
+
233
+ The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0).
234
+ """
235
+ return _generate_square_subsequent_mask(sz, dtype=dtype, device=device)
236
+
237
+ def _reset_parameters(self):
238
+ r"""Initiate parameters in the transformer model."""
239
+ for p in self.parameters():
240
+ if p.dim() > 1:
241
+ xavier_uniform_(p)
242
+
243
+
244
class TransformerEncoder(Module):
    r"""TransformerEncoder is a stack of N encoder layers.

    Users can build the BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.

    Args:
        encoder_layer: an instance of the TransformerEncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        norm: the layer normalization component (optional).
        enable_nested_tensor: if True, input will automatically convert to nested tensor
            (and convert back on output). This will improve the overall performance of
            TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
        mask_check: if True, ``forward`` verifies that ``src_key_padding_mask`` is
            left-aligned before taking the nested-tensor fast path. Default: ``True``.

    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 32, 512)
        >>> out = transformer_encoder(src)
    """

    __constants__ = ['norm']

    def __init__(
        self,
        encoder_layer: "TransformerEncoderLayer",
        num_layers: int,
        norm: Optional[Module] = None,
        enable_nested_tensor: bool = True,
        mask_check: bool = True
    ) -> None:
        super().__init__()
        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
        # Each layer is an independent deep copy of encoder_layer.
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        # this attribute saves the value provided at object construction
        self.enable_nested_tensor = enable_nested_tensor
        # this attribute controls whether nested tensors are used
        self.use_nested_tensor = enable_nested_tensor
        self.mask_check = mask_check

        # Eagerly validate whether the nested-tensor ("sparsity") fast path can
        # ever apply to this layer configuration; record the first disqualifier.
        enc_layer = "encoder_layer"
        why_not_sparsity_fast_path = ''
        if not isinstance(encoder_layer, torch.nn.TransformerEncoderLayer):
            why_not_sparsity_fast_path = f"{enc_layer} was not TransformerEncoderLayer"
        elif encoder_layer.norm_first :
            why_not_sparsity_fast_path = f"{enc_layer}.norm_first was True"
        elif not encoder_layer.self_attn.batch_first:
            why_not_sparsity_fast_path = (f"{enc_layer}.self_attn.batch_first was not True" +
                                          "(use batch_first for better inference performance)")
        elif not encoder_layer.self_attn._qkv_same_embed_dim:
            why_not_sparsity_fast_path = f"{enc_layer}.self_attn._qkv_same_embed_dim was not True"
        elif encoder_layer.self_attn.in_proj_bias is None:
            why_not_sparsity_fast_path = f"{enc_layer}.self_attn was passed bias=False"
        elif not encoder_layer.activation_relu_or_gelu:
            why_not_sparsity_fast_path = f"{enc_layer}.activation_relu_or_gelu was not True"
        elif not (encoder_layer.norm1.eps == encoder_layer.norm2.eps) :
            why_not_sparsity_fast_path = f"{enc_layer}.norm1.eps was not equal to {enc_layer}.norm2.eps"
        elif encoder_layer.self_attn.num_heads % 2 == 1:
            why_not_sparsity_fast_path = f"{enc_layer}.self_attn.num_heads is odd"

        if enable_nested_tensor and why_not_sparsity_fast_path:
            warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}")
            self.use_nested_tensor = False


    def forward(
            self,
            src: Tensor,
            mask: Optional[Tensor] = None,
            src_key_padding_mask: Optional[Tensor] = None,
            is_causal: Optional[bool] = None) -> Tensor:
        r"""Pass the input through the encoder layers in turn.

        Args:
            src: the sequence to the encoder (required).
            mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
            is_causal: If specified, applies a causal mask as ``mask``.
                Default: ``None``; try to detect a causal mask.
                Warning:
                ``is_causal`` provides a hint that ``mask`` is the
                causal mask. Providing incorrect hints can result in
                incorrect execution, including forward and backward
                compatibility.

        Shape:
            see the docs in :class:`~torch.nn.Transformer`.
        """
        # Normalize both masks to a canonical (float) representation and check
        # their dtypes are mutually consistent.
        src_key_padding_mask = F._canonical_mask(
            mask=src_key_padding_mask,
            mask_name="src_key_padding_mask",
            other_type=F._none_or_dtype(mask),
            other_name="mask",
            target_type=src.dtype
        )

        mask = F._canonical_mask(
            mask=mask,
            mask_name="mask",
            other_type=None,
            other_name="",
            target_type=src.dtype,
            check_other=False,
        )

        output = src
        convert_to_nested = False
        first_layer = self.layers[0]
        src_key_padding_mask_for_layers = src_key_padding_mask
        why_not_sparsity_fast_path = ''
        str_first_layer = "self.layers[0]"
        batch_first = first_layer.self_attn.batch_first
        is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()

        # Runtime (per-call) checks for the nested-tensor fast path; the first
        # failing condition records why the fast path is skipped.
        if not is_fastpath_enabled:
            why_not_sparsity_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True"
        elif not hasattr(self, "use_nested_tensor"):
            why_not_sparsity_fast_path = "use_nested_tensor attribute not present"
        elif not self.use_nested_tensor:
            why_not_sparsity_fast_path = "self.use_nested_tensor (set in init) was not True"
        elif first_layer.training:
            why_not_sparsity_fast_path = f"{str_first_layer} was in training mode"
        elif not src.dim() == 3:
            why_not_sparsity_fast_path = f"input not batched; expected src.dim() of 3 but got {src.dim()}"
        elif src_key_padding_mask is None:
            why_not_sparsity_fast_path = "src_key_padding_mask was None"
        elif (((not hasattr(self, "mask_check")) or self.mask_check)
                and not torch._nested_tensor_from_mask_left_aligned(src, src_key_padding_mask.logical_not())):
            why_not_sparsity_fast_path = "mask_check enabled, and src and src_key_padding_mask was not left aligned"
        elif output.is_nested:
            why_not_sparsity_fast_path = "NestedTensor input is not supported"
        elif mask is not None:
            why_not_sparsity_fast_path = "src_key_padding_mask and mask were both supplied"
        elif torch.is_autocast_enabled():
            why_not_sparsity_fast_path = "autocast is enabled"

        if not why_not_sparsity_fast_path:
            # Tensor-level checks: no torch_function overrides, a supported
            # device, and no autograd requirement on any involved tensor.
            tensor_args = (
                src,
                first_layer.self_attn.in_proj_weight,
                first_layer.self_attn.in_proj_bias,
                first_layer.self_attn.out_proj.weight,
                first_layer.self_attn.out_proj.bias,
                first_layer.norm1.weight,
                first_layer.norm1.bias,
                first_layer.norm2.weight,
                first_layer.norm2.bias,
                first_layer.linear1.weight,
                first_layer.linear1.bias,
                first_layer.linear2.weight,
                first_layer.linear2.bias,
            )
            _supported_device_type = ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name]
            if torch.overrides.has_torch_function(tensor_args):
                why_not_sparsity_fast_path = "some Tensor argument has_torch_function"
            elif src.device.type not in _supported_device_type:
                why_not_sparsity_fast_path = f"src device is neither one of {_supported_device_type}"
            elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args):
                why_not_sparsity_fast_path = ("grad is enabled and at least one of query or the "
                                              "input/output projection weights or biases requires_grad")

            if (not why_not_sparsity_fast_path) and (src_key_padding_mask is not None):
                # Fast path taken: fold the padding mask into a NestedTensor and
                # stop passing the mask to individual layers.
                convert_to_nested = True
                output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
                src_key_padding_mask_for_layers = None

        seq_len = _get_seq_len(src, batch_first)
        # Resolve the is_causal hint (None means "auto-detect from mask").
        is_causal = _detect_is_causal_mask(mask, is_causal, seq_len)

        for mod in self.layers:
            output = mod(output, src_mask=mask, is_causal=is_causal, src_key_padding_mask=src_key_padding_mask_for_layers)

        if convert_to_nested:
            # Convert back from NestedTensor to a regular padded tensor.
            output = output.to_padded_tensor(0., src.size())

        if self.norm is not None:
            output = self.norm(output)

        return output
424
+
425
+
426
+ class TransformerDecoder(Module):
427
+ r"""TransformerDecoder is a stack of N decoder layers.
428
+
429
+ Args:
430
+ decoder_layer: an instance of the TransformerDecoderLayer() class (required).
431
+ num_layers: the number of sub-decoder-layers in the decoder (required).
432
+ norm: the layer normalization component (optional).
433
+
434
+ Examples::
435
+ >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
436
+ >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
437
+ >>> memory = torch.rand(10, 32, 512)
438
+ >>> tgt = torch.rand(20, 32, 512)
439
+ >>> out = transformer_decoder(tgt, memory)
440
+ """
441
+
442
+ __constants__ = ['norm']
443
+
444
+ def __init__(
445
+ self,
446
+ decoder_layer: "TransformerDecoderLayer",
447
+ num_layers: int,
448
+ norm: Optional[Module] = None
449
+ ) -> None:
450
+ super().__init__()
451
+ torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
452
+ self.layers = _get_clones(decoder_layer, num_layers)
453
+ self.num_layers = num_layers
454
+ self.norm = norm
455
+
456
+ def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
457
+ memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None,
458
+ memory_key_padding_mask: Optional[Tensor] = None, tgt_is_causal: Optional[bool] = None,
459
+ memory_is_causal: bool = False) -> Tensor:
460
+ r"""Pass the inputs (and mask) through the decoder layer in turn.
461
+
462
+ Args:
463
+ tgt: the sequence to the decoder (required).
464
+ memory: the sequence from the last layer of the encoder (required).
465
+ tgt_mask: the mask for the tgt sequence (optional).
466
+ memory_mask: the mask for the memory sequence (optional).
467
+ tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
468
+ memory_key_padding_mask: the mask for the memory keys per batch (optional).
469
+ tgt_is_causal: If specified, applies a causal mask as ``tgt mask``.
470
+ Default: ``None``; try to detect a causal mask.
471
+ Warning:
472
+ ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
473
+ the causal mask. Providing incorrect hints can result in
474
+ incorrect execution, including forward and backward
475
+ compatibility.
476
+ memory_is_causal: If specified, applies a causal mask as
477
+ ``memory mask``.
478
+ Default: ``False``.
479
+ Warning:
480
+ ``memory_is_causal`` provides a hint that
481
+ ``memory_mask`` is the causal mask. Providing incorrect
482
+ hints can result in incorrect execution, including
483
+ forward and backward compatibility.
484
+
485
+ Shape:
486
+ see the docs in :class:`~torch.nn.Transformer`.
487
+ """
488
+ output = tgt
489
+
490
+ seq_len = _get_seq_len(tgt, self.layers[0].self_attn.batch_first)
491
+ tgt_is_causal = _detect_is_causal_mask(tgt_mask, tgt_is_causal, seq_len)
492
+
493
+ for mod in self.layers:
494
+ output = mod(output, memory, tgt_mask=tgt_mask,
495
+ memory_mask=memory_mask,
496
+ tgt_key_padding_mask=tgt_key_padding_mask,
497
+ memory_key_padding_mask=memory_key_padding_mask,
498
+ tgt_is_causal=tgt_is_causal,
499
+ memory_is_causal=memory_is_causal)
500
+
501
+ if self.norm is not None:
502
+ output = self.norm(output)
503
+
504
+ return output
505
+
506
class TransformerEncoderLayer(Module):
    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.

    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.

    TransformerEncoderLayer can handle either traditional torch.tensor inputs,
    or Nested Tensor inputs. Derived classes are expected to similarly accept
    both input formats. (Not all combinations of inputs are currently
    supported by TransformerEncoderLayer while Nested Tensor is in prototype
    state.)

    If you are implementing a custom layer, you may derive it either from
    the Module or TransformerEncoderLayer class. If your custom layer
    supports both torch.Tensors and Nested Tensors inputs, make its
    implementation a derived class of TransformerEncoderLayer. If your custom
    Layer supports only torch.Tensor inputs, derive its implementation from
    Module.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of the intermediate layer, can be a string
            ("relu" or "gelu") or a unary callable. Default: relu
        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
        batch_first: If ``True``, then the input and output tensors are provided
            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
        norm_first: if ``True``, layer norm is done prior to attention and feedforward
            operations, respectively. Otherwise it's done after. Default: ``False`` (after).
        bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
            bias. Default: ``True``.

    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)

    Alternatively, when ``batch_first`` is ``True``:
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
        >>> src = torch.rand(32, 10, 512)
        >>> out = encoder_layer(src)

    Fast path:
        forward() will use a special optimized implementation described in
        `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_ if all of the following
        conditions are met:

        - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor
          argument ``requires_grad``
        - training is disabled (using ``.eval()``)
        - batch_first is ``True`` and the input is batched (i.e., ``src.dim() == 3``)
        - activation is one of: ``"relu"``, ``"gelu"``, ``torch.functional.relu``, or ``torch.functional.gelu``
        - at most one of ``src_mask`` and ``src_key_padding_mask`` is passed
        - if src is a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_, neither ``src_mask``
          nor ``src_key_padding_mask`` is passed
        - the two ``LayerNorm`` instances have a consistent ``eps`` value (this will naturally be the case
          unless the caller has manually modified one without modifying the other)

        If the optimized implementation is in use, a
        `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be
        passed for ``src`` to represent padding more efficiently than using a padding
        mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ will be
        returned, and an additional speedup proportional to the fraction of the input that
        is padding can be expected.

    .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`:
     https://arxiv.org/abs/2205.14135

    """

    __constants__ = ['norm_first']

    def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
                 layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
                 bias: bool = True, device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout,
                                            bias=bias, batch_first=batch_first,
                                            **factory_kwargs)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)

        self.norm_first = norm_first
        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        # Legacy string support for activation function.
        if isinstance(activation, str):
            activation = _get_activation_fn(activation)

        # We can't test self.activation in forward() in TorchScript,
        # so stash some information about it instead.
        # 1 == relu, 2 == gelu, 0 == other (other disables the fast path).
        if activation is F.relu or isinstance(activation, torch.nn.ReLU):
            self.activation_relu_or_gelu = 1
        elif activation is F.gelu or isinstance(activation, torch.nn.GELU):
            self.activation_relu_or_gelu = 2
        else:
            self.activation_relu_or_gelu = 0
        self.activation = activation

    def __setstate__(self, state):
        super().__setstate__(state)
        # Backward compatibility: older checkpoints may lack 'activation'.
        if not hasattr(self, 'activation'):
            self.activation = F.relu


    def forward(
            self,
            src: Tensor,
            src_mask: Optional[Tensor] = None,
            src_key_padding_mask: Optional[Tensor] = None,
            is_causal: bool = False) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
            is_causal: If specified, applies a causal mask as ``src mask``.
                Default: ``False``.
                Warning:
                ``is_causal`` provides a hint that ``src_mask`` is the
                causal mask. Providing incorrect hints can result in
                incorrect execution, including forward and backward
                compatibility.

        Shape:
            see the docs in :class:`~torch.nn.Transformer`.
        """
        # Normalize both masks to a canonical representation with consistent dtypes.
        src_key_padding_mask = F._canonical_mask(
            mask=src_key_padding_mask,
            mask_name="src_key_padding_mask",
            other_type=F._none_or_dtype(src_mask),
            other_name="src_mask",
            target_type=src.dtype
        )

        src_mask = F._canonical_mask(
            mask=src_mask,
            mask_name="src_mask",
            other_type=None,
            other_name="",
            target_type=src.dtype,
            check_other=False,
        )

        is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()

        # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        # Gate the fused-kernel fast path; the first failing condition records why
        # the slow (eager) path is taken instead.
        why_not_sparsity_fast_path = ''
        if not is_fastpath_enabled:
            why_not_sparsity_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True"
        elif not src.dim() == 3:
            why_not_sparsity_fast_path = f"input not batched; expected src.dim() of 3 but got {src.dim()}"
        elif self.training:
            why_not_sparsity_fast_path = "training is enabled"
        elif not self.self_attn.batch_first:
            why_not_sparsity_fast_path = "self_attn.batch_first was not True"
        elif self.self_attn.in_proj_bias is None:
            why_not_sparsity_fast_path = "self_attn was passed bias=False"
        elif not self.self_attn._qkv_same_embed_dim:
            why_not_sparsity_fast_path = "self_attn._qkv_same_embed_dim was not True"
        elif not self.activation_relu_or_gelu:
            why_not_sparsity_fast_path = "activation_relu_or_gelu was not True"
        elif not (self.norm1.eps == self.norm2.eps):
            why_not_sparsity_fast_path = "norm1.eps is not equal to norm2.eps"
        elif src.is_nested and (src_key_padding_mask is not None or src_mask is not None):
            why_not_sparsity_fast_path = "neither src_key_padding_mask nor src_mask are not supported with NestedTensor input"
        elif self.self_attn.num_heads % 2 == 1:
            why_not_sparsity_fast_path = "num_head is odd"
        elif torch.is_autocast_enabled():
            why_not_sparsity_fast_path = "autocast is enabled"
        if not why_not_sparsity_fast_path:
            tensor_args = (
                src,
                self.self_attn.in_proj_weight,
                self.self_attn.in_proj_bias,
                self.self_attn.out_proj.weight,
                self.self_attn.out_proj.bias,
                self.norm1.weight,
                self.norm1.bias,
                self.norm2.weight,
                self.norm2.bias,
                self.linear1.weight,
                self.linear1.bias,
                self.linear2.weight,
                self.linear2.bias,
            )

            # We have to use list comprehensions below because TorchScript does not support
            # generator expressions.
            _supported_device_type = ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name]
            if torch.overrides.has_torch_function(tensor_args):
                why_not_sparsity_fast_path = "some Tensor argument has_torch_function"
            elif not all((x.device.type in _supported_device_type) for x in tensor_args):
                why_not_sparsity_fast_path = ("some Tensor argument's device is neither one of "
                                              f"{_supported_device_type}")
            elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args):
                why_not_sparsity_fast_path = ("grad is enabled and at least one of query or the "
                                              "input/output projection weights or biases requires_grad")

            if not why_not_sparsity_fast_path:
                # Fast path: a single fused C++ kernel runs the whole layer.
                merged_mask, mask_type = self.self_attn.merge_masks(src_mask, src_key_padding_mask, src)
                return torch._transformer_encoder_layer_fwd(
                    src,
                    self.self_attn.embed_dim,
                    self.self_attn.num_heads,
                    self.self_attn.in_proj_weight,
                    self.self_attn.in_proj_bias,
                    self.self_attn.out_proj.weight,
                    self.self_attn.out_proj.bias,
                    self.activation_relu_or_gelu == 2,
                    self.norm_first,
                    self.norm1.eps,
                    self.norm1.weight,
                    self.norm1.bias,
                    self.norm2.weight,
                    self.norm2.bias,
                    self.linear1.weight,
                    self.linear1.bias,
                    self.linear2.weight,
                    self.linear2.bias,
                    merged_mask,
                    mask_type,
                )


        # Slow (eager) path: pre-norm or post-norm residual blocks.
        x = src
        if self.norm_first:
            x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask, is_causal=is_causal)
            x = x + self._ff_block(self.norm2(x))
        else:
            x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal))
            x = self.norm2(x + self._ff_block(x))

        return x

    # self-attention block
    def _sa_block(self, x: Tensor,
                  attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor:
        x = self.self_attn(x, x, x,
                           attn_mask=attn_mask,
                           key_padding_mask=key_padding_mask,
                           need_weights=False, is_causal=is_causal)[0]
        return self.dropout1(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)
767
+
768
+
769
+ class TransformerDecoderLayer(Module):
770
+ r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
771
+
772
+ This standard decoder layer is based on the paper "Attention Is All You Need".
773
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
774
+ Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
775
+ Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
776
+ in a different way during application.
777
+
778
+ Args:
779
+ d_model: the number of expected features in the input (required).
780
+ nhead: the number of heads in the multiheadattention models (required).
781
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
782
+ dropout: the dropout value (default=0.1).
783
+ activation: the activation function of the intermediate layer, can be a string
784
+ ("relu" or "gelu") or a unary callable. Default: relu
785
+ layer_norm_eps: the eps value in layer normalization components (default=1e-5).
786
+ batch_first: If ``True``, then the input and output tensors are provided
787
+ as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
788
+ norm_first: if ``True``, layer norm is done prior to self attention, multihead
789
+ attention and feedforward operations, respectively. Otherwise it's done after.
790
+ Default: ``False`` (after).
791
+ bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
792
+ bias. Default: ``True``.
793
+
794
+ Examples::
795
+ >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
796
+ >>> memory = torch.rand(10, 32, 512)
797
+ >>> tgt = torch.rand(20, 32, 512)
798
+ >>> out = decoder_layer(tgt, memory)
799
+
800
+ Alternatively, when ``batch_first`` is ``True``:
801
+ >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True)
802
+ >>> memory = torch.rand(32, 10, 512)
803
+ >>> tgt = torch.rand(32, 20, 512)
804
+ >>> out = decoder_layer(tgt, memory)
805
+ """
806
+
807
+ __constants__ = ['norm_first']
808
+
809
+ def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1,
810
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
811
+ layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
812
+ bias: bool = True, device=None, dtype=None) -> None:
813
+ factory_kwargs = {'device': device, 'dtype': dtype}
814
+ super().__init__()
815
+ self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
816
+ bias=bias, **factory_kwargs)
817
+ self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
818
+ bias=bias, **factory_kwargs)
819
+ # Implementation of Feedforward model
820
+ self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
821
+ self.dropout = Dropout(dropout)
822
+ self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)
823
+
824
+ self.norm_first = norm_first
825
+ self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
826
+ self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
827
+ self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
828
+ self.dropout1 = Dropout(dropout)
829
+ self.dropout2 = Dropout(dropout)
830
+ self.dropout3 = Dropout(dropout)
831
+
832
+ # Legacy string support for activation function.
833
+ if isinstance(activation, str):
834
+ self.activation = _get_activation_fn(activation)
835
+ else:
836
+ self.activation = activation
837
+
838
+ def __setstate__(self, state):
839
+ if 'activation' not in state:
840
+ state['activation'] = F.relu
841
+ super().__setstate__(state)
842
+
843
+ def forward(
844
+ self,
845
+ tgt: Tensor,
846
+ memory: Tensor,
847
+ tgt_mask: Optional[Tensor] = None,
848
+ memory_mask: Optional[Tensor] = None,
849
+ tgt_key_padding_mask: Optional[Tensor] = None,
850
+ memory_key_padding_mask: Optional[Tensor] = None,
851
+ tgt_is_causal: bool = False,
852
+ memory_is_causal: bool = False,
853
+ ) -> Tensor:
854
+ r"""Pass the inputs (and mask) through the decoder layer.
855
+
856
+ Args:
857
+ tgt: the sequence to the decoder layer (required).
858
+ memory: the sequence from the last layer of the encoder (required).
859
+ tgt_mask: the mask for the tgt sequence (optional).
860
+ memory_mask: the mask for the memory sequence (optional).
861
+ tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
862
+ memory_key_padding_mask: the mask for the memory keys per batch (optional).
863
+ tgt_is_causal: If specified, applies a causal mask as ``tgt mask``.
864
+ Default: ``False``.
865
+ Warning:
866
+ ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
867
+ the causal mask. Providing incorrect hints can result in
868
+ incorrect execution, including forward and backward
869
+ compatibility.
870
+ memory_is_causal: If specified, applies a causal mask as
871
+ ``memory mask``.
872
+ Default: ``False``.
873
+ Warning:
874
+ ``memory_is_causal`` provides a hint that
875
+ ``memory_mask`` is the causal mask. Providing incorrect
876
+ hints can result in incorrect execution, including
877
+ forward and backward compatibility.
878
+
879
+ Shape:
880
+ see the docs in :class:`~torch.nn.Transformer`.
881
+ """
882
+ # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
883
+
884
+ x = tgt
885
+ if self.norm_first:
886
+ x = x + self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask, tgt_is_causal)
887
+ x = x + self._mha_block(self.norm2(x), memory, memory_mask, memory_key_padding_mask, memory_is_causal)
888
+ x = x + self._ff_block(self.norm3(x))
889
+ else:
890
+ x = self.norm1(x + self._sa_block(x, tgt_mask, tgt_key_padding_mask, tgt_is_causal))
891
+ x = self.norm2(x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask, memory_is_causal))
892
+ x = self.norm3(x + self._ff_block(x))
893
+
894
+ return x
895
+
896
+ # self-attention block
897
+ def _sa_block(self, x: Tensor,
898
+ attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor:
899
+ x = self.self_attn(x, x, x,
900
+ attn_mask=attn_mask,
901
+ key_padding_mask=key_padding_mask,
902
+ is_causal=is_causal,
903
+ need_weights=False)[0]
904
+ return self.dropout1(x)
905
+
906
+ # multihead attention block
907
+ def _mha_block(self, x: Tensor, mem: Tensor,
908
+ attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor:
909
+ x = self.multihead_attn(x, mem, mem,
910
+ attn_mask=attn_mask,
911
+ key_padding_mask=key_padding_mask,
912
+ is_causal=is_causal,
913
+ need_weights=False)[0]
914
+ return self.dropout2(x)
915
+
916
# feed forward block
def _ff_block(self, x: Tensor) -> Tensor:
    """Position-wise feed-forward sub-block: linear1 -> activation -> dropout -> linear2 -> dropout3."""
    hidden = self.activation(self.linear1(x))
    hidden = self.dropout(hidden)
    return self.dropout3(self.linear2(hidden))
920
+
921
+
922
+ def _get_clones(module, N):
923
+ # FIXME: copy.deepcopy() is not defined on nn.module
924
+ return ModuleList([copy.deepcopy(module) for i in range(N)])
925
+
926
+
927
+ def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]:
928
+ if activation == "relu":
929
+ return F.relu
930
+ elif activation == "gelu":
931
+ return F.gelu
932
+
933
+ raise RuntimeError(f"activation should be relu/gelu, not {activation}")
934
+
935
+
936
def _detect_is_causal_mask(
    mask: Optional[Tensor],
    is_causal: Optional[bool] = None,
    size: Optional[int] = None,
) -> bool:
    """Return whether ``mask`` is (or is hinted to be) a causal attention mask.

    Warning:
        If ``is_causal`` is not ``None``, its value is trusted and returned
        as is, without inspecting ``mask``. An incorrect hint can hurt:

        ``is_causal=False`` on a mask that really is causal may only reduce
        performance relative to passing ``is_causal=True``;
        ``is_causal=True`` on a mask that is not causal may lead to incorrect
        and unpredictable execution — depending on factors such as alignment
        and hardware SKU, either a causal mask or the supplied mask may end up
        being applied, so the outcome may not appear deterministic.

    ``size``, if not ``None``, checks for a causal mask of exactly that size;
    otherwise the mask's own penultimate dimension determines the size.
    """
    # An explicit hint short-circuits detection; only a literal True counts
    # (guards against type refinement of truthy non-bool values).
    if is_causal is not None:
        return is_causal is True

    if mask is None:
        return False

    sz = mask.size(-2) if size is None else size
    reference = _generate_square_subsequent_mask(
        sz, device=mask.device, dtype=mask.dtype)

    # Sizes must match exactly; the comparison deliberately avoids
    # `torch.equal` so an elementwise `==` could broadcast over batched masks.
    if mask.size() != reference.size():
        return False
    return bool((mask == reference).all())
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.06 kB). View file