diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_refs/__pycache__/fft.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_refs/__pycache__/fft.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..148daf4fa7c38270cca6df6f24d9daafac519aa3 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_refs/__pycache__/fft.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35cebf072561dc48742caae78e05dc620297c1b6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e01c52d5961647406c0452681c2e0de510dfee0a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8db49d03aeb3c5edb777e73ffab99df4788f06d3 Binary files /dev/null and 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cb3f76c7b5d150696bfd9b92286f8987bebac54 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f5b4315832d68cbf99cf1594d58a346a3af1c55 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_matcher.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..8db946ec707a71c752bb13a73fbc48fb0a003cf1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_matcher.py @@ -0,0 +1,460 @@ +import collections +import enum + +import torch +toq = torch.ops.quantized + +from torch.fx import GraphModule +from torch.fx.graph import Graph, Node + +from torch.ao.quantization.utils import getattr_from_fqn +from .ns_types import NSSubgraph, NSNodeTargetType +from .mappings import ( + get_base_name_to_sets_of_related_ops, + get_unmatchable_types_map, +) +from 
.pattern_utils import ( + get_type_a_related_to_b, + get_reversed_fusions, + end_node_matches_reversed_fusion, +) +from torch.ao.quantization import ( + ObserverBase, + FakeQuantizeBase, +) + +from typing import Dict, Tuple, List, Optional, Set, Any + +def _get_output_nodes(g: Graph) -> List[Node]: + return [n for n in g.nodes if n.op == 'output'] + +class _NSGraphMatchableSubgraphsIterator: + """ + Iterates through the graph of gm, starting with the output nodes + and continuing backwards. + 1. Returns matchable subgraphs, in order. A subgraph is defined by + (start_node, end_node). + 2. Skips over non-matchable subgraphs + """ + def __init__( + self, + gm: GraphModule, + non_matchable_functions: Set[NSNodeTargetType], + non_matchable_modules: Set[NSNodeTargetType], + non_matchable_methods: Set[NSNodeTargetType], + ): + self.gm: GraphModule = gm + self.non_matchable_functions: Set[NSNodeTargetType] = non_matchable_functions + self.non_matchable_modules: Set[NSNodeTargetType] = non_matchable_modules + self.non_matchable_methods: Set[NSNodeTargetType] = non_matchable_methods + self.seen_nodes: Set[Node] = set() + self.stack: List[Node] = [] + for start_node in _get_output_nodes(self.gm.graph): + self.stack.append(start_node) + + def __iter__(self): + return self + + def __next__(self) -> NSSubgraph: + """ + Returns the next matchable subgraph. + """ + while len(self.stack) > 0: + cur_end_node = self.stack.pop() + if cur_end_node in self.seen_nodes: + continue + + # for subgraphs which are single nodes, start_node == end_node + # for subgraphs with more than one node, start node != end_node + cur_start_node = cur_end_node + # Subgraphs like linear-relu have the base node as the start node. + # Subgraphs like dequantize-linear-relu-to(torch.float16) have the + # base node as the second node. + # The cur_base_op_node var will move to the actual node during + # the fusion matching later in this code block. 
+ cur_base_op_node = cur_end_node + + # Check for potential fusions. For now, we are greedy + # and always skip all non-base nodes of a fusion. For example, + # if we match linear-relu backwards, we will always skip the + # relu node and attempt to match the linear node. This can + # be made configurable later if needed. + for _reverse_fusion_ops, base_op_idx in get_reversed_fusions(): + is_match = end_node_matches_reversed_fusion( + cur_end_node, _reverse_fusion_ops, self.gm, self.seen_nodes) + if is_match: + # navigate to the base node + for rev_fusion_idx in range(len(_reverse_fusion_ops) - 1): + self.seen_nodes.add(cur_start_node) + # for now, assume that there are no other nodes + # which need to be added to the stack + cur_start_node = cur_start_node.args[0] # type: ignore[assignment] + # if the base op index matches the current node, set it + rev_base_op_idx = \ + len(_reverse_fusion_ops) - 2 - base_op_idx + if rev_fusion_idx == rev_base_op_idx: + cur_base_op_node = cur_start_node + break + + self.seen_nodes.add(cur_start_node) + # add args of previous nodes to stack + for arg in cur_start_node.all_input_nodes: + self._recursively_add_node_arg_to_stack(arg) + + # skip unmatchable nodes + # note: this check is done on the start_node, i.e. + # if we are matching linear-relu in reverse, this would do the matchable + # check on the linear + if not self._is_matchable(cur_base_op_node): + continue + + # If an observer or a fake_quant was not matched as a part of + # a pattern of multiple nodes, ignore it. One case where this is + # relevant is an observer on a graph input, which was added because + # it is necessary for the next node. 
+ if cur_end_node.op == 'call_module' and cur_start_node is cur_end_node: + maybe_obs = getattr_from_fqn(self.gm, cur_end_node.target) # type: ignore[arg-type] + if isinstance(maybe_obs, (ObserverBase, FakeQuantizeBase)): + continue + + return NSSubgraph( + start_node=cur_start_node, end_node=cur_end_node, + base_op_node=cur_base_op_node) + + raise StopIteration + + def _recursively_add_node_arg_to_stack(self, arg: Any) -> None: + """ + Adds all of the nodes in this arg to the stack, properly navigating + through list, dicts and tuples. + """ + if isinstance(arg, Node): + self.stack.append(arg) + elif isinstance(arg, torch.fx.immutable_collections.immutable_list) or type(arg) is tuple: + for inner_arg in arg: + self._recursively_add_node_arg_to_stack(inner_arg) + elif isinstance(arg, torch.fx.immutable_collections.immutable_dict): + for value in arg.values(): + self._recursively_add_node_arg_to_stack(value) + + def _is_matchable(self, node: Node) -> bool: + if node.op == 'call_function': + return node.target not in self.non_matchable_functions + elif node.op == 'call_module': + assert isinstance(node.target, str) + target_mod = getattr_from_fqn(self.gm, node.target) + return not \ + any(isinstance(target_mod, t) # type: ignore[arg-type] + for t in self.non_matchable_modules) + elif node.op == 'call_method': + return node.target not in self.non_matchable_methods + else: + return False + +class GraphMatchingException(Exception): + """ + Exception raised when two graphs cannot be matched. + """ + pass + +class SubgraphTypeRelationship(enum.Enum): + # same type, known + # example: F.linear and F.linear, or nn.Conv2d and nn.Conv2d + EQUAL = enum.auto() + # same type, but the type is not known to Numerical Suite + # (user defined type, etc). 
+ EQUAL_BUT_UKNOWN = enum.auto() + # known, same subgraph_relationship set, but not the same type + # example: F.linear and toq.linear + RELATED_BUT_NOT_EQUAL = enum.auto() + # not related + NOT_RELATED = enum.auto() + +def _get_subgraph_relationship_type( + subgraph_a: NSSubgraph, + subgraph_b: NSSubgraph, + gm_a: GraphModule, + gm_b: GraphModule, + type_a_related_to_b: Set[Tuple[NSNodeTargetType, NSNodeTargetType]], +) -> SubgraphTypeRelationship: + node_a = subgraph_a.base_op_node + node_b = subgraph_b.base_op_node + + # TODO(next): make this code handle matching by what is before the base op + if node_a.op != node_b.op: + if not ( + node_a.op in ('call_function', 'call_method') and + node_b.op in ('call_function', 'call_method') + ): + return SubgraphTypeRelationship.NOT_RELATED + + if node_a.op in ('call_function', 'call_method'): + key = (node_a.target, node_b.target) + + if key not in type_a_related_to_b: + if node_a.target == node_b.target: + return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN + else: + return SubgraphTypeRelationship.NOT_RELATED + # after this point, we are dealing with known types + + if node_a.target == node_b.target: + node_a_has_prev = subgraph_a.base_op_node == subgraph_a.start_node + node_b_has_prev = subgraph_b.base_op_node == subgraph_b.start_node + if node_a_has_prev and (not node_b_has_prev): + return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL + elif (not node_a_has_prev) and node_b_has_prev: + return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL + elif (not node_a_has_prev) and (not node_b_has_prev): + return SubgraphTypeRelationship.EQUAL + else: + # TODO(future PR): check for matches start_op_node and base_op_node + return SubgraphTypeRelationship.EQUAL + + if key in type_a_related_to_b: + return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL + else: + return SubgraphTypeRelationship.NOT_RELATED + elif node_a.op == 'call_module': + assert (subgraph_a.base_op_node == subgraph_a.start_node and + subgraph_b.base_op_node == 
subgraph_b.start_node), \ + "Matching call_module patterns where base_op_node != start_node is not supported yet" + # for call_module, we need to look up the modules to do the type check + assert isinstance(node_a.target, str) + mod_a = getattr_from_fqn(gm_a, node_a.target) + assert isinstance(node_b.target, str) + mod_b = getattr_from_fqn(gm_b, node_b.target) + + key = (type(mod_a), type(mod_b)) + + if key not in type_a_related_to_b: + if type(mod_a) == type(mod_b): + return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN + else: + return SubgraphTypeRelationship.NOT_RELATED + elif type(mod_a) == type(mod_b): + return SubgraphTypeRelationship.EQUAL + else: + return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL + + return SubgraphTypeRelationship.NOT_RELATED + +def _get_name_for_subgraph( + subgraph_a: NSSubgraph, + gm_a: GraphModule, + base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]], + existing_names: Set[str], +) -> str: + """ + Returns a unique name for a subgraph. This name is based on two things: + 1. the name of the set containing the underlying type of the base op in the + subgraph (i.e. 'torch.nn.functional.linear' if this is related to a linear op) + 2. the number of previous subgraphs with related underlying type of the base op + + For example, in the graph + + linear0 -> relu0 -> linear1 -> relu1 + + The subgraphs are (linear0, relu0) and (linear1, relu1). If we iterate + from the output node backwards, the name given to (linear1, relu1) will be + `base_op_torch.nn.functional.linear_0`, and the name given to (linear0, relu0) + will be `base_op_torch.nn.functional.linear_1`. + + Why are we not just using the node name? Answer: because of two requirements: + A. fusions must be supported + B. some Numeric Suite APIs can be called without having all of the models in memory + + For example, let's say we need to match nodes of + + (1) ... -> linear0 -> relu0 -> ... + + And + + (2) ... -> linear_relu0 -> ... 
+ + Without being able to inspect them together. With the current naming scheme, if + we iterate through both of these graphs in the same order, and assuming the rest + of the graphs match, both of these subgraphs will get the same name without + (1) and (2) knowing anything about each other. + """ + target_type = _get_node_target_type(subgraph_a.base_op_node, gm_a) + target_base_type = None + for base_name, sets_of_related_ops in base_name_to_sets_of_related_ops.items(): + if target_type in sets_of_related_ops: + target_base_type = base_name + target_base_name = 'base_op_' + str(target_base_type) + counter = 0 + proposed_name = target_base_name + '_' + str(counter) + while proposed_name in existing_names: + counter += 1 + proposed_name = target_base_name + '_' + str(counter) + existing_names.add(proposed_name) + return proposed_name + +def _get_node_target_type(node: Node, gm: GraphModule) -> Optional[NSNodeTargetType]: + if node.op in ('call_function', 'call_method'): + return node.target + elif node.op == 'call_module': + assert isinstance(node.target, str) + mod = getattr_from_fqn(gm, node.target) + return type(mod) + return None + +def get_matching_subgraph_pairs( + gm_a: GraphModule, + gm_b: GraphModule, + base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None, + unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None, +) -> Dict[str, Tuple[NSSubgraph, NSSubgraph]]: + """ + Matches matchable subgraphs of graph_a to graph_b. + + For a node, "matchable" is defined as a node which is not an observer, + fake_quants, quant or dequant. + + A subgraph can contain one or more nodes. A subgraph is matchable if + at least one node inside of it is matchable. Currently, all nodes in + a subgraph must be matchable (because we assume no observers will be + inserted in the middle of a fusion). + + A subgraph is defined by (start_node, end_node). 
We assume that only + start_node and end_node are linked with the surrounding graph, all other + nodes in a subgraph are self-contained. + + A pair of nodes is "related" if both nodes represent the same mathematical + operation across different quantization flavors. For example, + `F.linear` and `torch.ops.quantized.linear` are related, and + `F.linear` and `torch.nn.Conv` are not related. + + For each matchable pair of nodes node_a and node_b, they will match + if node_a and node_b are related. + + For graphs A and B, they will match iff: + 1. the number of matchable subgraphs in A and B is equivalent + 2. when iterating through the matchable subgraphs of A and B in the same order, each + corresponding pair of base nodes is related. + + This enables us to find the corresponding subgraphs between + graphs of related models. For example, if we had two graphs such as: + + graph_a: x0 -> conv_0 (type: nn.Conv2d) -> obs_0 -> x1 + w -/ + b -/ + + graph_b: x0 -> quant_0 -> qconv_0 (type: nnq.Conv2d) -> dequant_0 -> x1 + packed_params_0 -/ + + This function will return the following result: + { + 'conv_0': ( # the name of the node in graph_b + (conv_0, conv_0), # (start_node_a, end_node_a) + (qconv_0, qconv_0), # (start_node_b, end_node_b) + ), + } + + Or, if we have a fusion pattern, + + graph_a: x0 -> linear_0 -> relu_0 -> obs_0 -> x1 + w -/ + b -/ + + graph_b: x0 -> quant_0 -> linear_relu_0 -> dequant_0 -> x1 + packed_params_0 -/ + + This function will return the following result: + { + 'linear_relu_0': ( # the name of the node in graph_b + (linear_0, relu_0), # (start_node_a, end_node_a) + (linear_relu_0, linear_relu_0), # (start_node_b, end_node_b) + ), + } + """ + if unmatchable_types_map is None: + unmatchable_types_map = get_unmatchable_types_map() + non_matchable_functions = unmatchable_types_map['funs_unmatchable'] + non_matchable_modules = unmatchable_types_map['mods_unmatchable'] + non_matchable_methods = unmatchable_types_map['meths_unmatchable'] + + 
graph_a_iterator = _NSGraphMatchableSubgraphsIterator( + gm_a, non_matchable_functions, non_matchable_modules, + non_matchable_methods) + graph_b_iterator = _NSGraphMatchableSubgraphsIterator( + gm_b, non_matchable_functions, non_matchable_modules, + non_matchable_methods) + results = collections.OrderedDict() + if base_name_to_sets_of_related_ops is None: + base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops() + type_a_related_to_b = \ + get_type_a_related_to_b(base_name_to_sets_of_related_ops) + + existing_names_a: Set[str] = set() + existing_names_b: Set[str] = set() + + while True: + # fetch the next subgraphs from a and b + cur_subgraph_a, cur_subgraph_b = None, None + try: + cur_subgraph_a = next(graph_a_iterator) + except StopIteration: + pass + try: + cur_subgraph_b = next(graph_b_iterator) + except StopIteration: + pass + + # look up types of a and b for useful error messages + type_start_a, type_start_b = None, None + if cur_subgraph_a is not None: + type_start_a = _get_node_target_type(cur_subgraph_a.start_node, gm_a) + if cur_subgraph_b is not None: + type_start_b = _get_node_target_type(cur_subgraph_b.start_node, gm_b) + + # check for results and determine what to do next + if cur_subgraph_a is not None and cur_subgraph_b is not None: + # both nodes were fetched, check for subgraph_relationship + # note: subgraph_relationship is checked on the start node, i.e. + # if a linear-relu pattern is checked, we would check for subgraph_relationship + # of the linear + subgraph_relationship = _get_subgraph_relationship_type( + cur_subgraph_a, cur_subgraph_b, + gm_a, gm_b, type_a_related_to_b) + if subgraph_relationship == SubgraphTypeRelationship.NOT_RELATED: + msg = f""" +The subgraphs +({cur_subgraph_a}, {type_start_a}) and +({cur_subgraph_b}, {type_start_b}) +are not related. 
Please ensure that the two models you pass in have the same number +of subgraphs, and each pair of subgraphs is related to each other.""" + raise GraphMatchingException(msg) + elif subgraph_relationship == SubgraphTypeRelationship.EQUAL_BUT_UKNOWN: + # skip matching but unknown types + continue + key_name_a = _get_name_for_subgraph( + cur_subgraph_a, gm_a, base_name_to_sets_of_related_ops, + existing_names_a) + key_name_b = _get_name_for_subgraph( + cur_subgraph_b, gm_b, base_name_to_sets_of_related_ops, + existing_names_b) + assert key_name_a == key_name_b, \ + f"Subgraph names {key_name_a} and {key_name_b} do not match" + results[key_name_a] = (cur_subgraph_a, cur_subgraph_b) + continue + elif cur_subgraph_a is None and cur_subgraph_b is None: + # we reached the end of both graphs + break + else: + # only one node was fetched, no match possible, throw error + msg = f""" +Attempting to match +({cur_subgraph_a}, {type_start_a}) and +({cur_subgraph_b}, {type_start_b}), +one of which is empty. Please ensure that the two models you pass in have the same number +of subgraphs.""" + raise GraphMatchingException(msg) + + # The subgraph pairs are originally created by traversing the two graphs + # from the outputs to the inputs. Reverse the results to return the + # subgraphs in their order of execution. 
+ results = collections.OrderedDict(reversed(list(results.items()))) + + return results diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_passes.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_passes.py new file mode 100644 index 0000000000000000000000000000000000000000..fbd03426790d523ee1ef9c16ce44f924d82673d0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/ns/fx/graph_passes.py @@ -0,0 +1,950 @@ +import torch +from torch.fx import GraphModule, map_arg +from torch.fx.graph import Graph, Node +from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix + +from .utils import ( + get_node_first_input_and_output_type, + getattr_from_fqn, + NodeInputOrOutputType, + return_first_non_observer_node, + get_number_of_non_param_args, + get_target_type_str, + get_arg_indices_of_inputs_to_log, + get_node_input_qparams, + op_type_supports_shadowing, + get_normalized_nth_input, +) + +from .ns_types import ( + NSSingleResultValuesType, + NSSubgraph, + NSNodeTargetType, +) +from torch.ao.ns.fx.mappings import ( + get_node_type_to_io_type_map, +) +from torch.ao.quantization.observer import _is_activation_post_process + +from typing import Dict, Tuple, Callable, List, Any, Union, Optional, Set + +def _maybe_get_fqn(node: Node, gm: GraphModule) -> Optional[str]: + fqn = None + if hasattr(gm, '_node_name_to_scope'): + # fqn on observers is not present, because they do not + # exist when the fqns are created during tracing. If this is + # an observer, get the fqn of the node being observed. 
+ node_to_use_for_fqn = node + if node.op == 'call_module': + assert isinstance(node.target, str) + module = getattr_from_fqn(gm, node.target) + if _is_activation_post_process(module): + node_to_use_for_fqn = get_normalized_nth_input(node, gm, 0) + fqn = gm._node_name_to_scope[node_to_use_for_fqn.name][0] # type: ignore[index] + return fqn # type: ignore[return-value] + +def _insert_logger_after_node( + node: Node, + gm: GraphModule, + logger_cls: Callable, + logger_node_name_suffix: str, + ref_node_name: str, + model_name: str, + ref_name: str, + ref_node_target_type: str, + results_type: str, + index_within_arg: int, + index_of_arg: int, + fqn: Optional[str], +) -> Node: + """ + Given a starting graph of + + prev_node -> node -> next_node + + This function creates a new logger_cls obj and adds it + after node, resulting in + + prev_node -> node -> logger_obj -> next_node + """ + # create new name + logger_node_name = \ + get_new_attr_name_with_prefix(node.name + logger_node_name_suffix)(gm) + target_type = get_target_type_str(node, gm) + # create the logger object + logger_obj = logger_cls( + ref_node_name, node.name, model_name, ref_name, target_type, + ref_node_target_type, + results_type, index_within_arg, index_of_arg, fqn) + # attach the logger object to the parent module + setattr(gm, logger_node_name, logger_obj) + logger_node = node.graph.create_node( + 'call_module', logger_node_name, (node,), {}) + return logger_node + +def add_loggers_to_model( + gm: GraphModule, + node_to_instrument_inputs_to_ref_node_name: Dict[Node, Tuple[str, str]], + node_to_instrument_outputs_to_ref_node_name: Dict[Node, Tuple[str, str]], + logger_cls: Callable, + model_name: str, +) -> GraphModule: + """ + Takes the graph of gm, adds loggers to the output + of each node in nodes_to_instrument. Returns a GraphModule with the new + graph. 
+ """ + + new_graph = Graph() + env: Dict[str, Any] = {} + modules = dict(gm.named_modules()) + + def load_arg(a): + return map_arg(a, lambda node: env[node.name]) + + for node in gm.graph.nodes: + if node.op == 'output': + new_graph.output(map_arg(get_normalized_nth_input(node, gm, 0), load_arg)) + continue + + if ( + (node in node_to_instrument_inputs_to_ref_node_name) or + (node in node_to_instrument_outputs_to_ref_node_name) + ): + fqn = _maybe_get_fqn(node, gm) + + if node in node_to_instrument_inputs_to_ref_node_name: + ref_name, ref_node_type = node_to_instrument_inputs_to_ref_node_name[node] + # Ops such add and mul are special because either + # one or two of the first two arguments can be tensors, + # and if one argument is a tensor it can be first or + # second (x + 1 versus 1 + x). + arg_indices_to_log = get_arg_indices_of_inputs_to_log(node) + for node_arg_idx in arg_indices_to_log: + node_arg = get_normalized_nth_input(node, gm, node_arg_idx) + if type(node_arg) == Node: + # create a single input logger + prev_node = env[node_arg.name] + env[node_arg.name] = _insert_logger_after_node( + prev_node, gm, logger_cls, '_ns_logger_', node.name, + model_name, ref_name, ref_node_type, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=0, index_of_arg=node_arg_idx, + fqn=fqn) + elif type(node_arg) == torch.fx.immutable_collections.immutable_list: + # create N input loggers, one for each node + for arg_idx, arg in enumerate(node_arg): # type: ignore[var-annotated, arg-type] + prev_node = env[arg.name] + env[prev_node.name] = _insert_logger_after_node( + prev_node, gm, logger_cls, '_ns_logger_', node.name, + model_name, ref_name, ref_node_type, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=arg_idx, index_of_arg=node_arg_idx, + fqn=fqn) + else: + pass + + # ensure env is populated with base node + # Note: runs for both inputs and outputs + env[node.name] = new_graph.node_copy(node, load_arg) + + if node in 
node_to_instrument_outputs_to_ref_node_name: + ref_name, ref_node_type = node_to_instrument_outputs_to_ref_node_name[node] + # add the logger after the base node + env[node.name] = _insert_logger_after_node( + env[node.name], gm, logger_cls, '_ns_logger_', node.name, + model_name, ref_name, ref_node_type, + NSSingleResultValuesType.NODE_OUTPUT.value, + index_within_arg=0, index_of_arg=0, fqn=fqn) + + else: + env[node.name] = new_graph.node_copy(node, load_arg) + + new_gm = GraphModule(gm, new_graph) + return new_gm + +def _insert_quantize_per_tensor_node( + prev_node_c: Node, + node_a: Node, + gm_b: GraphModule, + graph_c: Graph, + scale: Union[torch.Tensor, float], + zero_point: Union[torch.Tensor, int], + dtype_cast_name: str, +) -> Node: + # copy scale + scale_node_name = \ + get_new_attr_name_with_prefix( + node_a.name + '_input_scale_')(gm_b) + setattr(gm_b, scale_node_name, scale) + scale_node = graph_c.create_node( + 'get_attr', scale_node_name, (), {}, scale_node_name) + # copy zero_point + zero_point_node_name = \ + get_new_attr_name_with_prefix( + node_a.name + '_input_zero_point_')(gm_b) + setattr(gm_b, zero_point_node_name, zero_point) + zero_point_node = graph_c.create_node( + 'get_attr', zero_point_node_name, (), {}, zero_point_node_name) + # create the quantize_per_tensor call + return graph_c.create_node( + 'call_function', torch.quantize_per_tensor, + (prev_node_c, scale_node, zero_point_node, torch.quint8), {}, + dtype_cast_name) + +def _insert_dtype_cast_after_node( + node_a: Node, + node_c: Node, + prev_node_c: Union[Node, List[Node]], + gm_a: GraphModule, + gm_b: GraphModule, + graph_c: Graph, + node_name_prefix: str, + logger_cls: Callable, + node_type_to_io_type_map: Dict[str, Set[NSNodeTargetType]], +) -> Union[Node, List[Node]]: + """ + Given a starting graph C (derived from graph B) of + + ... -> prev_node_c -> node_c -> ... 
+ + And a corresponding related node_a, inserts the correct dtype + cast node after prev_node_c to cast into the dtype expected + by node_a, resulting in: + + dtype_cast + / + ... -> prev_node_c -> node_c -> ... + + For example, if node_c is an int8 op and node_a is an fp32 op, this function + will insert a dequant. + """ + dtype_cast_op = None + dtype_cast_mod_cls = None + dtype_cast_method = None + dtype_cast_method_dtype = None + dtype_cast_scale = None + dtype_cast_zero_point = None + node_input_type_a, _node_output_type_a = \ + get_node_first_input_and_output_type( + node_a, gm_a, logger_cls, node_type_to_io_type_map) + node_input_type_c, _node_output_type_c = \ + get_node_first_input_and_output_type( + node_c, gm_b, logger_cls, node_type_to_io_type_map) + + if ( + (node_input_type_a == NodeInputOrOutputType.FP32 and + node_input_type_c == NodeInputOrOutputType.INT8) or + (node_input_type_a == NodeInputOrOutputType.FP32 and + node_input_type_c == NodeInputOrOutputType.FP16) or + # TODO(future PR): determine the actual dtype of node_c, + # the current code only works because dequantize works with + # multiple input dtypes. + (node_input_type_a == NodeInputOrOutputType.FP32 and + node_input_type_c == NodeInputOrOutputType.FP32_OR_INT8) + ): + dtype_cast_op = torch.dequantize + elif ( + node_input_type_a == node_input_type_c and + node_input_type_a != NodeInputOrOutputType.UNKNOWN + ): + dtype_cast_mod_cls = torch.nn.Identity + elif ( + node_input_type_a == NodeInputOrOutputType.INT8 and + node_input_type_c == NodeInputOrOutputType.FP32 + ): + # int8 shadows fp32, the dtype cast needs to quantize to int8 + # with the right qparams. 
+ node_a_input_qparams = get_node_input_qparams( + node_a, gm_a, node_type_to_io_type_map) + if node_a_input_qparams is not None: + dtype_cast_op = torch.quantize_per_tensor # type: ignore[assignment] + dtype_cast_scale, dtype_cast_zero_point = node_a_input_qparams + elif ( + node_input_type_a == NodeInputOrOutputType.FP16 and + node_input_type_c == NodeInputOrOutputType.FP32 + ): + dtype_cast_method = 'to' + dtype_cast_method_dtype = torch.float16 + else: + raise AssertionError( + f"dtype cast from {node_input_type_c} {node_c.format_node()} to " + + f"{node_input_type_a} {node_a.format_node()} needs to be implemented") + + if isinstance(prev_node_c, Node): + new_dtype_cast_name = \ + get_new_attr_name_with_prefix(node_name_prefix)(gm_b) + if dtype_cast_op: + if dtype_cast_scale is not None and dtype_cast_zero_point is not None: + return _insert_quantize_per_tensor_node( + prev_node_c, node_a, gm_b, graph_c, dtype_cast_scale, + dtype_cast_zero_point, new_dtype_cast_name) + else: + return graph_c.create_node( + 'call_function', dtype_cast_op, (prev_node_c,), {}, + new_dtype_cast_name) + elif dtype_cast_method: + return graph_c.create_node( + 'call_method', dtype_cast_method, + (prev_node_c, dtype_cast_method_dtype), {}, new_dtype_cast_name) + else: + assert dtype_cast_mod_cls + dtype_cast_mod = dtype_cast_mod_cls() + setattr(gm_b, new_dtype_cast_name, dtype_cast_mod) + return graph_c.create_node( + 'call_module', new_dtype_cast_name, (prev_node_c,), {}, + new_dtype_cast_name) + elif isinstance(prev_node_c, list): + results = [] + for prev_node_c_inner in prev_node_c: + new_dtype_cast_name = \ + get_new_attr_name_with_prefix(node_name_prefix)(gm_b) + if dtype_cast_op: + # TODO(future PR): add handling for quantize_per_tensor + new_dtype_cast_node = graph_c.create_node( + 'call_function', dtype_cast_op, (prev_node_c_inner,), {}, + new_dtype_cast_name) + results.append(new_dtype_cast_node) + else: + assert dtype_cast_mod_cls + dtype_cast_mod = dtype_cast_mod_cls() + 
setattr(gm_b, new_dtype_cast_name, dtype_cast_mod) + new_dtype_cast_node = graph_c.create_node( + 'call_module', new_dtype_cast_name, (prev_node_c_inner,), {}, + new_dtype_cast_name) + results.append(new_dtype_cast_node) + return results + else: + raise AssertionError(f"type f{type(prev_node_c)} is not handled") + +# TODO(future PR): look into using copy_node API instead +def _copy_node_from_a_to_c( + node_a: Node, + gm_a: GraphModule, + gm_b: GraphModule, + graph_c: Graph, +) -> Node: + """ + Simple copy of node_a to graph_c. + """ + if node_a.op == 'get_attr': + node_a_copy_name = \ + get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b) + node_a_obj = getattr_from_fqn(gm_a, node_a.target) # type: ignore[arg-type] + if torch.is_tensor(node_a_obj): + node_a_obj = node_a_obj.detach() + setattr(gm_b, node_a_copy_name, node_a_obj) + node_a_copy = graph_c.create_node( + node_a.op, node_a_copy_name, (), {}, node_a_copy_name) + return node_a_copy + elif node_a.op == 'call_method': + assert node_a.target in ('dequantize', 'to'), \ + f"target {node_a.target} is not implemented" + if node_a.target == 'dequantize': + arg_copy = _copy_node_from_a_to_c( + get_normalized_nth_input(node_a, gm_a, 0), + gm_a, gm_b, graph_c) # type: ignore[arg-type] + node_a_copy_name = \ + get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b) + node_a_copy = graph_c.create_node( + node_a.op, node_a.target, (arg_copy,), {}, node_a_copy_name) + return node_a_copy + else: # to + arg_copy = _copy_node_from_a_to_c( + get_normalized_nth_input(node_a, gm_a, 0), gm_a, gm_b, graph_c) # type: ignore[arg-type] + node_a_copy_name = \ + get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b) + node_a_copy = graph_c.create_node( + node_a.op, node_a.target, + (arg_copy, get_normalized_nth_input(node_a, gm_a, 1)), + {}, node_a_copy_name) + return node_a_copy + + else: + raise AssertionError( + f"handling of node {node_a.format_node()} with op {node_a.op} is not implemented") 
+ +def _can_insert_copy_of_subgraph_a( + subgraph_a: NSSubgraph, + gm_a: GraphModule, + num_non_param_args_node_a: int, +) -> bool: + """ + This function returns `False` if the input subgraph cannot be copied by + `_insert_copy_of_subgraph_a_after_input_node_c`. This usually means + that there is a corner case logic for which copy is not yet implemented. + """ + # populate the list of nodes we need to check + nodes = [] + cur_node = subgraph_a.end_node + while cur_node != subgraph_a.start_node: + nodes.append(cur_node) + cur_node = get_normalized_nth_input(cur_node, gm_a, 0) # type: ignore[assignment] + nodes.append(cur_node) + nodes.reverse() + + def _can_insert(node_a_arg, gm_a): + if isinstance(node_a_arg, Node): + arg_a = return_first_non_observer_node(node_a_arg, gm_a) + if arg_a.op == 'call_method': + return arg_a.target in ('dequantize', 'to') + elif arg_a.op == 'get_attr': + return True + else: + return False + elif isinstance(node_a_arg, (list, tuple)): + for el in node_a_arg: + if not isinstance(el, Node): + return False + return True + + # For each node, check if we handle the copy behavior. This follows the + # logic in `_insert_copy_of_subgraph_a_after_input_node_c`. 
+ for node_a in nodes: + + local_num_non_param_args_node_a = num_non_param_args_node_a \ + if node_a is nodes[0] else 1 + + norm_args_kwargs = node_a.normalized_arguments( + gm_a, normalize_to_only_use_kwargs=True) + if norm_args_kwargs is not None: + norm_args, norm_kwargs = norm_args_kwargs + else: + norm_args, norm_kwargs = node_a.args, node_a.kwargs + + cur_idx = 0 + + while cur_idx < len(norm_args): + if cur_idx == 0: + pass + elif cur_idx == 1 and local_num_non_param_args_node_a == 2: + pass + else: + if not _can_insert(norm_args[cur_idx], gm_a): + return False + cur_idx += 1 + + for kwarg_val in norm_kwargs.values(): + # stitch the inputs from base graph + if cur_idx == 0: + pass + elif cur_idx == 1 and local_num_non_param_args_node_a == 2: + pass + else: + if not _can_insert(kwarg_val, gm_a): + return False + cur_idx += 1 + + return True + +def _insert_copy_of_subgraph_a_after_input_node_c( + input_node_c: Union[Node, List[Node]], + input_node_c_2: Optional[Union[Node, List[Node]]], + subgraph_a: NSSubgraph, + gm_a: GraphModule, + gm_b: GraphModule, + node_name_prefix: str, +) -> Node: + """ + TODO(before land): real docblock + """ + if isinstance(input_node_c, Node): + graph_c = input_node_c.graph + else: + assert isinstance(input_node_c, list) + graph_c = input_node_c[0].graph + + # create a sequential list of the subgraphs' nodes from start to end, + # because we need to add the nodes to graph C in non-reverse order + nodes_of_a = [subgraph_a.end_node] + cur_node = subgraph_a.end_node + while cur_node != subgraph_a.start_node: + cur_node = get_normalized_nth_input(cur_node, gm_a, 0) # type: ignore[assignment] + nodes_of_a.insert(0, cur_node) + + # go through nodes of a in order, and insert them into the graph of c + # sequentially + cur_node_a = nodes_of_a[0] + cur_node_c = _insert_copy_of_node_a_after_input_node_c( + input_node_c, + input_node_c_2, + cur_node_a, + gm_a, + gm_b, + node_name_prefix) + for cur_idx_a in range(1, len(nodes_of_a)): + 
cur_node_a = nodes_of_a[cur_idx_a] + prev_node_c = cur_node_c # previous added node is the input to next node + cur_node_c = _insert_copy_of_node_a_after_input_node_c( + prev_node_c, + # TODO(future PR): enable multiple inputs for nodes which are not at start of subgraph + None, + cur_node_a, + gm_a, + gm_b, + node_name_prefix) + # return the last inserted node + return cur_node_c + + +def _insert_copy_of_node_a_after_input_node_c( + input_node_c: Union[Node, List[Node]], + input_node_c_2: Optional[Union[Node, List[Node]]], + node_a: Node, + gm_a: GraphModule, + gm_b: GraphModule, + node_name_prefix: str, +) -> Node: + """ + Assume that node_a from graph_a has + args (input, (input2)?, arg1, ...), and + kwargs {kw0: kwarg0, ...} + + Note: input2 is optional. If it equals to None, we assume that the op + has a single non-param input. If it is specified, we assume that the op + has two non-param inputs. + + Copies the underlying values of arg1..argn and kwarg0..kwargn into gm_b, + and creates the corresponding nodes in graph_c. Note: observers are ignored, + so if an arg is an observer we navigate up until we find a non-observer parent. + + If node_a is a call_module, points the module pointed to by node_a to gm_b. + + Creates the copy of node_a in graph_c, with input as the first arg, + and all other args and kwargs pointing to the copies of the objects + in gm_b created above. + + An example in pictures: + + graph A: + ======== + + input -------------> node_a + / / / + (input_2)?----------/ / / + / / + weight -> weight_obs / + / + bias ---------------- + + graph C (derived from B): + ========================= + + input_node_c --> node_a_copy + / / / + (input_node_c_2)? 
/ / + / / + weight_copy ----/ / + / + bias_copy ------/ + """ + if isinstance(input_node_c, Node): + graph_c = input_node_c.graph + else: + assert isinstance(input_node_c, list) + graph_c = input_node_c[0].graph + + norm_args_kwargs = node_a.normalized_arguments( + gm_a, normalize_to_only_use_kwargs=True) + if norm_args_kwargs is not None: + norm_args, norm_kwargs = norm_args_kwargs + else: + norm_args, norm_kwargs = node_a.args, node_a.kwargs + + new_args = [] + new_kwargs = {} + + def _copy_arg(arg): + # copy the other inputs from the other graph + if isinstance(arg, Node): + arg = return_first_non_observer_node(arg, gm_a) + arg = _copy_node_from_a_to_c(arg, gm_a, gm_b, graph_c) + return arg + elif isinstance(arg, (int, float, torch.dtype)): + return arg + elif isinstance(kwarg_val, (list, tuple)): + for el in kwarg_val: + assert not isinstance(el, Node), \ + "handling of Node inside list is not implemented" + return arg + else: + raise AssertionError( + f"handling for kwarg of type {type(kwarg_val)} is not implemented") + + cur_idx = 0 + + while cur_idx < len(norm_args): + if cur_idx == 0: + new_arg = input_node_c + elif cur_idx == 1 and input_node_c_2 is not None: + new_arg = input_node_c_2 + else: + new_arg = _copy_arg(norm_args[cur_idx]) + new_args.append(new_arg) + cur_idx += 1 + + for kwarg_name, kwarg_val in norm_kwargs.items(): + # stitch the inputs from base graph + if cur_idx == 0: + new_kwargs[kwarg_name] = input_node_c + elif cur_idx == 1 and input_node_c_2 is not None: + new_kwargs[kwarg_name] = input_node_c_2 + else: + new_kwargs[kwarg_name] = _copy_arg(kwarg_val) + cur_idx += 1 + + new_args = tuple(new_args) # type: ignore[assignment] + + node_a_shadows_c_name = \ + get_new_attr_name_with_prefix(node_name_prefix)(gm_b) + + if node_a.op == 'call_module': + # if target is a module, we point to the module from gm_b + new_mod_copy_name = \ + get_new_attr_name_with_prefix(node_name_prefix)(gm_b) + # fetch the corresponding module from gm_a + assert 
isinstance(node_a.target, str) + mod_a = getattr_from_fqn(gm_a, node_a.target) + setattr(gm_b, new_mod_copy_name, mod_a) + node_a_shadows_c = graph_c.create_node( + node_a.op, new_mod_copy_name, new_args, + new_kwargs, node_a_shadows_c_name) + return node_a_shadows_c + else: + assert node_a.op in ('call_function', 'call_method') + node_a_shadows_c = graph_c.create_node( + node_a.op, node_a.target, new_args, + new_kwargs, node_a_shadows_c_name) + return node_a_shadows_c + +def create_a_shadows_b( + name_a: str, + gm_a: GraphModule, + name_b: str, + gm_b: GraphModule, + matched_subgraph_pairs: Dict[str, Tuple[NSSubgraph, NSSubgraph]], + logger_cls: Callable, + should_log_inputs: bool, + node_type_to_io_type_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None, +) -> GraphModule: + """ + Creates a new GraphModule consisting of the graph of C, with the meaningful + nodes of A shadowing the corresponding nodes of B. For example, + + Graph A: + a0 -> op0_fp32 -> a1 -> op1_fp32 -> a2 + + Graph B: + b0 -> op0_int8 -> b1 -> op1_int8 -> b2 + + matched_node_pairs: {'op0': (op0_fp32, op0_int8), 'op1': (op1_fp32, op1_int8)} + + Graph C (A shadows B): + + / dequant0 -> op0_fp32 -> logger_a_0 / dequant_1 -> op1_fp32 -> logger_a_1 + / / + b0 -------------> op0_int8 -> logger_b_0 --------------> op1_int8 -> logger_b_1 + + In a nutshell, this function does the following for each node pair: + * copies the necessary attributes and modules from gm_a to gm_b, + keeping names unique + * adds a dtype cast op (dequant, quant, etc) + * adds a copy of node_a in gm_b's graph + * adds loggers to the outputs of node_a and node_b + """ + + if node_type_to_io_type_map is None: + node_type_to_io_type_map = get_node_type_to_io_type_map() + + # graph_c is the graph created from copying the nodes of graph_b and inserting + # the shadows with the nodes copied from graph_a + graph_c = Graph() + env_c: Dict[str, Any] = {} + modules = dict(gm_b.named_modules()) + + def load_arg(a): + return map_arg(a, 
lambda node: env_c[node.name]) + + start_node_b_to_matched_subgraph_a_and_name = {} + end_node_b_to_matched_subgraph_a_and_name = {} + for match_name, match in matched_subgraph_pairs.items(): + subgraph_a, subgraph_b = match + ref_node_type_a = get_target_type_str(subgraph_a.base_op_node, gm_a) + ref_node_type_b = get_target_type_str(subgraph_b.base_op_node, gm_b) + start_node_b_to_matched_subgraph_a_and_name[subgraph_b.start_node] = \ + (subgraph_a, match_name, ref_node_type_a, ref_node_type_b) + end_node_b_to_matched_subgraph_a_and_name[subgraph_b.end_node] = \ + (subgraph_a, match_name, ref_node_type_a, ref_node_type_b) + + for node_b in gm_b.graph.nodes: + if node_b.op == 'output': + graph_c.output(map_arg(node_b.args[0], load_arg)) + continue + + # calculate the flags to determine what to do with this node + node_b_is_start_node = node_b in start_node_b_to_matched_subgraph_a_and_name + node_b_is_end_node = node_b in end_node_b_to_matched_subgraph_a_and_name + + if (node_b_is_start_node or node_b_is_end_node): + + if node_b_is_start_node: + subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \ + start_node_b_to_matched_subgraph_a_and_name[node_b] + else: + assert node_b_is_end_node + subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \ + end_node_b_to_matched_subgraph_a_and_name[node_b] + + all_op_types_support_shadowing = ( + op_type_supports_shadowing(subgraph_a.start_node) and + op_type_supports_shadowing(node_b) + ) + if not all_op_types_support_shadowing: + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unsupported') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + + # For both start_node and end_node verify that we know how to do + # the dtype cast. If we do not, skip. 
+ node_input_type_a, node_output_type_a = \ + get_node_first_input_and_output_type( + subgraph_a.start_node, gm_a, logger_cls, + node_type_to_io_type_map) + node_input_type_b, node_output_type_b = \ + get_node_first_input_and_output_type( + node_b, gm_b, logger_cls, + node_type_to_io_type_map) + node_io_types_known_a_and_b = ( + node_input_type_a != NodeInputOrOutputType.UNKNOWN and + node_output_type_a != NodeInputOrOutputType.UNKNOWN and + node_input_type_b != NodeInputOrOutputType.UNKNOWN and + node_output_type_b != NodeInputOrOutputType.UNKNOWN + ) + if not node_io_types_known_a_and_b: + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unknown dtype cast') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + + # If we are shadowing from fp32 to int8, we need to insert + # quantize_per_tensor call with qparams from the previous node. + # Only do this if we are able to infer these qparams from the graph. 
+ if ( + node_input_type_a == NodeInputOrOutputType.INT8 and + node_input_type_b == NodeInputOrOutputType.FP32 + ): + node_a_input_qparams = get_node_input_qparams( + subgraph_a.start_node, gm_a, node_type_to_io_type_map) + if not node_a_input_qparams: + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unknown input qparams') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + + num_non_param_args_node_a = \ + get_number_of_non_param_args(subgraph_a.start_node, gm_a) + if not _can_insert_copy_of_subgraph_a(subgraph_a, gm_a, num_non_param_args_node_a): + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unhandled logic in subgraph copy') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + + fqn_base_a = _maybe_get_fqn(subgraph_a.base_op_node, gm_a) + fqn_base_b = _maybe_get_fqn(subgraph_b.base_op_node, gm_b) # type: ignore[possibly-undefined] + + if node_b_is_start_node: + + # if necessary, log the input of node_c + if should_log_inputs: + prev_node_b = get_normalized_nth_input(node_b, gm_b, 0) + if isinstance(prev_node_b, Node): + prev_node_c = env_c[prev_node_b.name] + env_c[prev_node_c.name] = _insert_logger_after_node( + prev_node_c, gm_b, logger_cls, '_ns_logger_b_inp_', + node_b.name, name_b, ref_name, ref_node_type_b, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=0, index_of_arg=0, + fqn=fqn_base_b) + elif isinstance(prev_node_b, list): + # first, save the prev_node instances, because they + # will be overwritten in the env after the first logger + # is added + prev_node_c_list = [env_c[arg.name] for arg in prev_node_b] + + for arg_idx, arg in enumerate(prev_node_b): + prev_node_c = prev_node_c_list[arg_idx] + env_c[prev_node_c.name] = _insert_logger_after_node( + 
prev_node_c, gm_b, logger_cls, '_ns_logger_b_inp_', + node_b.name, name_b, ref_name, ref_node_type_b, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=arg_idx, index_of_arg=0, + fqn=fqn_base_b) + else: + # logging of inputs which are not lists is not supported yet + raise AssertionError(f"type {type(prev_node_b)} is not handled yet") + # subgraph so far: + # + # (prev_node_c)+ -> (logger_c_input)? + + # Note: this if statement is always True, spelling it out to clarify code + # intent. + if node_b_is_start_node or node_b_is_end_node: + # ensure env_c is populated with base node + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + node_c = env_c[node_b.name] + + # after this point, + # + # node_a is the original node from graph_a, with parent module gm_a + # node_b is the original node from graph_b, with parent module gm_b + # node_c is the copy of node_b in graph_c + # + # subgraph so far: + # + # (prev_node_c)+ -> (logger_c_input)? -> node_start_c + + if node_b_is_start_node: + + # cast dtype from the dtype of node_c's input to the dtype of + # node_a's input (dequant, etc) + # prev_node_c = node_c.args[0] + prev_node_c = get_normalized_nth_input(node_c, gm_b, 0) # type: ignore[possibly-undefined] + if should_log_inputs: + # skip the input logger when inserting a dtype cast + if isinstance(prev_node_c, Node): + prev_node_c = get_normalized_nth_input(node_c, gm_b, 0) + elif isinstance(prev_node_c, list): + prev_node_c = [get_normalized_nth_input(arg, gm_b, 0) for arg in prev_node_c] + dtype_cast_node = _insert_dtype_cast_after_node( + subgraph_a.start_node, node_c, prev_node_c, gm_a, gm_b, graph_c, + node_b.name + '_dtype_cast_', logger_cls, + node_type_to_io_type_map) + # note: not inserting to env_c because all nodes which use the dtype + # casts are copied from graph_a + # + # subgraph so far: + # + # (dtype_cast_node)+ + # / + # (prev_node_c)+ -> (logger_c_input)? 
-> node_start_c + + # if input logging is enabled, log the input to the subgraph + if should_log_inputs: + # TODO: explain this + ref_node_name = '' + if isinstance(dtype_cast_node, Node): + dtype_cast_node = _insert_logger_after_node( + dtype_cast_node, gm_b, logger_cls, '_ns_logger_a_inp_', + ref_node_name, name_a, ref_name, ref_node_type_a, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=0, index_of_arg=0, + fqn=fqn_base_a) + input_logger: Union[Node, List[Node]] = dtype_cast_node + else: + assert isinstance(dtype_cast_node, list) + new_loggers = [] + for dtype_cast_idx, dtype_cast_node_inner in enumerate(dtype_cast_node): + dtype_cast_logger = _insert_logger_after_node( + dtype_cast_node_inner, gm_b, logger_cls, '_ns_logger_a_inp_', + ref_node_name, name_a, ref_name, ref_node_type_a, + NSSingleResultValuesType.NODE_INPUT.value, + index_within_arg=dtype_cast_idx, + index_of_arg=0, + fqn=fqn_base_a) + new_loggers.append(dtype_cast_logger) + dtype_cast_node = new_loggers + input_logger = dtype_cast_node + # subgraph so far: + # + # (dtype_cast_node)+ -> (logger_a_input)? + # / + # prev_node_c -> (logger_c_input)? -> node_start_c + + # hook up the new mod_a copy to be in the graph, receiving the + # same inputs as mod_b does, with dtype cast to match a + # Some ops, such as LSTMs, have two non-param inputs. If we have + # such an op, pass the second param as well. Note: dtype casting + # for the second param is not implemented yet, it can be added + # later if there is a use case. 
+ node_c_second_non_param_arg = None + num_non_param_args_node_a = get_number_of_non_param_args(subgraph_a.start_node, gm_a) + if num_non_param_args_node_a == 2: + # node_c_second_non_param_arg = node_c.args[1] + node_c_second_non_param_arg = get_normalized_nth_input(node_c, gm_b, 1) + node_a_shadows_c = _insert_copy_of_subgraph_a_after_input_node_c( + dtype_cast_node, node_c_second_non_param_arg, + subgraph_a, gm_a, gm_b, node_c.name + '_shadow_copy_') + env_c[node_a_shadows_c.name] = node_a_shadows_c + # subgraph so far: + # + # dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy(args/kwargs not shown) + # / + # (prev_node_c)+ -> (logger_c_input)? -> node_start_c + + if should_log_inputs: + # When we created the input logger, we left the ref_node_name + # as an empty string, because the subgraph copy did not exist + # yet. Now that the subgraph copy exists, we modify this name + # to its true value. + # Note: the alternative to this is to create the input logger + # after creating the subgraph, which is slightly more + # complicated. This is the lesser of two evils. 
+ # input_logger = env_c[dtype_cast_node.name] + # Find the first node in the subgraph + cur_node = node_a_shadows_c + while get_normalized_nth_input(cur_node, gm_b, 0) != input_logger: # type: ignore[possibly-undefined] + cur_node = get_normalized_nth_input(cur_node, gm_b, 0) # type: ignore[assignment] + if isinstance(input_logger, Node): + input_logger_mod = getattr(gm_b, input_logger.name) + input_logger_mod.ref_node_name = cur_node.name + else: + assert isinstance(input_logger, list) + for input_logger_inner in input_logger: + input_logger_mod = getattr(gm_b, input_logger_inner.name) + input_logger_mod.ref_node_name = cur_node.name + + # hook up a logger to the mod_a copy + env_c[node_a_shadows_c.name] = _insert_logger_after_node( + env_c[node_a_shadows_c.name], gm_b, logger_cls, '_ns_logger_a_', + node_a_shadows_c.name, name_a, ref_name, ref_node_type_a, + NSSingleResultValuesType.NODE_OUTPUT.value, + index_within_arg=0, index_of_arg=0, + fqn=fqn_base_a) + # subgraph so far: + # + # dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a + # / + # (prev_node_c)+ -> (logger_c_input)? -> node_start_c + + if node_b_is_end_node: + + # hook up a logger to the mod_b copy + env_c[node_b.name] = _insert_logger_after_node( + env_c[node_b.name], gm_b, logger_cls, '_ns_logger_b_', + node_b.name, name_b, ref_name, ref_node_type_b, + NSSingleResultValuesType.NODE_OUTPUT.value, + index_within_arg=0, index_of_arg=0, + fqn=fqn_base_b) + # subgraph so far: + # + # dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a + # / + # (prev_node_c+) -> (logger_c_input)? -> node_start_c -> ... -> node_end_c -> logger_c + # + # Note: node_start_c may be the same node as node_end_c, or they + # may have nodes inbetween. 
+ + else: + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + + gm_c = GraphModule(gm_b, graph_c) + return gm_c diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_safeguard.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_safeguard.py new file mode 100644 index 0000000000000000000000000000000000000000..854fff2bcca279ae4861ec9d3d4a2f7038540c12 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_safeguard.py @@ -0,0 +1,42 @@ +import torch +from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode +from torch.overrides import TorchFunctionMode + + +class AutogradStateOpsFailSafeguard(TorchFunctionMode): + """ + Detect grad state ops during exporting the graph and fail the process by + raising an error, to avoid unexpected behavior. Those grad mode ops could be: + `torch.no_grad` + `torch.enable_grad` + `torch.set_grad_enabled` + + Export with predispatch mode is exempted. + """ + + def __torch_function__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + unsupported_grad_mode_ops = [ + torch._C._set_grad_enabled, + ] + # It's only enabled while tracing, by confirming the torch dispatch mode is + # any active PROXY. This is to allow the autograd ops out of tracing. + current_state = torch._C.is_grad_enabled() + if func in unsupported_grad_mode_ops: + assert len(args) == 1 + changed_state = args[0] + mode = torch._C._get_dispatch_mode(torch._C._TorchDispatchModeKey.PROXY) + # Intend to check if it's not the pre_dispatch mode. It's allowed to use + # autograd ops in pre_dispatch mode, e.g. `torch.no_grad` + if ( + mode + and isinstance(mode, ProxyTorchDispatchMode) + and not mode.pre_dispatch + and changed_state != current_state + ): + raise RuntimeError( + f"Encountered autograd state manager op {func} trying to change global autograd state " + "while exporting. 
This is unsafe because we don't capture this op in torch.export " + "today, hence we can't reflect the user intention soundly." + ) + return func(*args, **kwargs) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_tree_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_tree_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a1615ebd5f586ce5216c65d40162a74ffb7bc5d1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/export/_tree_utils.py @@ -0,0 +1,64 @@ +from typing import Any, Callable, Dict, Optional + +from torch.utils._pytree import Context, TreeSpec + + +def reorder_kwargs(user_kwargs: Dict[str, Any], spec: TreeSpec) -> Dict[str, Any]: + """Reorder user-provided kwargs to match the order in `spec`. `spec` is + expected to be the in_spec of an exported program, i.e. the spec that + results from flattening `(args, kwargs)`. + + We need this to provide consistent input ordering, such so that users can + pass in foo(a=a, b=b) OR foo(b=b, a=a) and receive the same result. + """ + # Make sure that the spec is actually shaped like (args, kwargs) + assert spec.type is tuple + assert spec.num_children == 2 + kwargs_spec = spec.children_specs[1] + assert kwargs_spec.type is dict + + if set(user_kwargs) != set(kwargs_spec.context): + raise ValueError( + f"kwarg key mismatch: " + f"Got {list(user_kwargs)} but expected {kwargs_spec.context}" + ) + + reordered_kwargs = {} + for kw in kwargs_spec.context: + reordered_kwargs[kw] = user_kwargs[kw] + + return reordered_kwargs + + +def is_equivalent( + spec1: TreeSpec, + spec2: TreeSpec, + equivalence_fn: Callable[[Optional[type], Context, Optional[type], Context], bool], +) -> bool: + """Customizable equivalence check for two TreeSpecs. 
+ + Arguments: + spec1: The first TreeSpec to compare + spec2: The second TreeSpec to compare + equivalence_fn: A function to determine the equivalence of two + TreeSpecs by examining their types and contexts. It will be called like: + + equivalence_fn(spec1.type, spec1.context, spec2.type, spec2.context) + + This function will be applied recursively to all children. + + Returns: + True if the two TreeSpecs are equivalent, False otherwise. + """ + if not equivalence_fn(spec1.type, spec1.context, spec2.type, spec2.context): + return False + + # Recurse on children + if len(spec1.children_specs) != len(spec2.children_specs): + return False + + for child_spec1, child_spec2 in zip(spec1.children_specs, spec2.children_specs): + if not is_equivalent(child_spec1, child_spec2, equivalence_fn): + return False + + return True diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf61515308a3dc60e49309710daa0805a92521f9 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/common_types.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/common_types.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7798d11aff5d4487c1c858058a9d40eebc5ef50 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/common_types.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/cpp.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/cpp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a5ab357202fae765fa531617e85da1314f69301 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/cpp.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/init.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/init.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7faaafc4b6c20cc873f077bcb45ca528960caeb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/init.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/parameter.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/parameter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a6808c85a7bbd028b64c59a56e4bc53160b1e7d Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/__pycache__/parameter.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53ece59d010985a8107cb41c5f3c7484839d66b0 
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/thnn.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/thnn.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96497631414d8292a1dbd6b1597e178a858f7754 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/__pycache__/thnn.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/thnn.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/thnn.py new file mode 100644 index 0000000000000000000000000000000000000000..5250b4bff1674880c97be7b36ca81d6cd6b665a4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/backends/thnn.py @@ -0,0 +1,4 @@ +# this is for historical pickle deserialization, it is not used otherwise + +def _get_thnn_function_backend(): + pass diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e19378bc7aee4f9f8c6c73eef4cacd097d39d2fa --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__init__.py @@ -0,0 +1,35 @@ +from torch.ao.nn.intrinsic import ConvBn1d +from torch.ao.nn.intrinsic import ConvBn2d +from torch.ao.nn.intrinsic import ConvBn3d +from torch.ao.nn.intrinsic import ConvBnReLU1d +from torch.ao.nn.intrinsic import ConvBnReLU2d +from torch.ao.nn.intrinsic import ConvBnReLU3d +from torch.ao.nn.intrinsic import ConvReLU1d +from torch.ao.nn.intrinsic 
import ConvReLU2d +from torch.ao.nn.intrinsic import ConvReLU3d +from torch.ao.nn.intrinsic import LinearReLU +from torch.ao.nn.intrinsic import BNReLU2d +from torch.ao.nn.intrinsic import BNReLU3d +from torch.ao.nn.intrinsic import LinearBn1d +from torch.ao.nn.intrinsic.modules.fused import _FusedModule # noqa: F401 + +# Include the subpackages in case user imports from it directly +from . import modules # noqa: F401 +from . import qat # noqa: F401 +from . import quantized # noqa: F401 + +__all__ = [ + 'ConvBn1d', + 'ConvBn2d', + 'ConvBn3d', + 'ConvBnReLU1d', + 'ConvBnReLU2d', + 'ConvBnReLU3d', + 'ConvReLU1d', + 'ConvReLU2d', + 'ConvReLU3d', + 'LinearReLU', + 'BNReLU2d', + 'BNReLU3d', + 'LinearBn1d', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d131785f39997c5438645df9c67ad9c5f055fa51 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/modules/fused.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/modules/fused.py new file mode 100644 index 0000000000000000000000000000000000000000..dc962f956427ec6f6e6b1d0580a1d5c73bd9cd29 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/modules/fused.py @@ -0,0 +1,30 @@ +from torch.ao.nn.intrinsic import BNReLU2d +from torch.ao.nn.intrinsic import BNReLU3d +from torch.ao.nn.intrinsic import ConvBn1d +from torch.ao.nn.intrinsic import ConvBn2d +from torch.ao.nn.intrinsic import ConvBn3d +from torch.ao.nn.intrinsic import ConvBnReLU1d +from torch.ao.nn.intrinsic import 
ConvBnReLU2d +from torch.ao.nn.intrinsic import ConvBnReLU3d +from torch.ao.nn.intrinsic import ConvReLU1d +from torch.ao.nn.intrinsic import ConvReLU2d +from torch.ao.nn.intrinsic import ConvReLU3d +from torch.ao.nn.intrinsic import LinearBn1d +from torch.ao.nn.intrinsic import LinearReLU +from torch.ao.nn.intrinsic.modules.fused import _FusedModule # noqa: F401 + +__all__ = [ + 'BNReLU2d', + 'BNReLU3d', + 'ConvBn1d', + 'ConvBn2d', + 'ConvBn3d', + 'ConvBnReLU1d', + 'ConvBnReLU2d', + 'ConvBnReLU3d', + 'ConvReLU1d', + 'ConvReLU2d', + 'ConvReLU3d', + 'LinearBn1d', + 'LinearReLU', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e10ec0821f66c03aeaf5d3ca48bda548a2c8e400 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9007e0da7545209f70c404d6f0972cb6399c1e2 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..b949303a40834fed294900278d4bdde55711a284 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/__init__.py @@ -0,0 +1,13 @@ +from .modules import * # noqa: F403 +# to ensure customers can use the module below +# without importing it directly +import torch.nn.intrinsic.quantized.dynamic + +__all__ = [ + 'BNReLU2d', + 'BNReLU3d', + 'ConvReLU1d', + 'ConvReLU2d', + 'ConvReLU3d', + 'LinearReLU', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce4e3dad5e3a707ceaaa29ebe724d40ad8988d46 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py new file mode 100644 index 0000000000000000000000000000000000000000..63cc8609e2d8580b994203c4fe58e0d2328dc7de --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py @@ -0,0 +1,5 @@ +from torch.ao.nn.intrinsic.quantized.dynamic import LinearReLU + +__all__ = [ + 'LinearReLU', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..521e409b2b6422b005e1b2de87ba31f261ce6590 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__init__.py @@ -0,0 +1,12 @@ +from .linear_relu import LinearReLU +from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d +from .bn_relu import BNReLU2d, BNReLU3d + +__all__ = [ + 'LinearReLU', + 'ConvReLU1d', + 'ConvReLU2d', + 'ConvReLU3d', + 'BNReLU2d', + 'BNReLU3d', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5d6cc445b2d8235efc832548898b01f5bf50070 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ec105560cae9707156d4130474c2aec40df7a21 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67916b3ae756f85113449c2066805fa0421e1bbb --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__init__.py @@ -0,0 +1,68 @@ +from .module import Module +from .linear import Identity, Linear, Bilinear, LazyLinear +from .conv import Conv1d, Conv2d, Conv3d, \ + ConvTranspose1d, ConvTranspose2d, ConvTranspose3d, \ + LazyConv1d, LazyConv2d, LazyConv3d, LazyConvTranspose1d, LazyConvTranspose2d, LazyConvTranspose3d +from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \ + Softmax, Softmax2d, LogSoftmax, ELU, SELU, CELU, GELU, Hardshrink, LeakyReLU, LogSigmoid, \ + Softplus, Softshrink, MultiheadAttention, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU, \ + Hardsigmoid, Hardswish, SiLU, Mish +from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \ + CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \ + MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, SmoothL1Loss, HuberLoss, \ + SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, TripletMarginWithDistanceLoss, PoissonNLLLoss, GaussianNLLLoss +from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict +from .pooling import AvgPool1d, AvgPool2d, AvgPool3d, MaxPool1d, MaxPool2d, MaxPool3d, \ + MaxUnpool1d, MaxUnpool2d, MaxUnpool3d, FractionalMaxPool2d, FractionalMaxPool3d, LPPool1d, LPPool2d, LPPool3d, \ + AdaptiveMaxPool1d, AdaptiveMaxPool2d, AdaptiveMaxPool3d, AdaptiveAvgPool1d, AdaptiveAvgPool2d, AdaptiveAvgPool3d +from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d, SyncBatchNorm, \ + LazyBatchNorm1d, LazyBatchNorm2d, LazyBatchNorm3d +from .instancenorm import InstanceNorm1d, InstanceNorm2d, InstanceNorm3d, \ + LazyInstanceNorm1d, LazyInstanceNorm2d, LazyInstanceNorm3d +from .normalization import LocalResponseNorm, CrossMapLRN2d, LayerNorm, GroupNorm +from .dropout import Dropout, Dropout1d, Dropout2d, Dropout3d, AlphaDropout, FeatureAlphaDropout +from .padding import ReflectionPad1d, 
ReflectionPad2d, ReflectionPad3d, ReplicationPad1d, ReplicationPad2d, \ + ReplicationPad3d, ZeroPad1d, ZeroPad2d, ZeroPad3d, ConstantPad1d, ConstantPad2d, ConstantPad3d, \ + CircularPad1d, CircularPad2d, CircularPad3d +from .sparse import Embedding, EmbeddingBag +from .rnn import RNNBase, RNN, LSTM, GRU, \ + RNNCellBase, RNNCell, LSTMCell, GRUCell +from .pixelshuffle import PixelShuffle, PixelUnshuffle +from .upsampling import UpsamplingNearest2d, UpsamplingBilinear2d, Upsample +from .distance import PairwiseDistance, CosineSimilarity +from .fold import Fold, Unfold +from .adaptive import AdaptiveLogSoftmaxWithLoss +from .transformer import TransformerEncoder, TransformerDecoder, \ + TransformerEncoderLayer, TransformerDecoderLayer, Transformer +from .flatten import Flatten, Unflatten +from .channelshuffle import ChannelShuffle + +__all__ = [ + 'Module', 'Identity', 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', + 'ConvTranspose2d', 'ConvTranspose3d', 'Threshold', 'ReLU', 'Hardtanh', 'ReLU6', + 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'CELU', 'GLU', 'GELU', 'Hardshrink', + 'LeakyReLU', 'LogSigmoid', 'Softplus', 'Softshrink', 'MultiheadAttention', 'PReLU', 'Softsign', 'Softmin', + 'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', + 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', + 'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss', 'GaussianNLLLoss', + 'HuberLoss', 'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict', + 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d', + 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'FractionalMaxPool2d', "FractionalMaxPool3d", + 'LPPool1d', 'LPPool2d', 'LPPool3d', 'LocalResponseNorm', 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', + 'InstanceNorm1d', 
'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'SyncBatchNorm', + 'Dropout', 'Dropout1d', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', + 'ReflectionPad1d', 'ReflectionPad2d', 'ReflectionPad3d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', + 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', + 'LSTMCell', 'GRUCell', 'PixelShuffle', 'PixelUnshuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', + 'PairwiseDistance', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', + 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad1d', 'ZeroPad2d', 'ZeroPad3d', + 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', + 'AdaptiveLogSoftmaxWithLoss', 'TransformerEncoder', 'TransformerDecoder', + 'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Transformer', + 'LazyLinear', 'LazyConv1d', 'LazyConv2d', 'LazyConv3d', + 'LazyConvTranspose1d', 'LazyConvTranspose2d', 'LazyConvTranspose3d', + 'LazyBatchNorm1d', 'LazyBatchNorm2d', 'LazyBatchNorm3d', + 'LazyInstanceNorm1d', 'LazyInstanceNorm2d', 'LazyInstanceNorm3d', + 'Flatten', 'Unflatten', 'Hardsigmoid', 'Hardswish', 'SiLU', 'Mish', 'TripletMarginWithDistanceLoss', 'ChannelShuffle', + 'CircularPad1d', 'CircularPad2d', 'CircularPad3d' +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a01a64f9b5567e9c637a66ef98de1a9dd8c4502 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/__init__.cpython-311.pyc differ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/_functions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/_functions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46e726c855e900667d84e6577082645b72b872e5 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/_functions.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/container.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/container.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5c3a718bf4e718689e5f7cb0a74428dbfc48099 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/container.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/conv.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/conv.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d4063c0a014c45b442829df29a488ab40140e1a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/conv.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/flatten.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/flatten.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..378cce6252c61e9180e534650c92cc064cf8fe24 Binary files /dev/null and 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/flatten.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/fold.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/fold.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e86755dcfbf84c9f488454d7c5e8fceec7ea336d Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/fold.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/lazy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/lazy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..666c73c85fd30c4e338b7114f55ddae7a2b71e02 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/__pycache__/lazy.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/activation.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..2302ec5ea51f2b022dd0f728f5b27e73477991cc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/activation.py @@ -0,0 +1,1624 @@ +import warnings +from typing import Optional, Tuple + +import torch +from torch import Tensor +from .linear import NonDynamicallyQuantizableLinear +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ +from torch.nn.parameter import Parameter +from .module import Module +from .. 
import functional as F + +__all__ = ['Threshold', 'ReLU', 'RReLU', 'Hardtanh', 'ReLU6', 'Sigmoid', 'Hardsigmoid', 'Tanh', + 'SiLU', 'Mish', 'Hardswish', 'ELU', 'CELU', 'SELU', 'GLU', 'GELU', 'Hardshrink', 'LeakyReLU', + 'LogSigmoid', 'Softplus', 'Softshrink', 'MultiheadAttention', 'PReLU', 'Softsign', 'Tanhshrink', + 'Softmin', 'Softmax', 'Softmax2d', 'LogSoftmax'] + + +class Threshold(Module): + r"""Thresholds each element of the input Tensor. + + Threshold is defined as: + + .. math:: + y = + \begin{cases} + x, &\text{ if } x > \text{threshold} \\ + \text{value}, &\text{ otherwise } + \end{cases} + + Args: + threshold: The value to threshold at + value: The value to replace with + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + Examples:: + + >>> m = nn.Threshold(0.1, 20) + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['threshold', 'value', 'inplace'] + + threshold: float + value: float + inplace: bool + + def __init__(self, threshold: float, value: float, inplace: bool = False) -> None: + super().__init__() + self.threshold = threshold + self.value = value + self.inplace = inplace + # TODO: check in THNN (if inplace == True, then assert value <= threshold) + + def forward(self, input: Tensor) -> Tensor: + return F.threshold(input, self.threshold, self.value, self.inplace) + + def extra_repr(self): + inplace_str = ', inplace=True' if self.inplace else '' + return f'threshold={self.threshold}, value={self.value}{inplace_str}' + + +class ReLU(Module): + r"""Applies the rectified linear unit function element-wise. + + :math:`\text{ReLU}(x) = (x)^+ = \max(0, x)` + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/ReLU.png + + Examples:: + + >>> m = nn.ReLU() + >>> input = torch.randn(2) + >>> output = m(input) + + + An implementation of CReLU - https://arxiv.org/abs/1603.05201 + + >>> m = nn.ReLU() + >>> input = torch.randn(2).unsqueeze(0) + >>> output = torch.cat((m(input), m(-input))) + """ + + __constants__ = ['inplace'] + inplace: bool + + def __init__(self, inplace: bool = False): + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.relu(input, inplace=self.inplace) + + def extra_repr(self) -> str: + inplace_str = 'inplace=True' if self.inplace else '' + return inplace_str + + +class RReLU(Module): + r"""Applies the randomized leaky rectified linear unit function, element-wise. + + Method described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_. + + The function is defined as: + + .. math:: + \text{RReLU}(x) = + \begin{cases} + x & \text{if } x \geq 0 \\ + ax & \text{ otherwise } + \end{cases} + + where :math:`a` is randomly sampled from uniform distribution + :math:`\mathcal{U}(\text{lower}, \text{upper})` during training while during + evaluation :math:`a` is fixed with :math:`a = \frac{\text{lower} + \text{upper}}{2}`. + + Args: + lower: lower bound of the uniform distribution. Default: :math:`\frac{1}{8}` + upper: upper bound of the uniform distribution. Default: :math:`\frac{1}{3}` + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/RReLU.png + + Examples:: + + >>> m = nn.RReLU(0.1, 0.3) + >>> input = torch.randn(2) + >>> output = m(input) + + """ + + __constants__ = ['lower', 'upper', 'inplace'] + + lower: float + upper: float + inplace: bool + + def __init__( + self, + lower: float = 1. / 8, + upper: float = 1. 
/ 3, + inplace: bool = False + ): + super().__init__() + self.lower = lower + self.upper = upper + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.rrelu(input, self.lower, self.upper, self.training, self.inplace) + + def extra_repr(self): + inplace_str = ', inplace=True' if self.inplace else '' + return f'lower={self.lower}, upper={self.upper}{inplace_str}' + + +class Hardtanh(Module): + r"""Applies the HardTanh function element-wise. + + HardTanh is defined as: + + .. math:: + \text{HardTanh}(x) = \begin{cases} + \text{max\_val} & \text{ if } x > \text{ max\_val } \\ + \text{min\_val} & \text{ if } x < \text{ min\_val } \\ + x & \text{ otherwise } \\ + \end{cases} + + Args: + min_val: minimum value of the linear region range. Default: -1 + max_val: maximum value of the linear region range. Default: 1 + inplace: can optionally do the operation in-place. Default: ``False`` + + Keyword arguments :attr:`min_value` and :attr:`max_value` + have been deprecated in favor of :attr:`min_val` and :attr:`max_val`. + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Hardtanh.png + + Examples:: + + >>> m = nn.Hardtanh(-2, 2) + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['min_val', 'max_val', 'inplace'] + + min_val: float + max_val: float + inplace: bool + + def __init__( + self, + min_val: float = -1., + max_val: float = 1., + inplace: bool = False, + min_value: Optional[float] = None, + max_value: Optional[float] = None + ) -> None: + super().__init__() + if min_value is not None: + warnings.warn("keyword argument min_value is deprecated and rename to min_val") + min_val = min_value + if max_value is not None: + warnings.warn("keyword argument max_value is deprecated and rename to max_val") + max_val = max_value + + self.min_val = min_val + self.max_val = max_val + self.inplace = inplace + assert self.max_val > self.min_val + + def forward(self, input: Tensor) -> Tensor: + return F.hardtanh(input, self.min_val, self.max_val, self.inplace) + + def extra_repr(self) -> str: + inplace_str = ', inplace=True' if self.inplace else '' + return f'min_val={self.min_val}, max_val={self.max_val}{inplace_str}' + + +class ReLU6(Hardtanh): + r"""Applies the ReLU6 function element-wise. + + .. math:: + \text{ReLU6}(x) = \min(\max(0,x), 6) + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/ReLU6.png + + Examples:: + + >>> m = nn.ReLU6() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def __init__(self, inplace: bool = False): + super().__init__(0., 6., inplace) + + def extra_repr(self) -> str: + inplace_str = 'inplace=True' if self.inplace else '' + return inplace_str + + +class Sigmoid(Module): + r"""Applies the Sigmoid function element-wise. + + .. 
math:: + \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)} + + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Sigmoid.png + + Examples:: + + >>> m = nn.Sigmoid() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + return torch.sigmoid(input) + + +class Hardsigmoid(Module): + r"""Applies the Hardsigmoid function element-wise. + + Hardsigmoid is defined as: + + .. math:: + \text{Hardsigmoid}(x) = \begin{cases} + 0 & \text{if~} x \le -3, \\ + 1 & \text{if~} x \ge +3, \\ + x / 6 + 1 / 2 & \text{otherwise} + \end{cases} + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardsigmoid.png + + Examples:: + + >>> m = nn.Hardsigmoid() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['inplace'] + + inplace: bool + + def __init__(self, inplace : bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.hardsigmoid(input, self.inplace) + + +class Tanh(Module): + r"""Applies the Hyperbolic Tangent (Tanh) function element-wise. + + Tanh is defined as: + + .. math:: + \text{Tanh}(x) = \tanh(x) = \frac{\exp(x) - \exp(-x)} {\exp(x) + \exp(-x)} + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Tanh.png + + Examples:: + + >>> m = nn.Tanh() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + return torch.tanh(input) + +class SiLU(Module): + r"""Applies the Sigmoid Linear Unit (SiLU) function, element-wise. 
+ + The SiLU function is also known as the swish function. + + .. math:: + \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.} + + .. note:: + See `Gaussian Error Linear Units (GELUs) `_ + where the SiLU (Sigmoid Linear Unit) was originally coined, and see + `Sigmoid-Weighted Linear Units for Neural Network Function Approximation + in Reinforcement Learning `_ and `Swish: + a Self-Gated Activation Function `_ + where the SiLU was experimented with later. + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/SiLU.png + + Examples:: + + >>> m = nn.SiLU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['inplace'] + inplace: bool + + def __init__(self, inplace: bool = False): + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.silu(input, inplace=self.inplace) + + def extra_repr(self) -> str: + inplace_str = 'inplace=True' if self.inplace else '' + return inplace_str + +class Mish(Module): + r"""Applies the Mish function, element-wise. + + Mish: A Self Regularized Non-Monotonic Neural Activation Function. + + .. math:: + \text{Mish}(x) = x * \text{Tanh}(\text{Softplus}(x)) + + .. note:: + See `Mish: A Self Regularized Non-Monotonic Neural Activation Function `_ + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Mish.png + + Examples:: + + >>> m = nn.Mish() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['inplace'] + inplace: bool + + def __init__(self, inplace: bool = False): + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.mish(input, inplace=self.inplace) + + def extra_repr(self) -> str: + inplace_str = 'inplace=True' if self.inplace else '' + return inplace_str + +class Hardswish(Module): + r"""Applies the Hardswish function, element-wise. + + Method described in the paper: `Searching for MobileNetV3 `_. + + Hardswish is defined as: + + .. math:: + \text{Hardswish}(x) = \begin{cases} + 0 & \text{if~} x \le -3, \\ + x & \text{if~} x \ge +3, \\ + x \cdot (x + 3) /6 & \text{otherwise} + \end{cases} + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardswish.png + + Examples:: + + >>> m = nn.Hardswish() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['inplace'] + + inplace: bool + + def __init__(self, inplace : bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.hardswish(input, self.inplace) + + +class ELU(Module): + r"""Applies the Exponential Linear Unit (ELU) function, element-wise. + + Method described in the paper: `Fast and Accurate Deep Network Learning by Exponential Linear + Units (ELUs) `__. + + ELU is defined as: + + .. math:: + \text{ELU}(x) = \begin{cases} + x, & \text{ if } x > 0\\ + \alpha * (\exp(x) - 1), & \text{ if } x \leq 0 + \end{cases} + + Args: + alpha: the :math:`\alpha` value for the ELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. 
Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/ELU.png + + Examples:: + + >>> m = nn.ELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['alpha', 'inplace'] + alpha: float + inplace: bool + + def __init__(self, alpha: float = 1., inplace: bool = False) -> None: + super().__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.elu(input, self.alpha, self.inplace) + + def extra_repr(self) -> str: + inplace_str = ', inplace=True' if self.inplace else '' + return f'alpha={self.alpha}{inplace_str}' + + +class CELU(Module): + r"""Applies the CELU function element-wise. + + .. math:: + \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1)) + + More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . + + Args: + alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/CELU.png + + Examples:: + + >>> m = nn.CELU() + >>> input = torch.randn(2) + >>> output = m(input) + + .. 
_`Continuously Differentiable Exponential Linear Units`: + https://arxiv.org/abs/1704.07483 + """ + + __constants__ = ['alpha', 'inplace'] + alpha: float + inplace: bool + + def __init__(self, alpha: float = 1., inplace: bool = False) -> None: + super().__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.celu(input, self.alpha, self.inplace) + + def extra_repr(self) -> str: + inplace_str = ', inplace=True' if self.inplace else '' + return f'alpha={self.alpha}{inplace_str}' + + +class SELU(Module): + r"""Applies the SELU function element-wise. + + .. math:: + \text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1))) + + with :math:`\alpha = 1.6732632423543772848170429916717` and + :math:`\text{scale} = 1.0507009873554804934193349852946`. + + .. warning:: + When using ``kaiming_normal`` or ``kaiming_normal_`` for initialisation, + ``nonlinearity='linear'`` should be used instead of ``nonlinearity='selu'`` + in order to get `Self-Normalizing Neural Networks`_. + See :func:`torch.nn.init.calculate_gain` for more information. + + More details can be found in the paper `Self-Normalizing Neural Networks`_ . + + Args: + inplace (bool, optional): can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/SELU.png + + Examples:: + + >>> m = nn.SELU() + >>> input = torch.randn(2) + >>> output = m(input) + + .. 
_Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 + """ + + __constants__ = ['inplace'] + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.selu(input, self.inplace) + + def extra_repr(self) -> str: + inplace_str = 'inplace=True' if self.inplace else '' + return inplace_str + + +class GLU(Module): + r"""Applies the gated linear unit function. + + :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half + of the input matrices and :math:`b` is the second half. + + Args: + dim (int): the dimension on which to split the input. Default: -1 + + Shape: + - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional + dimensions + - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2` + + Examples:: + + >>> m = nn.GLU() + >>> input = torch.randn(4, 2) + >>> output = m(input) + """ + + __constants__ = ['dim'] + dim: int + + def __init__(self, dim: int = -1) -> None: + super().__init__() + self.dim = dim + + def forward(self, input: Tensor) -> Tensor: + return F.glu(input, self.dim) + + def extra_repr(self) -> str: + return f'dim={self.dim}' + + +class GELU(Module): + r"""Applies the Gaussian Error Linear Units function. + + .. math:: \text{GELU}(x) = x * \Phi(x) + + where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. + + When the approximate argument is 'tanh', Gelu is estimated with: + + .. math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt{2 / \pi} * (x + 0.044715 * x^3))) + + Args: + approximate (str, optional): the gelu approximation algorithm to use: + ``'none'`` | ``'tanh'``. Default: ``'none'`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/GELU.png + + Examples:: + + >>> m = nn.GELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['approximate'] + approximate: str + + def __init__(self, approximate: str = 'none') -> None: + super().__init__() + self.approximate = approximate + + def forward(self, input: Tensor) -> Tensor: + return F.gelu(input, approximate=self.approximate) + + def extra_repr(self) -> str: + return f'approximate={repr(self.approximate)}' + + +class Hardshrink(Module): + r"""Applies the Hard Shrinkage (Hardshrink) function element-wise. + + Hardshrink is defined as: + + .. math:: + \text{HardShrink}(x) = + \begin{cases} + x, & \text{ if } x > \lambda \\ + x, & \text{ if } x < -\lambda \\ + 0, & \text{ otherwise } + \end{cases} + + Args: + lambd: the :math:`\lambda` value for the Hardshrink formulation. Default: 0.5 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardshrink.png + + Examples:: + + >>> m = nn.Hardshrink() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['lambd'] + lambd: float + + def __init__(self, lambd: float = 0.5) -> None: + super().__init__() + self.lambd = lambd + + def forward(self, input: Tensor) -> Tensor: + return F.hardshrink(input, self.lambd) + + def extra_repr(self) -> str: + return f'{self.lambd}' + + +class LeakyReLU(Module): + r"""Applies the LeakyReLU function element-wise. + + .. math:: + \text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x) + + + or + + .. math:: + \text{LeakyReLU}(x) = + \begin{cases} + x, & \text{ if } x \geq 0 \\ + \text{negative\_slope} \times x, & \text{ otherwise } + \end{cases} + + Args: + negative_slope: Controls the angle of the negative slope (which is used for + negative input values). Default: 1e-2 + inplace: can optionally do the operation in-place. 
Default: ``False`` + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + .. image:: ../scripts/activation_images/LeakyReLU.png + + Examples:: + + >>> m = nn.LeakyReLU(0.1) + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['inplace', 'negative_slope'] + inplace: bool + negative_slope: float + + def __init__(self, negative_slope: float = 1e-2, inplace: bool = False) -> None: + super().__init__() + self.negative_slope = negative_slope + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + return F.leaky_relu(input, self.negative_slope, self.inplace) + + def extra_repr(self) -> str: + inplace_str = ', inplace=True' if self.inplace else '' + return f'negative_slope={self.negative_slope}{inplace_str}' + + +class LogSigmoid(Module): + r"""Applies the Logsigmoid function element-wise. + + .. math:: + \text{LogSigmoid}(x) = \log\left(\frac{ 1 }{ 1 + \exp(-x)}\right) + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/LogSigmoid.png + + Examples:: + + >>> m = nn.LogSigmoid() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + return F.logsigmoid(input) + + +class Softplus(Module): + r"""Applies the Softplus function element-wise. + + .. math:: + \text{Softplus}(x) = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) + + SoftPlus is a smooth approximation to the ReLU function and can be used + to constrain the output of a machine to always be positive. + + For numerical stability the implementation reverts to the linear function + when :math:`input \times \beta > threshold`. + + Args: + beta: the :math:`\beta` value for the Softplus formulation. Default: 1 + threshold: values above this revert to a linear function. 
Default: 20 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Softplus.png + + Examples:: + + >>> m = nn.Softplus() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['beta', 'threshold'] + beta: float + threshold: float + + def __init__(self, beta: float = 1.0, threshold: float = 20.0) -> None: + super().__init__() + self.beta = beta + self.threshold = threshold + + def forward(self, input: Tensor) -> Tensor: + return F.softplus(input, self.beta, self.threshold) + + def extra_repr(self) -> str: + return f'beta={self.beta}, threshold={self.threshold}' + + +class Softshrink(Module): + r"""Applies the soft shrinkage function element-wise. + + .. math:: + \text{SoftShrinkage}(x) = + \begin{cases} + x - \lambda, & \text{ if } x > \lambda \\ + x + \lambda, & \text{ if } x < -\lambda \\ + 0, & \text{ otherwise } + \end{cases} + + Args: + lambd: the :math:`\lambda` (must be no less than zero) value for the Softshrink formulation. Default: 0.5 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Softshrink.png + + Examples:: + + >>> m = nn.Softshrink() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['lambd'] + lambd: float + + def __init__(self, lambd: float = 0.5) -> None: + super().__init__() + self.lambd = lambd + + def forward(self, input: Tensor) -> Tensor: + return F.softshrink(input, self.lambd) + + def extra_repr(self) -> str: + return str(self.lambd) + + +def _check_arg_device(x: Optional[torch.Tensor]) -> bool: + if x is not None: + return x.device.type in ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name] + return True + + +def _arg_requires_grad(x: Optional[torch.Tensor]) -> bool: + if x is not None: + return x.requires_grad + return False + + +def _is_make_fx_tracing(): + if not torch.jit.is_scripting(): + torch_dispatch_mode_stack = torch.utils._python_dispatch._get_current_dispatch_mode_stack() + return any(type(x) == torch.fx.experimental.proxy_tensor.ProxyTorchDispatchMode for x in torch_dispatch_mode_stack) + else: + return False + + +class MultiheadAttention(Module): + r"""Allows the model to jointly attend to information from different representation subspaces. + + Method described in the paper: + `Attention Is All You Need `_. + + Multi-Head Attention is defined as: + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + + where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. + + ``nn.MultiHeadAttention`` will use the optimized implementations of + ``scaled_dot_product_attention()`` when possible. + + In addition to support for the new ``scaled_dot_product_attention()`` + function, for speeding up Inference, MHA will use + fastpath inference with support for Nested Tensors, iff: + + - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor). 
+ - inputs are batched (3D) with ``batch_first==True`` + - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad`` + - training is disabled (using ``.eval()``) + - ``add_bias_kv`` is ``False`` + - ``add_zero_attn`` is ``False`` + - ``kdim`` and ``vdim`` are equal to ``embed_dim`` + - if a `NestedTensor `_ is passed, neither ``key_padding_mask`` + nor ``attn_mask`` is passed + - autocast is disabled + + If the optimized inference fastpath implementation is in use, a + `NestedTensor `_ can be passed for + ``query``/``key``/``value`` to represent padding more efficiently than using a + padding mask. In this case, a `NestedTensor `_ + will be returned, and an additional speedup proportional to the fraction of the input + that is padding can be expected. + + Args: + embed_dim: Total dimension of the model. + num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split + across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). + dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). + bias: If specified, adds bias to input / output projection layers. Default: ``True``. + add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. + add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. + Default: ``False``. + kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). + vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + + Examples:: + + >>> # xdoctest: +SKIP + >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + + .. 
_`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`: + https://arxiv.org/abs/2205.14135 + + """ + + __constants__ = ['batch_first'] + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, + kdim=None, vdim=None, batch_first=False, device=None, dtype=None) -> None: + if embed_dim <= 0 or num_heads <= 0: + raise ValueError( + f"embed_dim and num_heads must be greater than 0," + f" got embed_dim={embed_dim} and num_heads={num_heads} instead" + ) + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if not self._qkv_same_embed_dim: + self.q_proj_weight = Parameter(torch.empty((embed_dim, embed_dim), **factory_kwargs)) + self.k_proj_weight = Parameter(torch.empty((embed_dim, self.kdim), **factory_kwargs)) + self.v_proj_weight = Parameter(torch.empty((embed_dim, self.vdim), **factory_kwargs)) + self.register_parameter('in_proj_weight', None) + else: + self.in_proj_weight = Parameter(torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)) + self.register_parameter('q_proj_weight', None) + self.register_parameter('k_proj_weight', None) + self.register_parameter('v_proj_weight', None) + + if bias: + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs)) + else: + self.register_parameter('in_proj_bias', None) + self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs) + + if add_bias_kv: + self.bias_k = 
Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self._reset_parameters() + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.) + constant_(self.out_proj.bias, 0.) + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if '_qkv_same_embed_dim' not in state: + state['_qkv_same_embed_dim'] = True + + super().__setstate__(state) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + average_attn_weights: bool = True, + is_causal : bool = False) -> Tuple[Tensor, Optional[Tensor]]: + r"""Compute attention outputs using query, key, and value embeddings. + + Supports optional parameters for padding, masks and attention weights. + + Args: + query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` + or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length, + :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. + Queries are compared against key-value pairs to produce the output. + See "Attention Is All You Need" for more details. 
+ key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` + or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, + :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. + See "Attention Is All You Need" for more details. + value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when + ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source + sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. + See "Attention Is All You Need" for more details. + key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` + to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`. + Binary and float masks are supported. + For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for + the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. + need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. + Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention`` + and achieve the best performance for MHA. + Default: ``True``. + attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape + :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, + :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be + broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. + Binary and float masks are supported. For a binary mask, a ``True`` value indicates that the + corresponding position is not allowed to attend. 
For a float mask, the mask values will be added to + the attention weight. + If both attn_mask and key_padding_mask are supplied, their types should match. + average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across + heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an + effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads) + is_causal: If specified, applies a causal mask as attention mask. + Default: ``False``. + Warning: + ``is_causal`` provides a hint that ``attn_mask`` is the + causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + + Outputs: + - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, + :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, + where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the + embedding dimension ``embed_dim``. + - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, + returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or + :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and + :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. + + .. note:: + `batch_first` argument is ignored for unbatched inputs. + """ + why_not_fast_path = '' + if ((attn_mask is not None and torch.is_floating_point(attn_mask)) + or (key_padding_mask is not None) and torch.is_floating_point(key_padding_mask)): + why_not_fast_path = "floating-point masks are not supported for fast path." 
+ + is_batched = query.dim() == 3 + + key_padding_mask = F._canonical_mask( + mask=key_padding_mask, + mask_name="key_padding_mask", + other_type=F._none_or_dtype(attn_mask), + other_name="attn_mask", + target_type=query.dtype + ) + + attn_mask = F._canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=query.dtype, + check_other=False, + ) + + is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled() + + if not is_fastpath_enabled: + why_not_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True" + elif not is_batched: + why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}" + elif query is not key or key is not value: + # When lifting this restriction, don't forget to either + # enforce that the dtypes all match or test cases where + # they don't! + why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" + elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: + why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" + elif self.in_proj_weight is None: + why_not_fast_path = "in_proj_weight was None" + elif query.dtype != self.in_proj_weight.dtype: + # this case will fail anyway, but at least they'll get a useful error message. 
+ why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + elif self.training: + why_not_fast_path = "training is enabled" + elif (self.num_heads % 2) != 0: + why_not_fast_path = "self.num_heads is not even" + elif not self.batch_first: + why_not_fast_path = "batch_first was not True" + elif self.bias_k is not None: + why_not_fast_path = "self.bias_k was not None" + elif self.bias_v is not None: + why_not_fast_path = "self.bias_v was not None" + elif self.add_zero_attn: + why_not_fast_path = "add_zero_attn was enabled" + elif not self._qkv_same_embed_dim: + why_not_fast_path = "_qkv_same_embed_dim was not True" + elif query.is_nested and (key_padding_mask is not None or attn_mask is not None): + why_not_fast_path = "supplying both src_key_padding_mask and src_mask at the same time \ + is not supported with NestedTensor input" + elif torch.is_autocast_enabled(): + why_not_fast_path = "autocast is enabled" + + if not why_not_fast_path: + tensor_args = ( + query, + key, + value, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + ) + # We have to use list comprehensions below because TorchScript does not support + # generator expressions. 
+ if torch.overrides.has_torch_function(tensor_args): + why_not_fast_path = "some Tensor argument has_torch_function" + elif _is_make_fx_tracing(): + why_not_fast_path = "we are running make_fx tracing" + elif not all(_check_arg_device(x) for x in tensor_args): + why_not_fast_path = ("some Tensor argument's device is neither one of " + f"cpu, cuda or {torch.utils.backend_registration._privateuse1_backend_name}") + elif torch.is_grad_enabled() and any(_arg_requires_grad(x) for x in tensor_args): + why_not_fast_path = ("grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad") + if not why_not_fast_path: + merged_mask, mask_type = self.merge_masks(attn_mask, key_padding_mask, query) + + if self.in_proj_bias is not None and self.in_proj_weight is not None: + return torch._native_multi_head_attention( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + merged_mask, + need_weights, + average_attn_weights, + mask_type) + + any_nested = query.is_nested or key.is_nested or value.is_nested + assert not any_nested, ("MultiheadAttention does not support NestedTensor outside of its fast path. 
" + + f"The fast path was not hit because {why_not_fast_path}") + + if self.batch_first and is_batched: + # make sure that the transpose op does not affect the "is" property + if key is value: + if query is key: + query = key = value = query.transpose(1, 0) + else: + query, key = (x.transpose(1, 0) for x in (query, key)) + value = key + else: + query, key, value = (x.transpose(1, 0) for x in (query, key, value)) + + if not self._qkv_same_embed_dim: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, + average_attn_weights=average_attn_weights, + is_causal=is_causal) + else: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + average_attn_weights=average_attn_weights, + is_causal=is_causal) + if self.batch_first and is_batched: + return attn_output.transpose(1, 0), attn_output_weights + else: + return attn_output, attn_output_weights + + def merge_masks(self, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], + query: Tensor) -> Tuple[Optional[Tensor], Optional[int]]: + r"""Determine mask type and combine masks if necessary. + + If only one mask is provided, that mask + and the corresponding mask type will be returned. 
If both masks are provided, they will be both + expanded to shape ``(batch_size, num_heads, seq_len, seq_len)``, combined with logical ``or`` + and mask type 2 will be returned + Args: + attn_mask: attention mask of shape ``(seq_len, seq_len)``, mask type 0 + key_padding_mask: padding mask of shape ``(batch_size, seq_len)``, mask type 1 + query: query embeddings of shape ``(batch_size, seq_len, embed_dim)`` + Returns: + merged_mask: merged mask + mask_type: merged mask type (0, 1, or 2) + """ + mask_type: Optional[int] = None + merged_mask: Optional[Tensor] = None + + if key_padding_mask is not None: + mask_type = 1 + merged_mask = key_padding_mask + + if attn_mask is not None: + # In this branch query can't be a nested tensor, so it has a shape + batch_size, seq_len, _ = query.shape + mask_type = 2 + + # Always expands attn_mask to 4D + if attn_mask.dim() == 3: + attn_mask_expanded = attn_mask.view(batch_size, -1, seq_len, seq_len) + else: # attn_mask.dim() == 2: + attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand(batch_size, self.num_heads, -1, -1) + merged_mask = attn_mask_expanded + + if key_padding_mask is not None: + key_padding_mask_expanded = key_padding_mask.view(batch_size, 1, 1, seq_len).expand(-1, self.num_heads, -1, -1) + merged_mask = attn_mask_expanded + key_padding_mask_expanded + + # no attn_mask and no key_padding_mask, returns None, None + return merged_mask, mask_type + + +class PReLU(Module): + r"""Applies the element-wise PReLU function. + + .. math:: + \text{PReLU}(x) = \max(0,x) + a * \min(0,x) + + or + + .. math:: + \text{PReLU}(x) = + \begin{cases} + x, & \text{ if } x \geq 0 \\ + ax, & \text{ otherwise } + \end{cases} + + Here :math:`a` is a learnable parameter. When called without arguments, `nn.PReLU()` uses a single + parameter :math:`a` across all input channels. If called with `nn.PReLU(nChannels)`, + a separate :math:`a` is used for each input channel. + + + .. 
note:: + weight decay should not be used when learning :math:`a` for good performance. + + .. note:: + Channel dim is the 2nd dim of input. When input has dims < 2, then there is + no channel dim and the number of channels = 1. + + Args: + num_parameters (int): number of :math:`a` to learn. + Although it takes an int as input, there is only two values are legitimate: + 1, or the number of channels at input. Default: 1 + init (float): the initial value of :math:`a`. Default: 0.25 + + Shape: + - Input: :math:`( *)` where `*` means, any number of additional + dimensions. + - Output: :math:`(*)`, same shape as the input. + + Attributes: + weight (Tensor): the learnable weights of shape (:attr:`num_parameters`). + + .. image:: ../scripts/activation_images/PReLU.png + + Examples:: + + >>> m = nn.PReLU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + __constants__ = ['num_parameters'] + num_parameters: int + + def __init__(self, num_parameters: int = 1, init: float = 0.25, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + self.num_parameters = num_parameters + super().__init__() + self.init = init + self.weight = Parameter(torch.empty(num_parameters, **factory_kwargs)) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.constant_(self.weight, self.init) + + def forward(self, input: Tensor) -> Tensor: + return F.prelu(input, self.weight) + + def extra_repr(self) -> str: + return f'num_parameters={self.num_parameters}' + + +class Softsign(Module): + r"""Applies the element-wise Softsign function. + + .. math:: + \text{SoftSign}(x) = \frac{x}{ 1 + |x|} + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Softsign.png + + Examples:: + + >>> m = nn.Softsign() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + return F.softsign(input) + + +class Tanhshrink(Module): + r"""Applies the element-wise Tanhshrink function. + + .. math:: + \text{Tanhshrink}(x) = x - \tanh(x) + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Tanhshrink.png + + Examples:: + + >>> m = nn.Tanhshrink() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + return F.tanhshrink(input) + + +class Softmin(Module): + r"""Applies the Softmin function to an n-dimensional input Tensor. + + Rescales them so that the elements of the n-dimensional output Tensor + lie in the range `[0, 1]` and sum to 1. + + Softmin is defined as: + + .. math:: + \text{Softmin}(x_{i}) = \frac{\exp(-x_i)}{\sum_j \exp(-x_j)} + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + Args: + dim (int): A dimension along which Softmin will be computed (so every slice + along dim will sum to 1). 
+ + Returns: + a Tensor of the same dimension and shape as the input, with + values in the range [0, 1] + + Examples:: + + >>> m = nn.Softmin(dim=1) + >>> input = torch.randn(2, 3) + >>> output = m(input) + """ + + __constants__ = ['dim'] + dim: Optional[int] + + def __init__(self, dim: Optional[int] = None) -> None: + super().__init__() + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, 'dim'): + self.dim = None + + def forward(self, input: Tensor) -> Tensor: + return F.softmin(input, self.dim, _stacklevel=5) + + def extra_repr(self): + return f'dim={self.dim}' + +class Softmax(Module): + r"""Applies the Softmax function to an n-dimensional input Tensor. + + Rescales them so that the elements of the n-dimensional output Tensor + lie in the range [0,1] and sum to 1. + + Softmax is defined as: + + .. math:: + \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} + + When the input Tensor is a sparse tensor then the unspecified + values are treated as ``-inf``. + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [0, 1] + + Args: + dim (int): A dimension along which Softmax will be computed (so every slice + along dim will sum to 1). + + .. note:: + This module doesn't work directly with NLLLoss, + which expects the Log to be computed between the Softmax and itself. + Use `LogSoftmax` instead (it's faster and has better numerical properties). 
+ + Examples:: + + >>> m = nn.Softmax(dim=1) + >>> input = torch.randn(2, 3) + >>> output = m(input) + + """ + + __constants__ = ['dim'] + dim: Optional[int] + + def __init__(self, dim: Optional[int] = None) -> None: + super().__init__() + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, 'dim'): + self.dim = None + + def forward(self, input: Tensor) -> Tensor: + return F.softmax(input, self.dim, _stacklevel=5) + + def extra_repr(self) -> str: + return f'dim={self.dim}' + + +class Softmax2d(Module): + r"""Applies SoftMax over features to each spatial location. + + When given an image of ``Channels x Height x Width``, it will + apply `Softmax` to each location :math:`(Channels, h_i, w_j)` + + Shape: + - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`. + - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input) + + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [0, 1] + + Examples:: + + >>> m = nn.Softmax2d() + >>> # you softmax over the 2nd dimension + >>> input = torch.randn(2, 3, 12, 13) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + if input.dim() not in (3, 4): + raise ValueError( + f"Softmax2d: expected input to be 3D or 4D, got {input.dim()}D instead" + ) + return F.softmax(input, -3, _stacklevel=5) + + +class LogSoftmax(Module): + r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional input Tensor. + + The LogSoftmax formulation can be simplified as: + + .. math:: + \text{LogSoftmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right) + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + Args: + dim (int): A dimension along which LogSoftmax will be computed. 
+ + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [-inf, 0) + + Examples:: + + >>> m = nn.LogSoftmax(dim=1) + >>> input = torch.randn(2, 3) + >>> output = m(input) + """ + + __constants__ = ['dim'] + dim: Optional[int] + + def __init__(self, dim: Optional[int] = None) -> None: + super().__init__() + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, 'dim'): + self.dim = None + + def forward(self, input: Tensor) -> Tensor: + return F.log_softmax(input, self.dim, _stacklevel=5) + + def extra_repr(self): + return f'dim={self.dim}' diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/batchnorm.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/batchnorm.py new file mode 100644 index 0000000000000000000000000000000000000000..0eac5cef2daf75c8de8fe981263011ec05ca00c9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/batchnorm.py @@ -0,0 +1,849 @@ +from typing import Optional, Any + +import torch +from torch import Tensor +from torch.nn.parameter import Parameter, UninitializedParameter, UninitializedBuffer + +from .. import functional as F +from .. import init +from ._functions import SyncBatchNorm as sync_batch_norm +from .lazy import LazyModuleMixin +from .module import Module + +__all__ = ['BatchNorm1d', 'LazyBatchNorm1d', 'BatchNorm2d', 'LazyBatchNorm2d', 'BatchNorm3d', + 'LazyBatchNorm3d', 'SyncBatchNorm'] + + +class _NormBase(Module): + """Common base of _InstanceNorm and _BatchNorm.""" + + _version = 2 + __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"] + num_features: int + eps: float + momentum: float + affine: bool + track_running_stats: bool + # WARNING: weight and bias purposely not defined here. 
+ # See https://github.com/pytorch/pytorch/issues/39670 + + def __init__( + self, + num_features: int, + eps: float = 1e-5, + momentum: float = 0.1, + affine: bool = True, + track_running_stats: bool = True, + device=None, + dtype=None + ) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.num_features = num_features + self.eps = eps + self.momentum = momentum + self.affine = affine + self.track_running_stats = track_running_stats + if self.affine: + self.weight = Parameter(torch.empty(num_features, **factory_kwargs)) + self.bias = Parameter(torch.empty(num_features, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if self.track_running_stats: + self.register_buffer('running_mean', torch.zeros(num_features, **factory_kwargs)) + self.register_buffer('running_var', torch.ones(num_features, **factory_kwargs)) + self.running_mean: Optional[Tensor] + self.running_var: Optional[Tensor] + self.register_buffer('num_batches_tracked', + torch.tensor(0, dtype=torch.long, + **{k: v for k, v in factory_kwargs.items() if k != 'dtype'})) + self.num_batches_tracked: Optional[Tensor] + else: + self.register_buffer("running_mean", None) + self.register_buffer("running_var", None) + self.register_buffer("num_batches_tracked", None) + self.reset_parameters() + + def reset_running_stats(self) -> None: + if self.track_running_stats: + # running_mean/running_var/num_batches... 
are registered at runtime depending + # if self.track_running_stats is on + self.running_mean.zero_() # type: ignore[union-attr] + self.running_var.fill_(1) # type: ignore[union-attr] + self.num_batches_tracked.zero_() # type: ignore[union-attr,operator] + + def reset_parameters(self) -> None: + self.reset_running_stats() + if self.affine: + init.ones_(self.weight) + init.zeros_(self.bias) + + def _check_input_dim(self, input): + raise NotImplementedError + + def extra_repr(self): + return ( + "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, " + "track_running_stats={track_running_stats}".format(**self.__dict__) + ) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + + if (version is None or version < 2) and self.track_running_stats: + # at version 2: added num_batches_tracked buffer + # this should have a default value of 0 + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key not in state_dict: + state_dict[num_batches_tracked_key] = ( + self.num_batches_tracked + if self.num_batches_tracked is not None and self.num_batches_tracked.device != torch.device('meta') + else torch.tensor(0, dtype=torch.long) + ) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + +class _BatchNorm(_NormBase): + def __init__( + self, + num_features: int, + eps: float = 1e-5, + momentum: float = 0.1, + affine: bool = True, + track_running_stats: bool = True, + device=None, + dtype=None + ) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + + def forward(self, input: Tensor) -> Tensor: + self._check_input_dim(input) + + # exponential_average_factor is set to self.momentum + # (when it is 
available) only so that it gets updated + # in ONNX graph when this node is exported to ONNX. + if self.momentum is None: + exponential_average_factor = 0.0 + else: + exponential_average_factor = self.momentum + + if self.training and self.track_running_stats: + # TODO: if statement only here to tell the jit to skip emitting this when it is None + if self.num_batches_tracked is not None: # type: ignore[has-type] + self.num_batches_tracked.add_(1) # type: ignore[has-type] + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / float(self.num_batches_tracked) + else: # use exponential moving average + exponential_average_factor = self.momentum + + r""" + Decide whether the mini-batch stats should be used for normalization rather than the buffers. + Mini-batch stats are used in training mode, and in eval mode when buffers are None. + """ + if self.training: + bn_training = True + else: + bn_training = (self.running_mean is None) and (self.running_var is None) + + r""" + Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be + passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are + used for normalization (i.e. in eval mode when buffers are not None). 
+ """ + return F.batch_norm( + input, + # If buffers are not to be tracked, ensure that they won't be updated + self.running_mean + if not self.training or self.track_running_stats + else None, + self.running_var if not self.training or self.track_running_stats else None, + self.weight, + self.bias, + bn_training, + exponential_average_factor, + self.eps, + ) + + +class _LazyNormBase(LazyModuleMixin, _NormBase): + + weight: UninitializedParameter # type: ignore[assignment] + bias: UninitializedParameter # type: ignore[assignment] + + def __init__(self, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__( + # affine and track_running_stats are hardcoded to False to + # avoid creating tensors that will soon be overwritten. + 0, + eps, + momentum, + False, + False, + **factory_kwargs, + ) + self.affine = affine + self.track_running_stats = track_running_stats + if self.affine: + self.weight = UninitializedParameter(**factory_kwargs) + self.bias = UninitializedParameter(**factory_kwargs) + if self.track_running_stats: + self.running_mean = UninitializedBuffer(**factory_kwargs) + self.running_var = UninitializedBuffer(**factory_kwargs) + self.num_batches_tracked = torch.tensor( + 0, dtype=torch.long, **{k: v for k, v in factory_kwargs.items() if k != 'dtype'}) + + def reset_parameters(self) -> None: + if not self.has_uninitialized_params() and self.num_features != 0: + super().reset_parameters() + + def initialize_parameters(self, input) -> None: # type: ignore[override] + if self.has_uninitialized_params(): + self.num_features = input.shape[1] + if self.affine: + assert isinstance(self.weight, UninitializedParameter) + assert isinstance(self.bias, UninitializedParameter) + self.weight.materialize((self.num_features,)) + self.bias.materialize((self.num_features,)) + if self.track_running_stats: + self.running_mean.materialize((self.num_features,)) # 
type:ignore[union-attr] + self.running_var.materialize((self.num_features,)) # type:ignore[union-attr] + self.reset_parameters() + + +class BatchNorm1d(_BatchNorm): + r"""Applies Batch Normalization over a 2D or 3D input. + + Method described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over + the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the number of features or channels of the input). By default, the + elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0. + At train time in the forward pass, the standard-deviation is calculated via the biased estimator, + equivalent to ``torch.var(input, unbiased=False)``. However, the value stored in the + moving average of the standard-deviation is calculated via the unbiased estimator, equivalent to + ``torch.var(input, unbiased=True)``. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. + + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. 
+ + Because the Batch Normalization is done over the `C` dimension, computing statistics + on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization. + + Args: + num_features: number of features or channels :math:`C` of the input + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + + Shape: + - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size, + :math:`C` is the number of features or channels, and :math:`L` is the sequence length + - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input) + + Examples:: + + >>> # With Learnable Parameters + >>> m = nn.BatchNorm1d(100) + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm1d(100, affine=False) + >>> input = torch.randn(20, 100) + >>> output = m(input) + """ + + def _check_input_dim(self, input): + if input.dim() != 2 and input.dim() != 3: + raise ValueError( + f"expected 2D or 3D input (got {input.dim()}D input)" + ) + + +class LazyBatchNorm1d(_LazyNormBase, _BatchNorm): + r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization. + + Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred + from the ``input.size(1)``. 
+ The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + """ + + cls_to_become = BatchNorm1d # type: ignore[assignment] + + def _check_input_dim(self, input): + if input.dim() != 2 and input.dim() != 3: + raise ValueError( + f"expected 2D or 3D input (got {input.dim()}D input)" + ) + + +class BatchNorm2d(_BatchNorm): + r"""Applies Batch Normalization over a 4D input. + + 4D is a mini-batch of 2D inputs + with additional channel dimension. Method described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over + the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set + to 1 and the elements of :math:`\beta` are set to 0. 
At train time in the forward pass, the + standard-deviation is calculated via the biased estimator, equivalent to + ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the + standard-deviation is calculated via the unbiased estimator, equivalent to + ``torch.var(input, unbiased=True)``. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. + + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + Because the Batch Normalization is done over the `C` dimension, computing statistics + on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, H, W)` + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. 
Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + + Shape: + - Input: :math:`(N, C, H, W)` + - Output: :math:`(N, C, H, W)` (same shape as input) + + Examples:: + + >>> # With Learnable Parameters + >>> m = nn.BatchNorm2d(100) + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm2d(100, affine=False) + >>> input = torch.randn(20, 100, 35, 45) + >>> output = m(input) + """ + + def _check_input_dim(self, input): + if input.dim() != 4: + raise ValueError(f"expected 4D input (got {input.dim()}D input)") + + +class LazyBatchNorm2d(_LazyNormBase, _BatchNorm): + r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization. + + Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred + from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. 
Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + """ + + cls_to_become = BatchNorm2d # type: ignore[assignment] + + def _check_input_dim(self, input): + if input.dim() != 4: + raise ValueError(f"expected 4D input (got {input.dim()}D input)") + + +class BatchNorm3d(_BatchNorm): + r"""Applies Batch Normalization over a 5D input. + + 5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over + the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors + of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set + to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the + standard-deviation is calculated via the biased estimator, equivalent to + ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the + standard-deviation is calculated via the unbiased estimator, equivalent to + ``torch.var(input, unbiased=True)``. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. 
+ + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + Because the Batch Normalization is done over the `C` dimension, computing statistics + on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization + or Spatio-temporal Batch Normalization. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, D, H, W)` + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. 
Default: ``True`` + + Shape: + - Input: :math:`(N, C, D, H, W)` + - Output: :math:`(N, C, D, H, W)` (same shape as input) + + Examples:: + + >>> # With Learnable Parameters + >>> m = nn.BatchNorm3d(100) + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm3d(100, affine=False) + >>> input = torch.randn(20, 100, 35, 45, 10) + >>> output = m(input) + """ + + def _check_input_dim(self, input): + if input.dim() != 5: + raise ValueError(f"expected 5D input (got {input.dim()}D input)") + + +class LazyBatchNorm3d(_LazyNormBase, _BatchNorm): + r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization. + + Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred + from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. 
Default: ``True`` + """ + + cls_to_become = BatchNorm3d # type: ignore[assignment] + + def _check_input_dim(self, input): + if input.dim() != 5: + raise ValueError(f"expected 5D input (got {input.dim()}D input)") + + +class SyncBatchNorm(_BatchNorm): + r"""Applies Batch Normalization over a N-Dimensional input. + + The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper + `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `__ . + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over all + mini-batches of the same process groups. :math:`\gamma` and :math:`\beta` + are learnable parameter vectors of size `C` (where `C` is the input size). + By default, the elements of :math:`\gamma` are sampled from + :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0. + The standard-deviation is calculated via the biased estimator, equivalent to + `torch.var(input, unbiased=False)`. + + Also by default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.1. + + If :attr:`track_running_stats` is set to ``False``, this layer then does not + keep running estimates, and batch statistics are instead used during + evaluation time as well. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. 
+ + Because the Batch Normalization is done for each channel in the ``C`` dimension, computing + statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch + Normalization or Spatio-temporal Batch Normalization. + + Currently :class:`SyncBatchNorm` only supports + :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use + :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert + :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping + Network with DDP. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, +)` + eps: a value added to the denominator for numerical stability. + Default: ``1e-5`` + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. Default: ``True`` + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics, and initializes statistics + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + When these buffers are ``None``, this module always uses batch statistics. + in both training and eval modes. Default: ``True`` + process_group: synchronization of stats happen within each process group + individually. Default behavior is synchronization across the whole + world + + Shape: + - Input: :math:`(N, C, +)` + - Output: :math:`(N, C, +)` (same shape as input) + + .. note:: + Synchronization of batchnorm statistics occurs only while training, i.e. + synchronization is disabled when ``model.eval()`` is set or if + ``self.training`` is otherwise ``False``. 
+ + Examples:: + + >>> # xdoctest: +SKIP + >>> # With Learnable Parameters + >>> m = nn.SyncBatchNorm(100) + >>> # creating process group (optional) + >>> # ranks is a list of int identifying rank ids. + >>> ranks = list(range(8)) + >>> r1, r2 = ranks[:4], ranks[4:] + >>> # Note: every rank calls into new_group for every + >>> # process group created, even if that rank is not + >>> # part of the group. + >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]] + >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1] + >>> # Without Learnable Parameters + >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group) + >>> input = torch.randn(20, 100, 35, 45, 10) + >>> output = m(input) + + >>> # network is nn.BatchNorm layer + >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group) + >>> # only single gpu per process is currently supported + >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel( + >>> sync_bn_network, + >>> device_ids=[args.local_rank], + >>> output_device=args.local_rank) + """ + + def __init__( + self, + num_features: int, + eps: float = 1e-5, + momentum: float = 0.1, + affine: bool = True, + track_running_stats: bool = True, + process_group: Optional[Any] = None, + device=None, + dtype=None + ) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__( + num_features, eps, momentum, affine, track_running_stats, **factory_kwargs + ) + self.process_group = process_group + + def _check_input_dim(self, input): + if input.dim() < 2: + raise ValueError( + f"expected at least 2D input (got {input.dim()}D input)" + ) + + def _check_non_zero_input_channels(self, input): + if input.size(1) == 0: + raise ValueError( + "SyncBatchNorm number of input channels should be non-zero" + ) + + def forward(self, input: Tensor) -> Tensor: + self._check_input_dim(input) + self._check_non_zero_input_channels(input) + + # exponential_average_factor is set to 
self.momentum + # (when it is available) only so that it gets updated + # in ONNX graph when this node is exported to ONNX. + if self.momentum is None: + exponential_average_factor = 0.0 + else: + exponential_average_factor = self.momentum + + if self.training and self.track_running_stats: + assert self.num_batches_tracked is not None + self.num_batches_tracked.add_(1) + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / self.num_batches_tracked.item() + else: # use exponential moving average + exponential_average_factor = self.momentum + + r""" + Decide whether the mini-batch stats should be used for normalization rather than the buffers. + Mini-batch stats are used in training mode, and in eval mode when buffers are None. + """ + if self.training: + bn_training = True + else: + bn_training = (self.running_mean is None) and (self.running_var is None) + + r""" + Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be + passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are + used for normalization (i.e. in eval mode when buffers are not None). + """ + # If buffers are not to be tracked, ensure that they won't be updated + running_mean = ( + self.running_mean if not self.training or self.track_running_stats else None + ) + running_var = ( + self.running_var if not self.training or self.track_running_stats else None + ) + + # Don't sync batchnorm stats in inference mode (model.eval()). 
+ need_sync = (bn_training and self.training and + torch.distributed.is_available() and torch.distributed.is_initialized()) + if need_sync: + # currently only GPU/PrivateUse1 input is supported + if input.device.type not in ["cuda", torch._C._get_privateuse1_backend_name()]: + raise ValueError("SyncBatchNorm expected input tensor to be on GPU or " + f"{torch._C._get_privateuse1_backend_name()}") + + process_group = torch.distributed.group.WORLD + if self.process_group: + process_group = self.process_group + world_size = torch.distributed.get_world_size(process_group) + need_sync = world_size > 1 + + # fallback to framework BN when synchronization is not necessary + if not need_sync: + return F.batch_norm( + input, + running_mean, + running_var, + self.weight, + self.bias, + bn_training, + exponential_average_factor, + self.eps, + ) + else: + assert bn_training + return sync_batch_norm.apply( + input, + self.weight, + self.bias, + running_mean, + running_var, + self.eps, + exponential_average_factor, + process_group, # type: ignore[possibly-undefined] + world_size, # type: ignore[possibly-undefined] + ) + + @classmethod + def convert_sync_batchnorm(cls, module, process_group=None): + r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers. + + Args: + module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers + process_group (optional): process group to scope synchronization, + default is the whole world + + Returns: + The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm` + layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer, + a new :class:`torch.nn.SyncBatchNorm` layer object will be returned + instead. 
+ + Example:: + + >>> # Network with nn.BatchNorm layer + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> module = torch.nn.Sequential( + >>> torch.nn.Linear(20, 100), + >>> torch.nn.BatchNorm1d(100), + >>> ).cuda() + >>> # creating process group (optional) + >>> # ranks is a list of int identifying rank ids. + >>> ranks = list(range(8)) + >>> r1, r2 = ranks[:4], ranks[4:] + >>> # Note: every rank calls into new_group for every + >>> # process group created, even if that rank is not + >>> # part of the group. + >>> # xdoctest: +SKIP("distributed") + >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]] + >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1] + >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group) + + """ + module_output = module + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): + module_output = torch.nn.SyncBatchNorm( + module.num_features, + module.eps, + module.momentum, + module.affine, + module.track_running_stats, + process_group, + ) + if module.affine: + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + module_output.training = module.training + if hasattr(module, "qconfig"): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module( + name, cls.convert_sync_batchnorm(child, process_group) + ) + del module + return module_output diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/channelshuffle.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/channelshuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..d098fdc68ca872d73ddec423aa38e4a928cacfb9 --- /dev/null +++ 
class ChannelShuffle(Module):
    r"""Divide and rearrange the channels in a tensor.

    Given an input of shape :math:`(*, C, H, W)`, the channels are split into
    ``groups`` groups — conceptually viewed as :math:`(*, \frac{C}{g}, g, H, W)`
    — and transposed, while the overall tensor shape is preserved.

    Args:
        groups (int): number of groups to divide channels in.

    Examples::

        >>> # xdoctest: +IGNORE_WANT("FIXME: incorrect want")
        >>> channel_shuffle = nn.ChannelShuffle(2)
        >>> input = torch.arange(1., 17.).view(1, 4, 2, 2)
        >>> output = channel_shuffle(input)
    """

    __constants__ = ['groups']
    groups: int

    def __init__(self, groups: int) -> None:
        super().__init__()
        self.groups = groups

    def forward(self, input: Tensor) -> Tensor:
        # Delegate to the functional implementation.
        shuffled = F.channel_shuffle(input, self.groups)
        return shuffled

    def extra_repr(self) -> str:
        return f'groups={self.groups}'
class Container(Module):
    """Deprecated pre-``nn.Module`` container; subclass :class:`Module` directly instead."""

    def __init__(self, **kwargs: Any) -> None:
        super().__init__()
        # DeprecationWarning is ignored by default
        warnings.warn("nn.Container is deprecated. All of it's functionality "
                      "is now implemented in nn.Module. Subclass that instead.")
        # Register every keyword argument as a named submodule.
        for name, submodule in kwargs.items():
            self.add_module(name, submodule)
On the other hand, + the layers in a ``Sequential`` are connected in a cascading way. + + Example:: + + # Using Sequential to create a small model. When `model` is run, + # input will first be passed to `Conv2d(1,20,5)`. The output of + # `Conv2d(1,20,5)` will be used as the input to the first + # `ReLU`; the output of the first `ReLU` will become the input + # for `Conv2d(20,64,5)`. Finally, the output of + # `Conv2d(20,64,5)` will be used as input to the second `ReLU` + model = nn.Sequential( + nn.Conv2d(1,20,5), + nn.ReLU(), + nn.Conv2d(20,64,5), + nn.ReLU() + ) + + # Using Sequential with OrderedDict. This is functionally the + # same as the above code + model = nn.Sequential(OrderedDict([ + ('conv1', nn.Conv2d(1,20,5)), + ('relu1', nn.ReLU()), + ('conv2', nn.Conv2d(20,64,5)), + ('relu2', nn.ReLU()) + ])) + """ + + _modules: Dict[str, Module] # type: ignore[assignment] + + @overload + def __init__(self, *args: Module) -> None: + ... + + @overload + def __init__(self, arg: 'OrderedDict[str, Module]') -> None: + ... 
+ + def __init__(self, *args): + super().__init__() + if len(args) == 1 and isinstance(args[0], OrderedDict): + for key, module in args[0].items(): + self.add_module(key, module) + else: + for idx, module in enumerate(args): + self.add_module(str(idx), module) + + def _get_item_by_idx(self, iterator, idx) -> T: # type: ignore[misc, type-var] + """Get the idx-th item of the iterator.""" + size = len(self) + idx = operator.index(idx) + if not -size <= idx < size: + raise IndexError(f'index {idx} is out of range') + idx %= size + return next(islice(iterator, idx, None)) + + @_copy_to_script_wrapper + def __getitem__(self, idx: Union[slice, int]) -> Union['Sequential', T]: + if isinstance(idx, slice): + return self.__class__(OrderedDict(list(self._modules.items())[idx])) + else: + return self._get_item_by_idx(self._modules.values(), idx) + + def __setitem__(self, idx: int, module: Module) -> None: + key: str = self._get_item_by_idx(self._modules.keys(), idx) + return setattr(self, key, module) + + def __delitem__(self, idx: Union[slice, int]) -> None: + if isinstance(idx, slice): + for key in list(self._modules.keys())[idx]: + delattr(self, key) + else: + key = self._get_item_by_idx(self._modules.keys(), idx) + delattr(self, key) + # To preserve numbering + str_indices = [str(i) for i in range(len(self._modules))] + self._modules = OrderedDict(list(zip(str_indices, self._modules.values()))) + + @_copy_to_script_wrapper + def __len__(self) -> int: + return len(self._modules) + + def __add__(self, other) -> 'Sequential': + if isinstance(other, Sequential): + ret = Sequential() + for layer in self: + ret.append(layer) + for layer in other: + ret.append(layer) + return ret + else: + raise ValueError('add operator supports only objects ' + f'of Sequential class, but {str(type(other))} is given.') + + def pop(self, key: Union[int, slice]) -> Module: + v = self[key] + del self[key] + return v + + def __iadd__(self, other) -> Self: + if isinstance(other, Sequential): + offset 
= len(self) + for i, module in enumerate(other): + self.add_module(str(i + offset), module) + return self + else: + raise ValueError('add operator supports only objects ' + f'of Sequential class, but {str(type(other))} is given.') + + def __mul__(self, other: int) -> 'Sequential': + if not isinstance(other, int): + raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}") + elif (other <= 0): + raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}") + else: + combined = Sequential() + offset = 0 + for _ in range(other): + for module in self: + combined.add_module(str(offset), module) + offset += 1 + return combined + + def __rmul__(self, other: int) -> 'Sequential': + return self.__mul__(other) + + def __imul__(self, other: int) -> Self: + if not isinstance(other, int): + raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}") + elif (other <= 0): + raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}") + else: + len_original = len(self) + offset = len(self) + for _ in range(other - 1): + for i in range(len_original): + self.add_module(str(i + offset), self._modules[str(i)]) + offset += len_original + return self + + @_copy_to_script_wrapper + def __dir__(self): + keys = super().__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + @_copy_to_script_wrapper + def __iter__(self) -> Iterator[Module]: + return iter(self._modules.values()) + + # NB: We can't really type check this function as the type of input + # may change dynamically (as is tested in + # TestScript.test_sequential_intermediary_types). Cannot annotate + # with Any as TorchScript expects a more precise type + def forward(self, input): + for module in self: + input = module(input) + return input + + def append(self, module: Module) -> 'Sequential': + r"""Append a given module to the end. 
+ + Args: + module (nn.Module): module to append + """ + self.add_module(str(len(self)), module) + return self + + def insert(self, index: int, module: Module) -> 'Sequential': + if not isinstance(module, Module): + raise AssertionError( + f'module should be of type: {Module}') + n = len(self._modules) + if not (-n <= index <= n): + raise IndexError( + f'Index out of range: {index}') + if index < 0: + index += n + for i in range(n, index, -1): + self._modules[str(i)] = self._modules[str(i - 1)] + self._modules[str(index)] = module + return self + + def extend(self, sequential) -> 'Sequential': + for layer in sequential: + self.append(layer) + return self + + +class ModuleList(Module): + r"""Holds submodules in a list. + + :class:`~torch.nn.ModuleList` can be indexed like a regular Python list, but + modules it contains are properly registered, and will be visible by all + :class:`~torch.nn.Module` methods. + + Args: + modules (iterable, optional): an iterable of modules to add + + Example:: + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)]) + + def forward(self, x): + # ModuleList can act as an iterable, or be indexed using ints + for i, l in enumerate(self.linears): + x = self.linears[i // 2](x) + l(x) + return x + """ + + _modules: Dict[str, Module] # type: ignore[assignment] + + def __init__(self, modules: Optional[Iterable[Module]] = None) -> None: + super().__init__() + if modules is not None: + self += modules + + def _get_abs_string_index(self, idx): + """Get the absolute index for the list of modules.""" + idx = operator.index(idx) + if not (-len(self) <= idx < len(self)): + raise IndexError(f'index {idx} is out of range') + if idx < 0: + idx += len(self) + return str(idx) + + @_copy_to_script_wrapper + def __getitem__(self, idx: Union[int, slice]) -> Union[Module, 'ModuleList']: + if isinstance(idx, slice): + return 
self.__class__(list(self._modules.values())[idx]) + else: + return self._modules[self._get_abs_string_index(idx)] + + def __setitem__(self, idx: int, module: Module) -> None: + idx = self._get_abs_string_index(idx) + return setattr(self, str(idx), module) + + def __delitem__(self, idx: Union[int, slice]) -> None: + if isinstance(idx, slice): + for k in range(len(self._modules))[idx]: + delattr(self, str(k)) + else: + delattr(self, self._get_abs_string_index(idx)) + # To preserve numbering, self._modules is being reconstructed with modules after deletion + str_indices = [str(i) for i in range(len(self._modules))] + self._modules = OrderedDict(list(zip(str_indices, self._modules.values()))) + + @_copy_to_script_wrapper + def __len__(self) -> int: + return len(self._modules) + + @_copy_to_script_wrapper + def __iter__(self) -> Iterator[Module]: + return iter(self._modules.values()) + + def __iadd__(self, modules: Iterable[Module]) -> Self: + return self.extend(modules) + + def __add__(self, other: Iterable[Module]) -> 'ModuleList': + combined = ModuleList() + for i, module in enumerate(chain(self, other)): + combined.add_module(str(i), module) + return combined + + def __repr__(self): + """Return a custom repr for ModuleList that compresses repeated module representations.""" + list_of_reprs = [repr(item) for item in self] + if len(list_of_reprs) == 0: + return self._get_name() + '()' + + start_end_indices = [[0, 0]] + repeated_blocks = [list_of_reprs[0]] + for i, r in enumerate(list_of_reprs[1:], 1): + if r == repeated_blocks[-1]: + start_end_indices[-1][1] += 1 + continue + + start_end_indices.append([i, i]) + repeated_blocks.append(r) + + lines = [] + main_str = self._get_name() + '(' + for (start_id, end_id), b in zip(start_end_indices, repeated_blocks): + local_repr = f"({start_id}): {b}" # default repr + + if start_id != end_id: + n = end_id - start_id + 1 + local_repr = f"({start_id}-{end_id}): {n} x {b}" + + local_repr = _addindent(local_repr, 2) + 
lines.append(local_repr) + + main_str += '\n ' + '\n '.join(lines) + '\n' + main_str += ')' + return main_str + + @_copy_to_script_wrapper + def __dir__(self): + keys = super().__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + def insert(self, index: int, module: Module) -> None: + r"""Insert a given module before a given index in the list. + + Args: + index (int): index to insert. + module (nn.Module): module to insert + """ + for i in range(len(self._modules), index, -1): + self._modules[str(i)] = self._modules[str(i - 1)] + self._modules[str(index)] = module + + def append(self, module: Module) -> 'ModuleList': + r"""Append a given module to the end of the list. + + Args: + module (nn.Module): module to append + """ + self.add_module(str(len(self)), module) + return self + + def pop(self, key: Union[int, slice]) -> Module: + v = self[key] + del self[key] + return v + + def extend(self, modules: Iterable[Module]) -> Self: + r"""Append modules from a Python iterable to the end of the list. + + Args: + modules (iterable): iterable of modules to append + """ + if not isinstance(modules, container_abcs.Iterable): + raise TypeError("ModuleList.extend should be called with an " + "iterable, but got " + type(modules).__name__) + offset = len(self) + for i, module in enumerate(modules): + self.add_module(str(offset + i), module) + return self + + # remove forward alltogether to fallback on Module's _forward_unimplemented + + +class ModuleDict(Module): + r"""Holds submodules in a dictionary. + + :class:`~torch.nn.ModuleDict` can be indexed like a regular Python dictionary, + but modules it contains are properly registered, and will be visible by all + :class:`~torch.nn.Module` methods. 
+ + :class:`~torch.nn.ModuleDict` is an **ordered** dictionary that respects + + * the order of insertion, and + + * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged + ``OrderedDict``, ``dict`` (started from Python 3.6) or another + :class:`~torch.nn.ModuleDict` (the argument to + :meth:`~torch.nn.ModuleDict.update`). + + Note that :meth:`~torch.nn.ModuleDict.update` with other unordered mapping + types (e.g., Python's plain ``dict`` before Python version 3.6) does not + preserve the order of the merged mapping. + + Args: + modules (iterable, optional): a mapping (dictionary) of (string: module) + or an iterable of key-value pairs of type (string, module) + + Example:: + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.choices = nn.ModuleDict({ + 'conv': nn.Conv2d(10, 10, 3), + 'pool': nn.MaxPool2d(3) + }) + self.activations = nn.ModuleDict([ + ['lrelu', nn.LeakyReLU()], + ['prelu', nn.PReLU()] + ]) + + def forward(self, x, choice, act): + x = self.choices[choice](x) + x = self.activations[act](x) + return x + """ + + _modules: Dict[str, Module] # type: ignore[assignment] + + def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None: + super().__init__() + if modules is not None: + self.update(modules) + + @_copy_to_script_wrapper + def __getitem__(self, key: str) -> Module: + return self._modules[key] + + def __setitem__(self, key: str, module: Module) -> None: + self.add_module(key, module) + + def __delitem__(self, key: str) -> None: + del self._modules[key] + + @_copy_to_script_wrapper + def __len__(self) -> int: + return len(self._modules) + + @_copy_to_script_wrapper + def __iter__(self) -> Iterator[str]: + return iter(self._modules) + + @_copy_to_script_wrapper + def __contains__(self, key: str) -> bool: + return key in self._modules + + def clear(self) -> None: + """Remove all items from the ModuleDict.""" + self._modules.clear() + + def pop(self, key: str) -> Module: + r"""Remove key from the 
ModuleDict and return its module. + + Args: + key (str): key to pop from the ModuleDict + """ + v = self[key] + del self[key] + return v + + @_copy_to_script_wrapper + def keys(self) -> Iterable[str]: + r"""Return an iterable of the ModuleDict keys.""" + return self._modules.keys() + + @_copy_to_script_wrapper + def items(self) -> Iterable[Tuple[str, Module]]: + r"""Return an iterable of the ModuleDict key/value pairs.""" + return self._modules.items() + + @_copy_to_script_wrapper + def values(self) -> Iterable[Module]: + r"""Return an iterable of the ModuleDict values.""" + return self._modules.values() + + def update(self, modules: Mapping[str, Module]) -> None: + r"""Update the :class:`~torch.nn.ModuleDict` with key-value pairs from a mapping, overwriting existing keys. + + .. note:: + If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or + an iterable of key-value pairs, the order of new elements in it is preserved. + + Args: + modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`, + or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`) + """ + if not isinstance(modules, container_abcs.Iterable): + raise TypeError("ModuleDict.update should be called with an " + "iterable of key/value pairs, but got " + + type(modules).__name__) + + if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)): + for key, module in modules.items(): + self[key] = module + else: + # modules here can be a list with two items + for j, m in enumerate(modules): + if not isinstance(m, container_abcs.Iterable): + raise TypeError("ModuleDict update sequence element " + "#" + str(j) + " should be Iterable; is" + + type(m).__name__) + if not len(m) == 2: + raise ValueError("ModuleDict update sequence element " + "#" + str(j) + " has length " + str(len(m)) + + "; 2 is required") + # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)] + # that's too cumbersome to 
type correctly with overloads, so we add an ignore here + self[m[0]] = m[1] # type: ignore[assignment] + + # remove forward alltogether to fallback on Module's _forward_unimplemented + + +class ParameterList(Module): + r"""Holds parameters in a list. + + :class:`~torch.nn.ParameterList` can be used like a regular Python + list, but Tensors that are :class:`~torch.nn.Parameter` are properly registered, + and will be visible by all :class:`~torch.nn.Module` methods. + + Note that the constructor, assigning an element of the list, the + :meth:`~torch.nn.ParameterDict.append` method and the :meth:`~torch.nn.ParameterDict.extend` + method will convert any :class:`~torch.Tensor` into :class:`~torch.nn.Parameter`. + + Args: + parameters (iterable, optional): an iterable of elements to add to the list. + + Example:: + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)]) + + def forward(self, x): + # ParameterList can act as an iterable, or be indexed using ints + for i, p in enumerate(self.params): + x = self.params[i // 2].mm(x) + p.mm(x) + return x + """ + + def __init__(self, values: Optional[Iterable[Any]] = None) -> None: + super().__init__() + self._size = 0 + if values is not None: + self += values + + def _get_abs_string_index(self, idx): + """Get the absolute index for the list of modules.""" + idx = operator.index(idx) + if not (-len(self) <= idx < len(self)): + raise IndexError(f'index {idx} is out of range') + if idx < 0: + idx += len(self) + return str(idx) + + @overload + def __getitem__(self, idx: int) -> Any: + ... + + @overload + def __getitem__(self: T, idx: slice) -> T: + ... 
+ + def __getitem__(self, idx): + if isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + out = self.__class__() + for i in range(start, stop, step): + out.append(self[i]) + return out + else: + idx = self._get_abs_string_index(idx) + return getattr(self, str(idx)) + + def __setitem__(self, idx: int, param: Any) -> None: + # Note that all other function that add an entry to the list part of + # the ParameterList end up here. So this is the only place where we need + # to wrap things into Parameter if needed. + # Objects added via setattr() are not in the list part and thus won't + # call into this function. + idx = self._get_abs_string_index(idx) + if isinstance(param, torch.Tensor) and not isinstance(param, Parameter): + param = Parameter(param) + return setattr(self, str(idx), param) + + def __len__(self) -> int: + return self._size + + def __iter__(self) -> Iterator[Any]: + return iter(self[i] for i in range(len(self))) + + def __iadd__(self, parameters: Iterable[Any]) -> Self: + return self.extend(parameters) + + def __dir__(self): + keys = super().__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + def append(self, value: Any) -> 'ParameterList': + """Append a given value at the end of the list. + + Args: + value (Any): value to append + """ + new_idx = len(self) + self._size += 1 + self[new_idx] = value + return self + + def extend(self, values: Iterable[Any]) -> Self: + """Append values from a Python iterable to the end of the list. 
+ + Args: + values (iterable): iterable of values to append + """ + # Tensor is an iterable but we never want to unpack it here + if not isinstance(values, container_abcs.Iterable) or isinstance(values, torch.Tensor): + raise TypeError("ParameterList.extend should be called with an " + "iterable, but got " + type(values).__name__) + for value in values: + self.append(value) + return self + + def extra_repr(self) -> str: + child_lines = [] + for k, p in enumerate(self): + if isinstance(p, torch.Tensor): + size_str = 'x'.join(str(size) for size in p.size()) + if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]: + device_str = f' ({p.device})' + else: + device_str = '' + parastr = '{} containing: [{} of size {}{}]'.format( + "Parameter" if isinstance(p, Parameter) else "Tensor", + p.dtype, size_str, device_str) + child_lines.append(' (' + str(k) + '): ' + parastr) + else: + child_lines.append(' (' + str(k) + '): Object of type: ' + type(p).__name__) + + tmpstr = '\n'.join(child_lines) + return tmpstr + + def __call__(self, *args, **kwargs): + raise RuntimeError('ParameterList should not be called.') + + +class ParameterDict(Module): + r"""Holds parameters in a dictionary. + + ParameterDict can be indexed like a regular Python dictionary, but Parameters it + contains are properly registered, and will be visible by all Module methods. + Other objects are treated as would be done by a regular Python dictionary + + :class:`~torch.nn.ParameterDict` is an **ordered** dictionary. + :meth:`~torch.nn.ParameterDict.update` with other unordered mapping + types (e.g., Python's plain ``dict``) does not preserve the order of the + merged mapping. On the other hand, ``OrderedDict`` or another :class:`~torch.nn.ParameterDict` + will preserve their ordering. + + Note that the constructor, assigning an element of the dictionary and the + :meth:`~torch.nn.ParameterDict.update` method will convert any :class:`~torch.Tensor` into + :class:`~torch.nn.Parameter`. 
+ + Args: + values (iterable, optional): a mapping (dictionary) of + (string : Any) or an iterable of key-value pairs + of type (string, Any) + + Example:: + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.params = nn.ParameterDict({ + 'left': nn.Parameter(torch.randn(5, 10)), + 'right': nn.Parameter(torch.randn(5, 10)) + }) + + def forward(self, x, choice): + x = self.params[choice].mm(x) + return x + """ + + def __init__(self, parameters: Any = None) -> None: + super().__init__() + self._keys: Dict[str, None] = {} + if parameters is not None: + self.update(parameters) + + def _key_to_attr(self, key: str) -> str: + if not isinstance(key, str): + raise TypeError("Index given to ParameterDict cannot be used as a key as it is " + f"not a string (type is '{type(key).__name__}'). Open an issue on " + "github if you need non-string keys.") + else: + # Use the key as-is so that `.named_parameters()` returns the right thing + return key + + def __getitem__(self, key: str) -> Any: + attr = self._key_to_attr(key) + return getattr(self, attr) + + def __setitem__(self, key: str, value: Any) -> None: + # Note that all other function that add an entry to the dictionary part of + # the ParameterDict end up here. So this is the only place where we need + # to wrap things into Parameter if needed. + # Objects added via setattr() are not in the dictionary part and thus won't + # call into this function. 
+ self._keys[key] = None + attr = self._key_to_attr(key) + if isinstance(value, torch.Tensor) and not isinstance(value, Parameter): + value = Parameter(value) + setattr(self, attr, value) + + def __delitem__(self, key: str) -> None: + del self._keys[key] + attr = self._key_to_attr(key) + delattr(self, attr) + + def __len__(self) -> int: + return len(self._keys) + + def __iter__(self) -> Iterator[str]: + return iter(self._keys) + + def __reversed__(self) -> Iterator[str]: + return reversed(list(self._keys)) + + def copy(self) -> 'ParameterDict': + """Return a copy of this :class:`~torch.nn.ParameterDict` instance.""" + # We have to use an OrderedDict because the ParameterDict constructor + # behaves differently on plain dict vs OrderedDict + return ParameterDict(OrderedDict((k, self[k]) for k in self._keys)) + + def __contains__(self, key: str) -> bool: + return key in self._keys + + def setdefault(self, key: str, default: Optional[Any] = None) -> Any: + """Set the default for a key in the Parameterdict. + + If key is in the ParameterDict, return its value. + If not, insert `key` with a parameter `default` and return `default`. + `default` defaults to `None`. + + Args: + key (str): key to set default for + default (Any): the parameter set to the key + """ + if key not in self: + self[key] = default + return self[key] + + def clear(self) -> None: + """Remove all items from the ParameterDict.""" + for k in self._keys.copy(): + del self[k] + + def pop(self, key: str) -> Any: + r"""Remove key from the ParameterDict and return its parameter. 
+ + Args: + key (str): key to pop from the ParameterDict + """ + v = self[key] + del self[key] + return v + + def popitem(self) -> Tuple[str, Any]: + """Remove and return the last inserted `(key, parameter)` pair from the ParameterDict.""" + k, _ = self._keys.popitem() + # We need the key in the _keys to be able to access/del + self._keys[k] = None + val = self[k] + del self[k] + return k, val + + def get(self, key: str, default: Optional[Any] = None) -> Any: + r"""Return the parameter associated with key if present. Otherwise return default if provided, None if not. + + Args: + key (str): key to get from the ParameterDict + default (Parameter, optional): value to return if key not present + """ + return self[key] if key in self else default + + def fromkeys(self, keys: Iterable[str], default: Optional[Any] = None) -> 'ParameterDict': + r"""Return a new ParameterDict with the keys provided. + + Args: + keys (iterable, string): keys to make the new ParameterDict from + default (Parameter, optional): value to set for all keys + """ + return ParameterDict((k, default) for k in keys) + + def keys(self) -> Iterable[str]: + r"""Return an iterable of the ParameterDict keys.""" + return self._keys.keys() + + def items(self) -> Iterable[Tuple[str, Any]]: + r"""Return an iterable of the ParameterDict key/value pairs.""" + return ((k, self[k]) for k in self._keys) + + def values(self) -> Iterable[Any]: + r"""Return an iterable of the ParameterDict values.""" + return (self[k] for k in self._keys) + + def update(self, parameters: Union[Mapping[str, Any], 'ParameterDict']) -> None: + r"""Update the :class:`~torch.nn.ParameterDict` with key-value pairs from ``parameters``, overwriting existing keys. + + .. note:: + If :attr:`parameters` is an ``OrderedDict``, a :class:`~torch.nn.ParameterDict`, or + an iterable of key-value pairs, the order of new elements in it is preserved. 
+ + Args: + parameters (iterable): a mapping (dictionary) from string to + :class:`~torch.nn.Parameter`, or an iterable of + key-value pairs of type (string, :class:`~torch.nn.Parameter`) + """ + if not isinstance(parameters, container_abcs.Iterable): + raise TypeError("ParametersDict.update should be called with an " + "iterable of key/value pairs, but got " + + type(parameters).__name__) + + if isinstance(parameters, (OrderedDict, ParameterDict)): + for key, parameter in parameters.items(): + self[key] = parameter + elif isinstance(parameters, container_abcs.Mapping): + for key, parameter in sorted(parameters.items()): + self[key] = parameter + else: + for j, p in enumerate(parameters): + if not isinstance(p, container_abcs.Iterable): + raise TypeError("ParameterDict update sequence element " + "#" + str(j) + " should be Iterable; is" + + type(p).__name__) + if not len(p) == 2: + raise ValueError("ParameterDict update sequence element " + "#" + str(j) + " has length " + str(len(p)) + + "; 2 is required") + # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment + self[p[0]] = p[1] # type: ignore[assignment] + + def extra_repr(self) -> str: + child_lines = [] + for k, p in self.items(): + if isinstance(p, torch.Tensor): + size_str = 'x'.join(str(size) for size in p.size()) + if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]: + device_str = f' ({p.device})' + else: + device_str = '' + parastr = '{} containing: [{} of size {}{}]'.format( + "Parameter" if isinstance(p, Parameter) else "Tensor", + torch.typename(p), size_str, device_str) + child_lines.append(' (' + str(k) + '): ' + parastr) + else: + child_lines.append(' (' + str(k) + '): Object of type: ' + type(p).__name__) + tmpstr = '\n'.join(child_lines) + return tmpstr + + def __call__(self, input): + raise RuntimeError('ParameterDict should not be called.') + + def __or__(self, other: 'ParameterDict') -> 'ParameterDict': + copy = self.copy() + copy.update(other) 
class _DropoutNd(Module):
    """Base class for the dropout family.

    Holds the drop probability ``p`` and the ``inplace`` flag shared by all
    concrete dropout modules, and validates ``p`` on construction.
    """

    __constants__ = ['p', 'inplace']
    p: float
    inplace: bool

    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
        super().__init__()
        # Reject probabilities outside [0, 1] immediately.
        if p > 1 or p < 0:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p
        self.inplace = inplace

    def extra_repr(self) -> str:
        return 'p={}, inplace={}'.format(self.p, self.inplace)


class Dropout(_DropoutNd):
    """Element-wise dropout.

    During training each element of the input is zeroed independently with
    probability :attr:`p` (fresh Bernoulli samples on every forward call),
    and the surviving elements are scaled by ``1 / (1 - p)``.  In eval mode
    the module is the identity.

    Args:
        p: probability of an element to be zeroed. Default: 0.5
        inplace: If set to ``True``, will do this operation in-place. Default: ``False``

    Shape:
        - Input: any shape.
        - Output: same shape as the input.

    Reference: `Improving neural networks by preventing co-adaptation of
    feature detectors <https://arxiv.org/abs/1207.0580>`_.
    """

    def forward(self, input: Tensor) -> Tensor:
        # `self.training` switches between drop-and-rescale and identity.
        return F.dropout(input, p=self.p, training=self.training, inplace=self.inplace)
class Dropout1d(_DropoutNd):
    """Channel-wise dropout for 1D feature maps.

    Zeroes whole channels (each a 1D slice ``input[i, j]``) independently
    with probability :attr:`p` on every training-mode forward call, as
    proposed in `Efficient Object Localization Using Convolutional Networks
    <https://arxiv.org/abs/1411.4280>`_.  Typically applied after
    :class:`nn.Conv1d`, where adjacent activations are strongly correlated
    and element-wise dropout would not regularize effectively.

    Shape:
        - Input: ``(N, C, L)`` or ``(C, L)``.
        - Output: same shape as the input.
    """

    def forward(self, input: Tensor) -> Tensor:
        return F.dropout1d(input, p=self.p, training=self.training, inplace=self.inplace)


class Dropout2d(_DropoutNd):
    """Channel-wise dropout for 2D feature maps.

    Zeroes whole channels (each a 2D slice ``input[i, j]``) independently
    with probability :attr:`p` on every training-mode forward call;
    typically used after :class:`nn.Conv2d`.

    .. warning::
        For historical reasons a 3D input is treated like
        :class:`nn.Dropout1d` (1D channel-wise dropout); inputs without a
        batch dimension of shape ``(C, H, W)`` are currently NOT supported.

    Shape:
        - Input: ``(N, C, H, W)`` or ``(N, C, L)``.
        - Output: same shape as the input.
    """

    def forward(self, input: Tensor) -> Tensor:
        return F.dropout2d(input, p=self.p, training=self.training, inplace=self.inplace)
class Dropout3d(_DropoutNd):
    """Channel-wise dropout for 3D feature maps.

    Zeroes whole channels (each a 3D slice ``input[i, j]``) independently
    with probability :attr:`p` on every training-mode forward call, as
    proposed in `Efficient Object Localization Using Convolutional Networks
    <https://arxiv.org/abs/1411.4280>`_.  Typically used after
    :class:`nn.Conv3d`.

    Shape:
        - Input: ``(N, C, D, H, W)`` or ``(C, D, H, W)``.
        - Output: same shape as the input.
    """

    def forward(self, input: Tensor) -> Tensor:
        return F.dropout3d(input, p=self.p, training=self.training, inplace=self.inplace)


class AlphaDropout(_DropoutNd):
    """Dropout variant that preserves self-normalization.

    Masks elements with probability :attr:`p` during training, then scales
    and shifts the result so that a zero-mean, unit-variance input keeps
    zero mean and unit variance — the property SELU activations rely on
    (see `Self-Normalizing Neural Networks
    <https://arxiv.org/abs/1706.02515>`_).  In eval mode the module is the
    identity.

    Args:
        p (float): probability of an element to be dropped. Default: 0.5
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: any shape.
        - Output: same shape as the input.
    """

    def forward(self, input: Tensor) -> Tensor:
        # NOTE: the functional op takes no inplace argument here, so
        # `self.inplace` is intentionally not forwarded (matches upstream).
        return F.alpha_dropout(input, p=self.p, training=self.training)
class FeatureAlphaDropout(_DropoutNd):
    """Channel-wise Alpha Dropout.

    Randomly masks entire channels (each a feature-map slice
    ``input[i, j]``) with probability :attr:`p`; instead of zeroing, masked
    activations are set to the negative saturation value of the SELU
    function, and the output is scaled/shifted to keep zero mean and unit
    variance (see `Self-Normalizing Neural Networks
    <https://arxiv.org/abs/1706.02515>`_ and `Efficient Object Localization
    Using Convolutional Networks <https://arxiv.org/abs/1411.4280>`_).

    Args:
        p (float, optional): probability of an element to be zeroed. Default: 0.5
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: ``(N, C, D, H, W)`` or ``(C, D, H, W)``.
        - Output: same shape as the input.
    """

    def forward(self, input: Tensor) -> Tensor:
        # `self.inplace` is intentionally not forwarded (matches upstream).
        return F.feature_alpha_dropout(input, p=self.p, training=self.training)
class Flatten(Module):
    """Flattens a contiguous range of dims into a single dimension.

    Module wrapper around :meth:`torch.flatten`, for use inside
    :class:`~nn.Sequential`.

    Args:
        start_dim: first dim to flatten (default = 1).
        end_dim: last dim to flatten (default = -1).

    Examples::
        >>> input = torch.randn(32, 1, 5, 5)
        >>> nn.Flatten()(input).size()
        torch.Size([32, 25])
        >>> nn.Flatten(0, 2)(input).size()
        torch.Size([160, 5])
    """

    __constants__ = ['start_dim', 'end_dim']
    start_dim: int
    end_dim: int

    def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
        super().__init__()
        self.end_dim = end_dim
        self.start_dim = start_dim

    def forward(self, input: Tensor) -> Tensor:
        return input.flatten(start_dim=self.start_dim, end_dim=self.end_dim)

    def extra_repr(self) -> str:
        return 'start_dim={}, end_dim={}'.format(self.start_dim, self.end_dim)
class Unflatten(Module):
    """Expands one dimension of the input into a desired shape.

    For use with :class:`~nn.Sequential`.  ``dim`` may be an `int` (plain
    tensors) or a `str` (named tensors); ``unflattened_size`` is
    correspondingly a tuple/list of ints (or :class:`torch.Size`), or a
    `NamedShape` — a tuple of ``(name, size)`` pairs.

    Args:
        dim (Union[int, str]): Dimension to be unflattened
        unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension

    Examples:
        >>> m = nn.Unflatten(1, (2, 5, 5))
        >>> m(torch.randn(2, 50)).size()
        torch.Size([2, 2, 5, 5])
    """

    NamedShape = Tuple[Tuple[str, int]]

    __constants__ = ['dim', 'unflattened_size']
    dim: Union[int, str]
    unflattened_size: Union[_size, NamedShape]

    def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]) -> None:
        super().__init__()

        # An int dim pairs with an int tuple; a str dim pairs with a named shape.
        if isinstance(dim, int):
            self._require_tuple_int(unflattened_size)
        elif isinstance(dim, str):
            self._require_tuple_tuple(unflattened_size)
        else:
            raise TypeError("invalid argument type for dim parameter")

        self.dim = dim
        self.unflattened_size = unflattened_size

    def _require_tuple_tuple(self, input):
        # Validate a NamedShape: a tuple whose elements are all tuples.
        if not isinstance(input, tuple):
            raise TypeError("unflattened_size must be a tuple of tuples, " +
                            f"but found type {type(input).__name__}")
        for idx, elem in enumerate(input):
            if not isinstance(elem, tuple):
                raise TypeError("unflattened_size must be tuple of tuples, " +
                                f"but found element of type {type(elem).__name__} at pos {idx}")

    def _require_tuple_int(self, input):
        # Validate a plain shape: a tuple/list whose elements are all ints.
        if not isinstance(input, (tuple, list)):
            raise TypeError(f"unflattened_size must be a tuple of ints, but found type {type(input).__name__}")
        for idx, elem in enumerate(input):
            if not isinstance(elem, int):
                raise TypeError("unflattened_size must be tuple of ints, " +
                                f"but found element of type {type(elem).__name__} at pos {idx}")

    def forward(self, input: Tensor) -> Tensor:
        return input.unflatten(self.dim, self.unflattened_size)

    def extra_repr(self) -> str:
        return 'dim={}, unflattened_size={}'.format(self.dim, self.unflattened_size)
class LocalResponseNorm(Module):
    """Local response normalization across channels.

    Normalizes each channel by a power of a sum of squares over ``size``
    neighbouring channels (channels occupy the second input dimension).

    Args:
        size: amount of neighbouring channels used for normalization
        alpha: multiplicative factor. Default: 0.0001
        beta: exponent. Default: 0.75
        k: additive factor. Default: 1

    Shape:
        - Input: ``(N, C, *)``
        - Output: ``(N, C, *)`` (same shape as input)
    """

    __constants__ = ['size', 'alpha', 'beta', 'k']
    size: int
    alpha: float
    beta: float
    k: float

    def __init__(self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.) -> None:
        super().__init__()
        self.size = size
        self.k = k
        self.beta = beta
        self.alpha = alpha

    def forward(self, input: Tensor) -> Tensor:
        return F.local_response_norm(input, self.size, alpha=self.alpha,
                                     beta=self.beta, k=self.k)

    def extra_repr(self):
        return f'{self.size}, alpha={self.alpha}, beta={self.beta}, k={self.k}'


class CrossMapLRN2d(Module):
    """Cross-map LRN over 2D feature maps via the autograd function
    :class:`~torch.nn.modules._functions.CrossMapLRN2d`.  Same parameters
    as :class:`LocalResponseNorm`."""

    size: int
    alpha: float
    beta: float
    k: float

    def __init__(self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1) -> None:
        super().__init__()
        self.size = size
        self.k = k
        self.beta = beta
        self.alpha = alpha

    def forward(self, input: Tensor) -> Tensor:
        return _cross_map_lrn2d.apply(input, self.size, self.alpha, self.beta,
                                      self.k)

    def extra_repr(self) -> str:
        return f'{self.size}, alpha={self.alpha}, beta={self.beta}, k={self.k}'
class LayerNorm(Module):
    """Layer Normalization (`Layer Normalization <https://arxiv.org/abs/1607.06450>`__).

    Normalizes over the last ``D`` dimensions, where ``D`` is the length of
    :attr:`normalized_shape`, using the biased variance estimator
    (equivalent to ``torch.var(input, unbiased=False)``).  When
    :attr:`elementwise_affine` is ``True``, per-element scale ``weight``
    (init 1) and, if :attr:`bias` is ``True``, shift ``bias`` (init 0) are
    learned.  Statistics come from the input itself in both train and eval
    modes.

    Args:
        normalized_shape (int or list or torch.Size): shape of the trailing
            dimensions to normalize over; a single int is treated as a
            one-element list.
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        elementwise_affine: enable learnable per-element affine parameters.
            Default: ``True``.
        bias: if ``False``, no additive bias is learned (only relevant when
            :attr:`elementwise_affine` is ``True``). Default: ``True``.

    Shape:
        - Input: ``(N, *)``
        - Output: ``(N, *)`` (same shape as input)
    """

    __constants__ = ['normalized_shape', 'eps', 'elementwise_affine']
    normalized_shape: Tuple[int, ...]
    eps: float
    elementwise_affine: bool

    def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True,
                 bias: bool = True, device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        if isinstance(normalized_shape, numbers.Integral):
            # A bare int means "normalize over one trailing dim of this size".
            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if not self.elementwise_affine:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        else:
            self.weight = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
            if bias:
                self.bias = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
            else:
                self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self) -> None:
        # weight -> 1, bias -> 0 (only when the affine parameters exist).
        if self.elementwise_affine:
            init.ones_(self.weight)
            if self.bias is not None:
                init.zeros_(self.bias)

    def forward(self, input: Tensor) -> Tensor:
        return F.layer_norm(
            input, self.normalized_shape, self.weight, self.bias, self.eps)

    def extra_repr(self) -> str:
        return (f'{self.normalized_shape}, eps={self.eps}, '
                f'elementwise_affine={self.elementwise_affine}')
class GroupNorm(Module):
    """Group Normalization (`Group Normalization <https://arxiv.org/abs/1803.08494>`__).

    Splits the channel dimension into :attr:`num_groups` groups of
    ``num_channels / num_groups`` channels each and normalizes every group
    separately, using the biased variance estimator.  When :attr:`affine`
    is ``True``, per-channel ``weight`` (init 1) and ``bias`` (init 0) are
    learned.  Statistics come from the input itself in both train and eval
    modes.

    Args:
        num_groups (int): number of groups to separate the channels into
        num_channels (int): number of channels expected in input; must be
            divisible by ``num_groups``
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        affine: enable learnable per-channel affine parameters. Default: ``True``.

    Shape:
        - Input: ``(N, C, *)`` where ``C = num_channels``
        - Output: ``(N, C, *)`` (same shape as input)
    """

    __constants__ = ['num_groups', 'num_channels', 'eps', 'affine']
    num_groups: int
    num_channels: int
    eps: float
    affine: bool

    def __init__(self, num_groups: int, num_channels: int, eps: float = 1e-5, affine: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        if num_channels % num_groups != 0:
            raise ValueError('num_channels must be divisible by num_groups')

        self.num_groups = num_groups
        self.num_channels = num_channels
        self.eps = eps
        self.affine = affine
        if not self.affine:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        else:
            self.weight = Parameter(torch.empty(num_channels, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_channels, **factory_kwargs))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        # weight -> 1, bias -> 0 (only when affine parameters exist).
        if self.affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def forward(self, input: Tensor) -> Tensor:
        return F.group_norm(
            input, self.num_groups, self.weight, self.bias, self.eps)

    def extra_repr(self) -> str:
        return (f'{self.num_groups}, {self.num_channels}, eps={self.eps}, '
                f'affine={self.affine}')
class _CircularPadNd(Module):
    """Base class for circular padding: checks the input rank, then wraps
    the tensor with ``F.pad(..., 'circular')`` using ``self.padding``."""

    __constants__ = ['padding']
    padding: Sequence[int]

    def _check_input_dim(self, input):
        # Concrete subclasses validate the expected dimensionality.
        raise NotImplementedError

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)
        return F.pad(input, self.padding, 'circular')

    def extra_repr(self) -> str:
        return f'{self.padding}'


class CircularPad1d(_CircularPadNd):
    """Circularly pads the last dimension of the input.

    Values from the start of the dimension pad its end and vice versa;
    negative padding removes elements from the ends.

    Args:
        padding (int, tuple): size of the padding; an int pads both sides
            equally, a 2-tuple gives ``(padding_left, padding_right)``.

    Shape:
        - Input: ``(C, W_in)`` or ``(N, C, W_in)``.
        - Output: same, with ``W_out = W_in + padding_left + padding_right``.

    Examples::

        >>> m = nn.CircularPad1d(2)
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
        >>> m(input)
        tensor([[[2., 3., 0., 1., 2., 3., 0., 1.],
                 [6., 7., 4., 5., 6., 7., 4., 5.]]])
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        super().__init__()
        self.padding = _pair(padding)

    def _check_input_dim(self, input):
        if input.dim() not in (2, 3):
            raise ValueError(
                f"expected 2D or 3D input (got {input.dim()}D input)"
            )
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.CircularPad2d(2) + >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input + tensor([[[[0., 1., 2.], + [3., 4., 5.], + [6., 7., 8.]]]]) + >>> m(input) + tensor([[[[4., 5., 3., 4., 5., 3., 4.], + [7., 8., 6., 7., 8., 6., 7.], + [1., 2., 0., 1., 2., 0., 1.], + [4., 5., 3., 4., 5., 3., 4.], + [7., 8., 6., 7., 8., 6., 7.], + [1., 2., 0., 1., 2., 0., 1.], + [4., 5., 3., 4., 5., 3., 4.]]]]) + >>> # using different paddings for different sides + >>> m = nn.CircularPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[5., 3., 4., 5., 3.], + [8., 6., 7., 8., 6.], + [2., 0., 1., 2., 0.], + [5., 3., 4., 5., 3.], + [8., 6., 7., 8., 6.]]]]) + """ + + padding: Tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__() + self.padding = _quadruple(padding) + + def _check_input_dim(self, input): + if input.dim() != 3 and input.dim() != 4: + raise ValueError( + f"expected 3D or 4D input (got {input.dim()}D input)" + ) + + +class CircularPad3d(_CircularPadNd): + r"""Pads the input tensor using circular padding of the input boundary. + + Tensor values at the beginning of the dimension are used to pad the end, + and values at the end are used to pad the beginning. If negative padding is + applied then the ends of the tensor get removed. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. 
If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.CircularPad3d(3) + >>> input = torch.randn(16, 3, 8, 320, 480) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.CircularPad3d((3, 3, 6, 6, 1, 1)) + >>> output = m(input) + """ + + padding: Tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__() + self.padding = _ntuple(6)(padding) + + def _check_input_dim(self, input): + if input.dim() != 4 and input.dim() != 5: + raise ValueError( + f"expected 4D or 5D input (got {input.dim()}D input)" + ) + + +class _ConstantPadNd(Module): + __constants__ = ['padding', 'value'] + value: float + padding: Sequence[int] + + def __init__(self, value: float) -> None: + super().__init__() + self.value = value + + def forward(self, input: Tensor) -> Tensor: + return F.pad(input, self.padding, 'constant', self.value) + + def extra_repr(self) -> str: + return f'padding={self.padding}, value={self.value}' + + +class ConstantPad1d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in both boundaries. 
If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. + - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ConstantPad1d(2, 3.5) + >>> input = torch.randn(1, 2, 4) + >>> input + tensor([[[-1.0491, -0.7152, -0.0749, 0.8530], + [-1.3287, 1.8966, 0.1466, -0.2771]]]) + >>> m(input) + tensor([[[ 3.5000, 3.5000, -1.0491, -0.7152, -0.0749, 0.8530, 3.5000, + 3.5000], + [ 3.5000, 3.5000, -1.3287, 1.8966, 0.1466, -0.2771, 3.5000, + 3.5000]]]) + >>> m = nn.ConstantPad1d(2, 3.5) + >>> input = torch.randn(1, 2, 3) + >>> input + tensor([[[ 1.6616, 1.4523, -1.1255], + [-3.6372, 0.1182, -1.8652]]]) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 1.6616, 1.4523, -1.1255, 3.5000, 3.5000], + [ 3.5000, 3.5000, -3.6372, 0.1182, -1.8652, 3.5000, 3.5000]]]) + >>> # using different paddings for different sides + >>> m = nn.ConstantPad1d((3, 1), 3.5) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 3.5000, 1.6616, 1.4523, -1.1255, 3.5000], + [ 3.5000, 3.5000, 3.5000, -3.6372, 0.1182, -1.8652, 3.5000]]]) + """ + + padding: Tuple[int, int] + + def __init__(self, padding: _size_2_t, value: float): + super().__init__(value) + self.padding = _pair(padding) + + +class ConstantPad2d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ConstantPad2d(2, 3.5) + >>> input = torch.randn(1, 2, 2) + >>> input + tensor([[[ 1.6585, 0.4320], + [-0.8701, -0.4649]]]) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 1.6585, 0.4320, 3.5000, 3.5000], + [ 3.5000, 3.5000, -0.8701, -0.4649, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000]]]) + >>> # using different paddings for different sides + >>> m = nn.ConstantPad2d((3, 0, 2, 1), 3.5) + >>> m(input) + tensor([[[ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 1.6585, 0.4320], + [ 3.5000, 3.5000, 3.5000, -0.8701, -0.4649], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000]]]) + """ + + __constants__ = ['padding', 'value'] + padding: Tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t, value: float) -> None: + super().__init__(value) + self.padding = _quadruple(padding) + + +class ConstantPad3d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ConstantPad3d(3, 3.5) + >>> input = torch.randn(16, 3, 10, 20, 30) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.ConstantPad3d((3, 3, 6, 6, 0, 1), 3.5) + >>> output = m(input) + """ + + padding: Tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t, value: float) -> None: + super().__init__(value) + self.padding = _ntuple(6)(padding) + + +class _ReflectionPadNd(Module): + __constants__ = ['padding'] + padding: Sequence[int] + + def forward(self, input: Tensor) -> Tensor: + return F.pad(input, self.padding, 'reflect') + + def extra_repr(self) -> str: + return f'{self.padding}' + + +class ReflectionPad1d(_ReflectionPadNd): + r"""Pads the input tensor using the reflection of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. 
+ - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ReflectionPad1d(2) + >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles") + >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4) + >>> input + tensor([[[0., 1., 2., 3.], + [4., 5., 6., 7.]]]) + >>> m(input) + tensor([[[2., 1., 0., 1., 2., 3., 2., 1.], + [6., 5., 4., 5., 6., 7., 6., 5.]]]) + >>> # using different paddings for different sides + >>> m = nn.ReflectionPad1d((3, 1)) + >>> m(input) + tensor([[[3., 2., 1., 0., 1., 2., 3., 2.], + [7., 6., 5., 4., 5., 6., 7., 6.]]]) + """ + + padding: Tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__() + self.padding = _pair(padding) + + +class ReflectionPad2d(_ReflectionPadNd): + r"""Pads the input tensor using the reflection of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + Note that padding size should be less than the corresponding input dimension. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})` where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.ReflectionPad2d(2) + >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input + tensor([[[[0., 1., 2.], + [3., 4., 5.], + [6., 7., 8.]]]]) + >>> m(input) + tensor([[[[8., 7., 6., 7., 8., 7., 6.], + [5., 4., 3., 4., 5., 4., 3.], + [2., 1., 0., 1., 2., 1., 0.], + [5., 4., 3., 4., 5., 4., 3.], + [8., 7., 6., 7., 8., 7., 6.], + [5., 4., 3., 4., 5., 4., 3.], + [2., 1., 0., 1., 2., 1., 0.]]]]) + >>> # using different paddings for different sides + >>> m = nn.ReflectionPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[7., 6., 7., 8., 7.], + [4., 3., 4., 5., 4.], + [1., 0., 1., 2., 1.], + [4., 3., 4., 5., 4.], + [7., 6., 7., 8., 7.]]]]) + """ + + padding: Tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__() + self.padding = _quadruple(padding) + + +class ReflectionPad3d(_ReflectionPadNd): + r"""Pads the input tensor using the reflection of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.ReflectionPad3d(1) + >>> input = torch.arange(8, dtype=torch.float).reshape(1, 1, 2, 2, 2) + >>> m(input) + tensor([[[[[7., 6., 7., 6.], + [5., 4., 5., 4.], + [7., 6., 7., 6.], + [5., 4., 5., 4.]], + [[3., 2., 3., 2.], + [1., 0., 1., 0.], + [3., 2., 3., 2.], + [1., 0., 1., 0.]], + [[7., 6., 7., 6.], + [5., 4., 5., 4.], + [7., 6., 7., 6.], + [5., 4., 5., 4.]], + [[3., 2., 3., 2.], + [1., 0., 1., 0.], + [3., 2., 3., 2.], + [1., 0., 1., 0.]]]]]) + """ + + padding: Tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__() + self.padding = _ntuple(6)(padding) + + +class _ReplicationPadNd(Module): + __constants__ = ['padding'] + padding: Sequence[int] + + def forward(self, input: Tensor) -> Tensor: + return F.pad(input, self.padding, 'replicate') + + def extra_repr(self) -> str: + return f'{self.padding}' + + +class ReplicationPad1d(_ReplicationPadNd): + r"""Pads the input tensor using replication of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. 
+ - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.ReplicationPad1d(2) + >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4) + >>> input + tensor([[[0., 1., 2., 3.], + [4., 5., 6., 7.]]]) + >>> m(input) + tensor([[[0., 0., 0., 1., 2., 3., 3., 3.], + [4., 4., 4., 5., 6., 7., 7., 7.]]]) + >>> # using different paddings for different sides + >>> m = nn.ReplicationPad1d((3, 1)) + >>> m(input) + tensor([[[0., 0., 0., 0., 1., 2., 3., 3.], + [4., 4., 4., 4., 5., 6., 7., 7.]]]) + """ + + padding: Tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__() + self.padding = _pair(padding) + + +class ReplicationPad2d(_ReplicationPadNd): + r"""Pads the input tensor using replication of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ReplicationPad2d(2) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input + tensor([[[[0., 1., 2.], + [3., 4., 5.], + [6., 7., 8.]]]]) + >>> m(input) + tensor([[[[0., 0., 0., 1., 2., 2., 2.], + [0., 0., 0., 1., 2., 2., 2.], + [0., 0., 0., 1., 2., 2., 2.], + [3., 3., 3., 4., 5., 5., 5.], + [6., 6., 6., 7., 8., 8., 8.], + [6., 6., 6., 7., 8., 8., 8.], + [6., 6., 6., 7., 8., 8., 8.]]]]) + >>> # using different paddings for different sides + >>> m = nn.ReplicationPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[0., 0., 1., 2., 2.], + [0., 0., 1., 2., 2.], + [0., 0., 1., 2., 2.], + [3., 3., 4., 5., 5.], + [6., 6., 7., 8., 8.]]]]) + """ + + padding: Tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__() + self.padding = _quadruple(padding) + + +class ReplicationPad3d(_ReplicationPadNd): + r"""Pads the input tensor using replication of the input boundary. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ReplicationPad3d(3) + >>> input = torch.randn(16, 3, 8, 320, 480) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.ReplicationPad3d((3, 3, 6, 6, 1, 1)) + >>> output = m(input) + """ + + padding: Tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__() + self.padding = _ntuple(6)(padding) + + +class ZeroPad1d(ConstantPad1d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in both boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. 
+ - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ZeroPad1d(2) + >>> input = torch.randn(1, 2, 4) + >>> input + tensor([[[-1.0491, -0.7152, -0.0749, 0.8530], + [-1.3287, 1.8966, 0.1466, -0.2771]]]) + >>> m(input) + tensor([[[ 0.0000, 0.0000, -1.0491, -0.7152, -0.0749, 0.8530, 0.0000, + 0.0000], + [ 0.0000, 0.0000, -1.3287, 1.8966, 0.1466, -0.2771, 0.0000, + 0.0000]]]) + >>> m = nn.ZeroPad1d(2) + >>> input = torch.randn(1, 2, 3) + >>> input + tensor([[[ 1.6616, 1.4523, -1.1255], + [-3.6372, 0.1182, -1.8652]]]) + >>> m(input) + tensor([[[ 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000, 0.0000], + [ 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000, 0.0000]]]) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad1d((3, 1)) + >>> m(input) + tensor([[[ 0.0000, 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000], + [ 0.0000, 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000]]]) + """ + + padding: Tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__(padding, 0.) + + def extra_repr(self) -> str: + return f'{self.padding}' + +class ZeroPad2d(ConstantPad2d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. 
+ - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ZeroPad2d(2) + >>> input = torch.randn(1, 1, 3, 3) + >>> input + tensor([[[[-0.1678, -0.4418, 1.9466], + [ 0.9604, -0.4219, -0.5241], + [-0.9162, -0.5436, -0.6446]]]]) + >>> m(input) + tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.1678, -0.4418, 1.9466, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.9604, -0.4219, -0.5241, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.9162, -0.5436, -0.6446, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, -0.1678, -0.4418, 1.9466, 0.0000], + [ 0.0000, 0.9604, -0.4219, -0.5241, 0.0000], + [ 0.0000, -0.9162, -0.5436, -0.6446, 0.0000]]]]) + """ + + padding: Tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__(padding, 0.) + + def extra_repr(self) -> str: + return f'{self.padding}' + +class ZeroPad3d(ConstantPad3d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. 
If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ZeroPad3d(3) + >>> input = torch.randn(16, 3, 10, 20, 30) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad3d((3, 3, 6, 6, 0, 1)) + >>> output = m(input) + """ + + padding: Tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__(padding, 0.) + + def extra_repr(self) -> str: + return f'{self.padding}' diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pixelshuffle.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pixelshuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..6050b7eaea60cf6ef655d1219c5f2869cac11615 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pixelshuffle.py @@ -0,0 +1,113 @@ +from .module import Module +from .. import functional as F + +from torch import Tensor + +__all__ = ['PixelShuffle', 'PixelUnshuffle'] + +class PixelShuffle(Module): + r"""Rearrange elements in a tensor according to an upscaling factor. + + Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` + to a tensor of shape :math:`(*, C, H \times r, W \times r)`, where r is an upscale factor. 
+ + This is useful for implementing efficient sub-pixel convolution + with a stride of :math:`1/r`. + + See the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ + by Shi et. al (2016) for more details. + + Args: + upscale_factor (int): factor to increase spatial resolution by + + Shape: + - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions + - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where + + .. math:: + C_{out} = C_{in} \div \text{upscale\_factor}^2 + + .. math:: + H_{out} = H_{in} \times \text{upscale\_factor} + + .. math:: + W_{out} = W_{in} \times \text{upscale\_factor} + + Examples:: + + >>> pixel_shuffle = nn.PixelShuffle(3) + >>> input = torch.randn(1, 9, 4, 4) + >>> output = pixel_shuffle(input) + >>> print(output.size()) + torch.Size([1, 1, 12, 12]) + + .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network: + https://arxiv.org/abs/1609.05158 + """ + + __constants__ = ['upscale_factor'] + upscale_factor: int + + def __init__(self, upscale_factor: int) -> None: + super().__init__() + self.upscale_factor = upscale_factor + + def forward(self, input: Tensor) -> Tensor: + return F.pixel_shuffle(input, self.upscale_factor) + + def extra_repr(self) -> str: + return f'upscale_factor={self.upscale_factor}' + + +class PixelUnshuffle(Module): + r"""Reverse the PixelShuffle operation. + + Reverses the :class:`~torch.nn.PixelShuffle` operation by rearranging elements + in a tensor of shape :math:`(*, C, H \times r, W \times r)` to a tensor of shape + :math:`(*, C \times r^2, H, W)`, where r is a downscale factor. + + See the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ + by Shi et. al (2016) for more details. 
+ + Args: + downscale_factor (int): factor to decrease spatial resolution by + + Shape: + - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions + - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where + + .. math:: + C_{out} = C_{in} \times \text{downscale\_factor}^2 + + .. math:: + H_{out} = H_{in} \div \text{downscale\_factor} + + .. math:: + W_{out} = W_{in} \div \text{downscale\_factor} + + Examples:: + + >>> pixel_unshuffle = nn.PixelUnshuffle(3) + >>> input = torch.randn(1, 1, 12, 12) + >>> output = pixel_unshuffle(input) + >>> print(output.size()) + torch.Size([1, 9, 4, 4]) + + .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network: + https://arxiv.org/abs/1609.05158 + """ + + __constants__ = ['downscale_factor'] + downscale_factor: int + + def __init__(self, downscale_factor: int) -> None: + super().__init__() + self.downscale_factor = downscale_factor + + def forward(self, input: Tensor) -> Tensor: + return F.pixel_unshuffle(input, self.downscale_factor) + + def extra_repr(self) -> str: + return f'downscale_factor={self.downscale_factor}' diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pooling.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pooling.py new file mode 100644 index 0000000000000000000000000000000000000000..38acd9fb430a0d000b768a1c2ff7635bd7741cf2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/pooling.py @@ -0,0 +1,1306 @@ +from typing import List, Optional + +from torch import Tensor +from .module import Module +from .utils import _single, _pair, _triple +from .. 
import functional as F + +from ..common_types import (_size_any_t, _size_1_t, _size_2_t, _size_3_t, + _ratio_3_t, _ratio_2_t, _size_any_opt_t, _size_2_opt_t, _size_3_opt_t) + +__all__ = ['MaxPool1d', 'MaxPool2d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', + 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'FractionalMaxPool2d', 'FractionalMaxPool3d', 'LPPool1d', + 'LPPool2d', 'LPPool3d', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', + 'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d'] + +class _MaxPoolNd(Module): + __constants__ = ['kernel_size', 'stride', 'padding', 'dilation', + 'return_indices', 'ceil_mode'] + return_indices: bool + ceil_mode: bool + + def __init__(self, kernel_size: _size_any_t, stride: Optional[_size_any_t] = None, + padding: _size_any_t = 0, dilation: _size_any_t = 1, + return_indices: bool = False, ceil_mode: bool = False) -> None: + super().__init__() + self.kernel_size = kernel_size + self.stride = stride if (stride is not None) else kernel_size + self.padding = padding + self.dilation = dilation + self.return_indices = return_indices + self.ceil_mode = ceil_mode + + def extra_repr(self) -> str: + return 'kernel_size={kernel_size}, stride={stride}, padding={padding}' \ + ', dilation={dilation}, ceil_mode={ceil_mode}'.format(**self.__dict__) + + +class MaxPool1d(_MaxPoolNd): + r"""Applies a 1D max pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, L)` + and output :math:`(N, C, L_{out})` can be precisely described as: + + .. math:: + out(N_i, C_j, k) = \max_{m=0, \ldots, \text{kernel\_size} - 1} + input(N_i, C_j, stride \times k + m) + + If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides + for :attr:`padding` number of points. :attr:`dilation` is the stride between the elements within the + sliding window. 
This `link`_ has a nice visualization of the pooling parameters. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + Args: + kernel_size: The size of the sliding window, must be > 0. + stride: The stride of the sliding window, must be > 0. Default value is :attr:`kernel_size`. + padding: Implicit negative infinity padding to be added on both sides, must be >= 0 and <= kernel_size / 2. + dilation: The stride between elements within a sliding window, must be > 0. + return_indices: If ``True``, will return the argmax along with the max values. + Useful for :class:`torch.nn.MaxUnpool1d` later + ceil_mode: If ``True``, will use `ceil` instead of `floor` to compute the output shape. This + ensures that every element in the input tensor is covered by a sliding window. + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + + .. math:: + L_{out} = \left\lfloor \frac{L_{in} + 2 \times \text{padding} - \text{dilation} + \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor + + Examples:: + + >>> # pool of size=3, stride=2 + >>> m = nn.MaxPool1d(3, stride=2) + >>> input = torch.randn(20, 16, 50) + >>> output = m(input) + + .. _link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + + kernel_size: _size_1_t + stride: _size_1_t + padding: _size_1_t + dilation: _size_1_t + + def forward(self, input: Tensor): + return F.max_pool1d(input, self.kernel_size, self.stride, + self.padding, self.dilation, ceil_mode=self.ceil_mode, + return_indices=self.return_indices) + + +class MaxPool2d(_MaxPoolNd): + r"""Applies a 2D max pooling over an input signal composed of several input planes. 
+ + In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`, + output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)` + can be precisely described as: + + .. math:: + \begin{aligned} + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) + \end{aligned} + + If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides + for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. + It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + Args: + kernel_size: the size of the window to take a max over + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: Implicit negative infinity padding to be added on both sides + dilation: a parameter that controls the stride of elements in the window + return_indices: if ``True``, will return the max indices along with the outputs. + Useful for :class:`torch.nn.MaxUnpool2d` later + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})` + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. 
math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding[0]} - \text{dilation[0]} + \times (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding[1]} - \text{dilation[1]} + \times (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor + + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.MaxPool2d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.MaxPool2d((3, 2), stride=(2, 1)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + + .. _link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ + + kernel_size: _size_2_t + stride: _size_2_t + padding: _size_2_t + dilation: _size_2_t + + def forward(self, input: Tensor): + return F.max_pool2d(input, self.kernel_size, self.stride, + self.padding, self.dilation, ceil_mode=self.ceil_mode, + return_indices=self.return_indices) + + +class MaxPool3d(_MaxPoolNd): + r"""Applies a 3D max pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`, + output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)` + can be precisely described as: + + .. math:: + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, kD-1} \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times d + k, + \text{stride[1]} \times h + m, \text{stride[2]} \times w + n) + \end{aligned} + + If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides + for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. + It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. 
+ + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: + + - a single ``int`` -- in which case the same value is used for the depth, height and width dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + Args: + kernel_size: the size of the window to take a max over + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: Implicit negative infinity padding to be added on all three sides + dilation: a parameter that controls the stride of elements in the window + return_indices: if ``True``, will return the max indices along with the outputs. + Useful for :class:`torch.nn.MaxUnpool3d` later + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times + (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times + (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor + + .. 
math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times + (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor + + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.MaxPool3d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.MaxPool3d((3, 2, 2), stride=(2, 1, 2)) + >>> input = torch.randn(20, 16, 50, 44, 31) + >>> output = m(input) + + .. _link: + https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md + """ # noqa: E501 + + kernel_size: _size_3_t + stride: _size_3_t + padding: _size_3_t + dilation: _size_3_t + + def forward(self, input: Tensor): + return F.max_pool3d(input, self.kernel_size, self.stride, + self.padding, self.dilation, ceil_mode=self.ceil_mode, + return_indices=self.return_indices) + + +class _MaxUnpoolNd(Module): + + def extra_repr(self) -> str: + return f'kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}' + + +class MaxUnpool1d(_MaxUnpoolNd): + r"""Computes a partial inverse of :class:`MaxPool1d`. + + :class:`MaxPool1d` is not fully invertible, since the non-maximal values are lost. + + :class:`MaxUnpool1d` takes in as input the output of :class:`MaxPool1d` + including the indices of the maximal values and computes a partial inverse + in which all non-maximal values are set to zero. + + Note: + This operation may behave nondeterministically when the input indices has repeat values. + See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information. + + .. note:: :class:`MaxPool1d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. + To accommodate this, you can provide the needed output size + as an additional argument :attr:`output_size` in the forward call. + See the Inputs and Example below. + + Args: + kernel_size (int or tuple): Size of the max pooling window. + stride (int or tuple): Stride of the max pooling window. 
+ It is set to :attr:`kernel_size` by default. + padding (int or tuple): Padding that was added to the input + + Inputs: + - `input`: the input Tensor to invert + - `indices`: the indices given out by :class:`~torch.nn.MaxPool1d` + - `output_size` (optional): the targeted output size + + Shape: + - Input: :math:`(N, C, H_{in})` or :math:`(C, H_{in})`. + - Output: :math:`(N, C, H_{out})` or :math:`(C, H_{out})`, where + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + + or as given by :attr:`output_size` in the call operator + + Example:: + + >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?") + >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True) + >>> unpool = nn.MaxUnpool1d(2, stride=2) + >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8]]]) + >>> output, indices = pool(input) + >>> unpool(output, indices) + tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8.]]]) + + >>> # Example showcasing the use of output_size + >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8, 9]]]) + >>> output, indices = pool(input) + >>> unpool(output, indices, output_size=input.size()) + tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8., 0.]]]) + + >>> unpool(output, indices) + tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8.]]]) + """ + + kernel_size: _size_1_t + stride: _size_1_t + padding: _size_1_t + + def __init__(self, kernel_size: _size_1_t, stride: Optional[_size_1_t] = None, padding: _size_1_t = 0) -> None: + super().__init__() + self.kernel_size = _single(kernel_size) + self.stride = _single(stride if (stride is not None) else kernel_size) + self.padding = _single(padding) + + def forward(self, input: Tensor, indices: Tensor, output_size: Optional[List[int]] = None) -> Tensor: + return F.max_unpool1d(input, indices, self.kernel_size, self.stride, + self.padding, output_size) + + +class MaxUnpool2d(_MaxUnpoolNd): + r"""Computes a partial inverse of :class:`MaxPool2d`. 
+ + :class:`MaxPool2d` is not fully invertible, since the non-maximal values are lost. + + :class:`MaxUnpool2d` takes in as input the output of :class:`MaxPool2d` + including the indices of the maximal values and computes a partial inverse + in which all non-maximal values are set to zero. + + Note: + This operation may behave nondeterministically when the input indices has repeat values. + See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information. + + .. note:: :class:`MaxPool2d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. + To accommodate this, you can provide the needed output size + as an additional argument :attr:`output_size` in the forward call. + See the Inputs and Example below. + + Args: + kernel_size (int or tuple): Size of the max pooling window. + stride (int or tuple): Stride of the max pooling window. + It is set to :attr:`kernel_size` by default. + padding (int or tuple): Padding that was added to the input + + Inputs: + - `input`: the input Tensor to invert + - `indices`: the indices given out by :class:`~torch.nn.MaxPool2d` + - `output_size` (optional): the targeted output size + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} + + .. 
math:: + W_{out} = (W_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} + + or as given by :attr:`output_size` in the call operator + + Example:: + + >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True) + >>> unpool = nn.MaxUnpool2d(2, stride=2) + >>> input = torch.tensor([[[[ 1., 2., 3., 4.], + [ 5., 6., 7., 8.], + [ 9., 10., 11., 12.], + [13., 14., 15., 16.]]]]) + >>> output, indices = pool(input) + >>> unpool(output, indices) + tensor([[[[ 0., 0., 0., 0.], + [ 0., 6., 0., 8.], + [ 0., 0., 0., 0.], + [ 0., 14., 0., 16.]]]]) + >>> # Now using output_size to resolve an ambiguous size for the inverse + >>> input = torch.torch.tensor([[[[ 1., 2., 3., 4., 5.], + [ 6., 7., 8., 9., 10.], + [11., 12., 13., 14., 15.], + [16., 17., 18., 19., 20.]]]]) + >>> output, indices = pool(input) + >>> # This call will not work without specifying output_size + >>> unpool(output, indices, output_size=input.size()) + tensor([[[[ 0., 0., 0., 0., 0.], + [ 0., 7., 0., 9., 0.], + [ 0., 0., 0., 0., 0.], + [ 0., 17., 0., 19., 0.]]]]) + + + """ + + kernel_size: _size_2_t + stride: _size_2_t + padding: _size_2_t + + def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, padding: _size_2_t = 0) -> None: + super().__init__() + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride if (stride is not None) else kernel_size) + self.padding = _pair(padding) + + def forward(self, input: Tensor, indices: Tensor, output_size: Optional[List[int]] = None) -> Tensor: + return F.max_unpool2d(input, indices, self.kernel_size, self.stride, + self.padding, output_size) + + +class MaxUnpool3d(_MaxUnpoolNd): + r"""Computes a partial inverse of :class:`MaxPool3d`. + + :class:`MaxPool3d` is not fully invertible, since the non-maximal values are lost. 
+ :class:`MaxUnpool3d` takes in as input the output of :class:`MaxPool3d` + including the indices of the maximal values and computes a partial inverse + in which all non-maximal values are set to zero. + + Note: + This operation may behave nondeterministically when the input indices has repeat values. + See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information. + + .. note:: :class:`MaxPool3d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. + To accommodate this, you can provide the needed output size + as an additional argument :attr:`output_size` in the forward call. + See the Inputs section below. + + Args: + kernel_size (int or tuple): Size of the max pooling window. + stride (int or tuple): Stride of the max pooling window. + It is set to :attr:`kernel_size` by default. + padding (int or tuple): Padding that was added to the input + + Inputs: + - `input`: the input Tensor to invert + - `indices`: the indices given out by :class:`~torch.nn.MaxPool3d` + - `output_size` (optional): the targeted output size + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = (D_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} + + .. 
math:: + W_{out} = (W_{in} - 1) \times \text{stride[2]} - 2 \times \text{padding[2]} + \text{kernel\_size[2]} + + or as given by :attr:`output_size` in the call operator + + Example:: + + >>> # pool of square window of size=3, stride=2 + >>> pool = nn.MaxPool3d(3, stride=2, return_indices=True) + >>> unpool = nn.MaxUnpool3d(3, stride=2) + >>> output, indices = pool(torch.randn(20, 16, 51, 33, 15)) + >>> unpooled_output = unpool(output, indices) + >>> unpooled_output.size() + torch.Size([20, 16, 51, 33, 15]) + """ + + kernel_size: _size_3_t + stride: _size_3_t + padding: _size_3_t + + def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, padding: _size_3_t = 0) -> None: + super().__init__() + self.kernel_size = _triple(kernel_size) + self.stride = _triple(stride if (stride is not None) else kernel_size) + self.padding = _triple(padding) + + def forward(self, input: Tensor, indices: Tensor, output_size: Optional[List[int]] = None) -> Tensor: + return F.max_unpool3d(input, indices, self.kernel_size, self.stride, + self.padding, output_size) + + +class _AvgPoolNd(Module): + __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad'] + + def extra_repr(self) -> str: + return f'kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}' + + +class AvgPool1d(_AvgPoolNd): + r"""Applies a 1D average pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, L)`, + output :math:`(N, C, L_{out})` and :attr:`kernel_size` :math:`k` + can be precisely described as: + + .. math:: + + \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k-1} + \text{input}(N_i, C_j, \text{stride} \times l + m) + + If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides + for :attr:`padding` number of points. 
+ + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can each be + an ``int`` or a one-element tuple. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: implicit zero padding to be added on both sides + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad: when True, will include the zero-padding in the averaging calculation + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + + .. math:: + L_{out} = \left\lfloor \frac{L_{in} + + 2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + + Per the note above, if ``ceil_mode`` is True and :math:`(L_{out} - 1) \times \text{stride} \geq L_{in} + + \text{padding}`, we skip the last window as it would start in the right padded region, resulting in + :math:`L_{out}` being reduced by one. 
+ + Examples:: + + >>> # pool with window of size=3, stride=2 + >>> m = nn.AvgPool1d(3, stride=2) + >>> m(torch.tensor([[[1., 2, 3, 4, 5, 6, 7]]])) + tensor([[[2., 4., 6.]]]) + """ + + kernel_size: _size_1_t + stride: _size_1_t + padding: _size_1_t + ceil_mode: bool + count_include_pad: bool + + def __init__(self, kernel_size: _size_1_t, stride: _size_1_t = None, padding: _size_1_t = 0, ceil_mode: bool = False, + count_include_pad: bool = True) -> None: + super().__init__() + self.kernel_size = _single(kernel_size) + self.stride = _single(stride if stride is not None else kernel_size) + self.padding = _single(padding) + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + + def forward(self, input: Tensor) -> Tensor: + return F.avg_pool1d( + input, self.kernel_size, self.stride, self.padding, self.ceil_mode, + self.count_include_pad) + + +class AvgPool2d(_AvgPoolNd): + r"""Applies a 2D average pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`, + output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)` + can be precisely described as: + + .. math:: + + out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + + If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides + for :attr:`padding` number of points. + + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. 
+ + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: implicit zero padding to be added on both sides + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad: when True, will include the zero-padding in the averaging calculation + divisor_override: if specified, it will be used as divisor, otherwise size of the pooling region will be used. + + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - + \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - + \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + Per the note above, if ``ceil_mode`` is True and :math:`(H_{out} - 1)\times \text{stride}[0]\geq H_{in} + + \text{padding}[0]`, we skip the last window as it would start in the bottom padded region, + resulting in :math:`H_{out}` being reduced by one. + + The same applies for :math:`W_{out}`. 
+ + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.AvgPool2d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.AvgPool2d((3, 2), stride=(2, 1)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + """ + + __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad', 'divisor_override'] + + kernel_size: _size_2_t + stride: _size_2_t + padding: _size_2_t + ceil_mode: bool + count_include_pad: bool + + def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, padding: _size_2_t = 0, + ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None: + super().__init__() + self.kernel_size = kernel_size + self.stride = stride if (stride is not None) else kernel_size + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.divisor_override = divisor_override + + def forward(self, input: Tensor) -> Tensor: + return F.avg_pool2d(input, self.kernel_size, self.stride, + self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override) + + +class AvgPool3d(_AvgPoolNd): + r"""Applies a 3D average pooling over an input signal composed of several input planes. + + In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`, + output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)` + can be precisely described as: + + .. math:: + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} \\ + & \frac{\text{input}(N_i, C_j, \text{stride}[0] \times d + k, + \text{stride}[1] \times h + m, \text{stride}[2] \times w + n)} + {kD \times kH \times kW} + \end{aligned} + + If :attr:`padding` is non-zero, then the input is implicitly zero-padded on all three sides + for :attr:`padding` number of points. 
+ + Note: + When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding + or the input. Sliding windows that would start in the right padded region are ignored. + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the depth, height and width dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + padding: implicit zero padding to be added on all three sides + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad: when True, will include the zero-padding in the averaging calculation + divisor_override: if specified, it will be used as divisor, otherwise :attr:`kernel_size` will be used + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - + \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - + \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - + \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor + + Per the note above, if ``ceil_mode`` is True and :math:`(D_{out} - 1)\times \text{stride}[0]\geq D_{in} + + \text{padding}[0]`, we skip the last window as it would start in the padded region, + resulting in :math:`D_{out}` being reduced by one. + + The same applies for :math:`W_{out}` and :math:`H_{out}`. 
+ + Examples:: + + >>> # pool of square window of size=3, stride=2 + >>> m = nn.AvgPool3d(3, stride=2) + >>> # pool of non-square window + >>> m = nn.AvgPool3d((3, 2, 2), stride=(2, 1, 2)) + >>> input = torch.randn(20, 16, 50, 44, 31) + >>> output = m(input) + """ + + __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad', 'divisor_override'] + + kernel_size: _size_3_t + stride: _size_3_t + padding: _size_3_t + ceil_mode: bool + count_include_pad: bool + + def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, padding: _size_3_t = 0, + ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None: + super().__init__() + self.kernel_size = kernel_size + self.stride = stride if (stride is not None) else kernel_size + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.divisor_override = divisor_override + + def forward(self, input: Tensor) -> Tensor: + return F.avg_pool3d(input, self.kernel_size, self.stride, + self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override) + + def __setstate__(self, d): + super().__setstate__(d) + self.__dict__.setdefault('padding', 0) + self.__dict__.setdefault('ceil_mode', False) + self.__dict__.setdefault('count_include_pad', True) + + +class FractionalMaxPool2d(Module): + r"""Applies a 2D fractional max pooling over an input signal composed of several input planes. + + Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham + + The max-pooling operation is applied in :math:`kH \times kW` regions by a stochastic + step size determined by the target output size. + The number of output features is equal to the number of input planes. + + .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined. + + Args: + kernel_size: the size of the window to take a max over. 
+ Can be a single number k (for a square kernel of k x k) or a tuple `(kh, kw)` + output_size: the target output size of the image of the form `oH x oW`. + Can be a tuple `(oH, oW)` or a single number oH for a square image `oH x oH`. + Note that we must have :math:`kH + oH - 1 <= H_{in}` and :math:`kW + oW - 1 <= W_{in}` + output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. + This has to be a number or tuple in the range (0, 1). + Note that we must have :math:`kH + (output\_ratio\_H * H_{in}) - 1 <= H_{in}` + and :math:`kW + (output\_ratio\_W * W_{in}) - 1 <= W_{in}` + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to :meth:`nn.MaxUnpool2d`. Default: ``False`` + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + :math:`(H_{out}, W_{out})=\text{output\_size}` or + :math:`(H_{out}, W_{out})=\text{output\_ratio} \times (H_{in}, W_{in})`. + + Examples: + >>> # pool of square window of size=3, and target output size 13x12 + >>> m = nn.FractionalMaxPool2d(3, output_size=(13, 12)) + >>> # pool of square window and target output size being half of input image size + >>> m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + + .. 
_Fractional MaxPooling: + https://arxiv.org/abs/1412.6071 + """ + + __constants__ = ['kernel_size', 'return_indices', 'output_size', + 'output_ratio'] + + kernel_size: _size_2_t + return_indices: bool + output_size: _size_2_t + output_ratio: _ratio_2_t + + def __init__(self, kernel_size: _size_2_t, output_size: Optional[_size_2_t] = None, + output_ratio: Optional[_ratio_2_t] = None, + return_indices: bool = False, _random_samples=None) -> None: + super().__init__() + self.kernel_size = _pair(kernel_size) + self.return_indices = return_indices + self.register_buffer('_random_samples', _random_samples) + self.output_size = _pair(output_size) if output_size is not None else None + self.output_ratio = _pair(output_ratio) if output_ratio is not None else None + if output_size is None and output_ratio is None: + raise ValueError("FractionalMaxPool2d requires specifying either " + "an output size, or a pooling ratio") + if output_size is not None and output_ratio is not None: + raise ValueError("only one of output_size and output_ratio may be specified") + if self.output_ratio is not None: + if not (0 < self.output_ratio[0] < 1 and 0 < self.output_ratio[1] < 1): + raise ValueError(f"output_ratio must be between 0 and 1 (got {output_ratio})") + + def forward(self, input: Tensor): + return F.fractional_max_pool2d( + input, self.kernel_size, self.output_size, self.output_ratio, + self.return_indices, + _random_samples=self._random_samples) + + +class FractionalMaxPool3d(Module): + r"""Applies a 3D fractional max pooling over an input signal composed of several input planes. + + Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham + + The max-pooling operation is applied in :math:`kT \times kH \times kW` regions by a stochastic + step size determined by the target output size. + The number of output features is equal to the number of input planes. + + .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined. 
+ + Args: + kernel_size: the size of the window to take a max over. + Can be a single number k (for a square kernel of k x k x k) or a tuple `(kt x kh x kw)` + output_size: the target output size of the image of the form `oT x oH x oW`. + Can be a tuple `(oT, oH, oW)` or a single number oH for a square image `oH x oH x oH` + output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. + This has to be a number or tuple in the range (0, 1) + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to :meth:`nn.MaxUnpool3d`. Default: ``False`` + + Shape: + - Input: :math:`(N, C, T_{in}, H_{in}, W_{in})` or :math:`(C, T_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, T_{out}, H_{out}, W_{out})` or :math:`(C, T_{out}, H_{out}, W_{out})`, where + :math:`(T_{out}, H_{out}, W_{out})=\text{output\_size}` or + :math:`(T_{out}, H_{out}, W_{out})=\text{output\_ratio} \times (T_{in}, H_{in}, W_{in})` + + Examples: + >>> # pool of cubic window of size=3, and target output size 13x12x11 + >>> m = nn.FractionalMaxPool3d(3, output_size=(13, 12, 11)) + >>> # pool of cubic window and target output size being half of input size + >>> m = nn.FractionalMaxPool3d(3, output_ratio=(0.5, 0.5, 0.5)) + >>> input = torch.randn(20, 16, 50, 32, 16) + >>> output = m(input) + + .. 
_Fractional MaxPooling: + https://arxiv.org/abs/1412.6071 + """ + + __constants__ = ['kernel_size', 'return_indices', 'output_size', + 'output_ratio'] + kernel_size: _size_3_t + return_indices: bool + output_size: _size_3_t + output_ratio: _ratio_3_t + + def __init__(self, kernel_size: _size_3_t, output_size: Optional[_size_3_t] = None, + output_ratio: Optional[_ratio_3_t] = None, + return_indices: bool = False, _random_samples=None) -> None: + super().__init__() + self.kernel_size = _triple(kernel_size) + self.return_indices = return_indices + self.register_buffer('_random_samples', _random_samples) + self.output_size = _triple(output_size) if output_size is not None else None + self.output_ratio = _triple(output_ratio) if output_ratio is not None else None + if output_size is None and output_ratio is None: + raise ValueError("FractionalMaxPool3d requires specifying either " + "an output size, or a pooling ratio") + if output_size is not None and output_ratio is not None: + raise ValueError("only one of output_size and output_ratio may be specified") + if self.output_ratio is not None: + if not (0 < self.output_ratio[0] < 1 and 0 < self.output_ratio[1] < 1 and 0 < self.output_ratio[2] < 1): + raise ValueError(f"output_ratio must be between 0 and 1 (got {output_ratio})") + + def forward(self, input: Tensor): + return F.fractional_max_pool3d( + input, self.kernel_size, self.output_size, self.output_ratio, + self.return_indices, + _random_samples=self._random_samples) + + +class _LPPoolNd(Module): + __constants__ = ['norm_type', 'kernel_size', 'stride', 'ceil_mode'] + + norm_type: float + ceil_mode: bool + + def __init__(self, norm_type: float, kernel_size: _size_any_t, stride: Optional[_size_any_t] = None, + ceil_mode: bool = False) -> None: + super().__init__() + self.norm_type = norm_type + self.kernel_size = kernel_size + self.stride = stride + self.ceil_mode = ceil_mode + + def extra_repr(self) -> str: + return 'norm_type={norm_type}, kernel_size={kernel_size}, 
stride={stride}, ' \ + 'ceil_mode={ceil_mode}'.format(**self.__dict__) + + +class LPPool1d(_LPPoolNd): + r"""Applies a 1D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to Average Pooling) + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: a single int, the size of the window + stride: a single int, the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + + .. math:: + L_{out} = \left\lfloor\frac{L_{in} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + + Examples:: + >>> # power-2 pool of window of length 3, with stride 2. + >>> m = nn.LPPool1d(2, 3, stride=2) + >>> input = torch.randn(20, 16, 50) + >>> output = m(input) + """ + + kernel_size: _size_1_t + stride: _size_1_t + + def forward(self, input: Tensor) -> Tensor: + return F.lp_pool1d(input, float(self.norm_type), self.kernel_size, + self.stride, self.ceil_mode) + + +class LPPool2d(_LPPoolNd): + r"""Applies a 2D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. 
math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + Examples:: + + >>> # power-2 pool of square window of size=3, stride=2 + >>> m = nn.LPPool2d(2, 3, stride=2) + >>> # pool of non-square window of power 1.2 + >>> m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1)) + >>> input = torch.randn(20, 16, 50, 32) + >>> output = m(input) + + """ + + kernel_size: _size_2_t + stride: _size_2_t + + def forward(self, input: Tensor) -> Tensor: + return F.lp_pool2d(input, float(self.norm_type), self.kernel_size, + self.stride, self.ceil_mode) + + +class LPPool3d(_LPPoolNd): + r"""Applies a 3D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. 
math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the height, width and depth dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + .. 
math:: + W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor + + Examples:: + + >>> # power-2 pool of square window of size=3, stride=2 + >>> m = nn.LPPool3d(2, 3, stride=2) + >>> # pool of non-square window of power 1.2 + >>> m = nn.LPPool3d(1.2, (3, 2, 2), stride=(2, 1, 2)) + >>> input = torch.randn(20, 16, 50, 44, 31) + >>> output = m(input) + + """ + + kernel_size: _size_3_t + stride: _size_3_t + + def forward(self, input: Tensor) -> Tensor: + return F.lp_pool3d(input, float(self.norm_type), self.kernel_size, + self.stride, self.ceil_mode) + + +class _AdaptiveMaxPoolNd(Module): + __constants__ = ['output_size', 'return_indices'] + return_indices: bool + + def __init__(self, output_size: _size_any_opt_t, return_indices: bool = False) -> None: + super().__init__() + self.output_size = output_size + self.return_indices = return_indices + + def extra_repr(self) -> str: + return f'output_size={self.output_size}' + +# FIXME (by @ssnl): Improve adaptive pooling docs: specify what the input and +# output shapes are, and how the operation computes output. + + +class AdaptiveMaxPool1d(_AdaptiveMaxPoolNd): + r"""Applies a 1D adaptive max pooling over an input signal composed of several input planes. + + The output size is :math:`L_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size :math:`L_{out}`. + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool1d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + :math:`L_{out}=\text{output\_size}`. 
+ + Examples: + >>> # target output size of 5 + >>> m = nn.AdaptiveMaxPool1d(5) + >>> input = torch.randn(1, 64, 8) + >>> output = m(input) + + """ + + output_size: _size_1_t + + def forward(self, input: Tensor): + return F.adaptive_max_pool1d(input, self.output_size, self.return_indices) + + +class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd): + r"""Applies a 2D adaptive max pooling over an input signal composed of several input planes. + + The output is of size :math:`H_{out} \times W_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form :math:`H_{out} \times W_{out}`. + Can be a tuple :math:`(H_{out}, W_{out})` or a single :math:`H_{out}` for a + square image :math:`H_{out} \times H_{out}`. :math:`H_{out}` and :math:`W_{out}` + can be either a ``int``, or ``None`` which means the size will be the same as that + of the input. + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool2d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + :math:`(H_{out}, W_{out})=\text{output\_size}`. 
+ + Examples: + >>> # target output size of 5x7 + >>> m = nn.AdaptiveMaxPool2d((5, 7)) + >>> input = torch.randn(1, 64, 8, 9) + >>> output = m(input) + >>> # target output size of 7x7 (square) + >>> m = nn.AdaptiveMaxPool2d(7) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + >>> # target output size of 10x7 + >>> m = nn.AdaptiveMaxPool2d((None, 7)) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + + """ + + output_size: _size_2_opt_t + + def forward(self, input: Tensor): + return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) + + +class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd): + r"""Applies a 3D adaptive max pooling over an input signal composed of several input planes. + + The output is of size :math:`D_{out} \times H_{out} \times W_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form :math:`D_{out} \times H_{out} \times W_{out}`. + Can be a tuple :math:`(D_{out}, H_{out}, W_{out})` or a single + :math:`D_{out}` for a cube :math:`D_{out} \times D_{out} \times D_{out}`. + :math:`D_{out}`, :math:`H_{out}` and :math:`W_{out}` can be either a + ``int``, or ``None`` which means the size will be the same as that of the input. + + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool3d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where :math:`(D_{out}, H_{out}, W_{out})=\text{output\_size}`. 
+ + Examples: + >>> # target output size of 5x7x9 + >>> m = nn.AdaptiveMaxPool3d((5, 7, 9)) + >>> input = torch.randn(1, 64, 8, 9, 10) + >>> output = m(input) + >>> # target output size of 7x7x7 (cube) + >>> m = nn.AdaptiveMaxPool3d(7) + >>> input = torch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + >>> # target output size of 7x9x8 + >>> m = nn.AdaptiveMaxPool3d((7, None, None)) + >>> input = torch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + + """ + + output_size: _size_3_opt_t + + def forward(self, input: Tensor): + return F.adaptive_max_pool3d(input, self.output_size, self.return_indices) + + +class _AdaptiveAvgPoolNd(Module): + __constants__ = ['output_size'] + + def __init__(self, output_size: _size_any_opt_t) -> None: + super().__init__() + self.output_size = output_size + + def extra_repr(self) -> str: + return f'output_size={self.output_size}' + + +class AdaptiveAvgPool1d(_AdaptiveAvgPoolNd): + r"""Applies a 1D adaptive average pooling over an input signal composed of several input planes. + + The output size is :math:`L_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size :math:`L_{out}`. + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + :math:`L_{out}=\text{output\_size}`. + + Examples: + >>> # target output size of 5 + >>> m = nn.AdaptiveAvgPool1d(5) + >>> input = torch.randn(1, 64, 8) + >>> output = m(input) + + """ + + output_size: _size_1_t + + def forward(self, input: Tensor) -> Tensor: + return F.adaptive_avg_pool1d(input, self.output_size) + + +class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd): + r"""Applies a 2D adaptive average pooling over an input signal composed of several input planes. + + The output is of size H x W, for any input size. + The number of output features is equal to the number of input planes. 
+ + Args: + output_size: the target output size of the image of the form H x W. + Can be a tuple (H, W) or a single H for a square image H x H. + H and W can be either a ``int``, or ``None`` which means the size will + be the same as that of the input. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, S_{0}, S_{1})` or :math:`(C, S_{0}, S_{1})`, where + :math:`S=\text{output\_size}`. + + Examples: + >>> # target output size of 5x7 + >>> m = nn.AdaptiveAvgPool2d((5, 7)) + >>> input = torch.randn(1, 64, 8, 9) + >>> output = m(input) + >>> # target output size of 7x7 (square) + >>> m = nn.AdaptiveAvgPool2d(7) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + >>> # target output size of 10x7 + >>> m = nn.AdaptiveAvgPool2d((None, 7)) + >>> input = torch.randn(1, 64, 10, 9) + >>> output = m(input) + + """ + + output_size: _size_2_opt_t + + def forward(self, input: Tensor) -> Tensor: + return F.adaptive_avg_pool2d(input, self.output_size) + + +class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): + r"""Applies a 3D adaptive average pooling over an input signal composed of several input planes. + + The output is of size D x H x W, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the form D x H x W. + Can be a tuple (D, H, W) or a single number D for a cube D x D x D. + D, H and W can be either a ``int``, or ``None`` which means the size will + be the same as that of the input. + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, S_{0}, S_{1}, S_{2})` or :math:`(C, S_{0}, S_{1}, S_{2})`, + where :math:`S=\text{output\_size}`. 
+ + Examples: + >>> # target output size of 5x7x9 + >>> m = nn.AdaptiveAvgPool3d((5, 7, 9)) + >>> input = torch.randn(1, 64, 8, 9, 10) + >>> output = m(input) + >>> # target output size of 7x7x7 (cube) + >>> m = nn.AdaptiveAvgPool3d(7) + >>> input = torch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + >>> # target output size of 7x9x8 + >>> m = nn.AdaptiveAvgPool3d((7, None, None)) + >>> input = torch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + + """ + + output_size: _size_3_opt_t + + def forward(self, input: Tensor) -> Tensor: + return F.adaptive_avg_pool3d(input, self.output_size) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/transformer.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ec5203ee7cfafeaf9526422724436763376be917 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/modules/transformer.py @@ -0,0 +1,975 @@ +import copy +from typing import Optional, Any, Union, Callable + +import torch +import warnings +from torch import Tensor +from .. import functional as F +from .module import Module +from .activation import MultiheadAttention +from .container import ModuleList +from ..init import xavier_uniform_ +from .dropout import Dropout +from .linear import Linear +from .normalization import LayerNorm + +__all__ = ['Transformer', 'TransformerEncoder', 'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer'] + +def _generate_square_subsequent_mask( + sz: int, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, +) -> Tensor: + r"""Generate a square causal mask for the sequence. + + The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0). 
+ """ + if device is None: + device = torch.device('cpu') + if dtype is None: + dtype = torch.float32 + return torch.triu( + torch.full((sz, sz), float('-inf'), dtype=dtype, device=device), + diagonal=1, + ) + + +def _get_seq_len( + src: Tensor, + batch_first: bool +) -> Optional[int]: + + if src.is_nested: + return None + else: + src_size = src.size() + if len(src_size) == 2: + # unbatched: S, E + return src_size[0] + else: + # batched: B, S, E if batch_first else S, B, E + seq_len_pos = 1 if batch_first else 0 + return src_size[seq_len_pos] + + +class Transformer(Module): + r"""A transformer model. + + User is able to modify the attributes as needed. The architecture + is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. + + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of encoder/decoder intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). 
+ norm_first: if ``True``, encoder and decoder layers will perform LayerNorms before + other attention and feedforward operations, otherwise after. Default: ``False`` (after). + bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive + bias. Default: ``True``. + + Examples:: + >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12) + >>> src = torch.rand((10, 32, 512)) + >>> tgt = torch.rand((20, 32, 512)) + >>> out = transformer_model(src, tgt) + + Note: A full example to apply nn.Transformer module for the word language model is available in + https://github.com/pytorch/examples/tree/master/word_language_model + """ + + def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6, + num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + custom_encoder: Optional[Any] = None, custom_decoder: Optional[Any] = None, + layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False, + bias: bool = True, device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}") + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, layer_norm_eps, batch_first, norm_first, + bias, **factory_kwargs) + encoder_norm = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, layer_norm_eps, batch_first, norm_first, + bias, **factory_kwargs) + decoder_norm = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, 
**factory_kwargs) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + self.batch_first = batch_first + + def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, + src_is_causal: Optional[bool] = None, tgt_is_causal: Optional[bool] = None, + memory_is_causal: bool = False) -> Tensor: + r"""Take in and process masked source/target sequences. + + .. note:: + + If a boolean tensor is provided for any of the [src/tgt/memory]_mask arguments, positions with a ``True`` value are + not allowed to participate in the attention, + which is the opposite of the definition for :attr:`attn_mask` + in :func:`torch.nn.functional.scaled_dot_product_attention`. + + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + src_mask: the additive mask for the src sequence (optional). + tgt_mask: the additive mask for the tgt sequence (optional). + memory_mask: the additive mask for the encoder output (optional). + src_key_padding_mask: the Tensor mask for src keys per batch (optional). + tgt_key_padding_mask: the Tensor mask for tgt keys per batch (optional). + memory_key_padding_mask: the Tensor mask for memory keys per batch (optional). + src_is_causal: If specified, applies a causal mask as ``src_mask``. + Default: ``None``; try to detect a causal mask. + Warning: + ``src_is_causal`` provides a hint that ``src_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + tgt_is_causal: If specified, applies a causal mask as ``tgt_mask``. + Default: ``None``; try to detect a causal mask. 
+ Warning: + ``tgt_is_causal`` provides a hint that ``tgt_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + memory_is_causal: If specified, applies a causal mask as + ``memory_mask``. + Default: ``False``. + Warning: + ``memory_is_causal`` provides a hint that + ``memory_mask`` is the causal mask. Providing incorrect + hints can result in incorrect execution, including + forward and backward compatibility. + + Shape: + - src: :math:`(S, E)` for unbatched input, :math:`(S, N, E)` if `batch_first=False` or + `(N, S, E)` if `batch_first=True`. + - tgt: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or + `(N, T, E)` if `batch_first=True`. + - src_mask: :math:`(S, S)` or :math:`(N\cdot\text{num\_heads}, S, S)`. + - tgt_mask: :math:`(T, T)` or :math:`(N\cdot\text{num\_heads}, T, T)`. + - memory_mask: :math:`(T, S)`. + - src_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`. + - tgt_key_padding_mask: :math:`(T)` for unbatched input otherwise :math:`(N, T)`. + - memory_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`. + + Note: [src/tgt/memory]_mask ensures that position :math:`i` is allowed to attend the unmasked + positions. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by + the attention. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + + - output: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or + `(N, T, E)` if `batch_first=True`. 
+ + Note: Due to the multi-head attention architecture in the transformer model, + the output sequence length of a transformer is same as the input sequence + (i.e. target) length of the decoder. + + where :math:`S` is the source sequence length, :math:`T` is the target sequence length, :math:`N` is the + batch size, :math:`E` is the feature number + + Examples: + >>> # xdoctest: +SKIP + >>> output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask) + """ + is_batched = src.dim() == 3 + if not self.batch_first and src.size(1) != tgt.size(1) and is_batched: + raise RuntimeError("the batch number of src and tgt must be equal") + elif self.batch_first and src.size(0) != tgt.size(0) and is_batched: + raise RuntimeError("the batch number of src and tgt must be equal") + + if src.size(-1) != self.d_model or tgt.size(-1) != self.d_model: + raise RuntimeError("the feature number of src and tgt must be equal to d_model") + + memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask, + is_causal=src_is_causal) + output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + tgt_is_causal=tgt_is_causal, memory_is_causal=memory_is_causal) + return output + + @staticmethod + def generate_square_subsequent_mask( + sz: int, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> Tensor: + r"""Generate a square causal mask for the sequence. + + The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0). + """ + return _generate_square_subsequent_mask(sz, dtype=dtype, device=device) + + def _reset_parameters(self): + r"""Initiate parameters in the transformer model.""" + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + +class TransformerEncoder(Module): + r"""TransformerEncoder is a stack of N encoder layers. 
+ + Users can build the BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters. + + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + enable_nested_tensor: if True, input will automatically convert to nested tensor + (and convert back on output). This will improve the overall performance of + TransformerEncoder when padding rate is high. Default: ``True`` (enabled). + + Examples:: + >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + + __constants__ = ['norm'] + + def __init__( + self, + encoder_layer: "TransformerEncoderLayer", + num_layers: int, + norm: Optional[Module] = None, + enable_nested_tensor: bool = True, + mask_check: bool = True + ) -> None: + super().__init__() + torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}") + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + # this attribute saves the value providedat object construction + self.enable_nested_tensor = enable_nested_tensor + # this attribute controls whether nested tensors are used + self.use_nested_tensor = enable_nested_tensor + self.mask_check = mask_check + + enc_layer = "encoder_layer" + why_not_sparsity_fast_path = '' + if not isinstance(encoder_layer, torch.nn.TransformerEncoderLayer): + why_not_sparsity_fast_path = f"{enc_layer} was not TransformerEncoderLayer" + elif encoder_layer.norm_first : + why_not_sparsity_fast_path = f"{enc_layer}.norm_first was True" + elif not encoder_layer.self_attn.batch_first: + why_not_sparsity_fast_path = (f"{enc_layer}.self_attn.batch_first was not True" + + "(use batch_first for better inference performance)") + elif not 
encoder_layer.self_attn._qkv_same_embed_dim: + why_not_sparsity_fast_path = f"{enc_layer}.self_attn._qkv_same_embed_dim was not True" + elif encoder_layer.self_attn.in_proj_bias is None: + why_not_sparsity_fast_path = f"{enc_layer}.self_attn was passed bias=False" + elif not encoder_layer.activation_relu_or_gelu: + why_not_sparsity_fast_path = f"{enc_layer}.activation_relu_or_gelu was not True" + elif not (encoder_layer.norm1.eps == encoder_layer.norm2.eps) : + why_not_sparsity_fast_path = f"{enc_layer}.norm1.eps was not equal to {enc_layer}.norm2.eps" + elif encoder_layer.self_attn.num_heads % 2 == 1: + why_not_sparsity_fast_path = f"{enc_layer}.self_attn.num_heads is odd" + + if enable_nested_tensor and why_not_sparsity_fast_path: + warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}") + self.use_nested_tensor = False + + + def forward( + self, + src: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + is_causal: Optional[bool] = None) -> Tensor: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + is_causal: If specified, applies a causal mask as ``mask``. + Default: ``None``; try to detect a causal mask. + Warning: + ``is_causal`` provides a hint that ``mask`` is the + causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. 
+ """ + src_key_padding_mask = F._canonical_mask( + mask=src_key_padding_mask, + mask_name="src_key_padding_mask", + other_type=F._none_or_dtype(mask), + other_name="mask", + target_type=src.dtype + ) + + mask = F._canonical_mask( + mask=mask, + mask_name="mask", + other_type=None, + other_name="", + target_type=src.dtype, + check_other=False, + ) + + output = src + convert_to_nested = False + first_layer = self.layers[0] + src_key_padding_mask_for_layers = src_key_padding_mask + why_not_sparsity_fast_path = '' + str_first_layer = "self.layers[0]" + batch_first = first_layer.self_attn.batch_first + is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled() + + if not is_fastpath_enabled: + why_not_sparsity_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True" + elif not hasattr(self, "use_nested_tensor"): + why_not_sparsity_fast_path = "use_nested_tensor attribute not present" + elif not self.use_nested_tensor: + why_not_sparsity_fast_path = "self.use_nested_tensor (set in init) was not True" + elif first_layer.training: + why_not_sparsity_fast_path = f"{str_first_layer} was in training mode" + elif not src.dim() == 3: + why_not_sparsity_fast_path = f"input not batched; expected src.dim() of 3 but got {src.dim()}" + elif src_key_padding_mask is None: + why_not_sparsity_fast_path = "src_key_padding_mask was None" + elif (((not hasattr(self, "mask_check")) or self.mask_check) + and not torch._nested_tensor_from_mask_left_aligned(src, src_key_padding_mask.logical_not())): + why_not_sparsity_fast_path = "mask_check enabled, and src and src_key_padding_mask was not left aligned" + elif output.is_nested: + why_not_sparsity_fast_path = "NestedTensor input is not supported" + elif mask is not None: + why_not_sparsity_fast_path = "src_key_padding_mask and mask were both supplied" + elif torch.is_autocast_enabled(): + why_not_sparsity_fast_path = "autocast is enabled" + + if not why_not_sparsity_fast_path: + tensor_args = ( + src, + 
first_layer.self_attn.in_proj_weight, + first_layer.self_attn.in_proj_bias, + first_layer.self_attn.out_proj.weight, + first_layer.self_attn.out_proj.bias, + first_layer.norm1.weight, + first_layer.norm1.bias, + first_layer.norm2.weight, + first_layer.norm2.bias, + first_layer.linear1.weight, + first_layer.linear1.bias, + first_layer.linear2.weight, + first_layer.linear2.bias, + ) + _supported_device_type = ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name] + if torch.overrides.has_torch_function(tensor_args): + why_not_sparsity_fast_path = "some Tensor argument has_torch_function" + elif src.device.type not in _supported_device_type: + why_not_sparsity_fast_path = f"src device is neither one of {_supported_device_type}" + elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args): + why_not_sparsity_fast_path = ("grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad") + + if (not why_not_sparsity_fast_path) and (src_key_padding_mask is not None): + convert_to_nested = True + output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False) + src_key_padding_mask_for_layers = None + + seq_len = _get_seq_len(src, batch_first) + is_causal = _detect_is_causal_mask(mask, is_causal, seq_len) + + for mod in self.layers: + output = mod(output, src_mask=mask, is_causal=is_causal, src_key_padding_mask=src_key_padding_mask_for_layers) + + if convert_to_nested: + output = output.to_padded_tensor(0., src.size()) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(Module): + r"""TransformerDecoder is a stack of N decoder layers. + + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). 
+ + Examples:: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = transformer_decoder(tgt, memory) + """ + + __constants__ = ['norm'] + + def __init__( + self, + decoder_layer: "TransformerDecoderLayer", + num_layers: int, + norm: Optional[Module] = None + ) -> None: + super().__init__() + torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}") + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, tgt_is_causal: Optional[bool] = None, + memory_is_causal: bool = False) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer in turn. + + Args: + tgt: the sequence to the decoder (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + tgt_is_causal: If specified, applies a causal mask as ``tgt mask``. + Default: ``None``; try to detect a causal mask. + Warning: + ``tgt_is_causal`` provides a hint that ``tgt_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + memory_is_causal: If specified, applies a causal mask as + ``memory mask``. + Default: ``False``. + Warning: + ``memory_is_causal`` provides a hint that + ``memory_mask`` is the causal mask. 
Providing incorrect + hints can result in incorrect execution, including + forward and backward compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. + """ + output = tgt + + seq_len = _get_seq_len(tgt, self.layers[0].self_attn.batch_first) + tgt_is_causal = _detect_is_causal_mask(tgt_mask, tgt_is_causal, seq_len) + + for mod in self.layers: + output = mod(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + tgt_is_causal=tgt_is_causal, + memory_is_causal=memory_is_causal) + + if self.norm is not None: + output = self.norm(output) + + return output + +class TransformerEncoderLayer(Module): + r"""TransformerEncoderLayer is made up of self-attn and feedforward network. + + This standard encoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + TransformerEncoderLayer can handle either traditional torch.tensor inputs, + or Nested Tensor inputs. Derived classes are expected to similarly accept + both input formats. (Not all combinations of inputs are currently + supported by TransformerEncoderLayer while Nested Tensor is in prototype + state.) + + If you are implementing a custom layer, you may derive it either from + the Module or TransformerEncoderLayer class. If your custom layer + supports both torch.Tensors and Nested Tensors inputs, make its + implementation a derived class of TransformerEncoderLayer. If your custom + Layer supports only torch.Tensor inputs, derive its implementation from + Module. + + Args: + d_model: the number of expected features in the input (required). 
+ nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of the intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + norm_first: if ``True``, layer norm is done prior to attention and feedforward + operations, respectively. Otherwise it's done after. Default: ``False`` (after). + bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive + bias. Default: ``True``. + + Examples:: + >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> out = encoder_layer(src) + + Alternatively, when ``batch_first`` is ``True``: + >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True) + >>> src = torch.rand(32, 10, 512) + >>> out = encoder_layer(src) + + Fast path: + forward() will use a special optimized implementation described in + `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_ if all of the following + conditions are met: + + - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor + argument ``requires_grad`` + - training is disabled (using ``.eval()``) + - batch_first is ``True`` and the input is batched (i.e., ``src.dim() == 3``) + - activation is one of: ``"relu"``, ``"gelu"``, ``torch.functional.relu``, or ``torch.functional.gelu`` + - at most one of ``src_mask`` and ``src_key_padding_mask`` is passed + - if src is a `NestedTensor `_, neither ``src_mask`` + nor ``src_key_padding_mask`` is passed + - the two ``LayerNorm`` instances have a 
consistent ``eps`` value (this will naturally be the case + unless the caller has manually modified one without modifying the other) + + If the optimized implementation is in use, a + `NestedTensor `_ can be + passed for ``src`` to represent padding more efficiently than using a padding + mask. In this case, a `NestedTensor `_ will be + returned, and an additional speedup proportional to the fraction of the input that + is padding can be expected. + + .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`: + https://arxiv.org/abs/2205.14135 + + """ + + __constants__ = ['norm_first'] + + def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False, + bias: bool = True, device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, + bias=bias, batch_first=batch_first, + **factory_kwargs) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs) + + self.norm_first = norm_first + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + + # Legacy string support for activation function. + if isinstance(activation, str): + activation = _get_activation_fn(activation) + + # We can't test self.activation in forward() in TorchScript, + # so stash some information about it instead. 
+ if activation is F.relu or isinstance(activation, torch.nn.ReLU): + self.activation_relu_or_gelu = 1 + elif activation is F.gelu or isinstance(activation, torch.nn.GELU): + self.activation_relu_or_gelu = 2 + else: + self.activation_relu_or_gelu = 0 + self.activation = activation + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, 'activation'): + self.activation = F.relu + + + def forward( + self, + src: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + is_causal: bool = False) -> Tensor: + r"""Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + is_causal: If specified, applies a causal mask as ``src mask``. + Default: ``False``. + Warning: + ``is_causal`` provides a hint that ``src_mask`` is the + causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. + """ + src_key_padding_mask = F._canonical_mask( + mask=src_key_padding_mask, + mask_name="src_key_padding_mask", + other_type=F._none_or_dtype(src_mask), + other_name="src_mask", + target_type=src.dtype + ) + + src_mask = F._canonical_mask( + mask=src_mask, + mask_name="src_mask", + other_type=None, + other_name="", + target_type=src.dtype, + check_other=False, + ) + + is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled() + + # see Fig. 
1 of https://arxiv.org/pdf/2002.04745v1.pdf + why_not_sparsity_fast_path = '' + if not is_fastpath_enabled: + why_not_sparsity_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True" + elif not src.dim() == 3: + why_not_sparsity_fast_path = f"input not batched; expected src.dim() of 3 but got {src.dim()}" + elif self.training: + why_not_sparsity_fast_path = "training is enabled" + elif not self.self_attn.batch_first: + why_not_sparsity_fast_path = "self_attn.batch_first was not True" + elif self.self_attn.in_proj_bias is None: + why_not_sparsity_fast_path = "self_attn was passed bias=False" + elif not self.self_attn._qkv_same_embed_dim: + why_not_sparsity_fast_path = "self_attn._qkv_same_embed_dim was not True" + elif not self.activation_relu_or_gelu: + why_not_sparsity_fast_path = "activation_relu_or_gelu was not True" + elif not (self.norm1.eps == self.norm2.eps): + why_not_sparsity_fast_path = "norm1.eps is not equal to norm2.eps" + elif src.is_nested and (src_key_padding_mask is not None or src_mask is not None): + why_not_sparsity_fast_path = "neither src_key_padding_mask nor src_mask are not supported with NestedTensor input" + elif self.self_attn.num_heads % 2 == 1: + why_not_sparsity_fast_path = "num_head is odd" + elif torch.is_autocast_enabled(): + why_not_sparsity_fast_path = "autocast is enabled" + if not why_not_sparsity_fast_path: + tensor_args = ( + src, + self.self_attn.in_proj_weight, + self.self_attn.in_proj_bias, + self.self_attn.out_proj.weight, + self.self_attn.out_proj.bias, + self.norm1.weight, + self.norm1.bias, + self.norm2.weight, + self.norm2.bias, + self.linear1.weight, + self.linear1.bias, + self.linear2.weight, + self.linear2.bias, + ) + + # We have to use list comprehensions below because TorchScript does not support + # generator expressions. 
+ _supported_device_type = ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name] + if torch.overrides.has_torch_function(tensor_args): + why_not_sparsity_fast_path = "some Tensor argument has_torch_function" + elif not all((x.device.type in _supported_device_type) for x in tensor_args): + why_not_sparsity_fast_path = ("some Tensor argument's device is neither one of " + f"{_supported_device_type}") + elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args): + why_not_sparsity_fast_path = ("grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad") + + if not why_not_sparsity_fast_path: + merged_mask, mask_type = self.self_attn.merge_masks(src_mask, src_key_padding_mask, src) + return torch._transformer_encoder_layer_fwd( + src, + self.self_attn.embed_dim, + self.self_attn.num_heads, + self.self_attn.in_proj_weight, + self.self_attn.in_proj_bias, + self.self_attn.out_proj.weight, + self.self_attn.out_proj.bias, + self.activation_relu_or_gelu == 2, + self.norm_first, + self.norm1.eps, + self.norm1.weight, + self.norm1.bias, + self.norm2.weight, + self.norm2.bias, + self.linear1.weight, + self.linear1.bias, + self.linear2.weight, + self.linear2.bias, + merged_mask, + mask_type, + ) + + + x = src + if self.norm_first: + x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask, is_causal=is_causal) + x = x + self._ff_block(self.norm2(x)) + else: + x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal)) + x = self.norm2(x + self._ff_block(x)) + + return x + + # self-attention block + def _sa_block(self, x: Tensor, + attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor: + x = self.self_attn(x, x, x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, is_causal=is_causal)[0] + return self.dropout1(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> 
Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + +class TransformerDecoderLayer(Module): + r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. + + This standard decoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of the intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + norm_first: if ``True``, layer norm is done prior to self attention, multihead + attention and feedforward operations, respectively. Otherwise it's done after. + Default: ``False`` (after). + bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive + bias. Default: ``True``. 
+ + Examples:: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = decoder_layer(tgt, memory) + + Alternatively, when ``batch_first`` is ``True``: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True) + >>> memory = torch.rand(32, 10, 512) + >>> tgt = torch.rand(32, 20, 512) + >>> out = decoder_layer(tgt, memory) + """ + + __constants__ = ['norm_first'] + + def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False, + bias: bool = True, device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, + bias=bias, **factory_kwargs) + self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, + bias=bias, **factory_kwargs) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs) + + self.norm_first = norm_first + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + self.dropout3 = Dropout(dropout) + + # Legacy string support for activation function. 
+ if isinstance(activation, str): + self.activation = _get_activation_fn(activation) + else: + self.activation = activation + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super().__setstate__(state) + + def forward( + self, + tgt: Tensor, + memory: Tensor, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + tgt_is_causal: bool = False, + memory_is_causal: bool = False, + ) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer. + + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + tgt_is_causal: If specified, applies a causal mask as ``tgt mask``. + Default: ``False``. + Warning: + ``tgt_is_causal`` provides a hint that ``tgt_mask`` is + the causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + memory_is_causal: If specified, applies a causal mask as + ``memory mask``. + Default: ``False``. + Warning: + ``memory_is_causal`` provides a hint that + ``memory_mask`` is the causal mask. Providing incorrect + hints can result in incorrect execution, including + forward and backward compatibility. + + Shape: + see the docs in :class:`~torch.nn.Transformer`. + """ + # see Fig. 
1 of https://arxiv.org/pdf/2002.04745v1.pdf + + x = tgt + if self.norm_first: + x = x + self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask, tgt_is_causal) + x = x + self._mha_block(self.norm2(x), memory, memory_mask, memory_key_padding_mask, memory_is_causal) + x = x + self._ff_block(self.norm3(x)) + else: + x = self.norm1(x + self._sa_block(x, tgt_mask, tgt_key_padding_mask, tgt_is_causal)) + x = self.norm2(x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask, memory_is_causal)) + x = self.norm3(x + self._ff_block(x)) + + return x + + # self-attention block + def _sa_block(self, x: Tensor, + attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor: + x = self.self_attn(x, x, x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + is_causal=is_causal, + need_weights=False)[0] + return self.dropout1(x) + + # multihead attention block + def _mha_block(self, x: Tensor, mem: Tensor, + attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor: + x = self.multihead_attn(x, mem, mem, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + is_causal=is_causal, + need_weights=False)[0] + return self.dropout2(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout3(x) + + +def _get_clones(module, N): + # FIXME: copy.deepcopy() is not defined on nn.module + return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]: + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}") + + +def _detect_is_causal_mask( + mask: Optional[Tensor], + is_causal: Optional[bool] = None, + size: Optional[int] = None, +) -> bool: + """Return whether the given attention mask is causal. 
+ + Warning: + If ``is_causal`` is not ``None``, its value will be returned as is. If a + user supplies an incorrect ``is_causal`` hint, + + ``is_causal=False`` when the mask is in fact a causal attention.mask + may lead to reduced performance relative to what would be achievable + with ``is_causal=True``; + ``is_causal=True`` when the mask is in fact not a causal attention.mask + may lead to incorrect and unpredictable execution - in some scenarios, + a causal mask may be applied based on the hint, in other execution + scenarios the specified mask may be used. The choice may not appear + to be deterministic, in that a number of factors like alignment, + hardware SKU, etc influence the decision whether to use a mask or + rely on the hint. + ``size`` if not None, check whether the mask is a causal mask of the provided size + Otherwise, checks for any causal mask. + """ + # Prevent type refinement + make_causal = (is_causal is True) + + if is_causal is None and mask is not None: + sz = size if size is not None else mask.size(-2) + causal_comparison = _generate_square_subsequent_mask( + sz, device=mask.device, dtype=mask.dtype) + + # Do not use `torch.equal` so we handle batched masks by + # broadcasting the comparison. 
+ if mask.size() == causal_comparison.size(): + make_causal = bool((mask == causal_comparison).all()) + else: + make_causal = False + + return make_causal diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c810060942a9d4eb550567f167bd64312d1aa541 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/data_parallel.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/data_parallel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee45e4723cde1a409e413c237a1d353c7563039e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/data_parallel.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/scatter_gather.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/scatter_gather.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6f7dabe401e4d875948e588a788b365705da8c0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/__pycache__/scatter_gather.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py new file mode 100644 index 
0000000000000000000000000000000000000000..e968a99cf85ddd09f4d6cf1bc2ad76e69f1b552b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py @@ -0,0 +1,2350 @@ +import copy +import functools +import inspect +import itertools +import logging +import os +import sys +import warnings +import weakref +from collections import defaultdict, deque +from contextlib import contextmanager +from dataclasses import dataclass, fields, is_dataclass +from enum import auto, Enum +from typing import Any, Callable, List, Optional, Tuple, Type + +import torch +import torch.distributed as dist +from torch.autograd import Function, Variable +from torch.distributed.algorithms.join import Join, Joinable, JoinHook +from torch.utils._pytree import tree_flatten, tree_unflatten +from torch.utils.hooks import RemovableHandle + +RPC_AVAILABLE = False +if dist.is_available(): + from torch.distributed.distributed_c10d import ( + _get_default_group, + _rank_not_in_group, + ReduceOp, + ) + from torch.distributed.utils import ( + _alloc_storage, + _cast_forward_inputs, + _free_storage, + _sync_module_states, + _to_kwargs, + _verify_param_shape_across_processes, + ) +if torch.distributed.rpc.is_available(): + RPC_AVAILABLE = True + from torch.distributed.rpc import RRef + +from torch._utils import _get_device_index + +from ..modules import Module +from .scatter_gather import gather, scatter_kwargs # noqa: F401 + +__all__ = ["DistributedDataParallel"] + +logger = logging.getLogger(__name__) + + +@dataclass +class _MixedPrecision: + """ + This configures DDP-native mixed precision training. + + Attributes: + param_dtype (torch.dtype): This specifies the dtype for model + parameters, inputs (when ``cast_forward_inputs`` is set to + ``True``), and therefore the dtype for computation. + However, outside the forward and backward passes, parameters are in + full precision. Model checkpointing always happens in full + precision. 
+ reduce_dtype (torch.dtype): This specifies the dtype for gradient + reduction, which is permitted to differ from ``param_dtype``. + buffer_dtype (torch.dtype): This specifies the dtype for buffers. + + .. note:: This API is experimental and subject to change. + + .. note:: Only floating point tensors are cast to their specified dtypes. + + .. note:: ``state_dict`` checkpoints parameters and buffers in full + precision. + + .. note:: Each low precision dtype must be specified explicitly. For + example, ``_MixedPrecision(reduce_dtype=torch.float16)`` only specifies + the reduction dtype to be low precision, and DDP will not cast + parameters or buffers. + + .. note:: If a ``reduce_dtype`` is not specified, then gradient reduction + happens in ``param_dtype`` if specified or the original parameter dtype + otherwise. For example, ``_MixedPrecision(param_dtype=torch.float16)`` + would result in communication occurring in fp16. + """ + + param_dtype: Optional[torch.dtype] = None + reduce_dtype: Optional[torch.dtype] = None + buffer_dtype: Optional[torch.dtype] = None + # TODO (rohan-varma): keep_low_precision_grads: bool = False + # TODO (rohan-varma): APIs to allow users to run batchnorm and layernorm + # in full precision. For DDP, this can be implemented by not performing the + # parameter cast for BN and LN units. + + +def _cast_buffers(mixed_precision_config, root_module): + """Casts buffers to the given ``buffer_dtype``.""" + for buf in root_module.buffers(): + if hasattr(buf, "_ddp_ignored") and buf._ddp_ignored: + continue + + buf.data = buf.to(dtype=mixed_precision_config.buffer_dtype) + + +def _setup_mixed_precision_params(mixed_precision_config, root_module): + """Create and free storage for the mixed precision parameters.""" + for param in root_module.parameters(): + # Do not setup mixed precision for DDP ignored parameters. 
+ if hasattr(param, "_ddp_ignored") and param._ddp_ignored: + continue + + if not hasattr(param, "_mp_param"): + param._mp_param = torch.zeros_like( + param, + device=param.device, + dtype=mixed_precision_config.param_dtype, + requires_grad=param.requires_grad, + ) + _free_storage(param._mp_param) + # _fp_param will point to the full precision param so it can be switched + # back to at the end of forward / backward. + param._fp_param = param.data + + +def _tree_flatten_with_rref(output): + output_is_rref = RPC_AVAILABLE and isinstance(output, RRef) + if output_is_rref: + output_tensor_list, treespec = tree_flatten(output.local_value()) + else: + output_tensor_list, treespec = tree_flatten(output) + # Need to return flattened tensors, spec to re-pack them, as well + # as if the return type was actually an RRef to reconstruct. + return output_tensor_list, treespec, output_is_rref + + +def _tree_unflatten_with_rref(output, treespec, output_is_rref): + output = tree_unflatten(output, treespec) + if output_is_rref: + output = RRef(output) + return output + + +def _find_tensors(obj): + r"""Recursively find all tensors contained in the specified object.""" + if RPC_AVAILABLE and isinstance(obj, RRef): + # If the current node is the owner of the RRef, unwrap it and try to + # find Tensors. + # TODO: Expand to remote RRefs. 
+ if obj.is_owner(): + return _find_tensors(obj.local_value()) + if isinstance(obj, torch.Tensor): + return [obj] + if isinstance(obj, (list, tuple)): + return itertools.chain.from_iterable(map(_find_tensors, obj)) + if isinstance(obj, dict): + return itertools.chain.from_iterable(map(_find_tensors, obj.values())) + if is_dataclass(obj): + return itertools.chain.from_iterable( + map(_find_tensors, (getattr(obj, f.name) for f in fields(obj))) + ) + + return [] + + +def _dump_DDP_relevant_env_vars(): + relevant_env_vars = [ + "RANK", + "LOCAL_RANK", + "WORLD_SIZE", + "MASTER_PORT", + "MASTER_ADDR", + "CUDA_VISIBLE_DEVICES", + "GLOO_SOCKET_IFNAME", + "GLOO_DEVICE_TRANSPORT", + "NCCL_SOCKET_IFNAME", + "TORCH_NCCL_BLOCKING_WAIT", + "NCCL_DEBUG", + "NCCL_DEBUG_SUBSYS", + "NCCL_IB_DISABLE", + # More NCCL env vars: + "NCCL_P2P_DISABLE", + "NCCL_P2P_LEVEL", + "NCCL_SHM_DISABLE", + "NCCL_SOCKET_NTHREADS", + "NCCL_NSOCKS_PERTHREAD", + "NCCL_BUFFSIZE", + "NCCL_NTHREADS", + "NCCL_RINGS", + "NCCL_MAX_NCHANNELS", + "NCCL_MIN_NCHANNELS", + "NCCL_CHECKS_DISABLE", + "NCCL_CHECK_POINTERS", + "NCCL_LAUNCH_MODE", + "NCCL_IB_HCA", + "NCCL_IB_TIMEOUT", + "NCCL_IB_RETRY_CNT", + "NCCL_IB_GID_INDEX", + "NCCL_IB_SL", + "NCCL_IB_TC", + "NCCL_IB_AR_THRESHOLD", + "NCCL_IB_CUDA_SUPPORT", + "NCCL_NET_GDR_LEVEL", + "NCCL_NET_GDR_READ", + "NCCL_SINGLE_RING_THRESHOLD", + "NCCL_LL_THRESHOLD", + "NCCL_TREE_THRESHOLD", + "NCCL_ALGO", + "NCCL_PROTO", + "NCCL_IGNORE_CPU_AFFINITY", + "NCCL_DEBUG_FILE", + "NCCL_COLLNET_ENABLE", + "NCCL_TOPO_FILE", + "NCCL_TOPO_DUMP_FILE", + "TORCH_NCCL_ASYNC_ERROR_HANDLING", + ] + formatted_output = "" + for var in relevant_env_vars: + value = os.environ[var] if var in os.environ else "N/A" + formatted_output += f"env:{var}={value}\n" + print(formatted_output) + + +class _BufferCommHookLocation(Enum): + PRE_FORWARD = auto() + POST_FORWARD = auto() + + +@dataclass +class _BufferCommHook: + buffer_comm_hook: Callable + buffer_comm_hook_state: Any + buffer_comm_hook_location: 
_BufferCommHookLocation + + +# Add a DDPSink to run various functions when backwards starts, such as +# queueing call back of out-most backward/graph task, +# this helps call back is fired after all gradients' calculation +# is completed. +class _DDPSink(Function): + @staticmethod + def forward(ctx, ddp_weakref, *inputs): + # set_materialize_grads(False) will ensure that None gradients stay as + # None and are not filled with zeros. + ctx.set_materialize_grads(False) + ctx.ddp_weakref = ddp_weakref + ret = tuple( + inp.clone() if isinstance(inp, torch.Tensor) else inp for inp in inputs + ) + return ret + + @staticmethod + def backward(ctx, *grad_outputs): + # Enqueue delay allreduce for static graph training on the first + # iteration. + ddp_weakref = ctx.ddp_weakref() + reducer = ddp_weakref.reducer + static_graph = ddp_weakref.static_graph + delay_ar_enqueued = ( + static_graph and ddp_weakref._static_graph_delay_allreduce_enqueued + ) + if static_graph and not delay_ar_enqueued: + Variable._execution_engine.queue_callback( # type: ignore[call-arg,misc] + reducer._delay_all_reduce + ) + ddp_weakref._static_graph_delay_allreduce_enqueued = True + + return (None, *grad_outputs) + + +class _DDPJoinHook(JoinHook): + def __init__(self, ddp, divide_by_initial_world_size): + """Set config variables for internal usage.""" + assert isinstance(ddp, DistributedDataParallel), ( + "DDP join hook requires passing in a DistributedDataParallel " + "instance as the state" + ) + assert ddp.logger is not None + ddp.logger._set_uneven_input_join() + self.ddp = ddp + self.ddp._divide_by_initial_world_size = divide_by_initial_world_size + super().__init__() + + def main_hook(self): + """Shadow the DDP collective communication operations in the forward and backward passes.""" + ddp = self.ddp + # Buckets are rebuilt only once during a training period + ddp.reducer._rebuild_buckets() + + # Schedule a broadcast if we are syncing module buffers in the + # forward pass + # TODO: make DDP 
uneven inputs context manager support buffer + # comm hook (https://github.com/pytorch/pytorch/issues/65436) + ddp._check_and_sync_module_buffers() + + # Check if need to sync in the backward pass + should_sync_backwards = ddp._check_global_requires_backward_grad_sync( + is_joined_rank=True + ) + # Forward parameter sync is disabled in the next iteration if we + # are skipping gradient sync this iteration, so set + # `require_forward_param_sync` accordingly + ddp.require_forward_param_sync = should_sync_backwards + if not should_sync_backwards: + return + + # Schedule one allreduce per gradient bucket to match the backward + # pass allreduce + ddp._match_all_reduce_for_bwd_pass() + + # Check if we need to allreduce locally unused parameters + if ddp.find_unused_parameters: + ddp._match_unused_params_allreduce() + + # Rebuilt parameters are pushed only once during a training period + ddp.reducer._push_all_rebuilt_params() + + def post_hook(self, is_last_joiner: bool): + """Sync the final model to ensure that the model is the same across all processes.""" + self.ddp._sync_final_model(is_last_joiner) + + +class DistributedDataParallel(Module, Joinable): + r"""Implement distributed data parallelism based on ``torch.distributed`` at module level. + + This container provides data parallelism by synchronizing gradients + across each model replica. The devices to synchronize across are + specified by the input ``process_group``, which is the entire world + by default. Note that ``DistributedDataParallel`` does not chunk or + otherwise shard the input across participating GPUs; the user is + responsible for defining how to do so, for example through the use + of a :class:`DistributedSampler`. + + See also: :ref:`distributed-basics` and :ref:`cuda-nn-ddp-instead`. + The same constraints on input as in :class:`torch.nn.DataParallel` apply. 
+ + Creation of this class requires that ``torch.distributed`` to be already + initialized, by calling :func:`torch.distributed.init_process_group`. + + ``DistributedDataParallel`` is proven to be significantly faster than + :class:`torch.nn.DataParallel` for single-node multi-GPU data + parallel training. + + To use ``DistributedDataParallel`` on a host with N GPUs, you should spawn + up ``N`` processes, ensuring that each process exclusively works on a single + GPU from 0 to N-1. This can be done by either setting + ``CUDA_VISIBLE_DEVICES`` for every process or by calling: + + >>> # xdoctest: +SKIP("undefined variables") + >>> torch.cuda.set_device(i) + + where i is from 0 to N-1. In each process, you should refer the following + to construct this module: + + >>> # xdoctest: +SKIP("undefined variables") + >>> torch.distributed.init_process_group( + >>> backend='nccl', world_size=N, init_method='...' + >>> ) + >>> model = DistributedDataParallel(model, device_ids=[i], output_device=i) + + In order to spawn up multiple processes per node, you can use either + ``torch.distributed.launch`` or ``torch.multiprocessing.spawn``. + + .. note:: + Please refer to `PyTorch Distributed Overview `__ + for a brief introduction to all features related to distributed training. + + .. note:: + ``DistributedDataParallel`` can be used in conjunction with + :class:`torch.distributed.optim.ZeroRedundancyOptimizer` to reduce + per-rank optimizer states memory footprint. Please refer to + `ZeroRedundancyOptimizer recipe `__ + for more details. + + .. note:: ``nccl`` backend is currently the fastest and highly recommended + backend when using GPUs. This applies to both single-node and + multi-node distributed training. + + .. note:: This module also supports mixed-precision distributed training. 
+ This means that your model can have different types of parameters such + as mixed types of ``fp16`` and ``fp32``, the gradient reduction on these + mixed types of parameters will just work fine. + + .. note:: If you use ``torch.save`` on one process to checkpoint the module, + and ``torch.load`` on some other processes to recover it, make sure that + ``map_location`` is configured properly for every process. Without + ``map_location``, ``torch.load`` would recover the module to devices + where the module was saved from. + + .. note:: When a model is trained on ``M`` nodes with ``batch=N``, the + gradient will be ``M`` times smaller when compared to the same model + trained on a single node with ``batch=M*N`` if the loss is summed (NOT + averaged as usual) across instances in a batch (because the gradients + between different nodes are averaged). You should take this into + consideration when you want to obtain a mathematically equivalent + training process compared to the local training counterpart. But in most + cases, you can just treat a DistributedDataParallel wrapped model, a + DataParallel wrapped model and an ordinary model on a single GPU as the + same (E.g. using the same learning rate for equivalent batch size). + + .. note:: + Parameters are never broadcast between processes. The module performs + an all-reduce step on gradients and assumes that they will be modified + by the optimizer in all processes in the same way. Buffers + (e.g. BatchNorm stats) are broadcast from the module in process of rank + 0, to all other replicas in the system in every iteration. + + .. note:: + If you are using DistributedDataParallel in conjunction with the + :ref:`distributed-rpc-framework`, you should always use + :meth:`torch.distributed.autograd.backward` to compute gradients and + :class:`torch.distributed.optim.DistributedOptimizer` for optimizing + parameters. 
+ + Example:: + + >>> # xdoctest: +SKIP("undefined variables") + >>> import torch.distributed.autograd as dist_autograd + >>> from torch.nn.parallel import DistributedDataParallel as DDP + >>> import torch + >>> from torch import optim + >>> from torch.distributed.optim import DistributedOptimizer + >>> import torch.distributed.rpc as rpc + >>> from torch.distributed.rpc import RRef + >>> + >>> t1 = torch.rand((3, 3), requires_grad=True) + >>> t2 = torch.rand((3, 3), requires_grad=True) + >>> rref = rpc.remote("worker1", torch.add, args=(t1, t2)) + >>> ddp_model = DDP(my_model) + >>> + >>> # Setup optimizer + >>> optimizer_params = [rref] + >>> for param in ddp_model.parameters(): + >>> optimizer_params.append(RRef(param)) + >>> + >>> dist_optim = DistributedOptimizer( + >>> optim.SGD, + >>> optimizer_params, + >>> lr=0.05, + >>> ) + >>> + >>> with dist_autograd.context() as context_id: + >>> pred = ddp_model(rref.to_here()) + >>> loss = loss_func(pred, target) + >>> dist_autograd.backward(context_id, [loss]) + >>> dist_optim.step(context_id) + + .. note:: + DistributedDataParallel currently offers limited support for gradient + checkpointing with :meth:`torch.utils.checkpoint`. + If the checkpoint is done with use_reentrant=False (recommended), DDP + will work as expected without any limitations. + If, however, the checkpoint is done with use_reentrant=True (the default), + DDP will work as expected when there are no unused parameters in the model + and each layer is checkpointed at most once (make sure you are not passing + `find_unused_parameters=True` to DDP). We currently do not support the + case where a layer is checkpointed multiple times, or when there unused + parameters in the checkpointed model. + + .. note:: + To let a non-DDP model load a state dict from a DDP model, + :meth:`~torch.nn.modules.utils.consume_prefix_in_state_dict_if_present` + needs to be applied to strip the prefix "module." in the DDP state dict before loading. + + .. 
warning:: + Constructor, forward method, and differentiation of the output (or a + function of the output of this module) are distributed synchronization + points. Take that into account in case different processes might be + executing different code. + + .. warning:: + This module assumes all parameters are registered in the model by the + time it is created. No parameters should be added nor removed later. + Same applies to buffers. + + .. warning:: + This module assumes all parameters are registered in the model of each + distributed processes are in the same order. The module itself will + conduct gradient ``allreduce`` following the reverse order of the + registered parameters of the model. In other words, it is users' + responsibility to ensure that each distributed process has the exact + same model and thus the exact same parameter registration order. + + .. warning:: + This module allows parameters with non-rowmajor-contiguous strides. + For example, your model may contain some parameters whose + :class:`torch.memory_format` is ``torch.contiguous_format`` + and others whose format is ``torch.channels_last``. However, + corresponding parameters in different processes must have the + same strides. + + .. warning:: + This module doesn't work with :func:`torch.autograd.grad` (i.e. it will + only work if gradients are to be accumulated in ``.grad`` attributes of + parameters). + + .. warning:: + If you plan on using this module with a ``nccl`` backend or a ``gloo`` + backend (that uses Infiniband), together with a DataLoader that uses + multiple workers, please change the multiprocessing start method to + ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately + Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will + likely experience deadlocks if you don't change this setting. + + .. warning:: + You should never try to change your model's parameters after wrapping + up your model with ``DistributedDataParallel``. 
Because, when + wrapping up your model with ``DistributedDataParallel``, the constructor + of ``DistributedDataParallel`` will register the additional gradient + reduction functions on all the parameters of the model itself at the + time of construction. If you change the model's parameters afterwards, + gradient reduction functions no longer match the correct set of + parameters. + + .. warning:: + Using ``DistributedDataParallel`` in conjunction with the + :ref:`distributed-rpc-framework` is experimental and subject to change. + + Args: + module (Module): module to be parallelized + device_ids (list of int or torch.device): CUDA devices. + 1) For single-device modules, ``device_ids`` can + contain exactly one device id, which represents the only + CUDA device where the input module corresponding to this process resides. + Alternatively, ``device_ids`` can also be ``None``. + 2) For multi-device modules and CPU modules, + ``device_ids`` must be ``None``. + + When ``device_ids`` is ``None`` for both cases, + both the input data for the forward pass and the actual module + must be placed on the correct device. + (default: ``None``) + output_device (int or torch.device): Device location of output for + single-device CUDA modules. For multi-device modules and + CPU modules, it must be ``None``, and the module itself + dictates the output location. (default: ``device_ids[0]`` + for single-device modules) + broadcast_buffers (bool): Flag that enables syncing (broadcasting) + buffers of the module at beginning of the ``forward`` + function. (default: ``True``) + process_group: The process group to be used for distributed data + all-reduction. If ``None``, the default process group, which + is created by :func:`torch.distributed.init_process_group`, + will be used. (default: ``None``) + bucket_cap_mb: ``DistributedDataParallel`` will bucket parameters into + multiple buckets so that gradient reduction of each + bucket can potentially overlap with backward computation. 
+ :attr:`bucket_cap_mb` controls the bucket size in + MegaBytes (MB). (default: 25) + find_unused_parameters (bool): Traverse the autograd graph from all + tensors contained in the return value of the + wrapped module's ``forward`` function. Parameters + that don't receive gradients as part of this + graph are preemptively marked as being ready to + be reduced. In addition, parameters that may have + been used in the wrapped module's ``forward`` + function but were not part of loss computation and + thus would also not receive gradients are + preemptively marked as ready to be reduced. + (default: ``False``) + check_reduction: This argument is deprecated. + gradient_as_bucket_view (bool): When set to ``True``, gradients will be views + pointing to different offsets of ``allreduce`` communication + buckets. This can reduce peak memory usage, where the + saved memory size will be equal to the total gradients + size. Moreover, it avoids the overhead of copying between + gradients and ``allreduce`` communication buckets. When + gradients are views, ``detach_()`` cannot be called on the + gradients. If hitting such errors, please fix it by + referring to the :meth:`~torch.optim.Optimizer.zero_grad` + function in ``torch/optim/optimizer.py`` as a solution. + Note that gradients will be views after first iteration, so + the peak memory saving should be checked after first iteration. + static_graph (bool): When set to ``True``, DDP knows the trained graph is + static. Static graph means 1) The set of used and unused + parameters will not change during the whole training loop; in + this case, it does not matter whether users set + ``find_unused_parameters = True`` or not. 2) How the graph is trained + will not change during the whole training loop (meaning there is + no control flow depending on iterations). + When static_graph is set to be ``True``, DDP will support cases that + can not be supported in the past: + 1) Reentrant backwards. 
+ 2) Activation checkpointing multiple times. + 3) Activation checkpointing when model has unused parameters. + 4) There are model parameters that are outside of forward function. + 5) Potentially improve performance when there are unused parameters, + as DDP will not search graph in each iteration to detect unused + parameters when static_graph is set to be ``True``. + To check whether you can set static_graph to be ``True``, one way is to + check ddp logging data at the end of your previous model training, + if ``ddp_logging_data.get("can_set_static_graph") == True``, mostly you + can set ``static_graph = True`` as well. + + Example:: + >>> # xdoctest: +SKIP("undefined variables") + >>> model_DDP = torch.nn.parallel.DistributedDataParallel(model) + >>> # Training loop + >>> ... + >>> ddp_logging_data = model_DDP._get_ddp_logging_data() + >>> static_graph = ddp_logging_data.get("can_set_static_graph") + delay_all_reduce_named_params (list of tuple of str and torch.nn.Parameter): a list + of named parameters whose all reduce will be delayed when the gradient of + the parameter specified in ``param_to_hook_all_reduce`` is ready. Other + arguments of DDP do not apply to named params specified in this argument + as these named params will be ignored by DDP reducer. + param_to_hook_all_reduce (torch.nn.Parameter): a parameter to hook delayed all reduce + of parameters specified in ``delay_all_reduce_named_params``. + + + Attributes: + module (Module): the module to be parallelized. 
+ + Example:: + + >>> # xdoctest: +SKIP("undefined variables") + >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...') + >>> net = torch.nn.parallel.DistributedDataParallel(model) + """ + + # used to track whether the given thread is inside ddp forward for torchdynamo purposes + _active_ddp_module: Optional["DistributedDataParallel"] = None + + def __init__( + self, + module, + device_ids=None, + output_device=None, + dim=0, + broadcast_buffers=True, + process_group=None, + bucket_cap_mb=25, + find_unused_parameters=False, + check_reduction=False, + gradient_as_bucket_view=False, + static_graph=False, + delay_all_reduce_named_params=None, + param_to_hook_all_reduce=None, + mixed_precision: Optional[_MixedPrecision] = None, + device_mesh=None, + ): + super().__init__() + Joinable.__init__(self) + self.logger = None + if bool(delay_all_reduce_named_params is not None) != bool( + param_to_hook_all_reduce is not None + ): + self._log_and_throw( + ValueError, + "delay_all_reduce_named_params and param_to_hook_all_reduce " + "need to be set at the same time.", + ) + + self._delay_all_reduce_params = [] + if hasattr(module, "_ddp_params_and_buffers_to_ignore"): + self.parameters_to_ignore = set(module._ddp_params_and_buffers_to_ignore) + else: + self.parameters_to_ignore = set() + if delay_all_reduce_named_params is not None: + for name, param in delay_all_reduce_named_params: + self.parameters_to_ignore.add(name) + self._delay_all_reduce_params.append(param) + + self._module_parameters = [ + p + for n, p in module.named_parameters() + if n not in self.parameters_to_ignore + ] + if not any(p.requires_grad for p in self._module_parameters): + if len(self._delay_all_reduce_params): + logger.info("Delay the AllReduce of all parameters.") + else: + self._log_and_throw( + RuntimeError, + "DistributedDataParallel is not needed when a module " + "doesn't have any parameter that requires a gradient.", + ) + + if device_ids is not None and 
len(device_ids) > 1: + self._log_and_throw( + ValueError, + "device_ids can only be None or contain a single element.", + ) + + self.is_multi_device_module = ( + len({p.device for p in self._module_parameters}) > 1 + ) + distinct_device_types = { + p.device.type for p in self._module_parameters if p.device is not None + } + if len(distinct_device_types) != 1: + self._log_and_throw( + ValueError, + "DistributedDataParallel's input module must be on " + f"the same type of devices, but input module parameters locate in {distinct_device_types}.", + ) + + self.device_type = next(iter(distinct_device_types)) + + if ( + device_ids is None + or len(device_ids) == 0 # For backward compatibility. + or self.device_type == "cpu" + or self.is_multi_device_module + ): + if device_ids or output_device: + self._log_and_throw( + ValueError, + "DistributedDataParallel device_ids and output_device arguments " + "only work with single-device/multiple-device GPU modules or CPU modules, " + "but got device_ids {}, output_device {}, and module parameters {}.".format( + device_ids, + output_device, + {p.device for p in self._module_parameters}, + ), + ) + + self.device_ids = None + self.output_device = None + else: + self.device_ids = [_get_device_index(x, True) for x in device_ids] + + if output_device is None: + output_device = device_ids[0] + + self.output_device = _get_device_index(output_device, True) + + if process_group and device_mesh is not None: + raise RuntimeError( + "Cannot specify both process_group and device_mesh arguments." + ) + elif process_group is None and device_mesh is None: + self.process_group = _get_default_group() + elif device_mesh is None: + self.process_group = process_group + else: + if device_mesh.ndim != 1: + raise RuntimeError( + f"Only 1D device mesh is supported, but got {device_mesh}." 
+ ) + self.device_mesh = device_mesh + self.process_group = device_mesh.get_group(mesh_dim=0) + + self.static_graph = False + self.dim = dim + self.module = module + self.device = next(iter(self._module_parameters)).device + self.broadcast_buffers = broadcast_buffers + self.find_unused_parameters = find_unused_parameters + self.require_backward_grad_sync = True + self.require_forward_param_sync = True + self.gradient_as_bucket_view = gradient_as_bucket_view + self.mixed_precision = mixed_precision + if self.mixed_precision is not None: + logger.warning("Received mixed precision config %s", self.mixed_precision) + + if check_reduction: + # This argument is no longer used since the reducer + # will ensure reduction completes even if some parameters + # do not receive gradients. + warnings.warn( + "The `check_reduction` argument in `DistributedDataParallel` " + "module is deprecated. Please avoid using it." + ) + + # Check that a module does not have Uninitialized parameters + for param in self._module_parameters: + if isinstance(param, torch.nn.parameter.UninitializedParameter): + self._log_and_throw( + RuntimeError, + "Modules with uninitialized parameters can't be used with `DistributedDataParallel`. 
" + "Run a dummy forward pass to correctly initialize the modules", + ) + # used for intra-node param sync and inter-node sync as well + self.broadcast_bucket_size = int(250 * 1024 * 1024) + + # reduction bucket size + self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024) + # Whether to perform input tensor CPU to GPU copies on a side-stream + self.use_side_stream_for_tensor_copies = ( + os.environ.get("PYTORCH_DDP_USE_SIDE_STREAM", "1") == "1" + ) + + # Initialize gradient buffers and register all reduce hook + self._delay_grad_buffer = None + self._delay_grad_views: List[torch.Tensor] = [] + self._delay_all_reduce_all_params = False + if len(self._delay_all_reduce_params) != 0: + self._register_delay_all_reduce_hook( + bucket_cap_mb=bucket_cap_mb, + param_to_hook_all_reduce=param_to_hook_all_reduce, + device_ids=device_ids, + ) + if self._delay_all_reduce_all_params: + return + + # Build parameters for reducer. + parameters, expect_sparse_gradient = self._build_params_for_reducer() + # Verify model equivalence. + _verify_param_shape_across_processes(self.process_group, parameters) + # Sync params and buffers. Ensures all DDP models start off at the same value. + _sync_module_states( + module=self.module, + process_group=self.process_group, + broadcast_bucket_size=self.broadcast_bucket_size, + src=0, + params_and_buffers_to_ignore=self.parameters_to_ignore, + broadcast_buffers=self.broadcast_buffers, + ) + # In debug mode, build a mapping of parameter index -> parameter. + param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters) + + # Builds reducer. + self._ddp_init_helper( + parameters, + expect_sparse_gradient, + param_to_name_mapping, + static_graph, + ) + self._comm_hooks: List[Tuple[Callable, object]] = [] + + if self.mixed_precision is not None: + _setup_mixed_precision_params(self.mixed_precision, self.module) + _cast_buffers(self.mixed_precision, self.module) + # Stream used for async low precision copies. 
+ self._mp_stream = torch.cuda.Stream() + self._submodule_to_event = defaultdict(deque) # type: ignore[var-annotated] + # Add forward pre-hook to root module to kick off copies to lower + # precision. + self.module.register_forward_pre_hook( + self._root_copy_hook, prepend=False, with_kwargs=True + ) + # Add forward pre hook to all submodules to wait for copy events + # before running computation. + for module in self.module.modules(): + module.register_forward_pre_hook( + self._module_wait_for_copy_hook, + prepend=False, + with_kwargs=True, + ) + # Set up callbacks in backward to upcast and use full precision + # params. TODO (rohan-varma): Make this compose with general + # comm hooks and apply_optimizer_in_backward. Importing inline to + # avoid circular import issue. + from torch.distributed.algorithms.ddp_comm_hooks.mixed_precision_hooks import ( + _AllreduceUpcastHookState, + _reducer_allreduce_and_upcast_hook, + ) + + upcast_hook_state = _AllreduceUpcastHookState( + ddp_weakref=weakref.ref(self), + upcast_stream=torch.cuda.Stream(), + ) + self.register_comm_hook( + upcast_hook_state, + _reducer_allreduce_and_upcast_hook, + ) + # Inform reducer of reduced precision param dtype for correctness + # of type checks between gradient and bucket. + self.reducer._set_mixed_precision_param_dtype( # type: ignore[attr-defined] + self.mixed_precision.param_dtype + ) + + self._has_rebuilt_buckets = False + + if static_graph: + self._set_static_graph() + + self._lazy_init_ran = False + + # Register the AccumulateGrad post hooks if optimize_ddp is + # True. The hooks will be deregistered if compiled_autograd is not + # enabled. 
+ self._accum_grad_hooks: List[RemovableHandle] = [] + optimize_ddp = torch._dynamo.config._get_optimize_ddp_mode() + self._use_python_reducer = optimize_ddp in ( + "python_reducer", + "python_reducer_without_compiled_forward", + ) + self._force_to_disable_cpp_reducer = ( + optimize_ddp == "python_reducer_without_compiled_forward" + ) + if self._use_python_reducer: + self._register_accum_grad_hook() + + def _register_accum_grad_hook(self): + import torch.distributed._functional_collectives as fcol + + def compiled_accum_grad_hook( + param, + *, + param_index: int, + ): + if not self.require_backward_grad_sync: + return + + if param.grad is None: + return + + if self._comm_hooks: + for hook, state in self._comm_hooks: + hook(state, (param.grad, param)) + else: + gradient = param.grad / self.process_group.size() + gradient = fcol.all_reduce(gradient, "sum", self.process_group) + param.grad.copy_(gradient) + + for index, param in enumerate(self._module_parameters): + self._accum_grad_hooks.append( + param.register_post_accumulate_grad_hook( + functools.partial( + compiled_accum_grad_hook, + param_index=index, + ) + ) + ) + + def _delayed_all_reduce_hook(self, grad): + world_size = dist.get_world_size(self.process_group) + + self._delay_grad_buffer.div_(world_size) # type: ignore[union-attr] + _ = dist.all_reduce( + self._delay_grad_buffer, group=self.process_group, async_op=True + ) + return grad + + def _register_delay_all_reduce_hook( + self, + bucket_cap_mb, + param_to_hook_all_reduce, + device_ids, + ): + # 1. Create gradient buffer + device = torch.device("cpu") if device_ids is None else device_ids[0] + self._delay_grad_buffer = torch.zeros( + sum([p.numel() for p in self._delay_all_reduce_params]), + device=device, + ) + + # 2. Broadcast the parameters + detached_params = [p.detach() for p in self._delay_all_reduce_params] + dist._broadcast_coalesced(self.process_group, detached_params, bucket_cap_mb, 0) + + # 3. 
Hook all reduce to the specified parameter + param_to_hook_all_reduce.register_hook(self._delayed_all_reduce_hook) + + # 4. Build tensor views for gradients + offset = 0 + for param in self._delay_all_reduce_params: + grad_view = self._delay_grad_buffer[offset : (offset + param.numel())].view( + param.shape + ) + self._delay_grad_views.append(grad_view) + offset = offset + param.numel() + + # 5. Check whether the all reduce of all params requiring grad is delayed. + for module_name, module in self.module.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + if param.requires_grad: + full_name = f"{module_name}.{param_name}" + if full_name not in self.parameters_to_ignore: + # There is at least a param whose all reduce will not be delayed. + # In this case, we should not set self._delay_all_reduce_all_params + # to True. + return + self._delay_all_reduce_all_params = True + + def _setup_in_backward_optimizers(self): + # Check if user has used apply_optim_in_backward to overlap optimizer + # step + DDP backward. Current constraints: + # 1. Only allreduce is supported at the moment, no custom communication. + # 2. For DDP-managed parameters that have their optimizer run in + # backward, their gradients are set to ``None``. If your use case + # requires DDP parameters grad not to be set to ``None`` after their + # in-backward optimizer runs, please ping + # https://github.com/pytorch/pytorch/issues/90052. + # NOTE: we use self._module_parameters instead of .parameters() since + # the former excludes ignored (non-DDP managed) parameters. + if any(hasattr(p, "_in_backward_optimizers") for p in self._module_parameters): + torch._C._log_api_usage_once("ddp.optimizer_in_backward") + # Remove hooks that apply_optim_in_backward had registered because + # DDP customizes how optimizer is overlapped with backward due to + # the allreduce. 
+ param_to_handle_map = ( + dist.optim.apply_optimizer_in_backward.param_to_optim_hook_handle_map + ) + for p in self._module_parameters: + for handle in param_to_handle_map.get(p, []): + handle.remove() + + # Need a weakref to DDP instance to run all_reduce (from reducer) + # and get managed DDP parameters. + ddp_weakref = weakref.ref(self) + # Note: importing in function, otherwise this will cause a circular + # import. + from torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks import ( + _apply_optim_in_backward_hook, + ) + + self.register_comm_hook( + ddp_weakref, + _apply_optim_in_backward_hook( + gradient_is_bucket_view=self.gradient_as_bucket_view + ), + ) + + self.reducer._set_optimizer_in_backward() # type: ignore[attr-defined] + + def _fire_reducer_autograd_hook(self, idx, *unused): + """ + Fire the reducer's autograd hook to allreduce params in a Reducer bucket. + + Note that this is only used during mixed precision training as the + Reducer's hooks installed during construction time would not be called + as we're working in the low precision parameter setting. + """ + self.reducer._autograd_hook(idx) # type: ignore[attr-defined] + + def _root_copy_hook(self, *args: Any, **kwargs: Any) -> None: + """ + For DDP mixed precision, put low precision copies on separate stream and create events to wait for them. + + When training with DDP mixed precision, this root pre-forward hook kicks + off low precision copies on a separate stream and creates respective + events to wait for them. + """ + # Clear out previous iteration submodule to event. This is because we + # may have populated some events for modules that didn't end up being + # used. + self._submodule_to_event = defaultdict(deque) # type: ignore[var-annotated] + with torch.cuda.stream(self._mp_stream): + for submodule in self.module.modules(): + for param in submodule.parameters(recurse=False): + # Do not cast DDP ignored parameters. 
+ if hasattr(param, "_ddp_ignored") and param._ddp_ignored: + continue + _alloc_storage(param._mp_param, param.size()) + # copy() implicitly casts to low precision + with torch.no_grad(): + param._mp_param.copy_(param.data) + # TODO: when zero_grad(set_to_none=False) or in grad + # accumulation case, accumulated grads can be in fp32 + # which can cause errors when running DDP backwards due + # to mismatched incoming and accumulated gradient types. + # So we manually cast the accumulated grad down for now, + # in the future we may shift to FSDP style gradient + # accumulation management where the accumulated gradient + # is saved and .grad field is set to None, bypassing + # this issue. + if param.grad is not None: + param.grad.data = param.grad.to( + self.mixed_precision.param_dtype # type: ignore[union-attr] + ) + param.data = param._mp_param + copy_event = torch.cuda.Event() + copy_event.record() + self._submodule_to_event[submodule].append(copy_event) + + def _module_wait_for_copy_hook( + self, + module, + *args: Any, + **kwargs: Any, + ) -> None: + """Before carrying out computation, wait on the appropriate event to ensure low precision copies have finished.""" + try: + event = self._submodule_to_event[module].popleft() + except IndexError: + # copy event has already been waited on + return + + event.wait(stream=torch.cuda.current_stream()) + for p in module.parameters(recurse=False): + # Don't register hooks if param does not require grad + if not p.requires_grad or (hasattr(p, "_ddp_ignored") and p._ddp_ignored): + continue + # We need to register autograd hook here instead of DDP's ctor + # since we're working with the low precision param. Register them + # via obtaining the gradient accumulator. 
+ tmp = p.expand_as(p) + grad_acc = tmp.grad_fn.next_functions[0][0] + + hook = grad_acc.register_hook( + functools.partial(self._fire_reducer_autograd_hook, p._idx) + ) + p._ddp_mp_hook_state = (grad_acc, hook) + + def _log_and_throw(self, err_type, err_msg): + if self.logger is not None: + self.logger.set_error_and_log(f"{str(err_type)}: {err_msg}") + raise err_type(err_msg) + + def _ddp_init_helper( + self, + parameters, + expect_sparse_gradient, + param_to_name_mapping, + static_graph, + ): + """ + DDP init helper function to manage parameters, grad hooks, logging, and SyncBatchNorm. + + Initialization helper function that does the following: + (1) bucketing the parameters for reductions + (2) resetting the bucketing states + (3) registering the grad hooks + (4) Logging construction-time DDP logging data + (5) passing a handle of DDP to SyncBatchNorm Layer + """ + # Notice, the parameters order is not in the order in which they are used, + # especially in models with control flow. + # + # Alongside parameters are not presented in the real execution order, + # if a certain model happens to also + # 1) have other collectives comm ops in its backward graph. + # 2) have unused parameter in subset ranks of the whole world. + # bucketing could insert ALL-REDUCE comm op too early on the rank with unused parameter, + # matching up with other collectives comm ops on other ranks unexpectedly. + # + # In order to handle this corner case, when the parameters are not in the real execution order, + # we don't do bucketing, thus only one ALL-REDUCE is inserted after all the gradients + # of the whole graph are computed. + # + # Notice, here we only disable bucketing for the first iteration. + # After the first iteration, it's OK to rebuild buckets, + # because "bucket rebuild" bucketizes parameters based on its real execution order in backward graph. + + # Can remove this branching once #73732 is landed. 
+ if static_graph is True or self.find_unused_parameters is False: + bucket_size_limits = [sys.maxsize] + else: + bucket_size_limits = [ + dist._DEFAULT_FIRST_BUCKET_BYTES, + self.bucket_bytes_cap, + ] + ( + bucket_indices, + per_bucket_size_limits, + ) = dist._compute_bucket_assignment_by_size( + parameters, + bucket_size_limits, + expect_sparse_gradient, + ) + + # Remember index for parameters if we are in mixed precision, as we + # need to pass in index to Reducer's autograd hook via python. + if self.mixed_precision is not None: + for i, p in enumerate(parameters): + p._idx = i + + # Note: reverse list of buckets because we want to approximate the + # order in which their gradients are produced, and assume they + # are used in the forward pass in the order they are defined. + self.reducer = dist.Reducer( + parameters, + list(reversed(bucket_indices)), + list(reversed(per_bucket_size_limits)), + self.process_group, + expect_sparse_gradient, + # The bucket size limit is specified in the constructor. + # Additionally, we allow for a single small bucket for parameters + # that are defined first, such that their gradients don't spill into + # a much larger bucket, adding unnecessary latency after gradient + # computation finishes. Experiments showed 1MB is a reasonable value. + self.bucket_bytes_cap, + self.find_unused_parameters, + self.gradient_as_bucket_view, + param_to_name_mapping, + # User can set dist._DEFAULT_FIRST_BUCKET_BYTES to tune DDP first + # bucket. + dist._DEFAULT_FIRST_BUCKET_BYTES, + ) + + self.logger = dist.Logger(self.reducer) + # Set as a weak reference to avoid reference cycle between + # logger and reducer. + self.reducer.set_logger(self.logger) + + has_sync_bn = False + for submodule in self.module.modules(): + if isinstance(submodule, torch.nn.SyncBatchNorm): + has_sync_bn = True + break + + # Set logging data that can be got during construction time. 
+ self.logger.set_construction_data_and_log( + self.module.__class__.__name__, + [] if self.device_ids is None else self.device_ids, + -1 if self.output_device is None else self.output_device, + self.broadcast_buffers, + has_sync_bn, + static_graph, + ) + + # passing a handle to torch.nn.SyncBatchNorm layer + self._passing_sync_batchnorm_handle(self.module) + + def __getstate__(self): + self._check_default_group() + attrs = copy.copy(self.__dict__) + del attrs["process_group"] + del attrs["reducer"] + del attrs["logger"] + return attrs + + def __setstate__(self, state): + # If serializable, then the process group should be the default one + self.process_group = _get_default_group() + super().__setstate__(state) + self.__dict__.setdefault("require_forward_param_sync", True) + self.__dict__.setdefault("require_backward_grad_sync", True) + parameters, expect_sparse_gradient = self._build_params_for_reducer() + # In debug mode, build a mapping of parameter index -> parameter. + param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters) + # Builds reducer. + self._ddp_init_helper( + parameters, + expect_sparse_gradient, + param_to_name_mapping, + self.static_graph, + ) + if self.static_graph: + self.reducer._set_static_graph() + assert self.logger is not None + self.logger._set_static_graph() + + def _build_params_for_reducer(self): + # Build tuple of (module, parameter) for all parameters that require grads. + modules_and_parameters = [ + (module, parameter) + for module_name, module in self.module.named_modules() + for parameter in [ + param + # Note that we access module.named_parameters instead of + # parameters(module). parameters(module) is only needed in the + # single-process multi device case, where it accesses replicated + # parameters through _former_parameters. 
+ for param_name, param in module.named_parameters(recurse=False) + if param.requires_grad + and f"{module_name}.{param_name}" not in self.parameters_to_ignore + ] + ] + + # Deduplicate any parameters that might be shared across child modules. + memo = set() + modules_and_parameters = [ + # "p not in memo" is the deduplication check. + # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed. + (m, p) + for m, p in modules_and_parameters + if p not in memo and not memo.add(p) # type: ignore[func-returns-value] + ] + + # Build list of parameters. + parameters = [parameter for _, parameter in modules_and_parameters] + + # Checks if a module will produce a sparse gradient. + def produces_sparse_gradient(module): + if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)): + return module.sparse + return False + + # Build list of booleans indicating whether or not to expect sparse + # gradients for the corresponding parameters. + expect_sparse_gradient = [ + produces_sparse_gradient(module) for module, _ in modules_and_parameters + ] + + self._assign_modules_buffers() + + return parameters, expect_sparse_gradient + + def _assign_modules_buffers(self): + """ + Assign self.module.named_buffers to self.modules_buffers. + + Assigns module buffers to self.modules_buffers which are then used to + broadcast across ranks when broadcast_buffers=True. Note that this + must be called every time buffers need to be synced because buffers can + be reassigned by user module, + see https://github.com/pytorch/pytorch/issues/63916. + """ + # Collect buffers for modules, filtering out buffers that should be ignored. + named_module_buffers = [ + (buffer, buffer_name) + for buffer_name, buffer in self.module.named_buffers() + if buffer_name not in self.parameters_to_ignore + ] + self.modules_buffers = [ + buffer for (buffer, buffer_name) in named_module_buffers + ] + # Dict[str, tensor] representing module buffers not ignored by DDP. 
+ self.named_module_buffers = { + buffer_name: buffer for (buffer, buffer_name) in named_module_buffers + } + + def _build_debug_param_to_name_mapping(self, parameters): + param_to_param_index = {parameters[i]: i for i in range(len(parameters))} + param_set = set(parameters) + param_index_to_param_fqn = {} + for module_name, module in self.module.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + fqn = f"{module_name}.{param_name}" + # Bypass ignored parameters since those are not reduced by DDP + # to begin with. + if fqn not in self.parameters_to_ignore and param.requires_grad: + if param not in param_set: + self._log_and_throw( + ValueError, + f"Param with name {fqn} found in module parameters, but not DDP parameters." + " This indicates a bug in DDP, please report an issue to PyTorch.", + ) + param_index = param_to_param_index[param] + param_index_to_param_fqn[param_index] = fqn + + # Ensure we covered all parameters + if len(param_set) != len(param_index_to_param_fqn): + self._log_and_throw( + ValueError, + ( + "Expected param to name mapping to cover all parameters, but" + f" got conflicting lengths: {len(param_set)} vs " + f"{len(param_index_to_param_fqn)}. This indicates a bug in DDP" + ", please report an issue to PyTorch." 
+ ), + ) + + return param_index_to_param_fqn + + def _get_parameters(self, m, recurse=True): + """Return a generator of module parameters.""" + + def model_parameters(m): + ps = ( + m._former_parameters.values() + if hasattr(m, "_former_parameters") + else m.parameters(recurse=False) + ) + yield from ps + + for mod in m.modules() if recurse else [m]: + yield from model_parameters(mod) + + def _check_default_group(self): + pickle_not_supported = False + try: + if self.process_group != _get_default_group(): + pickle_not_supported = True + except RuntimeError: + pickle_not_supported = True + + if pickle_not_supported: + self._log_and_throw( + RuntimeError, + "DDP Pickling/Unpickling are only supported " + "when using DDP with the default process " + "group. That is, when you have called " + "init_process_group and have not passed " + "process_group argument to DDP constructor", + ) + + @contextmanager + def no_sync(self): + r""" + Context manager to disable gradient synchronizations across DDP processes. + + Within this context, gradients will be accumulated on module + variables, which will later be synchronized in the first + forward-backward pass exiting the context. + + Example:: + + >>> # xdoctest: +SKIP("undefined variables") + >>> ddp = torch.nn.parallel.DistributedDataParallel(model, pg) + >>> with ddp.no_sync(): + >>> for input in inputs: + >>> ddp(input).backward() # no synchronization, accumulate grads + >>> ddp(another_input).backward() # synchronize grads + + .. warning:: + The forward pass should be included inside the context manager, or + else gradients will still be synchronized. 
+ """ + old_require_backward_grad_sync = self.require_backward_grad_sync + self.require_backward_grad_sync = False + try: + yield + finally: + self.require_backward_grad_sync = old_require_backward_grad_sync + + @classmethod + def _get_active_ddp_module(cls): + """`TorchDynamo` requires DDP's status and module for cooperative optimization.""" + return cls._active_ddp_module + + # note, this ctxmgr function is marked 'skip' in torchdynamo, so dynamo only kicks in + # for the 'module_to_run' underneath + # see torch._dynamo/eval_frame.py TorchPatcher.patch for more details + @contextmanager + @torch._disable_dynamo(recursive=False) + def _inside_ddp_forward(self): + DistributedDataParallel._active_ddp_module = self + try: + yield + finally: + DistributedDataParallel._active_ddp_module = None + + def _run_ddp_forward(self, *inputs, **kwargs): + if self._use_python_reducer: + return self.module(*inputs, **kwargs) # type: ignore[index] + else: + with self._inside_ddp_forward(): + return self.module(*inputs, **kwargs) # type: ignore[index] + + def _clear_grad_buffer(self): + # Making param.grad points to the grad buffers before backward is based on the + # assumption that the grad accumulation is done in place in autograd engine, + # for some edge cases, if the grad accumulation in autograd engine is not in + # place, then the param.grad and grad buffers are detached. + if self._delay_grad_buffer is not None: + # We batch zero_grad for all params by resetting the whole grad + # buffer when the grad of all params is set to None. 
+ all_param_grad_none = all( + param.grad is None for param in self._delay_all_reduce_params + ) + + for index, param in enumerate(self._delay_all_reduce_params): + if param.grad is None: + param.grad = self._delay_grad_views[index] + if not all_param_grad_none: + param.grad.zero_() + + if all_param_grad_none: + self._delay_grad_buffer.zero_() + + def _lazy_init(self): + # Initialization for DDP that occurs after construction, but lazily + # before the first forward pass. + self._setup_in_backward_optimizers() + self._lazy_init_ran = True + + def _should_disable_cpp_reducer(self) -> bool: + return self._use_python_reducer and ( + torch._utils.is_compiling() or self._force_to_disable_cpp_reducer + ) + + def _pre_forward(self, *inputs, **kwargs): + if self._should_disable_cpp_reducer(): + return inputs, kwargs + + # Disable the python reducer if compiled_autograd is not enabled. + if self._accum_grad_hooks: + for index, h in enumerate(self._accum_grad_hooks): + h.remove() + self._accum_grad_hooks.clear() + + if not self._lazy_init_ran and not torch._utils.is_compiling(): + self._lazy_init() + + if self._delay_all_reduce_all_params: + return inputs, kwargs + + if torch.is_grad_enabled() and self.require_backward_grad_sync: + assert self.logger is not None + self.logger.set_runtime_stats_and_log() + self.reducer.prepare_for_forward() + + # Notify the join context that this process has not joined, if + # needed + work = Join.notify_join_context(self) + if work: + self.reducer._set_forward_pass_work_handle( + work, self._divide_by_initial_world_size # type: ignore[arg-type] + ) + + # Calling _rebuild_buckets before forward computation, + # It may allocate new buckets before deallocating old buckets + # inside _rebuild_buckets. To save peak memory usage, + # call _rebuild_buckets before the peak memory usage increases + # during forward computation. + # This should be called only once during whole training period. 
+ if torch.is_grad_enabled() and self.reducer._rebuild_buckets(): + logger.info("Reducer buckets have been rebuilt in this iteration.") + self._has_rebuilt_buckets = True + + # sync params according to location (before/after forward) user + # specified as part of hook, if hook was specified. + if self._check_sync_bufs_pre_fwd(): + self._sync_buffers() + + if self._join_config.enable: + # Notify joined ranks whether they should sync in backwards pass or not. + self._check_global_requires_backward_grad_sync(is_joined_rank=False) + + if self.device_ids: + moved_inputs, moved_kwargs = _to_kwargs( + inputs, + kwargs, + torch.device(self.device_type, self.device_ids[0]), + self.use_side_stream_for_tensor_copies, + ) + args, kwargs = moved_inputs[0], moved_kwargs[0] + # Cast inputs to reduced precision if needed. + if self.mixed_precision is not None: + args, kwargs = _cast_forward_inputs( + self.mixed_precision.param_dtype, + *args, + **kwargs, + ) + return args, kwargs + else: + # Cast inputs to reduced precision if needed. + # TODO (rohan-varma) test this codepath. + if self.mixed_precision is not None: + inputs, kwargs = _cast_forward_inputs( + self.mixed_precision.param_dtype, + *inputs, + **kwargs, + ) + return inputs, kwargs + + def _post_forward(self, output): + if self._should_disable_cpp_reducer(): + return output + + if self._delay_all_reduce_all_params: + self._clear_grad_buffer() + return output + + # sync params according to location (before/after forward) user + # specified as part of hook, if hook was specified. + if self._check_sync_bufs_post_fwd(): + self._sync_buffers() + + if torch.is_grad_enabled() and self.require_backward_grad_sync: + self.require_forward_param_sync = True + # We'll return the output object verbatim since it is a freeform + # object. 
We need to find any tensors in this object, though, + # because we need to figure out which parameters were used during + # this forward pass, to ensure we short circuit reduction for any + # unused parameters. Only if `find_unused_parameters` is set. + if self.find_unused_parameters and not self.static_graph: + # Do not need to populate this for static graph. + self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + self.require_forward_param_sync = False + + # TODO: DDPSink is currently enabled for unused parameter detection and + # static graph training for first iteration. + if (self.find_unused_parameters and not self.static_graph) or ( + self.static_graph and not self._static_graph_delay_allreduce_enqueued + ): + ( + output_tensor_list, + treespec, + output_is_rref, + ) = _tree_flatten_with_rref(output) + output_placeholders = [None for _ in range(len(output_tensor_list))] + # Do not touch tensors that have no grad_fn, which can cause issues + # such as https://github.com/pytorch/pytorch/issues/60733 + for i, output in enumerate(output_tensor_list): + if torch.is_tensor(output) and output.grad_fn is None: + output_placeholders[i] = output + + # When find_unused_parameters=True, makes tensors which require grad + # run through the DDPSink backward pass. When not all outputs are + # used in loss, this makes those corresponding tensors receive + # undefined gradient which the reducer then handles to ensure + # param.grad field is not touched and we don't error out. + passthrough_tensor_list = _DDPSink.apply( + weakref.ref(self), + *output_tensor_list, + ) + for i in range(len(output_placeholders)): + if output_placeholders[i] is None: + output_placeholders[i] = passthrough_tensor_list[i] + + # Reconstruct output data structure. 
+ output = _tree_unflatten_with_rref( + output_placeholders, treespec, output_is_rref + ) + + # At the end of the forward pass, reset the grad buffer and grad views + self._clear_grad_buffer() + return output + + def forward(self, *inputs, **kwargs): + with torch.autograd.profiler.record_function("DistributedDataParallel.forward"): + inputs, kwargs = self._pre_forward(*inputs, **kwargs) + output = ( + self.module.forward(*inputs, **kwargs) + if self._delay_all_reduce_all_params + else self._run_ddp_forward(*inputs, **kwargs) + ) + return self._post_forward(output) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def to_kwargs(self, inputs, kwargs, device_id): + # Kept for BC + return _to_kwargs( + inputs, + kwargs, + torch.device(self.device_type, device_id), + self.use_side_stream_for_tensor_copies, + ) + + def gather(self, outputs, output_device): + return gather(outputs, output_device, dim=self.dim) + + def train(self, mode=True): + super().train(mode) + return self + + # When running in join mode, schedules an allreduce to notify joined ranks + # of whether backwards pass synchronization will run this iteration or not. + def _check_global_requires_backward_grad_sync(self, is_joined_rank): + if not is_joined_rank and self.require_backward_grad_sync: + requires_sync_tensor = torch.ones(1, device=self.device) + else: + requires_sync_tensor = torch.zeros(1, device=self.device) + + work = dist.all_reduce( + requires_sync_tensor, group=self.process_group, async_op=True + ) + + # (kwen2501) This if condition is a plain translation of previous + # behavior, i.e. in the `is_joined_rank=False` case, `work.wait()` + # is not called and it doesn't care about the result. I am guessing + # that it just wants to fire a matching all-reduce and does not want + # the main stream to wait. 
+ if is_joined_rank: + work.wait() + should_sync_backwards = requires_sync_tensor.item() != 0 + return should_sync_backwards + else: + return None # Return value is not/should not be used. + + # When running in join mode, checks and performs sync of module buffers if + # the models have buffers that should be synchronized in the forward pass. + def _check_and_sync_module_buffers(self): + if self._check_sync_bufs_pre_fwd(): + authoritative_rank = self._find_common_rank(self._distributed_rank, False) + self._sync_module_buffers(authoritative_rank) + + # When running in join model, agrees upon a common rank and broadcast model + # parameters to all other ranks. + def _sync_final_model(self, is_last_joiner): + # Agree upon the process that will be the authoritative model copy. + # The current rank is a candidate for being the authoritative copy if + # is_last_joiner=True. We break ties via picking the larger rank. + self._authoritative_rank = self._find_common_rank( + self._distributed_rank, is_last_joiner + ) + _sync_module_states( + module=self.module, + process_group=self.process_group, + broadcast_bucket_size=self.broadcast_bucket_size, + src=self._authoritative_rank, + params_and_buffers_to_ignore=self.parameters_to_ignore, + broadcast_buffers=self.broadcast_buffers, + ) + + # Schedule comm ops to match those scheduled in the reducer's backward + # pass. + def _match_all_reduce_for_bwd_pass(self): + comm_work = [] + # Schedule comm in the same order as Reducer schedules them, i.e. + # the order of the buckets. Retrieving the bucket order from the reducer + # ensures that we keep the same order in join mode, such as when bucket + # order is rebuilt dynamically. + + # Returns grad_buckets in order, but real tensors are substituted with + # zero tensors of the same shape. + grad_buckets = self.reducer._get_zeros_like_grad_buckets() + for grad_bucket in grad_buckets: + # Joined processes contribute zero gradient. 
In the case that + # divide_by_initial_world_size=True, we divide grads by the static + # world size, if not, the dividing factor is reduced by the number + # of joined processes. + work = self.reducer._run_comm_hook(grad_bucket) + comm_work.append(work) + for work in comm_work: + work.wait() + + # Allreduces the used parameter mapping across ranks. + def _match_unused_params_allreduce(self): + locally_used_param_map = self.reducer._get_local_used_map() + self.process_group.allreduce(locally_used_param_map) + + def join( + self, + divide_by_initial_world_size: bool = True, + enable: bool = True, + throw_on_early_termination: bool = False, + ): + r""" + Context manager for training with uneven inputs across processes in DDP. + + This context manager will keep track of already-joined DDP processes, + and "shadow" the forward and backward passes by inserting collective + communication operations to match with the ones created by non-joined + DDP processes. This will ensure each collective call has a corresponding + call by already-joined DDP processes, preventing hangs or errors that + would otherwise happen when training with uneven inputs across + processes. Alternatively, if the flag ``throw_on_early_termination`` is + specified to be ``True``, all trainers will throw an error once one rank + runs out of inputs, allowing these errors to be caught and handled + according to application logic. + + Once all DDP processes have joined, the context manager will broadcast + the model corresponding to the last joined process to all processes to + ensure the model is the same across all processes + (which is guaranteed by DDP). + + To use this to enable training with uneven inputs across processes, + simply wrap this context manager around your training loop. No further + modifications to the model or data loading is required. + + .. 
warning:: + If the model or training loop this context manager is wrapped around + has additional distributed collective operations, such as + ``SyncBatchNorm`` in the model's forward pass, then the flag + ``throw_on_early_termination`` must be enabled. This is because this + context manager is not aware of non-DDP collective communication. + This flag will cause all ranks to throw when any one rank + exhausts inputs, allowing these errors to be caught and recovered + from across all ranks. + + Args: + divide_by_initial_world_size (bool): If ``True``, will divide + gradients by the initial ``world_size`` DDP training was launched + with. If ``False``, will compute the effective world size + (number of ranks that have not depleted their inputs yet) and + divide gradients by that during allreduce. Set + ``divide_by_initial_world_size=True`` to ensure every input + sample including the uneven inputs have equal weight in terms of + how much they contribute to the global gradient. This is + achieved by always dividing the gradient by the initial + ``world_size`` even when we encounter uneven inputs. If you set + this to ``False``, we divide the gradient by the remaining + number of nodes. This ensures parity with training on a smaller + ``world_size`` although it also means the uneven inputs would + contribute more towards the global gradient. Typically, you + would want to set this to ``True`` for cases where the last few + inputs of your training job are uneven. In extreme cases, where + there is a large discrepancy in the number of inputs, setting + this to ``False`` might provide better results. + enable (bool): Whether to enable uneven input detection or not. Pass + in ``enable=False`` to disable in cases where you know that + inputs are even across participating processes. Default is + ``True``. + throw_on_early_termination (bool): Whether to throw an error + or continue training when at least one rank has exhausted + inputs. 
If ``True``, will throw upon the first rank reaching end + of data. If ``False``, will continue training with a smaller + effective world size until all ranks are joined. Note that if + this flag is specified, then the flag + ``divide_by_initial_world_size`` would be ignored. Default + is ``False``. + + + Example:: + + >>> # xdoctest: +SKIP("Distributed") + >>> import torch + >>> import torch.distributed as dist + >>> import os + >>> import torch.multiprocessing as mp + >>> import torch.nn as nn + >>> # On each spawned worker + >>> def worker(rank): + >>> dist.init_process_group("nccl", rank=rank, world_size=2) + >>> torch.cuda.set_device(rank) + >>> model = nn.Linear(1, 1, bias=False).to(rank) + >>> model = torch.nn.parallel.DistributedDataParallel( + >>> model, device_ids=[rank], output_device=rank + >>> ) + >>> # Rank 1 gets one more input than rank 0. + >>> inputs = [torch.tensor([1]).float() for _ in range(10 + rank)] + >>> with model.join(): + >>> for _ in range(5): + >>> for inp in inputs: + >>> loss = model(inp).sum() + >>> loss.backward() + >>> # Without the join() API, the below synchronization will hang + >>> # blocking for rank 1's allreduce to complete. + >>> torch.cuda.synchronize(device=rank) + """ + return Join( + [self], + enable, + throw_on_early_termination, + divide_by_initial_world_size=divide_by_initial_world_size, + ) + + def join_hook( + self, + **kwargs, + ): + r""" + DDP join hook enables training on uneven inputs by mirroring communications in forward and backward passes. + + Arguments: + kwargs (dict): a :class:`dict` containing any keyword arguments + to modify the behavior of the join hook at run time; all + :class:`Joinable` instances sharing the same join context + manager are forwarded the same value for ``kwargs``. + + The hook supports the following keyword arguments: + divide_by_initial_world_size (bool, optional): + If ``True``, then gradients are divided by the initial world + size that DDP was launched with. 
+ If ``False``, then gradients are divided by the effective world + size (i.e. the number of non-joined processes), meaning that + the uneven inputs contribute more toward the global gradient. + Typically, this should be set to ``True`` if the degree of + unevenness is small but can be set to ``False`` in extreme + cases for possibly better results. + Default is ``True``. + """ + divide_by_initial_world_size = kwargs.get("divide_by_initial_world_size", True) + return _DDPJoinHook( + self, divide_by_initial_world_size=divide_by_initial_world_size + ) + + @property + def join_device(self): + return self.device + + @property + def join_process_group(self): + return self.process_group + + def _register_buffer_comm_hook( + self, + state, + hook: Callable, + comm_hook_location=_BufferCommHookLocation.POST_FORWARD, + ): + r""" + Allow custom registration of hooks that define how buffer are synchronized across ranks. + + The hook takes in an optional state and is passed in a Dict[str, Tensor] + corresponding to buffer names and the buffers, and can run arbitrary reductions + on buffers as opposed to DDP's default broadcast from rank 0. This is useful for + example if a counter needs to be summed or averaged across ranks every iteration. + + Args: + state (Any): Optional state that is passed to the hook. + hook (Callable): Callable with the following signature: + ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]`` + comm_hook_location (_BufferCommHookLocation): Enum value indicating + where to run the hook. + _BufferCommHookLocation.PRE_FORWARD means that the + hook will run _before_ the forward pass, and + _BufferCommHookLocation.POST_FORWARD means that the + hook will run _after_ the forward pass. + + NOTE: To maximize performance, users can return a + List[torch.futures.Future] from their hook, and DDP will + install and await these hooks appropriately at the end of + the backward pass. 
This will ensure all buffers are + synchronized by the end of the backward pass. If this + setting is used, it is recommended to pass + comm_hook_location=_BufferCommHookLocation.POST_FORWARD, + which will trigger the hook after the forward pass. + If _BufferCommHookLocation.PRE_FORWARD is used, users must + ensure appropriate synchronization when manipulating GPU + buffers in the forward pass. + """ + assert callable(hook) + self.buffer_hook = _BufferCommHook( + buffer_comm_hook=hook, + buffer_comm_hook_state=state, + buffer_comm_hook_location=comm_hook_location, + ) + + def register_comm_hook(self, state: object, hook: Callable): + r""" + Register communication hook for user-defined DDP aggregation of gradients across multiple workers. + + This hook would be very useful for researchers to try out new ideas. For + example, this hook can be used to implement several algorithms like GossipGrad + and gradient compression which involve different communication strategies for + parameter syncs while running Distributed DataParallel training. + + Args: + state (object): Passed to the hook to maintain any state information during the training process. + Examples include error feedback in gradient compression, + peers to communicate with next in GossipGrad, etc. + + It is locally stored by each worker + and shared by all the gradient tensors on the worker. + hook (Callable): Callable with the following signature: + ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``: + + This function is called once the bucket is ready. The + hook can perform whatever processing is needed and return + a Future indicating completion of any async work (ex: allreduce). + If the hook doesn't perform any communication, it still + must return a completed Future. The Future should hold the + new value of grad bucket's tensors. 
Once a bucket is ready, + c10d reducer would call this hook and use the tensors returned + by the Future and copy grads to individual parameters. + Note that the future's return type must be a single tensor. + + We also provide an API called ``get_future`` to retrieve a + Future associated with the completion of ``c10d.ProcessGroup.Work``. + ``get_future`` is currently supported for NCCL and also supported for most + operations on GLOO and MPI, except for peer to peer operations (send/recv). + + .. warning :: + Grad bucket's tensors will not be predivided by world_size. User is responsible + to divide by the world_size in case of operations like allreduce. + + .. warning :: + DDP communication hook can only be registered once and should be registered + before calling backward. + + .. warning :: + The Future object that hook returns should contain a single tensor + that has the same shape with the tensors inside grad bucket. + + .. warning :: + ``get_future`` API supports NCCL, and partially GLOO and MPI backends (no support + for peer-to-peer operations like send/recv) and will return a ``torch.futures.Future``. + + Example:: + Below is an example of a noop hook that returns the same tensor. + + >>> # xdoctest: +SKIP('undefined name') + >>> def noop(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]: + >>> fut = torch.futures.Future() + >>> fut.set_result(bucket.buffer()) + >>> return fut + >>> ddp.register_comm_hook(state=None, hook=noop) + + Example:: + Below is an example of a Parallel SGD algorithm where gradients are encoded before + allreduce, and then decoded after allreduce. + + >>> # xdoctest: +SKIP('undefined name') + >>> def encode_and_decode(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]: + >>> encoded_tensor = encode(bucket.buffer()) # encode gradients + >>> fut = torch.distributed.all_reduce(encoded_tensor).get_future() + >>> # Define the then callback to decode. 
+ >>> def decode(fut): + >>> decoded_tensor = decode(fut.value()[0]) # decode gradients + >>> return decoded_tensor + >>> return fut.then(decode) + >>> ddp.register_comm_hook(state=None, hook=encode_and_decode) + """ + self._check_comm_hook(hook) + if hook.__name__ in ["bf16_compress_hook", "fp16_compress_hook"]: + # If we pass None, then the hook will try to get the world size + # by calling `dist.group.WORLD.size()`, which causes compilation + # errors. So we pre-decode the process group and pass it to the + # hook. + if state is None: + state = dist.group.WORLD + assert self.logger is not None + self.logger._set_comm_hook_name(hook.__qualname__) + self._comm_hooks.append((hook, state)) + dist._register_comm_hook(self.reducer, state, hook) + + def _register_builtin_comm_hook(self, comm_hook_type): + r""" + Register a built-in communication hook that specifies how DDP aggregates gradients across multiple workers. + + The built-in hooks aim to provide efficient C++ implementations for certain hooks, + which might not be as efficient if implemented in Python using a Python communication hook. + + Args: + comm_hook_type (dist.BuiltinCommHookType): type of communication hook, such as ALLREDUCE, FP16_COMPRESS, etc. + + .. warning :: + DDP communication hook can only be registered once and should be registered + before calling backward. + + Example:: + Below is an example of a FP16 compression where gradients are + compressed into 16-bit floating-point numbers before allreduce, and + then decompressed after allreduce. 
+ + >>> # xdoctest: +SKIP('undefined name') + >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS) + + """ + assert self.logger is not None + self.logger._set_comm_hook_name(str(comm_hook_type)) + dist._register_builtin_comm_hook(self.reducer, comm_hook_type) + + def _register_fused_optim(self, optim: Type, *args, optim_params=None, **kwargs): + r""" + Register an optimizer in DDP to optimize parameter immediately after its gradient reduction. + + Registers an optimizer with DDP such that the optimization for a + parameter will run immediately when that parameter's gradient is + finished with reduction, instead of waiting for all parameters' + gradients to finish reduction. This can result in a training speedup + depending on your workload since the optimizer can run while gradient + reduction for other parameters are still ongoing. In addition, this has + the potential to reduce peak memory consumption during training, as it + only needs to load the per-parameter optimizer states of a single + parameter at a time, instead of loading all per-parameter optimizer + states at once. + + Args: + optim (Type): a ``torch.optim.Optimizer`` class to be registered + as a fused optimizer. + *args (Sequence[Any]): Arguments to forward to `optim`. + optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters + to optimize, similar to `params` argument of traditional `torch.optim` + Optimizers. If this is omitted, all DDP model parameters will be + optimized. + **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim`. + + .. warning :: + _register_fused_optim should only be called once on a DDP instance, + and registering multiple fused optimizers for the same DDP model + is not currently supported. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. + + .. 
warning :: + _register_fused_optim and register_comm_hook currently do not + compose together, meaning that custom DDP communication hooks are + not supported with overlapped optimizers. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. + + .. warning :: + Gradient accumulation and DDP `no_sync` are currently not supported + with overlapped optimizer. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. + + Example:: + + >>> # xdoctest: +SKIP("No rendezvous handler") + >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...') + >>> net = torch.nn.parallel.DistributedDataParallel(model, pg) + >>> lr = 1e-2 + >>> betas = (0.9, 0.99) + >>> eps = 1e-6 + >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps) + >>> # Example with subset of parameters + >>> params_to_opt = [list(net.parameters())[0]] + >>> net._register_fused_optim( + ... torch.optim.Adam, lr, optim_params=params_to_opt, betas=betas, eps=eps + ... ) + """ + # Note: importing in function, otherwise this will cause a circular + # import as optimizer_overlap module needs to import DistributedDataParallel. + from torch.distributed.algorithms._optimizer_overlap import _as_overlapped_optim + + overlapped_optim = _as_overlapped_optim(optim, optim_params, *args, **kwargs) + try: + overlapped_optim.register_ddp(self) + except NotImplementedError as e: + raise RuntimeError( + f"{optim} does not support overlapped DDP. Please file an issue to PyTorch or the respective owner of {optim}." 
+ ) from e + + def _distributed_broadcast_coalesced( + self, tensors, buffer_size, authoritative_rank=0 + ): + dist._broadcast_coalesced( + self.process_group, tensors, buffer_size, authoritative_rank + ) + + def _check_sync_bufs_post_fwd(self): + return ( + self.will_sync_module_buffers() + and hasattr(self, "buffer_hook") + and self.buffer_hook.buffer_comm_hook_location + == _BufferCommHookLocation.POST_FORWARD + ) + + def _check_sync_bufs_pre_fwd(self): + return self.will_sync_module_buffers() and ( + not hasattr(self, "buffer_hook") + or self.buffer_hook.buffer_comm_hook_location + == _BufferCommHookLocation.PRE_FORWARD + ) + + def will_sync_module_buffers(self): + return ( + self.require_forward_param_sync + and self.broadcast_buffers + and len(self.modules_buffers) > 0 + ) + + def _find_common_rank(self, input_rank, rank_cond): + # -1 indicates that this rank is not under consideration to be the + # common_rank + rank_to_use = torch.tensor( + [input_rank if rank_cond else -1], + device=self.device, + ) + dist.all_reduce(rank_to_use, op=ReduceOp.MAX, group=self.process_group) + if rank_to_use.item() == -1: + self._log_and_throw( + ValueError, + "BUG! Expected rank_cond to be true for at least one process." + " This indicates a bug in PyTorch, please report an issue.", + ) + return rank_to_use.item() + + def _sync_buffers(self): + with torch.no_grad(): + # module buffer sync + # Synchronize buffers across processes. + # If we are running DDP with the join manager, we have to agree + # upon a rank to sync module buffers from, since rank 0 may + # already have been joined and have stale module buffers. + if self._join_config.enable: + authoritative_rank = self._find_common_rank( + self._distributed_rank, True + ) + else: + # The process with rank 0 is considered the authoritative copy. + authoritative_rank = 0 + # Update self.modules_buffers incase any buffers were + # reassigned. 
+ self._assign_modules_buffers() + self._sync_module_buffers(authoritative_rank) + + def _sync_module_buffers(self, authoritative_rank): + if not hasattr(self, "buffer_hook"): + self._default_broadcast_coalesced(authoritative_rank=authoritative_rank) + else: + hook = self.buffer_hook.buffer_comm_hook + state = self.buffer_hook.buffer_comm_hook_state + futs = hook(state, self.named_module_buffers) + if futs is not None: + self.reducer._install_post_backward_futures(futs) + + def _default_broadcast_coalesced( + self, bufs=None, bucket_size=None, authoritative_rank=0 + ): + """ + Broadcasts buffers from rank 0 to rest of workers. + + If bufs, bucket_size are None, default values self.modules_buffers + and self.broadcast_bucket_size are used instead. + """ + if bufs is None: + bufs = self.modules_buffers + if bucket_size is None: + bucket_size = self.broadcast_bucket_size + + self._distributed_broadcast_coalesced(bufs, bucket_size, authoritative_rank) + + def _passing_sync_batchnorm_handle(self, module): + for layer in module.modules(): + if isinstance(layer, torch.nn.modules.SyncBatchNorm): + if self.device_type == "cpu": + self._log_and_throw( + ValueError, + "SyncBatchNorm layers only work with GPU modules", + ) + + def _check_comm_hook(self, hook): + if not callable(hook): + self._log_and_throw(TypeError, "Communication hook must be callable.") + + sig = inspect.signature(hook) + if ( + sig.parameters["bucket"].annotation != inspect._empty + and sig.parameters["bucket"].annotation != dist.GradBucket + ): + self._log_and_throw( + ValueError, + "Communication hook: bucket annotation should be dist.GradBucket.", + ) + + if ( + sig.return_annotation != inspect._empty + and sig.return_annotation != torch.futures.Future[torch.Tensor] + ): + self._log_and_throw( + ValueError, + "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].", + ) + + if hook.__name__ in [ + "bf16_compress_hook", + "bf16_compress_wrapper_hook", + ] and ( + 
(torch.version.cuda is None and torch.version.hip is None) + or ( + torch.version.cuda is not None + and int(torch.version.cuda.split(".")[0]) < 11 + ) + or not dist.is_available() + or not dist.is_nccl_available() + or torch.cuda.nccl.version() < (2, 10) + ): + self._log_and_throw( + TypeError, + "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+.", + ) + + @property + def _distributed_rank(self): + return dist.get_rank(self.process_group) + + @staticmethod + def _get_data_parallel_params(module, named_params=False): + """Return a generator of parameters managed by a given DDP unit.""" + for param in ( + module.parameters() if not named_params else module.named_parameters() + ): + if not hasattr(param, "_ddp_ignored"): + yield param + + @staticmethod + def _set_params_and_buffers_to_ignore_for_model( + module, params_and_buffers_to_ignore + ): + """ + Set parameters and buffers to be ignored by DDP. + + Expected format for parameters is the fully qualified name: {module_name}.{param_name}, and + similarly, {module_name}.{buffer_name} for buffers. For example: + params_to_ignore = [] + # NB: model here is vanilla PyTorch module, not yet wrapped with DDP. + for module_name, module in model.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + if should_ignore(param): + # Create expected format + fqn = f"{module_name}.{param_name}" + params_to_ignore.append(fqn) + torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model( + model, + params_to_ignore + ) + """ + # This is a workaround to set parameters and buffers DDP should ignore + # during synchronization. It will be removed when the API is finalized + # as part of addressing https://github.com/pytorch/pytorch/issues/43690. 
+ module._ddp_params_and_buffers_to_ignore = params_and_buffers_to_ignore + for name, param in module.named_parameters(): + if name in params_and_buffers_to_ignore: + param._ddp_ignored = True + for name, buffer in module.named_buffers(): + if name in params_and_buffers_to_ignore: + buffer._ddp_ignored = True + + def _get_ddp_logging_data(self): + r""" + Return a dictionary of logging data for debugging and analysis. + + This interface can be called after DistributedDataParallel() is + constructed. It returns a dictionary of logging data. It could help + for debugging and analysis. The logging data includes DistributedDataParallel + constructor input parameters, some internal states of DistributedDataParallel + and performance metrics. Simply print the dictionary and see what + these metrics are. + This is a prototype interface and subject to change in the future. + """ + assert self.logger is not None + ddp_logging_data = self.logger._get_ddp_logging_data() + return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map} + + def _set_ddp_runtime_logging_sample_rate(self, sample_rate): + r""" + Set sample_rate of collecting runtime stats. + + This interface allows users to set sample_rate of collecting + runtime stats. The runtime stats will be recorded for the + first 10 iterations, after 10 iterations runtime stats will be + recorded once every "sample_rate" training iterations. In + default, runtime stats are recorded for the first 10 iterations, + after 10 iterations runtime stats are recorded once every + "kDDPRuntimeLoggingSampleRate=100" training iterations. + This is a prototype interface and subject to change in the future. + """ + if sample_rate < 1: + self._log_and_throw( + ValueError, + "DDP runtime logging sample rate should be equal or greater than 1", + ) + self.reducer._set_ddp_runtime_logging_sample_rate(sample_rate) + + def _set_static_graph(self): + """ + Set static graph for DDP. 
+ + It is recommended to set static graph in the DDP constructor, which will + call this private API internally. + """ + # If self.static_graph has been set, no need to set it again + if self.static_graph: + warnings.warn( + "You've set static_graph to be True, no need to set it again." + ) + return + self.static_graph = True + self._static_graph_delay_allreduce_enqueued = False + self.reducer._set_static_graph() + assert self.logger is not None + self.logger._set_static_graph() + if self.find_unused_parameters: + warnings.warn( + "You passed find_unused_parameters=true to DistributedDataParallel, " + "`_set_static_graph` will detect unused parameters automatically, so " + "you do not need to set find_unused_parameters=true, just be sure these " + "unused parameters will not change during training loop while calling " + "`_set_static_graph`." + ) + + def _remove_autograd_hooks(self): + """Remove autograd hooks registered by the reducer on the model parameters.""" + self.reducer._remove_autograd_hooks() + + def _check_reducer_finalized(self): + """ + Check if the reducer has processed all buckets and finalized the backward appropriately. + + It is useful to call this method after calling .backward() in your training loop + in order to avoid subsequent hard to debug errors down the road due to the + reducer not finalizing backward. + """ + self.reducer._check_reducer_finalized() + + def _set_sparse_metadata(self, global_unique_ids): + self.reducer._set_sparse_metadata(global_unique_ids) + + def _update_process_group(self, new_process_group): + """ + Dynamically updates the process group for DDP so that we can shrink/expand DDP + world size without having to reinitialize DDP. + + NOTE: If you are using custom communications hooks via, register_comm_hook, + you need to update the process groups for those hooks separately. + """ + # Force a rebuild of buckets for a new process group. 
This ensures all ranks + # are synchronized in terms of when they will rebuild buckets and also + # re-evaluates previous assumptions of buckets given the world size might have + # changed. + self._has_rebuilt_buckets = False + self.reducer._reset_state() + + if not _rank_not_in_group(new_process_group): + self.process_group = new_process_group + self.reducer._update_process_group(new_process_group) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..6a90f897fa8ada1575524e50474df402b9c42a0d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py @@ -0,0 +1,110 @@ +import threading +import torch +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast +from ..modules import Module +from torch.cuda._utils import _get_device_index +from torch.cuda.amp import autocast +from torch._utils import ExceptionWrapper + +__all__ = ['get_a_var', 'parallel_apply'] + +def get_a_var(obj: Union[torch.Tensor, List[Any], Tuple[Any, ...], Dict[Any, Any]]) -> Optional[torch.Tensor]: + if isinstance(obj, torch.Tensor): + return obj + + if isinstance(obj, (list, tuple)): + for result in map(get_a_var, obj): + if isinstance(result, torch.Tensor): + return result + if isinstance(obj, dict): + for result in map(get_a_var, obj.items()): + if isinstance(result, torch.Tensor): + return result + return None + +def parallel_apply( + modules: Sequence[Module], + inputs: Sequence[Any], + kwargs_tup: Optional[Sequence[Dict[str, Any]]] = None, + devices: Optional[Sequence[Optional[Union[int, torch.device]]]] = None, +) -> List[Any]: + r"""Apply each `module` in :attr:`modules` in parallel on each of :attr:`devices`. 
+ + Args: + modules (Module): modules to be parallelized + inputs (tensor): inputs to the modules + devices (list of int or torch.device): CUDA devices + + :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and + :attr:`devices` (if given) should all have same length. Moreover, each + element of :attr:`inputs` can either be a single object as the only argument + to a module, or a collection of positional arguments. + """ + assert len(modules) == len(inputs), f'The number of modules {len(modules)} is not equal to the number of inputs {len(inputs)}' + if kwargs_tup is not None: + assert len(modules) == len(kwargs_tup) + else: + kwargs_tup = (cast(Dict[str, Any], {}),) * len(modules) + if devices is not None: + assert len(modules) == len(devices) + else: + devices = [None] * len(modules) + devices = [_get_device_index(x, True) for x in devices] + streams = [torch.cuda.current_stream(x) for x in devices] + lock = threading.Lock() + results = {} + grad_enabled, autocast_enabled = torch.is_grad_enabled(), torch.is_autocast_enabled() + + def _worker( + i: int, + module: Module, + input: Any, + kwargs: Dict[str, Any], + device: Optional[Union[int, torch.device]] = None, + stream: Optional[torch.cuda.Stream] = None, + ) -> None: + torch.set_grad_enabled(grad_enabled) + if device is None: + t = get_a_var(input) + if t is None: + with lock: + results[i] = ExceptionWrapper( + where=f"in replica {i}, no device was provided and no tensor input was found; " + "device cannot be resolved") + return + device = t.get_device() + if stream is None: + stream = torch.cuda.current_stream(device) + try: + with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled): + # this also avoids accidental slicing of `input` if it is a Tensor + if not isinstance(input, (list, tuple)): + input = (input,) + output = module(*input, **kwargs) + with lock: + results[i] = output + except Exception: + with lock: + results[i] = ExceptionWrapper( + where=f"in 
replica {i} on device {device}") + + if len(modules) > 1: + threads = [threading.Thread(target=_worker, + args=(i, module, input, kwargs, device, stream)) + for i, (module, input, kwargs, device, stream) in + enumerate(zip(modules, inputs, kwargs_tup, devices, streams))] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + else: + _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0], streams[0]) + + outputs = [] + for i in range(len(inputs)): + output = results[i] + if isinstance(output, ExceptionWrapper): + output.reraise() + outputs.append(output) + return outputs diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76879dafdb905988587af05d71f7b87a0d146784 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/dynamic/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/dynamic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5a49722b1e9d7e6384e72e5277dee042d252bc28 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/dynamic/__init__.py @@ -0,0 +1,7 @@ +# flake8: noqa: F401 +r"""QAT Dynamic Modules. + +This package is in the process of being deprecated. +Please, use `torch.ao.nn.qat.dynamic` instead. 
+""" +from .modules import * # noqa: F403 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/dynamic/modules/__pycache__/linear.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/dynamic/modules/__pycache__/linear.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0ca6472a656928b3473dc383a0f5cfad36198fa Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/dynamic/modules/__pycache__/linear.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..63b1d2c08efd3f8c79cf142276525474f2c3b7f9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__init__.py @@ -0,0 +1,24 @@ +# flake8: noqa: F401 +r"""QAT Modules. + +This package is in the process of being deprecated. +Please, use `torch.ao.nn.qat.modules` instead. +""" +from torch.ao.nn.qat.modules.linear import Linear +from torch.ao.nn.qat.modules.conv import Conv1d +from torch.ao.nn.qat.modules.conv import Conv2d +from torch.ao.nn.qat.modules.conv import Conv3d +from torch.ao.nn.qat.modules.embedding_ops import EmbeddingBag, Embedding + +from . import conv +from . import embedding_ops +from . 
import linear + +__all__ = [ + "Linear", + "Conv1d", + "Conv2d", + "Conv3d", + "Embedding", + "EmbeddingBag", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d129553c46d2fa51eafb6a25ce40141676f770d7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/conv.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/conv.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25db33cfdc8b8554a1307a700deb324e86b4c3e0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/conv.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e20176fac8354409c38c47275e5c7131dbe90e82 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/linear.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/linear.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6f0045c67408329b222ebf6f2753d2f4e8aabfee Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/__pycache__/linear.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/conv.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..33fa1de6b561f86b3e23044f0caa0e3baa5ac5f7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/conv.py @@ -0,0 +1,12 @@ +# flake8: noqa: F401 +r"""QAT Modules. + +This file is in the process of migration to `torch/ao/nn/qat`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/qat/modules`, +while adding an import statement here. +""" +from torch.ao.nn.qat.modules.conv import Conv1d +from torch.ao.nn.qat.modules.conv import Conv2d +from torch.ao.nn.qat.modules.conv import Conv3d diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..f5841a46096c18bb1d5afb3ca5c25d9c9666d22a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/qat/modules/linear.py @@ -0,0 +1,10 @@ +# flake8: noqa: F401 +r"""QAT Modules. + +This file is in the process of migration to `torch/ao/nn/qat`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/qat/modules`, +while adding an import statement here. 
+""" +from torch.ao.nn.qat.modules.linear import Linear diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a1ff60ed6a8b00777e3ca8ad5f8cf46f201016f3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/__init__.py @@ -0,0 +1,31 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. +""" + +from torch.ao.nn.quantized.reference.modules.linear import Linear +from torch.ao.nn.quantized.reference.modules.conv import Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d +from torch.ao.nn.quantized.reference.modules.rnn import RNNCell, LSTMCell, GRUCell, LSTM +from torch.ao.nn.quantized.reference.modules.sparse import Embedding, EmbeddingBag + +__all__ = [ + 'Linear', + 'Conv1d', + 'Conv2d', + 'Conv3d', + 'ConvTranspose1d', + 'ConvTranspose2d', + 'ConvTranspose3d', + 'RNNCell', + 'LSTMCell', + 'GRUCell', + 'LSTM', + 'Embedding', + 'EmbeddingBag', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9708c15fbc8ab0010a53eceaf63637a3f44461aa Binary files /dev/null and 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/conv.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..bbfeb2959f4b4c4030c5496fd3a4f666c9330569 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/conv.py @@ -0,0 +1,19 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. 
+""" + +from torch.ao.nn.quantized.reference.modules.conv import _ConvNd +from torch.ao.nn.quantized.reference.modules.conv import Conv1d +from torch.ao.nn.quantized.reference.modules.conv import Conv2d +from torch.ao.nn.quantized.reference.modules.conv import Conv3d +from torch.ao.nn.quantized.reference.modules.conv import _ConvTransposeNd +from torch.ao.nn.quantized.reference.modules.conv import ConvTranspose1d +from torch.ao.nn.quantized.reference.modules.conv import ConvTranspose2d +from torch.ao.nn.quantized.reference.modules.conv import ConvTranspose3d diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..6be6d5a140bb58f76b0e6061eb4ccb37d385757f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/linear.py @@ -0,0 +1,12 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. 
+""" + +from torch.ao.nn.quantized.reference.modules.linear import Linear diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/rnn.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..2464eab87b52469a5ee9c0ef3e0a9ce13fb814bf --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/rnn.py @@ -0,0 +1,17 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. +""" + +from torch.ao.nn.quantized.reference.modules.rnn import RNNCellBase +from torch.ao.nn.quantized.reference.modules.rnn import RNNCell +from torch.ao.nn.quantized.reference.modules.rnn import LSTMCell +from torch.ao.nn.quantized.reference.modules.rnn import GRUCell +from torch.ao.nn.quantized.reference.modules.rnn import RNNBase +from torch.ao.nn.quantized.reference.modules.rnn import LSTM diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f79835124931eca4763677b2cdc6c1a748dd74c1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/_reference/modules/utils.py @@ -0,0 +1,15 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. 
+ +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. +""" +from torch.ao.nn.quantized.reference.modules.utils import _quantize_weight +from torch.ao.nn.quantized.reference.modules.utils import _quantize_and_dequantize_weight +from torch.ao.nn.quantized.reference.modules.utils import _save_weight_qparams +from torch.ao.nn.quantized.reference.modules.utils import _get_weight_qparam_keys +from torch.ao.nn.quantized.reference.modules.utils import ReferenceQuantizedModule diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/functional.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..d763e171fdb432c8ba2059cc2332e7ac6424854a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/functional.py @@ -0,0 +1,10 @@ +r"""nn.quantized.functional. + +Quantized equivalents of the `nn.functional`. + +Note:: + This location is in the process of being deprecated. + Please, use the `torch.ao.nn.quantized.functional` instead. 
+""" + +from torch.ao.nn.quantized.functional import * # noqa: F401,F403 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/__pycache__/embedding_ops.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/__pycache__/embedding_ops.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cce1fdc83eb659fe34a9385098f659c0ec6640cc Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/__pycache__/embedding_ops.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/__pycache__/rnn.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/__pycache__/rnn.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b21cbd24ec303223c34ac10de4781bbacc71157c Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/__pycache__/rnn.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/batchnorm.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/batchnorm.py new file mode 100644 index 0000000000000000000000000000000000000000..29cb184fbece72de56055f8f00e471e881c72c12 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/batchnorm.py @@ -0,0 +1,12 @@ +# flake8: noqa: F401 +r"""Quantized Modules. + +This file is in the process of migration to `torch/ao/nn/quantized`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/modules`, +while adding an import statement here. 
+""" + +from torch.ao.nn.quantized.modules.batchnorm import BatchNorm2d +from torch.ao.nn.quantized.modules.batchnorm import BatchNorm3d diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..e558bdb817b3fcba98fee8d4aaa08c91cd7183ff --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/nn/quantized/modules/linear.py @@ -0,0 +1,14 @@ +# flake8: noqa: F401 +r"""Quantized Modules. + +This file is in the process of migration to `torch/ao/nn/quantized`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/modules`, +while adding an import statement here. +""" + +__all__ = ['LinearPackedParams', 'Linear'] + +from torch.ao.nn.quantized.modules.linear import Linear +from torch.ao.nn.quantized.modules.linear import LinearPackedParams