salmankhanpm committed on Mar 19

Commit

4f4376a

verified ·

1 Parent(s): 69e1a8d

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

accelerate/commands/__init__.py +13 -0
accelerate/commands/accelerate_cli.py +54 -0
accelerate/commands/config/__init__.py +52 -0
accelerate/commands/config/cluster.py +939 -0
accelerate/commands/config/config.py +89 -0
accelerate/commands/config/config_args.py +252 -0
accelerate/commands/config/config_utils.py +122 -0
accelerate/commands/config/default.py +172 -0
accelerate/commands/config/sagemaker.py +274 -0
accelerate/commands/config/update.py +63 -0
accelerate/commands/env.py +143 -0
accelerate/commands/estimate.py +318 -0
accelerate/commands/launch.py +1415 -0
accelerate/commands/menu/__init__.py +14 -0
accelerate/commands/menu/cursor.py +65 -0
accelerate/commands/menu/helpers.py +59 -0
accelerate/commands/menu/input.py +84 -0
accelerate/commands/menu/keymap.py +133 -0
accelerate/commands/menu/selection_menu.py +145 -0
accelerate/commands/merge.py +69 -0
accelerate/commands/test.py +65 -0
accelerate/commands/to_fsdp2.py +172 -0
accelerate/commands/tpu.py +157 -0
accelerate/commands/utils.py +123 -0
accelerate/test_utils/__init__.py +66 -0
accelerate/test_utils/examples.py +148 -0
accelerate/test_utils/scripts/__init__.py +13 -0
accelerate/test_utils/scripts/external_deps/__init__.py +13 -0
accelerate/test_utils/scripts/external_deps/test_checkpointing.py +269 -0
accelerate/test_utils/scripts/external_deps/test_ds_alst_ulysses_sp.py +131 -0
accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py +331 -0
accelerate/test_utils/scripts/external_deps/test_metrics.py +307 -0
accelerate/test_utils/scripts/external_deps/test_peak_memory_usage.py +323 -0
accelerate/test_utils/scripts/external_deps/test_performance.py +299 -0
accelerate/test_utils/scripts/external_deps/test_pippy.py +117 -0
accelerate/test_utils/scripts/external_deps/test_zero3_integration.py +59 -0
accelerate/test_utils/scripts/test_cli.py +32 -0
accelerate/test_utils/scripts/test_ddp_comm_hook.py +85 -0
accelerate/test_utils/scripts/test_distributed_data_loop.py +429 -0
accelerate/test_utils/scripts/test_merge_weights.py +158 -0
accelerate/test_utils/scripts/test_notebook.py +125 -0
accelerate/test_utils/scripts/test_ops.py +181 -0
accelerate/test_utils/scripts/test_script.py +909 -0
accelerate/test_utils/scripts/test_sync.py +413 -0
accelerate/test_utils/testing.py +889 -0
accelerate/test_utils/training.py +150 -0
accelerate/utils/__init__.py +304 -0
accelerate/utils/ao.py +143 -0
accelerate/utils/bnb.py +464 -0
accelerate/utils/constants.py +108 -0

accelerate/commands/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

accelerate/commands/accelerate_cli.py ADDED Viewed

	@@ -0,0 +1,54 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from accelerate.commands.config import get_config_parser
+from accelerate.commands.env import env_command_parser
+from accelerate.commands.estimate import estimate_command_parser
+from accelerate.commands.launch import launch_command_parser
+from accelerate.commands.merge import merge_command_parser
+from accelerate.commands.test import test_command_parser
+from accelerate.commands.to_fsdp2 import to_fsdp2_command_parser
+from accelerate.commands.tpu import tpu_command_parser
+from accelerate.commands.utils import CustomArgumentParser
+def main():
+    """Entry point of the `accelerate` command-line tool.
+
+    Builds the top-level parser, registers each sub-command parser on it, parses the
+    command line and calls the handler stored in `args.func` with the parsed arguments.
+
+    Raises:
+        SystemExit: With exit code 1, after printing the help text, when the parsed
+            arguments carry no `func` attribute.
+    """
+    parser = CustomArgumentParser("Accelerate CLI tool", usage="accelerate <command> [<args>]", allow_abbrev=False)
+    subparsers = parser.add_subparsers(help="accelerate command helpers")
+    # Register commands
+    get_config_parser(subparsers=subparsers)
+    estimate_command_parser(subparsers=subparsers)
+    env_command_parser(subparsers=subparsers)
+    launch_command_parser(subparsers=subparsers)
+    merge_command_parser(subparsers=subparsers)
+    tpu_command_parser(subparsers=subparsers)
+    test_command_parser(subparsers=subparsers)
+    to_fsdp2_command_parser(subparsers=subparsers)
+    # Let's go
+    args = parser.parse_args()
+    if not hasattr(args, "func"):
+        parser.print_help()
+        # Raise SystemExit directly instead of calling `exit()`: that builtin is only
+        # injected by the `site` module and is absent when it is not loaded (`python -S`).
+        raise SystemExit(1)
+    # Run
+    args.func(args)
+if __name__ == "__main__":
+    main()

accelerate/commands/config/__init__.py ADDED Viewed

	@@ -0,0 +1,52 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from .config import config_command_parser
+from .config_args import default_config_file, load_config_from_file  # noqa: F401
+from .default import default_command_parser
+from .update import update_command_parser
+def get_config_parser(subparsers=None):
+    """Build the `config` command parser and attach its sub-commands.
+
+    Args:
+        subparsers: Forwarded unchanged to `config_command_parser`; defaults to `None`.
+
+    Returns:
+        The parser returned by `config_command_parser`, after the `default` and `update`
+        command parsers have been registered on its "subcommands" group.
+    """
+    # Parent parser handed to every sub-command (no help flag, no abbreviations)
+    shared_parent = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
+    # The main config parser
+    parser = config_command_parser(subparsers)
+    # The group the sub-commands are added to
+    nested_commands = parser.add_subparsers(title="subcommands", dest="subcommand")
+    # Register each sub-command with the shared parent, in the original order
+    for register_command in (default_command_parser, update_command_parser):
+        register_command(nested_commands, parents=[shared_parent])
+    return parser
+def main():
+    """Entry point used when the config package is run as a script.
+
+    Builds the config parser with `get_config_parser`, parses the command line and
+    calls the handler stored in `args.func` with the parsed arguments.
+
+    Raises:
+        SystemExit: With exit code 1, after printing the help text, when the parsed
+            arguments carry no `func` attribute.
+    """
+    config_parser = get_config_parser()
+    args = config_parser.parse_args()
+    if not hasattr(args, "func"):
+        config_parser.print_help()
+        # Raise SystemExit directly instead of calling `exit()`: that builtin is only
+        # injected by the `site` module and is absent when it is not loaded (`python -S`).
+        raise SystemExit(1)
+    # Run
+    args.func(args)
+if __name__ == "__main__":
+    main()

accelerate/commands/config/cluster.py ADDED Viewed

	@@ -0,0 +1,939 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from ...utils import (
+    ComputeEnvironment,
+    DistributedType,
+    is_deepspeed_available,
+    is_fp8_available,
+    is_hpu_available,
+    is_mlu_available,
+    is_mps_available,
+    is_msamp_available,
+    is_musa_available,
+    is_neuron_available,
+    is_npu_available,
+    is_sdaa_available,
+    is_torchao_available,
+    is_transformer_engine_available,
+    is_transformers_available,
+    is_xpu_available,
+)
+from ...utils.constants import (
+    DEEPSPEED_MULTINODE_LAUNCHERS,
+    FSDP2_STATE_DICT_TYPE,
+    FSDP_AUTO_WRAP_POLICY,
+    FSDP_BACKWARD_PREFETCH,
+    FSDP_SHARDING_STRATEGY,
+    FSDP_STATE_DICT_TYPE,
+    TORCH_DYNAMO_MODES,
+)
+from .config_args import ClusterConfig
+from .config_utils import (
+    DYNAMO_BACKENDS,
+    _ask_field,
+    _ask_options,
+    _convert_distributed_mode,
+    _convert_dynamo_backend,
+    _convert_fp8_backend,
+    _convert_mixed_precision,
+    _convert_yes_no_to_bool,
+)
+def get_cluster_input():
+    distributed_type = _ask_options(
+        "Which type of machine are you using?",
+        [
+            "No distributed training",
+            "multi-CPU",
+            "multi-XPU",
+            "multi-HPU",
+            "multi-GPU",
+            "multi-NPU",
+            "multi-MLU",
+            "multi-SDAA",
+            "multi-MUSA",
+            "multi-NEURON",
+            "TPU",
+        ],
+        _convert_distributed_mode,
+    )
+    machine_rank = 0
+    num_machines = 1
+    num_processes = 1
+    gpu_ids = None
+    main_process_ip = None
+    main_process_port = None
+    rdzv_backend = "static"
+    same_network = True
+    debug = False
+    if distributed_type in [
+        DistributedType.MULTI_GPU,
+        DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
+        DistributedType.MULTI_MUSA,
+        DistributedType.MULTI_NPU,
+        DistributedType.MULTI_XPU,
+        DistributedType.MULTI_CPU,
+        DistributedType.MULTI_HPU,
+        DistributedType.MULTI_NEURON,
+    ]:
+        num_machines = _ask_field(
+            "How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
+            int,
+            default=1,
+        )
+        if num_machines > 1:
+            machine_rank = _ask_options(
+                "What is the rank of this machine?",
+                list(range(num_machines)),
+                int,
+            )
+            main_process_ip = _ask_field(
+                "What is the IP address of the machine that will host the main process? ",
+            )
+            main_process_port = _ask_field(
+                "What is the port you will use to communicate with the main process? ",
+                int,
+            )
+            same_network = _ask_field(
+                "Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: ",
+                _convert_yes_no_to_bool,
+                default=True,
+                error_message="Please enter yes or no.",
+            )
+            if not same_network:
+                rdzv_backend = _ask_field(
+                    "What rendezvous backend will you use? ('static', 'c10d', ...): ", default="static"
+                )
+        debug = _ask_field(
+            "Should distributed operations be checked while running for errors? This can avoid timeout issues but will be slower. [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+    if distributed_type == DistributedType.NO:
+        use_cpu = _ask_field(
+            "Do you want to run your training on CPU only (even if a GPU / Apple Silicon / Ascend NPU device is available)? [yes/NO]:",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+    elif distributed_type == DistributedType.MULTI_CPU:
+        use_cpu = True
+    else:
+        use_cpu = False
+    mpirun_config = {}
+    if use_cpu:
+        if distributed_type == DistributedType.MULTI_CPU:
+            use_mpirun = _ask_field(
+                "Do you want accelerate to launch mpirun? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            if use_mpirun:
+                mpirun_hostfile = _ask_field(
+                    "Please enter the path to the hostfile to use with mpirun [~/hostfile]: ",
+                    str,
+                    default="~/hostfile",
+                )
+                mpirun_config["mpirun_hostfile"] = os.path.expanduser(mpirun_hostfile.strip())
+    dynamo_config = {}
+    use_dynamo = _ask_field(
+        "Do you wish to optimize your script with torch dynamo?[yes/NO]:",
+        _convert_yes_no_to_bool,
+        default=False,
+        error_message="Please enter yes or no.",
+    )
+    if use_dynamo:
+        prefix = "dynamo_"
+        dynamo_config[prefix + "backend"] = _ask_options(
+            "Which dynamo backend would you like to use?",
+            [x.lower() for x in DYNAMO_BACKENDS],
+            _convert_dynamo_backend,
+            default=2,
+        )
+        use_custom_options = _ask_field(
+            "Do you want to customize the defaults sent to torch.compile? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+        if use_custom_options:
+            dynamo_config[prefix + "mode"] = _ask_options(
+                "Which mode do you want to use?",
+                TORCH_DYNAMO_MODES,
+                lambda x: TORCH_DYNAMO_MODES[int(x)],
+                default=0,
+            )
+            dynamo_config[prefix + "use_fullgraph"] = _ask_field(
+                "Do you want the fullgraph mode or it is ok to break model into several subgraphs? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            dynamo_config[prefix + "use_dynamic"] = _ask_field(
+                "Do you want to enable dynamic shape tracing? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            dynamo_config[prefix + "use_regional_compilation"] = _ask_field(
+                "Do you want to enable regional compilation? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+    use_mps = not use_cpu and is_mps_available()
+    deepspeed_config = {}
+    if (
+        distributed_type
+        in [
+            DistributedType.MULTI_GPU,
+            DistributedType.MULTI_XPU,
+            DistributedType.MULTI_HPU,
+            DistributedType.MULTI_NPU,
+            DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
+            DistributedType.MULTI_MUSA,
+            DistributedType.MULTI_NEURON,
+            DistributedType.NO,
+        ]
+        and not use_mps
+    ):
+        use_deepspeed = _ask_field(
+            "Do you want to use DeepSpeed? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+        if use_deepspeed:
+            if distributed_type is DistributedType.MULTI_NEURON:
+                raise RuntimeError("DeepSpeed is not supported on Neuron devices.")
+            distributed_type = DistributedType.DEEPSPEED
+            assert is_deepspeed_available(), (
+                "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
+            )
+        if distributed_type == DistributedType.DEEPSPEED:
+            use_deepspeed_config = _ask_field(
+                "Do you want to specify a json file to a DeepSpeed config? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            if use_deepspeed_config:
+                deepspeed_config["deepspeed_config_file"] = _ask_field(
+                    "Please enter the path to the json DeepSpeed config file: ",
+                    str,
+                    default="none",
+                )
+            else:
+                deepspeed_config["zero_stage"] = _ask_options(
+                    "What should be your DeepSpeed's ZeRO optimization stage?",
+                    [0, 1, 2, 3],
+                    int,
+                    default=2,
+                )
+                deepspeed_devices = ["none", "cpu", "nvme"]
+                if deepspeed_config["zero_stage"] >= 2:
+                    deepspeed_config["offload_optimizer_device"] = _ask_options(
+                        "Where to offload optimizer states?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
+                    )
+                    deepspeed_config["offload_param_device"] = _ask_options(
+                        "Where to offload parameters?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
+                    )
+                    if deepspeed_config["offload_param_device"] == "nvme":
+                        deepspeed_config["offload_param_nvme_path"] = _ask_field(
+                            "Nvme Path to offload parameters?",
+                            str,
+                            default="/nvme",
+                        )
+                    if deepspeed_config["offload_optimizer_device"] == "nvme":
+                        deepspeed_config["offload_optimizer_nvme_path"] = _ask_field(
+                            "Nvme Path to offload optimizer states?",
+                            str,
+                            default="/nvme",
+                        )
+                deepspeed_config["gradient_accumulation_steps"] = _ask_field(
+                    "How many gradient accumulation steps you're passing in your script? [1]: ",
+                    int,
+                    default=1,
+                )
+                use_gradient_clipping = _ask_field(
+                    "Do you want to use gradient clipping? [yes/NO]: ",
+                    _convert_yes_no_to_bool,
+                    default=False,
+                    error_message="Please enter yes or no.",
+                )
+                if use_gradient_clipping:
+                    deepspeed_config["gradient_clipping"] = _ask_field(
+                        "What is the gradient clipping value? [1.0]: ",
+                        float,
+                        default=1.0,
+                    )
+                if deepspeed_config["zero_stage"] == 3:
+                    deepspeed_config["zero3_save_16bit_model"] = _ask_field(
+                        "Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: ",
+                        _convert_yes_no_to_bool,
+                        default=False,
+                        error_message="Please enter yes or no.",
+                    )
+            deepspeed_config["zero3_init_flag"] = _ask_field(
+                "Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            if deepspeed_config["zero3_init_flag"]:
+                if not is_transformers_available():
+                    raise Exception(
+                        "When `zero3_init_flag` is set, it requires Transformers to be installed. "
+                        "Please run `pip3 install transformers`."
+                    )
+            use_moe = _ask_field(
+                "Do you want to enable Mixture-of-Experts training (MoE)? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            if use_moe:
+                deepspeed_config["deepspeed_moe_layer_cls_names"] = _ask_field(
+                    "Specify the comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
+                    " `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ... : ",
+                    str,
+                )
+            if num_machines > 1:
+                launcher_query = "Which Type of launcher do you want to use?"
+                deepspeed_config["deepspeed_multinode_launcher"] = _ask_options(
+                    launcher_query,
+                    DEEPSPEED_MULTINODE_LAUNCHERS,
+                    lambda x: DEEPSPEED_MULTINODE_LAUNCHERS[int(x)],
+                )
+                if deepspeed_config["deepspeed_multinode_launcher"] != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
+                    deepspeed_config["deepspeed_hostfile"] = _ask_field(
+                        "DeepSpeed configures multi-node compute resources with hostfile. "
+                        "Each row is of the format `hostname slots=[num_gpus]`, e.g., `localhost slots=2`; "
+                        "for more information please refer official [documentation]"
+                        "(https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node). "
+                        "Please specify the location of hostfile: ",
+                        str,
+                    )
+                    is_exclusion_filter = _ask_field(
+                        "Do you want to specify exclusion filter string? [yes/NO]: ",
+                        _convert_yes_no_to_bool,
+                        default=False,
+                        error_message="Please enter yes or no.",
+                    )
+                    if is_exclusion_filter:
+                        deepspeed_config["deepspeed_exclusion_filter"] = _ask_field(
+                            "DeepSpeed exclusion filter string: ",
+                            str,
+                        )
+                    is_inclusion_filter = _ask_field(
+                        "Do you want to specify inclusion filter string? [yes/NO]: ",
+                        _convert_yes_no_to_bool,
+                        default=False,
+                        error_message="Please enter yes or no.",
+                    )
+                    if is_inclusion_filter:
+                        deepspeed_config["deepspeed_inclusion_filter"] = _ask_field(
+                            "DeepSpeed inclusion filter string: ",
+                            str,
+                        )
+    fsdp_config = {}
+    if distributed_type in [
+        DistributedType.MULTI_GPU,
+        DistributedType.MULTI_NPU,
+        DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
+        DistributedType.MULTI_MUSA,
+        DistributedType.MULTI_XPU,
+        DistributedType.MULTI_HPU,
+        DistributedType.MULTI_NEURON,
+    ]:
+        use_fsdp = _ask_field(
+            "Do you want to use FullyShardedDataParallel? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+        if use_fsdp:
+            if distributed_type is DistributedType.MULTI_NEURON:
+                raise NotImplementedError("FSDP is not currently supported on Neuron devices.")
+            distributed_type = DistributedType.FSDP
+        if distributed_type == DistributedType.FSDP:
+            fsdp_config["fsdp_version"] = _ask_options(
+                "What should be your FSDP version? [2]: ",
+                [1, 2],
+                lambda x: int(x) + 1,
+                default=1,
+            )
+            fsdp_version = fsdp_config["fsdp_version"]  # extract to a variable to simplify usage later
+            if fsdp_version == 1:
+                sharding_strategy_query = "What should be your sharding strategy?"
+                fsdp_config["fsdp_reshard_after_forward"] = _ask_options(
+                    sharding_strategy_query,
+                    FSDP_SHARDING_STRATEGY,
+                    lambda x: FSDP_SHARDING_STRATEGY[int(x)],
+                )
+            else:
+                fsdp_config["fsdp_reshard_after_forward"] = _ask_field(
+                    "Do you want to enable resharding after forward? [YES/no]: ",
+                    _convert_yes_no_to_bool,
+                    default=True,
+                    error_message="Please enter yes or no.",
+                )
+            fsdp_config["fsdp_offload_params"] = _ask_field(
+                "Do you want to offload parameters and gradients to CPU? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            fsdp_wrap_query = "What should be your auto wrap policy?"
+            fsdp_config["fsdp_auto_wrap_policy"] = _ask_options(
+                fsdp_wrap_query,
+                FSDP_AUTO_WRAP_POLICY,
+                lambda x: FSDP_AUTO_WRAP_POLICY[int(x)],
+            )
+            if fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[0]:
+                use_no_split_modules = _ask_field(
+                    "Do you want to use the model's `_no_split_modules` to wrap. Only applicable for 🤗 Transformers [yes/NO]: ",
+                    _convert_yes_no_to_bool,
+                    default=False,
+                    error_message="Please enter yes or no.",
+                )
+                if not use_no_split_modules:
+                    fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = _ask_field(
+                        "Specify the comma-separated list of transformer layer class names (case-sensitive) to wrap ,e.g, :"
+                        "`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput` ...? : ",
+                        str,
+                    )
+            elif fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[1]:
+                fsdp_config["fsdp_min_num_params"] = _ask_field(
+                    "What should be your FSDP's minimum number of parameters for Default Auto Wrapping Policy? [1e8]: ",
+                    int,
+                    default=100000000,
+                )
+            # Removed in FSDP2, ask for user input for FSDP1
+            if fsdp_version == 1:
+                fsdp_backward_prefetch_query = "What should be your FSDP's backward prefetch policy?"
+                fsdp_config["fsdp_backward_prefetch"] = _ask_options(
+                    fsdp_backward_prefetch_query,
+                    FSDP_BACKWARD_PREFETCH,
+                    lambda x: FSDP_BACKWARD_PREFETCH[int(x)],
+                )
+            fsdp_state_dict_type_query = "What should be your FSDP's state dict type?"
+            fsdp_config["fsdp_state_dict_type"] = _ask_options(
+                fsdp_state_dict_type_query,
+                FSDP_STATE_DICT_TYPE if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE,
+                lambda x: FSDP_STATE_DICT_TYPE[int(x)] if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE[int(x)],
+                default=0,
+            )
+            # Not implemented in FSDP2, ask for user input for FSDP1
+            if fsdp_version == 1:
+                fsdp_config["fsdp_forward_prefetch"] = _ask_field(
+                    "Do you want to enable FSDP's forward prefetch policy? [yes/NO]: ",
+                    _convert_yes_no_to_bool,
+                    default=False,
+                    error_message="Please enter yes or no.",
+                )
+            # Obsolete in FSDP2, ask for user input for FSDP1
+            if fsdp_version == 1:
+                fsdp_config["fsdp_use_orig_params"] = _ask_field(
+                    "Do you want to enable FSDP's `use_orig_params` feature? [YES/no]: ",
+                    _convert_yes_no_to_bool,
+                    default=True,
+                    error_message="Please enter yes or no.",
+                )
+            fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_field(
+                "Do you want to enable CPU RAM efficient model loading? Only applicable for 🤗 Transformers models. [YES/no]: ",
+                _convert_yes_no_to_bool,
+                default=True,
+                error_message="Please enter yes or no.",
+            )
+            # Obsolete in FSDP2, ask for user input for FSDP1
+            if fsdp_version == 1:
+                if fsdp_config["fsdp_cpu_ram_efficient_loading"]:
+                    fsdp_config["fsdp_sync_module_states"] = True
+                else:
+                    fsdp_config["fsdp_sync_module_states"] = _ask_field(
+                        "Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [YES/no]: ",
+                        _convert_yes_no_to_bool,
+                        default=True,
+                        error_message="Please enter yes or no.",
+                    )
+            fsdp_config["fsdp_activation_checkpointing"] = _ask_field(
+                "Do you want to enable FSDP activation checkpointing? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+    parallelism_config = {}
+    if fsdp_config.get("fsdp_version", 1) == 2:
+        use_parallelism_config = _ask_field(
+            "Do you want to use the parallelism config? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+        if use_parallelism_config:
+            prefix = "parallelism_config_"
+            parallelism_config[prefix + "dp_replicate_size"] = _ask_field(
+                "What is the data parallelism replicate size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+            parallelism_config[prefix + "dp_shard_size"] = _ask_field(
+                "What is the FSDP shard size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+            parallelism_config[prefix + "tp_size"] = _ask_field(
+                "What is the tensor parallelism size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+            parallelism_config[prefix + "cp_size"] = _ask_field(
+                "What is the context parallelism size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+            if parallelism_config[prefix + "cp_size"] > 1:
+                parallelism_config[prefix + "cp_comm_strategy"] = _ask_options(
+                    "What is the compute parallelism communication strategy?",
+                    ["allgather", "alltoall"],
+                    lambda x: ["allgather", "alltoall"][int(x)],
+                    default=0,
+                )
+    megatron_lm_config = {}
+    if distributed_type in [DistributedType.MULTI_GPU]:
+        use_megatron_lm = _ask_field(
+            "Do you want to use Megatron-LM ? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+        if use_megatron_lm:
+            distributed_type = DistributedType.MEGATRON_LM
+        if distributed_type == DistributedType.MEGATRON_LM:
+            prefix = "megatron_lm_"
+            megatron_lm_config[prefix + "tp_degree"] = _ask_field(
+                "What is the Tensor Parallelism degree/size? [1]:",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+            if megatron_lm_config[prefix + "tp_degree"] > 1:
+                megatron_lm_config[prefix + "sequence_parallelism"] = _ask_field(
+                    "Do you want to enable Sequence Parallelism? [YES/no]: ",
+                    _convert_yes_no_to_bool,
+                    default=True,
+                    error_message="Please enter yes or no.",
+                )
+            megatron_lm_config[prefix + "pp_degree"] = _ask_field(
+                "What is the Pipeline Parallelism degree/size? [1]:",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+            if megatron_lm_config[prefix + "pp_degree"] > 1:
+                megatron_lm_config[prefix + "num_micro_batches"] = _ask_field(
+                    "What is the number of micro-batches? [1]:",
+                    int,
+                    default=1,
+                    error_message="Please enter an integer.",
+                )
+            megatron_lm_config[prefix + "recompute_activations"] = _ask_field(
+                "Do you want to enable selective activation recomputation? [YES/no]: ",
+                _convert_yes_no_to_bool,
+                default=True,
+                error_message="Please enter yes or no.",
+            )
+            megatron_lm_config[prefix + "use_distributed_optimizer"] = _ask_field(
+                "Do you want to use distributed optimizer "
+                "which shards optimizer state and gradients across data parallel ranks? [YES/no]: ",
+                _convert_yes_no_to_bool,
+                default=True,
+                error_message="Please enter yes or no.",
+            )
+            megatron_lm_config[prefix + "gradient_clipping"] = _ask_field(
+                "What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: ",
+                float,
+                default=1.0,
+            )
+    # TPU specific defaults
+    tpu_commands = None
+    tpu_command_file = None
+    tpu_downcast_bf16 = "no"
+    tpu_env = []
+    tpu_name = None
+    tpu_vm = None
+    tpu_zone = None
+    tpu_use_sudo = False
+    tpu_use_cluster = False
+    if distributed_type in [
+        DistributedType.MULTI_CPU,
+        DistributedType.MULTI_XPU,
+        DistributedType.MULTI_HPU,
+        DistributedType.MULTI_GPU,
+        DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
+        DistributedType.MULTI_MUSA,
+        DistributedType.MULTI_NPU,
+        DistributedType.MULTI_NEURON,
+        DistributedType.XLA,
+    ]:
+        machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
+        if machine_type in ["TPU", "NEURON"]:
+            machine_type += " cores"
+        elif machine_type == "CPU":
+            machine_type = "processes"
+        else:
+            machine_type += "(s)"
+        num_processes = _ask_field(
+            f"How many {machine_type} should be used for distributed training? [1]:",
+            int,
+            default=1,
+            error_message="Please enter an integer.",
+        )
+    elif distributed_type in [DistributedType.FSDP, DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
+        num_processes = _ask_field(
+            "How many GPU(s) should be used for distributed training? [1]:",
+            int,
+            default=1,
+            error_message="Please enter an integer.",
+        )
+    else:
+        num_processes = 1
+    if (distributed_type == DistributedType.MULTI_GPU) and (num_machines == 1) and (num_processes == 1):
+        raise ValueError(
+            f"Specified distributed type {distributed_type} but only using 1 GPU on a single machine. Please select `No distributed training` for the type of machine you are using."
+        )
+    if (
+        distributed_type
+        in [
+            DistributedType.MULTI_GPU,
+            DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
+            DistributedType.MULTI_MUSA,
+            DistributedType.MULTI_NPU,
+            DistributedType.MULTI_XPU,
+            DistributedType.MULTI_HPU,
+            DistributedType.MULTI_NEURON,
+            DistributedType.NO,
+        ]
+        and not use_cpu
+        and not use_mps
+    ):
+        if is_npu_available():
+            machine_type = "NPU(s)"
+        elif is_mlu_available():
+            machine_type = "MLU(s)"
+        elif is_sdaa_available():
+            machine_type = "SDAA(s)"
+        elif is_musa_available():
+            machine_type = "MUSA(s)"
+        elif is_xpu_available():
+            machine_type = "XPU(s)"
+        elif is_hpu_available():
+            machine_type = "HPU(s)"
+        elif is_neuron_available():
+            machine_type = "Neuron cores"
+        else:
+            machine_type = "GPU(s)"
+        gpu_ids = _ask_field(
+            f"What {machine_type} (by id) should be used for training on this machine as a comma-separated list? [all]:",
+            default="all",
+        )
+    # CPU affinity is only supported on NVIDIA hardware for now
+    enable_cpu_affinity = False
+    if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
+        enable_cpu_affinity = _ask_field(
+            "Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+    fp8_config = None
+    if distributed_type == DistributedType.XLA:
+        mixed_precision = "no"
+        main_training_function = _ask_field(
+            "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
+            default="main",
+        )
+        tpu_use_cluster = _ask_field(
+            "Are you using a TPU cluster? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+        if tpu_use_cluster:
+            tpu_name = _ask_field(
+                "What is the name of your TPU cluster? ",
+                default=None,
+                error_message="Please enter the name of your TPU cluster.",
+            )
+            tpu_zone = _ask_field(
+                "What is the zone of your TPU cluster? ",
+                default=None,
+                error_message="Please enter the zone of your TPU cluster.",
+            )
+            tpu_use_sudo = _ask_field(
+                "To run a python script in a TPU pod, should `sudo` be used? [yes/NO]: ",
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            run_commands = _ask_field(
+                "Do you have code you wish to run on startup in each pod? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            if run_commands:
+                use_command_file = _ask_field(
+                    "Is this code located in a bash script? [yes/NO]: ",
+                    _convert_yes_no_to_bool,
+                    default=False,
+                    error_message="Please enter yes or no.",
+                )
+                if use_command_file:
+                    tpu_command_file = _ask_field(
+                        "What is the path to your bash script? ",
+                        default=None,
+                        error_message="Please enter the path to your bash script.",
+                    )
+                    tpu_command_file = os.path.abspath(tpu_command_file)
+                else:
+                    print("Please enter each command separately you wish to run on startup in each pod.")
+                    tpu_commands = []
+                    another_command = True
+                    while another_command:
+                        tpu_commands.append(
+                            _ask_field(
+                                "Please enter a single command to be ran ",
+                                default=None,
+                                error_message="Please enter the commands you wish to run on startup in each pod as a single string.",
+                            )
+                        )
+                        another_command = _ask_field(
+                            "Do you wish to add another command? [yes/NO]: ",
+                            _convert_yes_no_to_bool,
+                            default=False,
+                            error_message="Please enter yes or no.",
+                        )
+            tpu_vm = _ask_field(
+                "If not using an instance group, what are the names of the Compute VM instances to be used, separated by a comma: ",
+                default="",
+            ).split(",")
+            tpu_env = _ask_field(
+                "What environment variables do you wish to set in each pod, separated by a comma: ",
+                default="",
+            ).split(",")
+    else:
+        main_training_function = "main"
+        if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
+            mixed_precision = None
+        else:
+            mixed_precision = _ask_options(
+                "Do you wish to use mixed precision?",
+                ["no", "fp16", "bf16", "fp8"],
+                _convert_mixed_precision,
+            )
+            if mixed_precision == "fp8":
+                if not is_fp8_available():
+                    raise ValueError(
+                        "FP8 (either torchao, Transformer Engine or MSAMP) is not installed on this machine."
+                    )
+                fp8_config = {}
+                fp8_config["backend"] = _ask_options(
+                    "Which FP8 backend do you want to use?",
+                    ["ao", "te", "msamp"],
+                    _convert_fp8_backend,
+                )
+                if fp8_config["backend"] == "TE":
+                    if not is_transformer_engine_available():
+                        raise ValueError("TransformersEngine was selected, but it is not installed on this machine.")
+                    fp8_config["use_autocast_during_eval"] = _ask_field(
+                        "Do you want to use FP8 autocast during eval mode? Generally better metrics are found when this is disabled [yes/NO]: ",
+                        _convert_yes_no_to_bool,
+                        default=False,
+                    )
+                    fp8_config["margin"] = _ask_field(
+                        "What margin should be used for gradient scaling? [0]: ",
+                        int,
+                        default=0,
+                    )
+                    fp8_config["interval"] = _ask_field(
+                        "What interval should be used for for how often the scaling factor is recomputed? [1]: ",
+                        int,
+                        default=1,
+                    )
+                    fp8_config["fp8_format"] = _ask_options(
+                        "Which weight format should be used?",
+                        ["HYBRID", "E4M3", "E5M2"],
+                        lambda i: ["HYBRID", "E4M3", "E5M2"][i],
+                        default=0,
+                    )
+                    fp8_config["amax_history_length"] = _ask_field(
+                        "What length of history should be used for the amax scaling factor computation? [1024]: ",
+                        int,
+                        default=1024,
+                    )
+                    fp8_config["amax_compute_algorithm"] = _ask_options(
+                        "Which algorithm should be used for the amax scaling factor computation?",
+                        ["max", "most_recent"],
+                        lambda x: "max" if x == 0 else "most_recent",
+                        default=0,
+                    )
+                    fp8_config["override_linear_precision"] = _ask_field(
+                        "Do you want to to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision? [yes/NO]: ",
+                        _convert_yes_no_to_bool,
+                        default=False,
+                    )
+                    if fp8_config["override_linear_precision"]:
+                        fprop = _ask_field(
+                            "Should `fprop` be executed in higher precision? [yes/NO]: ",
+                            _convert_yes_no_to_bool,
+                            default=False,
+                        )
+                        dgrad = _ask_field(
+                            "Should `dgrad` be executed in higher precision? [yes/NO]: ",
+                            _convert_yes_no_to_bool,
+                            default=False,
+                        )
+                        wgrad = _ask_field(
+                            "Should `wgrad` be executed in higher precision? [yes/NO]: ",
+                            _convert_yes_no_to_bool,
+                            default=False,
+                        )
+                        fp8_config["override_linear_precision"] = (fprop, dgrad, wgrad)
+                    else:
+                        fp8_config["override_linear_precision"] = (False, False, False)
+                elif fp8_config["backend"] == "MSAMP":
+                    if not is_msamp_available():
+                        raise ValueError("MSAMP was selected, but it is not installed on this machine.")
+                    fp8_config["optimization_level"] = _ask_options(
+                        "Which optimization level should be used?",
+                        ["O1", "O2"],
+                        lambda x: "O1" if x == 0 else "O2",
+                        default=1,
+                    )
+                elif fp8_config["backend"] == "AO":
+                    if not is_torchao_available():
+                        raise ValueError("torchao was selected, but it is not installed on this machine.")
+                    fp8_config["enable_fsdp_float8_all_gather"] = _ask_field(
+                        "Do you want to enable FSDP2 float8 all gather? This is recommended for better performance if using FSDP2. [YES/no]: ",
+                        _convert_yes_no_to_bool,
+                        default=True,
+                    )
+                    fp8_config["pad_inner_dim"] = _ask_field(
+                        "Do you want to pad the inner dimension of weight matrices before float8 matmuls? This is required for _scaled_mm which has strict alignment requirements. Note: padding may cause memory spikes. [YES/no]: ",
+                        _convert_yes_no_to_bool,
+                        default=True,
+                    )
+    if use_dynamo and mixed_precision == "no" and not use_cpu:
+        print(
+            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
+        )
+    if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
+        tpu_downcast_bf16 = _ask_field(
+            "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no"
+        )
+    return ClusterConfig(
+        compute_environment=ComputeEnvironment.LOCAL_MACHINE,
+        distributed_type=distributed_type,
+        num_processes=num_processes,
+        gpu_ids=gpu_ids,
+        mixed_precision=mixed_precision,
+        downcast_bf16=tpu_downcast_bf16,
+        machine_rank=machine_rank,
+        num_machines=num_machines,
+        main_process_ip=main_process_ip,
+        main_process_port=main_process_port,
+        main_training_function=main_training_function,
+        fp8_config=fp8_config,
+        deepspeed_config=deepspeed_config,
+        fsdp_config=fsdp_config,
+        parallelism_config=parallelism_config,
+        megatron_lm_config=megatron_lm_config,
+        mpirun_config=mpirun_config,
+        use_cpu=use_cpu,
+        rdzv_backend=rdzv_backend,
+        same_network=same_network,
+        commands=tpu_commands,
+        command_file=tpu_command_file,
+        tpu_env=tpu_env,
+        tpu_name=tpu_name,
+        tpu_vm=tpu_vm,
+        tpu_zone=tpu_zone,
+        tpu_use_sudo=tpu_use_sudo,
+        tpu_use_cluster=tpu_use_cluster,
+        dynamo_config=dynamo_config,
+        debug=debug,
+        enable_cpu_affinity=enable_cpu_affinity,
+    )

accelerate/commands/config/config.py ADDED Viewed

	@@ -0,0 +1,89 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from accelerate.utils import ComputeEnvironment
+from .cluster import get_cluster_input
+from .config_args import cache_dir, default_config_file, default_yaml_config_file, load_config_from_file  # noqa: F401
+from .config_utils import _ask_field, _ask_options, _convert_compute_environment  # noqa: F401
+from .sagemaker import get_sagemaker_input
+# Help text passed as `description=` to the argparse parser built for the config command.
+description = "Launches a series of prompts to create and save a `default_config.yaml` configuration file for your training system. Should always be ran first on your machine"
+def get_user_input():
+    compute_environment = _ask_options(
+        "In which compute environment are you running?",
+        ["This machine", "AWS (Amazon SageMaker)"],
+        _convert_compute_environment,
+    )
+    if compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
+        config = get_sagemaker_input()
+    else:
+        config = get_cluster_input()
+    return config
+def config_command_parser(subparsers=None):
+    if subparsers is not None:
+        parser = subparsers.add_parser("config", description=description)
+    else:
+        parser = argparse.ArgumentParser("Accelerate config command", description=description)
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        help=(
+            "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
+            "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
+            "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
+            "with 'huggingface'."
+        ),
+    )
+    if subparsers is not None:
+        parser.set_defaults(func=config_command)
+    return parser
+def config_command(args):
+    config = get_user_input()
+    if args.config_file is not None:
+        config_file = args.config_file
+    else:
+        if not os.path.isdir(cache_dir):
+            os.makedirs(cache_dir)
+        config_file = default_yaml_config_file
+    if config_file.endswith(".json"):
+        config.to_json_file(config_file)
+    else:
+        config.to_yaml_file(config_file)
+    print(f"accelerate configuration saved at {config_file}")
+def main():
+    parser = config_command_parser()
+    args = parser.parse_args()
+    config_command(args)
+if __name__ == "__main__":
+    main()

accelerate/commands/config/config_args.py ADDED Viewed

	@@ -0,0 +1,252 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, Union
+import yaml
+from ...utils import ComputeEnvironment, DistributedType, SageMakerDistributedType
+from ...utils.constants import SAGEMAKER_PYTHON_VERSION, SAGEMAKER_PYTORCH_VERSION, SAGEMAKER_TRANSFORMERS_VERSION
+hf_cache_home = os.path.expanduser(
+    os.environ.get("HF_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+)
+cache_dir = os.path.join(hf_cache_home, "accelerate")
+default_json_config_file = os.path.join(cache_dir, "default_config.yaml")
+default_yaml_config_file = os.path.join(cache_dir, "default_config.yaml")
+# For backward compatibility: the default config is the json one if it's the only existing file.
+if os.path.isfile(default_yaml_config_file) or not os.path.isfile(default_json_config_file):
+    default_config_file = default_yaml_config_file
+else:
+    default_config_file = default_json_config_file
+def load_config_from_file(config_file):
+    if config_file is not None:
+        if not os.path.isfile(config_file):
+            raise FileNotFoundError(
+                f"The passed configuration file `{config_file}` does not exist. "
+                "Please pass an existing file to `accelerate launch`, or use the default one "
+                "created through `accelerate config` and run `accelerate launch` "
+                "without the `--config_file` argument."
+            )
+    else:
+        config_file = default_config_file
+    with open(config_file, encoding="utf-8") as f:
+        if config_file.endswith(".json"):
+            if (
+                json.load(f).get("compute_environment", ComputeEnvironment.LOCAL_MACHINE)
+                == ComputeEnvironment.LOCAL_MACHINE
+            ):
+                config_class = ClusterConfig
+            else:
+                config_class = SageMakerConfig
+            return config_class.from_json_file(json_file=config_file)
+        else:
+            if (
+                yaml.safe_load(f).get("compute_environment", ComputeEnvironment.LOCAL_MACHINE)
+                == ComputeEnvironment.LOCAL_MACHINE
+            ):
+                config_class = ClusterConfig
+            else:
+                config_class = SageMakerConfig
+            return config_class.from_yaml_file(yaml_file=config_file)
+@dataclass
+class BaseConfig:
+    compute_environment: ComputeEnvironment
+    distributed_type: Union[DistributedType, SageMakerDistributedType]
+    mixed_precision: str
+    use_cpu: bool
+    debug: bool
+    def to_dict(self):
+        result = self.__dict__
+        # For serialization, it's best to convert Enums to strings (or their underlying value type).
+        def _convert_enums(value):
+            if isinstance(value, Enum):
+                return value.value
+            if isinstance(value, dict):
+                if not bool(value):
+                    return None
+                for key1, value1 in value.items():
+                    value[key1] = _convert_enums(value1)
+            return value
+        for key, value in result.items():
+            result[key] = _convert_enums(value)
+        result = {k: v for k, v in result.items() if v is not None}
+        return result
+    @staticmethod
+    def process_config(config_dict):
+        """
+        Processes `config_dict` and sets default values for any missing keys
+        """
+        if "compute_environment" not in config_dict:
+            config_dict["compute_environment"] = ComputeEnvironment.LOCAL_MACHINE
+        if "distributed_type" not in config_dict:
+            raise ValueError("A `distributed_type` must be specified in the config file.")
+        if "num_processes" not in config_dict and config_dict["distributed_type"] == DistributedType.NO:
+            config_dict["num_processes"] = 1
+        if "mixed_precision" not in config_dict:
+            config_dict["mixed_precision"] = "fp16" if ("fp16" in config_dict and config_dict["fp16"]) else None
+        if "fp16" in config_dict:  # Convert the config to the new format.
+            del config_dict["fp16"]
+        if "dynamo_backend" in config_dict:  # Convert the config to the new format.
+            dynamo_backend = config_dict.pop("dynamo_backend")
+            config_dict["dynamo_config"] = {} if dynamo_backend == "NO" else {"dynamo_backend": dynamo_backend}
+        if "use_cpu" not in config_dict:
+            config_dict["use_cpu"] = False
+        if "debug" not in config_dict:
+            config_dict["debug"] = False
+        if "enable_cpu_affinity" not in config_dict:
+            config_dict["enable_cpu_affinity"] = False
+        return config_dict
+    @classmethod
+    def from_json_file(cls, json_file=None):
+        json_file = default_json_config_file if json_file is None else json_file
+        with open(json_file, encoding="utf-8") as f:
+            config_dict = json.load(f)
+        config_dict = cls.process_config(config_dict)
+        extra_keys = sorted(set(config_dict.keys()) - set(cls.__dataclass_fields__.keys()))
+        if len(extra_keys) > 0:
+            raise ValueError(
+                f"The config file at {json_file} had unknown keys ({extra_keys}), please try upgrading your `accelerate`"
+                " version or fix (and potentially remove) these keys from your config file."
+            )
+        return cls(**config_dict)
+    def to_json_file(self, json_file):
+        with open(json_file, "w", encoding="utf-8") as f:
+            content = json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+            f.write(content)
+    @classmethod
+    def from_yaml_file(cls, yaml_file=None):
+        yaml_file = default_yaml_config_file if yaml_file is None else yaml_file
+        with open(yaml_file, encoding="utf-8") as f:
+            config_dict = yaml.safe_load(f)
+        config_dict = cls.process_config(config_dict)
+        extra_keys = sorted(set(config_dict.keys()) - set(cls.__dataclass_fields__.keys()))
+        if len(extra_keys) > 0:
+            raise ValueError(
+                f"The config file at {yaml_file} had unknown keys ({extra_keys}), please try upgrading your `accelerate`"
+                " version or fix (and potentially remove) these keys from your config file."
+            )
+        return cls(**config_dict)
+    def to_yaml_file(self, yaml_file):
+        with open(yaml_file, "w", encoding="utf-8") as f:
+            yaml.safe_dump(self.to_dict(), f)
+    def __post_init__(self):
+        if isinstance(self.compute_environment, str):
+            self.compute_environment = ComputeEnvironment(self.compute_environment)
+        if isinstance(self.distributed_type, str):
+            if self.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
+                self.distributed_type = SageMakerDistributedType(self.distributed_type)
+            else:
+                self.distributed_type = DistributedType(self.distributed_type)
+        if getattr(self, "dynamo_config", None) is None:
+            self.dynamo_config = {}
+@dataclass
+class ClusterConfig(BaseConfig):
+    num_processes: int = -1  # For instance if we use SLURM and the user manually passes it in
+    machine_rank: int = 0
+    num_machines: int = 1
+    gpu_ids: Optional[str] = None
+    main_process_ip: Optional[str] = None
+    main_process_port: Optional[int] = None
+    rdzv_backend: Optional[str] = "static"
+    same_network: Optional[bool] = False
+    main_training_function: str = "main"
+    enable_cpu_affinity: bool = False
+    # args for FP8 training
+    fp8_config: Optional[dict] = None
+    # args for deepspeed_plugin
+    deepspeed_config: Optional[dict] = None
+    # args for fsdp
+    fsdp_config: Optional[dict] = None
+    # args for parallelism config
+    parallelism_config: Optional[dict] = None
+    # args for megatron_lm
+    megatron_lm_config: Optional[dict] = None
+    # args for mpirun
+    mpirun_config: Optional[dict] = None
+    # args for TPU
+    downcast_bf16: bool = False
+    # args for TPU pods
+    tpu_name: Optional[str] = None
+    tpu_zone: Optional[str] = None
+    tpu_use_cluster: bool = False
+    tpu_use_sudo: bool = False
+    command_file: Optional[str] = None
+    commands: list[str] = None
+    tpu_vm: list[str] = None
+    tpu_env: list[str] = None
+    # args for dynamo
+    dynamo_config: Optional[dict] = None
+    def __post_init__(self):
+        if self.deepspeed_config is None:
+            self.deepspeed_config = {}
+        if self.fsdp_config is None:
+            self.fsdp_config = {}
+        if self.megatron_lm_config is None:
+            self.megatron_lm_config = {}
+        if self.mpirun_config is None:
+            self.mpirun_config = {}
+        if self.fp8_config is None:
+            self.fp8_config = {}
+        if self.parallelism_config is None:
+            self.parallelism_config = {}
+        return super().__post_init__()
+@dataclass
+class SageMakerConfig(BaseConfig):
+    """
+    Config used when the compute environment is not the local machine (see `load_config_from_file`).
+    """
+    # Required fields (no default); they follow the `BaseConfig` fields in the generated `__init__`.
+    ec2_instance_type: str
+    iam_role_name: str
+    image_uri: Optional[str] = None
+    profile: Optional[str] = None
+    region: str = "us-east-1"
+    num_machines: int = 1
+    gpu_ids: str = "all"
+    # NOTE(review): this f-string is evaluated once, while the class body runs, with `num_machines` bound to the
+    # default `1` declared above. The default is therefore always "accelerate-sagemaker-1", whatever
+    # `num_machines` an instance is created with. Confirm whether that is intended.
+    base_job_name: str = f"accelerate-sagemaker-{num_machines}"
+    # Version defaults come from the `SAGEMAKER_*` constants imported at the top of the module.
+    pytorch_version: str = SAGEMAKER_PYTORCH_VERSION
+    transformers_version: str = SAGEMAKER_TRANSFORMERS_VERSION
+    py_version: str = SAGEMAKER_PYTHON_VERSION
+    sagemaker_inputs_file: Optional[str] = None
+    sagemaker_metrics_file: Optional[str] = None
+    additional_args: Optional[dict] = None
+    # `BaseConfig.__post_init__` replaces a `None` value here with an empty dict.
+    dynamo_config: Optional[dict] = None
+    enable_cpu_affinity: bool = False

accelerate/commands/config/config_utils.py ADDED Viewed

	@@ -0,0 +1,122 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from ...utils.dataclasses import (
+    ComputeEnvironment,
+    DistributedType,
+    DynamoBackend,
+    FP8BackendType,
+    PrecisionType,
+    SageMakerDistributedType,
+)
+from ..menu import BulletMenu
+DYNAMO_BACKENDS = [
+    "EAGER",
+    "AOT_EAGER",
+    "INDUCTOR",
+    "AOT_TS_NVFUSER",
+    "NVPRIMS_NVFUSER",
+    "CUDAGRAPHS",
+    "OFI",
+    "FX2TRT",
+    "ONNXRT",
+    "TENSORRT",
+    "AOT_TORCHXLA_TRACE_ONCE",
+    "TORHCHXLA_TRACE_ONCE",
+    "TVM",
+]
+def _ask_field(input_text, convert_value=None, default=None, error_message=None):
+    ask_again = True
+    while ask_again:
+        result = input(input_text)
+        try:
+            if default is not None and len(result) == 0:
+                return default
+            return convert_value(result) if convert_value is not None else result
+        except Exception:
+            if error_message is not None:
+                print(error_message)
+def _ask_options(input_text, options=[], convert_value=None, default=0):
+    menu = BulletMenu(input_text, options)
+    result = menu.run(default_choice=default)
+    return convert_value(result) if convert_value is not None else result
+def _convert_compute_environment(value):
+    value = int(value)
+    return ComputeEnvironment(["LOCAL_MACHINE", "AMAZON_SAGEMAKER"][value])
+def _convert_distributed_mode(value):
+    value = int(value)
+    return DistributedType(
+        [
+            "NO",
+            "MULTI_CPU",
+            "MULTI_XPU",
+            "MULTI_HPU",
+            "MULTI_GPU",
+            "MULTI_NPU",
+            "MULTI_MLU",
+            "MULTI_SDAA",
+            "MULTI_MUSA",
+            "MULTI_NEURON",
+            "XLA",
+        ][value]
+    )
+def _convert_dynamo_backend(value):
+    value = int(value)
+    return DynamoBackend(DYNAMO_BACKENDS[value]).value
+def _convert_mixed_precision(value):
+    value = int(value)
+    return PrecisionType(["no", "fp16", "bf16", "fp8"][value])
+def _convert_sagemaker_distributed_mode(value):
+    value = int(value)
+    return SageMakerDistributedType(["NO", "DATA_PARALLEL", "MODEL_PARALLEL"][value])
+def _convert_fp8_backend(value):
+    value = int(value)
+    return FP8BackendType(["AO", "TE", "MSAMP"][value])
+def _convert_yes_no_to_bool(value):
+    return {"yes": True, "no": False}[value.lower()]
+class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter):
+    """
+    A custom formatter that will remove the usage line from the help message for subcommands.
+    """
+    def _format_usage(self, usage, actions, groups, prefix):
+        usage = super()._format_usage(usage, actions, groups, prefix)
+        usage = usage.replace("<command> [<args>] ", "")
+        return usage

accelerate/commands/config/default.py ADDED Viewed

	@@ -0,0 +1,172 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pathlib import Path
+import torch
+from ...utils import (
+    is_hpu_available,
+    is_mlu_available,
+    is_musa_available,
+    is_neuron_available,
+    is_npu_available,
+    is_sdaa_available,
+    is_xpu_available,
+)
+from .config_args import ClusterConfig, default_json_config_file
+from .config_utils import SubcommandHelpFormatter
+# Help text shown for the `default` subcommand (passed as `help=` in `default_command_parser`).
+description = "Create a default config file for Accelerate with only a few flags set."
+def write_basic_config(mixed_precision="no", save_location: str = default_json_config_file):
+    """
+    Creates and saves a basic cluster config to be used on a local machine with potentially multiple GPUs. Will also
+    set CPU if it is a CPU-only machine.
+    Args:
+        mixed_precision (`str`, *optional*, defaults to "no"):
+            Mixed Precision to use. Should be one of "no", "fp16", or "bf16"
+        save_location (`str`, *optional*, defaults to `default_json_config_file`):
+            Optional custom save location. Should be passed to `--config_file` when using `accelerate launch`. Default
+            location is inside the huggingface cache folder (`~/.cache/huggingface`) but can be overridden by setting
+            the `HF_HOME` environmental variable, followed by `accelerate/default_config.yaml`.
+    """
+    path = Path(save_location)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    if path.exists():
+        print(
+            f"Configuration already exists at {save_location}, will not override. Run `accelerate config` manually or pass a different `save_location`."
+        )
+        return False
+    mixed_precision = mixed_precision.lower()
+    if mixed_precision not in ["no", "fp16", "bf16", "fp8"]:
+        raise ValueError(
+            f"`mixed_precision` should be one of 'no', 'fp16', 'bf16', or 'fp8'. Received {mixed_precision}"
+        )
+    config = {
+        "compute_environment": "LOCAL_MACHINE",
+        "mixed_precision": mixed_precision,
+    }
+    if is_mlu_available():
+        num_mlus = torch.mlu.device_count()
+        config["num_processes"] = num_mlus
+        config["use_cpu"] = False
+        if num_mlus > 1:
+            config["distributed_type"] = "MULTI_MLU"
+        else:
+            config["distributed_type"] = "NO"
+    if is_sdaa_available():
+        num_sdaas = torch.sdaa.device_count()
+        config["num_processes"] = num_sdaas
+        config["use_cpu"] = False
+        if num_sdaas > 1:
+            config["distributed_type"] = "MULTI_SDAA"
+        else:
+            config["distributed_type"] = "NO"
+    elif is_musa_available():
+        num_musas = torch.musa.device_count()
+        config["num_processes"] = num_musas
+        config["use_cpu"] = False
+        if num_musas > 1:
+            config["distributed_type"] = "MULTI_MUSA"
+        else:
+            config["distributed_type"] = "NO"
+    elif is_hpu_available():
+        num_hpus = torch.hpu.device_count()
+        config["num_processes"] = num_hpus
+        config["use_cpu"] = False
+        if num_hpus > 1:
+            config["distributed_type"] = "MULTI_HPU"
+        else:
+            config["distributed_type"] = "NO"
+    elif torch.cuda.is_available():
+        num_gpus = torch.cuda.device_count()
+        config["num_processes"] = num_gpus
+        config["use_cpu"] = False
+        if num_gpus > 1:
+            config["distributed_type"] = "MULTI_GPU"
+        else:
+            config["distributed_type"] = "NO"
+    elif is_xpu_available():
+        num_xpus = torch.xpu.device_count()
+        config["num_processes"] = num_xpus
+        config["use_cpu"] = False
+        if num_xpus > 1:
+            config["distributed_type"] = "MULTI_XPU"
+        else:
+            config["distributed_type"] = "NO"
+    elif is_npu_available():
+        num_npus = torch.npu.device_count()
+        config["num_processes"] = num_npus
+        config["use_cpu"] = False
+        if num_npus > 1:
+            config["distributed_type"] = "MULTI_NPU"
+        else:
+            config["distributed_type"] = "NO"
+    elif is_neuron_available():
+        num_neuron_cores = torch.neuron.device_count()
+        config["num_processes"] = num_neuron_cores
+        config["use_cpu"] = False
+        if num_neuron_cores > 1:
+            config["distributed_type"] = "MULTI_NEURON"
+        else:
+            config["distributed_type"] = "NO"
+    else:
+        num_xpus = 0
+        config["use_cpu"] = True
+        config["num_processes"] = 1
+        config["distributed_type"] = "NO"
+    config["debug"] = False
+    config["enable_cpu_affinity"] = False
+    config = ClusterConfig(**config)
+    config.to_json_file(path)
+    return path
+def default_command_parser(parser, parents):
+    parser = parser.add_parser("default", parents=parents, help=description, formatter_class=SubcommandHelpFormatter)
+    parser.add_argument(
+        "--config_file",
+        default=default_json_config_file,
+        help=(
+            "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
+            "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
+            "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
+            "with 'huggingface'."
+        ),
+        dest="save_location",
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        choices=["no", "fp16", "bf16"],
+        type=str,
+        help="Whether or not to use mixed precision training. "
+        "Choose between FP16 and BF16 (bfloat16) training. "
+        "BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.",
+        default="no",
+    )
+    parser.set_defaults(func=default_config_command)
+    return parser
+def default_config_command(args):
+    config_file = write_basic_config(args.mixed_precision, args.save_location)
+    if config_file:
+        print(f"accelerate configuration saved at {config_file}")

accelerate/commands/config/sagemaker.py ADDED Viewed

	@@ -0,0 +1,274 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+from ...utils.constants import SAGEMAKER_PARALLEL_EC2_INSTANCES, TORCH_DYNAMO_MODES
+from ...utils.dataclasses import ComputeEnvironment, SageMakerDistributedType
+from ...utils.imports import is_boto3_available
+from .config_args import SageMakerConfig
+from .config_utils import (
+    DYNAMO_BACKENDS,
+    _ask_field,
+    _ask_options,
+    _convert_dynamo_backend,
+    _convert_mixed_precision,
+    _convert_sagemaker_distributed_mode,
+    _convert_yes_no_to_bool,
+)
+if is_boto3_available():
+    import boto3  # noqa: F401
+def _create_iam_role_for_sagemaker(role_name):
+    iam_client = boto3.client("iam")
+    sagemaker_trust_policy = {
+        "Version": "2012-10-17",
+        "Statement": [
+            {"Effect": "Allow", "Principal": {"Service": "sagemaker.amazonaws.com"}, "Action": "sts:AssumeRole"}
+        ],
+    }
+    try:
+        # create the role, associated with the chosen trust policy
+        iam_client.create_role(
+            RoleName=role_name, AssumeRolePolicyDocument=json.dumps(sagemaker_trust_policy, indent=2)
+        )
+        policy_document = {
+            "Version": "2012-10-17",
+            "Statement": [
+                {
+                    "Effect": "Allow",
+                    "Action": [
+                        "sagemaker:*",
+                        "ecr:GetDownloadUrlForLayer",
+                        "ecr:BatchGetImage",
+                        "ecr:BatchCheckLayerAvailability",
+                        "ecr:GetAuthorizationToken",
+                        "cloudwatch:PutMetricData",
+                        "cloudwatch:GetMetricData",
+                        "cloudwatch:GetMetricStatistics",
+                        "cloudwatch:ListMetrics",
+                        "logs:CreateLogGroup",
+                        "logs:CreateLogStream",
+                        "logs:DescribeLogStreams",
+                        "logs:PutLogEvents",
+                        "logs:GetLogEvents",
+                        "s3:CreateBucket",
+                        "s3:ListBucket",
+                        "s3:GetBucketLocation",
+                        "s3:GetObject",
+                        "s3:PutObject",
+                    ],
+                    "Resource": "*",
+                }
+            ],
+        }
+        # attach policy to role
+        iam_client.put_role_policy(
+            RoleName=role_name,
+            PolicyName=f"{role_name}_policy_permission",
+            PolicyDocument=json.dumps(policy_document, indent=2),
+        )
+    except iam_client.exceptions.EntityAlreadyExistsException:
+        print(f"role {role_name} already exists. Using existing one")
+def _get_iam_role_arn(role_name):
+    iam_client = boto3.client("iam")
+    return iam_client.get_role(RoleName=role_name)["Role"]["Arn"]
+def get_sagemaker_input():
+    credentials_configuration = _ask_options(
+        "How do you want to authorize?",
+        ["AWS Profile", "Credentials (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) "],
+        int,
+    )
+    aws_profile = None
+    if credentials_configuration == 0:
+        aws_profile = _ask_field("Enter your AWS Profile name: [default] ", default="default")
+        os.environ["AWS_PROFILE"] = aws_profile
+    else:
+        print(
+            "Note you will need to provide AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY when you launch you training script with,"
+            "`accelerate launch --aws_access_key_id XXX --aws_secret_access_key YYY`"
+        )
+        aws_access_key_id = _ask_field("AWS Access Key ID: ")
+        os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
+        aws_secret_access_key = _ask_field("AWS Secret Access Key: ")
+        os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
+    aws_region = _ask_field("Enter your AWS Region: [us-east-1]", default="us-east-1")
+    os.environ["AWS_DEFAULT_REGION"] = aws_region
+    role_management = _ask_options(
+        "Do you already have an IAM Role for executing Amazon SageMaker Training Jobs?",
+        ["Provide IAM Role name", "Create new IAM role using credentials"],
+        int,
+    )
+    if role_management == 0:
+        iam_role_name = _ask_field("Enter your IAM role name: ")
+    else:
+        iam_role_name = "accelerate_sagemaker_execution_role"
+        print(f'Accelerate will create an iam role "{iam_role_name}" using the provided credentials')
+        _create_iam_role_for_sagemaker(iam_role_name)
+    is_custom_docker_image = _ask_field(
+        "Do you want to use custom Docker image? [yes/NO]: ",
+        _convert_yes_no_to_bool,
+        default=False,
+        error_message="Please enter yes or no.",
+    )
+    docker_image = None
+    if is_custom_docker_image:
+        docker_image = _ask_field("Enter your Docker image: ", lambda x: str(x).lower())
+    is_sagemaker_inputs_enabled = _ask_field(
+        "Do you want to provide SageMaker input channels with data locations? [yes/NO]: ",
+        _convert_yes_no_to_bool,
+        default=False,
+        error_message="Please enter yes or no.",
+    )
+    sagemaker_inputs_file = None
+    if is_sagemaker_inputs_enabled:
+        sagemaker_inputs_file = _ask_field(
+            "Enter the path to the SageMaker inputs TSV file with columns (channel_name, data_location): ",
+            lambda x: str(x).lower(),
+        )
+    is_sagemaker_metrics_enabled = _ask_field(
+        "Do you want to enable SageMaker metrics? [yes/NO]: ",
+        _convert_yes_no_to_bool,
+        default=False,
+        error_message="Please enter yes or no.",
+    )
+    sagemaker_metrics_file = None
+    if is_sagemaker_metrics_enabled:
+        sagemaker_metrics_file = _ask_field(
+            "Enter the path to the SageMaker metrics TSV file with columns (metric_name, metric_regex): ",
+            lambda x: str(x).lower(),
+        )
+    distributed_type = _ask_options(
+        "What is the distributed mode?",
+        ["No distributed training", "Data parallelism"],
+        _convert_sagemaker_distributed_mode,
+    )
+    dynamo_config = {}
+    use_dynamo = _ask_field(
+        "Do you wish to optimize your script with torch dynamo?[yes/NO]:",
+        _convert_yes_no_to_bool,
+        default=False,
+        error_message="Please enter yes or no.",
+    )
+    if use_dynamo:
+        prefix = "dynamo_"
+        dynamo_config[prefix + "backend"] = _ask_options(
+            "Which dynamo backend would you like to use?",
+            [x.lower() for x in DYNAMO_BACKENDS],
+            _convert_dynamo_backend,
+            default=2,
+        )
+        use_custom_options = _ask_field(
+            "Do you want to customize the defaults sent to torch.compile? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+        if use_custom_options:
+            dynamo_config[prefix + "mode"] = _ask_options(
+                "Which mode do you want to use?",
+                TORCH_DYNAMO_MODES,
+                lambda x: TORCH_DYNAMO_MODES[int(x)],
+                default="default",
+            )
+            dynamo_config[prefix + "use_fullgraph"] = _ask_field(
+                "Do you want the fullgraph mode or it is ok to break model into several subgraphs? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            dynamo_config[prefix + "use_dynamic"] = _ask_field(
+                "Do you want to enable dynamic shape tracing? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            dynamo_config[prefix + "use_regional_compilation"] = _ask_field(
+                "Do you want to enable regional compilation? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+    ec2_instance_query = "Which EC2 instance type you want to use for your training?"
+    if distributed_type != SageMakerDistributedType.NO:
+        ec2_instance_type = _ask_options(
+            ec2_instance_query, SAGEMAKER_PARALLEL_EC2_INSTANCES, lambda x: SAGEMAKER_PARALLEL_EC2_INSTANCES[int(x)]
+        )
+    else:
+        ec2_instance_query += "? [ml.p3.2xlarge]:"
+        ec2_instance_type = _ask_field(ec2_instance_query, lambda x: str(x).lower(), default="ml.p3.2xlarge")
+    debug = False
+    if distributed_type != SageMakerDistributedType.NO:
+        debug = _ask_field(
+            "Should distributed operations be checked while running for errors? This can avoid timeout issues but will be slower. [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+    num_machines = 1
+    if distributed_type in (SageMakerDistributedType.DATA_PARALLEL, SageMakerDistributedType.MODEL_PARALLEL):
+        num_machines = _ask_field(
+            "How many machines do you want use? [1]: ",
+            int,
+            default=1,
+        )
+    mixed_precision = _ask_options(
+        "Do you wish to use FP16 or BF16 (mixed precision)?",
+        ["no", "fp16", "bf16", "fp8"],
+        _convert_mixed_precision,
+    )
+    if use_dynamo and mixed_precision == "no":
+        print(
+            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
+        )
+    return SageMakerConfig(
+        image_uri=docker_image,
+        compute_environment=ComputeEnvironment.AMAZON_SAGEMAKER,
+        distributed_type=distributed_type,
+        use_cpu=False,
+        dynamo_config=dynamo_config,
+        ec2_instance_type=ec2_instance_type,
+        profile=aws_profile,
+        region=aws_region,
+        iam_role_name=iam_role_name,
+        mixed_precision=mixed_precision,
+        num_machines=num_machines,
+        sagemaker_inputs_file=sagemaker_inputs_file,
+        sagemaker_metrics_file=sagemaker_metrics_file,
+        debug=debug,
+    )

accelerate/commands/config/update.py ADDED Viewed

	@@ -0,0 +1,63 @@

+#!/usr/bin/env python
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pathlib import Path
+from .config_args import default_config_file, load_config_from_file
+from .config_utils import SubcommandHelpFormatter
+# Help text shown for the `update` subcommand (passed as `help=` in `update_command_parser`).
+description = "Update an existing config file with the latest defaults while maintaining the old configuration."
+def update_config(args):
+    """
+    Update an existing config file with the latest defaults while maintaining the old configuration.
+    """
+    config_file = args.config_file
+    if config_file is None and Path(default_config_file).exists():
+        config_file = default_config_file
+    elif not Path(config_file).exists():
+        raise ValueError(f"The passed config file located at {config_file} doesn't exist.")
+    config = load_config_from_file(config_file)
+    if config_file.endswith(".json"):
+        config.to_json_file(config_file)
+    else:
+        config.to_yaml_file(config_file)
+    return config_file
+def update_command_parser(parser, parents):
+    parser = parser.add_parser("update", parents=parents, help=description, formatter_class=SubcommandHelpFormatter)
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        help=(
+            "The path to the config file to update. Will default to a file named default_config.yaml in the cache "
+            "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
+            "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
+            "with 'huggingface'."
+        ),
+    )
+    parser.set_defaults(func=update_config_command)
+    return parser
+def update_config_command(args):
+    config_file = update_config(args)
+    print(f"Successfully updated the configuration file at {config_file}.")

accelerate/commands/env.py ADDED Viewed

	@@ -0,0 +1,143 @@

+#!/usr/bin/env python
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import platform
+import subprocess
+import numpy as np
+import psutil
+import torch
+from accelerate import __version__ as version
+from accelerate.commands.config import default_config_file, load_config_from_file
+from ..utils import (
+    is_mlu_available,
+    is_musa_available,
+    is_neuron_available,
+    is_npu_available,
+    is_sdaa_available,
+    is_xpu_available,
+)
+def env_command_parser(subparsers=None):
+    if subparsers is not None:
+        parser = subparsers.add_parser("env")
+    else:
+        parser = argparse.ArgumentParser("Accelerate env command")
+    parser.add_argument(
+        "--config_file", default=None, help="The config file to use for the default values in the launching script."
+    )
+    if subparsers is not None:
+        parser.set_defaults(func=env_command)
+    return parser
+def env_command(args):
+    pt_version = torch.__version__
+    pt_cuda_available = torch.cuda.is_available()
+    pt_xpu_available = is_xpu_available()
+    pt_mlu_available = is_mlu_available()
+    pt_sdaa_available = is_sdaa_available()
+    pt_musa_available = is_musa_available()
+    pt_npu_available = is_npu_available()
+    pt_neuron_available = is_neuron_available()
+    accelerator = "N/A"
+    if pt_cuda_available:
+        accelerator = "CUDA"
+    elif pt_xpu_available:
+        accelerator = "XPU"
+    elif pt_mlu_available:
+        accelerator = "MLU"
+    elif pt_sdaa_available:
+        accelerator = "SDAA"
+    elif pt_musa_available:
+        accelerator = "MUSA"
+    elif pt_npu_available:
+        accelerator = "NPU"
+    elif pt_neuron_available:
+        accelerator = "NEURON"
+    accelerate_config = "Not found"
+    # Get the default from the config file.
+    if args.config_file is not None or os.path.isfile(default_config_file):
+        accelerate_config = load_config_from_file(args.config_file).to_dict()
+    # if we can run which, get it
+    command = None
+    bash_location = "Not found"
+    if os.name == "nt":
+        command = ["where", "accelerate"]
+    elif os.name == "posix":
+        command = ["which", "accelerate"]
+    if command is not None:
+        bash_location = subprocess.check_output(command, text=True, stderr=subprocess.STDOUT).strip()
+    info = {
+        "`Accelerate` version": version,
+        "Platform": platform.platform(),
+        "`accelerate` bash location": bash_location,
+        "Python version": platform.python_version(),
+        "Numpy version": np.__version__,
+        "PyTorch version": f"{pt_version}",
+        "PyTorch accelerator": accelerator,
+        "System RAM": f"{psutil.virtual_memory().total / 1024**3:.2f} GB",
+    }
+    if pt_cuda_available:
+        info["GPU type"] = torch.cuda.get_device_name()
+    elif pt_xpu_available:
+        info["XPU type"] = torch.xpu.get_device_name()
+    elif pt_mlu_available:
+        info["MLU type"] = torch.mlu.get_device_name()
+    elif pt_sdaa_available:
+        info["SDAA type"] = torch.sdaa.get_device_name()
+    elif pt_musa_available:
+        info["MUSA type"] = torch.musa.get_device_name()
+    elif pt_neuron_available:
+        info["NEURON type"] = torch.neuron.get_device_name()
+    elif pt_npu_available:
+        info["CANN version"] = torch.version.cann
+    print("\nCopy-and-paste the text below in your GitHub issue\n")
+    print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]))
+    print("- `Accelerate` default config:" if args.config_file is None else "- `Accelerate` config passed:")
+    accelerate_config_str = (
+        "\n".join([f"\t- {prop}: {val}" for prop, val in accelerate_config.items()])
+        if isinstance(accelerate_config, dict)
+        else f"\t{accelerate_config}"
+    )
+    print(accelerate_config_str)
+    info["`Accelerate` configs"] = accelerate_config
+    return info
+def main() -> int:
+    parser = env_command_parser()
+    args = parser.parse_args()
+    env_command(args)
+    return 0
+# When executed as a script, run `main()` and use its return value as the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

accelerate/commands/estimate.py ADDED Viewed

	@@ -0,0 +1,318 @@

+#!/usr/bin/env python
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+import torch
+from huggingface_hub import model_info
+from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
+from accelerate import init_empty_weights
+from accelerate.commands.utils import CustomArgumentParser
+from accelerate.utils import (
+    calculate_maximum_sizes,
+    convert_bytes,
+    is_timm_available,
+    is_transformers_available,
+)
+if is_transformers_available():
+    import transformers
+    from transformers import AutoConfig, AutoModel
+if is_timm_available():
+    import timm
+def verify_on_hub(repo: str, token: Optional[str] = None):
+    "Verifies that the model is on the hub and returns the model info."
+    try:
+        return model_info(repo, token=token)
+    except (OSError, GatedRepoError):
+        return "gated"
+    except RepositoryNotFoundError:
+        return "repo"
+def check_has_model(error):
+    """
+    Checks what library spawned `error` when a model is not found
+    Returns:
+        `str`: "timm" for a `RuntimeError` mentioning "Unknown model" (when timm is available), "transformers"
+        for an `OSError` mentioning "does not appear to have a file named" (when transformers is available),
+        and "unknown" otherwise.
+    """
+    # An exception may carry no args at all, or a non-string first arg (e.g. an errno), so normalise
+    # to text before doing substring checks.
+    error_args = getattr(error, "args", ())
+    message = str(error_args[0]) if error_args else ""
+    # The cheap local checks run first; library availability is only queried for a plausible match.
+    if isinstance(error, RuntimeError) and "Unknown model" in message and is_timm_available():
+        return "timm"
+    if (
+        isinstance(error, OSError)
+        and "does not appear to have a file named" in message
+        and is_transformers_available()
+    ):
+        return "transformers"
+    return "unknown"
+def create_empty_model(
+    model_name: str, library_name: str, trust_remote_code: bool = False, access_token: Optional[str] = None
+):
+    """
+    Creates an empty model in full precision from its parent library on the `Hub` to calculate the overall memory
+    consumption.
+    Args:
+        model_name (`str`):
+            The model name on the Hub
+        library_name (`str`):
+            The library the model has an integration with, such as `transformers`. Will be used if `model_name` has no
+            metadata on the Hub to determine the library.
+        trust_remote_code (`bool`, `optional`, defaults to `False`):
+            Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
+            should only be set to `True` for repositories you trust and in which you have read the code, as it will
+            execute code present on the Hub on your local machine.
+        access_token (`str`, `optional`, defaults to `None`):
+            The access token to use to access private or gated models on the Hub. (for use on the Gradio app)
+    Returns:
+        `torch.nn.Module`: The torch model that has been initialized on the `meta` device.
+    """
+    model_info = verify_on_hub(model_name, access_token)
+    # Simplified errors
+    if model_info == "gated":
+        raise OSError(
+            f"Repo for model `{model_name}` is gated. You must be authenticated to access it. Please run `huggingface-cli login`."
+        )
+    elif model_info == "repo":
+        raise OSError(
+            f"Repo for model `{model_name}` does not exist on the Hub. If you are trying to access a private repo,"
+            " make sure you are authenticated via `huggingface-cli login` and have access."
+        )
+    if library_name is None:
+        library_name = getattr(model_info, "library_name", False)
+        if not library_name:
+            raise ValueError(
+                f"Model `{model_name}` does not have any library metadata on the Hub, please manually pass in a `--library_name` to use (such as `transformers`)"
+            )
+    if library_name == "transformers":
+        if not is_transformers_available():
+            raise ImportError(
+                f"To check `{model_name}`, `transformers` must be installed. Please install it via `pip install transformers`"
+            )
+        print(f"Loading pretrained config for `{model_name}` from `transformers`...")
+        if model_info.config is None:
+            raise RuntimeError(f"Tried to load `{model_name}` with `transformers` but it does not have any metadata.")
+        auto_map = model_info.config.get("auto_map", False)
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code, token=access_token)
+        with init_empty_weights():
+            # remote code could specify a specific `AutoModel` class in the `auto_map`
+            constructor = AutoModel
+            if isinstance(auto_map, dict):
+                value = None
+                for key in auto_map.keys():
+                    if key.startswith("AutoModelFor"):
+                        value = key
+                        break
+                if value is not None:
+                    constructor = getattr(transformers, value)
+            # we need to pass the dtype, otherwise it is going to use the torch_dtype that is saved in the config
+            model = constructor.from_config(config, torch_dtype=torch.float32, trust_remote_code=trust_remote_code)
+    elif library_name == "timm":
+        if not is_timm_available():
+            raise ImportError(
+                f"To check `{model_name}`, `timm` must be installed. Please install it via `pip install timm`"
+            )
+        print(f"Loading pretrained config for `{model_name}` from `timm`...")
+        with init_empty_weights():
+            model = timm.create_model(model_name, pretrained=False)
+    else:
+        raise ValueError(
+            f"Library `{library_name}` is not supported yet, please open an issue on GitHub for us to add support."
+        )
+    return model
def create_ascii_table(headers: list, rows: list, title: str):
    """
    Creates a pretty table from a list of rows, minimal version of `tabulate`.

    Args:
        headers (`list`):
            Column titles, one string per column. Must contain at least one entry.
        rows (`list`):
            Table body; each row is a list of strings with one entry per header. May be empty, in which case only the
            title and header lines are rendered.
        title (`str`):
            Text centered in the banner line above the headers.

    Returns:
        `str`: The table drawn with box-drawing characters, lines joined with newlines and no trailing newline.
    """
    sep_char, in_between = "│", "─"
    column_widths = []
    for column, header in enumerate(headers):
        column_values = [row[column] for row in rows] + [header]
        column_widths.append(max(len(value) for value in column_values))

    # One `%Ns` slot per column. The count comes from the headers rather than `rows[0]` so that an empty `rows` list
    # no longer raises `IndexError`.
    formats = [f"%{width}s" for width in column_widths]
    pattern = f"{sep_char}{sep_char.join(formats)}{sep_char}"
    diff = 0

    def make_row(left_char, middle_char, right_char):
        # Reads `column_widths` and `diff` at call time, so it reflects the widening applied below.
        return f"{left_char}{middle_char.join([in_between * n for n in column_widths])}{in_between * diff}{right_char}"

    separator = make_row("├", "┼", "┤")
    if len(title) > sum(column_widths):
        # The title does not fit: widen the last column so the banner line can hold it.
        diff = abs(len(title) - len(separator))
        column_widths[-1] += diff

    # Update with diff
    separator = make_row("├", "┼", "┤")
    initial_rows = [
        make_row("┌", in_between, "┐"),
        f"{sep_char}{title.center(len(separator) - 2)}{sep_char}",
        make_row("├", "┬", "┤"),
    ]
    table = "\n".join(initial_rows) + "\n"
    # `make_row` appended `diff` extra fill characters on top of the widened column; fold them into the last column so
    # that header, body and bottom lines match the separator's length.
    column_widths[-1] += diff
    centered_line = [text.center(column_widths[column]) for column, text in enumerate(headers)]
    table += f"{pattern % tuple(centered_line)}\n{separator}\n"
    for line in rows:
        centered_line = [text.center(column_widths[column]) for column, text in enumerate(line)]
        table += f"{pattern % tuple(centered_line)}\n"
    table += f"└{'┴'.join([in_between * n for n in column_widths])}┘"
    return table
def estimate_command_parser(subparsers=None):
    """
    Builds the argument parser for the `estimate-memory` command.

    When `subparsers` is given, the parser is registered on it as the `estimate-memory` sub-command and
    `estimate_command` is installed as its `func` default; otherwise a standalone `CustomArgumentParser` is created.
    The configured parser is returned in both cases.
    """
    registered_as_subcommand = subparsers is not None
    if registered_as_subcommand:
        parser = subparsers.add_parser("estimate-memory")
    else:
        parser = CustomArgumentParser(
            description="Model size estimator for fitting a model onto device(e.g. cuda, xpu) memory."
        )

    supported_libraries = ["timm", "transformers"]
    supported_dtypes = ["float32", "float16", "int8", "int4"]

    parser.add_argument("model_name", type=str, help="The model name on the Hugging Face Hub.")
    parser.add_argument(
        "--library_name",
        type=str,
        help="The library the model has an integration with, such as `transformers`, needed only if this information is not stored on the Hub.",
        choices=supported_libraries,
    )
    parser.add_argument(
        "--dtypes",
        type=str,
        nargs="+",
        # Separate list object from `choices`, as in the two literals it replaces.
        default=list(supported_dtypes),
        help="The dtypes to use for the model, must be one (or many) of `float32`, `float16`, `int8`, and `int4`",
        choices=supported_dtypes,
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="""Whether or not to allow for custom models defined on the Hub in their own modeling files. This flag
                should only be used for repositories you trust and in which you have read the code, as it will execute
                code present on the Hub on your local machine.""",
        default=False,
    )

    if registered_as_subcommand:
        parser.set_defaults(func=estimate_command)
    return parser
def estimate_training_usage(bytes: int, mixed_precision: str, msamp_config: Optional[str] = None) -> dict:
    """
    Estimates the training memory needed for a batch size of 1 from a model size in `bytes`.

    Args:
        bytes (`int`):
            The size of the model being trained.
        mixed_precision (`str`):
            The mixed precision that would be ran.
        msamp_config (`str`):
            The msamp config to estimate the training memory for if `mixed_precision` is set to `"fp8"`.

    Returns:
        `dict`: Sizes under the keys `"model"`, `"optimizer"`, `"gradients"` and `"step"`. Every value is `-1` when
        the `mixed_precision` / `msamp_config` combination is not one of the cases handled below.
    """
    full_precision = bytes
    half_precision = bytes // 2

    if mixed_precision == "float32":
        return {
            "model": full_precision,
            "optimizer": full_precision * 2,
            "gradients": full_precision,
            "step": full_precision * 4,
        }

    # With native `TransformersEngine`, there is no memory savings with FP8, so it is sized like 16-bit mixed precision.
    native_fp8 = mixed_precision == "fp8" and msamp_config is None
    if native_fp8 or mixed_precision in ("float16", "bfloat16"):
        # 2x from optimizer states; the step is sized the same as the optimizer states.
        optimizer_states = full_precision * 2
        return {
            # With mixed precision training, the model has weights stored in FP16 and FP32
            "model": full_precision,
            "optimizer": optimizer_states,
            # 1.5 from weight gradient + computation (GEMM)
            "gradients": full_precision + half_precision,
            "step": optimizer_states,
        }

    return {"model": -1, "optimizer": -1, "gradients": -1, "step": -1}
def gather_data(args):
    """
    Creates an empty model and gathers the data for the sizes.

    Returns one `[dtype, largest_layer_size, total_size, training_sizes]` row per entry of `args.dtypes`, where
    `training_sizes` is the dictionary produced by `estimate_training_usage`.
    """
    try:
        model = create_empty_model(
            args.model_name, library_name=args.library_name, trust_remote_code=args.trust_remote_code
        )
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library == "unknown":
            raise e
        raise RuntimeError(
            f"Tried to load `{args.model_name}` with `{library}` but a possible model to load was not found inside the repo."
        )

    total_size, largest_layer = calculate_maximum_sizes(model)
    # Divisor applied to the measured sizes for each reduced-precision dtype; dtypes not listed are left unscaled.
    shrink_factor = {"float16": 2, "int8": 4, "int4": 8}

    data = []
    for dtype in args.dtypes:
        loaded_size = total_size
        layer_size = largest_layer[0]
        # The training estimate is always computed from the unscaled total size.
        training_size = estimate_training_usage(loaded_size, dtype)
        divisor = shrink_factor.get(dtype)
        if divisor is not None:
            loaded_size /= divisor
            layer_size /= divisor
        data.append([dtype, layer_size, loaded_size, training_size])
    return data
def estimate_command(args):
    """
    Gathers the size data for `args.model_name` and prints it as an ASCII table.

    Numeric cells are rendered with `convert_bytes`; the training-usage dictionary is collapsed to its largest value,
    shown as "N/A" when that value is the `-1` placeholder.
    """
    data = gather_data(args)
    for row in data:
        for position, cell in enumerate(row):
            if isinstance(cell, dict):
                peak_usage = max(cell.values())
                row[position] = "N/A" if peak_usage == -1 else convert_bytes(peak_usage)
            elif isinstance(cell, (int, float)):
                row[position] = convert_bytes(cell)

    title = f"Memory Usage for loading `{args.model_name}`"
    headers = ["dtype", "Largest Layer", "Total Size", "Training using Adam"]
    print(create_ascii_table(headers, data, title))
def main():
    """Parses the command line with the standalone estimate parser and runs the estimate command."""
    args = estimate_command_parser().parse_args()
    estimate_command(args)


if __name__ == "__main__":
    main()

accelerate/commands/launch.py ADDED Viewed

	@@ -0,0 +1,1415 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import importlib
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+import torch
+from accelerate.commands.config import default_config_file, load_config_from_file
+from accelerate.commands.config.config_args import SageMakerConfig
+from accelerate.commands.config.config_utils import DYNAMO_BACKENDS
+from accelerate.commands.utils import CustomArgumentParser
+from accelerate.state import get_int_from_env
+from accelerate.utils import (
+    ComputeEnvironment,
+    DistributedType,
+    PrepareForLaunch,
+    _filter_args,
+    check_cuda_p2p_ib_support,
+    convert_dict_to_env_variables,
+    is_bf16_available,
+    is_deepspeed_available,
+    is_hpu_available,
+    is_mlu_available,
+    is_musa_available,
+    is_neuron_available,
+    is_npu_available,
+    is_rich_available,
+    is_sagemaker_available,
+    is_sdaa_available,
+    is_torch_xla_available,
+    is_xpu_available,
+    patch_environment,
+    prepare_deepspeed_cmd_env,
+    prepare_multi_gpu_env,
+    prepare_sagemager_args_inputs,
+    prepare_simple_launcher_cmd_env,
+    prepare_tpu,
+    str_to_bool,
+)
+from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS, TORCH_DYNAMO_MODES
# Route log records through Rich when it is installed; without it no logging configuration is applied here.
if is_rich_available():
    from rich import get_console
    from rich.logging import RichHandler

    FORMAT = "%(message)s"
    logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()])

logger = logging.getLogger(__name__)

# Maps a cleaned command-line option name to the title of the argument group it belongs to.
options_to_group = dict(
    multi_gpu="Distributed GPUs",
    tpu="TPU",
    use_deepspeed="DeepSpeed Arguments",
    use_fsdp="FSDP Arguments",
    use_megatron_lm="Megatron-LM Arguments",
    fp8_backend="FP8 Arguments",
)
def clean_option(option):
    """
    Normalizes a long command-line option to its underscore name without the leading `--`.

    Any token containing `fp8_backend` is treated as `--fp8_backend`. Tokens that do not start with `--` yield `None`.
    """
    candidate = "--fp8_backend" if "fp8_backend" in option else option
    if not candidate.startswith("--"):
        return None
    without_dashes = candidate[2:]
    return without_dashes.replace("-", "_")
class CustomHelpFormatter(argparse.HelpFormatter):
    """
    This is a custom help formatter that will hide all arguments that are not used in the command line when the help is
    called. This is useful for the case where the user is using a specific platform and only wants to see the arguments
    for that platform.
    """

    # Groups in which only the flags actually present on the command line are displayed.
    _SELECTION_TITLES = ("Hardware Selection Arguments", "Training Paradigm Arguments")
    _SELECTED_SUFFIX = " (currently selected)"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Argument-group titles that are always eligible for display.
        self.titles = [
            "Hardware Selection Arguments",
            "Resource Selection Arguments",
            "Training Paradigm Arguments",
            "positional arguments",
            "optional arguments",
        ]

    def add_argument(self, action: argparse.Action):
        """
        Registers `action` for the help output, suppressing its help text when it is not relevant to the flags found in
        `sys.argv`, and dropping hyphenated long-option aliases from the displayed option strings.
        """
        if "accelerate" in sys.argv[0] and "launch" in sys.argv[1:]:
            args = sys.argv[2:]
        else:
            args = sys.argv[1:]

        if len(args) > 1:
            args = list(map(clean_option, args))
            used_platforms = [arg for arg in args if arg in options_to_group.keys()]
            used_titles = [options_to_group[o] for o in used_platforms]
            if action.container.title not in self.titles + used_titles:
                action.help = argparse.SUPPRESS
            elif action.container.title in self._SELECTION_TITLES:
                # `clean_option` strips the leading `--`, so the option strings must be normalized the same way before
                # comparing; comparing the raw `--flag` strings never matches. `None` (returned by `clean_option` for
                # tokens that are not long options) is excluded so that it cannot produce a false match.
                passed = {arg for arg in args if arg is not None}
                names = {clean_option(s) for s in action.option_strings} - {None}
                if names.isdisjoint(passed):
                    action.help = argparse.SUPPRESS
                elif not action.help.endswith(self._SELECTED_SUFFIX):
                    # Guarded so formatting the help more than once does not repeat the suffix.
                    action.help = action.help + self._SELECTED_SUFFIX

        # Display only the underscore spelling of each long option.
        action.option_strings = [s for s in action.option_strings if "-" not in s[2:]]
        super().add_argument(action)

    def end_section(self):
        """Blanks the heading and items of a section holding fewer than two items before closing it."""
        if len(self._current_section.items) < 2:
            self._current_section.items = []
            self._current_section.heading = ""
        super().end_section()
+def launch_command_parser(subparsers=None):
+    description = "Launch a python script in a distributed scenario. Arguments can be passed in with either hyphens (`--num-processes=2`) or underscores (`--num_processes=2`)"
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "launch", description=description, add_help=False, allow_abbrev=False, formatter_class=CustomHelpFormatter
+        )
+    else:
+        parser = CustomArgumentParser(
+            "Accelerate launch command",
+            description=description,
+            add_help=False,
+            allow_abbrev=False,
+            formatter_class=CustomHelpFormatter,
+        )
+    parser.add_argument("-h", "--help", action="help", help="Show this help message and exit.")
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        help="The config file to use for the default values in the launching script.",
+    )
+    parser.add_argument(
+        "--quiet",
+        "-q",
+        action="store_true",
+        help="Silence subprocess errors from the launch stack trace and only show the relevant tracebacks. (Only applicable to DeepSpeed and single-process configurations)",
+    )
+    # Hardware selection arguments
+    hardware_args = parser.add_argument_group(
+        "Hardware Selection Arguments", "Arguments for selecting the hardware to be used."
+    )
+    hardware_args.add_argument(
+        "--cpu", default=False, action="store_true", help="Whether or not to force the training on the CPU."
+    )
+    hardware_args.add_argument(
+        "--multi_gpu",
+        default=False,
+        action="store_true",
+        help="Whether or not this should launch a distributed GPU training.",
+    )
+    hardware_args.add_argument(
+        "--tpu", default=False, action="store_true", help="Whether or not this should launch a TPU training."
+    )
+    # Resource selection arguments
+    resource_args = parser.add_argument_group(
+        "Resource Selection Arguments", "Arguments for fine-tuning how available hardware should be used."
+    )
+    resource_args.add_argument(
+        "--mixed_precision",
+        type=str,
+        choices=["no", "fp16", "bf16", "fp8"],
+        help="Whether or not to use mixed precision training. "
+        "Choose between FP16 and BF16 (bfloat16) training. "
+        "BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.",
+    )
+    resource_args.add_argument(
+        "--num_processes", type=int, default=None, help="The total number of processes to be launched in parallel."
+    )
+    resource_args.add_argument(
+        "--num_machines", type=int, default=None, help="The total number of machines used in this training."
+    )
+    resource_args.add_argument(
+        "--num_cpu_threads_per_process",
+        type=int,
+        default=None,
+        help="The number of CPU threads per process. Can be tuned for optimal performance.",
+    )
+    resource_args.add_argument(
+        "--enable_cpu_affinity",
+        default=False,
+        action="store_true",
+        help="Whether or not CPU affinity and balancing should be enabled. Currently only supported on NVIDIA hardware.",
+    )
+    # Dynamo arguments
+    resource_args.add_argument(
+        "--dynamo_backend",
+        type=str,
+        choices=["no"] + [b.lower() for b in DYNAMO_BACKENDS],
+        help="Choose a backend to optimize your training with dynamo, see more at "
+        "https://github.com/pytorch/torchdynamo.",
+    )
+    resource_args.add_argument(
+        "--dynamo_mode",
+        type=str,
+        default="default",
+        choices=TORCH_DYNAMO_MODES,
+        help="Choose a mode to optimize your training with dynamo.",
+    )
+    resource_args.add_argument(
+        "--dynamo_use_fullgraph",
+        default=False,
+        action="store_true",
+        help="Whether to use full graph mode for dynamo or it is ok to break model into several subgraphs",
+    )
+    resource_args.add_argument(
+        "--dynamo_use_dynamic",
+        default=False,
+        action="store_true",
+        help="Whether to enable dynamic shape tracing.",
+    )
+    resource_args.add_argument(
+        "--dynamo_use_regional_compilation",
+        default=False,
+        action="store_true",
+        help="Whether to enable regional compilation.",
+    )
+    # Training Paradigm arguments
+    paradigm_args = parser.add_argument_group(
+        "Training Paradigm Arguments", "Arguments for selecting which training paradigm to be used."
+    )
+    paradigm_args.add_argument(
+        "--use_deepspeed",
+        default=False,
+        action="store_true",
+        help="Whether to use deepspeed.",
+    )
+    paradigm_args.add_argument(
+        "--use_fsdp",
+        default=False,
+        action="store_true",
+        help="Whether to use fsdp.",
+    )
+    paradigm_args.add_argument(
+        "--use_parallelism_config",
+        default=False,
+        action="store_true",
+        help="Whether to use the parallelism config to configure the N-d distributed training.",
+    )
+    paradigm_args.add_argument(
+        "--use_megatron_lm",
+        default=False,
+        action="store_true",
+        help="Whether to use Megatron-LM.",
+    )
+    # distributed GPU training arguments
+    distributed_args = parser.add_argument_group("Distributed GPUs", "Arguments related to distributed GPU training.")
+    distributed_args.add_argument(
+        "--gpu_ids",
+        default=None,
+        help="What GPUs (by id) should be used for training on this machine as a comma-separated list",
+    )
+    distributed_args.add_argument(
+        "--same_network",
+        default=False,
+        action="store_true",
+        help="Whether all machines used for multinode training exist on the same local network.",
+    )
+    distributed_args.add_argument(
+        "--machine_rank", type=int, default=None, help="The rank of the machine on which this script is launched."
+    )
+    distributed_args.add_argument(
+        "--main_process_ip", type=str, default=None, help="The IP address of the machine of rank 0."
+    )
+    distributed_args.add_argument(
+        "--main_process_port",
+        type=int,
+        default=None,
+        help="The port to use to communicate with the machine of rank 0.",
+    )
+    distributed_args.add_argument(
+        "-t",
+        "--tee",
+        default="0",
+        type=str,
+        help="Tee std streams into a log file and also to console.",
+    )
+    distributed_args.add_argument(
+        "--log_dir",
+        type=str,
+        default=None,
+        help=(
+            "Base directory to use for log files when using torchrun/torch.distributed.run as launcher. "
+            "Use with --tee to redirect std streams info log files."
+        ),
+    )
+    distributed_args.add_argument(
+        "--role",
+        type=str,
+        default="default",
+        help="User-defined role for the workers.",
+    )
+    # Rendezvous related arguments
+    distributed_args.add_argument(
+        "--rdzv_backend",
+        type=str,
+        default="static",
+        help="The rendezvous method to use, such as 'static' (the default) or 'c10d'",
+    )
+    distributed_args.add_argument(
+        "--rdzv_conf",
+        type=str,
+        default="",
+        help="Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).",
+    )
+    distributed_args.add_argument(
+        "--max_restarts",
+        type=int,
+        default=0,
+        help="Maximum number of worker group restarts before failing.",
+    )
+    distributed_args.add_argument(
+        "--monitor_interval",
+        type=float,
+        default=0.1,
+        help="Interval, in seconds, to monitor the state of workers.",
+    )
+    parser.add_argument(
+        "-m",
+        "--module",
+        action="store_true",
+        help="Change each process to interpret the launch script as a Python module, executing with the same behavior as 'python -m'.",
+    )
+    parser.add_argument(
+        "--no_python",
+        action="store_true",
+        help="Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.",
+    )
+    # TPU arguments
+    tpu_args = parser.add_argument_group("TPU", "Arguments related to TPU.")
+    tpu_args.add_argument(
+        "--tpu_cluster",
+        action="store_true",
+        dest="tpu_use_cluster",
+        help="Whether to use a GCP TPU pod for training.",
+    )
+    tpu_args.add_argument(
+        "--no_tpu_cluster",
+        action="store_false",
+        dest="tpu_use_cluster",
+        help="Should not be passed explicitly, this is for internal use only.",
+    )
+    tpu_args.add_argument(
+        "--tpu_use_sudo",
+        action="store_true",
+        help="Whether to use `sudo` when running the TPU training script in each pod.",
+    )
+    tpu_args.add_argument(
+        "--vm",
+        type=str,
+        action="append",
+        help=(
+            "List of single Compute VM instance names. "
+            "If not provided we assume usage of instance groups. For TPU pods."
+        ),
+    )
+    tpu_args.add_argument(
+        "--env",
+        type=str,
+        action="append",
+        help="List of environment variables to set on the Compute VM instances. For TPU pods.",
+    )
+    tpu_args.add_argument(
+        "--main_training_function",
+        type=str,
+        default=None,
+        help="The name of the main function to be executed in your script (only for TPU training).",
+    )
+    tpu_args.add_argument(
+        "--downcast_bf16",
+        action="store_true",
+        help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.",
+    )
+    # DeepSpeed arguments
+    deepspeed_args = parser.add_argument_group("DeepSpeed Arguments", "Arguments related to DeepSpeed.")
+    deepspeed_args.add_argument(
+        "--deepspeed_config_file",
+        default=None,
+        type=str,
+        help="DeepSpeed config file.",
+    )
+    deepspeed_args.add_argument(
+        "--zero_stage",
+        default=None,
+        type=int,
+        help="DeepSpeed's ZeRO optimization stage (useful only when `use_deepspeed` flag is passed). "
+        "If unspecified, will default to `2`.",
+    )
+    deepspeed_args.add_argument(
+        "--offload_optimizer_device",
+        default=None,
+        type=str,
+        help="Decides where (none|cpu|nvme) to offload optimizer states (useful only when `use_deepspeed` flag is passed). "
+        "If unspecified, will default to 'none'.",
+    )
+    deepspeed_args.add_argument(
+        "--offload_param_device",
+        default=None,
+        type=str,
+        help="Decides where (none|cpu|nvme) to offload parameters (useful only when `use_deepspeed` flag is passed). "
+        "If unspecified, will default to 'none'.",
+    )
+    deepspeed_args.add_argument(
+        "--offload_optimizer_nvme_path",
+        default=None,
+        type=str,
+        help="Decides Nvme Path to offload optimizer states (useful only when `use_deepspeed` flag is passed). "
+        "If unspecified, will default to 'none'.",
+    )
+    deepspeed_args.add_argument(
+        "--offload_param_nvme_path",
+        default=None,
+        type=str,
+        help="Decides Nvme Path to offload parameters (useful only when `use_deepspeed` flag is passed). "
+        "If unspecified, will default to 'none'.",
+    )
+    deepspeed_args.add_argument(
+        "--gradient_accumulation_steps",
+        default=None,
+        type=int,
+        help="No of gradient_accumulation_steps used in your training script (useful only when `use_deepspeed` flag is passed). "
+        "If unspecified, will default to `1`.",
+    )
+    deepspeed_args.add_argument(
+        "--gradient_clipping",
+        default=None,
+        type=float,
+        help="gradient clipping value used in your training script (useful only when `use_deepspeed` flag is passed). "
+        "If unspecified, will default to `1.0`.",
+    )
+    deepspeed_args.add_argument(
+        "--zero3_init_flag",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to enable `deepspeed.zero.Init` for constructing massive models. "
+        "Only applicable with DeepSpeed ZeRO Stage-3. If unspecified, will default to `true`.",
+    )
+    deepspeed_args.add_argument(
+        "--zero3_save_16bit_model",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to save 16-bit model weights when using ZeRO Stage-3. "
+        "Only applicable with DeepSpeed ZeRO Stage-3. If unspecified, will default to `false`.",
+    )
+    deepspeed_args.add_argument(
+        "--deepspeed_hostfile",
+        default=None,
+        type=str,
+        help="DeepSpeed hostfile for configuring multi-node compute resources.",
+    )
+    deepspeed_args.add_argument(
+        "--deepspeed_exclusion_filter",
+        default=None,
+        type=str,
+        help="DeepSpeed exclusion filter string when using multi-node setup.",
+    )
+    deepspeed_args.add_argument(
+        "--deepspeed_inclusion_filter",
+        default=None,
+        type=str,
+        help="DeepSpeed inclusion filter string when using multi-node setup.",
+    )
+    deepspeed_args.add_argument(
+        "--deepspeed_multinode_launcher",
+        default=None,
+        type=str,
+        help="DeepSpeed multi-node launcher to use, e.g. `pdsh`, `standard`, `openmpi`, `mvapich`, `mpich`, `slurm`, `nossh` (requires DeepSpeed >= 0.14.5). If unspecified, will default to `pdsh`.",
+    )
+    deepspeed_args.add_argument(
+        "--deepspeed_moe_layer_cls_names",
+        default=None,
+        type=str,
+        help="comma-separated list of transformer MoE layer class names (case-sensitive) to wrap ,e.g, `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
+        " (useful only when `use_deepspeed` flag is passed).",
+    )
+    # fsdp arguments
+    fsdp_args = parser.add_argument_group("FSDP Arguments", "Arguments related to Fully Shared Data Parallelism.")
+    fsdp_args.add_argument(
+        "--fsdp_version",
+        type=str,
+        default="1",
+        choices=["1", "2"],
+        help="FSDP version to use. (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_offload_params",
+        default="false",
+        type=str,
+        help="Decides Whether (true|false) to offload parameters and gradients to CPU. (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_min_num_params",
+        type=int,
+        default=int(1e8),
+        help="FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `use_fsdp` flag is passed).",
+    )
+    # We enable this for backwards compatibility, throw a warning if this is set in `FullyShardedDataParallelPlugin`
+    fsdp_args.add_argument(
+        "--fsdp_sharding_strategy",
+        type=str,
+        default="FULL_SHARD",
+        help="FSDP's sharding strategy. (useful only when `use_fsdp` flag is passed and `fsdp_version=1`).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_reshard_after_forward",
+        type=str,
+        default="true",
+        help="FSDP's Reshard After Forward Strategy. (useful only when `use_fsdp` flag is passed). Supports either boolean (FSDP2) or `FULL_SHARD | SHARD_GRAD_OP | NO_RESHARD` (FSDP1).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_auto_wrap_policy",
+        type=str,
+        default=None,
+        help="FSDP's auto wrap policy. (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_transformer_layer_cls_to_wrap",
+        default=None,
+        type=str,
+        help="Transformer layer class name (case-sensitive) to wrap ,e.g, `BertLayer`, `GPTJBlock`, `T5Block` .... "
+        "(useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_backward_prefetch",
+        default=None,
+        type=str,
+        help="FSDP's backward prefetch policy. (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_state_dict_type",
+        default=None,
+        type=str,
+        help="FSDP's state dict type. (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_forward_prefetch",
+        default="false",
+        type=str,
+        help="If True, then FSDP explicitly prefetches the next upcoming "
+        "all-gather while executing in the forward pass (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_use_orig_params",
+        default="true",
+        type=str,
+        help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable parameters."
+        " (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_cpu_ram_efficient_loading",
+        default="true",
+        type=str,
+        help="If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. "
+        "Only applicable for 🤗 Transformers. When using this, `--fsdp_sync_module_states` needs to True. "
+        "(useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_sync_module_states",
+        default="true",
+        type=str,
+        help="If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0."
+        " (useful only when `use_fsdp` flag is passed).",
+    )
+    fsdp_args.add_argument(
+        "--fsdp_activation_checkpointing",
+        default="false",
+        type=str,
+        help="Decides Whether (true|false) intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder. (useful only when `use_fsdp` flag is passed).",
+    )
+    # megatron_lm args
+    megatron_lm_args = parser.add_argument_group("Megatron-LM Arguments", "Arguments related to Megatron-LM.")
+    megatron_lm_args.add_argument(
+        "--megatron_lm_tp_degree",
+        type=int,
+        default=1,
+        help="Megatron-LM's Tensor Parallelism (TP) degree. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_use_custom_fsdp",
+        type=bool,
+        default=False,
+        help="Whether to use custom FSDP. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_no_load_optim",
+        type=bool,
+        default=False,
+        help="Whether to not load optimizer. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_eod_mask_loss",
+        type=bool,
+        default=False,
+        help="Whether to use eod mask loss. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_overlap_cpu_optimizer_d2h_h2d",
+        type=bool,
+        default=False,
+        help="Whether to overlap CPU optimizer step, gradients D2H and updated parameters H2D. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_no_save_optim",
+        type=bool,
+        default=False,
+        help="Whether to not save optimizer. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_optimizer_cpu_offload",
+        type=bool,
+        default=False,
+        help="Whether to use CPU offload for optimizer. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_use_precision_aware_optimizer",
+        type=bool,
+        default=False,
+        help="Whether to use precision aware optimizer. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_decoder_last_pipeline_num_layers",
+        type=int,
+        default=None,
+        help="Megatron-LM's decoder last pipeline number of layers, default None is even split of transformer layers across all pipeline stages.",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_pp_degree",
+        type=int,
+        default=1,
+        help="Megatron-LM's Pipeline Parallelism (PP) degree. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_num_micro_batches",
+        type=int,
+        default=None,
+        help="Megatron-LM's number of micro batches when PP degree > 1. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_sequence_parallelism",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to enable Sequence Parallelism when TP degree > 1. "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_recompute_activations",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to enable Selective Activation Recomputation. "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_use_distributed_optimizer",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to use distributed optimizer "
+        "which shards optimizer state and gradients across Data Pralellel (DP) ranks. "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_gradient_clipping",
+        default=1.0,
+        type=float,
+        help="Megatron-LM's gradient clipping value based on global L2 Norm (0 to disable). "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_recompute_granularity",
+        default=None,
+        type=str,
+        help="Megatron-LM's recompute granularity (full, selective). "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_recompute_method",
+        default=None,
+        type=str,
+        help="Megatron-LM's recompute method (uniform, block). (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_recompute_num_layers",
+        default=None,
+        type=int,
+        help="Megatron-LM's number of layers to recompute. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_attention_backend",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to enable attention backend. "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_expert_model_parallel_size",
+        default=None,
+        type=int,
+        help="Megatron-LM's expert model parallel size. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_context_parallel_size",
+        default=None,
+        type=int,
+        help="Megatron-LM's context parallel size. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_attention_dropout",
+        default=None,
+        type=float,
+        help="Megatron-LM's attention dropout rate. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_hidden_dropout",
+        default=None,
+        type=float,
+        help="Megatron-LM's hidden dropout rate. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_attention_softmax_in_fp32",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to use fp32 for attention softmax. "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_expert_tensor_parallel_size",
+        default=None,
+        type=int,
+        help="Megatron-LM's expert tensor parallel size. (useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_calculate_per_token_loss",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to calculate per token loss. "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    megatron_lm_args.add_argument(
+        "--megatron_lm_use_rotary_position_embeddings",
+        default=None,
+        type=str,
+        help="Decides Whether (true|false) to use rotary position embeddings. "
+        "(useful only when `use_megatron_lm` flag is passed).",
+    )
+    # FP8 arguments
+    fp8_args = parser.add_argument_group(
+        "FP8 Arguments", "Arguments related to FP8 training (requires `--mixed_precision=fp8`)"
+    )
+    fp8_args.add_argument(
+        "--fp8_backend",
+        type=str,
+        choices=["ao", "te", "msamp"],
+        help="Choose a backend to train with FP8 (ao: torchao, te: TransformerEngine, msamp: MS-AMP)",
+    )
+    fp8_args.add_argument(
+        "--fp8_use_autocast_during_eval",
+        default=False,
+        action="store_true",
+        help="Whether to use FP8 autocast during eval mode (useful only when `--fp8_backend=te` is passed). Generally better metrics are found when this is not passed.",
+    )
+    fp8_args.add_argument(
+        "--fp8_margin",
+        type=int,
+        default=0,
+        help="The margin to use for the gradient scaling (useful only when `--fp8_backend=te` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_interval",
+        type=int,
+        default=1,
+        help="The interval to use for how often the scaling factor is recomputed (useful only when `--fp8_backend=te` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_format",
+        type=str,
+        default="HYBRID",
+        choices=["HYBRID", "E4M3", "E5M2"],
+        help="The format to use for the FP8 recipe (useful only when `--fp8_backend=te` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_amax_history_len",
+        type=int,
+        default=1024,
+        help="The length of the history to use for the scaling factor computation (useful only when `--fp8_backend=te` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_amax_compute_algo",
+        type=str,
+        default="most_recent",
+        choices=["max", "most_recent"],
+        help="The algorithm to use for the scaling factor computation. (useful only when `--fp8_backend=te` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_override_linear_precision",
+        type=lambda x: tuple(map(str_to_bool, x.split(","))),
+        default=(False, False, False),
+        help="Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision. Should be passed in a comma-separated string of booleans (useful only when `--fp8_backend=te` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_opt_level",
+        type=str,
+        default="O2",
+        choices=["O1", "O2"],
+        help="What level of 8-bit collective communication should be used with MS-AMP (useful only when `--fp8_backend=msamp` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_enable_fsdp_float8_all_gather",
+        default="true",
+        type=str_to_bool,
+        help="Whether to enable FSDP2 float8 all gather (useful only when `--fp8_backend=ao` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_pad_inner_dim",
+        default="true",
+        type=str_to_bool,
+        help="Whether to pad the inner dimension for FP8 GEMMs (useful only when `--fp8_backend=ao` is passed).",
+    )
+    # AWS arguments
+    aws_args = parser.add_argument_group("AWS Arguments", "Arguments related to AWS.")
+    aws_args.add_argument(
+        "--aws_access_key_id",
+        type=str,
+        default=None,
+        help="The AWS_ACCESS_KEY_ID used to launch the Amazon SageMaker training job",
+    )
+    aws_args.add_argument(
+        "--aws_secret_access_key",
+        type=str,
+        default=None,
+        help="The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job.",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Whether to print out the torch.distributed stack trace when something fails.",
+    )
+    parser.add_argument(
+        "training_script",
+        type=str,
+        help=(
+            "The full path to the script to be launched in parallel, followed by all the arguments for the training "
+            "script."
+        ),
+    )
+    # MPI arguments
+    mpirun_args = parser.add_argument_group("MPI Arguments", "Arguments related to mpirun for Multi-CPU")
+    mpirun_args.add_argument(
+        "--mpirun_hostfile",
+        type=str,
+        default=None,
+        help="Location for a hostfile for using Accelerate to launch a multi-CPU training job with mpirun. This will "
+        "get passed to the MPI --hostfile or -f parameter, depending on which MPI program is installed.",
+    )
+    # ParallelismConfig arguments
+    parallelism_config_args = parser.add_argument_group(
+        "ParallelismConfig Arguments",
+        "Arguments related to the ParallelismConfig used for distributed training.",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_dp_replicate_size",
+        type=int,
+        default=1,
+        help="The number of processes for data parallel training. Defaults to 1 (no data parallelism).",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_dp_shard_size",
+        type=int,
+        default=1,
+        help="The number of processes for FSDP sharding. Defaults to 1 (No FSDP sharding).",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_tp_size",
+        type=int,
+        default=1,
+        help="The number of processes for tensor parallel training. Defaults to 1 (no tensor parallelism).",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_cp_size",
+        type=int,
+        default=1,
+        help="The number of processese for context parallel training. Defaults to 1 (no context parallelism).",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_cp_backend",
+        type=str,
+        choices=["torch"],
+        default="torch",
+        help="Context Parallelism backend: torch (FSDP2) or deepspeed (ALST/Ulysses)",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_cp_comm_strategy",
+        type=str,
+        default="allgather",
+        help="The communication strategy for context parallel training. Defaults to 'allgather'. Other option is alltoall",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_sp_size",
+        type=int,
+        default=1,
+        help="The number of processese for context parallel training. Defaults to 1 (no context parallelism).",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_sp_backend",
+        type=str,
+        choices=["deepspeed"],
+        default="deepspeed",
+        help="Sequence Parallelism backend: deepspeed (ALST/Ulysses)",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_sp_seq_length",
+        type=str,
+        default=None,
+        help="Sequence length for when batches are all of the same length. For variable sequence lengths across batches set `parallelism_config_sp_seq_length_is_variable=True`",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_sp_seq_length_is_variable",
+        type=bool,
+        default=True,
+        help="If `True` will work with a sequence length that may change between batches, in which case `parallelism_config_sp_seq_length` value can be set to anything divisible by sp size or remain unset. If `False` then `parallelism_config_sp_seq_length` needs to match the batch's sequence length dimension. The default is `True`.",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_sp_attn_implementation",
+        type=str,
+        default="sdpa",
+        help="Attention implementation to use. Can be one of 'flash_attention_2', 'flash_attention_3' or 'sdpa'. Defaults to `sdpa`.",
+    )
+    # Other arguments of the training scripts
+    parser.add_argument("training_script_args", nargs=argparse.REMAINDER, help="Arguments of the training script.")
+    if subparsers is not None:
+        parser.set_defaults(func=launch_command)
+    return parser
+def simple_launcher(args):
+    cmd, current_env = prepare_simple_launcher_cmd_env(args)
+    process = subprocess.Popen(cmd, env=current_env)
+    process.wait()
+    if process.returncode != 0:
+        if not args.quiet:
+            raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
+        else:
+            sys.exit(1)
+def multi_gpu_launcher(args):
+    import torch.distributed.run as distrib_run
+    current_env = prepare_multi_gpu_env(args)
+    if not check_cuda_p2p_ib_support():
+        message = "Using RTX 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled."
+        warn = False
+        if "NCCL_P2P_DISABLE" not in current_env:
+            current_env["NCCL_P2P_DISABLE"] = "1"
+            warn = True
+        if "NCCL_IB_DISABLE" not in current_env:
+            current_env["NCCL_IB_DISABLE"] = "1"
+            warn = True
+        if warn:
+            logger.warning(message)
+    debug = getattr(args, "debug", False)
+    args = _filter_args(
+        args,
+        distrib_run.get_args_parser(),
+        ["--training_script", args.training_script, "--training_script_args", args.training_script_args],
+    )
+    with patch_environment(**current_env):
+        try:
+            distrib_run.run(args)
+        except Exception:
+            if is_rich_available() and debug:
+                console = get_console()
+                console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
+                console.print_exception(suppress=[__file__], show_locals=False)
+            else:
+                raise
+def deepspeed_launcher(args):
+    import torch.distributed.run as distrib_run
+    if not is_deepspeed_available():
+        raise ImportError("DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source.")
+    else:
+        from deepspeed.launcher.runner import DEEPSPEED_ENVIRONMENT_NAME
+    cmd, current_env = prepare_deepspeed_cmd_env(args)
+    if not check_cuda_p2p_ib_support():
+        message = "Using RTX 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled."
+        warn = False
+        if "NCCL_P2P_DISABLE" not in current_env:
+            current_env["NCCL_P2P_DISABLE"] = "1"
+            warn = True
+        if "NCCL_IB_DISABLE" not in current_env:
+            current_env["NCCL_IB_DISABLE"] = "1"
+            warn = True
+        if warn:
+            logger.warning(message)
+    if args.num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
+        with open(DEEPSPEED_ENVIRONMENT_NAME, "a") as f:
+            valid_env_items = convert_dict_to_env_variables(current_env)
+            if len(valid_env_items) > 1:
+                f.writelines(valid_env_items)
+        process = subprocess.Popen(cmd, env=current_env)
+        process.wait()
+        if process.returncode != 0:
+            if not args.quiet:
+                raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
+            else:
+                sys.exit(1)
+    else:
+        debug = getattr(args, "debug", False)
+        args = _filter_args(
+            args,
+            distrib_run.get_args_parser(),
+            ["--training_script", args.training_script, "--training_script_args", args.training_script_args],
+        )
+        with patch_environment(**current_env):
+            try:
+                distrib_run.run(args)
+            except Exception:
+                if is_rich_available() and debug:
+                    console = get_console()
+                    console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
+                    console.print_exception(suppress=[__file__], show_locals=False)
+                else:
+                    raise
+def tpu_launcher(args):
+    import torch_xla.distributed.xla_multiprocessing as xmp
+    if args.no_python:
+        raise ValueError("--no_python cannot be used with TPU launcher")
+    args, current_env = prepare_tpu(args, {})
+    if args.module:
+        mod_name = args.training_script
+    else:
+        # Import training_script as a module
+        script_path = Path(args.training_script)
+        sys.path.append(str(script_path.parent.resolve()))
+        mod_name = script_path.stem
+    mod = importlib.import_module(mod_name)
+    if not hasattr(mod, args.main_training_function):
+        raise ValueError(
+            f"Your training script should have a function named {args.main_training_function}, or you should pass a "
+            "different value to `--main_training_function`."
+        )
+    # Patch sys.argv
+    sys.argv = [mod.__file__] + args.training_script_args
+    main_function = getattr(mod, args.main_training_function)
+    with patch_environment(**current_env):
+        xmp.spawn(PrepareForLaunch(main_function), args=())
+def tpu_pod_launcher(args):
+    from torch_xla.distributed import xla_dist
+    current_env = {}
+    args, current_env = prepare_tpu(args, current_env, True)
+    debug = getattr(args, "debug", False)
+    training_script = args.training_script
+    training_script_args = args.training_script_args
+    new_args = _filter_args(
+        args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"]
+    )
+    if args.tpu_use_sudo:
+        new_cmd = ["sudo"]
+    else:
+        new_cmd = []
+    new_cmd += [
+        "accelerate-launch",
+        "--tpu",
+        "--no_tpu_cluster",
+        "--num_machines",
+        "1",
+        "--mixed_precision",
+        "no",
+        "--dynamo_backend",
+        "no",
+        "--num_processes",
+        str(args.num_processes),
+        "--main_training_function",
+        str(args.main_training_function),
+        training_script,
+    ] + training_script_args
+    new_args.positional = new_cmd
+    bad_flags = ""
+    for arg in vars(new_args):
+        if arg.startswith("docker_"):
+            value = getattr(new_args, arg)
+            if value != "" and value is not None:
+                bad_flags += f'{arg}="{value}"\n'
+    if bad_flags != "":
+        raise ValueError(
+            f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}"
+        )
+    new_args.env = [f"{k}={v}" for k, v in current_env.items()]
+    new_args.env.append("ACCELERATE_IN_TPU_POD=1")
+    try:
+        xla_dist.resolve_and_execute(new_args)
+    except Exception:
+        if is_rich_available() and debug:
+            console = get_console()
+            console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]")
+            console.print_exception(suppress=[__file__], show_locals=False)
+        else:
+            raise
+def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
+    if not is_sagemaker_available():
+        raise ImportError(
+            "Please install sagemaker to be able to launch training on Amazon SageMaker with `pip install accelerate[sagemaker]`"
+        )
+    if args.module or args.no_python:
+        raise ValueError(
+            "SageMaker requires a python training script file and cannot be used with --module or --no_python"
+        )
+    from sagemaker.huggingface import HuggingFace
+    args, sagemaker_inputs = prepare_sagemager_args_inputs(sagemaker_config, args)
+    huggingface_estimator = HuggingFace(**args)
+    huggingface_estimator.fit(inputs=sagemaker_inputs)
+    print(f"You can find your model data at: {huggingface_estimator.model_data}")
+def _validate_launch_command(args):
+    """
+    Sanity-check the parsed launch arguments and fill in unset values, mutating `args` in place.
+
+    Unset values come from the loaded config file when one is used, otherwise from detected devices and
+    hard-coded fallbacks. Fallbacks chosen on the no-config path are collected and reported in a single warning.
+
+    Args:
+        args: The parsed argument namespace of the launch command.
+
+    Returns:
+        A tuple `(args, defaults, mp_from_config_flag)`. `defaults` is the loaded config, or `None` when no
+        config file was used. `mp_from_config_flag` is `True` only when `mixed_precision` was taken from a
+        non-`None` value in the config file.
+
+    Raises:
+        ValueError: more than one of `--cpu`, `--multi_gpu`, `--tpu`, `--use_deepspeed`, `--use_fsdp` is set;
+            `--multi_gpu` is used with fewer than 2 processes; `--use_parallelism_config` is used without FSDP
+            version 2; fewer than two GPU ids are configured for a single-machine multi-GPU run; bf16 was
+            requested but is not supported; or the config path ends up with `num_processes == -1`.
+    """
+    # Sanity checks
+    if sum([args.multi_gpu, args.cpu, args.tpu, args.use_deepspeed, args.use_fsdp]) > 1:
+        raise ValueError(
+            "You can only use one of `--cpu`, `--multi_gpu`, `--tpu`, `--use_deepspeed`, `--use_fsdp` at a time."
+        )
+    if args.multi_gpu and (args.num_processes is not None) and (args.num_processes < 2):
+        raise ValueError("You need to use at least 2 processes to use `--multi_gpu`.")
+    if (not args.use_fsdp or args.fsdp_version == 1) and args.use_parallelism_config:
+        raise ValueError("You cannot use `--use_parallelism_config` without `--use_fsdp` and `--fsdp_version=2`. ")
+    defaults = None
+    # Human-readable notes about values that had to be defaulted; reported together at the end.
+    warned = []
+    mp_from_config_flag = False
+    # Get the default from the config file.
+    # NOTE: `and` binds tighter than `or`, so an explicit `--config_file` is always loaded, while the default
+    # config file is only used when `--cpu` was not passed.
+    if args.config_file is not None or os.path.isfile(default_config_file) and not args.cpu:
+        defaults = load_config_from_file(args.config_file)
+        # Only derive the distributed mode from the config when no mode flag was given on the command line.
+        if (
+            not args.multi_gpu
+            and not args.tpu
+            and not args.tpu_use_cluster
+            and not args.use_deepspeed
+            and not args.use_fsdp
+            and not args.use_megatron_lm
+        ):
+            args.use_deepspeed = defaults.distributed_type == DistributedType.DEEPSPEED
+            args.multi_gpu = (
+                True
+                if defaults.distributed_type
+                in (
+                    DistributedType.MULTI_GPU,
+                    DistributedType.MULTI_NPU,
+                    DistributedType.MULTI_MLU,
+                    DistributedType.MULTI_SDAA,
+                    DistributedType.MULTI_MUSA,
+                    DistributedType.MULTI_XPU,
+                    DistributedType.MULTI_HPU,
+                    DistributedType.MULTI_NEURON,
+                )
+                else False
+            )
+            args.tpu = defaults.distributed_type == DistributedType.XLA
+            args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
+            args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
+            args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
+            args.use_parallelism_config = defaults.parallelism_config != {}
+        # GPU ids: command line first, then config, then "all".
+        if args.gpu_ids is None:
+            if defaults.gpu_ids is not None:
+                args.gpu_ids = defaults.gpu_ids
+            else:
+                args.gpu_ids = "all"
+        if args.multi_gpu and args.num_machines is None:
+            args.num_machines = defaults.num_machines
+        if len(args.gpu_ids.split(",")) < 2 and (args.gpu_ids != "all") and args.multi_gpu and args.num_machines <= 1:
+            raise ValueError(
+                "Less than two GPU ids were configured and tried to run on on multiple GPUs. "
+                "Please ensure at least two are specified for `--gpu_ids`, or use `--gpu_ids='all'`."
+            )
+        # Config values are only copied onto `args` for local-machine configs.
+        if defaults.compute_environment == ComputeEnvironment.LOCAL_MACHINE:
+            # Update args with the defaults
+            for name, attr in defaults.__dict__.items():
+                if isinstance(attr, dict):
+                    # Copy defaults.somedict.somearg to args.somearg and
+                    # defaults.fsdp_config.x to args.fsdp_x
+                    for key, value in attr.items():
+                        if name == "fsdp_config" and not key.startswith("fsdp"):
+                            key = "fsdp_" + key
+                        elif name == "fp8_config" and not key.startswith("fp8"):
+                            key = "fp8_" + key
+                        # When `args` has no `nondefault` attribute, nothing from nested config dicts is copied.
+                        # NOTE(review): `nondefault` presumably holds the options explicitly passed on the
+                        # command line -- confirm where it is populated.
+                        if hasattr(args, "nondefault") and key not in args.nondefault:
+                            setattr(args, key, value)
+                elif (
+                    name not in ["compute_environment", "mixed_precision", "distributed_type"]
+                    and getattr(args, name, None) is None
+                ):
+                    # Those args are handled separately
+                    setattr(args, name, attr)
+        if not args.debug:
+            args.debug = defaults.debug
+        if not args.mixed_precision:
+            if defaults.mixed_precision is None:
+                args.mixed_precision = "no"
+            else:
+                args.mixed_precision = defaults.mixed_precision
+                mp_from_config_flag = True
+        else:
+            # Mixed precision was already set on `args`: validate bf16 support up front.
+            native_amp = is_bf16_available(True)
+            if (
+                args.mixed_precision == "bf16"
+                and not native_amp
+                and not (args.tpu and is_torch_xla_available(check_is_tpu=True))
+            ):
+                raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")
+        # Silently set the default here
+        if args.dynamo_backend is None:
+            args.dynamo_backend = "no"
+        if args.num_processes == -1:
+            raise ValueError("You need to manually pass in `--num_processes` using this config yaml.")
+    else:
+        # No config file in use: infer values from the available devices and record each fallback in `warned`.
+        if args.num_processes is None:
+            # The first available backend in this order decides the process count; CUDA is the fallback.
+            if is_xpu_available():
+                args.num_processes = torch.xpu.device_count()
+            elif is_mlu_available():
+                args.num_processes = torch.mlu.device_count()
+            elif is_sdaa_available():
+                args.num_processes = torch.sdaa.device_count()
+            elif is_musa_available():
+                args.num_processes = torch.musa.device_count()
+            elif is_npu_available():
+                args.num_processes = torch.npu.device_count()
+            elif is_hpu_available():
+                args.num_processes = torch.hpu.device_count()
+            elif is_neuron_available():
+                args.num_processes = torch.neuron.device_count()
+            else:
+                args.num_processes = torch.cuda.device_count()
+            warned.append(f"\t`--num_processes` was set to a value of `{args.num_processes}`")
+        if args.debug is None:
+            args.debug = False
+        # Switch to multi-GPU automatically when several processes are requested and several devices exist.
+        if (
+            not args.multi_gpu
+            and args.num_processes > 1
+            and (
+                (is_xpu_available() and torch.xpu.device_count() > 1)
+                or (is_npu_available() and torch.npu.device_count() > 1)
+                or (is_hpu_available() and torch.hpu.device_count() > 1)
+                or (is_mlu_available() and torch.mlu.device_count() > 1)
+                or (is_sdaa_available() and torch.sdaa.device_count() > 1)
+                or (is_musa_available() and torch.musa.device_count() > 1)
+                or (is_neuron_available() and torch.neuron.device_count() > 1)
+                or (torch.cuda.is_available() and torch.cuda.device_count() > 1)
+            )
+        ):
+            warned.append(
+                "\t\tMore than one GPU was found, enabling multi-GPU training.\n"
+                "\t\tIf this was unintended please pass in `--num_processes=1`."
+            )
+            args.multi_gpu = True
+        if args.num_machines is None:
+            warned.append("\t`--num_machines` was set to a value of `1`")
+            args.num_machines = 1
+        if args.mixed_precision is None:
+            warned.append("\t`--mixed_precision` was set to a value of `'no'`")
+            args.mixed_precision = "no"
+        if not hasattr(args, "use_cpu"):
+            args.use_cpu = args.cpu
+        if args.dynamo_backend is None:
+            warned.append("\t`--dynamo_backend` was set to a value of `'no'`")
+            args.dynamo_backend = "no"
+    if args.debug:
+        logger.debug("Running script in debug mode, expect distributed operations to be slightly slower.")
+    # True when there is no config, or the config does not target Amazon SageMaker.
+    is_aws_env_disabled = defaults is None or (
+        defaults is not None and defaults.compute_environment != ComputeEnvironment.AMAZON_SAGEMAKER
+    )
+    if is_aws_env_disabled and args.num_cpu_threads_per_process is None:
+        args.num_cpu_threads_per_process = get_int_from_env(["OMP_NUM_THREADS"], 1)
+        # CPU training without OMP_NUM_THREADS: split the physical cores between the local processes.
+        if args.use_cpu and args.num_processes >= 1 and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0:
+            local_size = get_int_from_env(
+                ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
+                max(int(args.num_processes / args.num_machines), 1),
+            )
+            import psutil
+            threads_per_process = int(psutil.cpu_count(logical=False) / local_size)
+            if threads_per_process > 1:
+                args.num_cpu_threads_per_process = threads_per_process
+                warned.append(
+                    f"\t`--num_cpu_threads_per_process` was set to `{args.num_cpu_threads_per_process}` to improve out-of-box performance when training on CPUs"
+                )
+    # Emit one combined warning listing every value that was defaulted above.
+    if any(warned):
+        message = "The following values were not passed to `accelerate launch` and had defaults used instead:\n"
+        message += "\n".join(warned)
+        message += (
+            "\nTo avoid this warning pass in values for each of the problematic parameters or run `accelerate config`."
+        )
+        logger.warning(message)
+    return args, defaults, mp_from_config_flag
+def launch_command(args):
+    args, defaults, mp_from_config_flag = _validate_launch_command(args)
+    # Use the proper launcher
+    if args.use_deepspeed and not args.cpu:
+        args.deepspeed_fields_from_accelerate_config = list(defaults.deepspeed_config.keys()) if defaults else []
+        if mp_from_config_flag:
+            args.deepspeed_fields_from_accelerate_config.append("mixed_precision")
+        args.deepspeed_fields_from_accelerate_config = ",".join(args.deepspeed_fields_from_accelerate_config)
+        deepspeed_launcher(args)
+    elif args.use_fsdp and not args.cpu:
+        multi_gpu_launcher(args)
+    elif args.use_megatron_lm and not args.cpu:
+        multi_gpu_launcher(args)
+    elif args.multi_gpu and not args.cpu:
+        multi_gpu_launcher(args)
+    elif args.tpu and not args.cpu:
+        if args.tpu_use_cluster:
+            tpu_pod_launcher(args)
+        else:
+            tpu_launcher(args)
+    elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
+        sagemaker_launcher(defaults, args)
+    else:
+        simple_launcher(args)
+def main():
+    parser = launch_command_parser()
+    args = parser.parse_args()
+    launch_command(args)
+# Run the launcher when this module is executed directly as a script.
+if __name__ == "__main__":
+    main()

accelerate/commands/menu/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .selection_menu import BulletMenu

accelerate/commands/menu/cursor.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A utility for showing and hiding the terminal cursor on Windows and Linux, based on https://github.com/bchao1/bullet
+"""
+import os
+import sys
+from contextlib import contextmanager
+# Windows only
+if os.name == "nt":
+    import ctypes
+    import msvcrt  # noqa
+    class CursorInfo(ctypes.Structure):
+        # Structure that hide_cursor() / show_cursor() pass by reference to `GetConsoleCursorInfo` and
+        # `SetConsoleCursorInfo`; they only ever change `visible`.
+        # _fields is a specific attr expected by ctypes
+        _fields_ = [("size", ctypes.c_int), ("visible", ctypes.c_byte)]
+def hide_cursor():
+    if os.name == "nt":
+        ci = CursorInfo()
+        handle = ctypes.windll.kernel32.GetStdHandle(-11)
+        ctypes.windll.kernel32.GetConsoleCursorInfo(handle, ctypes.byref(ci))
+        ci.visible = False
+        ctypes.windll.kernel32.SetConsoleCursorInfo(handle, ctypes.byref(ci))
+    elif os.name == "posix":
+        sys.stdout.write("\033[?25l")
+        sys.stdout.flush()
+def show_cursor():
+    if os.name == "nt":
+        ci = CursorInfo()
+        handle = ctypes.windll.kernel32.GetStdHandle(-11)
+        ctypes.windll.kernel32.GetConsoleCursorInfo(handle, ctypes.byref(ci))
+        ci.visible = True
+        ctypes.windll.kernel32.SetConsoleCursorInfo(handle, ctypes.byref(ci))
+    elif os.name == "posix":
+        sys.stdout.write("\033[?25h")
+        sys.stdout.flush()
+@contextmanager
+def hide():
+    "Context manager to hide the terminal cursor"
+    # `hide_cursor` is called inside the `try` so that `show_cursor` runs on every exit path, including when
+    # hiding itself or the body of the `with` block raises.
+    try:
+        hide_cursor()
+        yield
+    finally:
+        show_cursor()

accelerate/commands/menu/helpers.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A variety of helper functions and constants when dealing with terminal menu choices, based on
+https://github.com/bchao1/bullet
+"""
+import enum
+import shutil
+import sys
+# Width of the terminal in columns, read once at import time (the height is discarded)
+TERMINAL_WIDTH, _ = shutil.get_terminal_size()
+# Final letter of the `\033[<n><letter>` escape sequence written by move_cursor() for each direction
+CURSOR_TO_CHAR = {"UP": "A", "DOWN": "B", "RIGHT": "C", "LEFT": "D"}
+class Direction(enum.Enum):
+    "Vertical directions; the member names (`UP`, `DOWN`) are also keys of `CURSOR_TO_CHAR`"
+    UP = 0
+    DOWN = 1
+def forceWrite(content, end=""):
+    sys.stdout.write(str(content) + end)
+    sys.stdout.flush()
+def writeColor(content, color, end=""):
+    forceWrite(f"\u001b[{color}m{content}\u001b[0m", end)
+def reset_cursor():
+    forceWrite("\r")
+def move_cursor(num_lines: int, direction: str):
+    forceWrite(f"\033[{num_lines}{CURSOR_TO_CHAR[direction.upper()]}")
+def clear_line():
+    forceWrite(" " * TERMINAL_WIDTH)
+    reset_cursor()
+def linebreak():
+    reset_cursor()
+    forceWrite("-" * TERMINAL_WIDTH)

accelerate/commands/menu/input.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains utilities for handling input from the user and registering specific keys to specific functions,
+based on https://github.com/bchao1/bullet
+"""
+from .keymap import KEYMAP, get_character
+def mark(key: str):
+    """
+    Mark the function with the key code so it can be handled in the register
+    """
+    def decorator(func):
+        handle = getattr(func, "handle_key", [])
+        handle += [key]
+        func.handle_key = handle
+        return func
+    return decorator
+def mark_multiple(*keys: list[str]):
+    """
+    Mark the function with the key codes so it can be handled in the register
+    """
+    def decorator(func):
+        handle = getattr(func, "handle_key", [])
+        handle += keys
+        func.handle_key = handle
+        return func
+    return decorator
+class KeyHandler(type):
+    """
+    Metaclass that adds the key handlers to the class
+    """
+    def __new__(cls, name, bases, attrs):
+        new_cls = super().__new__(cls, name, bases, attrs)
+        if not hasattr(new_cls, "key_handler"):
+            new_cls.key_handler = {}
+        new_cls.handle_input = KeyHandler.handle_input
+        for value in attrs.values():
+            handled_keys = getattr(value, "handle_key", [])
+            for key in handled_keys:
+                new_cls.key_handler[key] = value
+        return new_cls
+    @staticmethod
+    def handle_input(cls):
+        "Finds and returns the selected character if it exists in the handler"
+        char = get_character()
+        if char != KEYMAP["undefined"]:
+            char = ord(char)
+        handler = cls.key_handler.get(char)
+        if handler:
+            cls.current_selection = char
+            return handler(cls)
+        else:
+            return None
+def register(cls):
+    """Adds KeyHandler metaclass to the class"""
+    return KeyHandler(cls.__name__, cls.__bases__, cls.__dict__.copy())

accelerate/commands/menu/keymap.py ADDED Viewed

	@@ -0,0 +1,133 @@

+# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities relating to parsing raw characters from the keyboard, based on https://github.com/bchao1/bullet
+"""
+import os
+import string
+import sys
+# Added to the raw arrow codes (65-68) so that arrow keys get key codes distinct from those raw values
+ARROW_KEY_FLAG = 1 << 8
+# Key name -> integer key code, as compared against by get_character() and used to register handlers
+KEYMAP = {
+    "tab": ord("\t"),
+    "newline": ord("\r"),
+    "esc": 27,
+    "up": 65 + ARROW_KEY_FLAG,
+    "down": 66 + ARROW_KEY_FLAG,
+    "right": 67 + ARROW_KEY_FLAG,
+    "left": 68 + ARROW_KEY_FLAG,
+    "mod_int": 91,
+    "undefined": sys.maxsize,
+    "interrupt": 3,
+    "insert": 50,
+    "delete": 51,
+    "pg_up": 53,
+    "pg_down": 54,
+}
+# Inclusive bounds of the arrow key codes, used by get_character() for its range check
+KEYMAP["arrow_begin"] = KEYMAP["up"]
+KEYMAP["arrow_end"] = KEYMAP["left"]
+if sys.platform == "win32":
+    # Characters queued by get_raw_chars() to be returned by its following calls
+    WIN_CH_BUFFER = []
+    # Two-byte sequences read by get_raw_chars() on Windows -> raw (un-flagged) arrow codes
+    WIN_KEYMAP = {
+        b"\xe0H": KEYMAP["up"] - ARROW_KEY_FLAG,
+        b"\x00H": KEYMAP["up"] - ARROW_KEY_FLAG,
+        b"\xe0P": KEYMAP["down"] - ARROW_KEY_FLAG,
+        b"\x00P": KEYMAP["down"] - ARROW_KEY_FLAG,
+        b"\xe0M": KEYMAP["right"] - ARROW_KEY_FLAG,
+        b"\x00M": KEYMAP["right"] - ARROW_KEY_FLAG,
+        b"\xe0K": KEYMAP["left"] - ARROW_KEY_FLAG,
+        b"\x00K": KEYMAP["left"] - ARROW_KEY_FLAG,
+    }
+# The digits "0"-"9" map to their own character codes
+for i in range(10):
+    KEYMAP[str(i)] = ord(str(i))
+def get_raw_chars():
+    "Gets raw characters from inputs"
+    if os.name == "nt":
+        import msvcrt
+        encoding = "mbcs"
+        # Flush the keyboard buffer
+        while msvcrt.kbhit():
+            msvcrt.getch()
+        if len(WIN_CH_BUFFER) == 0:
+            # Read the keystroke
+            ch = msvcrt.getch()
+            # If it is a prefix char, get second part
+            if ch in (b"\x00", b"\xe0"):
+                ch2 = ch + msvcrt.getch()
+                # Translate actual Win chars to bullet char types
+                try:
+                    chx = chr(WIN_KEYMAP[ch2])
+                    WIN_CH_BUFFER.append(chr(KEYMAP["mod_int"]))
+                    WIN_CH_BUFFER.append(chx)
+                    if ord(chx) in (
+                        KEYMAP["insert"] - 1 << 9,
+                        KEYMAP["delete"] - 1 << 9,
+                        KEYMAP["pg_up"] - 1 << 9,
+                        KEYMAP["pg_down"] - 1 << 9,
+                    ):
+                        WIN_CH_BUFFER.append(chr(126))
+                    ch = chr(KEYMAP["esc"])
+                except KeyError:
+                    ch = ch2[1]
+            else:
+                ch = ch.decode(encoding)
+        else:
+            ch = WIN_CH_BUFFER.pop(0)
+    elif os.name == "posix":
+        import termios
+        import tty
+        fd = sys.stdin.fileno()
+        old_settings = termios.tcgetattr(fd)
+        try:
+            tty.setraw(fd)
+            ch = sys.stdin.read(1)
+        finally:
+            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+    return ch
+def get_character():
+    "Gets a character from the keyboard and returns the key code"
+    char = get_raw_chars()
+    if ord(char) in [KEYMAP["interrupt"], KEYMAP["newline"]]:
+        return char
+    elif ord(char) == KEYMAP["esc"]:
+        combo = get_raw_chars()
+        if ord(combo) == KEYMAP["mod_int"]:
+            key = get_raw_chars()
+            if ord(key) >= KEYMAP["arrow_begin"] - ARROW_KEY_FLAG and ord(key) <= KEYMAP["arrow_end"] - ARROW_KEY_FLAG:
+                return chr(ord(key) + ARROW_KEY_FLAG)
+            else:
+                return KEYMAP["undefined"]
+        else:
+            return get_raw_chars()
+    else:
+        if char in string.printable:
+            return char
+        else:
+            return KEYMAP["undefined"]

accelerate/commands/menu/selection_menu.py ADDED Viewed

	@@ -0,0 +1,145 @@

+# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Main driver for the selection menu, based on https://github.com/bchao1/bullet
+"""
+import builtins
+import sys
+from typing import Optional
+from ...utils.imports import _is_package_available
+from . import cursor, input
+from .helpers import Direction, clear_line, forceWrite, linebreak, move_cursor, reset_cursor, writeColor
+from .keymap import KEYMAP
+# Whether the `google.colab` package is available. `BulletMenu.run` then reads a typed choice index with
+# `builtins.input` instead of handling individual key presses.
+in_colab = False
+try:
+    in_colab = _is_package_available("google.colab")
+except ModuleNotFoundError:
+    pass
+@input.register
+class BulletMenu:
+    """
+    A CLI menu to select a choice from a list of choices using the keyboard.
+    """
+    def __init__(self, prompt: Optional[str] = None, choices: list = []):
+        self.position = 0
+        self.choices = choices
+        self.prompt = prompt
+        if sys.platform == "win32":
+            self.arrow_char = "*"
+        else:
+            self.arrow_char = "➔ "
+    def write_choice(self, index, end: str = ""):
+        if sys.platform != "win32":
+            writeColor(self.choices[index], 32, end)
+        else:
+            forceWrite(self.choices[index], end)
+    def print_choice(self, index: int):
+        "Prints the choice at the given index"
+        if index == self.position:
+            forceWrite(f" {self.arrow_char} ")
+            self.write_choice(index)
+        else:
+            forceWrite(f"    {self.choices[index]}")
+        reset_cursor()
+    def move_direction(self, direction: Direction, num_spaces: int = 1):
+        "Should not be directly called, used to move a direction of either up or down"
+        old_position = self.position
+        if direction == Direction.DOWN:
+            if self.position + 1 >= len(self.choices):
+                return
+            self.position += num_spaces
+        else:
+            if self.position - 1 < 0:
+                return
+            self.position -= num_spaces
+        clear_line()
+        self.print_choice(old_position)
+        move_cursor(num_spaces, direction.name)
+        self.print_choice(self.position)
+    @input.mark(KEYMAP["up"])
+    def move_up(self):
+        self.move_direction(Direction.UP)
+    @input.mark(KEYMAP["down"])
+    def move_down(self):
+        self.move_direction(Direction.DOWN)
+    @input.mark(KEYMAP["newline"])
+    def select(self):
+        move_cursor(len(self.choices) - self.position, "DOWN")
+        return self.position
+    @input.mark(KEYMAP["interrupt"])
+    def interrupt(self):
+        move_cursor(len(self.choices) - self.position, "DOWN")
+        raise KeyboardInterrupt
+    @input.mark_multiple(*[KEYMAP[str(number)] for number in range(10)])
+    def select_row(self):
+        index = int(chr(self.current_selection))
+        movement = index - self.position
+        if index == self.position:
+            return
+        if index < len(self.choices):
+            if self.position > index:
+                self.move_direction(Direction.UP, -movement)
+            elif self.position < index:
+                self.move_direction(Direction.DOWN, movement)
+            else:
+                return
+        else:
+            return
+    def run(self, default_choice: int = 0):
+        "Start the menu and return the selected choice"
+        if self.prompt:
+            linebreak()
+            forceWrite(self.prompt, "\n")
+            if in_colab:
+                forceWrite("Please input a choice index (starting from 0), and press enter", "\n")
+            else:
+                forceWrite("Please select a choice using the arrow or number keys, and selecting with enter", "\n")
+        self.position = default_choice
+        for i in range(len(self.choices)):
+            self.print_choice(i)
+            forceWrite("\n")
+        move_cursor(len(self.choices) - self.position, "UP")
+        with cursor.hide():
+            while True:
+                if in_colab:
+                    try:
+                        choice = int(builtins.input())
+                    except ValueError:
+                        choice = default_choice
+                else:
+                    choice = self.handle_input()
+                if choice is not None:
+                    reset_cursor()
+                    for _ in range(len(self.choices) + 1):
+                        move_cursor(1, "UP")
+                        clear_line()
+                    self.write_choice(choice, "\n")
+                    return choice

accelerate/commands/merge.py ADDED Viewed

	@@ -0,0 +1,69 @@

+#!/usr/bin/env python
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from accelerate.commands.utils import CustomArgumentParser
+from accelerate.utils import merge_fsdp_weights
+# Description passed to the `merge-weights` parser created in merge_command_parser()
+description = """Utility to merge the weights from multiple FSDP checkpoints into a single combined checkpoint. Should be used if
+`SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}`.
+This is a CPU-bound process and requires enough RAM to load the entire model state dict."""
+def merge_command(args):
+    merge_fsdp_weights(
+        args.checkpoint_directory, args.output_path, not args.unsafe_serialization, args.remove_checkpoint_dir
+    )
+def merge_command_parser(subparsers=None):
+    if subparsers is not None:
+        parser = subparsers.add_parser("merge-weights", description=description)
+    else:
+        parser = CustomArgumentParser(description=description)
+    parser.add_argument("checkpoint_directory", type=str, help="A directory containing sharded weights saved by FSDP.")
+    parser.add_argument(
+        "output_path",
+        type=str,
+        help="The path to save the merged weights. Defaults to the current directory. ",
+    )
+    parser.add_argument(
+        "--unsafe_serialization",
+        action="store_true",
+        default=False,
+        help="Whether to save the merged weights as `.bin` rather than `.safetensors` (not recommended).",
+    )
+    parser.add_argument(
+        "--remove_checkpoint_dir",
+        action="store_true",
+        help="Whether to remove the checkpoint directory after merging.",
+        default=False,
+    )
+    if subparsers is not None:
+        parser.set_defaults(func=merge_command)
+    return parser
+def main():
+    parser = merge_command_parser()
+    args = parser.parse_args()
+    merge_command(args)
+if __name__ == "__main__":
+    main()

accelerate/commands/test.py ADDED Viewed

	@@ -0,0 +1,65 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package
+def test_command_parser(subparsers=None):
+    if subparsers is not None:
+        parser = subparsers.add_parser("test")
+    else:
+        parser = argparse.ArgumentParser("Accelerate test command")
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        help=(
+            "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
+            "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
+            "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
+            "with 'huggingface'."
+        ),
+    )
+    if subparsers is not None:
+        parser.set_defaults(func=test_command)
+    return parser
+def test_command(args):
+    script_name = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
+    if args.config_file is None:
+        test_args = [script_name]
+    else:
+        test_args = f"--config_file={args.config_file} {script_name}".split()
+    cmd = ["accelerate-launch"] + test_args
+    result = execute_subprocess_async(cmd)
+    if result.returncode == 0:
+        print("Test is a success! You are ready for your distributed training!")
+def main():
+    parser = test_command_parser()
+    args = parser.parse_args()
+    test_command(args)
+if __name__ == "__main__":
+    main()

accelerate/commands/to_fsdp2.py ADDED Viewed

	@@ -0,0 +1,172 @@

+#!/usr/bin/env python
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import enum
+import logging
+from pathlib import Path
+import yaml
+from accelerate.commands.utils import CustomArgumentParser
+class ConversionStatus(enum.Enum):
+    """
+    Markers used as values of `ARGUMENT_KEY_MAPPING` for FSDP1 arguments that have no FSDP2 name to map to.
+    """
+    # Reported by convert_config_to_fsdp2() as "not yet implemented in FSDP2"
+    NOT_YET_IMPLEMENTED = 0
+    # Reported by convert_config_to_fsdp2() as "removed in FSDP2"
+    REMOVED = -1
+# FSDP1 config key -> FSDP2 config key, or a `ConversionStatus` marker when there is no FSDP2 key
+ARGUMENT_KEY_MAPPING = {
+    # New keys in FSDP2
+    "fsdp_version": "fsdp_version",
+    "fsdp_reshard_after_forward": "fsdp_reshard_after_forward",
+    # https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md
+    # https://huggingface.co/docs/accelerate/en/usage_guides/fsdp
+    "fsdp_auto_wrap_policy": "fsdp_auto_wrap_policy",
+    "fsdp_backward_prefetch": ConversionStatus.REMOVED,
+    "fsdp_forward_prefetch": ConversionStatus.NOT_YET_IMPLEMENTED,
+    "fsdp_cpu_ram_efficient_loading": "fsdp_cpu_ram_efficient_loading",
+    "fsdp_offload_params": "fsdp_offload_params",
+    "fsdp_sharding_strategy": "fsdp_reshard_after_forward",
+    "fsdp_state_dict_type": "fsdp_state_dict_type",
+    "fsdp_sync_module_states": ConversionStatus.REMOVED,
+    "fsdp_transformer_layer_cls_to_wrap": "fsdp_transformer_layer_cls_to_wrap",
+    "fsdp_min_num_params": "fsdp_min_num_params",
+    "fsdp_use_orig_params": ConversionStatus.REMOVED,
+    "fsdp_activation_checkpointing": "fsdp_activation_checkpointing",
+}
+# FSDP1 config key -> {old value: new value}; values missing from the inner mapping are kept unchanged by
+# convert_config_to_fsdp2(). Both entries translate a sharding strategy name into a boolean.
+ARGUMENT_VALUE_MAPPING = {
+    "fsdp_sharding_strategy": {
+        "FULL_SHARD": True,
+        "SHARD_GRAD_OP": False,
+        "HYBRID_SHARD": True,
+        "HYBRID_SHARD_ZERO2": False,
+        "NO_SHARD": False,
+    },
+    "fsdp_reshard_after_forward": {  # Needed to convert newly created configs using FSDP1 to FSDP2
+        "FULL_SHARD": True,
+        "SHARD_GRAD_OP": False,
+        "HYBRID_SHARD": True,
+        "HYBRID_SHARD_ZERO2": False,
+        "NO_SHARD": False,
+    },
+}
+# Module-level logger used for the conversion messages
+logger = logging.getLogger(__name__)
+def _validate_to_fsdp2_args(args):
+    if not Path(args.config_file).exists():
+        raise FileNotFoundError(f"Config file {args.config_file} not found")
+    if not args.overwrite and args.output_file is None:
+        raise ValueError("If --overwrite is not set, --output_file must be provided")
+    if not args.overwrite and Path(args.output_file).exists():
+        raise FileExistsError(f"Output file {args.output_file} already exists and --overwrite is not set")
+def convert_config_to_fsdp2(config: dict) -> dict:
+    fsdp_config = config.get("fsdp_config", {})
+    if not fsdp_config:
+        logger.info("No FSDP config found in the config file, skipping conversion...")
+        return config
+    new_fsdp_config = {}
+    if fsdp_config.get("fsdp_version", 1) == 2:
+        logger.warning("Config already specifies FSDP2, skipping conversion...")
+        logger.warning(
+            "If the config doesn't use new argument names, change `fsdp_version` to `1` and rerun the command."
+        )
+        return config
+    for key, value in fsdp_config.items():
+        conversion_status = ARGUMENT_KEY_MAPPING.get(key, None)
+        if isinstance(conversion_status, ConversionStatus) or conversion_status is None:
+            conversion_status = key
+            new_fsdp_config[conversion_status] = value
+            continue
+        if conversion_status == ConversionStatus.REMOVED:
+            logger.warning(f"Argument {key} has been removed in FSDP2, skipping this key...")
+            continue
+        if conversion_status == ConversionStatus.NOT_YET_IMPLEMENTED:
+            logger.warning(f"Argument {key} is not yet implemented in FSDP2, skipping this key...")
+            continue
+        if conversion_status is None:
+            logger.warning(f"Argument {key} is not being converted, skipping this key...")
+            new_fsdp_config[key] = value
+        else:
+            if key in ARGUMENT_VALUE_MAPPING:
+                value = ARGUMENT_VALUE_MAPPING[key].get(value, value)
+            new_fsdp_config[ARGUMENT_KEY_MAPPING[key]] = value
+    new_fsdp_config["fsdp_version"] = 2
+    config["fsdp_config"] = new_fsdp_config
+    return config
+def to_fsdp2_command_parser(subparsers=None):
+    description = "Convert an Accelerate config from FSDP1 to FSDP2"
+    if subparsers is not None:
+        parser = subparsers.add_parser("to-fsdp2", description=description)
+    else:
+        parser = CustomArgumentParser(description=description)
+    parser.add_argument("--config_file", type=str, help="The config file to convert to FSDP2", required=True)
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Overwrite the config file if it exists",
+        default=False,
+    )
+    parser.add_argument(
+        "--output_file",
+        type=str,
+        help="The path to the output file to write the converted config to. If not provided, the input file will be overwritten (if --overwrite is set)",
+        default=None,
+    )
+    if subparsers is not None:
+        parser.set_defaults(func=to_fsdp2_command)
+    return parser
+def load_config(config_file: str) -> dict:
+    with open(config_file) as f:
+        config = yaml.safe_load(f)
+    if not config:
+        raise ValueError("Config file is empty")
+    return config
+def to_fsdp2_command(args):
+    _validate_to_fsdp2_args(args)
+    config = load_config(args.config_file)
+    if args.overwrite and args.output_file is None:
+        args.output_file = args.config_file
+    new_config = convert_config_to_fsdp2(config)
+    with open(args.output_file, "w") as f:
+        yaml.dump(new_config, f)

accelerate/commands/tpu.py ADDED Viewed

	@@ -0,0 +1,157 @@

+#!/usr/bin/env python
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import subprocess
+from packaging.version import Version, parse
+from accelerate.commands.config.config_args import default_config_file, load_config_from_file
+# Description passed to the `tpu-config` parser created in tpu_command_parser()
+_description = "Run commands across TPU VMs for initial setup before running `accelerate launch`."
+def tpu_command_parser(subparsers=None):
+    if subparsers is not None:
+        parser = subparsers.add_parser("tpu-config", description=_description)
+    else:
+        parser = argparse.ArgumentParser("Accelerate tpu-config command", description=_description)
+    # Core arguments
+    config_args = parser.add_argument_group(
+        "Config Arguments", "Arguments that can be configured through `accelerate config`."
+    )
+    config_args.add_argument(
+        "--config_file",
+        type=str,
+        default=None,
+        help="Path to the config file to use for accelerate.",
+    )
+    config_args.add_argument(
+        "--tpu_name",
+        default=None,
+        help="The name of the TPU to use. If not specified, will use the TPU specified in the config file.",
+    )
+    config_args.add_argument(
+        "--tpu_zone",
+        default=None,
+        help="The zone of the TPU to use. If not specified, will use the zone specified in the config file.",
+    )
+    pod_args = parser.add_argument_group("TPU Arguments", "Arguments for options ran inside the TPU.")
+    pod_args.add_argument(
+        "--use_alpha",
+        action="store_true",
+        help="Whether to use `gcloud alpha` when running the TPU training script instead of `gcloud`.",
+    )
+    pod_args.add_argument(
+        "--command_file",
+        default=None,
+        help="The path to the file containing the commands to run on the pod on startup.",
+    )
+    pod_args.add_argument(
+        "--command",
+        action="append",
+        nargs="+",
+        help="A command to run on the pod. Can be passed multiple times.",
+    )
+    pod_args.add_argument(
+        "--install_accelerate",
+        action="store_true",
+        help="Whether to install accelerate on the pod. Defaults to False.",
+    )
+    pod_args.add_argument(
+        "--accelerate_version",
+        default="latest",
+        help="The version of accelerate to install on the pod. If not specified, will use the latest pypi version. Specify 'dev' to install from GitHub.",
+    )
+    pod_args.add_argument(
+        "--debug", action="store_true", help="If set, will print the command that would be run instead of running it."
+    )
+    if subparsers is not None:
+        parser.set_defaults(func=tpu_command_launcher)
+    return parser
+def tpu_command_launcher(args):
+    """
+    Builds the `gcloud ... tpu-vm ssh` command for the requested pod commands and runs it on all workers.
+    Args:
+        args (`argparse.Namespace`):
+            Parsed command-line arguments. Unset `command`, `command_file`, `tpu_name` and `tpu_zone` values are
+            filled in from the config file when one is passed or the default config file exists.
+    Raises:
+        `ValueError`: If neither a command file nor a command is available after applying the config defaults.
+        `subprocess.CalledProcessError`: If the `gcloud` invocation exits with a non-zero status.
+    """
+    defaults = None
+    # Get the default from the config file if it exists.
+    if args.config_file is not None or os.path.isfile(default_config_file):
+        defaults = load_config_from_file(args.config_file)
+        if not args.command_file and defaults.command_file is not None and not args.command:
+            args.command_file = defaults.command_file
+        if not args.command and defaults.commands is not None:
+            args.command = defaults.commands
+        if not args.tpu_name:
+            args.tpu_name = defaults.tpu_name
+        if not args.tpu_zone:
+            args.tpu_zone = defaults.tpu_zone
+    # Turn the requested version into the argument handed to `pip install`
+    if args.accelerate_version == "dev":
+        args.accelerate_version = "git+https://github.com/huggingface/accelerate.git"
+    elif args.accelerate_version == "latest":
+        args.accelerate_version = "accelerate -U"
+    elif isinstance(parse(args.accelerate_version), Version):
+        args.accelerate_version = f"accelerate=={args.accelerate_version}"
+    if not args.command_file and not args.command:
+        raise ValueError("You must specify either a command file or a command to run on the pod.")
+    if args.command_file:
+        with open(args.command_file) as f:
+            args.command = [f.read().splitlines()]
+    # To turn list of lists into list of strings
+    if isinstance(args.command[0], list):
+        args.command = [line for cmd in args.command for line in cmd]
+    # Default to the shared folder and install accelerate
+    new_cmd = ["cd /usr/share"]
+    if args.install_accelerate:
+        new_cmd += [f"pip install {args.accelerate_version}"]
+    new_cmd += args.command
+    args.command = "; ".join(new_cmd)
+    # Then send it to gcloud
+    # Eventually try to use google-api-core to do this instead of subprocess
+    cmd = ["gcloud"]
+    if args.use_alpha:
+        cmd += ["alpha"]
+    cmd += [
+        "compute",
+        "tpus",
+        "tpu-vm",
+        "ssh",
+        args.tpu_name,
+        "--zone",
+        args.tpu_zone,
+        "--command",
+        args.command,
+        "--worker",
+        "all",
+    ]
+    if args.debug:
+        print(f"Running {' '.join(cmd)}")
+        return
+    # `check=True` makes a failing `gcloud` call raise instead of falling through to the success message
+    subprocess.run(cmd, check=True)
+    print("Successfully setup pod.")
+def main():
+    """Parses the TPU command-line arguments and passes them to `tpu_command_launcher`."""
+    args = tpu_command_parser().parse_args()
+    tpu_command_launcher(args)

accelerate/commands/utils.py ADDED Viewed

	@@ -0,0 +1,123 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+class _StoreAction(argparse.Action):
+    """
+    Store action that accepts either `-` or `_` in an option name and records on the namespace which destinations
+    were explicitly passed.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        expanded = []
+        for name in self.option_strings:
+            expanded.append(name)
+            # Only an underscore after the first two characters triggers the dashed alias
+            if name[2:].find("_") != -1:
+                expanded.append(name.replace("_", "-"))
+        self.option_strings = expanded
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, values)
+        # The `nondefault` set is created on first use and collects every destination set through this action
+        try:
+            tracked = namespace.nondefault
+        except AttributeError:
+            tracked = namespace.nondefault = set()
+        tracked.add(self.dest)
+class _StoreConstAction(_StoreAction):
+    """
+    Counterpart of `argparse._StoreConstAction` built on the custom `_StoreAction`: takes no values and stores `const`.
+    """
+    def __init__(self, option_strings, dest, const, default=None, required=False, help=None):
+        init_kwargs = {
+            "option_strings": option_strings,
+            "dest": dest,
+            "nargs": 0,
+            "const": const,
+            "default": default,
+            "required": required,
+            "help": help,
+        }
+        super().__init__(**init_kwargs)
+    def __call__(self, parser, namespace, values, option_string=None):
+        # The incoming `values` are ignored; the configured constant is what gets stored
+        super().__call__(parser, namespace, self.const, option_string)
+class _StoreTrueAction(_StoreConstAction):
+    """
+    Counterpart of `argparse._StoreTrueAction` built on the custom `_StoreConstAction`, with `const` fixed to `True`.
+    """
+    def __init__(self, option_strings, dest, default=None, required=False, help=None):
+        super().__init__(
+            option_strings=option_strings,
+            dest=dest,
+            const=True,
+            default=default,
+            required=required,
+            help=help,
+        )
+class CustomArgumentGroup(argparse._ArgumentGroup):
+    """
+    Argument group that swaps the stock argparse store actions for the custom ones, so `-` or `_` can be used in the
+    arguments passed.
+    """
+    def _add_action(self, action):
+        attrs = vars(action)
+        # The most specific argparse class is tested first, falling back to the plain store action
+        if isinstance(action, argparse._StoreTrueAction):
+            replacement = _StoreTrueAction(
+                option_strings=attrs["option_strings"],
+                dest=attrs["dest"],
+                default=attrs["default"],
+                required=attrs["required"],
+                help=attrs["help"],
+            )
+        elif isinstance(action, argparse._StoreConstAction):
+            replacement = _StoreConstAction(
+                option_strings=attrs["option_strings"],
+                dest=attrs["dest"],
+                const=attrs["const"],
+                default=attrs["default"],
+                required=attrs["required"],
+                help=attrs["help"],
+            )
+        elif isinstance(action, argparse._StoreAction):
+            replacement = _StoreAction(**attrs)
+        else:
+            # Any other action type is registered unchanged
+            replacement = action
+        return super()._add_action(replacement)
+class CustomArgumentParser(argparse.ArgumentParser):
+    """
+    Custom argument parser that allows for the use of `-` or `_` in arguments passed and overrides the help for each
+    when applicable.
+    """
+    def add_argument(self, *args, **kwargs):
+        """
+        Same as `argparse.ArgumentParser.add_argument`, but uses the custom `_StoreAction` when no `action` is given
+        and maps `action="store_true"` to the custom `_StoreTrueAction`. Any other explicit `action` is passed
+        through untouched.
+        Returns:
+            `argparse.Action`: The action created for the argument, as returned by the parent implementation.
+        """
+        if "action" in kwargs:
+            # Translate action -> class
+            if kwargs["action"] == "store_true":
+                kwargs["action"] = _StoreTrueAction
+        else:
+            kwargs["action"] = _StoreAction
+        # Hand the created action back so callers get the same return value as with a plain `ArgumentParser`
+        return super().add_argument(*args, **kwargs)
+    def add_argument_group(self, *args, **kwargs):
+        """Creates a `CustomArgumentGroup` bound to this parser, registers it and returns it."""
+        group = CustomArgumentGroup(self, *args, **kwargs)
+        self._action_groups.append(group)
+        return group

accelerate/test_utils/__init__.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .testing import (
+    DEFAULT_LAUNCH_COMMAND,
+    are_the_same_tensors,
+    assert_exception,
+    capture_call_output,
+    device_count,
+    execute_subprocess_async,
+    get_launch_command,
+    get_torch_dist_unique_port,
+    memory_allocated_func,
+    path_in_accelerate_package,
+    pytest_xdist_worker_id,
+    require_bnb,
+    require_cpu,
+    require_cuda,
+    require_cuda_or_hpu,
+    require_cuda_or_xpu,
+    require_fp8,
+    require_fp16,
+    require_huggingface_suite,
+    require_mlu,
+    require_mps,
+    require_multi_device,
+    require_multi_gpu,
+    require_multi_gpu_or_xpu,
+    require_multi_xpu,
+    require_musa,
+    require_non_cpu,
+    require_non_hpu,
+    require_non_torch_xla,
+    require_non_xpu,
+    require_npu,
+    require_pippy,
+    require_sdaa,
+    require_single_device,
+    require_single_gpu,
+    require_single_xpu,
+    require_torch_min_version,
+    require_torchao,
+    require_torchvision,
+    require_tpu,
+    require_transformer_engine,
+    require_transformer_engine_mxfp8,
+    require_xpu,
+    run_first,
+    skip,
+    slow,
+    torch_device,
+)
+from .training import RegressionDataset, RegressionModel
+from .scripts import test_script, test_sync, test_ops  # isort: skip

accelerate/test_utils/examples.py ADDED Viewed

	@@ -0,0 +1,148 @@

+#!/usr/bin/env python
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A collection of utilities for comparing `examples/complete_*_example.py` scripts with the capabilities inside of each
+`examples/by_feature` example. `compare_against_test` is the main function that should be used when testing, while the
+others are used to either get the code that matters, or to preprocess them (such as stripping comments)
+"""
+import os
+from typing import Optional
+def get_function_contents_by_name(lines: list[str], name: str):
+    """
+    Extracts a function from `lines` of segmented source code with the name `name`.
+    Args:
+        lines (`List[str]`):
+            Source code of a script separated by line.
+        name (`str`):
+            The name of the function to extract. Should be either `training_function` or `main`
+    Returns:
+        `List[str]` or `None`: The lines starting at the first one containing `def {name}` and stopping before the
+        first later line containing the end marker (`def main` for `training_function`, `if __name__` for `main`).
+        If the end marker never appears, everything up to the end of `lines` is returned. `None` is returned when no
+        line contains `def {name}`.
+    Raises:
+        `ValueError`: If `name` is neither `training_function` nor `main`.
+    """
+    if name not in ("training_function", "main"):
+        raise ValueError(f"Incorrect function name passed: {name}, choose either 'main' or 'training_function'")
+    start_marker = f"def {name}"
+    end_marker = "def main" if name == "training_function" else "if __name__"
+    good_lines, found_start = [], False
+    for line in lines:
+        if not found_start:
+            if start_marker in line:
+                found_start = True
+                good_lines.append(line)
+            continue
+        if end_marker in line:
+            return good_lines
+        good_lines.append(line)
+    # The function ran to the end of the input without an end marker: keep what was collected instead of dropping it
+    if found_start:
+        return good_lines
+    return None
+def clean_lines(lines: list[str]):
+    """
+    Returns the entries of `lines` that are neither comment lines (first non-whitespace character is '#') nor
+    exactly a single newline character.
+    Args:
+        lines (`List[str]`):
+            Source code of a script separated by line.
+    """
+    kept = []
+    for entry in lines:
+        if entry == "\n" or entry.lstrip().startswith("#"):
+            continue
+        kept.append(entry)
+    return kept
+def compare_against_test(
+    base_filename: str, feature_filename: str, parser_only: bool, secondary_filename: Optional[str] = None
+):
+    """
+    Tests whether the additional code inside of `feature_filename` was implemented in `base_filename`. This should be
+    used when testing to see if `complete_*_.py` examples have all of the implementations from each of the
+    `examples/by_feature/*` scripts.
+    It utilizes `nlp_example.py` to extract out all of the repeated training code, so that only the new additional code
+    is examined and checked. If something *other* than `nlp_example.py` should be used, such as `cv_example.py` for the
+    `complete_cv_example.py` script, it should be passed in for the `secondary_filename` parameter.
+    Args:
+        base_filename (`str` or `os.PathLike`):
+            The filepath of a single "complete" example script to test, such as `examples/complete_cv_example.py`
+        feature_filename (`str` or `os.PathLike`):
+            The filepath of a single feature example script. The contents of this script are checked to see if they
+            exist in `base_filename`
+        parser_only (`bool`):
+            Whether to compare only the `main()` sections in both files, or to compare the contents of
+            `training_function()`
+        secondary_filename (`str`, *optional*):
+            A potential secondary filepath that should be included in the check. This function extracts the base
+            functionalities off of "examples/nlp_example.py", so if `base_filename` is a script other than
+            `complete_nlp_example.py`, the template script should be included here. Such as `examples/cv_example.py`
+    Returns:
+        `List[str]`: The lines of new code found in `feature_filename` that are missing from the new code found in
+        `base_filename`. When `secondary_filename` is given, lines of `nlp_example.py` that do not appear in the
+        secondary script's extracted function are also removed from the result.
+    """
+    with open(base_filename) as f:
+        base_file_contents = f.readlines()
+    # The template path is relative, so it is resolved against the current working directory
+    with open(os.path.abspath(os.path.join("examples", "nlp_example.py"))) as f:
+        full_file_contents = f.readlines()
+    with open(feature_filename) as f:
+        feature_file_contents = f.readlines()
+    if secondary_filename is not None:
+        with open(secondary_filename) as f:
+            secondary_file_contents = f.readlines()
+    # This is our base, we remove all the code from here in our `full_filename` and `feature_filename` to find the new content
+    if parser_only:
+        base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "main"))
+        full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "main"))
+        feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "main"))
+        if secondary_filename is not None:
+            secondary_file_func = clean_lines(get_function_contents_by_name(secondary_file_contents, "main"))
+    else:
+        base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "training_function"))
+        full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "training_function"))
+        feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "training_function"))
+        if secondary_filename is not None:
+            secondary_file_func = clean_lines(
+                get_function_contents_by_name(secondary_file_contents, "training_function")
+            )
+    # Dataloader creation line that is ignored (after stripping leading whitespace) in both passes below
+    _dl_line = "train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)\n"
+    # Specific code in our script that differs from the full version, aka what is new
+    new_feature_code = []
+    passed_idxs = []  # We keep track of the idxs just in case it's a repeated statement
+    # Lines are pulled from the iterator rather than indexed, and the loop runs `len(feature_file_func) - 1` times
+    it = iter(feature_file_func)
+    for i in range(len(feature_file_func) - 1):
+        # NOTE(review): indices are only appended after this test for the same `i`, so it looks like it never skips
+        if i not in passed_idxs:
+            line = next(it)
+            if (line not in full_file_func) and (line.lstrip() != _dl_line):
+                if "TESTING_MOCKED_DATALOADERS" not in line:
+                    new_feature_code.append(line)
+                    passed_idxs.append(i)
+                else:
+                    # Skip over the `config['num_epochs'] = 2` statement
+                    _ = next(it)
+    # Extract out just the new parts from the full_file_training_func
+    new_full_example_parts = []
+    passed_idxs = []  # We keep track of the idxs just in case it's a repeated statement
+    for i, line in enumerate(base_file_func):
+        if i not in passed_idxs:
+            if (line not in full_file_func) and (line.lstrip() != _dl_line):
+                if "TESTING_MOCKED_DATALOADERS" not in line:
+                    new_full_example_parts.append(line)
+                    passed_idxs.append(i)
+    # Finally, get the overall diff
+    diff_from_example = [line for line in new_feature_code if line not in new_full_example_parts]
+    if secondary_filename is not None:
+        # Lines of the `nlp_example.py` template that are absent from the secondary script's extracted function
+        diff_from_two = [line for line in full_file_contents if line not in secondary_file_func]
+        diff_from_example = [line for line in diff_from_example if line not in diff_from_two]
+    return diff_from_example

accelerate/test_utils/scripts/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

accelerate/test_utils/scripts/external_deps/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

accelerate/test_utils/scripts/external_deps/test_checkpointing.py ADDED Viewed

	@@ -0,0 +1,269 @@

+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+import evaluate
+import torch
+from datasets import load_dataset
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
+from accelerate import Accelerator, DistributedType
+from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
+# NOTE(review): not referenced anywhere else in this script — confirm before relying on or removing it
+MAX_GPU_BATCH_SIZE = 16
+# Batch size of the evaluation dataloader built in `get_dataloaders`
+EVAL_BATCH_SIZE = 32
+def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
+    """
+    Builds the train and validation `DataLoader`s for the `glue` `mrpc` dataset.
+    Args:
+        accelerator (`Accelerator`):
+            An `Accelerator` object, used to pick the padding strategy
+        batch_size (`int`, *optional*):
+            The batch size of the train DataLoader. The validation DataLoader uses `EVAL_BATCH_SIZE`.
+        model_name (`str`, *optional*):
+            Name or path handed to `AutoTokenizer.from_pretrained`.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    raw_datasets = load_dataset("glue", "mrpc")
+    def tokenize_function(examples):
+        # max_length=None => use the model max length (it's actually the default)
+        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+    # Tokenize every split, then rename 'label' to 'labels', the column name the transformers models expect
+    tokenized_datasets = raw_datasets.map(
+        tokenize_function,
+        batched=True,
+        remove_columns=["idx", "sentence1", "sentence2"],
+        load_from_cache_file=False,
+    ).rename_column("label", "labels")
+    def collate_fn(examples):
+        if accelerator.distributed_type != DistributedType.XLA:
+            return tokenizer.pad(examples, padding="longest", return_tensors="pt")
+        # On TPU it's best to pad everything to the same length or training will be very slow.
+        return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
+    train_split = tokenized_datasets["train"]
+    train_dataloader = DataLoader(train_split, shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
+    eval_split = tokenized_datasets["validation"]
+    eval_dataloader = DataLoader(eval_split, shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE)
+    return train_dataloader, eval_dataloader
+def evaluation_loop(accelerator, model, eval_dataloader, metric):
+    """
+    Runs `model` in eval mode over `eval_dataloader`, feeds the gathered predictions and labels to `metric` and
+    returns the "accuracy" entry of the computed result.
+    """
+    model.eval()
+    seen = 0
+    for step, batch in enumerate(eval_dataloader):
+        # We could avoid this line since we set the accelerator with `device_placement=True`.
+        batch.to(accelerator.device)
+        with torch.no_grad():
+            outputs = model(**batch)
+        predicted = outputs.logits.argmax(dim=-1)
+        # It is slightly faster to call this once, than multiple times
+        predicted, labels = accelerator.gather((predicted, batch["labels"]))
+        if accelerator.use_distributed:
+            # If we are in a multiprocess environment, the last batch has duplicates
+            if step != len(eval_dataloader) - 1:
+                seen += labels.shape[0]
+            else:
+                predicted = predicted[: len(eval_dataloader.dataset) - seen]
+                labels = labels[: len(eval_dataloader.dataset) - seen]
+        metric.add_batch(predictions=predicted, references=labels)
+    return metric.compute()["accuracy"]
+def training_function(config, args):
+    """
+    Trains a sequence classification model and checkpoints it after every epoch, or verifies a saved checkpoint.
+    After each epoch the accelerator state is saved to `<output_dir>/epoch_<epoch>` and the main process writes the
+    accuracy, learning rates, epoch and step to `<output_dir>/state_<epoch>.json`. When `args.resume_from_checkpoint`
+    is set, that state is loaded, the model is re-evaluated, the results are asserted against the matching
+    `state_<epoch>.json` and the function returns without training.
+    Args:
+        config (`dict`):
+            Hyper-parameters; the keys "lr", "num_epochs", "seed" and "batch_size" are read.
+        args:
+            Parsed command-line arguments; `model_name_or_path`, `output_dir`, `resume_from_checkpoint` and
+            `partial_train_epoch` are read.
+    """
+    # Initialize accelerator
+    accelerator = Accelerator()
+    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
+    lr = config["lr"]
+    num_epochs = int(config["num_epochs"])
+    seed = int(config["seed"])
+    batch_size = int(config["batch_size"])
+    model_name = args.model_name_or_path
+    set_seed(seed)
+    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name)
+    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)
+    # Instantiate optimizer: a real `AdamW` unless the DeepSpeed config defines its own optimizer
+    optimizer_cls = (
+        AdamW
+        if accelerator.state.deepspeed_plugin is None
+        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
+        else DummyOptim
+    )
+    optimizer = optimizer_cls(params=model.parameters(), lr=lr)
+    if accelerator.state.deepspeed_plugin is not None:
+        gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
+            "gradient_accumulation_steps"
+        ]
+    else:
+        gradient_accumulation_steps = 1
+    max_training_steps = (len(train_dataloader) * num_epochs) // gradient_accumulation_steps
+    # Instantiate scheduler: a real linear schedule unless the DeepSpeed config defines its own scheduler
+    if (
+        accelerator.state.deepspeed_plugin is None
+        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
+    ):
+        lr_scheduler = get_linear_schedule_with_warmup(
+            optimizer=optimizer,
+            num_warmup_steps=0,
+            num_training_steps=max_training_steps,
+        )
+    else:
+        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)
+    # Prepare everything
+    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
+    # prepare method.
+    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
+    )
+    # We need to keep track of how many total steps we have iterated over
+    overall_step = 0
+    # We also need to keep track of the starting epoch so files are named properly
+    starting_epoch = 0
+    metric = evaluate.load("glue", "mrpc")
+    ending_epoch = num_epochs
+    if args.partial_train_epoch is not None:
+        ending_epoch = args.partial_train_epoch
+    if args.resume_from_checkpoint:
+        accelerator.load_state(args.resume_from_checkpoint)
+        # The epoch number is the run of digits right after "epoch_" in the checkpoint path
+        epoch_string = args.resume_from_checkpoint.split("epoch_")[1]
+        state_epoch_num = ""
+        for char in epoch_string:
+            if char.isdigit():
+                state_epoch_num += char
+            else:
+                break
+        starting_epoch = int(state_epoch_num) + 1
+        accuracy = evaluation_loop(accelerator, model, eval_dataloader, metric)
+        accelerator.print("resumed checkpoint performance:", accuracy)
+        accelerator.print("resumed checkpoint's scheduler's lr:", lr_scheduler.get_lr()[0])
+        accelerator.print("resumed optimizers's lr:", optimizer.param_groups[0]["lr"])
+        # Compare the restored state with what was recorded when the checkpoint's epoch finished
+        with open(os.path.join(args.output_dir, f"state_{starting_epoch - 1}.json")) as f:
+            resumed_state = json.load(f)
+            assert resumed_state["accuracy"] == accuracy, "Accuracy mismatch, loading from checkpoint failed"
+            assert resumed_state["lr"] == lr_scheduler.get_lr()[0], (
+                "Scheduler learning rate mismatch, loading from checkpoint failed"
+            )
+            assert resumed_state["optimizer_lr"] == optimizer.param_groups[0]["lr"], (
+                "Optimizer learning rate mismatch, loading from checkpoint failed"
+            )
+            assert resumed_state["epoch"] == starting_epoch - 1, "Epoch mismatch, loading from checkpoint failed"
+            # A resume run only verifies the checkpoint; no further training happens
+            return
+    # Now we train the model
+    state = {}
+    for epoch in range(starting_epoch, ending_epoch):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % gradient_accumulation_steps == 0:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+            overall_step += 1
+        output_dir = f"epoch_{epoch}"
+        output_dir = os.path.join(args.output_dir, output_dir)
+        accelerator.save_state(output_dir)
+        # Record the metrics that a later resume run asserts against
+        accuracy = evaluation_loop(accelerator, model, eval_dataloader, metric)
+        state["accuracy"] = accuracy
+        state["lr"] = lr_scheduler.get_lr()[0]
+        state["optimizer_lr"] = optimizer.param_groups[0]["lr"]
+        state["epoch"] = epoch
+        state["step"] = overall_step
+        accelerator.print(f"epoch {epoch}:", state)
+        accelerator.wait_for_everyone()
+        # Only the main process writes the JSON file
+        if accelerator.is_main_process:
+            with open(os.path.join(args.output_dir, f"state_{epoch}.json"), "w") as f:
+                json.dump(state, f)
+    accelerator.end_training()
+def main():
+    """Parses the command-line arguments, builds the hyper-parameter config and runs `training_function`."""
+    # The description previously advertised peak GPU memory tracking, which this script does not do
+    parser = argparse.ArgumentParser(
+        description="Simple example of training script saving checkpoints and verifying resuming from them."
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="bert-base-cased",
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=False,
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=".",
+        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help="If the training should continue from a checkpoint folder.",
+    )
+    parser.add_argument(
+        "--partial_train_epoch",
+        type=int,
+        default=None,
+        help="If passed, the training will stop after this number of epochs.",
+    )
+    parser.add_argument(
+        "--num_epochs",
+        type=int,
+        default=2,
+        help="Number of train epochs.",
+    )
+    args = parser.parse_args()
+    # Only the number of epochs is configurable from the command line; the other hyper-parameters are fixed
+    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}
+    training_function(config, args)
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/external_deps/test_ds_alst_ulysses_sp.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Test script for verifying ALST/Ulysses SP works
+"""
+import torch
+from deepspeed.runtime.utils import move_to_device
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from accelerate import Accelerator
+from accelerate.utils import ParallelismConfig, set_seed
+from accelerate.utils.dataclasses import DeepSpeedSequenceParallelConfig
+# Module-level setup: seed, sequence-parallel configuration, accelerator, model and a synthetic dataset
+set_seed(42)
+# Used as the sequence-parallel size below
+world_size = 2
+model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"
+micro_batch_size = 1
+parallelism_config = ParallelismConfig(
+    sp_backend="deepspeed",
+    sp_size=world_size,
+    # dp_shard_size=1, # set if dp is wanted as well
+    sp_handler=DeepSpeedSequenceParallelConfig(
+        sp_seq_length=256,
+        sp_seq_length_is_variable=True,
+        sp_attn_implementation="sdpa",
+    ),
+)
+accelerator = Accelerator(
+    parallelism_config=parallelism_config,
+)
+# NOTE(review): `tokenizer` is not used in the rest of this script — confirm whether it is still needed
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+# Synthetic data: `samples` rows of `seqlen` consecutive integers each
+samples = 4
+seqlen = 32
+# Token ids start at 101 (range starting at 1, offset by 100); position ids start at 0
+input_ids = torch.arange(1, seqlen * samples + 1).view(-1, seqlen) + 100
+position_ids = torch.arange(seqlen * samples).view(-1, seqlen)
+ds = torch.utils.data.TensorDataset(input_ids, position_ids)
+def collate_fn(batch):
+    """
+    Builds the model inputs from the first sample of `batch` (only `batch[0]` is read), adding a leading dimension of
+    size 1 to each tensor. `labels` are the input ids themselves.
+    """
+    ids, positions = batch[0]
+    return {
+        "input_ids": ids.unsqueeze(0),
+        "position_ids": positions.unsqueeze(0),
+        "labels": ids.unsqueeze(0),
+    }
+dl = torch.utils.data.DataLoader(ds, batch_size=micro_batch_size, collate_fn=collate_fn)
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
+rank = torch.distributed.get_rank()
+# Print the dataloader length before and after `accelerator.prepare` so the two can be compared
+if rank == 0:
+    print(f"DL orig: {len(dl)} samples")
+model, optimizer, dl = accelerator.prepare(model, optimizer, dl)
+if rank == 0:
+    print(f"DL w/ adapter: {len(dl)} samples")
+sp_size = parallelism_config.sp_size if parallelism_config else 1
+# `sp_group` and `sp_world_size` only exist when sequence parallelism is enabled; they are only used under the same
+# `sp_size > 1` condition inside the loop
+if sp_size > 1:
+    from deepspeed.utils import groups
+    sp_group = groups._get_sequence_parallel_group()
+    sp_world_size = parallelism_config.sp_size
+unwrapped_model = accelerator.unwrap_model(model)
+# Normal training loop
+# NOTE(review): the loop variable `iter` shadows the builtin of the same name for the rest of the script
+for iter, batch in enumerate(dl):
+    optimizer.zero_grad()
+    if rank == 0:
+        print(f"batch {iter}: seqlen: {len(batch['input_ids'][0])}")
+    batch = move_to_device(batch, model.device)
+    outputs = model(**batch)
+    # `collate_fn` does not create this key; presumably the prepared dataloader adds it — TODO confirm
+    shift_labels = batch["shift_labels"]
+    loss = unwrapped_model.loss_function(
+        logits=outputs.logits,
+        labels=None,
+        shift_labels=shift_labels,
+        vocab_size=unwrapped_model.config.vocab_size,
+    )
+    if sp_size > 1:
+        # differentiable weighted per-shard-loss aggregation across ranks
+        losses_per_rank = torch.distributed.nn.functional.all_gather(loss, group=sp_group)
+        # special dealing with SFT that has prompt tokens that aren't used in loss computation
+        good_tokens = (shift_labels != -100).view(-1).sum()
+        good_tokens_per_rank = torch.distributed.nn.functional.all_gather(good_tokens, group=sp_group)
+        # Each rank's loss is weighted by its count of non-ignored tokens; ranks with none are left out
+        total_loss = sum(
+            losses_per_rank[rank] * good_tokens_per_rank[rank]
+            for rank in range(sp_world_size)
+            if good_tokens_per_rank[rank] > 0
+        )
+        total_good_tokens = sum(good_tokens_per_rank)
+        # `max(..., 1)` guards against dividing by zero when no rank has any non-ignored token
+        loss = total_loss / max(total_good_tokens, 1)
+    if rank == 0:
+        accelerator.print(f"{iter}: {loss=}")
+    accelerator.log(dict(train_loss=loss, step=iter))
+    accelerator.backward(loss)
+    optimizer.step()
+accelerator.end_training()

accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py ADDED Viewed

	@@ -0,0 +1,331 @@

+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Test script for verifying multiple models can be utilized with Accelerate + DeepSpeed:
+Scenario 1: One model is training, another model is being used for inference/logits to impact training in some form.
+Scenario 2: Two models are training simultaneously, which means two optimizers, etc.
+"""
+import argparse
+from pathlib import Path
+import evaluate
+import torch
+from datasets import load_dataset
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
+from accelerate import Accelerator, DeepSpeedPlugin, DistributedType
+from accelerate.state import AcceleratorState
+from accelerate.utils.deepspeed import get_active_deepspeed_plugin
+EVAL_BATCH_SIZE = 16
+class NoiseModel(torch.nn.Module):
+    """Tiny module that rescales an incoming loss by a single scalar `noise_factor` parameter."""
+    def __init__(self, noise_factor=0.1):
+        super().__init__()
+        # The scale is held as a float32 scalar wrapped in an `nn.Parameter`.
+        initial_scale = torch.tensor(noise_factor, dtype=torch.float32)
+        self.noise_factor = torch.nn.Parameter(initial_scale)
+    def forward(self, loss):
+        """Return `loss` multiplied by the stored `noise_factor`."""
+        scaled_loss = loss * self.noise_factor
+        return scaled_loss
+def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
+    """
+    Build the train and validation `DataLoader`s for the GLUE MRPC dataset.
+    Args:
+        accelerator (`Accelerator`):
+            Only its `distributed_type` is read, to pick the padding strategy used when collating.
+        batch_size (`int`, *optional*):
+            Batch size of the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.
+        model_name (`str`, *optional*):
+            Identifier handed to `AutoTokenizer.from_pretrained` to load the tokenizer.
+    Returns:
+        A `(train_dataloader, eval_dataloader)` tuple.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    raw_splits = load_dataset("glue", "mrpc")
+    def tokenize_function(examples):
+        # max_length=None => use the model max length (it's actually the default)
+        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+    def collate_fn(examples):
+        # XLA (TPU) batches are padded to a fixed length of 128; every other backend pads to the longest sample.
+        if accelerator.distributed_type == DistributedType.XLA:
+            pad_kwargs = {"padding": "max_length", "max_length": 128}
+        else:
+            pad_kwargs = {"padding": "longest"}
+        return tokenizer.pad(examples, return_tensors="pt", **pad_kwargs)
+    # Tokenize every split, dropping the raw text columns, then rename 'label' to 'labels', the name the
+    # transformers models expect.
+    encoded_splits = raw_splits.map(
+        tokenize_function, batched=True, remove_columns=["idx", "sentence1", "sentence2"], load_from_cache_file=False
+    ).rename_column("label", "labels")
+    train_loader = DataLoader(encoded_splits["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
+    eval_loader = DataLoader(
+        encoded_splits["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
+    )
+    return train_loader, eval_loader
+# Resolve this script's absolute location so the DeepSpeed config files can be addressed relative to it.
+test_file_path = __file__
+path = Path(test_file_path).resolve()
+# Six `.parent` hops up from this file; presumably that is the repository root holding `tests/deepspeed` - TODO confirm.
+test_file_dir_str = str(path.parent.parent.parent.parent.parent.parent)
+# Create our DS plugins
+# We use custom schedulers and optimizers, hence `model_only`
+# Maps a short stage name to the path of the matching DeepSpeed JSON config.
+ds_config_file = dict(
+    zero2=f"{test_file_dir_str}/tests/deepspeed/ds_config_zero2_model_only.json",
+    zero3=f"{test_file_dir_str}/tests/deepspeed/ds_config_zero3_model_only.json",
+)
+def single_model_training(config, args):
+    """
+    Scenario 1: train one model under the `training` (ZeRO-2 config) plugin while a `NoiseModel`, prepared under
+    the `inference` (ZeRO-3 config) plugin, contributes a no-grad term to the loss.
+    Args:
+        config (`dict`):
+            Reads the `"num_epochs"`, `"batch_size"` and `"lr"` keys.
+        args:
+            Parsed command-line arguments; only `model_name_or_path` is read.
+    Raises:
+        AssertionError: If the active DeepSpeed plugin is not the expected one, or if the best accuracy seen
+            across epochs is not strictly greater than the epoch-0 accuracy.
+    """
+    # Training a single model, we have a `noise` model that is untrainable used to inject some noise into the training process
+    num_epochs = config["num_epochs"]
+    zero2_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero2"])
+    zero3_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero3"])
+    deepspeed_plugins = {"training": zero2_plugin, "inference": zero3_plugin}
+    # Initialize accelerator
+    accelerator = Accelerator(
+        deepspeed_plugins=deepspeed_plugins,
+        mixed_precision="bf16",
+    )
+    # Initialize model under zero2 plugin
+    # The assert checks that the `training` plugin is the active one right after construction.
+    assert get_active_deepspeed_plugin(accelerator.state) is zero2_plugin
+    train_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
+    train_dataloader, eval_dataloader = get_dataloaders(
+        accelerator, batch_size=config["batch_size"], model_name=args.model_name_or_path
+    )
+    # The linear schedule spans every optimizer step: batches per epoch times number of epochs.
+    max_training_steps = len(train_dataloader) * config["num_epochs"]
+    optimizer = AdamW(train_model.parameters(), lr=config["lr"])
+    lr_scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
+    )
+    train_dataloader, eval_dataloader, train_model, optimizer, lr_scheduler = accelerator.prepare(
+        train_dataloader, eval_dataloader, train_model, optimizer, lr_scheduler
+    )
+    # Now prepare the model under zero3 plugin
+    accelerator.state.select_deepspeed_plugin("inference")
+    assert get_active_deepspeed_plugin(accelerator.state) is zero3_plugin
+    inference_model = NoiseModel()
+    inference_model = accelerator.prepare(inference_model)
+    inference_model.eval()
+    # Run training loop
+    # Switch back to the `training` plugin before any training step runs.
+    accelerator.state.select_deepspeed_plugin("training")
+    # We also need to keep track of the starting epoch so files are named properly
+    starting_epoch = 0
+    # Now we train the model
+    best_performance = 0
+    metric = evaluate.load("glue", "mrpc")
+    # Accuracy per epoch, keyed as "epoch-<n>".
+    performance_metric = {}
+    for epoch in range(starting_epoch, num_epochs):
+        train_model.train()
+        # NOTE(review): this puts the noise model back in train mode every epoch even though it was set to
+        # eval() after `prepare` and is only ever called under `torch.no_grad()` - confirm this is intended.
+        inference_model.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(train_model):
+                outputs_1 = train_model(**batch)
+                # The noise term is computed without gradient tracking, so only `train_model` receives gradients.
+                with torch.no_grad():
+                    outputs_2 = inference_model(outputs_1.loss)
+                # Combine the losses
+                loss = outputs_1.loss + outputs_2
+                accelerator.backward(loss)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+        train_model.eval()
+        for step, batch in enumerate(eval_dataloader):
+            with torch.no_grad():
+                outputs = train_model(**batch)
+            predictions = outputs.logits.argmax(dim=-1)
+            # It is slightly faster to call this once, than multiple times
+            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
+            metric.add_batch(
+                predictions=predictions,
+                references=references,
+            )
+        eval_metric = metric.compute()
+        # Use accelerator.print to print only on the main process.
+        accelerator.print(f"epoch {epoch}:", eval_metric)
+        performance_metric[f"epoch-{epoch}"] = eval_metric["accuracy"]
+        if best_performance < eval_metric["accuracy"]:
+            best_performance = eval_metric["accuracy"]
+    # Strict comparison: some epoch after the first must beat the epoch-0 accuracy for the test to pass.
+    assert best_performance > performance_metric["epoch-0"]
+def multiple_model_training(config, args):
+    """
+    Scenario 2: train two copies of the same sequence-classification model side by side, one prepared under the
+    `zero2` plugin and one under the `zero3` plugin, each with its own optimizer and scheduler.
+    Args:
+        config (`dict`):
+            Reads the `"num_epochs"`, `"batch_size"` and `"lr"` keys.
+        args:
+            Parsed command-line arguments; only `model_name_or_path` is read.
+    Raises:
+        AssertionError: If a plugin/engine identity check fails, or if either model's best accuracy is not
+            strictly greater than its epoch-0 accuracy.
+    """
+    # This will essentially be like a k-fold model, but one model is Zero-2 and another model is Zero-3
+    num_epochs = config["num_epochs"]
+    zero2_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero2"])
+    zero3_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero3"])
+    deepspeed_plugins = {"zero2": zero2_plugin, "zero3": zero3_plugin}
+    # Initialize accelerator
+    zero2_accelerator = Accelerator(
+        deepspeed_plugins=deepspeed_plugins,
+        mixed_precision="bf16",
+    )
+    # Since an `AcceleratorState` has already been made, we can just reuse it here
+    zero3_accelerator = Accelerator()
+    # Initialize model under zero2 plugin
+    assert get_active_deepspeed_plugin(zero2_accelerator.state) is zero2_plugin
+    zero2_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
+    train_dataloader, eval_dataloader = get_dataloaders(
+        zero2_accelerator, batch_size=config["batch_size"], model_name=args.model_name_or_path
+    )
+    # Both schedulers are sized for the same number of steps: batches per epoch times number of epochs.
+    max_training_steps = len(train_dataloader) * config["num_epochs"]
+    zero2_optimizer = AdamW(zero2_model.parameters(), lr=config["lr"])
+    zero2_lr_scheduler = get_linear_schedule_with_warmup(
+        zero2_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
+    )
+    # The dataloaders are prepared once, here, by the zero2 accelerator and then shared by both models.
+    train_dataloader, eval_dataloader, zero2_model, zero2_optimizer, zero2_lr_scheduler = zero2_accelerator.prepare(
+        train_dataloader, eval_dataloader, zero2_model, zero2_optimizer, zero2_lr_scheduler
+    )
+    assert zero2_accelerator.deepspeed_engine_wrapped.engine is zero2_model
+    # now do Zero3
+    zero3_accelerator.state.select_deepspeed_plugin("zero3")
+    # Copy the micro batch size from the zero2 config into the zero3 config; no dataloader is passed to the
+    # zero3 `prepare` call below (presumably the value cannot be inferred there - TODO confirm).
+    zero3_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = zero2_plugin.deepspeed_config[
+        "train_micro_batch_size_per_gpu"
+    ]
+    assert get_active_deepspeed_plugin(zero3_accelerator.state) is zero3_plugin
+    zero3_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
+    zero3_optimizer = AdamW(zero3_model.parameters(), lr=config["lr"])
+    zero3_lr_scheduler = get_linear_schedule_with_warmup(
+        zero3_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
+    )
+    zero3_model, zero3_optimizer, zero3_lr_scheduler = zero3_accelerator.prepare(
+        zero3_model, zero3_optimizer, zero3_lr_scheduler
+    )
+    assert zero3_accelerator.deepspeed_engine_wrapped.engine is zero3_model
+    # Run training loop
+    starting_epoch = 0
+    # Now we train the model
+    # The `_a` names track the zero2 model and the `_b` names track the zero3 model.
+    best_performance_a = 0
+    best_performance_b = 0
+    metric_a = evaluate.load("glue", "mrpc")
+    metric_b = evaluate.load("glue", "mrpc")
+    performance_metric_a = {}
+    performance_metric_b = {}
+    for epoch in range(starting_epoch, num_epochs):
+        zero2_model.train()
+        zero3_model.train()
+        for step, batch in enumerate(train_dataloader):
+            with zero2_accelerator.accumulate(zero2_model, zero3_model):
+                # Each model runs its own forward/backward/step on the same batch, using its own accelerator
+                # for the backward pass.
+                outputs_1 = zero2_model(**batch)
+                zero2_accelerator.backward(outputs_1.loss)
+                zero2_optimizer.step()
+                zero2_lr_scheduler.step()
+                zero2_optimizer.zero_grad()
+                outputs_2 = zero3_model(**batch)
+                zero3_accelerator.backward(outputs_2.loss)
+                zero3_optimizer.step()
+                zero3_lr_scheduler.step()
+                zero3_optimizer.zero_grad()
+        zero2_model.eval()
+        zero3_model.eval()
+        for step, batch in enumerate(eval_dataloader):
+            with torch.no_grad():
+                logits_a = zero2_model(**batch).logits
+                logits_b = zero3_model(**batch).logits
+            # Turn each model's logits into class predictions; the two models are scored separately
+            predictions_a = logits_a.argmax(dim=-1)
+            predictions_b = logits_b.argmax(dim=-1)
+            # It is slightly faster to call this once, than multiple times
+            predictions_a, predictions_b, references = zero2_accelerator.gather_for_metrics(
+                (predictions_a, predictions_b, batch["labels"])
+            )
+            metric_a.add_batch(
+                predictions=predictions_a,
+                references=references,
+            )
+            metric_b.add_batch(
+                predictions=predictions_b,
+                references=references,
+            )
+        eval_metric_a = metric_a.compute()
+        eval_metric_b = metric_b.compute()
+        # Use accelerator.print to print only on the main process.
+        zero2_accelerator.print(f"epoch {epoch}:", eval_metric_a, eval_metric_b)
+        performance_metric_a[f"epoch-{epoch}"] = eval_metric_a["accuracy"]
+        performance_metric_b[f"epoch-{epoch}"] = eval_metric_b["accuracy"]
+        if best_performance_a < eval_metric_a["accuracy"]:
+            best_performance_a = eval_metric_a["accuracy"]
+        if best_performance_b < eval_metric_b["accuracy"]:
+            best_performance_b = eval_metric_b["accuracy"]
+    # Strict comparisons: each model must beat its own epoch-0 accuracy in some later epoch.
+    assert best_performance_a > performance_metric_a["epoch-0"]
+    assert best_performance_b > performance_metric_b["epoch-0"]
+def main():
+    """
+    Parse the command-line options and run both multi-model scenarios back to back.
+    `single_model_training` runs first, the shared `AcceleratorState` is reset, and then
+    `multiple_model_training` runs with the same config and arguments.
+    """
+    # The description previously talked about "tracking peak GPU memory usage", which belongs to another script.
+    parser = argparse.ArgumentParser(
+        description="Test script for using multiple models with Accelerate + DeepSpeed."
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="bert-base-cased",
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=False,
+    )
+    # NOTE: this option is still accepted so existing command lines keep working, but nothing in this
+    # function reads it.
+    parser.add_argument(
+        "--performance_lower_bound",
+        type=float,
+        default=None,
+        help="Optional lower bound for the performance metric. If set, the training will throw error when the performance metric drops below this value.",
+    )
+    parser.add_argument(
+        "--num_epochs",
+        type=int,
+        default=3,
+        help="Number of train epochs.",
+    )
+    args = parser.parse_args()
+    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 8}
+    single_model_training(config, args)
+    # Drop the state left by the first scenario so the second one can build its own plugins and accelerators.
+    AcceleratorState._reset_state(True)
+    multiple_model_training(config, args)
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/external_deps/test_metrics.py ADDED Viewed

	@@ -0,0 +1,307 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import math
+import os
+from copy import deepcopy
+import datasets
+import evaluate
+import torch
+import transformers
+from datasets import load_dataset
+from torch.utils.data import DataLoader, IterableDataset
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from accelerate import Accelerator, DataLoaderConfiguration, DistributedType
+from accelerate.data_loader import DataLoaderDispatcher
+from accelerate.test_utils import RegressionDataset, RegressionModel, torch_device
+from accelerate.utils import is_torch_xla_available, set_seed
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
+class ListHandler(logging.Handler):
+    """Logging handler that keeps every record it receives in the in-memory list `logs`."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Records accumulate here in arrival order.
+        self.logs = list()
+    def emit(self, record):
+        """Store `record` in `logs` instead of writing it anywhere."""
+        self.logs += [record]
+def get_basic_setup(accelerator, num_samples=82, batch_size=16):
+    """
+    Return everything needed to perform basic training.
+    Args:
+        accelerator: Supplies the target `device` and the `prepare` call.
+        num_samples: Length passed to `RegressionDataset`.
+        batch_size: Batch size of the dataloader before it is prepared.
+    Returns:
+        `(model, ddp_model, dataloader)`: the unprepared model moved to `accelerator.device`, a deep copy of it
+        passed through `accelerator.prepare`, and the prepared dataloader.
+    """
+    set_seed(42)
+    reference_model = RegressionModel()
+    # Copy before either model is moved or wrapped, so both start from the same object state.
+    prepared_model = deepcopy(reference_model)
+    loader = DataLoader(RegressionDataset(length=num_samples), batch_size=batch_size)
+    reference_model.to(accelerator.device)
+    prepared_model, loader = accelerator.prepare(prepared_model, loader)
+    return reference_model, prepared_model, loader
+def get_dataloader(accelerator: Accelerator, use_longest=False):
+    """
+    Build an unshuffled `DataLoader` (batch size 16) over the tokenized GLUE MRPC validation split.
+    Args:
+        accelerator (`Accelerator`):
+            Provides the `main_process_first` context used around the tokenization `map`.
+        use_longest (`bool`, *optional*):
+            When truthy, batches are padded to their longest sample; otherwise to a fixed length of 128.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/mrpc-bert-base-cased")
+    validation_split = load_dataset("glue", "mrpc", split="validation")
+    def tokenize_function(examples):
+        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+    def collate_fn(examples):
+        if not use_longest:
+            return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
+        return tokenizer.pad(examples, padding="longest", return_tensors="pt")
+    with accelerator.main_process_first():
+        encoded_split = validation_split.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
+    # The models expect the target column to be called 'labels'.
+    encoded_split = encoded_split.rename_column("label", "labels")
+    return DataLoader(encoded_split, shuffle=False, collate_fn=collate_fn, batch_size=16)
+def get_mrpc_setup(dispatch_batches, split_batches):
+    """
+    Create an `Accelerator` with the given dataloader options plus the baseline and prepared MRPC setups.
+    Returns:
+        A `(setup, accelerator)` tuple. `setup["no"]` is `[model, dataloader, accelerator.device]` as passed
+        to `prepare`, and `setup["ddp"]` is `[prepared model, prepared dataloader, torch_device]`.
+    """
+    accelerator = Accelerator(
+        dataloader_config=DataLoaderConfiguration(dispatch_batches=dispatch_batches, split_batches=split_batches)
+    )
+    # Pad to the longest sample only when batches are not dispatched from the main process.
+    plain_dataloader = get_dataloader(accelerator, not dispatch_batches)
+    plain_model = AutoModelForSequenceClassification.from_pretrained(
+        "hf-internal-testing/mrpc-bert-base-cased", return_dict=True
+    )
+    prepared_model, prepared_dataloader = accelerator.prepare(plain_model, plain_dataloader)
+    setups = {
+        "ddp": [prepared_model, prepared_dataloader, torch_device],
+        "no": [plain_model, plain_dataloader, accelerator.device],
+    }
+    return setups, accelerator
+def generate_predictions(model, dataloader, accelerator):
+    """
+    Run `model` over `dataloader` without gradients and return the gathered outputs and targets.
+    Each batch is expected to be a mapping with exactly two values, taken in order as (input, target). Both the
+    model output and the target go through `accelerator.gather_for_metrics` before being collected.
+    Returns:
+        `(logits, targets)`, each the concatenation of the per-batch gathered tensors.
+    """
+    gathered_logits, gathered_targets = [], []
+    for batch in dataloader:
+        features, labels = batch.values()
+        with torch.no_grad():
+            prediction = model(features)
+            prediction, labels = accelerator.gather_for_metrics((prediction, labels))
+        gathered_logits.append(prediction)
+        gathered_targets.append(labels)
+    return torch.cat(gathered_logits), torch.cat(gathered_targets)
+def test_torch_metrics(
+    accelerator: Accelerator, num_samples=82, dispatch_batches=False, split_batches=False, batch_size=16
+):
+    """
+    Check that gathering predictions over a `num_samples`-long regression dataset yields `num_samples` outputs.
+    `dispatch_batches` and `split_batches` are accepted but not read here; those options are expected to be
+    carried by `accelerator` itself.
+    """
+    _, prepared_model, prepared_loader = get_basic_setup(accelerator, num_samples, batch_size)
+    gathered_logits = generate_predictions(prepared_model, prepared_loader, accelerator)[0]
+    gathered_count = len(gathered_logits)
+    assert gathered_count == num_samples, (
+        f"Unexpected number of inputs:\n    Expected: {num_samples}\n    Actual: {gathered_count}"
+    )
+def test_mrpc(dispatch_batches: bool = False, split_batches: bool = False):
+    """
+    Compare MRPC accuracy/F1 computed from the unprepared model and dataloader against the same metrics computed
+    from the prepared ones with `gather_for_metrics`; the two must agree per `math.isclose`.
+    """
+    metric = evaluate.load("glue", "mrpc")
+    setup, accelerator = get_mrpc_setup(dispatch_batches, split_batches)
+    # Baseline pass: plain model and dataloader, batches moved to the device by hand.
+    plain_model, plain_loader, plain_device = setup["no"]
+    plain_model.to(plain_device)
+    plain_model.eval()
+    for batch in plain_loader:
+        batch.to(plain_device)
+        with torch.inference_mode():
+            outputs = plain_model(**batch)
+        metric.add_batch(predictions=outputs.logits.argmax(dim=-1), references=batch["labels"])
+    baseline = metric.compute()
+    # Distributed pass: prepared model and dataloader, predictions gathered across processes.
+    prepared_model, prepared_loader, _ = setup["ddp"]
+    prepared_model.eval()
+    for batch in prepared_loader:
+        with torch.inference_mode():
+            outputs = prepared_model(**batch)
+        gathered_preds, gathered_refs = accelerator.gather_for_metrics(
+            (outputs.logits.argmax(dim=-1), batch["labels"])
+        )
+        metric.add_batch(predictions=gathered_preds, references=gathered_refs)
+    distributed = metric.compute()
+    for key in ("accuracy", "f1"):
+        assert math.isclose(baseline[key], distributed[key]), (
+            f"Baseline and Distributed are not the same for key {key}:\n\tBaseline: {baseline[key]}\n\tDistributed: {distributed[key]}\n"
+        )
+def test_gather_for_metrics_with_non_tensor_objects_iterable_dataset():
+    """
+    Iterate a prepared dataloader over a 30-item iterable dataset of Python ints and check that the gathered
+    batches total 30 items and that, on the main process, the `accelerate.accelerator` logger emitted nothing.
+    """
+    class DummyIterableDataset(IterableDataset):
+        """Iterable dataset backed by an in-memory sequence."""
+        def __init__(self, data):
+            self.data = data
+        def __len__(self):
+            return len(self.data)
+        def __iter__(self):
+            for item in self.data:
+                yield item
+    source = DummyIterableDataset(list(range(30)))
+    loader = DataLoader(source, batch_size=4)
+    accelerator = Accelerator()
+    prepared_loader = accelerator.prepare(loader)
+    # Capture anything logged by the accelerator module while batches are gathered (main process only).
+    if accelerator.is_main_process:
+        logger = logging.root.manager.loggerDict["accelerate.accelerator"]
+        captured = ListHandler()
+        logger.addHandler(captured)
+    gathered = [accelerator.gather_for_metrics(batch) for batch in prepared_loader]
+    assert torch.cat(gathered).size(0) == 30
+    if accelerator.is_main_process:
+        assert len(captured.logs) == 0
+        logger.removeHandler(captured)
+def test_gather_for_metrics_with_iterable_dataset():
+    """
+    Iterate a prepared dataloader over a 30-element tensor-backed iterable dataset, check that preparation
+    produced a `DataLoaderDispatcher`, that the gathered batches total 30 items and that, on the main process,
+    the `accelerate.accelerator` logger emitted nothing.
+    """
+    class DummyIterableDataset(IterableDataset):
+        """Iterable dataset backed by an in-memory sequence."""
+        def __init__(self, data):
+            self.data = data
+        def __len__(self):
+            return len(self.data)
+        def __iter__(self):
+            for item in self.data:
+                yield item
+    source = DummyIterableDataset(torch.as_tensor(range(30)))
+    loader = DataLoader(source, batch_size=4)
+    accelerator = Accelerator()
+    prepared_loader = accelerator.prepare(loader)
+    assert isinstance(prepared_loader, DataLoaderDispatcher)
+    # Capture anything logged by the accelerator module while batches are gathered (main process only).
+    if accelerator.is_main_process:
+        logger = logging.root.manager.loggerDict["accelerate.accelerator"]
+        captured = ListHandler()
+        logger.addHandler(captured)
+    gathered = [accelerator.gather_for_metrics(batch) for batch in prepared_loader]
+    assert torch.cat(gathered).size(0) == 30
+    if accelerator.is_main_process:
+        assert len(captured.logs) == 0
+        logger.removeHandler(captured)
+def test_gather_for_metrics_drop_last():
+    """
+    With `drop_last=True` and one item more than ten-per-process, the second gathered batch must hold exactly
+    one full per-device batch from every process.
+    """
+    accelerator = Accelerator()
+    per_device_batch_size = 5
+    # One extra item beyond a multiple of the batch size, so there is an incomplete trailing batch to drop.
+    total_items = (10 * accelerator.num_processes) + 1
+    loader = accelerator.prepare(DataLoader(range(total_items), batch_size=per_device_batch_size, drop_last=True))
+    batches = iter(loader)
+    next(batches)  # Skip first batch tensor([0, 1, 2, 3, 4], device='cuda:0')
+    second_batch = next(batches)
+    gathered_items = accelerator.gather_for_metrics(second_batch)
+    # Should return a full set of complete batches from each GPU
+    num_expected_items = per_device_batch_size * accelerator.num_processes
+    assert gathered_items.size(0) == (num_expected_items), (
+        f"Expected number of items: {num_expected_items}, Actual: {gathered_items.size(0)}"
+    )
+def main():
+    """
+    Run the metric-gathering checks in sequence: the MRPC comparison and iterable-dataset checks (skipped on CPU
+    and when torch XLA is available), the torch-metrics length checks (skipped for the XLA distributed type),
+    the perfectly-divisible case and the `drop_last` case. Accelerator state is reset between sections.
+    """
+    dataloader_config = DataLoaderConfiguration(split_batches=False, dispatch_batches=False)
+    accelerator = Accelerator(dataloader_config=dataloader_config)
+    # Keep dataset/transformers logging at warning level on the local main process and at error level elsewhere.
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+    # TorchXLA does not support batch dispatching. 'put_on_device' is always False for
+    # TorchXLA, which can cause a value error in 'prepare_data_loader' function.
+    dispatch_batches_options = [False] if accelerator.state.distributed_type == DistributedType.XLA else [True, False]
+    # Temporarily close this test for TorchXLA due to the 'Cannot set version_counter for
+    # inference tensor' error in inference mode. Reopen it after TorchXLA fixes this bug.
+    # These are a bit slower so they should only be run on the GPU or TPU
+    if accelerator.device.type != "cpu" and not is_torch_xla_available():
+        if accelerator.is_local_main_process:
+            print("**Testing gather_for_metrics**")
+        for split_batches in [True, False]:
+            for dispatch_batches in dispatch_batches_options:
+                if accelerator.is_local_main_process:
+                    print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`")
+                test_mrpc(dispatch_batches, split_batches)
+                # `test_mrpc` builds its own Accelerator, so the state is reset after each combination.
+                accelerator.state._reset_state()
+        print("test_gather_for_metrics_with_iterable_dataset")
+        test_gather_for_metrics_with_iterable_dataset()
+        print("test gather_for_metrics_with_non_tensor_objects_iterable_dataset")
+        test_gather_for_metrics_with_non_tensor_objects_iterable_dataset()
+    # MpDeviceLoader in TorchXLA is an asynchronous loader that preloads several batches into cache.
+    # This can cause the 'end_of_dataloader' of DataLoaderStateMixin to be set earlier than intended.
+    # Skip this test when TorchXLA is enabled.
+    if accelerator.state.distributed_type != DistributedType.XLA:
+        if accelerator.is_local_main_process:
+            print("**Test torch metrics**")
+        for split_batches in [True, False]:
+            for dispatch_batches in dispatch_batches_options:
+                dataloader_config = DataLoaderConfiguration(
+                    split_batches=split_batches, dispatch_batches=dispatch_batches
+                )
+                # A fresh Accelerator is bound for every option combination and reset again afterwards.
+                accelerator = Accelerator(dataloader_config=dataloader_config)
+                if accelerator.is_local_main_process:
+                    print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`, length=99")
+                test_torch_metrics(accelerator, 99)
+                accelerator.state._reset_state()
+    # NOTE(review): `accelerator` here is whichever instance was bound last above, and its state has already
+    # been reset - confirm reading `is_local_main_process` at this point is intended.
+    if accelerator.is_local_main_process:
+        print("**Test last batch is not dropped when perfectly divisible**")
+    accelerator = Accelerator()
+    test_torch_metrics(accelerator, 512)
+    accelerator.state._reset_state()
+    if accelerator.is_local_main_process:
+        print("**Test that `drop_last` is taken into account**")
+    test_gather_for_metrics_drop_last()
+    accelerator.end_training()
+    accelerator.state._reset_state()
+def _mp_fn(index):
+    """Entry point for xla_spawn (TPUs); `index` is accepted but not used."""
+    main()
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/external_deps/test_peak_memory_usage.py ADDED Viewed

	@@ -0,0 +1,323 @@

+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import gc
+import json
+import os
+import torch
+from datasets import load_dataset
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
+from accelerate import Accelerator, DistributedType
+from accelerate.utils import (
+    is_hpu_available,
+    is_mlu_available,
+    is_musa_available,
+    is_neuron_available,
+    is_npu_available,
+    is_sdaa_available,
+    is_xpu_available,
+)
+from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
+MAX_GPU_BATCH_SIZE = 16
+EVAL_BATCH_SIZE = 32
+# Converting Bytes to Megabytes
+def b2mb(x):
+    """Convert a byte count to whole megabytes (1 MB = 2**20 bytes), truncating toward zero."""
+    bytes_per_megabyte = 1 << 20
+    megabytes = x / bytes_per_megabyte
+    return int(megabytes)
+# This context manager is used to track the peak memory usage of the process
+class TorchTracemalloc:
+    """
+    Context manager that records accelerator memory usage around the wrapped block.
+    On entry, the first available backend (checked in the order CUDA, MLU, SDAA, MUSA, NPU, XPU, HPU, Neuron) has
+    its peak counter reset and its currently allocated bytes stored in `begin`. On exit, `end` and `peak` are read
+    from the backend and two megabyte deltas are derived:
+        used: `end - begin`, converted with `b2mb`.
+        peaked: `peak - begin`, converted with `b2mb`.
+    NOTE: if none of the backends is available, `begin`, `end` and `peak` are never set and `__exit__` raises
+    `AttributeError`.
+    """
+    def __enter__(self):
+        """Reset the backend's peak gauge, record the starting allocation in `begin` and return `self`."""
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.reset_max_memory_allocated()  # reset the peak gauge to zero
+            self.begin = torch.cuda.memory_allocated()
+        elif is_mlu_available():
+            torch.mlu.empty_cache()
+            torch.mlu.reset_max_memory_allocated()  # reset the peak gauge to zero
+            self.begin = torch.mlu.memory_allocated()
+        elif is_sdaa_available():
+            torch.sdaa.empty_cache()
+            torch.sdaa.reset_max_memory_allocated()  # reset the peak gauge to zero
+            self.begin = torch.sdaa.memory_allocated()
+        elif is_musa_available():
+            torch.musa.empty_cache()
+            torch.musa.reset_max_memory_allocated()  # reset the peak gauge to zero
+            self.begin = torch.musa.memory_allocated()
+        elif is_npu_available():
+            torch.npu.empty_cache()
+            torch.npu.reset_max_memory_allocated()  # reset the peak gauge to zero
+            self.begin = torch.npu.memory_allocated()
+        elif is_xpu_available():
+            torch.xpu.empty_cache()
+            torch.xpu.reset_peak_memory_stats()  # reset the peak gauge to zero
+            self.begin = torch.xpu.memory_allocated()
+        elif is_hpu_available():
+            # torch.hpu.empty_cache() # not available on hpu as it reserves all device memory for the current process
+            torch.hpu.reset_peak_memory_stats()  # reset the peak gauge to zero
+            self.begin = torch.hpu.memory_allocated()
+        elif is_neuron_available():
+            torch.neuron.empty_cache()
+            torch.neuron.reset_peak_memory_stats()  # reset the peak gauge to zero
+            self.begin = torch.neuron.memory_allocated()
+        return self
+    def __exit__(self, *exc):
+        """Record `end` and `peak` for the active backend, then derive `used` and `peaked` in megabytes."""
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            self.end = torch.cuda.memory_allocated()
+            self.peak = torch.cuda.max_memory_allocated()
+        elif is_mlu_available():
+            torch.mlu.empty_cache()
+            self.end = torch.mlu.memory_allocated()
+            # The peak reading belongs in `peak`; storing it in `begin` would overwrite the baseline taken in
+            # `__enter__` and leave `peak` unset for the delta computation below.
+            self.peak = torch.mlu.max_memory_allocated()
+        elif is_sdaa_available():
+            torch.sdaa.empty_cache()
+            self.end = torch.sdaa.memory_allocated()
+            self.peak = torch.sdaa.max_memory_allocated()
+        elif is_musa_available():
+            torch.musa.empty_cache()
+            self.end = torch.musa.memory_allocated()
+            self.peak = torch.musa.max_memory_allocated()
+        elif is_npu_available():
+            torch.npu.empty_cache()
+            self.end = torch.npu.memory_allocated()
+            self.peak = torch.npu.max_memory_allocated()
+        elif is_xpu_available():
+            torch.xpu.empty_cache()
+            self.end = torch.xpu.memory_allocated()
+            self.peak = torch.xpu.max_memory_allocated()
+        elif is_hpu_available():
+            # torch.hpu.empty_cache() # not available on hpu as it reserves all device memory for the current process
+            self.end = torch.hpu.memory_allocated()
+            self.peak = torch.hpu.max_memory_allocated()
+        elif is_neuron_available():
+            torch.neuron.empty_cache()
+            self.end = torch.neuron.memory_allocated()
+            self.peak = torch.neuron.max_memory_allocated()
+        # Both deltas are measured against the allocation recorded on entry.
+        self.used = b2mb(self.end - self.begin)
+        self.peaked = b2mb(self.peak - self.begin)
+        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")
+def get_dataloaders(
+    accelerator: Accelerator,
+    batch_size: int = 16,
+    model_name: str = "bert-base-cased",
+    n_train: int = 320,
+    n_val: int = 160,
+):
+    """
+    Build the train and validation `DataLoader`s over leading slices of the GLUE MRPC dataset.
+    Args:
+        accelerator (`Accelerator`):
+            Only its `distributed_type` is read, to pick the padding strategy used when collating.
+        batch_size (`int`, *optional*):
+            Batch size of the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.
+        model_name (`str`, *optional*):
+            Identifier handed to `AutoTokenizer.from_pretrained` to load the tokenizer.
+        n_train (`int`, *optional*):
+            Number of leading training examples to keep.
+        n_val (`int`, *optional*):
+            Number of leading validation examples to keep.
+    Returns:
+        A `(train_dataloader, eval_dataloader)` tuple.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    split_spec = {"train": f"train[:{n_train}]", "validation": f"validation[:{n_val}]"}
+    raw_splits = load_dataset("glue", "mrpc", split=split_spec)
+    def tokenize_function(examples):
+        # max_length=None => use the model max length (it's actually the default)
+        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+    def collate_fn(examples):
+        # XLA (TPU) batches are padded to a fixed length of 128; every other backend pads to the longest sample.
+        if accelerator.distributed_type == DistributedType.XLA:
+            pad_kwargs = {"padding": "max_length", "max_length": 128}
+        else:
+            pad_kwargs = {"padding": "longest"}
+        return tokenizer.pad(examples, return_tensors="pt", **pad_kwargs)
+    # Tokenize every split, dropping the raw text columns, then rename 'label' to 'labels', the name the
+    # transformers models expect.
+    encoded_splits = raw_splits.map(
+        tokenize_function, batched=True, remove_columns=["idx", "sentence1", "sentence2"], load_from_cache_file=False
+    ).rename_column("label", "labels")
+    train_loader = DataLoader(encoded_splits["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
+    eval_loader = DataLoader(
+        encoded_splits["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
+    )
+    return train_loader, eval_loader
+def training_function(config, args):
+    """
+    Train a sequence-classification model and record the peak device memory of every epoch.
+    Args:
+        config (`dict`):
+            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
+        args:
+            Parsed command-line arguments; `model_name_or_path`, `n_train`, `n_val`, `peak_memory_upper_bound` and
+            `output_dir` are read.
+    Raises:
+        AssertionError: If `args.peak_memory_upper_bound` is set and the total peak memory of an epoch exceeds it.
+    The per-epoch totals are written by the main process to `peak_memory_utilization.json` in `args.output_dir`.
+    """
+    # Initialize accelerator
+    accelerator = Accelerator()
+    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
+    lr = config["lr"]
+    num_epochs = int(config["num_epochs"])
+    seed = int(config["seed"])
+    batch_size = int(config["batch_size"])
+    model_name = args.model_name_or_path
+    set_seed(seed)
+    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name, args.n_train, args.n_val)
+    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)
+    # Instantiate optimizer
+    # `DummyOptim` is used only when a DeepSpeed plugin is active and its config has an "optimizer" entry.
+    optimizer_cls = (
+        AdamW
+        if accelerator.state.deepspeed_plugin is None
+        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
+        else DummyOptim
+    )
+    optimizer = optimizer_cls(params=model.parameters(), lr=lr)
+    # With DeepSpeed the accumulation step count is read from the DeepSpeed config; otherwise there is no accumulation.
+    if accelerator.state.deepspeed_plugin is not None:
+        gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
+            "gradient_accumulation_steps"
+        ]
+    else:
+        gradient_accumulation_steps = 1
+    max_training_steps = (len(train_dataloader) * num_epochs) // gradient_accumulation_steps
+    # Instantiate scheduler
+    # `DummyScheduler` is used only when a DeepSpeed plugin is active and its config has a "scheduler" entry.
+    if (
+        accelerator.state.deepspeed_plugin is None
+        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
+    ):
+        lr_scheduler = get_linear_schedule_with_warmup(
+            optimizer=optimizer,
+            num_warmup_steps=0,
+            num_training_steps=max_training_steps,
+        )
+    else:
+        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)
+    # Prepare everything
+    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
+    # prepare method.
+    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
+    )
+    # We need to keep track of how many total steps we have iterated over
+    # (the counter is only incremented below; nothing in this function reads it back)
+    overall_step = 0
+    # We also need to keep track of the starting epoch so files are named properly
+    starting_epoch = 0
+    # Now we train the model
+    # Maps "epoch-<n>" to the total peak memory of that epoch.
+    train_total_peak_memory = {}
+    for epoch in range(starting_epoch, num_epochs):
+        # The whole training epoch runs inside the memory tracker.
+        with TorchTracemalloc() as tracemalloc:
+            model.train()
+            for step, batch in enumerate(train_dataloader):
+                outputs = model(**batch)
+                loss = outputs.loss
+                loss = loss / gradient_accumulation_steps
+                accelerator.backward(loss)
+                # The optimizer steps whenever the batch index is a multiple of `gradient_accumulation_steps`
+                # (this includes index 0 of every epoch).
+                if step % gradient_accumulation_steps == 0:
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad()
+                overall_step += 1
+        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
+        accelerator.print(f"Memory before entering the train : {b2mb(tracemalloc.begin)}")
+        accelerator.print(f"Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
+        accelerator.print(f"Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
+        accelerator.print(
+            f"Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
+        )
+        # Total peak = peak delta recorded by the tracker plus the memory already in use before the epoch started.
+        train_total_peak_memory[f"epoch-{epoch}"] = tracemalloc.peaked + b2mb(tracemalloc.begin)
+        if args.peak_memory_upper_bound is not None:
+            assert train_total_peak_memory[f"epoch-{epoch}"] <= args.peak_memory_upper_bound, (
+                "Peak memory usage exceeded the upper bound"
+            )
+    accelerator.wait_for_everyone()
+    # Only the main process writes the results file.
+    if accelerator.is_main_process:
+        with open(os.path.join(args.output_dir, "peak_memory_utilization.json"), "w") as f:
+            json.dump(train_total_peak_memory, f)
+    accelerator.end_training()
+def main():
+    parser = argparse.ArgumentParser(description="Simple example of training script tracking peak GPU memory usage.")
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="bert-base-cased",
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=False,
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=".",
+        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
+    )
+    parser.add_argument(
+        "--peak_memory_upper_bound",
+        type=float,
+        default=None,
+        help="The upper bound of peak memory usage in MB. If set, the training will throw an error if the peak memory usage exceeds this value.",
+    )
+    parser.add_argument(
+        "--n_train",
+        type=int,
+        default=320,
+        help="Number of training examples to use.",
+    )
+    parser.add_argument(
+        "--n_val",
+        type=int,
+        default=160,
+        help="Number of validation examples to use.",
+    )
+    parser.add_argument(
+        "--num_epochs",
+        type=int,
+        default=1,
+        help="Number of train epochs.",
+    )
+    args = parser.parse_args()
+    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}
+    training_function(config, args)
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/external_deps/test_performance.py ADDED Viewed

	@@ -0,0 +1,299 @@

+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+from contextlib import nullcontext
+from pathlib import Path
+import evaluate
+import torch
+from datasets import load_dataset
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
+from accelerate import Accelerator, DistributedType
+from accelerate.parallelism_config import ParallelismConfig
+from accelerate.utils import SAFE_WEIGHTS_NAME, set_seed
+from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
+# Not referenced anywhere in the part of this file visible here — presumably a per-device batch-size cap; TODO confirm.
+MAX_GPU_BATCH_SIZE = 16
+# Batch size of the validation DataLoader built in `get_dataloaders`.
+EVAL_BATCH_SIZE = 32
+def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
+    """
+    Creates a set of `DataLoader`s for the `glue` dataset.
+    Args:
+        accelerator (`Accelerator`):
+            An `Accelerator` object
+        batch_size (`int`, *optional*):
+            The batch size for the train and validation DataLoaders.
+        model_name (`str`, *optional*):
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    datasets = load_dataset("glue", "mrpc")
+    def tokenize_function(examples):
+        # max_length=None => use the model max length (it's actually the default)
+        outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+        return outputs
+    # Apply the method we just defined to all the examples in all the splits of the dataset
+    tokenized_datasets = datasets.map(
+        tokenize_function, batched=True, remove_columns=["idx", "sentence1", "sentence2"], load_from_cache_file=False
+    )
+    # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
+    # transformers library
+    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+    def collate_fn(examples):
+        # On TPU it's best to pad everything to the same length or training will be very slow.
+        if accelerator.distributed_type == DistributedType.XLA:
+            return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
+        return tokenizer.pad(examples, padding="longest", return_tensors="pt")
+    # Instantiate dataloaders.
+    train_dataloader = DataLoader(
+        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
+    )
+    eval_dataloader = DataLoader(
+        tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
+    )
+    return train_dataloader, eval_dataloader
+def training_function(config, args):
+    """
+    Train and evaluate a sequence-classification model, checking the learning-rate schedule and the final accuracy.
+    Args:
+        config (`dict`):
+            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
+        args:
+            Parsed command-line arguments; `tp_size`, `tp_plan`, `model_name_or_path`, `add_pad_token`,
+            `performance_lower_bound` and `output_dir` are read.
+    Raises:
+        AssertionError: If the learning rate after the first synchronized optimizer step or after the last step is
+            not the expected value, if the best accuracy is below `args.performance_lower_bound`, or if the model
+            weights file is missing after saving.
+    The per-epoch accuracies are written by the main process to `all_results.json` in `args.output_dir`.
+    """
+    accelerator_kwargs = {}
+    # need this for DeepSpeed tests as `args.tp_size` would be None and `torch.distributed.init_device_mesh` would fail
+    if args.tp_size is not None:
+        accelerator_kwargs["parallelism_config"] = ParallelismConfig(tp_size=args.tp_size)
+    # Initialize accelerator
+    accelerator = Accelerator(**accelerator_kwargs)
+    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
+    lr = config["lr"]
+    num_epochs = int(config["num_epochs"])
+    seed = int(config["seed"])
+    batch_size = int(config["batch_size"])
+    model_name = args.model_name_or_path
+    set_seed(seed)
+    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name)
+    # Add TP related kwargs if provided
+    model_kwargs = {}
+    if args.tp_plan is not None:
+        model_kwargs["tp_plan"] = args.tp_plan
+    if args.tp_size is not None:
+        model_kwargs["tp_size"] = args.tp_size
+    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, **model_kwargs)
+    # When requested, fall back to token id 0 as the pad token if the model config defines none.
+    if args.add_pad_token:
+        if model.config.pad_token_id is None:
+            model.config.pad_token_id = 0
+    # Instantiate optimizer
+    # `DummyOptim` is used only when a DeepSpeed plugin is active and its config has an "optimizer" entry.
+    optimizer_cls = (
+        AdamW
+        if accelerator.state.deepspeed_plugin is None
+        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
+        else DummyOptim
+    )
+    optimizer = optimizer_cls(params=model.parameters(), lr=lr)
+    # Computed from the dataloader length before `accelerator.prepare` is called.
+    max_training_steps = len(train_dataloader) * num_epochs
+    # Instantiate scheduler
+    # Remembers whether the linear schedule (rather than `DummyScheduler`) is in use; the LR checks depend on it.
+    linear_decay_scheduler = False
+    if (
+        accelerator.state.deepspeed_plugin is None
+        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
+    ):
+        lr_scheduler = get_linear_schedule_with_warmup(
+            optimizer=optimizer,
+            num_warmup_steps=0,
+            num_training_steps=max_training_steps,
+        )
+        linear_decay_scheduler = True
+    else:
+        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)
+    # Prepare everything
+    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
+    # prepare method.
+    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
+    )
+    # We also need to keep track of the starting epoch so files are named properly
+    starting_epoch = 0
+    # Now we train the model
+    metric = evaluate.load("glue", "mrpc")
+    best_performance = 0
+    # Maps "epoch-<n>" to the accuracy measured after that epoch.
+    performance_metric = {}
+    # Learning rate expected once one linear-decay step has been applied; the step count is divided by the number of
+    # processes and by the accumulation steps.
+    # NOTE(review): presumably because the prepared scheduler advances once per process for every real optimizer
+    # step — confirm against the prepared scheduler's implementation.
+    expected_lr_after_first_optim_step = lr * (
+        1 - 1 / (max_training_steps / accelerator.num_processes / accelerator.gradient_accumulation_steps)
+    )
+    # Ensures the first-step learning-rate assertion below runs at most once.
+    lr_scheduler_check_completed = False
+    for epoch in range(starting_epoch, num_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                accelerator.backward(loss)
+                # The optimizer step runs inside `implicit_replication` only when a TP plan was given.
+                context = nullcontext
+                if args.tp_plan is not None:
+                    from torch.distributed._tensor.experimental import implicit_replication
+                    context = implicit_replication
+                with context():
+                    optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                # assert the learning rate after first optimizer step
+                if (
+                    accelerator.sync_gradients
+                    and not lr_scheduler_check_completed
+                    and linear_decay_scheduler
+                    and accelerator.state.mixed_precision == "no"
+                ):
+                    assert lr_scheduler.get_last_lr()[0] == expected_lr_after_first_optim_step, (
+                        f"Wrong lr found at second step, expected {expected_lr_after_first_optim_step}, got {lr_scheduler.get_last_lr()[0]}"
+                    )
+                    lr_scheduler_check_completed = True
+        model.eval()
+        # Number of gathered samples counted so far; used to trim the last gathered batch.
+        samples_seen = 0
+        for step, batch in enumerate(eval_dataloader):
+            # We could avoid this line since we set the accelerator with `device_placement=True`.
+            batch.to(accelerator.device)
+            with torch.no_grad():
+                outputs = model(**batch)
+            predictions = outputs.logits.argmax(dim=-1)
+            # It is slightly faster to call this once, than multiple times
+            predictions, references = accelerator.gather(
+                (predictions, batch["labels"])
+            )  # If we are in a multiprocess environment, the last batch has duplicates
+            if accelerator.use_distributed:
+                # On the last batch keep only as many entries as the dataset still has left.
+                if step == len(eval_dataloader) - 1:
+                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
+                    references = references[: len(eval_dataloader.dataset) - samples_seen]
+                else:
+                    samples_seen += references.shape[0]
+            metric.add_batch(
+                predictions=predictions,
+                references=references,
+            )
+        eval_metric = metric.compute()
+        # Use accelerator.print to print only on the main process.
+        accelerator.print(f"epoch {epoch}:", eval_metric)
+        performance_metric[f"epoch-{epoch}"] = eval_metric["accuracy"]
+        # Track the best accuracy seen over all epochs.
+        if best_performance < eval_metric["accuracy"]:
+            best_performance = eval_metric["accuracy"]
+    # check that the LR is 0
+    if linear_decay_scheduler and accelerator.state.mixed_precision == "no":
+        assert lr_scheduler.get_last_lr()[0] == 0, (
+            f"Wrong lr found at last step, expected 0, got {lr_scheduler.get_last_lr()[0]}"
+        )
+    if args.performance_lower_bound is not None:
+        assert args.performance_lower_bound <= best_performance, (
+            f"Best performance metric {best_performance} is lower than the lower bound {args.performance_lower_bound}"
+        )
+    accelerator.wait_for_everyone()
+    # Only the main process writes the results file.
+    if accelerator.is_main_process:
+        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+            json.dump(performance_metric, f)
+    # TODO: skip saving of the model test for TP until the feature lands
+    if args.tp_plan is None:
+        # Finally try saving the model
+        accelerator.save_model(model, args.output_dir)
+    accelerator.wait_for_everyone()
+    # After all processes have synchronized, the weights file must exist in the output directory.
+    if args.tp_plan is None:
+        assert Path(args.output_dir, SAFE_WEIGHTS_NAME).exists(), (
+            "Model was not saved when calling `Accelerator.save_model`"
+        )
+    accelerator.end_training()
+def main():
+    parser = argparse.ArgumentParser(description="Simple example of training script tracking peak GPU memory usage.")
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="bert-base-cased",
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=False,
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=".",
+        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
+    )
+    parser.add_argument(
+        "--performance_lower_bound",
+        type=float,
+        default=None,
+        help="Optional lower bound for the performance metric. If set, the training will throw error when the performance metric drops below this value.",
+    )
+    parser.add_argument(
+        "--num_epochs",
+        type=int,
+        default=3,
+        help="Number of train epochs.",
+    )
+    parser.add_argument(
+        "--add_pad_token",
+        type=bool,
+        default=False,
+        help="To add pad token if not exists.",
+    )
+    parser.add_argument(
+        "--tp_plan",
+        type=str,
+        default=None,
+        help="pass 'auto' to use TP",
+    )
+    parser.add_argument(
+        "--tp_size",
+        type=int,
+        default=None,
+        help="TP size to be used to shard the model",
+    )
+    args = parser.parse_args()
+    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}
+    training_function(config, args)
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/external_deps/test_pippy.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from transformers import (
+    BertConfig,
+    BertForMaskedLM,
+    GPT2Config,
+    GPT2ForSequenceClassification,
+)
+from accelerate import PartialState
+from accelerate.inference import prepare_pippy
+from accelerate.test_utils import torch_device
+from accelerate.utils import DistributedType, set_seed
+# Maps a short model name to (model class, config class, sequence length of the generated inputs).
+model_to_config = {
+    "bert": (BertForMaskedLM, BertConfig, 512),
+    "gpt2": (GPT2ForSequenceClassification, GPT2Config, 1024),
+}
+def get_model_and_data_for_text(model_name, device, num_processes: int = 2):
+    initializer, config, seq_len = model_to_config[model_name]
+    config_args = {}
+    # Eventually needed for batch inference tests on gpt-2 when bs != 1
+    # if model_name == "gpt2":
+    #     config_args["pad_token_id"] = 0
+    model_config = config(**config_args)
+    model = initializer(model_config)
+    kwargs = dict(low=0, high=model_config.vocab_size, device=device, dtype=torch.int64, requires_grad=False)
+    trace_input = torch.randint(size=(1, seq_len), **kwargs)
+    inference_inputs = torch.randint(size=(num_processes, seq_len), **kwargs)
+    return model, trace_input, inference_inputs
+def test_bert(batch_size: int = 2):
+    set_seed(42)
+    state = PartialState()
+    model, trace_input, inference_inputs = get_model_and_data_for_text("bert", "cpu", batch_size)
+    model = prepare_pippy(model, example_args=(trace_input,), no_split_module_classes=model._no_split_modules)
+    # For inference args need to be a tuple
+    inputs = inference_inputs.to(torch_device)
+    with torch.no_grad():
+        output = model(inputs)
+    # Zach: Check that we just grab the real outputs we need at the end
+    if not state.is_last_process:
+        assert output is None, "Output was not generated on just the last process!"
+    else:
+        assert output is not None, "Output was not generated in the last process!"
+def test_gpt2(batch_size: int = 2):
+    set_seed(42)
+    state = PartialState()
+    model, trace_input, inference_inputs = get_model_and_data_for_text("gpt2", "cpu", batch_size)
+    model = prepare_pippy(model, example_args=(trace_input,), no_split_module_classes=model._no_split_modules)
+    # For inference args need to be a tuple
+    inputs = inference_inputs.to(torch_device)
+    with torch.no_grad():
+        output = model(inputs)
+    # Zach: Check that we just grab the real outputs we need at the end
+    if not state.is_last_process:
+        assert output is None, "Output was not generated on just the last process!"
+    else:
+        assert output is not None, "Output was not generated in the last process!"
+# Currently disabled, enable again once PyTorch pippy interface can trace a resnet34
+# def test_resnet(batch_size: int = 2):
+#     set_seed(42)
+#     state = PartialState()
+#     model = resnet34()
+#     input_tensor = torch.rand(1, 3, 224, 224)
+#     model = prepare_pippy(
+#         model,
+#         example_args=(input_tensor,),
+#     )
+#     inference_inputs = torch.rand(batch_size, 3, 224, 224)
+#     inputs = send_to_device(inference_inputs, torch_device)
+#     with torch.no_grad():
+#         output = model(inputs)
+#     # Zach: Check that we just grab the real outputs we need at the end
+#     if not state.is_last_process:
+#         assert output is None, "Output was not generated on just the last process!"
+#     else:
+#         assert output is not None, "Output was not generated in the last process!"
+if __name__ == "__main__":
+    state = PartialState()
+    state.print("Testing pippy integration...")
+    try:
+        if state.distributed_type in [DistributedType.MULTI_GPU, DistributedType.MULTI_XPU, DistributedType.MULTI_HPU]:
+            state.print("Testing GPT2...")
+            test_gpt2()
+            # Issue: When modifying the tokenizer for batch GPT2 inference, there's an issue
+            # due to references
+            # NameError: cannot access free variable 'chunk_args_list' where it is not associated with a value in enclosing scope
+            # test_gpt2(3)
+            state.print("Testing BERT...")
+            test_bert()
+        else:
+            print("Less than two GPUs found, not running tests!")
+    finally:
+        state.destroy_process_group()

accelerate/test_utils/scripts/external_deps/test_zero3_integration.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch.distributed
+from accelerate.test_utils import require_huggingface_suite, torch_device
+from accelerate.utils import is_transformers_available
+if is_transformers_available():
+    from transformers import AutoModel, TrainingArguments
+# Model identifier passed to `AutoModel.from_pretrained` in `init_torch_dist_then_launch_deepspeed`.
+GPT2_TINY = "sshleifer/tiny-gpt2"
+@require_huggingface_suite
+def init_torch_dist_then_launch_deepspeed():
+    if torch_device == "xpu":
+        backend = "xccl"
+    elif torch_device == "hpu":
+        backend = "hccl"
+    else:
+        backend = "nccl"
+    torch.distributed.init_process_group(backend=backend)
+    deepspeed_config = {
+        "zero_optimization": {
+            "stage": 3,
+        },
+        "train_batch_size": "auto",
+        "train_micro_batch_size_per_gpu": "auto",
+    }
+    train_args = TrainingArguments(
+        output_dir="./",
+        deepspeed=deepspeed_config,
+    )
+    model = AutoModel.from_pretrained(GPT2_TINY)
+    assert train_args is not None
+    assert model is not None
+def main():
+    """Script entry point: run `init_torch_dist_then_launch_deepspeed`."""
+    init_torch_dist_then_launch_deepspeed()
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/test_cli.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from accelerate.utils import is_xpu_available
+def main():
+    accelerator_type = "GPU"
+    num_accelerators = 0
+    if torch.cuda.is_available():
+        num_accelerators = torch.cuda.device_count()
+        accelerator_type = "GPU"
+    elif is_xpu_available():
+        num_accelerators = torch.xpu.device_count()
+        accelerator_type = "XPU"
+    print(f"Successfully ran on {num_accelerators} {accelerator_type}s")
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/test_ddp_comm_hook.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs, PartialState
+from accelerate.utils import is_hpu_available
+class MockModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        torch.manual_seed(0)
+        self.p = torch.nn.Parameter(torch.randn(40, 20))
+    def forward(self, x, rank):
+        return self.p * (x ** (1 + rank))
+def _run_and_get_grads(model, rank):
+    torch.manual_seed(2024)
+    input = torch.randn(40, 20)
+    output = model(input, rank)
+    output.mean().backward()
+    param = next(model.parameters())
+    return param.grad
+def test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option):
+    ddp_kwargs = DistributedDataParallelKwargs(
+        comm_hook=comm_hook,
+        comm_wrapper=comm_wrapper,
+        comm_state_option=comm_state_option,
+    )
+    accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
+    model = accelerator.prepare(MockModel())
+    hook_grads = _run_and_get_grads(model, accelerator.local_process_index)
+    reference_model = torch.nn.parallel.DistributedDataParallel(
+        MockModel().to(accelerator.device),
+        device_ids=[accelerator.local_process_index],
+        output_device=accelerator.local_process_index,
+    )
+    reference_grads = _run_and_get_grads(reference_model, accelerator.local_process_index)
+    torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-2, atol=1e-2)
+def main():
+    for comm_hook, comm_wrapper, comm_state_option in [
+        (DDPCommunicationHookType.NO, DDPCommunicationHookType.NO, {}),
+        (DDPCommunicationHookType.FP16, DDPCommunicationHookType.NO, {}),
+        (DDPCommunicationHookType.BF16, DDPCommunicationHookType.NO, {}),
+        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {}),
+        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.FP16, {}),
+        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.BF16, {}),
+        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {"matrix_approximation_rank": 2}),
+        (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.NO, {}),
+        (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.FP16, {}),
+        (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.BF16, {}),
+    ]:
+        if is_hpu_available():
+            HPU_UNSUPPORTED_COMM_HOOKS = {DDPCommunicationHookType.FP16, DDPCommunicationHookType.BF16}
+            if comm_hook in HPU_UNSUPPORTED_COMM_HOOKS or comm_wrapper in HPU_UNSUPPORTED_COMM_HOOKS:
+                print(f"Skipping test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper} on HPU")
+                continue
+        print(f"Test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper}")
+        test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option)
+    PartialState().destroy_process_group()
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/test_distributed_data_loop.py ADDED Viewed

	@@ -0,0 +1,429 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pickle
+import tempfile
+import warnings
+from unittest.mock import Mock
+import torch
+from torch.utils.data import (
+    BatchSampler,
+    DataLoader,
+    Dataset,
+    IterableDataset,
+    RandomSampler,
+    TensorDataset,
+    default_collate,
+)
+from accelerate.accelerator import Accelerator, DataLoaderConfiguration
+from accelerate.utils.dataclasses import DistributedType
+NUM_ELEMENTS = 22
+NUM_WORKERS = 4
+BATCH_SIZE = 4
+class DummyDataset(Dataset):
+    """
+    Map-style dataset of `NUM_ELEMENTS` synthetic samples.
+    Every sample is a dict with its own `index`, a parity `label` (`index % 2`) and a freshly drawn
+    `random_augmentation` value.
+    """
+    def __len__(self):
+        return NUM_ELEMENTS
+    def __getitem__(self, index):
+        """
+        Return a single sample dict for an `int` index, or a list of sample dicts for a slice or any iterable of
+        indices.
+        """
+        squeeze = False
+        if isinstance(index, int):
+            index = [index]
+            squeeze = True
+        elif isinstance(index, slice):
+            # Resolve the slice against the dataset length; the class defines no `size` attribute.
+            index = list(range(*index.indices(len(self))))
+        else:
+            index = list(index)
+        batch = [{"index": i, "label": i % 2, "random_augmentation": torch.rand(1).item()} for i in index]
+        if squeeze:
+            batch = batch[0]
+        return batch
+class DummyIterableDataset(IterableDataset):
+    """
+    Iterable-style dataset that yields the elements of the wrapped `data` in order.
+    """
+    def __init__(self, data):
+        # Kept as-is; it is iterated afresh on every `__iter__` call.
+        self.data = data
+    def __iter__(self):
+        yield from self.data
+def create_accelerator(even_batches=True):
+    """
+    Build an `Accelerator` whose dataloader configuration uses the given `even_batches` setting.
+    Asserts that exactly two processes are running, which the expected values in this script rely on.
+    """
+    dataloader_config = DataLoaderConfiguration(even_batches=even_batches)
+    accelerator = Accelerator(dataloader_config=dataloader_config)
+    assert accelerator.num_processes == 2, "this script expects that two GPUs are available"
+    return accelerator
+def create_dataloader(
+    accelerator: Accelerator, dataset_size: int, batch_size: int, iterable: bool = False, shuffle: bool = False
+):
+    """
+    Create a simple DataLoader to use during the test cases
+    The dataset holds the integers `0..dataset_size - 1`, permuted once up front when `shuffle=True`, wrapped in a
+    `DummyIterableDataset` when `iterable=True` and in a `TensorDataset` otherwise. The returned dataloader has
+    already been passed through `accelerator.prepare`.
+    """
+    values = torch.as_tensor(range(dataset_size))
+    if shuffle:
+        values = values[torch.randperm(values.size(0))]
+    if iterable:
+        dataset = DummyIterableDataset(values)
+    else:
+        # Reuse `values` so that `shuffle=True` is honoured for map-style datasets as well.
+        dataset = TensorDataset(values)
+    dl = DataLoader(dataset, batch_size=batch_size)
+    dl = accelerator.prepare(dl)
+    return dl
+def verify_dataloader_batch_sizes(
+    accelerator: Accelerator,
+    dataset_size: int,
+    batch_size: int,
+    process_0_expected_batch_sizes: list[int],
+    process_1_expected_batch_sizes: list[int],
+):
+    """
+    Iterate a freshly prepared dataloader and compare the batch sizes seen by this process with the expectation
+    given for its process index (only indices 0 and 1 are checked).
+    """
+    loader = create_dataloader(accelerator=accelerator, dataset_size=dataset_size, batch_size=batch_size)
+    observed_sizes = []
+    for batch in loader:
+        # Each batch is a tuple whose first entry is the tensor of values.
+        observed_sizes.append(len(batch[0]))
+    if accelerator.process_index == 0:
+        assert observed_sizes == process_0_expected_batch_sizes
+    elif accelerator.process_index == 1:
+        assert observed_sizes == process_1_expected_batch_sizes
+def test_default_ensures_even_batch_sizes():
+    """
+    With the default `even_batches=True`, both processes must see the same number of equally sized batches.
+    """
+    accelerator = create_accelerator()
+    # without padding, we would expect a different number of batches
+    verify_dataloader_batch_sizes(
+        accelerator,
+        dataset_size=3,
+        batch_size=1,
+        process_0_expected_batch_sizes=[1, 1],
+        process_1_expected_batch_sizes=[1, 1],
+    )
+    # without padding, we would expect the same number of batches, but different sizes
+    verify_dataloader_batch_sizes(
+        accelerator,
+        dataset_size=7,
+        batch_size=2,
+        process_0_expected_batch_sizes=[2, 2],
+        process_1_expected_batch_sizes=[2, 2],
+    )
+def test_can_disable_even_batches():
+    """
+    With `even_batches=False`, the second process is expected to end up with fewer batches or a smaller last batch.
+    """
+    accelerator = create_accelerator(even_batches=False)
+    # 3 samples, batch size 1: process 1 gets one batch fewer than process 0
+    verify_dataloader_batch_sizes(
+        accelerator,
+        dataset_size=3,
+        batch_size=1,
+        process_0_expected_batch_sizes=[1, 1],
+        process_1_expected_batch_sizes=[1],
+    )
+    # 7 samples, batch size 2: same number of batches, but the last one on process 1 is smaller
+    verify_dataloader_batch_sizes(
+        accelerator,
+        dataset_size=7,
+        batch_size=2,
+        process_0_expected_batch_sizes=[2, 2],
+        process_1_expected_batch_sizes=[2, 1],
+    )
+def test_can_join_uneven_inputs():
+    """
+    Run forward/backward passes inside `join_uneven_inputs` while the two processes iterate over a different number
+    of batches, then check which batch indices each process saw.
+    """
+    accelerator = create_accelerator(even_batches=False)
+    model = torch.nn.Linear(1, 1)
+    ddp_model = accelerator.prepare(model)
+    dl = create_dataloader(accelerator, dataset_size=3, batch_size=1)
+    batch_idxs = []
+    with accelerator.join_uneven_inputs([ddp_model]):
+        for batch_idx, batch in enumerate(dl):
+            output = ddp_model(batch[0].float())
+            loss = output.sum()
+            loss.backward()
+            batch_idxs.append(batch_idx)
+    accelerator.wait_for_everyone()
+    # 3 samples over 2 processes without padding: two batches on process 0, one on process 1
+    if accelerator.process_index == 0:
+        assert batch_idxs == [0, 1]
+    elif accelerator.process_index == 1:
+        assert batch_idxs == [0]
+def test_join_raises_warning_for_non_ddp_distributed(accelerator):
+    """
+    Check that entering `join_uneven_inputs` emits a `UserWarning` mentioning that joining is only supported for
+    multi-GPU setups. The caller is expected to have switched `accelerator` to a non-DDP distributed type first.
+    """
+    expected_text = "only supported for multi-GPU"
+    with warnings.catch_warnings(record=True) as w:
+        # Record every warning regardless of the ambient filters or earlier emissions from the same location.
+        warnings.simplefilter("always")
+        with accelerator.join_uneven_inputs([Mock()]):
+            pass
+        # Search all records instead of only the last one, so unrelated warnings cannot hide the expected one.
+        assert any(
+            issubclass(record.category, UserWarning) and expected_text in str(record.message) for record in w
+        ), f"expected a UserWarning containing {expected_text!r}, got: {[str(record.message) for record in w]}"
+def test_join_can_override_even_batches():
+    """
+    Check that `join_uneven_inputs(..., even_batches=...)` overrides `batch_sampler.even_batches` on the previously
+    prepared dataloaders inside the context, and that the default value is back once the context has exited.
+    """
+    default_even_batches = True
+    overridden_even_batches = False
+    accelerator = create_accelerator(even_batches=default_even_batches)
+    model = torch.nn.Linear(1, 1)
+    ddp_model = accelerator.prepare(model)
+    train_dl = create_dataloader(accelerator, dataset_size=3, batch_size=1)
+    valid_dl = create_dataloader(accelerator, dataset_size=3, batch_size=1)
+    with accelerator.join_uneven_inputs([ddp_model], even_batches=overridden_even_batches):
+        # Capture the values while the override is active; they are asserted after the context exits.
+        train_dl_overridden_value = train_dl.batch_sampler.even_batches
+        valid_dl_overridden_value = valid_dl.batch_sampler.even_batches
+    assert train_dl_overridden_value == overridden_even_batches
+    assert valid_dl_overridden_value == overridden_even_batches
+    assert train_dl.batch_sampler.even_batches == default_even_batches
+    assert valid_dl.batch_sampler.even_batches == default_even_batches
+def test_join_can_override_for_mixed_type_dataloaders():
+    """
+    Check that overriding `even_batches` in `join_uneven_inputs` still works for a map-style dataloader when an
+    iterable dataloader has been prepared as well, and that no `AttributeError` escapes while doing so.
+    """
+    default_even_batches = True
+    overridden_even_batches = False
+    accelerator = create_accelerator(even_batches=default_even_batches)
+    model = torch.nn.Linear(1, 1)
+    ddp_model = accelerator.prepare(model)
+    # Prepared only for its side effect of registering an iterable dataloader with the accelerator.
+    create_dataloader(accelerator, dataset_size=3, batch_size=1, iterable=True)
+    batch_dl = create_dataloader(accelerator, dataset_size=3, batch_size=1)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore")
+        try:
+            with accelerator.join_uneven_inputs([ddp_model], even_batches=overridden_even_batches):
+                batch_dl_overridden_value = batch_dl.batch_sampler.even_batches
+        except AttributeError as err:
+            # ensure attribute error is not raised when processing iterable dl
+            raise AssertionError(
+                "join_uneven_inputs raised AttributeError while an iterable dataloader was prepared"
+            ) from err
+    assert batch_dl_overridden_value == overridden_even_batches
+    assert batch_dl.batch_sampler.even_batches == default_even_batches
+def test_join_raises_warning_for_iterable_when_overriding_even_batches():
+    """
+    Check that overriding `even_batches` in `join_uneven_inputs` emits a `UserWarning` about map-style datasets
+    when an iterable dataloader has been prepared.
+    """
+    expected_text = "only supported for map-style datasets"
+    accelerator = create_accelerator()
+    model = torch.nn.Linear(1, 1)
+    ddp_model = accelerator.prepare(model)
+    # Prepared only for its side effect of registering an iterable dataloader with the accelerator.
+    create_dataloader(accelerator, dataset_size=3, batch_size=1, iterable=True)
+    with warnings.catch_warnings(record=True) as w:
+        # Record every warning regardless of the ambient filters or earlier emissions from the same location.
+        warnings.simplefilter("always")
+        with accelerator.join_uneven_inputs([ddp_model], even_batches=False):
+            pass
+        # Search all records instead of only the last one, so unrelated warnings cannot hide the expected one.
+        assert any(
+            issubclass(record.category, UserWarning) and expected_text in str(record.message) for record in w
+        ), f"expected a UserWarning containing {expected_text!r}, got: {[str(record.message) for record in w]}"
+def test_pickle_accelerator():
+    """
+    Round-trip an `Accelerator` that has prepared a dataloader through `pickle` and compare the `state.__dict__`
+    of the original and the restored object.
+    """
+    accelerator = create_accelerator()
+    data_loader = create_dataloader(accelerator, dataset_size=32, batch_size=4)
+    # `create_dataloader` already returns a prepared dataloader; it is passed through `prepare` once more here.
+    _ = accelerator.prepare(data_loader)
+    pickled_accelerator = pickle.dumps(accelerator)
+    unpickled_accelerator = pickle.loads(pickled_accelerator)
+    # TODO: Maybe this should be implemented as __eq__ for AcceleratorState?
+    assert accelerator.state.__dict__ == unpickled_accelerator.state.__dict__
+def test_data_loader(data_loader, accelerator):
+    """
+    Prepare `data_loader`, gather the `index` field of every batch across processes with `gather_for_metrics`,
+    and assert that the number of distinct indices seen equals `NUM_ELEMENTS`.
+    """
+    prepared_loader = accelerator.prepare(data_loader)
+    seen_indices = []
+    for batch in prepared_loader:
+        gathered_index, _ = accelerator.gather_for_metrics((batch["index"], batch["label"]))
+        seen_indices += gathered_index.detach().cpu().numpy().tolist()
+    # Duplicates collapse in the set, so only distinct dataset indices are counted.
+    distinct_indices = set(sorted(seen_indices))
+    assert len(distinct_indices) == NUM_ELEMENTS, (
+        "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
+    )
+def _test_stateful_dataloader_resume(accelerator, iterable):
+    """
+    Helper: iterate a stateful dataloader, capture its state with `state_dict` when the third batch (step 2) is
+    yielded, restore it with `load_state_dict`, and verify the resumed batches match the ones that came after the
+    capture in the first pass.
+    Saves early so many batches remain, exposing any off-by-one in state restoration.
+    Tested with both iterable and map-style datasets to cover different state_dict code paths.
+    """
+    # Swap the dataloader config only for this helper; the previous one is restored in `finally`.
+    old_dataloader_config = accelerator.dataloader_config
+    try:
+        accelerator.dataloader_config = DataLoaderConfiguration(use_stateful_dataloader=True)
+        prepared_dl = create_dataloader(
+            accelerator, dataset_size=32 * accelerator.num_processes, batch_size=4, iterable=iterable, shuffle=True
+        )
+        untrained_batches = []
+        save_step = 2
+        for step, batch in enumerate(prepared_dl):
+            if step == save_step:
+                state_dict = prepared_dl.state_dict()
+            if step > save_step:
+                # Batches after the capture point are the ones a resumed run must reproduce.
+                untrained_batches.append(batch)
+        not_skipped_batches = accelerator.gather(untrained_batches)
+        prepared_dl.load_state_dict(state_dict)
+        resumed_batches = []
+        for batch in prepared_dl:
+            resumed_batches.append(batch)
+        resumed_batches = accelerator.gather(resumed_batches)
+        assert len(not_skipped_batches) == len(resumed_batches), (
+            f"Expected {len(not_skipped_batches)} batches after resume, got {len(resumed_batches)}"
+        )
+        for b1, b2 in zip(not_skipped_batches, resumed_batches):
+            for v1, v2 in zip(b1, b2):
+                assert torch.equal(v1, v2), f"Batch {b1} and {b2} are not equal"
+    finally:
+        accelerator.dataloader_config = old_dataloader_config
+def test_stateful_dataloader(accelerator):
+    """
+    Tests that a stateful dataloader can be iterated over, have its state captured after a few batches using
+    `state_dict`, and then be resumed from that state using `load_state_dict`.
+    The result should be the same as the rest of the data that was iterated over after saving.
+    """
+    _test_stateful_dataloader_resume(accelerator, iterable=True)
+    _test_stateful_dataloader_resume(accelerator, iterable=False)
+def _test_stateful_dataloader_save_state_resume(accelerator, iterable):
+    """
+    Helper: iterate a stateful dataloader, save state after a few batches using `Accelerator.save_state`,
+    resume with `Accelerator.load_state`, and verify the resumed batches match what was originally unseen.
+    The checkpoint is written to a temporary directory that is removed when the helper returns.
+    """
+    # Swap the dataloader config only for this helper; the previous one is restored in `finally`.
+    old_dataloader_config = accelerator.dataloader_config
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            accelerator.dataloader_config = DataLoaderConfiguration(use_stateful_dataloader=True)
+            prepared_dl = create_dataloader(
+                accelerator, dataset_size=32 * accelerator.num_processes, batch_size=4, iterable=iterable, shuffle=True
+            )
+            untrained_batches = []
+            save_step = 2
+            for step, batch in enumerate(prepared_dl):
+                if step == save_step:
+                    accelerator.save_state(tmpdir)
+                if step > save_step:
+                    # Batches after the save point are the ones a resumed run must reproduce.
+                    untrained_batches.append(batch)
+            not_skipped_batches = accelerator.gather(untrained_batches)
+            accelerator.load_state(tmpdir)
+            resumed_batches = []
+            for batch in prepared_dl:
+                resumed_batches.append(batch)
+            resumed_batches = accelerator.gather(resumed_batches)
+            assert len(not_skipped_batches) == len(resumed_batches), (
+                f"Expected {len(not_skipped_batches)} batches after resume, got {len(resumed_batches)}"
+            )
+            for b1, b2 in zip(not_skipped_batches, resumed_batches):
+                for v1, v2 in zip(b1, b2):
+                    assert torch.equal(v1, v2), f"Batch {b1} and {b2} are not equal"
+    finally:
+        accelerator.dataloader_config = old_dataloader_config
+def test_stateful_dataloader_save_state(accelerator):
+    """
+    Tests that a stateful dataloader can be iterated over, saved after a few batches using `Accelerator.save_state`,
+    and then resumed from the saved state.
+    The result should be the same as the rest of the data that was iterated over after saving.
+    """
+    _test_stateful_dataloader_save_state_resume(accelerator, iterable=True)
+    _test_stateful_dataloader_save_state_resume(accelerator, iterable=False)
+def main():
+    """
+    Run every check of this script in sequence on a two-process setup, then end training.
+    """
+    accelerator = create_accelerator()
+    # Seed each process with its own index.
+    torch.manual_seed(accelerator.process_index)
+    accelerator.print("Test that even_batches variable ensures uniform batches across processes")
+    test_default_ensures_even_batch_sizes()
+    accelerator.print("Run tests with even_batches disabled")
+    test_can_disable_even_batches()
+    accelerator.print("Test joining uneven inputs")
+    test_can_join_uneven_inputs()
+    accelerator.print("Test overriding even_batches when joining uneven inputs")
+    test_join_can_override_even_batches()
+    accelerator.print("Test overriding even_batches for mixed dataloader types")
+    test_join_can_override_for_mixed_type_dataloaders()
+    accelerator.print("Test overriding even_batches raises a warning for iterable dataloaders")
+    test_join_raises_warning_for_iterable_when_overriding_even_batches()
+    accelerator.print("Test join with non DDP distributed raises warning")
+    # Temporarily report FSDP as the distributed type so the non-DDP warning path is taken, then restore it.
+    original_state = accelerator.state.distributed_type
+    accelerator.state.distributed_type = DistributedType.FSDP
+    test_join_raises_warning_for_non_ddp_distributed(accelerator)
+    accelerator.state.distributed_type = original_state
+    accelerator.print("Test pickling an accelerator")
+    test_pickle_accelerator()
+    dataset = DummyDataset()
+    accelerator.print("Test DataLoader with shuffle=False")
+    loader = DataLoader(dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+    accelerator.print("Test DataLoader with shuffle=True")
+    loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+    accelerator.print("Test DataLoader with batch_sampler")
+    sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
+    loader = DataLoader(dataset, batch_sampler=sampler, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+    accelerator.print("Test DataLoader with sampler as an instance of `BatchSampler`")
+    sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
+    # `batch_size=None` hands whole index batches from the sampler to the dataset, hence the explicit collate_fn.
+    loader = DataLoader(dataset, sampler=sampler, batch_size=None, collate_fn=default_collate, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+    test_stateful_dataloader(accelerator)
+    test_stateful_dataloader_save_state(accelerator)
+    accelerator.end_training()
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/test_merge_weights.py ADDED Viewed

	@@ -0,0 +1,158 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+import logging
+import shutil
+from pathlib import Path
+import torch
+from safetensors.torch import load_file
+from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy, StateDictType
+from torch.utils.data import DataLoader
+from accelerate import Accelerator, FullyShardedDataParallelPlugin
+from accelerate.commands.merge import merge_command, merge_command_parser
+from accelerate.state import AcceleratorState
+from accelerate.test_utils import torch_device
+from accelerate.test_utils.training import RegressionDataset
+from accelerate.utils import merge_fsdp_weights, patch_environment, save_fsdp_model
+logging.basicConfig(level=logging.INFO)
+parser = merge_command_parser()
+class TinyModel(torch.nn.Module):
+    """
+    Small two-layer model (16 -> 16 -> 16) with a ReLU in between, used as the FSDP test subject.
+    """
+    def __init__(self):
+        super().__init__()
+        self.linear1 = torch.nn.Linear(16, 16)
+        self.activation = torch.nn.ReLU()
+        self.linear2 = torch.nn.Linear(16, 16)
+        # Registered as a submodule but not used in `forward`.
+        self.softmax = torch.nn.Softmax()
+    def forward(self, x):
+        return self.linear2(self.activation(self.linear1(x)))
+def setup():
+    """
+    Reset any existing `AcceleratorState`, then create an FSDP plugin (full sharding, sharded state dict), an
+    `Accelerator` using it and a prepared `TinyModel`.
+    Returns the tuple `(model, plugin, accelerator)`.
+    """
+    if AcceleratorState._shared_state != {}:
+        AcceleratorState()._reset_state()
+    plugin = FullyShardedDataParallelPlugin(
+        sharding_strategy=ShardingStrategy.FULL_SHARD, state_dict_type=StateDictType.SHARDED_STATE_DICT
+    )
+    model = TinyModel()
+    # The auto-wrap policy is selected via the patched environment while it is being set on the plugin.
+    with patch_environment(fsdp_auto_wrap_policy="SIZE_BASED_WRAP"):
+        plugin.set_auto_wrap_policy(model)
+    accelerator = Accelerator(fsdp_plugin=plugin)
+    model = accelerator.prepare(model)
+    return model, plugin, accelerator
+def mock_training(accelerator, model):
+    """
+    Run three passes over a small `RegressionDataset` with SGD and an MSE loss so the model weights change.
+    Returns the model as prepared by `accelerator`.
+    """
+    train_set = RegressionDataset(length=128, seed=42)
+    train_dl = DataLoader(train_set, batch_size=16, shuffle=False)
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+    train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
+    for _ in range(3):
+        for batch in train_dl:
+            model.zero_grad()
+            output = model(batch["x"])
+            loss = torch.nn.functional.mse_loss(output, batch["y"])
+            accelerator.backward(loss)
+            optimizer.step()
+    return model
+def check_weights(operation, state_1, state_2):
+    """
+    Compare two state dicts value by value, in iteration order.
+    With `operation == "same"` every pair must be close; with any other value every pair must differ.
+    """
+    expect_close = operation == "same"
+    for left, right in zip(state_1.values(), state_2.values()):
+        if expect_close:
+            assert torch.allclose(left, right)
+            continue
+        assert not torch.allclose(left, right)
+def check_safetensors_weights(path, model):
+    """
+    Load `path / "model.safetensors"` into a fresh `TinyModel` and check that its weights differ from `model`
+    before loading and match `model` afterwards.
+    """
+    safe_state_dict = load_file(path / "model.safetensors")
+    safe_loaded_model = TinyModel().to(torch_device)
+    # A freshly initialised model must not already match, otherwise the "same" check below proves nothing.
+    check_weights("diff", model.state_dict(), safe_loaded_model.state_dict())
+    safe_loaded_model.load_state_dict(safe_state_dict)
+    check_weights("same", model.state_dict(), safe_loaded_model.state_dict())
+def check_pytorch_weights(path, model):
+    """
+    Load `path / "pytorch_model.bin"` into a fresh `TinyModel` and check that its weights differ from `model`
+    before loading and match `model` afterwards.
+    """
+    nonsafe_state_dict = torch.load(path / "pytorch_model.bin", weights_only=True)
+    nonsafe_loaded_model = TinyModel().to(torch_device)
+    # A freshly initialised model must not already match, otherwise the "same" check below proves nothing.
+    check_weights("diff", model.state_dict(), nonsafe_loaded_model.state_dict())
+    nonsafe_loaded_model.load_state_dict(nonsafe_state_dict)
+    check_weights("same", model.state_dict(), nonsafe_loaded_model.state_dict())
+def test_merge_weights_safetensors(model, path):
+    """Merge the sharded checkpoint with `merge_fsdp_weights` (safetensors output) and verify the result."""
+    # Should now be saved at `path/model.safetensors`
+    merge_fsdp_weights(path / "pytorch_model_fsdp_0", path, safe_serialization=True)
+    check_safetensors_weights(path, model)
+def test_merge_weights_command_safetensors(model, path):
+    """Merge the sharded checkpoint through the CLI `merge_command` with default arguments and verify the result."""
+    args = parser.parse_args([str(path / "pytorch_model_fsdp_0"), str(path)])
+    merge_command(args)
+    check_safetensors_weights(path, model)
+def test_merge_weights_pytorch(model, path):
+    """Merge the sharded checkpoint with `merge_fsdp_weights` (non-safetensors output) and verify the result."""
+    # Should now be saved at `path/pytorch_model.bin`
+    merge_fsdp_weights(path / "pytorch_model_fsdp_0", path, safe_serialization=False)
+    check_pytorch_weights(path, model)
+def test_merge_weights_command_pytorch(model, path):
+    """Merge the sharded checkpoint through the CLI `merge_command` with `--unsafe_serialization` and verify it."""
+    args = parser.parse_args([str(path / "pytorch_model_fsdp_0"), str(path), "--unsafe_serialization"])
+    merge_command(args)
+    check_pytorch_weights(path, model)
+if __name__ == "__main__":
+    # Note this test requires at least two accelerators!
+    model, plugin, accelerator = setup()
+    if accelerator.num_processes > 1:
+        # Bound before the `try` so the cleanup in `finally` can always reference it.
+        out_path = Path("test_merge_weights_fsdp_weights")
+        try:
+            # Initial setup for things (`exist_ok=True` already tolerates an existing directory)
+            out_path.mkdir(parents=True, exist_ok=True)
+            # Train briefly so the weights aren't the baseline
+            model = mock_training(accelerator, model)
+            accelerator.wait_for_everyone()
+            gc.collect()  # Needed for some lingering refs after training
+            save_fsdp_model(plugin, accelerator, model, out_path)
+            accelerator.wait_for_everyone()
+            # Finally we can test
+            test_merge_weights_safetensors(model, out_path)
+            test_merge_weights_command_safetensors(model, out_path)
+            test_merge_weights_pytorch(model, out_path)
+            test_merge_weights_command_pytorch(model, out_path)
+        finally:
+            # Cleanup in case of any failures
+            if accelerator.is_main_process:
+                # Best-effort removal: a cleanup error must not mask the original test failure.
+                shutil.rmtree(out_path, ignore_errors=True)
+            accelerator.wait_for_everyone()
+            accelerator.end_training()

accelerate/test_utils/scripts/test_notebook.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Test file to ensure that in general certain situational setups for notebooks work.
+"""
+import os
+import time
+from pytest import mark, raises
+from torch.distributed.elastic.multiprocessing.errors import ChildFailedError
+from accelerate import PartialState, notebook_launcher
+from accelerate.test_utils import require_bnb
+from accelerate.utils import is_bnb_available, is_xpu_available
+def basic_function():
+    """Minimal launch target for `notebook_launcher`."""
+    # Just prints the PartialState
+    print(f"PartialState:\n{PartialState()}")
+def tough_nut_function(queue):
+    """
+    Launch target that fails until the counter stored in `queue` has been counted down to zero.
+    Returns immediately if the queue is empty. Otherwise it takes the counter; when positive, it puts back the
+    counter minus one and raises `RuntimeError`; when zero, it prints the `PartialState`.
+    """
+    if queue.empty():
+        return
+    trial = queue.get()
+    if trial > 0:
+        queue.put(trial - 1)
+        raise RuntimeError("The nut hasn't cracked yet! Try again.")
+    print(f"PartialState:\n{PartialState()}")
+def bipolar_sleep_function(sleep_sec: int):
+    """
+    Launch target that raises `RuntimeError` on even process indices and sleeps for `sleep_sec` seconds on odd ones.
+    """
+    on_even_process = PartialState().process_index % 2 == 0
+    if on_even_process:
+        raise RuntimeError("I'm an even process. I don't like to sleep.")
+    time.sleep(sleep_sec)
+NUM_PROCESSES = int(os.environ.get("ACCELERATE_NUM_PROCESSES", 1))
+def test_can_initialize():
+    """Launch `basic_function` on `NUM_PROCESSES` processes with default launcher settings."""
+    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES)
+@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test static rendezvous backends")
+def test_static_rdzv_backend():
+    """Launch `basic_function` with `rdzv_backend="static"`."""
+    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES, rdzv_backend="static")
+@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test c10d rendezvous backends")
+def test_c10d_rdzv_backend():
+    """Launch `basic_function` with `rdzv_backend="c10d"`."""
+    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES, rdzv_backend="c10d")
+@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test fault tolerance")
+def test_fault_tolerant(max_restarts: int = 3):
+    """
+    Launch `tough_nut_function` with a shared queue seeded with `max_restarts`, allowing the launcher the same
+    number of restarts.
+    """
+    # Use torch.multiprocessing to get the right context for the current device
+    import torch.multiprocessing as mp
+    # Get appropriate context - 'spawn' for XPU, 'fork' for others
+    start_method = "spawn" if is_xpu_available() else "fork"
+    restart_counter = mp.get_context(start_method).Queue()
+    restart_counter.put(max_restarts)
+    notebook_launcher(tough_nut_function, (restart_counter,), num_processes=NUM_PROCESSES, max_restarts=max_restarts)
+@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test monitoring")
+def test_monitoring(monitor_interval: float = 0.01, sleep_sec: int = 100):
+    """
+    Launch `bipolar_sleep_function` and expect a `ChildFailedError` carrying the even-process error message.
+    The launch must return in less than `sleep_sec` seconds, i.e. before the sleeping processes would finish.
+    """
+    start_time = time.time()
+    with raises(ChildFailedError, match="I'm an even process. I don't like to sleep."):
+        notebook_launcher(
+            bipolar_sleep_function,
+            (sleep_sec,),
+            num_processes=NUM_PROCESSES,
+            monitor_interval=monitor_interval,
+        )
+    assert time.time() - start_time < sleep_sec, "Monitoring did not stop the process in time."
+@require_bnb
+def test_problematic_imports():
+    """
+    Expect a `RuntimeError` matching "Please keep these imports" when `notebook_launcher` is called after
+    `bitsandbytes` has been imported.
+    """
+    with raises(RuntimeError, match="Please keep these imports"):
+        import bitsandbytes as bnb  # noqa: F401
+        notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES)
+def main():
+    """
+    Run the notebook launcher checks in sequence when this file is executed as a script.
+    """
+    # NOTE(review): the functions are called directly here, which presumably bypasses their `skipif` markers;
+    # confirm this script is only ever run with `ACCELERATE_NUM_PROCESSES` >= 2.
+    print("Test basic notebook can be ran")
+    test_can_initialize()
+    print("Test static rendezvous backend")
+    test_static_rdzv_backend()
+    print("Test c10d rendezvous backend")
+    test_c10d_rdzv_backend()
+    print("Test fault tolerant")
+    test_fault_tolerant()
+    print("Test monitoring")
+    test_monitoring()
+    if is_bnb_available():
+        print("Test problematic imports (bnb)")
+        test_problematic_imports()
+    if NUM_PROCESSES > 1:
+        PartialState().destroy_process_group()
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/test_ops.py ADDED Viewed

	@@ -0,0 +1,181 @@

+#!/usr/bin/env python
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from accelerate import PartialState
+from accelerate.test_utils.testing import assert_exception
+from accelerate.utils.dataclasses import DistributedType
+from accelerate.utils.operations import (
+    DistributedOperationException,
+    broadcast,
+    copy_tensor_to_devices,
+    gather,
+    gather_object,
+    pad_across_processes,
+    reduce,
+)
+def create_tensor(state):
+    """
+    Build a float tensor of `num_processes` consecutive values on `state.device`.
+    Process `i` gets the values `i * num_processes + 1` through `(i + 1) * num_processes`.
+    """
+    world_size = state.num_processes
+    rank_offset = world_size * state.process_index
+    values = torch.arange(world_size) + 1.0 + rank_offset
+    return values.to(state.device)
+def test_gather(state):
+    """Gather each process's `create_tensor` output and expect the values 1..num_processes**2 in order."""
+    tensor = create_tensor(state)
+    gathered_tensor = gather(tensor)
+    assert gathered_tensor.tolist() == list(range(1, state.num_processes**2 + 1))
+def test_gather_object(state):
+    """Gather a one-element list holding the process index and expect `[0, 1, ..., num_processes - 1]`."""
+    # Gather objects in TorchXLA is not supported.
+    if state.distributed_type == DistributedType.XLA:
+        return
+    obj = [state.process_index]
+    gathered_obj = gather_object(obj)
+    assert len(gathered_obj) == state.num_processes, f"{gathered_obj}, {len(gathered_obj)} != {state.num_processes}"
+    assert gathered_obj == list(range(state.num_processes)), f"{gathered_obj} != {list(range(state.num_processes))}"
+def test_gather_non_contiguous(state):
+    """Check that `gather` accepts a non-contiguous tensor without raising."""
+    # Skip this test because the 'is_contiguous' function of XLA tensor always returns True.
+    if state.distributed_type == DistributedType.XLA:
+        return
+    # Create a non-contiguous tensor (enforce non-contiguity after device memory allocation)
+    tensor = torch.arange(12, device=state.device).view(4, 3).t()
+    assert not tensor.is_contiguous()
+    # Shouldn't error out
+    _ = gather(tensor)
+def test_broadcast(state):
+    """
+    Broadcast each process's tensor and expect every process to end up with the values 1..num_processes.
+    """
+    tensor = create_tensor(state)
+    broadcasted_tensor = broadcast(tensor)
+    assert broadcasted_tensor.shape == torch.Size([state.num_processes])
+    assert broadcasted_tensor.tolist() == list(range(1, state.num_processes + 1))
+def test_pad_across_processes(state):
+    """
+    Give the main process a tensor one element longer than the others and check that `pad_across_processes`
+    brings every tensor to that length, appending a `0` on the non-main processes.
+    """
+    # We need to pad the tensor with one more element if we are the main process
+    # to ensure that we can pad
+    if state.is_main_process:
+        tensor = torch.arange(state.num_processes + 1).to(state.device)
+    else:
+        tensor = torch.arange(state.num_processes).to(state.device)
+    padded_tensor = pad_across_processes(tensor)
+    assert padded_tensor.shape == torch.Size([state.num_processes + 1])
+    if not state.is_main_process:
+        assert padded_tensor.tolist() == list(range(0, state.num_processes)) + [0]
+def test_reduce_sum(state):
+    """Check `reduce(..., "sum")` on two processes: `[1, 2] + [3, 4]` must give `[4, 6]`."""
+    # For now runs on only two processes
+    if state.num_processes != 2:
+        return
+    tensor = create_tensor(state)
+    reduced_tensor = reduce(tensor, "sum")
+    truth_tensor = torch.tensor([4.0, 6]).to(state.device)
+    assert torch.allclose(reduced_tensor, truth_tensor), f"{reduced_tensor} != {truth_tensor}"
+def test_reduce_mean(state):
+    """Check `reduce(..., "mean")` on two processes: the mean of `[1, 2]` and `[3, 4]` must give `[2, 3]`."""
+    # For now runs on only two processes
+    if state.num_processes != 2:
+        return
+    tensor = create_tensor(state)
+    reduced_tensor = reduce(tensor, "mean")
+    truth_tensor = torch.tensor([2.0, 3]).to(state.device)
+    assert torch.allclose(reduced_tensor, truth_tensor), f"{reduced_tensor} != {truth_tensor}"
+def test_op_checker(state):
+    """
+    With `state.debug` enabled, check that `pad_across_processes`, `reduce` and `broadcast` each raise
+    `DistributedOperationException` when process 0 and the other processes pass tensors of different shapes.
+    `state.debug` is switched back off afterwards, even if one of the checks fails.
+    """
+    # Must be in a distributed state, and gathering is currently not supported in TorchXLA.
+    if state.distributed_type in [DistributedType.NO, DistributedType.XLA]:
+        return
+    state.debug = True
+    try:
+        # `pad_across_processes`
+        if state.process_index == 0:
+            data = {"tensor": torch.tensor([[0.0, 1, 2, 3, 4]]).to(state.device)}
+        else:
+            data = {"tensor": torch.tensor([[[0.0, 1, 2, 3, 4, 5]]]).to(state.device)}
+        with assert_exception(DistributedOperationException):
+            pad_across_processes(data, dim=0)
+        # `reduce`
+        if state.process_index == 0:
+            data = {"tensor": torch.tensor([[0.0, 1, 2, 3, 4]]).to(state.device)}
+        else:
+            data = {"tensor": torch.tensor([[[0.0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]]).to(state.device)}
+        with assert_exception(DistributedOperationException):
+            reduce(data)
+        # `broadcast`
+        if state.process_index == 0:
+            data = {"tensor": torch.tensor([[0.0, 1, 2, 3, 4]]).to(state.device)}
+        else:
+            data = {"tensor": torch.tensor([[[0.0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]]).to(state.device)}
+        with assert_exception(DistributedOperationException):
+            broadcast(data)
+    finally:
+        # Never leave debug mode on for the checks that run after this one.
+        state.debug = False
+def test_copy_tensor_to_devices(state):
+    """
+    Check that `copy_tensor_to_devices` hands the main process's tensor to every process.
+    Only runs for the MULTI_GPU and XLA distributed types.
+    """
+    if state.distributed_type not in [DistributedType.MULTI_GPU, DistributedType.XLA]:
+        return
+    # Only the main process provides data; the others pass `None`.
+    if state.is_main_process:
+        tensor = torch.tensor([1, 2, 3], dtype=torch.int).to(state.device)
+    else:
+        tensor = None
+    tensor = copy_tensor_to_devices(tensor)
+    assert torch.allclose(tensor, torch.tensor([1, 2, 3], dtype=torch.int, device=state.device))
+def _mp_fn(index):
+    """Per-process entry point; `index` is accepted but unused."""
+    # For xla_spawn (TPUs)
+    main()
+def main():
+    """
+    Run every operation check of this script in sequence on a shared `PartialState`, then destroy the process
+    group.
+    """
+    state = PartialState()
+    state.print(f"State: {state}")
+    state.print("testing gather")
+    test_gather(state)
+    state.print("testing gather_object")
+    test_gather_object(state)
+    state.print("testing gather non-contiguous")
+    test_gather_non_contiguous(state)
+    state.print("testing broadcast")
+    test_broadcast(state)
+    state.print("testing pad_across_processes")
+    test_pad_across_processes(state)
+    state.print("testing reduce_sum")
+    test_reduce_sum(state)
+    state.print("testing reduce_mean")
+    test_reduce_mean(state)
+    state.print("testing op_checker")
+    test_op_checker(state)
+    state.print("testing sending tensors across devices")
+    test_copy_tensor_to_devices(state)
+    state.destroy_process_group()
+# Run the checks only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/test_script.py ADDED Viewed

	@@ -0,0 +1,909 @@

+#!/usr/bin/env python
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import io
+import math
+import time
+from copy import deepcopy
+from pathlib import Path
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, Dataset
+from accelerate import Accelerator
+from accelerate.data_loader import SeedableRandomSampler, prepare_data_loader
+from accelerate.state import AcceleratorState
+from accelerate.test_utils import RegressionDataset, RegressionModel, are_the_same_tensors
+from accelerate.utils import (
+    DataLoaderConfiguration,
+    DistributedType,
+    gather,
+    gather_object,
+    is_bf16_available,
+    is_cuda_available,
+    is_datasets_available,
+    is_fp16_available,
+    is_hpu_available,
+    is_mps_available,
+    is_pytest_available,
+    set_seed,
+    synchronize_rng_states,
+)
+if is_hpu_available():
+    ATOL = 1e-3
+    RTOL = 1e-3
+else:
+    ATOL = 1e-6
+    RTOL = 1e-6
+def generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler=False):
+    "Creates a dataloader that can also use the `SeedableRandomSampler`"
+    if use_seedable_sampler:
+        # The SeedableRandomSampler is needed during distributed setups
+        # for full reproducibility across processes with the `DataLoader`
+        sampler = SeedableRandomSampler(
+            generator=generator,
+            data_source=train_set,
+            num_samples=len(train_set),
+        )
+        return DataLoader(train_set, batch_size=batch_size, sampler=sampler)
+    else:
+        return DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
+def print_main(state):
+    print(f"Printing from the main process {state.process_index}")
+def print_local_main(state):
+    print(f"Printing from the local main process {state.local_process_index}")
+def print_last(state):
+    print(f"Printing from the last process {state.process_index}")
+def print_on(state, process_idx):
+    print(f"Printing from process {process_idx}: {state.process_index}")
+def process_execution_check():
+    accelerator = Accelerator()
+    num_processes = accelerator.num_processes
+    # Test main_process_first context manager
+    path = Path("check_main_process_first.txt")
+    with accelerator.main_process_first():
+        if accelerator.is_main_process:
+            time.sleep(0.1)  # ensure main process takes longest
+            with open(path, "a+") as f:
+                f.write("Currently in the main process\n")
+        else:
+            with open(path, "a+") as f:
+                f.write("Now on another process\n")
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        with open(path) as f:
+            text = "".join(f.readlines())
+        try:
+            assert text.startswith("Currently in the main process\n"), "Main process was not first"
+            if num_processes > 1:
+                assert text.endswith("Now on another process\n"), "Main process was not first"
+            assert text.count("Now on another process\n") == accelerator.num_processes - 1, (
+                f"Only wrote to file {text.count('Now on another process') + 1} times, not {accelerator.num_processes}"
+            )
+        except AssertionError:
+            path.unlink()
+            raise
+    if accelerator.is_main_process and path.exists():
+        path.unlink()
+    accelerator.wait_for_everyone()
+    # Test the decorators
+    f = io.StringIO()
+    with contextlib.redirect_stdout(f):
+        accelerator.on_main_process(print_main)(accelerator.state)
+    result = f.getvalue().rstrip()
+    if accelerator.is_main_process:
+        assert result == "Printing from the main process 0", f"{result} != Printing from the main process 0"
+    else:
+        assert f.getvalue().rstrip() == "", f'{result} != ""'
+    f.truncate(0)
+    f.seek(0)
+    with contextlib.redirect_stdout(f):
+        accelerator.on_local_main_process(print_local_main)(accelerator.state)
+    if accelerator.is_local_main_process:
+        assert f.getvalue().rstrip() == "Printing from the local main process 0"
+    else:
+        assert f.getvalue().rstrip() == ""
+    f.truncate(0)
+    f.seek(0)
+    with contextlib.redirect_stdout(f):
+        accelerator.on_last_process(print_last)(accelerator.state)
+    if accelerator.is_last_process:
+        assert f.getvalue().rstrip() == f"Printing from the last process {accelerator.state.num_processes - 1}"
+    else:
+        assert f.getvalue().rstrip() == ""
+    f.truncate(0)
+    f.seek(0)
+    for process_idx in range(num_processes):
+        with contextlib.redirect_stdout(f):
+            accelerator.on_process(print_on, process_index=process_idx)(accelerator.state, process_idx)
+        if accelerator.process_index == process_idx:
+            assert f.getvalue().rstrip() == f"Printing from process {process_idx}: {accelerator.process_index}"
+        else:
+            assert f.getvalue().rstrip() == ""
+        f.truncate(0)
+        f.seek(0)
+def init_state_check():
+    # Test we can instantiate this twice in a row.
+    state = AcceleratorState()
+    if state.local_process_index == 0:
+        print("Testing, testing. 1, 2, 3.")
+    print(state)
+def rng_sync_check():
+    """Synchronize RNG states and assert that they match across processes.
+
+    Covers the torch CPU RNG, the CUDA or XPU RNG when the distributed type is
+    `MULTI_GPU` or `MULTI_XPU`, and a standalone `torch.Generator`.
+    """
+    state = AcceleratorState()
+    synchronize_rng_states(["torch"])
+    assert are_the_same_tensors(torch.get_rng_state()), "RNG states improperly synchronized on CPU."
+    # The device RNG is only checked for the distributed type that uses it.
+    if state.distributed_type == DistributedType.MULTI_GPU:
+        synchronize_rng_states(["cuda"])
+        assert are_the_same_tensors(torch.cuda.get_rng_state()), "RNG states improperly synchronized on GPU."
+    elif state.distributed_type == DistributedType.MULTI_XPU:
+        synchronize_rng_states(["xpu"])
+        assert are_the_same_tensors(torch.xpu.get_rng_state()), "RNG states improperly synchronized on XPU."
+    generator = torch.Generator()
+    synchronize_rng_states(["generator"], generator=generator)
+    assert are_the_same_tensors(generator.get_state()), "RNG states improperly synchronized in generator."
+    if state.local_process_index == 0:
+        print("All rng are properly synched.")
+def dl_preparation_check():
+    state = AcceleratorState()
+    length = 32 * state.num_processes
+    dl = DataLoader(range(length), batch_size=8)
+    dl = prepare_data_loader(dl, state.device, state.num_processes, state.process_index, put_on_device=True)
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result)
+    assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
+    dl = DataLoader(range(length), batch_size=8)
+    dl = prepare_data_loader(
+        dl,
+        state.device,
+        state.num_processes,
+        state.process_index,
+        put_on_device=True,
+        split_batches=True,
+    )
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result)
+    assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
+    if state.process_index == 0:
+        print("Non-shuffled dataloader passing.")
+    dl = DataLoader(range(length), batch_size=8, shuffle=True)
+    dl = prepare_data_loader(dl, state.device, state.num_processes, state.process_index, put_on_device=True)
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result).tolist()
+    result.sort()
+    assert result == list(range(length)), "Wrong shuffled dataloader result."
+    dl = DataLoader(range(length), batch_size=8, shuffle=True)
+    dl = prepare_data_loader(
+        dl,
+        state.device,
+        state.num_processes,
+        state.process_index,
+        put_on_device=True,
+        split_batches=True,
+    )
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result).tolist()
+    result.sort()
+    assert result == list(range(length)), "Wrong shuffled dataloader result."
+    if state.local_process_index == 0:
+        print("Shuffled dataloader passing.")
+def central_dl_preparation_check():
+    state = AcceleratorState()
+    length = 32 * state.num_processes
+    dl = DataLoader(range(length), batch_size=8)
+    dl = prepare_data_loader(
+        dl, state.device, state.num_processes, state.process_index, put_on_device=True, dispatch_batches=True
+    )
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result)
+    assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
+    dl = DataLoader(range(length), batch_size=8)
+    dl = prepare_data_loader(
+        dl,
+        state.device,
+        state.num_processes,
+        state.process_index,
+        put_on_device=True,
+        split_batches=True,
+        dispatch_batches=True,
+    )
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result)
+    assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
+    if state.process_index == 0:
+        print("Non-shuffled central dataloader passing.")
+    dl = DataLoader(range(length), batch_size=8, shuffle=True)
+    dl = prepare_data_loader(
+        dl, state.device, state.num_processes, state.process_index, put_on_device=True, dispatch_batches=True
+    )
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result).tolist()
+    result.sort()
+    assert result == list(range(length)), "Wrong shuffled dataloader result."
+    dl = DataLoader(range(length), batch_size=8, shuffle=True)
+    dl = prepare_data_loader(
+        dl,
+        state.device,
+        state.num_processes,
+        state.process_index,
+        put_on_device=True,
+        split_batches=True,
+        dispatch_batches=True,
+    )
+    result = []
+    for batch in dl:
+        result.append(gather(batch))
+    result = torch.cat(result).tolist()
+    result.sort()
+    assert result == list(range(length)), "Wrong shuffled dataloader result."
+    if state.local_process_index == 0:
+        print("Shuffled central dataloader passing.")
+def custom_sampler_check():
+    state = AcceleratorState()
+    class CustomDataset(Dataset):
+        def __init__(self, data):
+            self.data = data
+        def __len__(self):
+            return len(self.data)
+        def __getitem__(self, index):
+            return self.data[index]
+    class CustomBatchSampler:
+        def __init__(self, dataset_length: int, batch_size: int, shuffle: bool = True):
+            self.batch_size = batch_size
+            self.data_index = np.arange(dataset_length)
+            self.shuffle = shuffle
+        def __iter__(self):
+            num_batches = len(self)
+            if self.shuffle:
+                index = np.random.permutation(self.data_index)
+            else:
+                index = self.data_index
+            output = np.array_split(index, num_batches)
+            yield from output
+        def __len__(self):
+            return math.ceil(len(self.data_index) / self.batch_size)
+    dataset = CustomDataset(range(32 * state.num_processes))
+    sampler = CustomBatchSampler(len(dataset), batch_size=8)
+    dl = DataLoader(dataset, batch_sampler=sampler)
+    dl = prepare_data_loader(dl, state.device, state.num_processes, state.process_index)
+    # We need just ensure that `dl.batch_sampler` (or `dl.batch_sampler.batch_sampler` is indeed the old batch sampler
+    if hasattr(dl.batch_sampler, "batch_sampler"):
+        assert isinstance(dl.batch_sampler.batch_sampler, CustomBatchSampler), (
+            "Custom sampler was changed after calling `prepare_data_loader`"
+        )
+    else:
+        assert isinstance(dl.batch_sampler, CustomBatchSampler), (
+            "Custom sampler was changed after calling `prepare_data_loader`"
+        )
+def check_seedable_sampler():
+    # Set seed
+    set_seed(42)
+    train_set = RegressionDataset(length=10, seed=42)
+    train_dl = DataLoader(train_set, batch_size=2, shuffle=True)
+    config = DataLoaderConfiguration(use_seedable_sampler=True)
+    accelerator = Accelerator(dataloader_config=config)
+    train_dl = accelerator.prepare(train_dl)
+    original_items = []
+    for _ in range(3):
+        for batch in train_dl:
+            original_items.append(batch["x"])
+    original_items = torch.cat(original_items)
+    # Set seed again and the epoch
+    set_seed(42)
+    train_dl.set_epoch(0)
+    new_items = []
+    for _ in range(3):
+        for batch in train_dl:
+            new_items.append(batch["x"])
+    new_items = torch.cat(new_items)
+    assert torch.allclose(original_items, new_items), "Did not obtain the same items with the same seed and epoch."
+def check_seedable_sampler_in_batch_sampler_shard():
+    set_seed(42)
+    config = DataLoaderConfiguration(use_seedable_sampler=True)
+    accelerator = Accelerator(dataloader_config=config)
+    assert accelerator.num_processes > 1, "This test requires more than one process."
+    dataloader = DataLoader(list(range(10)), batch_size=1, shuffle=True)
+    prepared_data_loader = prepare_data_loader(
+        dataloader=dataloader,
+        use_seedable_sampler=True,
+    )
+    target_sampler = prepared_data_loader.batch_sampler.batch_sampler.sampler
+    assert isinstance(target_sampler, SeedableRandomSampler), (
+        "Sampler in BatchSamplerShard is not SeedableRandomSampler."
+    )
+def check_seedable_sampler_with_data_seed():
+    # Set seed
+    set_seed(42)
+    data_seed = 42
+    train_set = RegressionDataset(length=10, seed=42)
+    train_dl = DataLoader(train_set, batch_size=2, shuffle=True)
+    config = DataLoaderConfiguration(use_seedable_sampler=True, data_seed=data_seed)
+    accelerator = Accelerator(dataloader_config=config)
+    prepared_dl = accelerator.prepare(train_dl)
+    original_items = []
+    for _ in range(3):
+        for batch in prepared_dl:
+            original_items.append(batch["x"])
+    original_items = torch.cat(original_items)
+    # Set new data seed
+    config.data_seed = 43
+    accelerator = Accelerator(dataloader_config=config)
+    prepared_dl = accelerator.prepare(train_dl)
+    new_items = []
+    for _ in range(3):
+        for batch in prepared_dl:
+            new_items.append(batch["x"])
+    new_items = torch.cat(new_items)
+    assert not torch.allclose(original_items, new_items), "Obtained the same items with different data seed."
+def mock_training(length, batch_size, generator, use_seedable_sampler=False):
+    set_seed(42)
+    generator.manual_seed(42)
+    train_set = RegressionDataset(length=length, seed=42)
+    train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
+    model = RegressionModel()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+    for epoch in range(3):
+        for batch in train_dl:
+            model.zero_grad()
+            output = model(batch["x"])
+            loss = torch.nn.functional.mse_loss(output, batch["y"])
+            loss.backward()
+            optimizer.step()
+    return train_set, model
+def training_check(use_seedable_sampler=False):
+    state = AcceleratorState()
+    generator = torch.Generator()
+    batch_size = 8
+    length = batch_size * 4 * state.num_processes
+    train_set, old_model = mock_training(length, batch_size * state.num_processes, generator, use_seedable_sampler)
+    assert are_the_same_tensors(old_model.a), "Did not obtain the same model on both processes."
+    assert are_the_same_tensors(old_model.b), "Did not obtain the same model on both processes."
+    accelerator = Accelerator()
+    train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
+    model = RegressionModel()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+    train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
+    set_seed(42)
+    generator.manual_seed(42)
+    for _ in range(3):
+        for batch in train_dl:
+            model.zero_grad()
+            output = model(batch["x"])
+            loss = torch.nn.functional.mse_loss(output, batch["y"])
+            accelerator.backward(loss)
+            optimizer.step()
+    model = accelerator.unwrap_model(model).cpu()
+    torch.testing.assert_close(
+        old_model.a,
+        model.a,
+        atol=ATOL,
+        rtol=RTOL,
+        msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+    )
+    torch.testing.assert_close(
+        old_model.b,
+        model.b,
+        atol=ATOL,
+        rtol=RTOL,
+        msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+    )
+    accelerator.print("Training yielded the same results on one CPU or distributed setup with no batch split.")
+    dataloader_config = DataLoaderConfiguration(split_batches=True, use_seedable_sampler=use_seedable_sampler)
+    accelerator = Accelerator(dataloader_config=dataloader_config)
+    train_dl = generate_baseline_dataloader(
+        train_set, generator, batch_size * state.num_processes, use_seedable_sampler
+    )
+    model = RegressionModel()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+    train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
+    set_seed(42)
+    generator.manual_seed(42)
+    for _ in range(3):
+        for batch in train_dl:
+            model.zero_grad()
+            output = model(batch["x"])
+            loss = torch.nn.functional.mse_loss(output, batch["y"])
+            accelerator.backward(loss)
+            optimizer.step()
+    model = accelerator.unwrap_model(model).cpu()
+    torch.testing.assert_close(
+        old_model.a,
+        model.a,
+        atol=ATOL,
+        rtol=RTOL,
+        msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+    )
+    torch.testing.assert_close(
+        old_model.b,
+        model.b,
+        atol=ATOL,
+        rtol=RTOL,
+        msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+    )
+    accelerator.print("Training yielded the same results on one CPU or distributed setup with batch split.")
+    # FP32 wrapper check
+    if is_cuda_available() or is_mps_available():
+        # Mostly a test that model.forward will have autocast when running unwrap_model(model, keep_fp32_wrapper=True)
+        print("Keep fp32 wrapper check.")
+        AcceleratorState._reset_state()
+        accelerator = Accelerator(mixed_precision="fp16")
+        model = torch.nn.Linear(2, 4)
+        model = accelerator.prepare(model)
+        model_with_fp32_wrapper = accelerator.unwrap_model(model, keep_fp32_wrapper=True)
+        # Run forward with fp16 as input.
+        # When the model is with mixed precision wrapper, no error will be raised.
+        input_tensor = torch.Tensor([1, 2]).to(dtype=torch.float16, device=accelerator.device)
+        output = model_with_fp32_wrapper(input_tensor)
+    # BF16 support
+    if is_bf16_available():
+        # Mostly a test that BF16 doesn't crash as the operation inside the model is not converted to BF16
+        print("BF16 training check.")
+        AcceleratorState._reset_state()
+        dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
+        accelerator = Accelerator(mixed_precision="bf16", dataloader_config=dataloader_config)
+        train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
+        model = RegressionModel()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+        train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
+        set_seed(42)
+        generator.manual_seed(42)
+        for _ in range(3):
+            for batch in train_dl:
+                model.zero_grad()
+                output = model(batch["x"])
+                loss = torch.nn.functional.mse_loss(output, batch["y"])
+                accelerator.backward(loss)
+                optimizer.step()
+        model = accelerator.unwrap_model(model).cpu()
+        torch.testing.assert_close(
+            old_model.a,
+            model.a,
+            atol=ATOL,
+            rtol=RTOL,
+            msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+        )
+        torch.testing.assert_close(
+            old_model.b,
+            model.b,
+            atol=ATOL,
+            rtol=RTOL,
+            msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+        )
+    # FP16 support (HPU fp16 model seems to be off by 10% from the CPU, which is a lot of numerical error)
+    if is_fp16_available() and not is_hpu_available():
+        # Mostly a test that FP16 doesn't crash as the operation inside the model is not converted to FP16
+        print("FP16 training check.")
+        AcceleratorState._reset_state()
+        dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
+        accelerator = Accelerator(mixed_precision="fp16", dataloader_config=dataloader_config)
+        train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
+        model = RegressionModel()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+        train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
+        set_seed(42)
+        generator.manual_seed(42)
+        for _ in range(3):
+            for batch in train_dl:
+                model.zero_grad()
+                output = model(batch["x"])
+                loss = torch.nn.functional.mse_loss(output, batch["y"])
+                accelerator.backward(loss)
+                optimizer.step()
+        model = accelerator.unwrap_model(model).cpu()
+        torch.testing.assert_close(
+            old_model.a,
+            model.a,
+            atol=ATOL,
+            rtol=RTOL,
+            msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+        )
+        torch.testing.assert_close(
+            old_model.b,
+            model.b,
+            atol=ATOL,
+            rtol=RTOL,
+            msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
+        )
+def test_split_between_processes_dataset(datasets_Dataset):
+    state = AcceleratorState()
+    data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes)])
+    with state.split_between_processes(data, apply_padding=False) as results:
+        assert len(results) == 2, (
+            f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        )
+    data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes - 1)])
+    with state.split_between_processes(data, apply_padding=False) as results:
+        if state.is_last_process:
+            assert len(results) == 1, (
+                f"Last process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+            )
+        else:
+            assert len(results) == 2, (
+                f"One of the intermediate processes did not receive two items. Process index: {state.process_index}; Length: {len(results)}"
+            )
+    state.wait_for_everyone()
+    odd_data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes - 1)])
+    even_data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes)])
+    for data in [odd_data, even_data]:
+        expected_output = data["k"]
+        with state.split_between_processes(data, apply_padding=True) as results:
+            if state.num_processes == 1:
+                assert len(results) == len(data), (
+                    f"Single process did not receive all items. Process index: {state.process_index}; Length: {len(results)}"
+                )
+            else:
+                assert len(results) == 2, (
+                    f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+                )
+            results_per_process = []
+            for result in results:
+                results_per_process.append(result)
+        state.wait_for_everyone()
+        gathered_results = gather_object(results_per_process)
+        output = [r["k"] for r in gathered_results[: len(data)]]
+        assert expected_output == output, f"Gathered results is incorrect. Expected: {expected_output}; Got: {output}"
+def test_split_between_processes_list():
+    state = AcceleratorState()
+    data = list(range(0, 2 * state.num_processes))
+    with state.split_between_processes(data) as results:
+        assert len(results) == 2, (
+            f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        )
+    state.wait_for_everyone()
+    even_data = list(range(0, (2 * state.num_processes)))
+    odd_data = list(range(0, (2 * state.num_processes) - 1))
+    for data in [odd_data, even_data]:
+        expected_output = data
+        with state.split_between_processes(data, apply_padding=True) as results:
+            num_samples_per_device = math.ceil(len(data) / state.num_processes)
+            # Test all processes gets the correct number of item(s)
+            assert len(results) == num_samples_per_device, (
+                f"Process {state.device} did not get the correct number of item(s). Process index: {state.process_index}; Length: {len(results)}"
+            )
+            results_per_process = []
+            for result in results:
+                results_per_process.append(result)
+        state.wait_for_everyone()
+        gathered_results = gather_object(results_per_process)
+        output = gathered_results[: len(data)]
+        assert expected_output == output, f"Gathered results is incorrect. Expected: {expected_output}; Got: {output}"
+def test_split_between_processes_nested_dict():
+    state = AcceleratorState()
+    a = [1, 2, 3, 4, 5, 6, 7, 8]
+    b = ["a", "b", "c", "d", "e", "f", "g", "h"]
+    c = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
+    if state.num_processes in (1, 2, 4):
+        data = {"a": a, "b": b, "c": c}
+        data_copy = deepcopy(data)
+        with state.split_between_processes(data) as results:
+            if state.process_index == 0:
+                assert results["a"] == data_copy["a"][: 8 // state.num_processes]
+            elif state.num_processes == 2:
+                assert results["a"] == data_copy["a"][4:]
+            elif state.process_index == 3:
+                # We return a list each time
+                assert results["a"] == data_copy["a"][-2:], f"Expected: {data_copy['a'][-2]}, Actual: {results['a']}"
+            if state.process_index == 0:
+                assert results["b"] == data_copy["b"][: 8 // state.num_processes]
+            elif state.num_processes == 2:
+                assert results["b"] == data_copy["b"][4:]
+            elif state.process_index == 3:
+                assert results["b"] == data_copy["b"][-2:]
+            if state.process_index == 0:
+                assert torch.allclose(results["c"], data_copy["c"][: 8 // state.num_processes]), (
+                    f"Did not obtain expected values on process 0, expected `{data['c'][: 8 // state.num_processes]}`, received: {results['c']}"
+                )
+            elif state.num_processes == 2:
+                assert torch.allclose(results["c"], data_copy["c"][4:]), (
+                    f"Did not obtain expected values on process 2, expected `{data['c'][4:]}`, received: {results['c']}"
+                )
+            elif state.process_index == 3:
+                assert torch.allclose(results["c"], data_copy["c"][-2:]), (
+                    f"Did not obtain expected values on process 4, expected `{data['c'][-2:]}`, received: {results['c']}"
+                )
+    state.wait_for_everyone()
+def test_split_between_processes_tensor():
+    state = AcceleratorState()
+    if state.num_processes > 1:
+        data = torch.tensor([[0, 1, 2, 3], [4, 5, 6, 7]]).to(state.device)
+        with state.split_between_processes(data) as results:
+            if state.process_index == 0:
+                expected = torch.tensor([[0, 1, 2, 3]]).to(state.device)
+            else:
+                expected = torch.tensor([[4, 5, 6, 7]]).to(state.device)
+            torch.testing.assert_close(results, expected)
+        state.wait_for_everyone()
+    even_data = torch.tensor([[i] for i in range(2 * state.num_processes)]).to(state.device)
+    odd_data = torch.tensor([[i] for i in range(2 * state.num_processes - 1)]).to(state.device)
+    for data in [even_data, odd_data]:
+        expected_output = [torch.tensor(i) for i in data.tolist()]
+        with state.split_between_processes(data, apply_padding=True) as results:
+            num_samples_per_device = math.ceil(len(data) / state.num_processes)
+            assert len(results) == num_samples_per_device, (
+                f"Process {state.device} did not get the correct number of item(s). Process index: {state.process_index}; Length: {len(results)}"
+            )
+            results_per_process = []
+            for result in results:
+                results_per_process.append(result.to("cpu"))
+        state.wait_for_everyone()
+        gathered_results = gather_object(results_per_process)
+        output = gathered_results[: len(data)]
+        assert expected_output == output, f"Gathered results is incorrect. Expected: {expected_output}; Got: {output}"
+def test_split_between_processes_evenly():
+    state = AcceleratorState()
+    if state.num_processes in (1, 2, 4, 8):
+        data = list(range(17))
+        num_samples_per_process = len(data) // state.num_processes
+        num_extras = len(data) % state.num_processes
+        with state.split_between_processes(data) as results:
+            if state.process_index < num_extras:
+                assert len(results) == num_samples_per_process + 1, (
+                    f"Each Process should have even elements. Expected: {num_samples_per_process + 1}, Actual: {len(results)}"
+                )
+            else:
+                assert len(results) == num_samples_per_process, (
+                    f"Each Process should have even elements. Expected: {num_samples_per_process}, Actual: {len(results)}"
+                )
+    state.wait_for_everyone()
+def test_trigger():
+    """Check the `set_trigger` / `check_trigger` round trip on an `Accelerator`.
+
+    The trigger starts unset, is set on the main process only, must then be reported
+    as set by `check_trigger()`, and must be unset again on the following call.
+    """
+    accelerator = Accelerator()
+    # should start with being false
+    assert accelerator.check_trigger() is False
+    # set a breakpoint on the main process
+    if accelerator.is_main_process:
+        accelerator.set_trigger()
+    # check it's been activated across all processes
+    # calls `all_reduce` and triggers a sync
+    assert accelerator.check_trigger() is True
+    # check it's been reset after the sync
+    assert accelerator.check_trigger() is False
+def test_reinstantiated_state():
+    import pytest
+    AcceleratorState._reset_state()
+    simple_model = torch.nn.Linear(1, 1)
+    # First define an accelerator
+    accelerator = Accelerator()
+    # Then call `reset_state`, breaking the state existing in the accelerator
+    AcceleratorState._reset_state()
+    # Now try and prepare a simple model, should raise the custom error early
+    with pytest.raises(AttributeError) as cm:
+        accelerator.prepare(simple_model)
+    assert "`AcceleratorState` object has no attribute" in str(cm.value.args[0])
+    assert "This happens if `AcceleratorState._reset_state()`" in str(cm.value.args[0])
+def main():
+    """
+    Run the checks of this script one after another. Section banners are printed by the process whose (local or
+    global, depending on the section) index is 0. On DeepSpeed the function returns right after the DataLoader
+    checks.
+    """
+    accelerator = Accelerator()
+    state = accelerator.state
+    if state.local_process_index == 0:
+        print("**Initialization**")
+    init_state_check()
+    state.wait_for_everyone()
+    # On multi-GPU the number of visible CUDA devices is used as the per-node process count
+    if state.distributed_type == DistributedType.MULTI_GPU:
+        num_processes_per_node = torch.cuda.device_count()
+    else:
+        num_processes_per_node = state.num_processes
+    # We only run this test on non-multinode
+    if num_processes_per_node == state.num_processes:
+        if state.process_index == 0:
+            print("\n**Test process execution**")
+        process_execution_check()
+        if state.process_index == 0:
+            print("\n**Test split between processes as a list**")
+        test_split_between_processes_list()
+        if state.process_index == 0:
+            print("\n**Test split between processes as a dict**")
+        test_split_between_processes_nested_dict()
+        if state.process_index == 0:
+            print("\n**Test split between processes as a tensor**")
+        test_split_between_processes_tensor()
+        if state.process_index == 0:
+            print("\n**Test split between processes evenly**")
+        test_split_between_processes_evenly()
+        if state.process_index == 0:
+            print("\n**Test split between processes as a datasets.Dataset**")
+        # `datasets` is imported lazily so the script still runs without it; note that the skip notice below is
+        # printed by every process, not only process 0
+        if is_datasets_available():
+            from datasets import Dataset as datasets_Dataset
+            test_split_between_processes_dataset(datasets_Dataset)
+        else:
+            print("Skipped because Hugging Face datasets is not available")
+    if state.local_process_index == 0:
+        print("\n**Test random number generator synchronization**")
+    rng_sync_check()
+    if state.local_process_index == 0:
+        print("\n**DataLoader integration test**")
+    dl_preparation_check()
+    # The remaining dataloader and sampler checks are not run on XLA
+    if state.distributed_type != DistributedType.XLA:
+        central_dl_preparation_check()
+        custom_sampler_check()
+        check_seedable_sampler()
+        check_seedable_sampler_with_data_seed()
+    # This check is only run with more than one process
+    if state.num_processes > 1:
+        check_seedable_sampler_in_batch_sampler_shard()
+    # Trainings are not exactly the same in DeepSpeed and CPU mode
+    # NOTE: this early return also skips the trigger test, the reinstantiated-state test and the final
+    # `state.destroy_process_group()` call
+    if state.distributed_type == DistributedType.DEEPSPEED:
+        return
+    if state.local_process_index == 0:
+        print("\n**Training integration test**")
+    training_check(use_seedable_sampler=False)
+    training_check(use_seedable_sampler=True)
+    if state.local_process_index == 0:
+        print("\n**Breakpoint trigger test**")
+    test_trigger()
+    # `test_reinstantiated_state` imports pytest itself, hence the availability guard
+    if is_pytest_available():
+        if state.local_process_index == 0:
+            print("\n**Test reinstantiated state**")
+        test_reinstantiated_state()
+    state.destroy_process_group()
+# Run all checks when this file is executed as a script
+if __name__ == "__main__":
+    main()

accelerate/test_utils/scripts/test_sync.py ADDED Viewed

	@@ -0,0 +1,413 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from copy import deepcopy
+import torch
+import torch.nn.functional as F
+from torch.optim import AdamW
+from torch.optim.lr_scheduler import LambdaLR
+from torch.utils.data import DataLoader
+from accelerate.accelerator import Accelerator, DataLoaderConfiguration, GradientAccumulationPlugin
+from accelerate.state import GradientState
+from accelerate.test_utils import RegressionDataset, RegressionModel
+from accelerate.utils import DistributedType, set_seed
+def check_model_parameters(model_a, model_b, did_step, iteration, **kwargs):
+    for param, grad_param in zip(model_a.parameters(), model_b.parameters()):
+        if not param.requires_grad:
+            continue
+        if not did_step:
+            # Grads should not be in sync
+            assert torch.allclose(param.grad, grad_param.grad, **kwargs) is False, (
+                f"Gradients in sync when they should not be at iteration {iteration}:\nmodel_a grad ({param.grad}) == model_b grad ({grad_param.grad})"
+            )
+        else:
+            # Grads should be in sync
+            assert torch.allclose(param.grad, grad_param.grad, **kwargs) is True, (
+                f"Gradients not in sync when they should be at iteration {iteration}:\nmodel_a grad ({param.grad}) != model_b grad ({grad_param.grad})"
+            )
+def step_model(model, input, target, accelerator, do_backward=True):
+    model.train()
+    output = model(input)
+    loss = F.mse_loss(output, target.to(output.device))
+    if not do_backward:
+        loss /= accelerator.gradient_accumulation_steps
+        loss.backward()
+    else:
+        accelerator.backward(loss)
+def get_training_setup(accelerator, sched=False):
+    "Returns everything needed to perform basic training"
+    set_seed(42)
+    model = RegressionModel()
+    ddp_model = deepcopy(model)
+    dset = RegressionDataset(length=80)
+    dataloader = DataLoader(dset, batch_size=16)
+    model.to(accelerator.device)
+    if sched:
+        opt = AdamW(params=model.parameters(), lr=1e-3)
+        ddp_opt = AdamW(params=ddp_model.parameters(), lr=1e-3)
+        sched = LambdaLR(opt, lr_lambda=lambda epoch: epoch**0.65)
+        ddp_sched = LambdaLR(ddp_opt, lr_lambda=lambda epoch: epoch**0.65)
+    # Make a copy of `model`
+    if sched:
+        ddp_model, ddp_opt, ddp_sched, dataloader = accelerator.prepare(ddp_model, ddp_opt, ddp_sched, dataloader)
+    else:
+        ddp_model, dataloader = accelerator.prepare(ddp_model, dataloader)
+    if sched:
+        return (model, opt, sched, dataloader, ddp_model, ddp_opt, ddp_sched)
+    return model, ddp_model, dataloader
+def test_noop_sync(accelerator):
+    """
+    Check that `accelerator.no_sync` changes nothing outside of distributed training: after every step the
+    gradients of the prepared model must match those of the plain reference model.
+    """
+    # Test when on a single CPU or GPU that the context manager does nothing
+    model, ddp_model, dataloader = get_training_setup(accelerator)
+    # Use a single batch
+    ddp_input, ddp_target = next(iter(dataloader)).values()
+    # No `zero_grad` is called in this loop, so both models keep accumulating gradients across iterations
+    for iteration in range(3):
+        # Gather the distributed inputs and targs for the base model
+        input, target = accelerator.gather((ddp_input, ddp_target))
+        input, target = input.to(accelerator.device), target.to(accelerator.device)
+        # Perform our initial ground truth step in non "DDP"
+        step_model(model, input, target, accelerator)
+        # Do "gradient accumulation" (noop)
+        if iteration % 2 == 0:
+            # Accumulate grads locally
+            with accelerator.no_sync(ddp_model):
+                step_model(ddp_model, ddp_input, ddp_target, accelerator)
+        else:
+            # Sync grads
+            step_model(ddp_model, ddp_input, ddp_target, accelerator)
+        # Since `no_sync` is a noop, `ddp_model` and `model` grads should always be in sync
+        check_model_parameters(model, ddp_model, True, iteration)
+        # The loop below repeats the same comparison with its own failure message
+        for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
+            if not param.requires_grad:
+                continue
+            assert torch.allclose(param.grad, ddp_param.grad), (
+                f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+            )
+        # Shuffle ddp_input on each iteration (only the inputs are permuted, `ddp_target` keeps its order)
+        torch.manual_seed(1337 + iteration)
+        ddp_input = ddp_input[torch.randperm(len(ddp_input))]
+def test_distributed_sync(accelerator):
+    """
+    Check that, in a distributed run, gradients computed under `accelerator.no_sync` differ from those of the
+    reference model fed with the gathered batch, and match them after a regular step.
+    """
+    # Test on distributed setup that context manager behaves properly
+    model, ddp_model, dataloader = get_training_setup(accelerator)
+    # Use a single batch
+    ddp_input, ddp_target = next(iter(dataloader)).values()
+    for iteration in range(3):
+        # Gather the distributed inputs and targs for the base model
+        input, target = accelerator.gather((ddp_input, ddp_target))
+        input, target = input.to(accelerator.device), target.to(accelerator.device)
+        # Perform our initial ground truth step in non "DDP"
+        step_model(model, input, target, accelerator)
+        # Even iterations run under `no_sync`, odd iterations do a regular step
+        if iteration % 2 == 0:
+            # Accumulate grads locally
+            with accelerator.no_sync(ddp_model):
+                step_model(ddp_model, ddp_input, ddp_target, accelerator)
+        else:
+            # Sync grads
+            step_model(ddp_model, ddp_input, ddp_target, accelerator)
+        # DDP model and model should only be in sync when not (iteration % 2 == 0)
+        for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
+            if not param.requires_grad:
+                continue
+            if iteration % 2 == 0:
+                # Grads should not be in sync
+                assert torch.allclose(param.grad, ddp_param.grad) is False, (
+                    f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+                )
+            else:
+                # Grads should be in sync
+                assert torch.allclose(param.grad, ddp_param.grad) is True, (
+                    f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+                )
+        # Shuffle ddp_input on each iteration (only the inputs are permuted, `ddp_target` keeps its order)
+        torch.manual_seed(1337 + iteration)
+        ddp_input = ddp_input[torch.randperm(len(ddp_input))]
+def test_distributed_sync_multiple_fwd(accelerator):
+    """
+    Run several forward passes under `accelerator.no_sync`, keep their losses, then backpropagate them one by
+    one. Gradients must differ from the reference model until the last backward, which is wrapped in
+    `accelerator.trigger_sync_in_backward` and must bring them back in sync.
+    """
+    # Test on distributed setup that context manager behaves properly when used with multiple forwards followed by multiple backwards
+    model, ddp_model, dataloader = get_training_setup(accelerator)
+    # Do multiple forwards
+    losses = []
+    num_iterations = 3
+    for iteration in range(num_iterations):
+        # A new iterator is created on every pass, so each batch is drawn from the start of the dataloader
+        ddp_input, ddp_target = next(iter(dataloader)).values()
+        # Gather the distributed inputs and targs for the base model
+        input, target = accelerator.gather((ddp_input, ddp_target))
+        input, target = input.to(accelerator.device), target.to(accelerator.device)
+        # Perform our initial ground truth step in non "DDP"
+        step_model(model, input, target, accelerator)
+        # Accumulate grads locally
+        with accelerator.no_sync(ddp_model):
+            ddp_output = ddp_model(ddp_input)
+            loss = F.mse_loss(ddp_output, ddp_target.to(ddp_output.device))
+            # Only the forward pass happens here; the loss is kept for the backward loop below
+            losses.append(loss)
+    # Do multiple backwards and sync only at the last backward
+    for iteration in range(num_iterations):
+        loss = losses[iteration]
+        if iteration < num_iterations - 1:
+            # Accumulate grads locally
+            accelerator.backward(loss)
+            # DDP model and model should only be in sync after last backward
+            for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
+                if not param.requires_grad:
+                    continue
+                # Grads should not be in sync
+                assert torch.allclose(param.grad, ddp_param.grad) is False, (
+                    f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+                )
+        else:
+            # Sync grads if last backward
+            with accelerator.trigger_sync_in_backward(ddp_model):
+                accelerator.backward(loss)
+            # DDP model and model should only be in sync after last backward
+            for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
+                if not param.requires_grad:
+                    continue
+                # Grads should be in sync
+                assert torch.allclose(param.grad, ddp_param.grad) is True, (
+                    f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+                )
+def test_gradient_accumulation(split_batches=False, dispatch_batches=False, sync_each_batch=False):
+    """
+    Check `accelerator.accumulate` with two accumulation steps: gradients of the prepared model must match the
+    reference model on every second batch, on the last batch, or on every batch when `sync_each_batch` is set,
+    and must differ otherwise. `GradientState` is reset once the loop is over.
+    """
+    gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=2, sync_each_batch=sync_each_batch)
+    dataloader_config = DataLoaderConfiguration(split_batches=split_batches, dispatch_batches=dispatch_batches)
+    accelerator = Accelerator(
+        dataloader_config=dataloader_config,
+        gradient_accumulation_plugin=gradient_accumulation_plugin,
+    )
+    # Test that context manager behaves properly
+    model, ddp_model, dataloader = get_training_setup(accelerator)
+    for iteration, batch in enumerate(dataloader):
+        ddp_input, ddp_target = batch.values()
+        # Gather the distributed inputs and targs for the base model
+        input, target = accelerator.gather((ddp_input, ddp_target))
+        input, target = input.to(accelerator.device), target.to(accelerator.device)
+        # Perform our initial ground truth step in non "DDP"
+        # (`do_backward=False`: the loss is divided by the accumulation steps and backpropagated directly)
+        step_model(model, input, target, accelerator, False)
+        # Step the prepared model inside the accumulation context
+        with accelerator.accumulate(ddp_model):
+            step_model(ddp_model, ddp_input, ddp_target, accelerator)
+        # DDP model and model should be in sync on every second batch, on the last batch, or always when
+        # `sync_each_batch` is set
+        for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
+            if not param.requires_grad:
+                continue
+            if ((iteration + 1) % 2 == 0) or (iteration == len(dataloader) - 1) or sync_each_batch:
+                # Grads should be in sync
+                assert torch.allclose(param.grad, ddp_param.grad) is True, (
+                    f"Gradients not in sync when they should be at iteration {iteration}:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+                )
+            else:
+                # Grads should not be in sync
+                assert torch.allclose(param.grad, ddp_param.grad) is False, (
+                    f"Gradients in sync when they should not be at iteration {iteration}:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+                )
+        # Shuffle ddp_input on each iteration
+        # NOTE: `ddp_input` is reassigned from `batch` at the top of the next iteration, so the permuted tensor
+        # is never fed to a model
+        torch.manual_seed(1337 + iteration)
+        ddp_input = ddp_input[torch.randperm(len(ddp_input))]
+    GradientState._reset_state()
+def test_gradient_accumulation_with_opt_and_scheduler(
+    split_batches=False, dispatch_batches=False, sync_each_batch=False
+):
+    """
+    Check `accelerator.accumulate` (two accumulation steps) together with a prepared optimizer and scheduler:
+    after every batch the learning rate of the prepared optimizer must equal the one of the reference optimizer,
+    and with more than one process the gradients are compared through `check_model_parameters`.
+    `GradientState` is reset once the loop is over.
+    """
+    gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=2, sync_each_batch=sync_each_batch)
+    dataloader_config = DataLoaderConfiguration(split_batches=split_batches, dispatch_batches=dispatch_batches)
+    accelerator = Accelerator(
+        dataloader_config=dataloader_config,
+        gradient_accumulation_plugin=gradient_accumulation_plugin,
+    )
+    # Test that context manager behaves properly
+    model, opt, sched, dataloader, ddp_model, ddp_opt, ddp_sched = get_training_setup(accelerator, True)
+    for iteration, batch in enumerate(dataloader):
+        ddp_input, ddp_target = batch.values()
+        # Gather the distributed inputs and targs for the base model
+        input, target = accelerator.gather((ddp_input, ddp_target))
+        input, target = input.to(accelerator.device), target.to(accelerator.device)
+        # Perform our initial ground truth step in non "DDP"
+        model.train()
+        ddp_model.train()
+        step_model(model, input, target, accelerator, False)
+        opt.step()
+        # The reference scheduler only advances on every second batch and on the last batch
+        if ((iteration + 1) % 2 == 0) or ((iteration + 1) == len(dataloader)):
+            if split_batches:
+                sched.step()
+            else:
+                # Without `split_batches` the reference scheduler is stepped once per process
+                # (presumably to mirror how the prepared scheduler advances; confirm against the scheduler wrapper)
+                for _ in range(accelerator.num_processes):
+                    sched.step()
+        # Perform gradient accumulation under wrapper
+        with accelerator.accumulate(ddp_model):
+            step_model(ddp_model, ddp_input, ddp_target, accelerator)
+            ddp_opt.step()
+            ddp_sched.step()
+        # Learning rates should be the same
+        assert opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"], (
+            f"Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]['lr']}\nDDP opt: {ddp_opt.param_groups[0]['lr']}\n"
+        )
+        did_step = (((iteration + 1) % 2) == 0) or ((iteration + 1) == len(dataloader))
+        if accelerator.num_processes > 1:
+            check_model_parameters(
+                model,
+                ddp_model,
+                did_step or sync_each_batch,  # syncs at each grad_accum interval or if sync_each_batch==True
+                iteration,
+                rtol=1e-3,  # needs a relative tolerance due to roundoff errors
+            )
+        if did_step:
+            opt.zero_grad()  # flush gradients every accum step
+        ddp_opt.zero_grad()
+        # Re-seed torch on each iteration (unlike the sibling tests, `ddp_input` is not shuffled here)
+        torch.manual_seed(1337 + iteration)
+    GradientState._reset_state()
+def test_dataloader_break():
+    accelerator = Accelerator()
+    first_dset = RegressionDataset(length=80)
+    first_dataloader = DataLoader(first_dset, batch_size=16)
+    second_dset = RegressionDataset(length=96)
+    second_dataloader = DataLoader(second_dset, batch_size=16)
+    first_dataloader, second_dataloader = accelerator.prepare(first_dataloader, second_dataloader)
+    assert accelerator.gradient_state.active_dataloader is None
+    for iteration, _ in enumerate(first_dataloader):
+        assert id(accelerator.gradient_state.active_dataloader) == id(first_dataloader)
+        if iteration < len(first_dataloader) - 1:
+            assert not accelerator.gradient_state.end_of_dataloader
+            if iteration == 1:
+                for batch_num, _ in enumerate(second_dataloader):
+                    assert id(accelerator.gradient_state.active_dataloader) == id(second_dataloader)
+                    if batch_num < len(second_dataloader) - 1:
+                        assert not accelerator.gradient_state.end_of_dataloader
+                    else:
+                        assert accelerator.gradient_state.end_of_dataloader
+        else:
+            assert accelerator.gradient_state.end_of_dataloader
+    assert accelerator.gradient_state.active_dataloader is None
+def main():
+    """
+    Run the synchronization tests that apply to the current distributed type, printing a banner for each one
+    from the process whose local index is 0, then destroy the process group.
+    """
+    accelerator = Accelerator()
+    state = accelerator.state
+    if state.local_process_index == 0:
+        print("**Test `accumulate` gradient accumulation with dataloader break**")
+    # The dataloader-break test is not run on XLA (the banner above is printed regardless)
+    if state.distributed_type != DistributedType.XLA:
+        test_dataloader_break()
+    if state.distributed_type == DistributedType.NO:
+        if state.local_process_index == 0:
+            print("**Test NOOP `no_sync` context manager**")
+        test_noop_sync(accelerator)
+    # `no_sync` tests for the multi-device distributed types, MULTI_CPU included
+    if state.distributed_type in (
+        DistributedType.MULTI_GPU,
+        DistributedType.MULTI_NPU,
+        DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
+        DistributedType.MULTI_MUSA,
+        DistributedType.MULTI_CPU,
+        DistributedType.MULTI_HPU,
+        DistributedType.MULTI_NEURON,
+    ):
+        if state.local_process_index == 0:
+            print("**Test Distributed `no_sync` context manager**")
+        test_distributed_sync(accelerator)
+        if state.local_process_index == 0:
+            print("**Test Distributed `no_sync` context manager with multiple forwards**")
+        test_distributed_sync_multiple_fwd(accelerator)
+    # Gradient accumulation over every combination of the three flags (same list as above, without MULTI_CPU)
+    if state.distributed_type in (
+        DistributedType.MULTI_GPU,
+        DistributedType.MULTI_NPU,
+        DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
+        DistributedType.MULTI_MUSA,
+        DistributedType.MULTI_HPU,
+        DistributedType.MULTI_NEURON,
+    ):
+        for split_batch in [True, False]:
+            for dispatch_batches in [True, False]:
+                for sync_each_batch in [True, False]:
+                    if state.local_process_index == 0:
+                        print(
+                            "**Test `accumulate` gradient accumulation, ",
+                            f"`split_batches={split_batch}` and `dispatch_batches={dispatch_batches}` and `sync_each_batch={sync_each_batch}`**",
+                        )
+                    test_gradient_accumulation(split_batch, dispatch_batches, sync_each_batch)
+    # Currently will break on torch 2.0 +, need to investigate why
+    if state.local_process_index == 0:
+        print(
+            "**Test `accumulate` gradient accumulation with optimizer and scheduler, ",
+            "`split_batches=False`, `dispatch_batches=False`, `sync_each_batch=False`**",
+        )
+    # The all-defaults variant runs for every distributed type
+    test_gradient_accumulation_with_opt_and_scheduler()
+    if state.distributed_type in (
+        DistributedType.MULTI_GPU,
+        DistributedType.MULTI_NPU,
+        DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
+        DistributedType.MULTI_MUSA,
+        DistributedType.MULTI_HPU,
+        DistributedType.MULTI_NEURON,
+    ):
+        for split_batch in [True, False]:
+            for dispatch_batches in [True, False]:
+                for sync_each_batch in [True, False]:
+                    # The all-False combination was already covered by the call above
+                    if not split_batch and not dispatch_batches and not sync_each_batch:
+                        continue
+                    if state.local_process_index == 0:
+                        print(
+                            "**Test `accumulate` gradient accumulation with optimizer and scheduler, ",
+                            f"`split_batches={split_batch}` and `dispatch_batches={dispatch_batches}` and `sync_each_batch={sync_each_batch}`**",
+                        )
+                    test_gradient_accumulation_with_opt_and_scheduler(split_batch, dispatch_batches, sync_each_batch)
+    state.destroy_process_group()
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    # `index` is accepted to match the spawner's calling convention but is not used
+    main()
+# Run all tests when this file is executed as a script
+if __name__ == "__main__":
+    main()

accelerate/test_utils/testing.py ADDED Viewed

	@@ -0,0 +1,889 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import inspect
+import io
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import unittest
+from contextlib import contextmanager
+from functools import partial
+from pathlib import Path
+from typing import Optional, Union
+from unittest import mock
+import torch
+import accelerate
+from ..state import AcceleratorState
+from ..utils import (
+    check_cuda_fp8_capability,
+    compare_versions,
+    gather,
+    is_aim_available,
+    is_bnb_available,
+    is_clearml_available,
+    is_comet_ml_available,
+    is_cuda_available,
+    is_datasets_available,
+    is_deepspeed_available,
+    is_dvclive_available,
+    is_fp8_available,
+    is_fp16_available,
+    is_habana_gaudi1,
+    is_hpu_available,
+    is_import_timer_available,
+    is_matplotlib_available,
+    is_mlflow_available,
+    is_mlu_available,
+    is_mps_available,
+    is_musa_available,
+    is_neuron_available,
+    is_npu_available,
+    is_pandas_available,
+    is_pippy_available,
+    is_pytest_available,
+    is_schedulefree_available,
+    is_sdaa_available,
+    is_swanlab_available,
+    is_tensorboard_available,
+    is_timm_available,
+    is_torch_version,
+    is_torch_xla_available,
+    is_torchao_available,
+    is_torchdata_stateful_dataloader_available,
+    is_torchvision_available,
+    is_trackio_available,
+    is_transformer_engine_available,
+    is_transformer_engine_mxfp8_available,
+    is_transformers_available,
+    is_triton_available,
+    is_wandb_available,
+    is_xpu_available,
+    str_to_bool,
+)
+def get_backend():
+    if is_torch_xla_available():
+        return "xla", torch.cuda.device_count(), torch.cuda.memory_allocated
+    elif is_cuda_available():
+        return "cuda", torch.cuda.device_count(), torch.cuda.memory_allocated
+    elif is_mps_available(min_version="2.0"):
+        return "mps", 1, torch.mps.current_allocated_memory
+    elif is_mps_available():
+        return "mps", 1, lambda: 0
+    elif is_mlu_available():
+        return "mlu", torch.mlu.device_count(), torch.mlu.memory_allocated
+    elif is_sdaa_available():
+        return "sdaa", torch.sdaa.device_count(), torch.sdaa.memory_allocated
+    elif is_musa_available():
+        return "musa", torch.musa.device_count(), torch.musa.memory_allocated
+    elif is_npu_available():
+        return "npu", torch.npu.device_count(), torch.npu.memory_allocated
+    elif is_xpu_available():
+        return "xpu", torch.xpu.device_count(), torch.xpu.memory_allocated
+    elif is_hpu_available():
+        return "hpu", torch.hpu.device_count(), torch.hpu.memory_allocated
+    elif is_neuron_available():
+        return "neuron", torch.neuron.device_count(), torch.neuron.memory_allocated
+    else:
+        return "cpu", 1, lambda: 0
+# Resolved once at import time: backend name, device count and allocated-memory callable from `get_backend()`
+torch_device, device_count, memory_allocated_func = get_backend()
+def get_launch_command(**kwargs) -> list:
+    """
+    Wraps around `kwargs` to help simplify launching from `subprocess`.
+    Example:
+    ```python
+    # returns ['accelerate', 'launch', '--num_processes=2', '--device_count=2']
+    get_launch_command(num_processes=2, device_count=2)
+    ```
+    """
+    command = ["accelerate", "launch"]
+    for k, v in kwargs.items():
+        if isinstance(v, bool) and v:
+            command.append(f"--{k}")
+        elif v is not None:
+            command.append(f"--{k}={v}")
+    return command
+# Default launch command: one process per detected device, with `monitor_interval=0.1`
+DEFAULT_LAUNCH_COMMAND = get_launch_command(num_processes=device_count, monitor_interval=0.1)
+def parse_flag_from_env(key, default=False):
+    try:
+        value = os.environ[key]
+    except KeyError:
+        # KEY isn't set, default to `default`.
+        _value = default
+    else:
+        # KEY is set, convert it to True or False.
+        try:
+            _value = str_to_bool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError(f"If set, {key} must be yes or no.")
+    return _value
+# Read once at import time from the `RUN_SLOW` environment variable; slow tests stay off when it is unset
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+def skip(test_case):
+    "Decorator that skips a test unconditionally"
+    return unittest.skip("Test was skipped")(test_case)
+def slow(test_case):
+    """
+    Decorator marking a test as slow. Slow tests are skipped by default. Set the RUN_SLOW environment variable to a
+    truthy value to run them.
+    """
+    return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
+def require_cpu(test_case):
+    """
+    Decorator marking a test that must be only ran on the CPU. These tests are skipped when a GPU is available.
+    """
+    return unittest.skipUnless(torch_device == "cpu", "test requires only a CPU")(test_case)
+def require_non_cpu(test_case):
+    """
+    Decorator marking a test that requires a hardware accelerator backend. These tests are skipped when there are no
+    hardware accelerator available.
+    """
+    return unittest.skipUnless(torch_device != "cpu", "test requires a GPU")(test_case)
+def require_cuda(test_case):
+    """
+    Decorator marking a test that requires CUDA. These tests are skipped when there are no GPU available or when
+    TorchXLA is available.
+    """
+    return unittest.skipUnless(is_cuda_available() and not is_torch_xla_available(), "test requires a GPU")(test_case)
+def require_cuda_or_hpu(test_case):
+    """
+    Decorator marking a test that requires CUDA or HPU. These tests are skipped when there are no GPU available or when
+    TorchXLA is available.
+    """
+    return unittest.skipUnless(
+        (is_cuda_available() and not is_torch_xla_available()) or is_hpu_available(), "test requires a GPU or HPU"
+    )(test_case)
+def require_xpu(test_case):
+    """
+    Decorator marking a test that requires XPU. These tests are skipped when there are no XPU available.
+    """
+    return unittest.skipUnless(is_xpu_available(), "test requires a XPU")(test_case)
+def require_cuda_or_xpu(test_case):
+    """
+    Decorator marking a test that requires CUDA or XPU. These tests are skipped when there are no GPU available or when
+    TorchXLA is available.
+    """
+    cuda_condition = is_cuda_available() and not is_torch_xla_available()
+    xpu_condition = is_xpu_available()
+    return unittest.skipUnless(cuda_condition or xpu_condition, "test requires a CUDA GPU or XPU")(test_case)
+def require_non_xpu(test_case):
+    """
+    Decorator marking a test that should be skipped for XPU.
+    """
+    return unittest.skipUnless(torch_device != "xpu", "test requires a non-XPU")(test_case)
+def require_non_hpu(test_case):
+    """
+    Decorator marking a test that should be skipped for HPU.
+    """
+    return unittest.skipUnless(torch_device != "hpu", "test requires a non-HPU")(test_case)
+def require_fp16(test_case):
+    """
+    Decorator marking a test that requires FP16. These tests are skipped when FP16 is not supported.
+    """
+    return unittest.skipUnless(is_fp16_available(), "test requires FP16 support")(test_case)
+def require_fp8(test_case):
+    """
+    Decorator marking a test that requires FP8. These tests are skipped when FP8 is not supported.
+    """
+    # is_fp8_available only checks for libraries
+    # ideally it should check for device capability as well
+    fp8_is_available = is_fp8_available()
+    if torch.cuda.is_available() and not check_cuda_fp8_capability():
+        fp8_is_available = False
+    if is_hpu_available() and is_habana_gaudi1():
+        fp8_is_available = False
+    return unittest.skipUnless(fp8_is_available, "test requires FP8 support")(test_case)
+def require_fsdp2(test_case):
+    return unittest.skipUnless(is_torch_version(">=", "2.5.0"), "test requires FSDP2 (torch >= 2.5.0)")(test_case)
+def require_mlu(test_case):
+    """
+    Decorator marking a test that requires MLU. These tests are skipped when there are no MLU available.
+    """
+    return unittest.skipUnless(is_mlu_available(), "test require a MLU")(test_case)
+def require_sdaa(test_case):
+    """
+    Decorator marking a test that requires SDAA. These tests are skipped when there are no SDAA available.
+    """
+    return unittest.skipUnless(is_sdaa_available(), "test require a SDAA")(test_case)
+def require_musa(test_case):
+    """
+    Decorator marking a test that requires MUSA. These tests are skipped when there are no MUSA available.
+    """
+    return unittest.skipUnless(is_musa_available(), "test require a MUSA")(test_case)
+def require_npu(test_case):
+    """
+    Decorator marking a test that requires NPU. These tests are skipped when there are no NPU available.
+    """
+    return unittest.skipUnless(is_npu_available(), "test require a NPU")(test_case)
+def require_neuron(test_case):
+    """
+    Decorator marking a test that requires Neuron. These tests are skipped when there are no Neuron Cores available.
+    """
+    return unittest.skipUnless(is_neuron_available(), "test require Neuron Cores")(test_case)
+def require_mps(test_case):
+    """
+    Decorator marking a test that requires MPS backend. These tests are skipped when torch doesn't support `mps`
+    backend.
+    """
+    return unittest.skipUnless(is_mps_available(), "test requires a `mps` backend support in `torch`")(test_case)
+def require_huggingface_suite(test_case):
+    """
+    Decorator marking a test that requires transformers and datasets. These tests are skipped when they are not.
+    """
+    return unittest.skipUnless(
+        is_transformers_available() and is_datasets_available(),
+        "test requires the Hugging Face suite",
+    )(test_case)
+def require_transformers(test_case):
+    """
+    Decorator marking a test that requires transformers. These tests are skipped when they are not.
+    """
+    return unittest.skipUnless(is_transformers_available(), "test requires the transformers library")(test_case)
+def require_timm(test_case):
+    """
+    Decorator marking a test that requires timm. These tests are skipped when they are not.
+    """
+    return unittest.skipUnless(is_timm_available(), "test requires the timm library")(test_case)
+def require_torchvision(test_case):
+    """
+    Decorator marking a test that requires torchvision. These tests are skipped when they are not.
+    """
+    return unittest.skipUnless(is_torchvision_available(), "test requires the torchvision library")(test_case)
+def require_triton(test_case):
+    """
+    Decorator marking a test that requires triton. These tests are skipped when they are not.
+    """
+    return unittest.skipUnless(is_triton_available(), "test requires the triton library")(test_case)
+def require_schedulefree(test_case):
+    """
+    Decorator marking a test that requires schedulefree. These tests are skipped when they are not.
+    """
+    return unittest.skipUnless(is_schedulefree_available(), "test requires the schedulefree library")(test_case)
+def require_bnb(test_case):
+    """
+    Decorator marking a test that requires bitsandbytes. These tests are skipped when they are not.
+    """
+    return unittest.skipUnless(is_bnb_available(), "test requires the bitsandbytes library")(test_case)
+def require_tpu(test_case):
+    """
+    Decorator marking a test that requires TPUs. These tests are skipped when there are no TPUs available.
+    """
+    return unittest.skipUnless(is_torch_xla_available(check_is_tpu=True), "test requires TPU")(test_case)
+def require_non_torch_xla(test_case):
+    """
+    Decorator marking a test as requiring an environment without TorchXLA. These tests are skipped when TorchXLA is
+    available.
+    """
+    return unittest.skipUnless(not is_torch_xla_available(), "test requires an env without TorchXLA")(test_case)
+def require_single_device(test_case):
+    """
+    Decorator marking a test that requires a single device. These tests are skipped when there is no hardware
+    accelerator available or number of devices is more than one.
+    """
+    return unittest.skipUnless(
+        torch_device != "cpu" and device_count == 1, "test requires a single device accelerator"
+    )(test_case)
+def require_single_gpu(test_case):
+    """
+    Decorator marking a test that requires CUDA on a single GPU. These tests are skipped when there are no GPU
+    available or number of GPUs is more than one.
+    """
+    return unittest.skipUnless(torch.cuda.device_count() == 1, "test requires a GPU")(test_case)
+def require_single_xpu(test_case):
+    """
+    Decorator marking a test that requires CUDA on a single XPU. These tests are skipped when there are no XPU
+    available or number of xPUs is more than one.
+    """
+    return unittest.skipUnless(torch.xpu.device_count() == 1, "test requires a XPU")(test_case)
+def require_multi_device(test_case):
+    """
+    Decorator marking a test that requires a multi-device setup. These tests are skipped on a machine without multiple
+    devices.
+    """
+    return unittest.skipUnless(device_count > 1, "test requires multiple hardware accelerators")(test_case)
+def require_multi_gpu(test_case):
+    """
+    Decorator marking a test that requires a multi-GPU setup. These tests are skipped on a machine without multiple
+    GPUs.
+    """
+    return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)
+def require_multi_xpu(test_case):
+    """
+    Decorator marking a test that requires a multi-XPU setup. These tests are skipped on a machine without multiple
+    XPUs.
+    """
+    return unittest.skipUnless(torch.xpu.device_count() > 1, "test requires multiple XPUs")(test_case)
+def require_multi_gpu_or_xpu(test_case):
+    """
+    Decorator marking a test that requires a multi-GPU setup. These tests are skipped on a machine without multiple
+    GPUs or XPUs.
+    """
+    return unittest.skipUnless(
+        (is_cuda_available() or is_xpu_available()) and device_count > 1, "test requires multiple GPUs or XPUs"
+    )(test_case)
+def require_deepspeed(test_case):
+    """
+    Decorator marking a test that requires DeepSpeed installed. These tests are skipped when DeepSpeed isn't installed
+    """
+    return unittest.skipUnless(is_deepspeed_available(), "test requires DeepSpeed")(test_case)
+def require_tp(test_case):
+    """
+    Decorator marking a test that requires TP installed. These tests are skipped when TP isn't installed
+    """
+    return unittest.skipUnless(
+        is_torch_version(">=", "2.3.0") and compare_versions("transformers", ">=", "4.52.0"),
+        "test requires torch version >= 2.3.0 and transformers version >= 4.52.0",
+    )(test_case)
+def require_torch_min_version(test_case=None, version=None):
+    """
+    Decorator marking that a test requires a particular torch version to be tested. These tests are skipped when an
+    installed torch version is less than the required one.
+    """
+    if test_case is None:
+        return partial(require_torch_min_version, version=version)
+    return unittest.skipUnless(is_torch_version(">=", version), f"test requires torch version >= {version}")(test_case)
+def require_tensorboard(test_case):
+    """
+    Decorator marking a test that requires tensorboard installed. These tests are skipped when tensorboard isn't
+    installed
+    """
+    return unittest.skipUnless(is_tensorboard_available(), "test requires Tensorboard")(test_case)
+def require_wandb(test_case):
+    """
+    Decorator marking a test that requires wandb installed. These tests are skipped when wandb isn't installed
+    """
+    return unittest.skipUnless(is_wandb_available(), "test requires wandb")(test_case)
+def require_trackio(test_case):
+    """
+    Decorator marking a test that requires trackio installed. These tests are skipped when trackio isn't installed
+    """
+    return unittest.skipUnless(is_trackio_available(), "test requires trackio")(test_case)
+def require_comet_ml(test_case):
+    """
+    Decorator marking a test that requires comet_ml installed. These tests are skipped when comet_ml isn't installed
+    """
+    return unittest.skipUnless(is_comet_ml_available(), "test requires comet_ml")(test_case)
+def require_aim(test_case):
+    """
+    Decorator marking a test that requires aim installed. These tests are skipped when aim isn't installed
+    """
+    return unittest.skipUnless(is_aim_available(), "test requires aim")(test_case)
+def require_clearml(test_case):
+    """
+    Decorator marking a test that requires clearml installed. These tests are skipped when clearml isn't installed
+    """
+    return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)
+def require_dvclive(test_case):
+    """
+    Decorator marking a test that requires dvclive installed. These tests are skipped when dvclive isn't installed
+    """
+    return unittest.skipUnless(is_dvclive_available(), "test requires dvclive")(test_case)
+def require_swanlab(test_case):
+    """
+    Decorator marking a test that requires swanlab installed. These tests are skipped when swanlab isn't installed
+    """
+    return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)
+def require_pandas(test_case):
+    """
+    Decorator marking a test that requires pandas installed. These tests are skipped when pandas isn't installed
+    """
+    return unittest.skipUnless(is_pandas_available(), "test requires pandas")(test_case)
+def require_mlflow(test_case):
+    """
+    Decorator marking a test that requires mlflow installed. These tests are skipped when mlflow isn't installed
+    """
+    return unittest.skipUnless(is_mlflow_available(), "test requires mlflow")(test_case)
+def require_pippy(test_case):
+    """
+    Decorator marking a test that requires pippy installed. These tests are skipped when pippy isn't installed It is
+    also checked if the test is running on a Gaudi1 device which doesn't support pippy.
+    """
+    return unittest.skipUnless(is_pippy_available() and not is_habana_gaudi1(), "test requires pippy")(test_case)
+def require_import_timer(test_case):
+    """
+    Decorator marking a test that requires tuna interpreter installed. These tests are skipped when tuna isn't
+    installed
+    """
+    return unittest.skipUnless(is_import_timer_available(), "test requires tuna interpreter")(test_case)
+def require_transformer_engine(test_case):
+    """
+    Decorator marking a test that requires transformers engine installed. These tests are skipped when transformers
+    engine isn't installed
+    """
+    return unittest.skipUnless(is_transformer_engine_available(), "test requires transformers engine")(test_case)
+def require_transformer_engine_mxfp8(test_case):
+    """
+    Decorator marking a test that requires transformers engine MXFP8 block scaling available. These tests are skipped
+    when transformers engine MXFP8 block scaling isn't available
+    """
+    return unittest.skipUnless(
+        is_transformer_engine_mxfp8_available(), "test requires transformers engine MXFP8 block scaling"
+    )(test_case)
+def require_torchao(test_case):
+    """
+    Decorator marking a test that requires torchao installed. These tests are skipped when torchao isn't installed
+    """
+    return unittest.skipUnless(is_torchao_available(), "test requires torchao")(test_case)
+def require_matplotlib(test_case):
+    """
+    Decorator marking a test that requires matplotlib installed. These tests are skipped when matplotlib isn't
+    installed
+    """
+    return unittest.skipUnless(is_matplotlib_available(), "test requires matplotlib")(test_case)
+_atleast_one_tracker_available = (
+    any([is_wandb_available(), is_tensorboard_available(), is_trackio_available(), is_swanlab_available()])
+    and not is_comet_ml_available()
+)
+def require_trackers(test_case):
+    """
+    Decorator marking that a test requires at least one tracking library installed. These tests are skipped when none
+    are installed
+    """
+    return unittest.skipUnless(
+        _atleast_one_tracker_available,
+        "test requires at least one tracker to be available and for `comet_ml` to not be installed",
+    )(test_case)
+def require_torchdata_stateful_dataloader(test_case):
+    """
+    Decorator marking a test that requires torchdata.stateful_dataloader.
+    These tests are skipped when torchdata with stateful_dataloader module isn't installed.
+    """
+    return unittest.skipUnless(
+        is_torchdata_stateful_dataloader_available(), "test requires torchdata.stateful_dataloader"
+    )(test_case)
+def run_first(test_case):
+    """
+    Decorator marking a test with order(1). When pytest-order plugin is installed, tests marked with this decorator are
+    guaranteed to run first.
+    This is especially useful in some test settings like on a Gaudi instance where a Gaudi device can only be used by a
+    single process at a time. So we make sure all tests that run in a subprocess are launched first, to avoid device
+    allocation conflicts.
+    If pytest is not installed, test will be returned as is.
+    """
+    if is_pytest_available():
+        import pytest
+        return pytest.mark.order(1)(test_case)
+    return test_case
+class TempDirTestCase(unittest.TestCase):
+    """
+    A TestCase class that keeps a single `tempfile.TemporaryDirectory` open for the duration of the class, wipes its
+    data at the start of a test, and then destroys it at the end of the TestCase.
+    Useful for when a class or API requires a single constant folder throughout it's use, such as Weights and Biases
+    The temporary directory location will be stored in `self.tmpdir`
+    """
+    clear_on_setup = True
+    @classmethod
+    def setUpClass(cls):
+        "Creates a `tempfile.TemporaryDirectory` and stores it in `cls.tmpdir`"
+        cls.tmpdir = Path(tempfile.mkdtemp())
+    @classmethod
+    def tearDownClass(cls):
+        "Remove `cls.tmpdir` after test suite has finished"
+        if os.path.exists(cls.tmpdir):
+            shutil.rmtree(cls.tmpdir)
+    def setUp(self):
+        "Destroy all contents in `self.tmpdir`, but not `self.tmpdir`"
+        if self.clear_on_setup:
+            for path in self.tmpdir.glob("**/*"):
+                if path.is_file():
+                    path.unlink()
+                elif path.is_dir():
+                    shutil.rmtree(path)
+class AccelerateTestCase(unittest.TestCase):
+    """
+    A TestCase class that will reset the accelerator state at the end of every test. Every test that checks or utilizes
+    the `AcceleratorState` class should inherit from this to avoid silent failures due to state being shared between
+    tests.
+    """
+    def tearDown(self):
+        super().tearDown()
+        # Reset the state of the AcceleratorState singleton.
+        AcceleratorState._reset_state(True)
+class MockingTestCase(unittest.TestCase):
+    """
+    A TestCase class designed to dynamically add various mockers that should be used in every test, mimicking the
+    behavior of a class-wide mock when defining one normally will not do.
+    Useful when a mock requires specific information available only initialized after `TestCase.setUpClass`, such as
+    setting an environment variable with that information.
+    The `add_mocks` function should be ran at the end of a `TestCase`'s `setUp` function, after a call to
+    `super().setUp()` such as:
+    ```python
+    def setUp(self):
+        super().setUp()
+        mocks = mock.patch.dict(os.environ, {"SOME_ENV_VAR", "SOME_VALUE"})
+        self.add_mocks(mocks)
+    ```
+    """
+    def add_mocks(self, mocks: Union[mock.Mock, list[mock.Mock]]):
+        """
+        Add custom mocks for tests that should be repeated on each test. Should be called during
+        `MockingTestCase.setUp`, after `super().setUp()`.
+        Args:
+            mocks (`mock.Mock` or list of `mock.Mock`):
+                Mocks that should be added to the `TestCase` after `TestCase.setUpClass` has been run
+        """
+        self.mocks = mocks if isinstance(mocks, (tuple, list)) else [mocks]
+        for m in self.mocks:
+            m.start()
+            self.addCleanup(m.stop)
+def are_the_same_tensors(tensor):
+    state = AcceleratorState()
+    tensor = tensor[None].clone().to(state.device)
+    tensors = gather(tensor).cpu()
+    tensor = tensor[0].cpu()
+    for i in range(tensors.shape[0]):
+        if not torch.equal(tensors[i], tensor):
+            return False
+    return True
+class _RunOutput:
+    def __init__(self, returncode, stdout, stderr):
+        self.returncode = returncode
+        self.stdout = stdout
+        self.stderr = stderr
+async def _read_stream(stream, callback):
+    while True:
+        line = await stream.readline()
+        if line:
+            callback(line)
+        else:
+            break
+async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:
+    if echo:
+        print("\nRunning: ", " ".join(cmd))
+    p = await asyncio.create_subprocess_exec(
+        cmd[0],
+        *cmd[1:],
+        stdin=stdin,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+        env=env,
+    )
+    # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe
+    # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait
+    #
+    # If it starts hanging, will need to switch to the following code. The problem is that no data
+    # will be seen until it's done and if it hangs for example there will be no debug info.
+    # out, err = await p.communicate()
+    # return _RunOutput(p.returncode, out, err)
+    out = []
+    err = []
+    def tee(line, sink, pipe, label=""):
+        line = line.decode("utf-8").rstrip()
+        sink.append(line)
+        if not quiet:
+            print(label, line, file=pipe)
+    # XXX: the timeout doesn't seem to make any difference here
+    await asyncio.wait(
+        [
+            asyncio.create_task(_read_stream(p.stdout, lambda l: tee(l, out, sys.stdout, label="stdout:"))),
+            asyncio.create_task(_read_stream(p.stderr, lambda l: tee(l, err, sys.stderr, label="stderr:"))),
+        ],
+        timeout=timeout,
+    )
+    return _RunOutput(await p.wait(), out, err)
+def execute_subprocess_async(cmd: list, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
+    # Cast every path in `cmd` to a string
+    for i, c in enumerate(cmd):
+        if isinstance(c, Path):
+            cmd[i] = str(c)
+    result = asyncio.run(_stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo))
+    cmd_str = " ".join(cmd)
+    if result.returncode > 0:
+        stderr = "\n".join(result.stderr)
+        raise RuntimeError(
+            f"'{cmd_str}' failed with returncode {result.returncode}\n\n"
+            f"The combined stderr from workers follows:\n{stderr}"
+        )
+    return result
+def pytest_xdist_worker_id():
+    """
+    Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime, or 0
+    if `-n 1` or `pytest-xdist` isn't being used.
+    """
+    worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
+    worker = re.sub(r"^gw", "", worker, 0, re.M)
+    return int(worker)
+def get_torch_dist_unique_port():
+    """
+    Returns a port number that can be fed to `torch.distributed.launch`'s `--master_port` argument.
+    Under `pytest-xdist` it adds a delta number based on a worker id so that concurrent tests don't try to use the same
+    port at once.
+    """
+    port = 29500
+    uniq_delta = pytest_xdist_worker_id()
+    return port + uniq_delta
+class SubprocessCallException(Exception):
+    pass
+def run_command(command: list[str], return_stdout=False, env=None):
+    """
+    Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
+    if an error occurred while running `command`
+    """
+    # Cast every path in `command` to a string
+    for i, c in enumerate(command):
+        if isinstance(c, Path):
+            command[i] = str(c)
+    if env is None:
+        env = os.environ.copy()
+    try:
+        output = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env)
+        if return_stdout:
+            if hasattr(output, "decode"):
+                output = output.decode("utf-8")
+            return output
+    except subprocess.CalledProcessError as e:
+        raise SubprocessCallException(
+            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
+        ) from e
+def path_in_accelerate_package(*components: str) -> Path:
+    """
+    Get a path within the `accelerate` package's directory.
+    Args:
+        *components: Components of the path to join after the package directory.
+    Returns:
+        `Path`: The path to the requested file or directory.
+    """
+    accelerate_package_dir = Path(inspect.getfile(accelerate)).parent
+    return accelerate_package_dir.joinpath(*components)
+@contextmanager
+def assert_exception(exception_class: Exception, msg: Optional[str] = None) -> bool:
+    """
+    Context manager to assert that the right `Exception` class was raised.
+    If `msg` is provided, will check that the message is contained in the raised exception.
+    """
+    was_ran = False
+    try:
+        yield
+        was_ran = True
+    except Exception as e:
+        assert isinstance(e, exception_class), f"Expected exception of type {exception_class} but got {type(e)}"
+        if msg is not None:
+            assert msg in str(e), f"Expected message '{msg}' to be in exception but got '{str(e)}'"
+    if was_ran:
+        raise AssertionError(f"Expected exception of type {exception_class} but ran without issue.")
+def capture_call_output(func, *args, **kwargs):
+    """
+    Takes in a `func` with `args` and `kwargs` and returns the captured stdout as a string
+    """
+    captured_output = io.StringIO()
+    original_stdout = sys.stdout
+    try:
+        sys.stdout = captured_output
+        func(*args, **kwargs)
+    except Exception as e:
+        raise e
+    finally:
+        sys.stdout = original_stdout
+    return captured_output.getvalue()

accelerate/test_utils/training.py ADDED Viewed

	@@ -0,0 +1,150 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from accelerate.utils.dataclasses import DistributedType
+class RegressionDataset:
+    def __init__(self, a=2, b=3, length=64, seed=None):
+        rng = np.random.default_rng(seed)
+        self.length = length
+        self.x = rng.normal(size=(length,)).astype(np.float32)
+        self.y = a * self.x + b + rng.normal(scale=0.1, size=(length,)).astype(np.float32)
+    def __len__(self):
+        return self.length
+    def __getitem__(self, i):
+        return {"x": self.x[i], "y": self.y[i]}
+class RegressionModel(torch.nn.Module):
+    def __init__(self, a=0, b=0, double_output=False):
+        super().__init__()
+        self.a = torch.nn.Parameter(torch.tensor(a).float())
+        self.b = torch.nn.Parameter(torch.tensor(b).float())
+        self.first_batch = True
+    def forward(self, x=None):
+        if self.first_batch:
+            print(f"Model dtype: {self.a.dtype}, {self.b.dtype}. Input dtype: {x.dtype}")
+            self.first_batch = False
+        return x * self.a + self.b
+def mocked_dataloaders(accelerator, batch_size: int = 16):
+    from datasets import load_dataset
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
+    datasets = load_dataset("csv", data_files=data_files)
+    label_list = datasets["train"].unique("label")
+    label_to_id = {v: i for i, v in enumerate(label_list)}
+    def tokenize_function(examples):
+        # max_length=None => use the model max length (it's actually the default)
+        outputs = tokenizer(
+            examples["sentence1"], examples["sentence2"], truncation=True, max_length=None, padding="max_length"
+        )
+        if "label" in examples:
+            outputs["labels"] = [label_to_id[l] for l in examples["label"]]
+        return outputs
+    # Apply the method we just defined to all the examples in all the splits of the dataset
+    tokenized_datasets = datasets.map(
+        tokenize_function,
+        batched=True,
+        remove_columns=["sentence1", "sentence2", "label"],
+    )
+    def collate_fn(examples):
+        # On TPU it's best to pad everything to the same length or training will be very slow.
+        if accelerator.distributed_type == DistributedType.XLA:
+            return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
+        return tokenizer.pad(examples, padding="longest", return_tensors="pt")
+    # Instantiate dataloaders.
+    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=2)
+    eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
+    return train_dataloader, eval_dataloader
+def mocked_dataloaders_for_autoregressive_models(accelerator, batch_size: int = 16):
+    from datasets import load_dataset
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M")
+    tokenizer.pad_token = tokenizer.eos_token
+    data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
+    datasets = load_dataset("csv", data_files=data_files)
+    def tokenize_function(examples):
+        # max_length=None => use the model max length (it's actually the default)
+        outputs = tokenizer(examples["sentence1"], truncation=True, max_length=None, return_attention_mask=False)
+        return outputs
+    # Apply the method we just defined to all the examples in all the splits of the dataset
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["sentence1", "sentence2", "label"],
+        )
+    def collate_fn(examples):
+        # On TPU it's best to pad everything to the same length or training will be very slow.
+        max_length = (
+            128
+            if accelerator.distributed_type == DistributedType.XLA
+            else max([len(e["input_ids"]) for e in examples])
+        )
+        # When using mixed precision we want round multiples of 8/16
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        batch = tokenizer.pad(
+            examples,
+            padding="max_length",
+            max_length=max_length + 1,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors="pt",
+        )
+        batch["labels"] = batch["input_ids"][:, 1:]
+        batch["input_ids"] = batch["input_ids"][:, :-1]
+        if "attention_mask" in batch:
+            batch["attention_mask"] = batch["attention_mask"][:, :-1]
+        batch["labels"] = torch.where(batch["labels"] == tokenizer.pad_token_id, -100, batch["labels"])
+        return batch
+    # Instantiate dataloaders.
+    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=False, collate_fn=collate_fn, batch_size=2)
+    eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
+    return train_dataloader, eval_dataloader

accelerate/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,304 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..parallelism_config import ParallelismConfig
+from .ao import convert_model_to_fp8_ao, filter_first_and_last_linear_layers, has_ao_layers
+from .constants import (
+    MITA_PROFILING_AVAILABLE_PYTORCH_VERSION,
+    MODEL_NAME,
+    OPTIMIZER_NAME,
+    PROFILE_PATTERN_NAME,
+    RNG_STATE_NAME,
+    SAFE_MODEL_NAME,
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFE_WEIGHTS_NAME,
+    SAFE_WEIGHTS_PATTERN_NAME,
+    SAMPLER_NAME,
+    SCALER_NAME,
+    SCHEDULER_NAME,
+    TORCH_DISTRIBUTED_OPERATION_TYPES,
+    TORCH_LAUNCH_PARAMS,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+    WEIGHTS_PATTERN_NAME,
+    XPU_PROFILING_AVAILABLE_PYTORCH_VERSION,
+)
+from .dataclasses import (
+    AORecipeKwargs,
+    AutocastKwargs,
+    BnbQuantizationConfig,
+    ComputeEnvironment,
+    CustomDtype,
+    DataLoaderConfiguration,
+    DDPCommunicationHookType,
+    DeepSpeedPlugin,
+    DeepSpeedSequenceParallelConfig,
+    DistributedDataParallelKwargs,
+    DistributedType,
+    DynamoBackend,
+    FP8RecipeKwargs,
+    FullyShardedDataParallelPlugin,
+    GradientAccumulationPlugin,
+    GradScalerKwargs,
+    InitProcessGroupKwargs,
+    KwargsHandler,
+    LoggerType,
+    MegatronLMPlugin,
+    MSAMPRecipeKwargs,
+    PrecisionType,
+    ProfileKwargs,
+    ProjectConfiguration,
+    RNGType,
+    SageMakerDistributedType,
+    TensorInformation,
+    TERecipeKwargs,
+    TorchContextParallelConfig,
+    TorchDynamoPlugin,
+    TorchTensorParallelConfig,
+    TorchTensorParallelPlugin,
+    add_model_config_to_megatron_parser,
+)
+from .environment import (
+    are_libraries_initialized,
+    check_cuda_fp8_capability,
+    check_cuda_p2p_ib_support,
+    clear_environment,
+    convert_dict_to_env_variables,
+    get_cpu_distributed_information,
+    get_current_device_type,
+    get_gpu_info,
+    get_int_from_env,
+    parse_choice_from_env,
+    parse_flag_from_env,
+    patch_environment,
+    purge_accelerate_environment,
+    set_numa_affinity,
+    str_to_bool,
+)
+from .imports import (
+    deepspeed_required,
+    is_4bit_bnb_available,
+    is_8bit_bnb_available,
+    is_aim_available,
+    is_bf16_available,
+    is_bitsandbytes_multi_backend_available,
+    is_bnb_available,
+    is_boto3_available,
+    is_clearml_available,
+    is_comet_ml_available,
+    is_cuda_available,
+    is_datasets_available,
+    is_deepspeed_available,
+    is_dvclive_available,
+    is_fp8_available,
+    is_fp16_available,
+    is_habana_gaudi1,
+    is_hpu_available,
+    is_import_timer_available,
+    is_lomo_available,
+    is_matplotlib_available,
+    is_megatron_lm_available,
+    is_mlflow_available,
+    is_mlu_available,
+    is_mps_available,
+    is_msamp_available,
+    is_musa_available,
+    is_neuron_available,
+    is_npu_available,
+    is_pandas_available,
+    is_peft_available,
+    is_pippy_available,
+    is_pynvml_available,
+    is_pytest_available,
+    is_rich_available,
+    is_sagemaker_available,
+    is_schedulefree_available,
+    is_sdaa_available,
+    is_swanlab_available,
+    is_tensorboard_available,
+    is_timm_available,
+    is_torch_xla_available,
+    is_torchao_available,
+    is_torchdata_available,
+    is_torchdata_stateful_dataloader_available,
+    is_torchvision_available,
+    is_trackio_available,
+    is_transformer_engine_available,
+    is_transformer_engine_mxfp8_available,
+    is_transformers_available,
+    is_triton_available,
+    is_wandb_available,
+    is_weights_only_available,
+    is_xccl_available,
+    is_xpu_available,
+    torchao_required,
+)
+from .modeling import (
+    align_module_device,
+    calculate_maximum_sizes,
+    check_device_map,
+    check_tied_parameters_in_config,
+    check_tied_parameters_on_same_device,
+    compute_module_sizes,
+    convert_file_size_to_int,
+    dtype_byte_size,
+    find_tied_parameters,
+    get_balanced_memory,
+    get_grad_scaler,
+    get_max_layer_size,
+    get_max_memory,
+    get_mixed_precision_context_manager,
+    has_offloaded_params,
+    id_tensor_storage,
+    infer_auto_device_map,
+    is_peft_model,
+    load_checkpoint_in_model,
+    load_offloaded_weights,
+    load_state_dict,
+    named_module_tensors,
+    retie_parameters,
+    set_module_tensor_to_device,
+)
+from .offload import (
+    OffloadedWeightsLoader,
+    PrefixedDataset,
+    extract_submodules_state_dict,
+    load_offloaded_weight,
+    offload_state_dict,
+    offload_weight,
+    save_offload_index,
+)
+from .operations import (
+    CannotPadNestedTensorWarning,
+    GatheredParameters,
+    broadcast,
+    broadcast_object_list,
+    concatenate,
+    convert_outputs_to_fp32,
+    convert_to_fp32,
+    copy_tensor_to_devices,
+    find_batch_size,
+    find_device,
+    gather,
+    gather_object,
+    get_data_structure,
+    honor_type,
+    ignorant_find_batch_size,
+    initialize_tensors,
+    is_namedtuple,
+    is_tensor_information,
+    is_torch_tensor,
+    listify,
+    pad_across_processes,
+    pad_input_tensors,
+    recursively_apply,
+    reduce,
+    send_to_device,
+    slice_tensors,
+)
+from .versions import compare_versions, is_torch_version
+if is_deepspeed_available():
+    from .deepspeed import (
+        DeepSpeedEngineWrapper,
+        DeepSpeedOptimizerWrapper,
+        DeepSpeedSchedulerWrapper,
+        DummyOptim,
+        DummyScheduler,
+        HfDeepSpeedConfig,
+        get_active_deepspeed_plugin,
+        map_pytorch_optim_to_deepspeed,
+    )
+from .bnb import has_4bit_bnb_layers, load_and_quantize_model
+from .fsdp_utils import (
+    disable_fsdp_ram_efficient_loading,
+    enable_fsdp_ram_efficient_loading,
+    ensure_weights_retied,
+    fsdp2_apply_ac,
+    fsdp2_canonicalize_names,
+    fsdp2_load_full_state_dict,
+    fsdp2_prepare_model,
+    fsdp2_switch_optimizer_parameters,
+    get_fsdp2_grad_scaler,
+    load_fsdp_model,
+    load_fsdp_optimizer,
+    merge_fsdp_weights,
+    save_fsdp_model,
+    save_fsdp_optimizer,
+)
+from .launch import (
+    PrepareForLaunch,
+    _filter_args,
+    prepare_deepspeed_cmd_env,
+    prepare_multi_gpu_env,
+    prepare_sagemager_args_inputs,
+    prepare_simple_launcher_cmd_env,
+    prepare_tpu,
+)
+# For docs
+from .megatron_lm import (
+    AbstractTrainStep,
+    BertTrainStep,
+    GPTTrainStep,
+    MegatronLMDummyDataLoader,
+    MegatronLMDummyScheduler,
+    T5TrainStep,
+    avg_losses_across_data_parallel_group,
+)
+if is_megatron_lm_available():
+    from .megatron_lm import (
+        MegatronEngine,
+        MegatronLMOptimizerWrapper,
+        MegatronLMSchedulerWrapper,
+        gather_across_data_parallel_groups,
+    )
+    from .megatron_lm import initialize as megatron_lm_initialize
+    from .megatron_lm import prepare_data_loader as megatron_lm_prepare_data_loader
+    from .megatron_lm import prepare_model_optimizer_scheduler as megatron_lm_prepare_model_optimizer_scheduler
+    from .megatron_lm import prepare_optimizer as megatron_lm_prepare_optimizer
+    from .megatron_lm import prepare_scheduler as megatron_lm_prepare_scheduler
+from .memory import find_executable_batch_size, release_memory
+from .other import (
+    check_os_kernel,
+    clean_state_dict_for_safetensors,
+    compile_regions,
+    compile_regions_deepspeed,
+    convert_bytes,
+    extract_model_from_parallel,
+    get_module_children_bottom_up,
+    get_pretty_name,
+    has_compiled_regions,
+    is_compiled_module,
+    is_port_in_use,
+    load,
+    merge_dicts,
+    model_has_dtensor,
+    recursive_getattr,
+    save,
+    wait_for_everyone,
+    write_basic_config,
+)
+from .random import set_seed, synchronize_rng_state, synchronize_rng_states
+from .torch_xla import install_xla
+from .tqdm import tqdm
+from .transformer_engine import (
+    apply_fp8_autowrap,
+    contextual_fp8_autocast,
+    convert_model,
+    has_transformer_engine_layers,
+)

accelerate/utils/ao.py ADDED Viewed

	@@ -0,0 +1,143 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Needed utilities for torchao FP8 training.
+"""
+from functools import partial
+from typing import TYPE_CHECKING, Callable, Optional
+import torch
+from .imports import is_torchao_available, torchao_required
+if TYPE_CHECKING:
+    if is_torchao_available():
+        from torchao.float8.float8_linear import Float8LinearConfig
+def find_first_last_linear_layers(model: torch.nn.Module):
+    """
+    Finds the first and last linear layer names in a model.
+    This is needed during FP8 to avoid issues with instability by keeping the first and last layers unquantized.
+    Ref: https://x.com/xariusrke/status/1826669142604141052
+    """
+    first_linear, last_linear = None, None
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.Linear):
+            if first_linear is None:
+                first_linear = name
+            last_linear = name
+    return first_linear, last_linear
+def filter_linear_layers(module, fqn: str, layers_to_filter: list[str]) -> bool:
+    """
+    A function which will check if `module` is:
+    - a `torch.nn.Linear` layer
+    - has in_features and out_features divisible by 16
+    - is not part of `layers_to_filter`
+    Args:
+        module (`torch.nn.Module`):
+            The module to check.
+        fqn (`str`):
+            The fully qualified name of the layer.
+        layers_to_filter (`List[str]`):
+            The list of layers to filter.
+    """
+    if isinstance(module, torch.nn.Linear):
+        if module.in_features % 16 != 0 or module.out_features % 16 != 0:
+            return False
+    if fqn in layers_to_filter:
+        return False
+    return True
+def filter_first_and_last_linear_layers(module, fqn: str) -> bool:
+    """
+    A filter function which will filter out all linear layers except the first and last.
+    <Tip>
+        For stability reasons, we skip the first and last linear layers Otherwise can lead to the model not training or
+        converging properly
+    </Tip>
+    Args:
+        module (`torch.nn.Module`):
+            The module to check.
+        fqn (`str`):
+            The fully qualified name of the layer.
+    """
+    first_linear, last_linear = find_first_last_linear_layers(module)
+    return filter_linear_layers(module, fqn, layers_to_filter=[first_linear, last_linear])
+@torchao_required
+def has_ao_layers(model: torch.nn.Module):
+    from torchao.float8.float8_linear import Float8Linear
+    for name, module in model.named_modules():
+        if isinstance(module, Float8Linear):
+            return True
+    return False
+@torchao_required
+def convert_model_to_fp8_ao(
+    model: torch.nn.Module,
+    config: Optional["Float8LinearConfig"] = None,
+    module_filter_func: Optional[Callable] = filter_first_and_last_linear_layers,
+):
+    """
+    Converts all `nn.Linear` layers in the model (except the first and last) to torchao's `Float8Linear` layer inplace.
+    Args:
+        model (`torch.nn.Module`):
+            The model to convert.
+        config (`torchao.float8.Float8LinearConfig`, *optional*):
+            The configuration for the FP8 training. Recommended to utilize
+            `torchao.float8.recipe_name_to_linear_config` to generate this. In general, the default config should be
+            sufficient (what is passed when set to `None`).
+        module_filter_func (`Callable`, *optional*, defaults to `filter_linear_layers`):
+            Optional function that must take in a module and layer name, and returns a boolean indicating whether the
+            module should be converted to FP8. Defaults to `filter_linear_layers`. See it for an example.
+    Example:
+    ```python
+    from accelerate.utils.ao import convert_model_to_fp8_ao
+    from accelerate import Accelerator
+    accelerator = Accelerator(
+    model = MyModel()
+    model.to(accelerator.device)
+    convert_to_float8_training(model)
+    model.train()
+    ```
+    """
+    from torchao.float8 import convert_to_float8_training
+    first_linear, last_linear = find_first_last_linear_layers(model)
+    if module_filter_func is None:
+        module_filter_func = partial(filter_linear_layers, layers_to_filter=[first_linear, last_linear])
+    convert_to_float8_training(model, module_filter_fn=module_filter_func, config=config)

accelerate/utils/bnb.py ADDED Viewed

	@@ -0,0 +1,464 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+from copy import deepcopy
+from typing import Optional, Union
+import torch
+import torch.nn as nn
+from accelerate.utils.imports import (
+    is_4bit_bnb_available,
+    is_8bit_bnb_available,
+)
+from ..big_modeling import dispatch_model, init_empty_weights
+from .dataclasses import BnbQuantizationConfig
+from .modeling import (
+    find_tied_parameters,
+    get_balanced_memory,
+    infer_auto_device_map,
+    load_checkpoint_in_model,
+    offload_weight,
+    set_module_tensor_to_device,
+)
+logger = logging.getLogger(__name__)
+def load_and_quantize_model(
+    model: torch.nn.Module,
+    bnb_quantization_config: BnbQuantizationConfig,
+    weights_location: Optional[Union[str, os.PathLike]] = None,
+    device_map: Optional[dict[str, Union[int, str, torch.device]]] = None,
+    no_split_module_classes: Optional[list[str]] = None,
+    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
+    offload_folder: Optional[Union[str, os.PathLike]] = None,
+    offload_state_dict: bool = False,
+):
+    """
+    This function will quantize the input model with the associated config passed in `bnb_quantization_config`. If the
+    model is in the meta device, we will load and dispatch the weights according to the `device_map` passed. If the
+    model is already loaded, we will quantize the model and put the model on the GPU (or Intel XPU).
+    Note that `bnb_quantization_config.skip_modules` and `bnb_quantization_config.keep_in_fp32_modules` are filled in
+    and extended in place by this function.
+    Args:
+        model (`torch.nn.Module`):
+            Input model. The model can be already loaded or on the meta device
+        bnb_quantization_config (`BnbQuantizationConfig`):
+            The bitsandbytes quantization parameters
+        weights_location (`str` or `os.PathLike`):
+            The folder weights_location to load. It can be:
+            - a path to a file containing a whole model state dict
+            - a path to a `.json` file containing the index to a sharded checkpoint
+            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
+            - a path to a folder containing a unique pytorch_model.bin file.
+        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
+            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
+            name, once a given module name is inside, every submodule of it will be sent to the same device.
+        no_split_module_classes (`List[str]`, *optional*):
+            A list of layer class names that should never be split across device (for instance any layer that has a
+            residual connection).
+        max_memory (`Dict`, *optional*):
+            A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
+        offload_folder (`str` or `os.PathLike`, *optional*):
+            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
+        offload_state_dict (`bool`, *optional*, defaults to `False`):
+            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
+            the weight of the CPU state dict + the biggest shard does not fit.
+    Returns:
+        `torch.nn.Module`: The quantized model
+    Raises:
+        ImportError: If 8-bit loading is requested but `is_8bit_bnb_available()` is false.
+        ValueError: If 4-bit loading is requested but `is_4bit_bnb_available()` is false.
+        RuntimeError: If an already loaded model cannot be moved to a CUDA or XPU device, or if the model is on the
+            meta device and `weights_location` is `None`.
+    """
+    load_in_4bit = bnb_quantization_config.load_in_4bit
+    load_in_8bit = bnb_quantization_config.load_in_8bit
+    if load_in_8bit and not is_8bit_bnb_available():
+        raise ImportError(
+            "You have a version of `bitsandbytes` that is not compatible with 8bit quantization,"
+            " make sure you have the latest version of `bitsandbytes` installed."
+        )
+    # NOTE(review): unlike the 8-bit branch this raises `ValueError`, and the two message fragments are joined
+    # without a space ("quantization,make sure").
+    if load_in_4bit and not is_4bit_bnb_available():
+        raise ValueError(
+            "You have a version of `bitsandbytes` that is not compatible with 4bit quantization,"
+            "make sure you have the latest version of `bitsandbytes` installed."
+        )
+    modules_on_cpu = []
+    # custom device map
+    if isinstance(device_map, dict) and len(device_map.keys()) > 1:
+        modules_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
+    # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
+    if bnb_quantization_config.skip_modules is None:
+        bnb_quantization_config.skip_modules = get_keys_to_not_convert(model)
+    # add cpu modules to skip modules only for 4-bit modules
+    # (this extends the list stored on the config object in place)
+    if load_in_4bit:
+        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
+    # `modules_to_not_convert` is the same list object as `skip_modules`, so the `extend` below also grows
+    # `bnb_quantization_config.skip_modules`.
+    modules_to_not_convert = bnb_quantization_config.skip_modules
+    # We add the modules we want to keep in full precision
+    if bnb_quantization_config.keep_in_fp32_modules is None:
+        bnb_quantization_config.keep_in_fp32_modules = []
+    keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules
+    modules_to_not_convert.extend(keep_in_fp32_modules)
+    # compatibility with peft
+    model.is_loaded_in_4bit = load_in_4bit
+    model.is_loaded_in_8bit = load_in_8bit
+    # Device of the model's first parameter decides which of the three paths below is taken.
+    model_device = get_parameter_device(model)
+    if model_device.type != "meta":
+        # quantization of an already loaded model
+        logger.warning(
+            "It is not recommended to quantize a loaded model. "
+            "The model should be instantiated under the `init_empty_weights` context manager."
+        )
+        model = replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert)
+        # convert param to the right dtype
+        dtype = bnb_quantization_config.torch_dtype
+        for name, param in model.named_parameters():
+            # substring match on the parameter name: fp32 modules win, other floating point params get `dtype`
+            if any(module_to_keep_in_fp32 in name for module_to_keep_in_fp32 in keep_in_fp32_modules):
+                param.data = param.data.to(torch.float32)
+            elif torch.is_floating_point(param):
+                param.data = param.data.to(dtype)
+        # Move the model to an accelerator: stay on CUDA if already there, otherwise prefer CUDA, then XPU.
+        if model_device.type == "cuda":
+            model.cuda(torch.cuda.current_device())
+            torch.cuda.empty_cache()
+        elif torch.cuda.is_available():
+            model.to(torch.cuda.current_device())
+        elif torch.xpu.is_available():
+            model.to(torch.xpu.current_device())
+        else:
+            raise RuntimeError("No GPU or Intel XPU found. A GPU or Intel XPU is needed for quantization.")
+        # Logged after the move has happened; `model_device` is the device observed before the move.
+        logger.info(
+            f"The model device type is {model_device.type}. However, gpu or intel xpu is needed for quantization."
+            "We move the model to it."
+        )
+        return model
+    elif weights_location is None:
+        # meta-device model without weights to load: nothing can be materialized
+        raise RuntimeError(
+            f"`weights_location` needs to be the folder path containing the weights of the model, but we found {weights_location} "
+        )
+    else:
+        # meta-device model: swap layers without allocating memory, then load the checkpoint and dispatch
+        with init_empty_weights():
+            model = replace_with_bnb_layers(
+                model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert
+            )
+        device_map = get_quantized_model_device_map(
+            model,
+            bnb_quantization_config,
+            device_map,
+            max_memory=max_memory,
+            no_split_module_classes=no_split_module_classes,
+        )
+        # NOTE(review): `offload_state_dict` defaults to `False`, so this only triggers when a caller passes `None`
+        # explicitly -- confirm whether that is intended.
+        if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
+            offload_state_dict = True
+        # True when at least one entry of the device map targets the CPU or the disk
+        offload = any(x in list(device_map.values()) for x in ["cpu", "disk"])
+        load_checkpoint_in_model(
+            model,
+            weights_location,
+            device_map,
+            dtype=bnb_quantization_config.torch_dtype,
+            offload_folder=offload_folder,
+            offload_state_dict=offload_state_dict,
+            keep_in_fp32_modules=bnb_quantization_config.keep_in_fp32_modules,
+            offload_8bit_bnb=load_in_8bit and offload,
+        )
+        return dispatch_model(model, device_map=device_map, offload_dir=offload_folder)
+def get_quantized_model_device_map(
+    model, bnb_quantization_config, device_map=None, max_memory=None, no_split_module_classes=None
+):
+    """
+    Resolves the device map to use for a model that is being quantized.
+    - `device_map=None`: everything goes on the current CUDA device, or the current XPU device if CUDA is not
+      available.
+    - `device_map` is a string: it must be one of `"auto"`, `"balanced"`, `"balanced_low_0"` or `"sequential"` and
+      the map is inferred with `infer_auto_device_map`.
+    - `device_map` is a dict (passed in or produced by one of the cases above): it is checked for modules placed on
+      `"cpu"` or `"disk"` that are neither in `skip_modules` nor in `keep_in_fp32_modules`.
+    Args:
+        model (`torch.nn.Module`):
+            The model to build the device map for.
+        bnb_quantization_config (`BnbQuantizationConfig`):
+            The bitsandbytes quantization parameters. `skip_modules` and `keep_in_fp32_modules` are expected to be
+            lists here.
+        device_map (`dict` or `str`, *optional*):
+            An explicit device map or the name of an automatic placement strategy.
+        max_memory (`Dict`, *optional*):
+            A dictionary device identifier to maximum memory, forwarded to the automatic placement helpers.
+        no_split_module_classes (`List[str]`, *optional*):
+            Layer class names forwarded to the automatic placement helpers.
+    Returns:
+        The resolved device map.
+    Raises:
+        RuntimeError: If `device_map` is `None` and neither CUDA nor XPU is available.
+        ValueError: If `device_map` is an unsupported string, or if 4-bit loading is requested while a module that
+            would be quantized is mapped to `"cpu"` or `"disk"`.
+    """
+    if device_map is None:
+        if torch.cuda.is_available():
+            device_map = {"": torch.cuda.current_device()}
+        elif torch.xpu.is_available():
+            device_map = {"": torch.xpu.current_device()}
+        else:
+            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
+        # NOTE(review): the message always mentions CUDA, even when the XPU branch was taken above.
+        logger.info("The device_map was not initialized.Setting device_map to `{'':torch.cuda.current_device()}`.")
+    if isinstance(device_map, str):
+        if device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
+            raise ValueError(
+                "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "
+                "'sequential'."
+            )
+        # Per-parameter dtype overrides (substring match on the parameter name). The second update runs last, so a
+        # parameter matching both lists ends up as float32.
+        special_dtypes = {}
+        special_dtypes.update(
+            {
+                name: bnb_quantization_config.torch_dtype
+                for name, _ in model.named_parameters()
+                if any(m in name for m in bnb_quantization_config.skip_modules)
+            }
+        )
+        special_dtypes.update(
+            {
+                name: torch.float32
+                for name, _ in model.named_parameters()
+                if any(m in name for m in bnb_quantization_config.keep_in_fp32_modules)
+            }
+        )
+        kwargs = {}
+        kwargs["special_dtypes"] = special_dtypes
+        kwargs["no_split_module_classes"] = no_split_module_classes
+        # every parameter without an override is sized with the config's `target_dtype`
+        kwargs["dtype"] = bnb_quantization_config.target_dtype
+        # get max_memory for each device.
+        if device_map != "sequential":
+            max_memory = get_balanced_memory(
+                model,
+                low_zero=(device_map == "balanced_low_0"),
+                max_memory=max_memory,
+                **kwargs,
+            )
+        kwargs["max_memory"] = max_memory
+        device_map = infer_auto_device_map(model, **kwargs)
+    if isinstance(device_map, dict):
+        # check if don't have any quantized module on the cpu
+        modules_not_to_convert = bnb_quantization_config.skip_modules + bnb_quantization_config.keep_in_fp32_modules
+        # entries whose key is exactly one of the non-converted module names are ignored by the check below
+        device_map_without_some_modules = {
+            key: device_map[key] for key in device_map.keys() if key not in modules_not_to_convert
+        }
+        for device in ["cpu", "disk"]:
+            if device in device_map_without_some_modules.values():
+                # 4-bit: offloading a module that would be quantized is an error; 8-bit: only an info message
+                if bnb_quantization_config.load_in_4bit:
+                    raise ValueError(
+                        """
+                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
+                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
+                        these modules in `torch_dtype`, you need to pass a custom `device_map` to
+                        `load_and_quantize_model`. Check
+                        https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization#offload-modules-to-cpu-and-disk
+                        for more details.
+                        """
+                    )
+                else:
+                    logger.info(
+                        "Some modules are are offloaded to the CPU or the disk. Note that these modules will be converted to 8-bit"
+                    )
+        del device_map_without_some_modules
+    return device_map
+def replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=None, current_key_name=None):
+    """
+    A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bit` modules or by `bnb.nn.Linear4bit`
+    modules from the `bitsandbytes`library. The function will be run recursively and replace `torch.nn.Linear` modules.
+    Parameters:
+        model (`torch.nn.Module`):
+            Input model or `torch.nn.Module` as the function is run recursively.
+        modules_to_not_convert (`List[str]`):
+            Names of the modules to not quantize convert. In practice we keep the `lm_head` in full precision for
+            numerical stability reasons.
+        current_key_name (`List[str]`, *optional*):
+            An array to track the current key of the recursion. This is used to check whether the current key (part of
+            it) is not in the list of modules to not convert.
+    """
+    if modules_to_not_convert is None:
+        modules_to_not_convert = []
+    model, has_been_replaced = _replace_with_bnb_layers(
+        model, bnb_quantization_config, modules_to_not_convert, current_key_name
+    )
+    if not has_been_replaced:
+        logger.warning(
+            "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
+            " this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers."
+            " Please double check your model architecture, or submit an issue on github if you think this is"
+            " a bug."
+        )
+    return model
+def _replace_with_bnb_layers(
+    model,
+    bnb_quantization_config,
+    modules_to_not_convert=None,
+    current_key_name=None,
+):
+    """
+    Private method that wraps the recursion for module replacement.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
+    """
+    # bitsandbytes will initialize device(e.g. CUDA, XPU) on import, so it needs to be imported lazily
+    import bitsandbytes as bnb
+    has_been_replaced = False
+    for name, module in model.named_children():
+        if current_key_name is None:
+            current_key_name = []
+        current_key_name.append(name)
+        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
+            # Check if the current key is not in the `modules_to_not_convert`
+            current_key_name_str = ".".join(current_key_name)
+            proceed = True
+            for key in modules_to_not_convert:
+                if (
+                    (key in current_key_name_str) and (key + "." in current_key_name_str)
+                ) or key == current_key_name_str:
+                    proceed = False
+                    break
+            if proceed:
+                # Load bnb module with empty weight and replace ``nn.Linear` module
+                if bnb_quantization_config.load_in_8bit:
+                    bnb_module = bnb.nn.Linear8bitLt(
+                        module.in_features,
+                        module.out_features,
+                        module.bias is not None,
+                        has_fp16_weights=False,
+                        threshold=bnb_quantization_config.llm_int8_threshold,
+                    )
+                elif bnb_quantization_config.load_in_4bit:
+                    bnb_module = bnb.nn.Linear4bit(
+                        module.in_features,
+                        module.out_features,
+                        module.bias is not None,
+                        bnb_quantization_config.bnb_4bit_compute_dtype,
+                        compress_statistics=bnb_quantization_config.bnb_4bit_use_double_quant,
+                        quant_type=bnb_quantization_config.bnb_4bit_quant_type,
+                    )
+                else:
+                    raise ValueError("load_in_8bit and load_in_4bit can't be both False")
+                bnb_module.weight.data = module.weight.data
+                if module.bias is not None:
+                    bnb_module.bias.data = module.bias.data
+                bnb_module.requires_grad_(False)
+                setattr(model, name, bnb_module)
+                has_been_replaced = True
+        if len(list(module.children())) > 0:
+            _, _has_been_replaced = _replace_with_bnb_layers(
+                module, bnb_quantization_config, modules_to_not_convert, current_key_name
+            )
+            has_been_replaced = has_been_replaced | _has_been_replaced
+        # Remove the last key for recursion
+        current_key_name.pop(-1)
+    return model, has_been_replaced
+def get_keys_to_not_convert(model):
+    r"""
+    An utility function to get the key of the module to keep in full precision if any For example for CausalLM modules
+    we may want to keep the lm_head in full precision for numerical stability reasons. For other architectures, we want
+    to keep the tied weights of the model. The function will return a list of the keys of the modules to not convert in
+    int8.
+    Parameters:
+    model (`torch.nn.Module`):
+        Input model
+    """
+    # Create a copy of the model
+    with init_empty_weights():
+        tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager`
+    tied_params = find_tied_parameters(tied_model)
+    # For compatibility with Accelerate < 0.18
+    if isinstance(tied_params, dict):
+        tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
+    else:
+        tied_keys = sum(tied_params, [])
+    has_tied_params = len(tied_keys) > 0
+    # Check if it is a base model
+    is_base_model = False
+    if hasattr(model, "base_model_prefix"):
+        is_base_model = not hasattr(model, model.base_model_prefix)
+    # Ignore this for base models (BertModel, GPT2Model, etc.)
+    if (not has_tied_params) and is_base_model:
+        return []
+    # otherwise they have an attached head
+    list_modules = list(model.named_children())
+    list_last_module = [list_modules[-1][0]]
+    # add last module together with tied weights
+    intersection = set(list_last_module) - set(tied_keys)
+    list_untouched = list(set(tied_keys)) + list(intersection)
+    # remove ".weight" from the keys
+    names_to_remove = [".weight", ".bias"]
+    filtered_module_names = []
+    for name in list_untouched:
+        for name_to_remove in names_to_remove:
+            if name_to_remove in name:
+                name = name.replace(name_to_remove, "")
+        filtered_module_names.append(name)
+    return filtered_module_names
+def has_4bit_bnb_layers(model):
+    """Check if we have `bnb.nn.Linear4bit` or `bnb.nn.Linear8bitLt` layers inside our model"""
+    # bitsandbytes will initialize device(e.g. CUDA, XPU) on import, so it needs to be imported lazily
+    import bitsandbytes as bnb
+    for m in model.modules():
+        if isinstance(m, bnb.nn.Linear4bit):
+            return True
+    return False
+def get_parameter_device(parameter: nn.Module):
+    return next(parameter.parameters()).device
+def quantize_and_offload_8bit(model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics):
+    # if it is not quantized, we quantize and offload the quantized weights and the SCB stats
+    if fp16_statistics is None:
+        set_module_tensor_to_device(model, param_name, 0, dtype=new_dtype, value=param)
+        tensor_name = param_name
+        module = model
+        if "." in tensor_name:
+            splits = tensor_name.split(".")
+            for split in splits[:-1]:
+                new_module = getattr(module, split)
+                if new_module is None:
+                    raise ValueError(f"{module} has no attribute {split}.")
+                module = new_module
+            tensor_name = splits[-1]
+        # offload weights
+        module._parameters[tensor_name].requires_grad = False
+        offload_weight(module._parameters[tensor_name], param_name, offload_folder, index=offload_index)
+        if hasattr(module._parameters[tensor_name], "SCB"):
+            offload_weight(
+                module._parameters[tensor_name].SCB,
+                param_name.replace("weight", "SCB"),
+                offload_folder,
+                index=offload_index,
+            )
+    else:
+        offload_weight(param, param_name, offload_folder, index=offload_index)
+        offload_weight(fp16_statistics, param_name.replace("weight", "SCB"), offload_folder, index=offload_index)
+    set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype, value=torch.empty(*param.size()))

accelerate/utils/constants.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import operator as op
+import torch
+SCALER_NAME = "scaler.pt"
+MODEL_NAME = "pytorch_model"
+SAFE_MODEL_NAME = "model"
+RNG_STATE_NAME = "random_states"
+OPTIMIZER_NAME = "optimizer"
+SCHEDULER_NAME = "scheduler"
+SAMPLER_NAME = "sampler"
+PROFILE_PATTERN_NAME = "profile_{suffix}.json"
+WEIGHTS_NAME = f"{MODEL_NAME}.bin"
+WEIGHTS_PATTERN_NAME = "pytorch_model{suffix}.bin"
+WEIGHTS_INDEX_NAME = f"{WEIGHTS_NAME}.index.json"
+SAFE_WEIGHTS_NAME = f"{SAFE_MODEL_NAME}.safetensors"
+SAFE_WEIGHTS_PATTERN_NAME = "model{suffix}.safetensors"
+SAFE_WEIGHTS_INDEX_NAME = f"{SAFE_WEIGHTS_NAME}.index.json"
+SAGEMAKER_PYTORCH_VERSION = "1.10.2"  # pinned version string; presumably the default offered for SageMaker jobs — confirm at call sites
+SAGEMAKER_PYTHON_VERSION = "py38"  # "py<major><minor>" tag form, not a dotted version
+SAGEMAKER_TRANSFORMERS_VERSION = "4.17.0"  # pinned version string
+SAGEMAKER_PARALLEL_EC2_INSTANCES = ["ml.p3.16xlarge", "ml.p3dn.24xlarge", "ml.p4dn.24xlarge"]  # NOTE(review): the last entry looks like a misspelling of "ml.p4d.24xlarge" — confirm against AWS docs before changing
+FSDP_SHARDING_STRATEGY = ["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD", "HYBRID_SHARD_ZERO2"]  # option names as plain strings; presumably mapped onto torch's FSDP sharding enum elsewhere — verify
+FSDP_AUTO_WRAP_POLICY = ["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP", "NO_WRAP"]  # option names as plain strings
+FSDP_BACKWARD_PREFETCH = ["BACKWARD_PRE", "BACKWARD_POST", "NO_PREFETCH"]  # option names as plain strings
+FSDP_STATE_DICT_TYPE = ["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"]  # option names as plain strings
+FSDP2_STATE_DICT_TYPE = ["SHARDED_STATE_DICT", "FULL_STATE_DICT"]  # subset of FSDP_STATE_DICT_TYPE: no "LOCAL_STATE_DICT", and the order differs
+FSDP_PYTORCH_VERSION = (
+    "2.1.0.a0+32f93b1"  # Technically should be 2.1.0, but MS-AMP uses this specific prerelease in their Docker image.
+)
+FSDP2_PYTORCH_VERSION = "2.6.0"  # version string; presumably a minimum-PyTorch gate for FSDP2 — confirm at call sites
+DTENSOR_PYTORCH_VERSION = "2.5.0"  # version string; presumably a minimum-PyTorch gate for DTensor — confirm at call sites
+FSDP_MODEL_NAME = "pytorch_model_fsdp"  # extension-less base name, distinct from MODEL_NAME ("pytorch_model")
+DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich", "nossh", "slurm"]  # launcher names as plain strings
+TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"]  # mode names as plain strings; presumably passed through to torch.compile — verify
+ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION = "2.2.0"  # version string; presumably a minimum-PyTorch gate — confirm at call sites
+XPU_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.4.0"  # version string; presumably a minimum-PyTorch gate — confirm at call sites
+MITA_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.1.0"  # NOTE(review): "MITA" may be a transposition of "MTIA"; the name is part of the module's interface, so do not rename without updating importers
+BETA_TP_AVAILABLE_PYTORCH_VERSION = "2.3.0"  # version string; presumably a minimum-PyTorch gate — confirm at call sites
+BETA_TP_AVAILABLE_TRANSFORMERS_VERSION = "4.52.0"  # version string; presumably a minimum-transformers gate — confirm at call sites
+BETA_CP_AVAILABLE_PYTORCH_VERSION = "2.6.0"  # version string; presumably a minimum-PyTorch gate — confirm at call sites
+BETA_SP_AVAILABLE_DEEPSPEED_VERSION = "0.18.2"  # version string; presumably a minimum-DeepSpeed gate — confirm at call sites
+STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}  # comparison-operator text -> matching two-argument function from the stdlib `operator` module
+# These are the args for `torch.distributed.launch` for pytorch < 1.9 (bare option names, no leading dashes). NOTE(review): the rdzv_* entries look like newer elastic-launch options — confirm the version bound.
+TORCH_LAUNCH_PARAMS = [
+    "nnodes",
+    "nproc_per_node",
+    "rdzv_backend",
+    "rdzv_endpoint",
+    "rdzv_id",
+    "rdzv_conf",
+    "standalone",
+    "max_restarts",
+    "monitor_interval",
+    "start_method",
+    "role",
+    "module",
+    "m",  # presumably the short spelling of "module" — verify against the launcher's argparse
+    "no_python",
+    "run_path",
+    "log_dir",
+    "r",  # presumably the short spelling of "redirects" — verify
+    "redirects",
+    "t",  # presumably the short spelling of "tee" — verify
+    "tee",
+    "node_rank",
+    "master_addr",
+    "master_port",
+]
+CUDA_DISTRIBUTED_TYPES = ["DEEPSPEED", "MULTI_GPU", "FSDP", "MEGATRON_LM", "TP"]  # distributed-type names as plain strings
+TORCH_DISTRIBUTED_OPERATION_TYPES = CUDA_DISTRIBUTED_TYPES + [  # list `+` builds a new list: the CUDA names first, then the entries below; CUDA_DISTRIBUTED_TYPES itself is not modified
+    "MULTI_NPU",
+    "MULTI_MLU",
+    "MULTI_SDAA",
+    "MULTI_MUSA",
+    "MULTI_XPU",
+    "MULTI_CPU",
+    "MULTI_HPU",
+    "MULTI_NEURON",
+]
+SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING = (  # tuple of torch.nn module classes (not instances); a tuple can be passed directly as the second argument of isinstance()
+    torch.nn.Conv1d,
+    torch.nn.Conv2d,
+    torch.nn.Conv3d,
+    torch.nn.ConvTranspose1d,
+    torch.nn.ConvTranspose2d,
+    torch.nn.ConvTranspose3d,
+    torch.nn.Linear,  # the only non-convolutional class in the tuple
+)