xiaoanyu123 committed on
Commit cf47cca · verified · 1 Parent(s): 8e2e3c8

Add files using upload-large-folder tool

pythonProject/.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
pythonProject/.venv/Lib/site-packages/accelerate/accelerator.py ADDED
The diff for this file is too large to render. See raw diff
 
pythonProject/.venv/Lib/site-packages/accelerate/big_modeling.py ADDED
@@ -0,0 +1,789 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import re
+from contextlib import contextmanager
+from functools import wraps
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from .hooks import (
+    AlignDevicesHook,
+    CpuOffload,
+    LayerwiseCastingHook,
+    UserCpuOffloadHook,
+    add_hook_to_module,
+    attach_align_device_hook,
+    attach_align_device_hook_on_blocks,
+)
+from .utils import (
+    OffloadedWeightsLoader,
+    check_cuda_p2p_ib_support,
+    check_device_map,
+    extract_submodules_state_dict,
+    find_tied_parameters,
+    get_balanced_memory,
+    infer_auto_device_map,
+    is_bnb_available,
+    is_mlu_available,
+    is_musa_available,
+    is_npu_available,
+    is_sdaa_available,
+    is_xpu_available,
+    load_checkpoint_in_model,
+    offload_state_dict,
+    parse_flag_from_env,
+    retie_parameters,
+)
+from .utils.constants import SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING
+from .utils.other import recursive_getattr
+
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def init_empty_weights(include_buffers: bool = None):
+    """
+    A context manager under which models are initialized with all parameters on the meta device, therefore creating an
+    empty model. Useful when just initializing the model would blow the available RAM.
+
+    Args:
+        include_buffers (`bool`, *optional*):
+            Whether or not to also put all buffers on the meta device while initializing.
+
+    Example:
+
+    ```python
+    import torch.nn as nn
+    from accelerate import init_empty_weights
+
+    # Initialize a model with 100 billion parameters in no time and without using any RAM.
+    with init_empty_weights():
+        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
+    ```
+
+    <Tip warning={true}>
+
+    Any model created under this context manager has no weights. As such you can't do something like
+    `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
+    Make sure to overwrite the default device_map param for [`load_checkpoint_and_dispatch`], otherwise dispatch is not
+    called.
+
+    </Tip>
+    """
+    if include_buffers is None:
+        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)
+    with init_on_device(torch.device("meta"), include_buffers=include_buffers) as f:
+        yield f
+
+
+@contextmanager
+def init_on_device(device: torch.device, include_buffers: bool = None):
+    """
+    A context manager under which models are initialized with all parameters on the specified device.
+
+    Args:
+        device (`torch.device`):
+            Device to initialize all parameters on.
+        include_buffers (`bool`, *optional*):
+            Whether or not to also put all buffers on the meta device while initializing.
+
+    Example:
+
+    ```python
+    import torch.nn as nn
+    from accelerate import init_on_device
+
+    with init_on_device(device=torch.device("cuda")):
+        tst = nn.Linear(100, 100)  # on `cuda` device
+    ```
+    """
+    if include_buffers is None:
+        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)
+
+    if include_buffers:
+        with device:
+            yield
+        return
+
+    old_register_parameter = nn.Module.register_parameter
+    if include_buffers:
+        old_register_buffer = nn.Module.register_buffer
+
+    def register_empty_parameter(module, name, param):
+        old_register_parameter(module, name, param)
+        if param is not None:
+            param_cls = type(module._parameters[name])
+            kwargs = module._parameters[name].__dict__
+            kwargs["requires_grad"] = param.requires_grad
+            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
+
+    def register_empty_buffer(module, name, buffer, persistent=True):
+        old_register_buffer(module, name, buffer, persistent=persistent)
+        if buffer is not None:
+            module._buffers[name] = module._buffers[name].to(device)
+
+    # Patch tensor creation
+    if include_buffers:
+        tensor_constructors_to_patch = {
+            torch_function_name: getattr(torch, torch_function_name)
+            for torch_function_name in ["empty", "zeros", "ones", "full"]
+        }
+    else:
+        tensor_constructors_to_patch = {}
+
+    def patch_tensor_constructor(fn):
+        def wrapper(*args, **kwargs):
+            kwargs["device"] = device
+            return fn(*args, **kwargs)
+
+        return wrapper
+
+    try:
+        nn.Module.register_parameter = register_empty_parameter
+        if include_buffers:
+            nn.Module.register_buffer = register_empty_buffer
+        for torch_function_name in tensor_constructors_to_patch.keys():
+            setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
+        yield
+    finally:
+        nn.Module.register_parameter = old_register_parameter
+        if include_buffers:
+            nn.Module.register_buffer = old_register_buffer
+        for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
+            setattr(torch, torch_function_name, old_torch_function)
+
+
+def cpu_offload(
+    model: nn.Module,
+    execution_device: Optional[torch.device] = None,
+    offload_buffers: bool = False,
+    state_dict: Optional[dict[str, torch.Tensor]] = None,
+    preload_module_classes: Optional[list[str]] = None,
+):
+    """
+    Activates full CPU offload for a model. As a result, all parameters of the model will be offloaded and only one
+    copy of the state dict of the model will be kept. During the forward pass, parameters will be extracted from that
+    state dict and put on the execution device passed as they are needed, then offloaded again.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to offload.
+        execution_device (`torch.device`, *optional*):
+            The device on which the forward pass of the model will be executed (should be a GPU). Will default to the
+            model's first parameter device.
+        offload_buffers (`bool`, *optional*, defaults to `False`):
+            Whether or not to offload the buffers with the model parameters.
+        state_dict (`Dict[str, torch.Tensor]`, *optional*):
+            The state dict of the model that will be kept on CPU.
+        preload_module_classes (`List[str]`, *optional*):
+            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
+            of the forward. This should only be used for classes that have submodules which are registered but not
+            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
+            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
+    """
+    if execution_device is None:
+        execution_device = next(iter(model.parameters())).device
+    if state_dict is None:
+        state_dict = {n: p.to("cpu") for n, p in model.state_dict().items()}
+
+    add_hook_to_module(model, AlignDevicesHook(io_same_device=True), append=True)
+    attach_align_device_hook(
+        model,
+        execution_device=execution_device,
+        offload=True,
+        offload_buffers=offload_buffers,
+        weights_map=state_dict,
+        preload_module_classes=preload_module_classes,
+    )
+
+    return model
+
+
+def cpu_offload_with_hook(
+    model: torch.nn.Module,
+    execution_device: Optional[Union[int, str, torch.device]] = None,
+    prev_module_hook: Optional[UserCpuOffloadHook] = None,
+):
+    """
+    Offloads a model on the CPU and puts it back to an execution device when executed. The difference with
+    [`cpu_offload`] is that the model stays on the execution device after the forward and is only offloaded again when
+    the `offload` method of the returned `hook` is called. Useful for pipelines running a model in a loop.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to offload.
+        execution_device(`str`, `int` or `torch.device`, *optional*):
+            The device on which the model should be executed. Will default to the MPS device if it's available, then
+            GPU 0 if there is a GPU, and finally to the CPU.
+        prev_module_hook (`UserCpuOffloadHook`, *optional*):
+            The hook sent back by this function for a previous model in the pipeline you are running. If passed, its
+            offload method will be called just before the forward of the model to which this hook is attached.
+
+    Example:
+
+    ```py
+    model_1, hook_1 = cpu_offload_with_hook(model_1, cuda_device)
+    model_2, hook_2 = cpu_offload_with_hook(model_2, cuda_device, prev_module_hook=hook_1)
+    model_3, hook_3 = cpu_offload_with_hook(model_3, cuda_device, prev_module_hook=hook_2)
+
+    hid_1 = model_1(input)
+    for i in range(50):
+        # model1 is offloaded on the CPU at the first iteration, model 2 stays on the GPU for this whole loop.
+        hid_2 = model_2(hid_1)
+    # model2 is offloaded to the CPU just before this forward.
+    hid_3 = model_3(hid_2)
+
+    # For model3, you need to manually call the hook offload method.
+    hook_3.offload()
+    ```
+    """
+    hook = CpuOffload(execution_device=execution_device, prev_module_hook=prev_module_hook)
+    add_hook_to_module(model, hook, append=True)
+    user_hook = UserCpuOffloadHook(model, hook)
+    return model, user_hook
+
+
+def disk_offload(
+    model: nn.Module,
+    offload_dir: Union[str, os.PathLike],
+    execution_device: Optional[torch.device] = None,
+    offload_buffers: bool = False,
+    preload_module_classes: Optional[list[str]] = None,
+):
+    """
+    Activates full disk offload for a model. As a result, all parameters of the model will be offloaded as
+    memory-mapped arrays in a given folder. During the forward pass, parameters will be accessed from that folder and
+    put on the execution device passed as they are needed, then offloaded again.
+
+    Args:
+        model (`torch.nn.Module`): The model to offload.
+        offload_dir (`str` or `os.PathLike`):
+            The folder in which to offload the model weights (or where the model weights are already offloaded).
+        execution_device (`torch.device`, *optional*):
+            The device on which the forward pass of the model will be executed (should be a GPU). Will default to the
+            model's first parameter device.
+        offload_buffers (`bool`, *optional*, defaults to `False`):
+            Whether or not to offload the buffers with the model parameters.
+        preload_module_classes (`List[str]`, *optional*):
+            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
+            of the forward. This should only be used for classes that have submodules which are registered but not
+            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
+            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
+    """
+    if not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")):
+        offload_state_dict(offload_dir, model.state_dict())
+    if execution_device is None:
+        execution_device = next(iter(model.parameters())).device
+    weights_map = OffloadedWeightsLoader(save_folder=offload_dir)
+
+    add_hook_to_module(model, AlignDevicesHook(io_same_device=True), append=True)
+    attach_align_device_hook(
+        model,
+        execution_device=execution_device,
+        offload=True,
+        offload_buffers=offload_buffers,
+        weights_map=weights_map,
+        preload_module_classes=preload_module_classes,
+    )
+
+    return model
+
+
+def dispatch_model(
+    model: nn.Module,
+    device_map: dict[str, Union[str, int, torch.device]],
+    main_device: Optional[torch.device] = None,
+    state_dict: Optional[dict[str, torch.Tensor]] = None,
+    offload_dir: Optional[Union[str, os.PathLike]] = None,
+    offload_index: Optional[dict[str, str]] = None,
+    offload_buffers: bool = False,
+    skip_keys: Optional[Union[str, list[str]]] = None,
+    preload_module_classes: Optional[list[str]] = None,
+    force_hooks: bool = False,
+):
+    """
+    Dispatches a model according to a given device map. Layers of the model might be spread across GPUs, offloaded on
+    the CPU or even the disk.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to dispatch.
+        device_map (`Dict[str, Union[str, int, torch.device]]`):
+            A dictionary mapping module names in the model's `state_dict` to the device they should go to. Note that
+            `"disk"` is accepted even if it's not a proper value for `torch.device`.
+        main_device (`str`, `int` or `torch.device`, *optional*):
+            The main execution device. Will default to the first device in the `device_map` different from `"cpu"` or
+            `"disk"`.
+        state_dict (`Dict[str, torch.Tensor]`, *optional*):
+            The state dict of the part of the model that will be kept on CPU.
+        offload_dir (`str` or `os.PathLike`):
+            The folder in which to offload the model weights (or where the model weights are already offloaded).
+        offload_index (`Dict`, *optional*):
+            A dictionary from weight name to their information (`dtype`/`shape` or safetensors filename). Will default
+            to the index saved in `save_folder`.
+        offload_buffers (`bool`, *optional*, defaults to `False`):
+            Whether or not to offload the buffers with the model parameters.
+        skip_keys (`str` or `List[str]`, *optional*):
+            A list of keys to ignore when moving inputs or outputs between devices.
+        preload_module_classes (`List[str]`, *optional*):
+            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
+            of the forward. This should only be used for classes that have submodules which are registered but not
+            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
+            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
+        force_hooks (`bool`, *optional*, defaults to `False`):
+            Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
+            single device.
+    """
+    # Error early if the device map is incomplete.
+    check_device_map(model, device_map)
+
+    # We need to force hooks for quantized models that can't be moved with to()
+    if getattr(model, "quantization_method", "bitsandbytes") == "bitsandbytes":
+        # since bnb 0.43.2, we can move 4-bit models
+        if getattr(model, "is_loaded_in_8bit", False) or (
+            getattr(model, "is_loaded_in_4bit", False) and not is_bnb_available(min_version="0.43.2")
+        ):
+            force_hooks = True
+
+    # We attach hooks if the device_map has at least 2 different devices or if
+    # force_hooks is set to `True`. Otherwise, the model is already loaded
+    # on the unique device and the user can decide where to dispatch the model.
+    # If the model is quantized, we always force-dispatch the model.
+    if (len(set(device_map.values())) > 1) or force_hooks:
+        if main_device is None:
+            if set(device_map.values()) == {"cpu"} or set(device_map.values()) == {"cpu", "disk"}:
+                main_device = "cpu"
+            else:
+                main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0]
+
+        if main_device != "cpu":
+            cpu_modules = [name for name, device in device_map.items() if device == "cpu"]
+            if state_dict is None and len(cpu_modules) > 0:
+                state_dict = extract_submodules_state_dict(model.state_dict(), cpu_modules)
+
+        disk_modules = [name for name, device in device_map.items() if device == "disk"]
+        if offload_dir is None and offload_index is None and len(disk_modules) > 0:
+            raise ValueError(
+                "We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules "
+                f"need to be offloaded: {', '.join(disk_modules)}."
+            )
+        if (
+            len(disk_modules) > 0
+            and offload_index is None
+            and (not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")))
+        ):
+            disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules)
+            offload_state_dict(offload_dir, disk_state_dict)
+
+        execution_device = {
+            name: main_device if device in ["cpu", "disk"] else device for name, device in device_map.items()
+        }
+        execution_device[""] = main_device
+        offloaded_devices = ["disk"] if main_device == "cpu" or main_device == "mps" else ["cpu", "disk"]
+        offload = {name: device in offloaded_devices for name, device in device_map.items()}
+        save_folder = offload_dir if len(disk_modules) > 0 else None
+        if state_dict is not None or save_folder is not None or offload_index is not None:
+            device = main_device if offload_index is not None else None
+            weights_map = OffloadedWeightsLoader(
+                state_dict=state_dict, save_folder=save_folder, index=offload_index, device=device
+            )
+        else:
+            weights_map = None
+
+        # When dispatching the model's parameters to the devices specified in device_map, we want to avoid allocating memory several times for the
+        # tied parameters. The dictionary tied_params_map keeps track of the already allocated data for a given tied parameter (represented by its
+        # original pointer) on each device.
+        tied_params = find_tied_parameters(model)
+
+        tied_params_map = {}
+        for group in tied_params:
+            for param_name in group:
+                # data_ptr() is enough here, as `find_tied_parameters` finds tied params simply by comparing `param1 is param2`, so we don't need
+                # to care about views of tensors through storage_offset.
+                data_ptr = recursive_getattr(model, param_name).data_ptr()
+                tied_params_map[data_ptr] = {}
+
+        # Note: To handle the disk offloading case, we can not simply use weights_map[param_name].data_ptr() as the reference pointer,
+        # as we have no guarantee that safetensors' `file.get_tensor()` will always give the same pointer.
+
+        attach_align_device_hook_on_blocks(
+            model,
+            execution_device=execution_device,
+            offload=offload,
+            offload_buffers=offload_buffers,
+            weights_map=weights_map,
+            skip_keys=skip_keys,
+            preload_module_classes=preload_module_classes,
+            tied_params_map=tied_params_map,
+        )
+
+        # Warn if any parameters are on the meta device
+        offloaded_devices_str = " and ".join(
+            [device for device in set(device_map.values()) if device in ("cpu", "disk")]
+        )
+        if len(offloaded_devices_str) > 0:
+            logger.warning(
+                f"Some parameters are on the meta device because they were offloaded to the {offloaded_devices_str}."
+            )
+
+        # Attaching the hook may break tied weights, so we retie them
+        retie_parameters(model, tied_params)
+
+        # Add a warning to `.to()` and the device-placement methods
+        def add_warning(fn, model):
+            @wraps(fn)
+            def wrapper(*args, **kwargs):
+                warning_msg = "You shouldn't move a model that is dispatched using accelerate hooks."
+                if str(fn.__name__) == "to":
+                    to_device = torch._C._nn._parse_to(*args, **kwargs)[0]
+                    if to_device is not None:
+                        logger.warning(warning_msg)
+                else:
+                    logger.warning(warning_msg)
+                for param in model.parameters():
+                    if param.device == torch.device("meta"):
+                        raise RuntimeError("You can't move a model that has some modules offloaded to cpu or disk.")
+                return fn(*args, **kwargs)
+
+            return wrapper
+
+        # Make sure to update _accelerate_added_attributes in hooks.py if you add any hook
+        model.to = add_warning(model.to, model)
+        if is_npu_available():
+            model.npu = add_warning(model.npu, model)
+        elif is_mlu_available():
+            model.mlu = add_warning(model.mlu, model)
+        elif is_sdaa_available():
+            model.sdaa = add_warning(model.sdaa, model)
+        elif is_musa_available():
+            model.musa = add_warning(model.musa, model)
+        elif is_xpu_available():
+            model.xpu = add_warning(model.xpu, model)
+        else:
+            model.cuda = add_warning(model.cuda, model)
+
+        # Check if we are using multi-gpus with RTX 4000 series
+        use_multi_gpu = len([device for device in set(device_map.values()) if device not in ("cpu", "disk")]) > 1
+        if use_multi_gpu and not check_cuda_p2p_ib_support():
+            logger.warning(
+                "We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. "
+                "This can affect the multi-gpu inference when using accelerate device_map. "
+                "Please make sure to update your driver to the latest version which resolves this."
+            )
+    else:
+        device = list(device_map.values())[0]
+        # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
+        if is_npu_available() and isinstance(device, int):
+            device = f"npu:{device}"
+        elif is_mlu_available() and isinstance(device, int):
+            device = f"mlu:{device}"
+        elif is_sdaa_available() and isinstance(device, int):
+            device = f"sdaa:{device}"
+        elif is_musa_available() and isinstance(device, int):
+            device = f"musa:{device}"
+        if device != "disk":
+            model.to(device)
+        else:
+            raise ValueError(
+                "You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead."
+            )
+    # Convert OrderedDict back to dict for easier usage
+    model.hf_device_map = dict(device_map)
+    return model
+
+
+def load_checkpoint_and_dispatch(
+    model: nn.Module,
+    checkpoint: Union[str, os.PathLike],
+    device_map: Optional[Union[str, dict[str, Union[int, str, torch.device]]]] = None,
+    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
+    no_split_module_classes: Optional[list[str]] = None,
+    offload_folder: Optional[Union[str, os.PathLike]] = None,
+    offload_buffers: bool = False,
+    dtype: Optional[Union[str, torch.dtype]] = None,
+    offload_state_dict: Optional[bool] = None,
+    skip_keys: Optional[Union[str, list[str]]] = None,
+    preload_module_classes: Optional[list[str]] = None,
+    force_hooks: bool = False,
+    strict: bool = False,
+    full_state_dict: bool = True,
+    broadcast_from_rank0: bool = False,
+):
+    """
+    Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
+    loaded and adds the various hooks that will make this model run properly (even if split across devices).
+
+    Args:
+        model (`torch.nn.Module`): The model in which we want to load a checkpoint.
+        checkpoint (`str` or `os.PathLike`):
+            The folder checkpoint to load. It can be:
+            - a path to a file containing a whole model state dict
+            - a path to a `.json` file containing the index to a sharded checkpoint
+            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
+        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
+            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
+            name; once a given module name is inside, every submodule of it will be sent to the same device.
+
+            To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For more
+            information about each option see [here](../concept_guides/big_model_inference#designing-a-device-map).
+            Defaults to None, which means [`dispatch_model`] will not be called.
+        max_memory (`Dict`, *optional*):
+            A dictionary device identifier to maximum memory. Will default to the maximum memory available for each GPU
+            and the available CPU RAM if unset.
+        no_split_module_classes (`List[str]`, *optional*):
+            A list of layer class names that should never be split across devices (for instance any layer that has a
+            residual connection).
+        offload_folder (`str` or `os.PathLike`, *optional*):
+            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
+        offload_buffers (`bool`, *optional*, defaults to `False`):
+            In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
+            well as the parameters.
+        dtype (`str` or `torch.dtype`, *optional*):
+            If provided, the weights will be converted to that type when loaded.
+        offload_state_dict (`bool`, *optional*):
+            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
+            the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
+            picked contains `"disk"` values.
+        skip_keys (`str` or `List[str]`, *optional*):
+            A list of keys to ignore when moving inputs or outputs between devices.
+        preload_module_classes (`List[str]`, *optional*):
+            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
+            of the forward. This should only be used for classes that have submodules which are registered but not
+            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
+            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
+        force_hooks (`bool`, *optional*, defaults to `False`):
+            Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
+            single device.
+        strict (`bool`, *optional*, defaults to `False`):
+            Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's
+            state_dict.
+        full_state_dict (`bool`, *optional*, defaults to `True`): if this is set to `True`, all the tensors in the
+            loaded state_dict will be gathered. No ShardedTensor and DTensor will be in the loaded state_dict.
+        broadcast_from_rank0 (`bool`, *optional*, defaults to `False`): when the option is `True`, a distributed
+            `ProcessGroup` must be initialized. rank0 should receive a full state_dict and will broadcast the tensors
+            in the state_dict one by one to other ranks. Other ranks will receive the tensors and shard (if applicable)
+            according to the local shards in the model.
+
+    Example:
+
+    ```python
+    >>> from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+    >>> from huggingface_hub import hf_hub_download
+    >>> from transformers import AutoConfig, AutoModelForCausalLM
+
+    >>> # Download the Weights
+    >>> checkpoint = "EleutherAI/gpt-j-6B"
+    >>> weights_location = hf_hub_download(checkpoint, "pytorch_model.bin")
+
+    >>> # Create a model and initialize it with empty weights
+    >>> config = AutoConfig.from_pretrained(checkpoint)
+    >>> with init_empty_weights():
+    ...     model = AutoModelForCausalLM.from_config(config)
+
+    >>> # Load the checkpoint and dispatch it to the right devices
+    >>> model = load_checkpoint_and_dispatch(
+    ...     model, weights_location, device_map="auto", no_split_module_classes=["GPTJBlock"]
+    ... )
+    ```
+    """
+    if isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
+        raise ValueError(
+            "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or 'sequential'."
+        )
+    if isinstance(device_map, str):
+        if device_map != "sequential":
+            max_memory = get_balanced_memory(
+                model,
+                max_memory=max_memory,
+                no_split_module_classes=no_split_module_classes,
+                dtype=dtype,
+                low_zero=(device_map == "balanced_low_0"),
+            )
+        device_map = infer_auto_device_map(
+            model,
+            max_memory=max_memory,
+            no_split_module_classes=no_split_module_classes,
+            dtype=dtype,
+            offload_buffers=offload_buffers,
+        )
+    if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
+        offload_state_dict = True
+    load_checkpoint_in_model(
+        model,
+        checkpoint,
+        device_map=device_map,
+        offload_folder=offload_folder,
+        dtype=dtype,
+        offload_state_dict=offload_state_dict,
+        offload_buffers=offload_buffers,
+        strict=strict,
+        full_state_dict=full_state_dict,
+        broadcast_from_rank0=broadcast_from_rank0,
+    )
+    if device_map is None:
+        return model
+    return dispatch_model(
+        model,
+        device_map=device_map,
+        offload_dir=offload_folder,
+        offload_buffers=offload_buffers,
+        skip_keys=skip_keys,
+        preload_module_classes=preload_module_classes,
+        force_hooks=force_hooks,
+    )
+
+
+def attach_layerwise_casting_hooks(
+    module: torch.nn.Module,
+    storage_dtype: torch.dtype,
+    compute_dtype: torch.dtype,
+    skip_modules_pattern: Union[str, tuple[str, ...]] = None,
+    skip_modules_classes: Optional[tuple[type[torch.nn.Module], ...]] = None,
+    non_blocking: bool = False,
+) -> None:
+    r"""
+    Applies layerwise casting to a given module. The module expected here is a PyTorch `nn.Module`. This is helpful for
+    reducing memory requirements when one doesn't want to fully quantize a model. Model params can be kept in, say,
+    `torch.float8_e4m3fn`, upcast to a higher precision like `torch.bfloat16` during the forward pass, and downcast
+    back to `torch.float8_e4m3fn` afterward to realize memory savings.
+
+    Args:
+        module (`torch.nn.Module`):
+            The module whose leaf modules will be cast to a high precision dtype for computation, and to a low
+            precision dtype for storage.
+        storage_dtype (`torch.dtype`):
+            The dtype to cast the module to before/after the forward pass for storage.
+        compute_dtype (`torch.dtype`):
+            The dtype to cast the module to during the forward pass for computation.
+        skip_modules_pattern (`tuple[str, ...]`, defaults to `None`):
+            A list of patterns to match the names of the modules to skip during the layerwise casting process. If set
+            to `None` alongside `skip_modules_classes` being `None`, the layerwise casting is applied directly to the
+            module instead of its internal submodules.
+        skip_modules_classes (`tuple[type[torch.nn.Module], ...]`, defaults to `None`):
+            A list of module classes to skip during the layerwise casting process.
+        non_blocking (`bool`, defaults to `False`):
+            If `True`, the weight casting operations are non-blocking.
+
+    Example:
+
+    ```python
+    >>> from accelerate.hooks import attach_layerwise_casting_hooks
+    >>> from transformers import AutoModelForCausalLM
+    >>> import torch
+
+    >>> # Model
+    >>> checkpoint = "EleutherAI/gpt-j-6B"
+    >>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
+
+    >>> # Attach hooks and perform inference
+    >>> attach_layerwise_casting_hooks(model, storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)
+    >>> with torch.no_grad():
+    ...     model(...)
+    ```
+
+    Users can also pass modules they want to avoid from getting downcast.
+
+    ```py
+    >>> attach_layerwise_casting_hooks(
+    ...     model, storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16, skip_modules_pattern=["norm"]
+    ... )
+    ```
+    """
+    _attach_layerwise_casting_hooks(
+        module, storage_dtype, compute_dtype, skip_modules_pattern, skip_modules_classes, non_blocking
+    )
+
+
+def _attach_layerwise_casting_hooks(
+    module: torch.nn.Module,
+    storage_dtype: torch.dtype,
+    compute_dtype: torch.dtype,
+    skip_modules_pattern: Union[str, tuple[str, ...]] = None,
+    skip_modules_classes: Optional[tuple[type[torch.nn.Module], ...]] = None,
+    non_blocking: bool = False,
+    _prefix: str = "",
+):
+    should_skip = (skip_modules_classes is not None and isinstance(module, skip_modules_classes)) or (
+        skip_modules_pattern is not None and any(re.search(pattern, _prefix) for pattern in skip_modules_pattern)
+    )
+    if should_skip:
+        logger.debug(f'Skipping layerwise casting for layer "{_prefix}"')
+        return
+
+    if isinstance(module, SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING):
+        logger.debug(f'Applying layerwise casting to layer "{_prefix}"')
+        add_hook_to_module(
+            module,
+            LayerwiseCastingHook(storage_dtype=storage_dtype, compute_dtype=compute_dtype, non_blocking=non_blocking),
+            append=True,
+        )
+        return
+
+    for name, submodule in module.named_children():
+        layer_name = f"{_prefix}.{name}" if _prefix else name
+        _attach_layerwise_casting_hooks(
+            submodule,
+            storage_dtype,
+            compute_dtype,
+            skip_modules_pattern,
+            skip_modules_classes,
+            non_blocking,
+            _prefix=layer_name,
+        )
+
+
+def _attach_context_parallel_hooks(
+    model: nn.Module,
+):
+    """
+    Monkeypatches a Hugging Face `transformers` model to fix attention mask issues when using context parallelism.
+
+    This function attaches forward_pre_hooks to each `self_attn` module of the model. Each hook inspects the call's
+    kwargs; if they contain an attention mask, the hook removes the mask and adds the kwarg `is_causal=True`, because
+    context parallelism does not support attention masks. This function modifies the model in place.
+
+    Args:
+        model (`nn.Module`):
+            The model to attach the hooks to.
+
+    """
+
+    def _self_attn_pre_forward_hook(_module, module_args, module_kwargs):
+        if "attention_mask" in module_kwargs:
+            module_kwargs["attention_mask"] = None
+            module_kwargs["is_causal"] = True
+
+        return module_args, module_kwargs
+
+    for name, module in model.named_modules():
+        # We assume that if the user brings their own model (without the structure `transformers` uses),
+        # they have read the docs saying attention masks can't be passed in. Then these cases can happen:
+        # 1) some modules end with a `self_attn` module, in which case we attach the hook, but
+        #    no attention mask kwarg is passed -> hook is a no-op
+        # 2) some modules end with a `self_attn` module, in which case we attach the hook, and the
+        #    attention mask kwarg is passed -> hook will remove the attention mask and add the
+        #    `is_causal=True` kwarg, which either crashes the training or fixes it
+        #    (training would crash anyway as attention masks aren't supported)
+        # 3) no modules end with a `self_attn` module, in which case we don't attach the hook; this is
+        #    a no-op as well
+        if name.endswith("self_attn"):
+            # We want the hook to be executed first, to avoid any other hooks doing work on the attention mask
+            module.register_forward_pre_hook(_self_attn_pre_forward_hook, with_kwargs=True, prepend=True)
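
For orientation, here is a minimal sketch of how the entry points above compose; it is editorial and not part of the committed file, and the model class and checkpoint folder are illustrative assumptions.

```python
# Sketch: big-model-inference flow built from init_empty_weights +
# load_checkpoint_and_dispatch. `TinyModel` and "checkpoint_dir" are
# hypothetical; any folder holding matching weights (plus index) would do.
import torch
import torch.nn as nn

from accelerate import init_empty_weights, load_checkpoint_and_dispatch


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(8)])

    def forward(self, x):
        return self.blocks(x)


# 1. Build the skeleton on the meta device: no memory is allocated for weights.
with init_empty_weights():
    model = TinyModel()

# 2. Stream the real weights in and let Accelerate place each submodule;
#    "auto" fills GPU(s) first, then CPU, then disk, attaching hooks as needed.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint="checkpoint_dir",  # hypothetical folder with saved weights
    device_map="auto",
    no_split_module_classes=["Linear"],
)

# The AlignDevicesHook machinery now moves inputs/outputs between devices.
out = model(torch.randn(2, 1024))
```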
pythonProject/.venv/Lib/site-packages/accelerate/checkpointing.py ADDED
@@ -0,0 +1,330 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+from safetensors.torch import load_model
+
+from .utils import (
+    MODEL_NAME,
+    OPTIMIZER_NAME,
+    RNG_STATE_NAME,
+    SAFE_MODEL_NAME,
+    SAFE_WEIGHTS_NAME,
+    SAMPLER_NAME,
+    SCALER_NAME,
+    SCHEDULER_NAME,
+    WEIGHTS_NAME,
+    get_pretty_name,
+    is_cuda_available,
+    is_hpu_available,
+    is_mlu_available,
+    is_musa_available,
+    is_sdaa_available,
+    is_torch_version,
+    is_torch_xla_available,
+    is_xpu_available,
+    load,
+    save,
+)
+
+
+if is_torch_version(">=", "2.4.0"):
+    from torch.amp import GradScaler
+else:
+    from torch.cuda.amp import GradScaler
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+from .logging import get_logger
+from .state import PartialState
+
+
+logger = get_logger(__name__)
+
+
+def save_accelerator_state(
+    output_dir: str,
+    model_states: list[dict],
+    optimizers: list,
+    schedulers: list,
+    dataloaders: list,
+    process_index: int,
+    step: int,
+    scaler: GradScaler = None,
+    save_on_each_node: bool = False,
+    safe_serialization: bool = True,
+):
+    """
+    Saves the current states of the models, optimizers, scaler, and RNG generators to a given directory.
+
+    <Tip>
+
+    If `safe_serialization` is `True`, models will be saved with `safetensors` while the rest are saved using native
+    `pickle`.
+
+    </Tip>
+
+    Args:
+        output_dir (`str` or `os.PathLike`):
+            The name of the folder to save all relevant weights and states.
+        model_states (`List[torch.nn.Module]`):
+            A list of model states
+        optimizers (`List[torch.optim.Optimizer]`):
+            A list of optimizer instances
+        schedulers (`List[torch.optim.lr_scheduler._LRScheduler]`):
+            A list of learning rate schedulers
+        dataloaders (`List[torch.utils.data.DataLoader]`):
+            A list of dataloader instances to save their sampler states
+        process_index (`int`):
+            The current process index in the Accelerator state
+        step (`int`):
+            The current step in the internal step tracker
+        scaler (`torch.amp.GradScaler`, *optional*):
+            An optional gradient scaler instance to save
+        save_on_each_node (`bool`, *optional*):
+            Whether to save on every node, or only the main node.
+        safe_serialization (`bool`, *optional*, defaults to `True`):
+            Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+    """
+    output_dir = Path(output_dir)
+    # Model states
+    for i, state in enumerate(model_states):
+        weights_name = WEIGHTS_NAME if not safe_serialization else SAFE_WEIGHTS_NAME
+        if i > 0:
+            weights_name = weights_name.replace(".", f"_{i}.")
+        output_model_file = output_dir.joinpath(weights_name)
+        save(state, output_model_file, save_on_each_node=save_on_each_node, safe_serialization=safe_serialization)
+        logger.info(f"Model weights saved in {output_model_file}")
+    # Optimizer states
+    for i, opt in enumerate(optimizers):
+        state = opt.state_dict()
+        optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
+        output_optimizer_file = output_dir.joinpath(optimizer_name)
+        save(state, output_optimizer_file, save_on_each_node=save_on_each_node, safe_serialization=False)
+        logger.info(f"Optimizer state saved in {output_optimizer_file}")
+    # Scheduler states
+    for i, scheduler in enumerate(schedulers):
+        state = scheduler.state_dict()
+        scheduler_name = f"{SCHEDULER_NAME}.bin" if i == 0 else f"{SCHEDULER_NAME}_{i}.bin"
+        output_scheduler_file = output_dir.joinpath(scheduler_name)
+        save(state, output_scheduler_file, save_on_each_node=save_on_each_node, safe_serialization=False)
+        logger.info(f"Scheduler state saved in {output_scheduler_file}")
+    # DataLoader states
+    for i, dataloader in enumerate(dataloaders):
+        sampler_name = f"{SAMPLER_NAME}.bin" if i == 0 else f"{SAMPLER_NAME}_{i}.bin"
+        output_sampler_file = output_dir.joinpath(sampler_name)
+        # Only save if we have our custom sampler
+        from .data_loader import IterableDatasetShard, SeedableRandomSampler
+
+        if isinstance(dataloader.dataset, IterableDatasetShard):
+            sampler = dataloader.get_sampler()
+            if isinstance(sampler, SeedableRandomSampler):
+                save(sampler, output_sampler_file, save_on_each_node=save_on_each_node, safe_serialization=False)
+        if getattr(dataloader, "use_stateful_dataloader", False):
+            dataloader_state_dict_name = "dl_state_dict.bin" if i == 0 else f"dl_state_dict_{i}.bin"
+            output_dataloader_state_dict_file = output_dir.joinpath(dataloader_state_dict_name)
+            state_dict = dataloader.state_dict()
+            torch.save(state_dict, output_dataloader_state_dict_file)
+        logger.info(f"Sampler state for dataloader {i} saved in {output_sampler_file}")
+
+    # GradScaler state
+    if scaler is not None:
+        state = scaler.state_dict()
+        output_scaler_file = output_dir.joinpath(SCALER_NAME)
+        torch.save(state, output_scaler_file)
+        logger.info(f"Gradient scaler state saved in {output_scaler_file}")
+    # Random number generator states
+    states = {}
+    states_name = f"{RNG_STATE_NAME}_{process_index}.pkl"
+    states["step"] = step
+    states["random_state"] = random.getstate()
+    states["numpy_random_seed"] = np.random.get_state()
+    states["torch_manual_seed"] = torch.get_rng_state()
+    if is_xpu_available():
+        states["torch_xpu_manual_seed"] = torch.xpu.get_rng_state_all()
+    if is_mlu_available():
+        states["torch_mlu_manual_seed"] = torch.mlu.get_rng_state_all()
+    elif is_sdaa_available():
+        states["torch_sdaa_manual_seed"] = torch.sdaa.get_rng_state_all()
+    elif is_musa_available():
+        states["torch_musa_manual_seed"] = torch.musa.get_rng_state_all()
+    if is_hpu_available():
+        states["torch_hpu_manual_seed"] = torch.hpu.get_rng_state_all()
+    if is_cuda_available():
+        states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
+    if is_torch_xla_available():
+        states["xm_seed"] = xm.get_rng_state()
+    output_states_file = output_dir.joinpath(states_name)
+    torch.save(states, output_states_file)
+    logger.info(f"Random states saved in {output_states_file}")
+    return output_dir
+
+
+def load_accelerator_state(
+    input_dir,
+    models,
+    optimizers,
+    schedulers,
+    dataloaders,
+    process_index,
+    scaler=None,
+    map_location=None,
+    load_kwargs=None,
+    **load_model_func_kwargs,
+):
+    """
+    Loads states of the models, optimizers, scaler, and RNG generators from a given directory.
+
+    Args:
+        input_dir (`str` or `os.PathLike`):
+            The name of the folder to load all relevant weights and states.
+        models (`List[torch.nn.Module]`):
+            A list of model instances
+        optimizers (`List[torch.optim.Optimizer]`):
+            A list of optimizer instances
+        schedulers (`List[torch.optim.lr_scheduler._LRScheduler]`):
+            A list of learning rate schedulers
+        process_index (`int`):
+            The current process index in the Accelerator state
+        scaler (`torch.amp.GradScaler`, *optional*):
+            An optional *GradScaler* instance to load
+        map_location (`str`, *optional*):
+            What device to load the optimizer state onto. Should be one of either "cpu" or "on_device".
+        load_kwargs (`dict`, *optional*):
+            Additional arguments that can be passed to the `load` function.
+        load_model_func_kwargs (`dict`, *optional*):
+            Additional arguments that can be passed to the model's `load_state_dict` method.
+
+    Returns:
+        `dict`: Contains the `Accelerator` attributes to override while loading the state.
+    """
+    # stores the `Accelerator` attributes to override
+    override_attributes = dict()
+    if map_location not in [None, "cpu", "on_device"]:
+        raise TypeError(
+            "Unsupported optimizer map location passed, please choose one of `None`, `'cpu'`, or `'on_device'`"
+        )
+    if map_location is None:
+        map_location = "cpu"
+    elif map_location == "on_device":
+        map_location = PartialState().device
+
+    if load_kwargs is None:
+        load_kwargs = {}
+
+    input_dir = Path(input_dir)
+    # Model states
+    for i, model in enumerate(models):
+        ending = f"_{i}" if i > 0 else ""
+        input_model_file = input_dir.joinpath(f"{SAFE_MODEL_NAME}{ending}.safetensors")
+        if input_model_file.exists():
+            load_model(model, input_model_file, device=str(map_location), **load_model_func_kwargs)
+        else:
+            # Load with torch
+            input_model_file = input_dir.joinpath(f"{MODEL_NAME}{ending}.bin")
+            state_dict = load(input_model_file, map_location=map_location)
+            model.load_state_dict(state_dict, **load_model_func_kwargs)
+    logger.info("All model weights loaded successfully")
+
+    # Optimizer states
+    for i, opt in enumerate(optimizers):
+        optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
+        input_optimizer_file = input_dir.joinpath(optimizer_name)
+        optimizer_state = load(input_optimizer_file, map_location=map_location, **load_kwargs)
+        optimizers[i].load_state_dict(optimizer_state)
+    logger.info("All optimizer states loaded successfully")
+
+    # Scheduler states
+    for i, scheduler in enumerate(schedulers):
+        scheduler_name = f"{SCHEDULER_NAME}.bin" if i == 0 else f"{SCHEDULER_NAME}_{i}.bin"
+        input_scheduler_file = input_dir.joinpath(scheduler_name)
+        scheduler_state = load(input_scheduler_file, **load_kwargs)
+        scheduler.load_state_dict(scheduler_state)
+    logger.info("All scheduler states loaded successfully")
+
+    for i, dataloader in enumerate(dataloaders):
+        sampler_name = f"{SAMPLER_NAME}.bin" if i == 0 else f"{SAMPLER_NAME}_{i}.bin"
+        input_sampler_file = input_dir.joinpath(sampler_name)
+        # Only load if we have our custom sampler
+        from .data_loader import IterableDatasetShard, SeedableRandomSampler
+
+        if isinstance(dataloader.dataset, IterableDatasetShard):
+            sampler = dataloader.get_sampler()
+            if isinstance(sampler, SeedableRandomSampler):
+                sampler = dataloader.set_sampler(load(input_sampler_file))
+        if getattr(dataloader, "use_stateful_dataloader", False):
+            dataloader_state_dict_name = "dl_state_dict.bin" if i == 0 else f"dl_state_dict_{i}.bin"
+            input_dataloader_state_dict_file = input_dir.joinpath(dataloader_state_dict_name)
+            if input_dataloader_state_dict_file.exists():
+                state_dict = load(input_dataloader_state_dict_file, **load_kwargs)
+                dataloader.load_state_dict(state_dict)
+    logger.info("All dataloader sampler states loaded successfully")
+
+    # GradScaler state
+    if scaler is not None:
+        input_scaler_file = input_dir.joinpath(SCALER_NAME)
+        scaler_state = load(input_scaler_file)
+        scaler.load_state_dict(scaler_state)
+        logger.info("GradScaler state loaded successfully")
+
+    # Random states
+    try:
+        states = load(input_dir.joinpath(f"{RNG_STATE_NAME}_{process_index}.pkl"))
+        if "step" in states:
+            override_attributes["step"] = states["step"]
+        random.setstate(states["random_state"])
+        np.random.set_state(states["numpy_random_seed"])
+        torch.set_rng_state(states["torch_manual_seed"])
+        if is_xpu_available():
+            torch.xpu.set_rng_state_all(states["torch_xpu_manual_seed"])
+        if is_mlu_available():
+            torch.mlu.set_rng_state_all(states["torch_mlu_manual_seed"])
+        elif is_sdaa_available():
+            torch.sdaa.set_rng_state_all(states["torch_sdaa_manual_seed"])
+        elif is_musa_available():
+            torch.musa.set_rng_state_all(states["torch_musa_manual_seed"])
+        else:
+            torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
+        if is_torch_xla_available():
+            xm.set_rng_state(states["xm_seed"])
+        logger.info("All random states loaded successfully")
+    except Exception:
+        logger.info("Could not load random states")
+
+    return override_attributes
+
+
+def save_custom_state(obj, path, index: int = 0, save_on_each_node: bool = False):
+    """
+    Saves the state of `obj` to `{path}/custom_checkpoint_{index}.pkl`
+    """
+    # Should this be the right way to get a qual_name type value from `obj`?
+    save_location = Path(path) / f"custom_checkpoint_{index}.pkl"
+    logger.info(f"Saving the state of {get_pretty_name(obj)} to {save_location}")
+    save(obj.state_dict(), save_location, save_on_each_node=save_on_each_node)
+
+
+def load_custom_state(obj, path, index: int = 0):
+    """
+    Loads the state of `obj` at `{path}/custom_checkpoint_{index}.pkl`. Will always set `weights_only=False` when
+    loading the state.
+    """
+    load_location = f"{path}/custom_checkpoint_{index}.pkl"
+    logger.info(f"Loading the state of {get_pretty_name(obj)} from {load_location}")
+    obj.load_state_dict(load(load_location, map_location="cpu", weights_only=False))
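
These helpers are normally reached through `Accelerator.save_state`/`load_state`, which wrap `save_accelerator_state` and `load_accelerator_state`. A minimal sketch follows (editorial, not part of the committed file; the checkpoint folder name is an assumption).

```python
# Sketch: round-tripping training state through the helpers above via the
# public Accelerator API. "ckpt_dir" is a hypothetical output folder.
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

# Writes model weights (safetensors by default), optimizer/scheduler state,
# and the per-process RNG states (random/numpy/torch plus backend states).
accelerator.save_state("ckpt_dir")

# Restores everything saved above, including the step counter override.
accelerator.load_state("ckpt_dir")
```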
pythonProject/.venv/Lib/site-packages/accelerate/commands/merge.py ADDED
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from accelerate.commands.utils import CustomArgumentParser
+from accelerate.utils import merge_fsdp_weights
+
+
+description = """Utility to merge the weights from multiple FSDP checkpoints into a single combined checkpoint. Should be used if
+`SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}`.
+
+This is a CPU-bound process and requires enough RAM to load the entire model state dict."""
+
+
+def merge_command(args):
+    merge_fsdp_weights(
+        args.checkpoint_directory, args.output_path, not args.unsafe_serialization, args.remove_checkpoint_dir
+    )
+
+
+def merge_command_parser(subparsers=None):
+    if subparsers is not None:
+        parser = subparsers.add_parser("merge-weights", description=description)
+    else:
+        parser = CustomArgumentParser(description=description)
+
+    parser.add_argument("checkpoint_directory", type=str, help="A directory containing sharded weights saved by FSDP.")
+    parser.add_argument(
+        "output_path",
+        type=str,
+        help="The path to save the merged weights. Defaults to the current directory.",
+    )
+    parser.add_argument(
+        "--unsafe_serialization",
+        action="store_true",
+        default=False,
+        help="Whether to save the merged weights as `.bin` rather than `.safetensors` (not recommended).",
+    )
+    parser.add_argument(
+        "--remove_checkpoint_dir",
+        action="store_true",
+        help="Whether to remove the checkpoint directory after merging.",
+        default=False,
+    )
+
+    if subparsers is not None:
+        parser.set_defaults(func=merge_command)
+    return parser
+
+
+def main():
+    parser = merge_command_parser()
+    args = parser.parse_args()
+    merge_command(args)
+
+
+if __name__ == "__main__":
+    main()
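
The CLI above is a thin wrapper around `merge_fsdp_weights`; here is a sketch of the equivalent programmatic call (editorial, with illustrative directory names).

```python
# Sketch: merging SHARDED_STATE_DICT FSDP shards without going through the
# `accelerate merge-weights` CLI. Both paths are hypothetical.
from accelerate.utils import merge_fsdp_weights

merge_fsdp_weights(
    "ckpt/pytorch_model_fsdp_0",  # directory containing the FSDP shards
    "ckpt/merged",                # where the combined weights are written
    safe_serialization=True,      # .safetensors output (the CLI default)
    remove_checkpoint_dir=False,  # keep the sharded checkpoint around
)
```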
pythonProject/.venv/Lib/site-packages/accelerate/commands/test.py ADDED
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import argparse
18
+
19
+ from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package
20
+
21
+
22
+ def test_command_parser(subparsers=None):
23
+ if subparsers is not None:
24
+ parser = subparsers.add_parser("test")
25
+ else:
26
+ parser = argparse.ArgumentParser("Accelerate test command")
27
+
28
+ parser.add_argument(
29
+ "--config_file",
30
+ default=None,
31
+ help=(
32
+ "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
33
+ "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
34
+ "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
35
+ "with 'huggingface'."
36
+ ),
37
+ )
38
+
39
+ if subparsers is not None:
40
+ parser.set_defaults(func=test_command)
41
+ return parser
42
+
43
+
44
+ def test_command(args):
45
+ script_name = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
46
+
47
+ if args.config_file is None:
48
+ test_args = [script_name]
49
+ else:
50
+ test_args = f"--config_file={args.config_file} {script_name}".split()
51
+
52
+ cmd = ["accelerate-launch"] + test_args
53
+ result = execute_subprocess_async(cmd)
54
+ if result.returncode == 0:
55
+ print("Test is a success! You are ready for your distributed training!")
56
+
57
+
58
+ def main():
59
+ parser = test_command_parser()
60
+ args = parser.parse_args()
61
+ test_command(args)
62
+
63
+
64
+ if __name__ == "__main__":
65
+ main()
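
In practice the command boils down to launching the bundled test script through `accelerate-launch`; a rough stand-alone equivalent (no config file) might look like this:

import subprocess

from accelerate.test_utils import path_in_accelerate_package

# Same script the command resolves, launched the same way.
script = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
subprocess.run(["accelerate-launch", script], check=True)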
pythonProject/.venv/Lib/site-packages/accelerate/data_loader.py ADDED
@@ -0,0 +1,1451 @@
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import importlib
+ import math
+ from contextlib import suppress
+ from typing import Callable, Optional, Union
+
+ import torch
+ from packaging import version
+ from torch.utils.data import BatchSampler, DataLoader, IterableDataset, RandomSampler
+
+ from .logging import get_logger
+ from .state import DistributedType, GradientState, PartialState, is_torch_xla_available
+ from .utils import (
+     RNGType,
+     broadcast,
+     broadcast_object_list,
+     compare_versions,
+     concatenate,
+     find_batch_size,
+     get_data_structure,
+     initialize_tensors,
+     is_datasets_available,
+     is_torch_version,
+     is_torchdata_stateful_dataloader_available,
+     send_to_device,
+     slice_tensors,
+     synchronize_rng_states,
+ )
+
+
+ logger = get_logger(__name__)
+
+ # kwargs of the DataLoader in the minimum supported version, PyTorch 2.0
+ _PYTORCH_DATALOADER_KWARGS = {
+     "batch_size": 1,
+     "shuffle": False,
+     "sampler": None,
+     "batch_sampler": None,
+     "num_workers": 0,
+     "collate_fn": None,
+     "pin_memory": False,
+     "drop_last": False,
+     "timeout": 0,
+     "worker_init_fn": None,
+     "multiprocessing_context": None,
+     "generator": None,
+     "prefetch_factor": 2,
+     "persistent_workers": False,
+     "pin_memory_device": "",
+ }
+
+ # kwargs added in later releases, keyed by the PyTorch version that introduced them
+ _PYTORCH_DATALOADER_ADDITIONAL_KWARGS = {"2.6.0": {"in_order": True}}
+
+ for v, additional_kwargs in _PYTORCH_DATALOADER_ADDITIONAL_KWARGS.items():
+     if is_torch_version(">=", v):
+         _PYTORCH_DATALOADER_KWARGS.update(additional_kwargs)
+
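
The loop above feature-detects which `DataLoader` kwargs the installed torch actually supports; a minimal sketch of the same pattern, using `packaging` directly instead of the `is_torch_version` helper:

from packaging import version
import torch

# Forward a kwarg only when the installed DataLoader accepts it
# (`in_order` appeared in torch 2.6); local build suffixes are stripped first.
dl_kwargs = {"num_workers": 2}
torch_version = version.parse(torch.__version__.split("+")[0])
if torch_version >= version.parse("2.6.0"):
    dl_kwargs["in_order"] = True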
+
+ class SeedableRandomSampler(RandomSampler):
+     """
+     Same as a random sampler, except that in `__iter__` a seed can be used.
+
+     Needed specifically in distributed cases, when the random generator for each GPU needs to start from the same seed
+     and be fully reproducible across multiple iterations.
+
+     If a custom `generator` is passed, it will rely on its initial seed as well as the current iteration it is on
+     (stored in `self.epoch`).
+     """
+
+     def __init__(self, *args, **kwargs):
+         data_seed = kwargs.pop("data_seed", None)
+         super().__init__(*args, **kwargs)
+
+         self.initial_seed = data_seed if data_seed is not None else torch.random.initial_seed()
+         self.epoch = 0
+
+     def __iter__(self):
+         if self.generator is None:
+             self.generator = torch.Generator(
+                 device=torch.get_default_device() if hasattr(torch, "get_default_device") else "cpu"
+             )
+             self.generator.manual_seed(self.initial_seed)
+
+         # Allow `self.epoch` to modify the seed of the generator
+         seed = self.epoch + self.initial_seed
+         # print("Setting seed at epoch", self.epoch, seed)
+         self.generator.manual_seed(seed)
+         yield from super().__iter__()
+         self.set_epoch(self.epoch + 1)
+
+     def set_epoch(self, epoch: int):
+         "Sets the current iteration of the sampler."
+         self.epoch = epoch
+
+
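A small reproducibility check of the sampler above (toy dataset, arbitrary seed):

import torch
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.arange(8))
sampler = SeedableRandomSampler(dataset, data_seed=42)

order_epoch0 = list(sampler)           # iterating advances sampler.epoch to 1
sampler.set_epoch(0)                   # rewind to epoch 0
assert list(sampler) == order_epoch0   # same seed + epoch -> same permutation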
+ class BatchSamplerShard(BatchSampler):
+     """
+     Wraps a PyTorch `BatchSampler` to generate batches for one of the processes only. Instances of this class will
+     always yield a number of batches that is a round multiple of `num_processes` and that all have the same size.
+     Depending on the value of the `drop_last` attribute of the batch sampler passed, it will either stop the iteration
+     at the first batch that would be too small / not present on all processes or loop with indices from the beginning.
+
+     Args:
+         batch_sampler (`torch.utils.data.sampler.BatchSampler`):
+             The batch sampler to split in several shards.
+         num_processes (`int`, *optional*, defaults to 1):
+             The number of processes running concurrently.
+         process_index (`int`, *optional*, defaults to 0):
+             The index of the current process.
+         split_batches (`bool`, *optional*, defaults to `False`):
+             Whether the shards should be created by splitting a batch to give a piece of it on each process, or by
+             yielding different full batches on each process.
+
+             On two processes with a sampler of `[[0, 1, 2, 3], [4, 5, 6, 7]]`, this will result in:
+
+             - the sampler on process 0 yielding `[0, 1, 2, 3]` and the sampler on process 1 yielding `[4, 5, 6, 7]` if
+               this argument is set to `False`.
+             - the sampler on process 0 yielding `[0, 1]` then `[4, 5]` and the sampler on process 1 yielding `[2, 3]`
+               then `[6, 7]` if this argument is set to `True`.
+         even_batches (`bool`, *optional*, defaults to `True`):
+             Whether or not to loop back to the beginning of the sampler when the number of samples is not a round
+             multiple of (original batch size / number of processes).
+
+     <Tip warning={true}>
+
+     `BatchSampler`s with varying batch sizes are not enabled by default. To enable this behaviour, set `even_batches`
+     equal to `False`
+
+     </Tip>"""
+
+     def __init__(
+         self,
+         batch_sampler: BatchSampler,
+         num_processes: int = 1,
+         process_index: int = 0,
+         split_batches: bool = False,
+         even_batches: bool = True,
+     ):
+         if split_batches and batch_sampler.batch_size % num_processes != 0:
+             raise ValueError(
+                 f"To use `BatchSamplerShard` in `split_batches` mode, the batch size ({batch_sampler.batch_size}) "
+                 f"needs to be a round multiple of the number of processes ({num_processes})."
+             )
+         self.batch_sampler = batch_sampler
+         self.num_processes = num_processes
+         self.process_index = process_index
+         self.split_batches = split_batches
+         self.even_batches = even_batches
+         self.batch_size = getattr(batch_sampler, "batch_size", None)
+         self.drop_last = getattr(batch_sampler, "drop_last", False)
+         if self.batch_size is None and self.even_batches:
+             raise ValueError(
+                 "You need to use `even_batches=False` when the batch sampler has no batch size. If you "
+                 "are not calling this method directly, set `accelerator.even_batches=False` instead."
+             )
+
+     @property
+     def total_length(self):
+         return len(self.batch_sampler)
+
+     def __len__(self):
+         if self.split_batches:
+             # Split batches does not change the length of the batch sampler
+             return len(self.batch_sampler)
+         if len(self.batch_sampler) % self.num_processes == 0:
+             # If the length is a round multiple of the number of processes, it's easy.
+             return len(self.batch_sampler) // self.num_processes
+         length = len(self.batch_sampler) // self.num_processes
+         if self.drop_last:
+             # Same if we drop the remainder.
+             return length
+         elif self.even_batches:
+             # When we even out batches, we always get +1
+             return length + 1
+         else:
+             # Otherwise it depends on the process index.
+             return length + 1 if self.process_index < len(self.batch_sampler) % self.num_processes else length
+
+     def __iter__(self):
+         return self._iter_with_split() if self.split_batches else self._iter_with_no_split()
+
+     def _iter_with_split(self):
+         initial_data = []
+         batch_length = self.batch_sampler.batch_size // self.num_processes
+         for idx, batch in enumerate(self.batch_sampler):
+             if idx == 0:
+                 initial_data = batch
+             if len(batch) == self.batch_size:
+                 # If the batch is full, we yield the part of it this process is responsible for.
+                 yield batch[batch_length * self.process_index : batch_length * (self.process_index + 1)]
+
+         # If drop_last is True or the last batch was full, iteration is over, otherwise...
+         if not self.drop_last and len(initial_data) > 0 and len(batch) < self.batch_size:
+             if not self.even_batches:
+                 if len(batch) > batch_length * self.process_index:
+                     yield batch[batch_length * self.process_index : batch_length * (self.process_index + 1)]
+             else:
+                 # For degenerate cases where the dataset has less than num_process * batch_size samples
+                 while len(initial_data) < self.batch_size:
+                     initial_data += initial_data
+                 batch = batch + initial_data
+                 yield batch[batch_length * self.process_index : batch_length * (self.process_index + 1)]
+
+     def _iter_with_no_split(self):
+         initial_data = []
+         batch_to_yield = []
+         for idx, batch in enumerate(self.batch_sampler):
+             # We gather the initial indices in case we need to circle back at the end.
+             if not self.drop_last and idx < self.num_processes:
+                 initial_data += batch
+             # We identify the batch to yield but wait until we are sure every process gets a full batch before
+             # actually yielding it.
+             if idx % self.num_processes == self.process_index:
+                 batch_to_yield = batch
+             if idx % self.num_processes == self.num_processes - 1 and (
+                 self.batch_size is None or len(batch) == self.batch_size
+             ):
+                 yield batch_to_yield
+                 batch_to_yield = []
+
+         # If drop_last is True, iteration is over, otherwise...
+         if not self.drop_last and len(initial_data) > 0:
+             if not self.even_batches:
+                 if len(batch_to_yield) > 0:
+                     yield batch_to_yield
+             else:
+                 # ... we yield the complete batch we had saved before if it has the proper length
+                 if len(batch_to_yield) == self.batch_size:
+                     yield batch_to_yield
+
+                 # For degenerate cases where the dataset has less than num_process * batch_size samples
+                 while len(initial_data) < self.num_processes * self.batch_size:
+                     initial_data += initial_data
+
+                 # If the last batch seen was of the proper size, it has been yielded by its process so we move to the next
+                 if len(batch) == self.batch_size:
+                     batch = []
+                     idx += 1
+
+                 # Make sure we yield a multiple of self.num_processes batches
+                 cycle_index = 0
+                 while idx % self.num_processes != 0 or len(batch) > 0:
+                     end_index = cycle_index + self.batch_size - len(batch)
+                     batch += initial_data[cycle_index:end_index]
+                     if idx % self.num_processes == self.process_index:
+                         yield batch
+                     cycle_index = end_index
+                     batch = []
+                     idx += 1
+
+
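A small illustration of the two sharding modes on a toy sampler (8 samples, batch size 4, values chosen so there is no remainder):

from torch.utils.data import BatchSampler, SequentialSampler

base = BatchSampler(SequentialSampler(range(8)), batch_size=4, drop_last=False)

# split_batches=False: each process takes alternating full batches.
shard0 = BatchSamplerShard(base, num_processes=2, process_index=0)
shard1 = BatchSamplerShard(base, num_processes=2, process_index=1)
assert list(shard0) == [[0, 1, 2, 3]] and list(shard1) == [[4, 5, 6, 7]]

# split_batches=True: every process takes a slice of every batch.
shard0 = BatchSamplerShard(base, num_processes=2, process_index=0, split_batches=True)
assert list(shard0) == [[0, 1], [4, 5]]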
+ class IterableDatasetShard(IterableDataset):
+     """
+     Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will
+     always yield a number of samples that is a round multiple of the actual batch size (depending on the value of
+     `split_batches`, this is either `batch_size` or `batch_size x num_processes`). Depending on the value of the
+     `drop_last` attribute of the batch sampler passed, it will either stop the iteration at the first batch that would
+     be too small or loop with indices from the beginning.
+
+     Args:
+         dataset (`torch.utils.data.dataset.IterableDataset`):
+             The iterable dataset to split in several shards.
+         batch_size (`int`, *optional*, defaults to 1):
+             The size of the batches per shard (if `split_batches=False`) or the size of the batches (if
+             `split_batches=True`).
+         drop_last (`bool`, *optional*, defaults to `False`):
+             Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the
+             beginning.
+         num_processes (`int`, *optional*, defaults to 1):
+             The number of processes running concurrently.
+         process_index (`int`, *optional*, defaults to 0):
+             The index of the current process.
+         split_batches (`bool`, *optional*, defaults to `False`):
+             Whether the shards should be created by splitting a batch to give a piece of it on each process, or by
+             yielding different full batches on each process.
+
+             On two processes with an iterable dataset yielding `[0, 1, 2, 3, 4, 5, 6, 7]`, this will result in:
+
+             - the shard on process 0 yielding `[0, 1, 2, 3]` and the shard on process 1 yielding `[4, 5, 6, 7]` if this
+               argument is set to `False`.
+             - the shard on process 0 yielding `[0, 1, 4, 5]` and the shard on process 1 yielding `[2, 3, 6, 7]` if
+               this argument is set to `True`.
+     """
+
+     def __init__(
+         self,
+         dataset: IterableDataset,
+         batch_size: int = 1,
+         drop_last: bool = False,
+         num_processes: int = 1,
+         process_index: int = 0,
+         split_batches: bool = False,
+     ):
+         if split_batches and batch_size > 1 and batch_size % num_processes != 0:
+             raise ValueError(
+                 f"To use `IterableDatasetShard` in `split_batches` mode, the batch size ({batch_size}) "
+                 f"needs to be a round multiple of the number of processes ({num_processes})."
+             )
+         self.dataset = dataset
+         self.batch_size = batch_size
+         self.drop_last = drop_last
+         self.num_processes = num_processes
+         self.process_index = process_index
+         self.split_batches = split_batches
+
+     def set_epoch(self, epoch):
+         self.epoch = epoch
+         if hasattr(self.dataset, "set_epoch"):
+             self.dataset.set_epoch(epoch)
+
+     def __len__(self):
+         # We will just raise the downstream error if the underlying dataset is not sized
+         if self.drop_last:
+             return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
+         else:
+             return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size
+
+     def __iter__(self):
+         if (
+             not hasattr(self.dataset, "set_epoch")
+             and hasattr(self.dataset, "generator")
+             and isinstance(self.dataset.generator, torch.Generator)
+         ):
+             self.dataset.generator.manual_seed(self.epoch)
+         real_batch_size = self.batch_size if self.split_batches else (self.batch_size * self.num_processes)
+         process_batch_size = (self.batch_size // self.num_processes) if self.split_batches else self.batch_size
+         process_slice = range(self.process_index * process_batch_size, (self.process_index + 1) * process_batch_size)
+
+         first_batch = None
+         current_batch = []
+         for element in self.dataset:
+             current_batch.append(element)
+             # Wait to have a full batch before yielding elements.
+             if len(current_batch) == real_batch_size:
+                 for i in process_slice:
+                     yield current_batch[i]
+                 if first_batch is None:
+                     first_batch = current_batch.copy()
+                 current_batch = []
+
+         # Finished if drop_last is True, otherwise complete the last batch with elements from the beginning.
+         if not self.drop_last and len(current_batch) > 0:
+             if first_batch is None:
+                 first_batch = current_batch.copy()
+             while len(current_batch) < real_batch_size:
+                 current_batch += first_batch
+             for i in process_slice:
+                 yield current_batch[i]
+
+
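The analogous behaviour for iterable datasets, where sharding happens at the sample level after a full global batch has been accumulated:

from torch.utils.data import IterableDataset

class Range(IterableDataset):
    def __iter__(self):
        yield from range(8)

# Global batches of batch_size * num_processes = 4 are formed,
# then each process keeps its contiguous slice of batch_size = 2.
shard0 = IterableDatasetShard(Range(), batch_size=2, num_processes=2, process_index=0)
shard1 = IterableDatasetShard(Range(), batch_size=2, num_processes=2, process_index=1)
assert list(shard0) == [0, 1, 4, 5]
assert list(shard1) == [2, 3, 6, 7]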
+ class DataLoaderStateMixin:
+     """
+     Mixin class that adds a state to a `DataLoader` to keep track of the status inside the dataloader, such as the
+     end of the iteration, the number of items in the dataset in the last batch relative to the batch size, and other
+     useful information that might be needed.
+
+     **Available attributes:**
+
+         - **end_of_dataloader** (`bool`) -- Whether at the last iteration or batch
+         - **remainder** (`int`) -- The number of items that are remaining in the last batch, relative to the total
+           batch size
+
+     <Tip warning={true}>
+
+     Inheritors of this class should ensure that the class creates a `GradientState()` instance, stored in
+     `self.gradient_state`.
+
+     </Tip>
+
+     """
+
+     def __init_subclass__(cls, **kwargs):
+         cls.end_of_dataloader = False
+         cls.remainder = -1
+
+     def reset(self):
+         self.end_of_dataloader = False
+         self.remainder = -1
+
+     def begin(self):
+         "Prepares the gradient state for the current dataloader"
+         self.reset()
+         with suppress(Exception):
+             if not self._drop_last:
+                 length = getattr(self.dataset, "total_dataset_length", len(self.dataset))
+                 self.remainder = length % self.total_batch_size
+         self.gradient_state._add_dataloader(self)
+
+     def end(self):
+         "Cleans up the gradient state after exiting the dataloader"
+         self.gradient_state._remove_dataloader(self)
+
+
+ class DataLoaderAdapter:
+     """
+     A class which wraps around a PyTorch `DataLoader` (or variants of it) to be used with the `Accelerator`. For
+     compatibility reasons, this class presents itself as the class it wraps around, so it can be used as a drop-in.
+     """
+
+     def __init__(self, dataset, use_stateful_dataloader=False, batch_sampler=None, **kwargs):
+         self.use_stateful_dataloader = use_stateful_dataloader
+         if is_torchdata_stateful_dataloader_available():
+             from torchdata.stateful_dataloader import StatefulDataLoader
+
+         if use_stateful_dataloader and not is_torchdata_stateful_dataloader_available():
+             raise ImportError(
+                 "StatefulDataLoader is not available. Please install torchdata version 0.8.0 or higher to use it."
+             )
+         if use_stateful_dataloader:
+             torchdata_version = version.parse(importlib.metadata.version("torchdata"))
+             if (
+                 "in_order" in kwargs
+                 and compare_versions(torchdata_version, "<", "0.11")
+                 and is_torch_version(">=", "2.6.0")
+             ):
+                 kwargs.pop("in_order")
+             self.base_dataloader = StatefulDataLoader(dataset, batch_sampler=batch_sampler, **kwargs)
+         else:
+             self.base_dataloader = DataLoader(dataset, batch_sampler=batch_sampler, **kwargs)
+
+         if hasattr(self.base_dataloader, "state_dict"):
+             self.dl_state_dict = self.base_dataloader.state_dict()
+
+     def __getattr__(self, name):
+         # Avoid infinite recursion if we try to access a nonexistent base_dataloader attribute.
+         if name == "base_dataloader":
+             raise AttributeError()
+         # Delegate attribute access to the internal dataloader
+         return getattr(self.base_dataloader, name)
+
+     def state_dict(self):
+         return self.dl_state_dict
+
+     def load_state_dict(self, state_dict):
+         self.base_dataloader.load_state_dict(state_dict)
+
+     @property
+     def __class__(self):
+         """
+         In order to maintain backwards compatibility with other code, we need to ensure `isinstance(obj, DataLoader)`
+         returns true. This is because some downstream code assumes that the `DataLoader` is the base class of the
+         object.
+         """
+         return self.base_dataloader.__class__
+
+     def __len__(self):
+         return len(self.base_dataloader)
+
+     def adjust_state_dict_for_prefetch(self):
+         """
+         Adjusts the state dict for prefetching. Natively, this will adjust all of the iter-yielded keys in
+         `self.dl_state_dict` by a factor of `num_processes - 1`; however, if a custom correction is needed, this can
+         be overridden.
+
+         This should modify `self.dl_state_dict` directly
+         """
+         # The state dict will be off by a factor of `n-1` batches too many during DDP,
+         # so we need to adjust it here
+         if PartialState().distributed_type != DistributedType.NO:
+             factor = PartialState().num_processes - 1
+             if self.dl_state_dict["_sampler_iter_yielded"] > 0:
+                 self.dl_state_dict["_sampler_iter_yielded"] -= factor
+             if self.dl_state_dict["_num_yielded"] > 0:
+                 self.dl_state_dict["_num_yielded"] -= factor
+             if self.dl_state_dict["_index_sampler_state"] is not None:
+                 if (
+                     "samples_yielded" in self.dl_state_dict["_index_sampler_state"]
+                     and self.dl_state_dict["_index_sampler_state"]["samples_yielded"] > 0
+                 ):
+                     self.dl_state_dict["_index_sampler_state"]["samples_yielded"] -= self.batch_size * factor
+
+     def _update_state_dict(self):
+         # The state_dict of the underlying base_dataloader may be ahead of what is currently being yielded.
+         # E.g. the implementation of DataLoaderShard involves having an underlying iterator 1 element ahead of
+         # what it wants to yield.
+         #
+         # _update_state_dict is called to snapshot the state_dict that would properly recover the DataLoaderAdapter.
+         if hasattr(self.base_dataloader, "state_dict"):
+             self.dl_state_dict = self.base_dataloader.state_dict()
+             # Potentially modify the state_dict to adjust for prefetching
+             self.adjust_state_dict_for_prefetch()
+             # Then tag if we are at the end of the dataloader
+             self.dl_state_dict["_iterator_finished"] = self.end_of_dataloader
+
+
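The `__class__` property above is the key trick: Python's `isinstance` consults `obj.__class__`, not just `type(obj)`, so the adapter can pose as the loader it wraps. A condensed, self-contained sketch of the pattern:

import torch
from torch.utils.data import DataLoader, TensorDataset

class Wrapper:
    def __init__(self, inner):
        self.inner = inner

    def __getattr__(self, name):
        # Guard against recursion before `inner` exists, then delegate.
        if name == "inner":
            raise AttributeError(name)
        return getattr(self.inner, name)

    @property
    def __class__(self):
        # Make isinstance() report the wrapped object's type.
        return self.inner.__class__

wrapped = Wrapper(DataLoader(TensorDataset(torch.zeros(4))))
assert isinstance(wrapped, DataLoader)
assert wrapped.batch_size == 1  # attribute access is delegated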
+ class DataLoaderShard(DataLoaderAdapter, DataLoaderStateMixin):
+     """
+     Subclass of `DataLoaderAdapter` that will deal with device placement and current distributed setup.
+
+     Args:
+         dataset (`torch.utils.data.dataset.Dataset`):
+             The dataset to use to build this dataloader.
+         device (`torch.device`, *optional*):
+             If passed, the device to put all batches on.
+         rng_types (list of `str` or [`~utils.RNGType`]):
+             The list of random number generators to synchronize at the beginning of each iteration. Should be one or
+             several of:
+
+             - `"torch"`: the base torch random number generator
+             - `"cuda"`: the CUDA random number generator (GPU only)
+             - `"xla"`: the XLA random number generator (TPU only)
+             - `"generator"`: an optional `torch.Generator`
+         synchronized_generator (`torch.Generator`, *optional*):
+             A random number generator to keep synchronized across processes.
+         skip_batches (`int`, *optional*, defaults to 0):
+             The number of batches to skip at the beginning.
+         use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
+             Whether to have this class adapt `StatefulDataLoader` from `torchdata` instead of the regular `DataLoader`.
+         **kwargs (additional keyword arguments, *optional*):
+             All other keyword arguments to pass to the regular `DataLoader` initialization.
+
+     **Available attributes:**
+
+         - **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
+           Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
+           number of processes
+
+         - **total_dataset_length** (`int`) -- Total length of the inner dataset across all processes.
+     """
+
+     def __init__(
+         self,
+         dataset,
+         device=None,
+         rng_types=None,
+         synchronized_generator=None,
+         skip_batches=0,
+         use_stateful_dataloader=False,
+         _drop_last: bool = False,
+         _non_blocking: bool = False,
+         torch_device_mesh=None,
+         **kwargs,
+     ):
+         super().__init__(dataset, use_stateful_dataloader=use_stateful_dataloader, **kwargs)
+         self.device = device
+         self.rng_types = rng_types
+         self.synchronized_generator = synchronized_generator
+         self.skip_batches = skip_batches
+         self.gradient_state = GradientState()
+         self._drop_last = _drop_last
+         self._non_blocking = _non_blocking
+         self.iteration = 0
+
+     def __iter__(self):
+         if self.rng_types is not None:
+             synchronize_rng_states(self.rng_types, self.synchronized_generator)
+         self.begin()
+
+         self.set_epoch(self.iteration)
+         dataloader_iter = self.base_dataloader.__iter__()
+         # We iterate one batch ahead to check when we are at the end
+         try:
+             current_batch = next(dataloader_iter)
+         except StopIteration:
+             self.end()
+             return
+
+         batch_index = 0
+         while True:
+             try:
+                 # But we still move it to the device so it is done before `StopIteration` is reached
+                 if self.device is not None:
+                     current_batch = send_to_device(current_batch, self.device, non_blocking=self._non_blocking)
+                 self._update_state_dict()
+                 next_batch = next(dataloader_iter)
+                 if batch_index >= self.skip_batches:
+                     yield current_batch
+                 batch_index += 1
+                 current_batch = next_batch
+             except StopIteration:
+                 self.end_of_dataloader = True
+                 self._update_state_dict()
+                 if batch_index >= self.skip_batches:
+                     yield current_batch
+                 break
+
+         self.iteration += 1
+         self.end()
+
+     def __reduce__(self):
+         """
+         Define the `__reduce__` method to ensure a `DataLoaderShard` can be pickled and unpickled. This needs to be
+         explicitly defined since default pickling behavior is broken by `DataLoaderAdapter` messing with its
+         `__class__` member.
+         """
+         args = super().__reduce__()
+         return (DataLoaderShard, *args[1:])
+
+     def set_epoch(self, epoch: int):
+         # In case it is manually passed in, the user can set it to what they like
+         if self.iteration != epoch:
+             self.iteration = epoch
+         if hasattr(self.batch_sampler, "set_epoch"):
+             self.batch_sampler.set_epoch(epoch)
+         if hasattr(self.batch_sampler, "sampler") and hasattr(self.batch_sampler.sampler, "set_epoch"):
+             self.batch_sampler.sampler.set_epoch(epoch)
+         if (
+             hasattr(self.batch_sampler, "batch_sampler")
+             and hasattr(self.batch_sampler.batch_sampler, "sampler")
+             and hasattr(self.batch_sampler.batch_sampler.sampler, "set_epoch")
+         ):
+             self.batch_sampler.batch_sampler.sampler.set_epoch(epoch)
+         # We support if a custom `Dataset` implementation has `set_epoch`
+         # or in general HF datasets `Datasets`
+         elif hasattr(self.dataset, "set_epoch"):
+             self.dataset.set_epoch(epoch)
+
+     @property
+     def total_batch_size(self):
+         batch_sampler = self.sampler if isinstance(self.sampler, BatchSampler) else self.batch_sampler
+         return (
+             batch_sampler.batch_size
+             if getattr(batch_sampler, "split_batches", False)
+             else (batch_sampler.batch_size * getattr(batch_sampler, "num_processes", 1))
+         )
+
+     @property
+     def total_dataset_length(self):
+         if hasattr(self.dataset, "total_length"):
+             return self.dataset.total_length
+         else:
+             return len(self.dataset)
+
+     def get_sampler(self):
+         return get_sampler(self)
+
+     def set_sampler(self, sampler):
+         sampler_is_batch_sampler = isinstance(self.sampler, BatchSampler)
+         if sampler_is_batch_sampler:
+             self.sampler.sampler = sampler
+         else:
+             self.batch_sampler.sampler = sampler
+             if hasattr(self.batch_sampler, "batch_sampler"):
+                 self.batch_sampler.batch_sampler.sampler = sampler
+
+
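The `__iter__` above stays one batch ahead so it can flag `end_of_dataloader` before yielding the final batch. The control flow, reduced to its essence:

def lookahead(iterable):
    """Yield (item, is_last) pairs by keeping one element of headroom."""
    it = iter(iterable)
    try:
        current = next(it)
    except StopIteration:
        return
    while True:
        try:
            nxt = next(it)
        except StopIteration:
            yield current, True   # the last item is flagged before being yielded
            return
        yield current, False
        current = nxt

assert list(lookahead([1, 2, 3])) == [(1, False), (2, False), (3, True)]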
+ if is_torch_xla_available():
+     import torch_xla.distributed.parallel_loader as xpl
+
+     class MpDeviceLoaderWrapper(xpl.MpDeviceLoader):
+         """
+         Wrapper for the xpl.MpDeviceLoader class that knows the total batch size.
+
+         XLA preloading threads will all call DataLoaderShard's __iter__(). Remove rng_types from DataLoaderShard to
+         prevent it from using the XLA device in the preloading threads, and synchronize the RNG once from the main
+         thread only.
+
+         **Available attributes:**
+
+             - **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
+               Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the
+               total number of processes
+
+             - **total_dataset_length** (`int`) -- Total length of the inner dataset across all processes.
+         """
+
+         def __init__(self, dataloader: DataLoaderShard, device: torch.device):
+             super().__init__(dataloader, device)
+             self._rng_types = self._loader.rng_types
+             self._loader.rng_types = None
+             self.device = device
+
+         def __iter__(self):
+             if self._rng_types is not None:
+                 synchronize_rng_states(self._rng_types, self._loader.synchronized_generator)
+
+             return super().__iter__()
+
+         def set_epoch(self, epoch: int):
+             if hasattr(self.dataloader, "set_epoch"):
+                 self.dataloader.set_epoch(epoch)
+
+         @property
+         def total_batch_size(self):
+             return self._loader.total_batch_size
+
+         @property
+         def total_dataset_length(self):
+             return self._loader.total_dataset_length
+
+         @property
+         def batch_sampler(self):
+             return self._loader.batch_sampler
+
+         @property
+         def dataloader(self):
+             return self._loader
+
+
+ class DataLoaderDispatcher(DataLoaderAdapter, DataLoaderStateMixin):
+     """
+     Subclass of `DataLoaderAdapter` that will iterate and preprocess on process 0 only, then dispatch to each process
+     its part of the batch.
+
+     Args:
+         split_batches (`bool`, *optional*, defaults to `False`):
+             Whether the resulting `DataLoader` should split the batches of the original data loader across devices or
+             yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing by
+             `num_processes` batches at each iteration). Another way to see this is that the observed batch size will be
+             the same as the initial `dataloader` if this option is set to `True`, the batch size of the initial
+             `dataloader` multiplied by `num_processes` otherwise. Setting this option to `True` requires that the batch
+             size of the `dataloader` is a round multiple of `batch_size`.
+         skip_batches (`int`, *optional*, defaults to 0):
+             The number of batches to skip at the beginning of an iteration.
+         use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
+             Whether to have this class adapt `StatefulDataLoader` from `torchdata` instead of the regular `DataLoader`.
+
+     **Available attributes:**
+
+         - **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
+           Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
+           number of processes
+
+         - **total_dataset_length** (`int`) -- Total length of the inner dataset across all processes.
+     """
+
+     def __init__(
+         self,
+         dataset,
+         split_batches: bool = False,
+         skip_batches=0,
+         use_stateful_dataloader=False,
+         _drop_last: bool = False,
+         _non_blocking: bool = False,
+         slice_fn=None,
+         torch_device_mesh=None,
+         **kwargs,
+     ):
+         shuffle = False
+         from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe
+
+         # We need to save the shuffling state of the DataPipe
+         if isinstance(dataset, ShufflerIterDataPipe):
+             shuffle = dataset._shuffle_enabled
+         super().__init__(dataset, use_stateful_dataloader=use_stateful_dataloader, **kwargs)
+         self.split_batches = split_batches
+         if shuffle:
+             torch.utils.data.graph_settings.apply_shuffle_settings(dataset, shuffle=shuffle)
+
+         self.gradient_state = GradientState()
+         self.state = PartialState()
+         self._drop_last = _drop_last
+         self._non_blocking = _non_blocking
+         self.skip_batches = skip_batches
+         self.torch_device_mesh = torch_device_mesh
+
+         self.slice_fn = slice_tensors if slice_fn is None else slice_fn
+         self.iteration = 0
+
+         # If a device mesh is provided, extract each dimension (dp, fsdp, tp).
+         # A device mesh may hold any number of dimensions; however,
+         # the code below provides targeted support for dp, fsdp and tp.
+
+         # The device mesh will be used only if there is tp involved,
+         # or any multi-dimensional parallelism involving tp
+         # (dp, tp) (fsdp, tp) (dp, fsdp, tp);
+         # otherwise the default behaviour, not using a device mesh, should be sufficient,
+         # since multi-dimensional parallelism devoid of tp would anyway need
+         # different batches for each process irrespective of dp or fsdp.
+         self.submesh_tp = None
+         self.submesh_dp = None
+         self.submesh_fsdp = None
+         if self.torch_device_mesh and "tp" in self.torch_device_mesh.mesh_dim_names:
+             self.submesh_tp = self.torch_device_mesh["tp"]
+             if "dp" in self.torch_device_mesh.mesh_dim_names:
+                 self.submesh_dp = self.torch_device_mesh["dp"]
+             if "fsdp" in self.torch_device_mesh.mesh_dim_names:
+                 self.submesh_fsdp = self.torch_device_mesh["fsdp"]
+         if self.submesh_tp and (self.submesh_dp or self.submesh_fsdp):
+             raise ValueError("TP + (DP/FSDP) is not yet supported in dispatch mode")
+
+     def _fetch_batches(self, iterator):
+         batches, batch = None, None
+         # On process 0, we gather the batch to dispatch.
+         if self.state.process_index == 0:
+             # The procedure to support TP only is simpler,
+             # since we want to dispatch the same batch of samples across all ranks.
+             # This removes the complexity of handling multiple tp rank groups when a TP + DP
+             # combination is involved.
+
+             try:
+                 # For the TP case, avoid using split_batches,
+                 # since it would mean that the dataloader should be producing
+                 # duplicates of batches.
+                 if self.split_batches:
+                     # One batch of the main iterator is dispatched and split.
+                     if self.submesh_tp:
+                         logger.warning(
+                             "Use of split_batches for TP would need the dataloader to produce duplicate batches; "
+                             "otherwise, use dispatch_batches=True instead."
+                         )
+                     self._update_state_dict()
+                     batch = next(iterator)
+                 else:
+                     # num_processes batches of the main iterator are concatenated then dispatched and split.
+                     # We add the batches one by one so we have the remainder available when drop_last=False.
+                     batches = []
+                     if self.submesh_tp:
+                         # When tp, extract a single batch and then replicate it.
+                         self._update_state_dict()
+                         batch = next(iterator)
+                         batches = [batch] * self.state.num_processes
+                     else:
+                         for _ in range(self.state.num_processes):
+                             self._update_state_dict()
+                             batches.append(next(iterator))
+                     try:
+                         batch = concatenate(batches, dim=0)
+                     except RuntimeError as e:
+                         raise RuntimeError(
+                             "You can't use batches of different size with `dispatch_batches=True` or when using an `IterableDataset`. "
+                             "Either pass `dispatch_batches=False` and have each process fetch its own batch, "
+                             "or pass `split_batches=True`. By doing so, the main process will fetch a full batch and "
+                             "slice it into `num_processes` batches for each process."
+                         ) from e
+                 # In both cases, we need to get the structure of the batch that we will broadcast on other
+                 # processes to initialize the tensors with the right shape.
+                 # data_structure, stop_iteration
+                 batch_info = [get_data_structure(batch), False]
+             except StopIteration:
+                 batch_info = [None, True]
+         else:
+             batch_info = [None, self._stop_iteration]
+         # This is done in place, so after this instruction, every process has the same `batch_info` as process 0.
+         broadcast_object_list(batch_info)
+         self._stop_iteration = batch_info[1]
+         if self._stop_iteration:
+             # If drop_last is False and split_batches is False, we may have a remainder to take care of.
+             if not self.split_batches and not self._drop_last:
+                 if self.state.process_index == 0 and len(batches) > 0:
+                     batch = concatenate(batches, dim=0)
+                     batch_info = [get_data_structure(batch), False]
+                 else:
+                     batch_info = [None, True]
+                 broadcast_object_list(batch_info)
+         return batch, batch_info
+
+     def __iter__(self):
+         self.begin()
+         self.set_epoch(self.iteration)
+         main_iterator = None
+         if is_torch_version(">=", "2.0.1"):
+             # NOTE PyTorch DataLoader adds forward compatibilities for DataPipes, which broadcasts
+             # the shared seed to all dist processes. Thus, we need to create the iterator for all dist processes.
+             # But, we only iterate through the DataLoader on process 0.
+             main_iterator = self.base_dataloader.__iter__()
+         elif self.state.process_index == 0:
+             main_iterator = self.base_dataloader.__iter__()
+         stop_iteration = False
+         self._stop_iteration = False
+         first_batch = None
+         next_batch, next_batch_info = self._fetch_batches(main_iterator)
+         batch_index = 0
+         while not stop_iteration:
+             batch, batch_info = next_batch, next_batch_info
+
+             if self.state.process_index != 0:
+                 # Initialize tensors on other processes than process 0.
+                 batch = initialize_tensors(batch_info[0])
+             batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
+             # Broadcast the batch before splitting it.
+             batch = broadcast(batch, from_process=0)
+
+             if not self._drop_last and first_batch is None:
+                 # We keep at least num processes elements of the first batch to be able to complete the last batch
+                 first_batch = self.slice_fn(
+                     batch,
+                     slice(0, self.state.num_processes),
+                     process_index=self.state.process_index,
+                     num_processes=self.state.num_processes,
+                 )
+
+             if batch is None:
+                 raise ValueError(
+                     f"Batch does not contain any data (`{batch}`). The end of all available iterable data was reached before the expected stop iteration."
+                 )
+
+             observed_batch_size = find_batch_size(batch)
+             batch_size = observed_batch_size // self.state.num_processes
+
+             stop_iteration = self._stop_iteration
+             if not stop_iteration:
+                 # We may still be at the end of the dataloader without knowing it yet: if there is nothing left in
+                 # the dataloader since the number of batches is a round multiple of the number of processes.
+                 next_batch, next_batch_info = self._fetch_batches(main_iterator)
+                 # next_batch_info[0] is None when there are no more batches, otherwise we still need to process them.
+                 if self._stop_iteration and next_batch_info[0] is None:
+                     stop_iteration = True
+
+             if not self._drop_last and stop_iteration and observed_batch_size % self.state.num_processes != 0:
+                 # If the last batch is not complete, let's add the first batch to it.
+                 batch = concatenate([batch, first_batch], dim=0)
+                 # The batch size computation above is wrong, it's off by 1, so we fix it.
+                 batch_size += 1
+
+             data_slice = slice(self.state.process_index * batch_size, (self.state.process_index + 1) * batch_size)
+             batch = self.slice_fn(
+                 batch,
+                 data_slice,
+                 process_index=self.state.process_index,
+                 num_processes=self.state.num_processes,
+             )
+
+             if stop_iteration:
+                 self.end_of_dataloader = True
+                 self._update_state_dict()
+                 self.remainder = observed_batch_size
+             if batch_index >= self.skip_batches:
+                 yield batch
+             batch_index += 1
+         self.iteration += 1
+         self.end()
+
+     def set_epoch(self, epoch: int):
+         # In case it is manually passed in, the user can set it to what they like
+         if self.iteration != epoch:
+             self.iteration = epoch
+         if hasattr(self.batch_sampler, "sampler") and hasattr(self.batch_sampler.sampler, "set_epoch"):
+             self.batch_sampler.sampler.set_epoch(epoch)
+         elif hasattr(self.dataset, "set_epoch"):
+             self.dataset.set_epoch(epoch)
+
+     def __len__(self):
+         whole_length = len(self.base_dataloader)
+         if self.split_batches:
+             return whole_length
+         elif self._drop_last:
+             return whole_length // self.state.num_processes
+         else:
+             return math.ceil(whole_length / self.state.num_processes)
+
+     def __reduce__(self):
+         """
+         Define the `__reduce__` method to ensure a `DataLoaderDispatcher` can be pickled and unpickled. This needs to
+         be explicitly defined since default pickling behavior is broken by `DataLoaderAdapter` messing with its
+         `__class__` member.
+         """
+         args = super().__reduce__()
+         return (DataLoaderDispatcher, *args[1:])
+
+     @property
+     def total_batch_size(self):
+         return (
+             self.dataset.batch_size if self.split_batches else (self.dataset.batch_size * self.dataset.num_processes)
+         )
+
+     @property
+     def total_dataset_length(self):
+         return len(self.dataset)
+
+     def get_sampler(self):
+         return get_sampler(self)
+
+     def set_sampler(self, sampler):
+         sampler_is_batch_sampler = isinstance(self.sampler, BatchSampler)
+         if sampler_is_batch_sampler:
+             self.sampler.sampler = sampler
+         else:
+             self.batch_sampler.sampler = sampler
+             if hasattr(self.batch_sampler, "batch_sampler"):
+                 self.batch_sampler.batch_sampler.sampler = sampler
+
+
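Stripped of the `broadcast`/`broadcast_object_list` machinery, the dispatch step reduces to rank 0 assembling a global batch and every rank slicing out its share; a toy single-process illustration of that slicing:

import torch

global_batch = torch.arange(12).reshape(12, 1)  # what rank 0 would fetch
num_processes = 4
per_rank = global_batch.shape[0] // num_processes

shards = [
    global_batch[rank * per_rank : (rank + 1) * per_rank]  # slice_fn's job
    for rank in range(num_processes)
]
assert all(s.shape == (3, 1) for s in shards)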
+ def get_sampler(dataloader):
+     """
+     Get the sampler associated with the dataloader
+
+     Args:
+         dataloader (`torch.utils.data.dataloader.DataLoader`):
+             The data loader to split across several devices.
+     Returns:
+         `torch.utils.data.Sampler`: The sampler associated with the dataloader
+     """
+     sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
+     if sampler_is_batch_sampler:
+         sampler = getattr(dataloader.sampler, "sampler", None)
+     else:
+         sampler = getattr(dataloader.batch_sampler, "sampler", None)
+     return sampler
+
+
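For example, the helper digs the underlying sampler out regardless of how the loader was built (a shuffled toy loader here):

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

loader = DataLoader(TensorDataset(torch.zeros(4)), shuffle=True)
assert isinstance(get_sampler(loader), RandomSampler)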
+ def prepare_data_loader(
+     dataloader: DataLoader,
+     device: Optional[torch.device] = None,
+     num_processes: Optional[int] = None,
+     process_index: Optional[int] = None,
+     split_batches: bool = False,
+     put_on_device: bool = False,
+     rng_types: Optional[list[Union[str, RNGType]]] = None,
+     dispatch_batches: Optional[bool] = None,
+     even_batches: bool = True,
+     slice_fn_for_dispatch: Optional[Callable] = None,
+     use_seedable_sampler: bool = False,
+     data_seed: Optional[int] = None,
+     non_blocking: bool = False,
+     use_stateful_dataloader: bool = False,
+     torch_device_mesh=None,
+ ) -> DataLoader:
+     """
+     Wraps a PyTorch `DataLoader` to generate batches for one of the processes only.
+
+     Depending on the value of the `drop_last` attribute of the `dataloader` passed, it will either stop the iteration
+     at the first batch that would be too small / not present on all processes or loop with indices from the beginning.
+
+     Args:
+         dataloader (`torch.utils.data.dataloader.DataLoader`):
+             The data loader to split across several devices.
+         device (`torch.device`):
+             The target device for the returned `DataLoader`.
+         num_processes (`int`, *optional*):
+             The number of processes running concurrently. Will default to the value given by [`~state.PartialState`].
+         process_index (`int`, *optional*):
+             The index of the current process. Will default to the value given by [`~state.PartialState`].
+         split_batches (`bool`, *optional*, defaults to `False`):
+             Whether the resulting `DataLoader` should split the batches of the original data loader across devices or
+             yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing by
+             `num_processes` batches at each iteration).
+
+             Another way to see this is that the observed batch size will be the same as the initial `dataloader` if
+             this option is set to `True`, the batch size of the initial `dataloader` multiplied by `num_processes`
+             otherwise.
+
+             Setting this option to `True` requires that the batch size of the `dataloader` is a round multiple of
+             `batch_size`.
+         put_on_device (`bool`, *optional*, defaults to `False`):
+             Whether or not to put the batches on `device` (only works if the batches are nested lists, tuples or
+             dictionaries of tensors).
+         rng_types (list of `str` or [`~utils.RNGType`]):
+             The list of random number generators to synchronize at the beginning of each iteration. Should be one or
+             several of:
+
+             - `"torch"`: the base torch random number generator
+             - `"cuda"`: the CUDA random number generator (GPU only)
+             - `"xla"`: the XLA random number generator (TPU only)
+             - `"generator"`: the `torch.Generator` of the sampler (or batch sampler if there is no sampler in your
+               dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type.
+
+         dispatch_batches (`bool`, *optional*):
+             If set to `True`, the dataloader prepared is only iterated through on the main process and then the batches
+             are split and broadcast to each process. Will default to `True` when the underlying dataset is an
+             `IterableDataset`, `False` otherwise.
+         even_batches (`bool`, *optional*, defaults to `True`):
+             If set to `True`, in cases where the total batch size across all processes does not exactly divide the
+             dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
+             all workers.
+         slice_fn_for_dispatch (`Callable`, *optional*):
+             If passed, this function will be used to slice tensors across `num_processes`. Will default to
+             [`~utils.slice_tensors`]. This argument is used only when `dispatch_batches` is set to `True` and will be
+             ignored otherwise.
+         use_seedable_sampler (`bool`, *optional*, defaults to `False`):
+             Whether to use the [`~data_loader.SeedableRandomSampler`] instead of a `RandomSampler` for better
+             reproducibility. Comes at a cost of potentially different performances due to different shuffling
+             algorithms but ensures results will be the *exact* same. Should be paired with `set_seed()` at every
+             `self.set_epoch`
+         data_seed (`int`, *optional*, defaults to `None`):
+             The seed to use for the underlying generator when using `use_seedable_sampler`. If `None`, the generator
+             will use the current default seed from torch.
+         non_blocking (`bool`, *optional*, defaults to `False`):
+             If set to `True`, the dataloader will utilize non-blocking host-to-device transfers. If the dataloader has
+             `pin_memory` set to `True`, this will help to increase overlap between data transfer and computations.
+         use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
+             If set to `True`, the dataloader prepared by the Accelerator will be backed by
+             [torchdata.StatefulDataLoader](https://github.com/pytorch/data/tree/main/torchdata/stateful_dataloader).
+             This requires `torchdata` version 0.8.0 or higher that supports StatefulDataLoader to be installed.
+         torch_device_mesh (`torch.distributed.DeviceMesh`, *optional*, defaults to `None`):
+             PyTorch device mesh.
+
+
+     Returns:
+         `torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches
+
+     <Tip warning={true}>
+
+     `BatchSampler`s with varying batch sizes are not enabled by default. To enable this behaviour, set `even_batches`
+     equal to `False`
+
+     </Tip>
+     """
+     if dispatch_batches is None:
+         if not put_on_device:
+             dispatch_batches = False
+         else:
+             dispatch_batches = isinstance(dataloader.dataset, IterableDataset)
+
+     if dispatch_batches and not put_on_device:
+         raise ValueError("Using `dispatch_batches=True` requires `put_on_device=True`.")
+     # Grab defaults from PartialState
+     state = PartialState()
+     if num_processes is None:
+         num_processes = state.num_processes
+
+     if process_index is None:
+         process_index = state.process_index
+
+     if torch_device_mesh:
+         if state.distributed_type == DistributedType.DEEPSPEED:
+             # In DeepSpeed, the optimizer sharding level in DP is determined by the config file.
+             # Only considers "dp" and "tp".
+             # Given a device mesh (dp, tp) = (2, 3):
+             # - From the data parallel perspective, ranks should be structured as: 0 0 0 1 1 1
+             # - Processes with the same DP rank will receive the same batch.
+             submesh_tp_size = 1
+             if "tp" in torch_device_mesh.mesh_dim_names:
+                 submesh_tp_size = torch_device_mesh["tp"].size()
+             process_index = process_index // submesh_tp_size
+             num_processes = num_processes // submesh_tp_size
+         else:
+             # When a device mesh is used, specifically with TP,
+             # process_index and num_processes need to be updated
+             # so that the same batch is generated across TP ranks
+             # and different batches across FSDP and DP ranks.
+             # Example:
+             # if the device mesh is (dp, fsdp, tp) = (2, 2, 3),
+             # ranks would range from 0...11;
+             # from the data angle, ranks should look like 0 0 0 1 1 1 2 2 2 3 3 3,
+             # and processes with the same ranks/ids would receive the same batch.
+             # For CP, the same as TP applies.
+             submesh_fsdp_size = 1
+             submesh_dp_size = 1
+             submesh_tp_size = 1
+             submesh_cp_size = 1
+             if "tp" in torch_device_mesh.mesh_dim_names:
+                 submesh_tp_size = torch_device_mesh["tp"].size()
+             if "cp" in torch_device_mesh.mesh_dim_names:
+                 submesh_cp_size = torch_device_mesh["cp"].size()
+             if "dp_replicate" in torch_device_mesh.mesh_dim_names:
+                 submesh_dp_size = torch_device_mesh["dp_replicate"].size()
+             if "dp_shard" in torch_device_mesh.mesh_dim_names:
+                 submesh_fsdp_size = torch_device_mesh["dp_shard"].size()
+             process_index = process_index // (submesh_tp_size * submesh_cp_size)
+             num_processes = submesh_fsdp_size * submesh_dp_size
+
+     # Sanity check
+     if split_batches:
+         if dataloader.batch_size is not None:
+             batch_size_for_check = dataloader.batch_size
+         else:
+             # For custom batch_sampler
+             if hasattr(dataloader.batch_sampler, "batch_size"):
+                 batch_size_for_check = dataloader.batch_sampler.batch_size
+             else:
+                 raise ValueError(
+                     "In order to use `split_batches==True` you must have a `batch_size` attribute either in the passed "
+                     "`dataloader` or `dataloader.batch_sampler` objects, and it has to return a natural number. "
+                     "Your `dataloader.batch_size` is None and `dataloader.batch_sampler` "
+                     f"(`{type(dataloader.batch_sampler)}`) does not have the `batch_size` attribute set."
+                 )
+
+         if batch_size_for_check > 1 and batch_size_for_check % num_processes != 0:
+             raise ValueError(
+                 f"To use a `DataLoader` in `split_batches` mode, the batch size ({dataloader.batch_size}) "
+                 f"needs to be a round multiple of the number of processes ({num_processes})."
+             )
+
+     new_dataset = dataloader.dataset
+     # An iterable dataset doesn't like batch_sampler, but a DataLoader creates a default one for it
+     new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
+     sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
+     synchronized_generator = None
+
+     sampler = get_sampler(dataloader)
+     if isinstance(sampler, RandomSampler) and use_seedable_sampler:
+         # When iterating through the dataloader during distributed processes
+         # we want to ensure that on each process we are iterating through the same
+         # samples in the same order if a seed is set. This requires a tweak
+         # to the `torch.utils.data.RandomSampler` class (if used).
+         sampler = SeedableRandomSampler(
+             data_source=sampler.data_source,
+             replacement=sampler.replacement,
+             num_samples=sampler._num_samples,
+             generator=getattr(
+                 sampler,
+                 "generator",
+                 torch.Generator(device=torch.get_default_device() if hasattr(torch, "get_default_device") else "cpu"),
+             ),
+             data_seed=data_seed,
+         )
+
+     if isinstance(dataloader.sampler, RandomSampler) and state.distributed_type == DistributedType.XLA:
+         # isinstance(dataloader.sampler, RandomSampler) indicates the original dataloader has `shuffle` enabled.
+         generator = torch.Generator(
+             device=torch.get_default_device() if hasattr(torch, "get_default_device") else "cpu"
+         )
+         seed = int(torch.empty((), dtype=torch.int64).random_().item())
+         generator.manual_seed(seed)
+         dataloader.generator = generator
+         dataloader.sampler.generator = generator
+     # No change if no multiprocess
+     if (num_processes != 1 or state.distributed_type == DistributedType.MEGATRON_LM) and not dispatch_batches:
+         if is_datasets_available():
+             from datasets import IterableDataset as DatasetsIterableDataset
+         if (
+             is_datasets_available()
+             and isinstance(new_dataset, DatasetsIterableDataset)
+             and not split_batches
+             and new_dataset.n_shards > num_processes
+         ):
+             new_dataset = new_dataset.shard(num_shards=num_processes, index=process_index)
+         elif isinstance(new_dataset, IterableDataset):
+             if getattr(dataloader.dataset, "generator", None) is not None:
+                 synchronized_generator = dataloader.dataset.generator
+             new_dataset = IterableDatasetShard(
+                 new_dataset,
+                 batch_size=dataloader.batch_size,
+                 drop_last=dataloader.drop_last,
+                 num_processes=num_processes,
+                 process_index=process_index,
+                 split_batches=split_batches,
+             )
+         else:
+             if not use_seedable_sampler and hasattr(sampler, "generator"):
+                 if sampler.generator is None:
+                     sampler.generator = torch.Generator(
+                         device=torch.get_default_device() if hasattr(torch, "get_default_device") else "cpu"
+                     )
+                     seed = int(torch.empty((), dtype=torch.int64).random_().item())
+                     sampler.generator.manual_seed(seed)
+                 synchronized_generator = sampler.generator
+             batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
+             new_batch_sampler = BatchSamplerShard(
+                 batch_sampler,
+                 num_processes=num_processes,
+                 process_index=process_index,
+                 split_batches=split_batches,
+                 even_batches=even_batches,
+             )
+
+     # We ignore all of those since they are all dealt with by our new_batch_sampler
+     ignore_kwargs = [
+         "batch_size",
+         "shuffle",
+         "sampler",
+         "batch_sampler",
+         "drop_last",
+     ]
+
+     if rng_types is not None and synchronized_generator is None and "generator" in rng_types:
+         rng_types.remove("generator")
+
+     kwargs = {
+         k: getattr(dataloader, k, _PYTORCH_DATALOADER_KWARGS[k])
+         for k in _PYTORCH_DATALOADER_KWARGS
+         if k not in ignore_kwargs
+     }
+
+     # Need to provide batch_size, as batch_sampler is None for iterable datasets
+     if new_batch_sampler is None:
+         kwargs["drop_last"] = dataloader.drop_last
+         kwargs["batch_size"] = (
+             dataloader.batch_size // num_processes if split_batches and not dispatch_batches else dataloader.batch_size
+         )
+     if dispatch_batches:
+         kwargs.pop("generator")
+         dataloader = DataLoaderDispatcher(
+             new_dataset,
+             split_batches=split_batches,
+             batch_sampler=new_batch_sampler,
+             _drop_last=dataloader.drop_last,
+             _non_blocking=non_blocking,
+             slice_fn=slice_fn_for_dispatch,
+             use_stateful_dataloader=use_stateful_dataloader,
+             torch_device_mesh=torch_device_mesh,
+             **kwargs,
+         )
+     elif sampler_is_batch_sampler:
+         dataloader = DataLoaderShard(
+             new_dataset,
+             device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
+             sampler=new_batch_sampler,
+             batch_size=dataloader.batch_size,
+             rng_types=rng_types,
+             _drop_last=dataloader.drop_last,
+             _non_blocking=non_blocking,
+             synchronized_generator=synchronized_generator,
1289
+ use_stateful_dataloader=use_stateful_dataloader,
1290
+ **kwargs,
1291
+ )
1292
+ else:
1293
+ dataloader = DataLoaderShard(
1294
+ new_dataset,
1295
+ device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
1296
+ batch_sampler=new_batch_sampler,
1297
+ rng_types=rng_types,
1298
+ synchronized_generator=synchronized_generator,
1299
+ _drop_last=dataloader.drop_last,
1300
+ _non_blocking=non_blocking,
1301
+ use_stateful_dataloader=use_stateful_dataloader,
1302
+ **kwargs,
1303
+ )
1304
+
1305
+ if isinstance(sampler, SeedableRandomSampler) and use_seedable_sampler:
1306
+ dataloader.set_sampler(sampler)
1307
+ if state.distributed_type == DistributedType.XLA:
1308
+ return MpDeviceLoaderWrapper(dataloader, device)
1309
+ return dataloader
1310
+
1311
+
1312
+ class SkipBatchSampler(BatchSampler):
1313
+ """
1314
+ A `torch.utils.data.BatchSampler` that skips the first `n` batches of another `torch.utils.data.BatchSampler`.
1315
+ Should not be used if the original dataloader is a `StatefulDataLoader`.
1316
+ """
1317
+
1318
+ def __init__(self, batch_sampler, skip_batches=0):
1319
+ self.batch_sampler = batch_sampler
1320
+ self.skip_batches = skip_batches
1321
+
1322
+ def __iter__(self):
1323
+ for index, samples in enumerate(self.batch_sampler):
1324
+ if index >= self.skip_batches:
1325
+ yield samples
1326
+
1327
+ @property
1328
+ def total_length(self):
1329
+ return len(self.batch_sampler)
1330
+
1331
+ def __len__(self):
1332
+ return len(self.batch_sampler) - self.skip_batches
1333
+
1334
+
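A minimal usage sketch of `SkipBatchSampler` (toy indices; not part of the vendored file):

from torch.utils.data import BatchSampler, SequentialSampler

# Four batches of four indices; the first two batches are skipped.
base = BatchSampler(SequentialSampler(range(16)), batch_size=4, drop_last=False)
skipping = SkipBatchSampler(base, skip_batches=2)
assert list(skipping) == [[8, 9, 10, 11], [12, 13, 14, 15]]
assert len(skipping) == 2 and skipping.total_length == 4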
1335
+ class SkipDataLoader(DataLoaderAdapter, DataLoaderStateMixin):
1336
+ """
1337
+ Subclass of a PyTorch `DataLoader` that will skip the first batches. Generally it's preferable to use
1338
+ `skip_first_batches`/`torchdata.StatefulDataLoader` instead of this class.
1339
+
1340
+ Args:
1341
+ dataset (`torch.utils.data.dataset.Dataset`):
1342
+ The dataset to use to build this dataloader.
1343
+ skip_batches (`int`, *optional*, defaults to 0):
1344
+ The number of batches to skip at the beginning.
1345
+ kwargs:
1346
+ All other keyword arguments to pass to the regular `DataLoader` initialization.
1347
+ """
1348
+
1349
+ def __init__(self, dataset, skip_batches=0, use_stateful_dataloader=False, **kwargs):
1350
+ super().__init__(dataset, use_stateful_dataloader=use_stateful_dataloader, **kwargs)
1351
+ self.skip_batches = skip_batches
1352
+ self.gradient_state = GradientState()
1353
+
1354
+ def __iter__(self):
1355
+ self.begin()
1356
+ for index, batch in enumerate(self.base_dataloader.__iter__()):
1357
+ if index >= self.skip_batches:
1358
+ self._update_state_dict()
1359
+ yield batch
1360
+ self.end()
1361
+
1362
+ def __len__(self):
1363
+ return len(self.base_dataloader) - self.skip_batches
1364
+
1365
+ def __reduce__(self):
1366
+ """
1367
+ Define the `__reduce__` method to ensure a `SkipDataLoader` can be pickled and unpickled. This needs to be
1368
+ explicitly defined since default pickling behavior is broken by `DataLoaderAdapter` messing with its
1369
+ `__class__` member.
1370
+ """
1371
+ args = super().__reduce__()
1372
+ return (SkipDataLoader, *args[1:])
1373
+
1374
+
1375
+ def skip_first_batches(dataloader, num_batches=0):
1376
+ """
1377
+ Creates a `torch.utils.data.DataLoader` that will efficiently skip the first `num_batches`. Should not be used if
1378
+ the original dataloader is a `StatefulDataLoader`.
1379
+ """
1380
+ state = PartialState()
1381
+ if state.distributed_type == DistributedType.XLA:
1382
+ device = dataloader.device
1383
+ dataloader = dataloader.dataloader
1384
+
1385
+ dataset = dataloader.dataset
1386
+ sampler_is_batch_sampler = False
1387
+ if isinstance(dataset, IterableDataset):
1388
+ new_batch_sampler = None
1389
+ else:
1390
+ sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
1391
+ batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
1392
+ new_batch_sampler = SkipBatchSampler(batch_sampler, skip_batches=num_batches)
1393
+
1394
+ # We ignore all of those since they are all dealt with by our new_batch_sampler
1395
+ ignore_kwargs = [
1396
+ "batch_size",
1397
+ "shuffle",
1398
+ "sampler",
1399
+ "batch_sampler",
1400
+ "drop_last",
1401
+ ]
1402
+
1403
+ kwargs = {
1404
+ k: getattr(dataloader, k, _PYTORCH_DATALOADER_KWARGS[k])
1405
+ for k in _PYTORCH_DATALOADER_KWARGS
1406
+ if k not in ignore_kwargs
1407
+ }
1408
+
1409
+ # Need to provide batch_size as batch_sampler is None for Iterable dataset
1410
+ if new_batch_sampler is None:
1411
+ kwargs["drop_last"] = dataloader.drop_last
1412
+ kwargs["batch_size"] = dataloader.batch_size
1413
+
1414
+ if isinstance(dataloader, DataLoaderDispatcher):
1415
+ if new_batch_sampler is None:
1416
+ # Need to manually skip batches in the dataloader
1417
+ kwargs["skip_batches"] = num_batches
1418
+ dataloader = DataLoaderDispatcher(
1419
+ dataset,
1420
+ split_batches=dataloader.split_batches,
1421
+ batch_sampler=new_batch_sampler,
1422
+ _drop_last=dataloader._drop_last,
1423
+ **kwargs,
1424
+ )
1425
+ elif isinstance(dataloader, DataLoaderShard):
1426
+ if new_batch_sampler is None:
1427
+ # Need to manually skip batches in the dataloader
1428
+ kwargs["skip_batches"] = num_batches
1429
+ elif sampler_is_batch_sampler:
1430
+ kwargs["sampler"] = new_batch_sampler
1431
+ kwargs["batch_size"] = dataloader.batch_size
1432
+ else:
1433
+ kwargs["batch_sampler"] = new_batch_sampler
1434
+ dataloader = DataLoaderShard(
1435
+ dataset,
1436
+ device=dataloader.device,
1437
+ rng_types=dataloader.rng_types,
1438
+ synchronized_generator=dataloader.synchronized_generator,
1439
+ **kwargs,
1440
+ )
1441
+ else:
1442
+ if new_batch_sampler is None:
1443
+ # Need to manually skip batches in the dataloader
1444
+ dataloader = SkipDataLoader(dataset, skip_batches=num_batches, **kwargs)
1445
+ else:
1446
+ dataloader = DataLoader(dataset, batch_sampler=new_batch_sampler, **kwargs)
1447
+
1448
+ if state.distributed_type == DistributedType.XLA:
1449
+ dataloader = MpDeviceLoaderWrapper(dataloader, device)
1450
+
1451
+ return dataloader
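A hedged sketch of resuming mid-epoch with `skip_first_batches` (toy tensors, single process; illustrative only):

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(100))
dataloader = DataLoader(dataset, batch_size=10, shuffle=False)

# Resume an epoch as if the first three batches were already consumed.
resumed = skip_first_batches(dataloader, num_batches=3)
(first_batch,) = next(iter(resumed))
assert first_batch[0].item() == 30  # items 0-29 were skipped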
pythonProject/.venv/Lib/site-packages/accelerate/hooks.py ADDED
@@ -0,0 +1,776 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import functools
16
+ from collections.abc import Mapping
17
+ from typing import Optional, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from .state import PartialState
23
+ from .utils import (
24
+ PrefixedDataset,
25
+ find_device,
26
+ named_module_tensors,
27
+ send_to_device,
28
+ set_module_tensor_to_device,
29
+ )
30
+ from .utils.imports import (
31
+ is_mlu_available,
32
+ is_musa_available,
33
+ is_npu_available,
34
+ )
35
+ from .utils.memory import clear_device_cache
36
+ from .utils.modeling import get_non_persistent_buffers
37
+ from .utils.other import recursive_getattr
38
+
39
+
40
+ _accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "sdaa", "musa"]
41
+
42
+
43
+ class ModelHook:
44
+ """
45
+ A hook that contains callbacks to be executed just before and after the forward method of a model. The difference
46
+ from existing PyTorch hooks is that they also get passed the kwargs.
47
+
48
+ Class attribute:
49
+ - **no_grad** (`bool`, *optional*, defaults to `False`) -- Whether or not to execute the actual forward pass under
50
+ the `torch.no_grad()` context manager.
51
+ """
52
+
53
+ no_grad = False
54
+
55
+ def init_hook(self, module):
56
+ """
57
+ To be executed when the hook is attached to the module.
58
+
59
+ Args:
60
+ module (`torch.nn.Module`): The module attached to this hook.
61
+ """
62
+ return module
63
+
64
+ def pre_forward(self, module, *args, **kwargs):
65
+ """
66
+ To be executed just before the forward method of the model.
67
+
68
+ Args:
69
+ module (`torch.nn.Module`): The module whose forward pass will be executed just after this event.
70
+ args (`Tuple[Any]`): The positional arguments passed to the module.
71
+ kwargs (`Dict[Str, Any]`): The keyword arguments passed to the module.
72
+
73
+ Returns:
74
+ `Tuple[Tuple[Any], Dict[Str, Any]]`: A tuple with the treated `args` and `kwargs`.
75
+ """
76
+ return args, kwargs
77
+
78
+ def post_forward(self, module, output):
79
+ """
80
+ To be executed just after the forward method of the model.
81
+
82
+ Args:
83
+ module (`torch.nn.Module`): The module whose forward pass has been executed just before this event.
84
+ output (`Any`): The output of the module.
85
+
86
+ Returns:
87
+ `Any`: The processed `output`.
88
+ """
89
+ return output
90
+
91
+ def detach_hook(self, module):
92
+ """
93
+ To be executed when the hook is detached from a module.
94
+
95
+ Args:
96
+ module (`torch.nn.Module`): The module detached from this hook.
97
+ """
98
+ return module
99
+
100
+
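As a concrete illustration, a sketch of a custom hook (the `LoggingHook` name is invented for this example):

class LoggingHook(ModelHook):
    """Print a message around every forward pass of the hooked module."""

    def pre_forward(self, module, *args, **kwargs):
        print(f"{module.__class__.__name__}: forward starting")
        return args, kwargs

    def post_forward(self, module, output):
        print(f"{module.__class__.__name__}: forward finished")
        return output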
101
+ class SequentialHook(ModelHook):
102
+ """
103
+ A hook that can contain several hooks and iterates through them at each event.
104
+ """
105
+
106
+ def __init__(self, *hooks):
107
+ self.hooks = hooks
108
+
109
+ def init_hook(self, module):
110
+ for hook in self.hooks:
111
+ module = hook.init_hook(module)
112
+ return module
113
+
114
+ def pre_forward(self, module, *args, **kwargs):
115
+ for hook in self.hooks:
116
+ args, kwargs = hook.pre_forward(module, *args, **kwargs)
117
+ return args, kwargs
118
+
119
+ def post_forward(self, module, output):
120
+ for hook in self.hooks:
121
+ output = hook.post_forward(module, output)
122
+ return output
123
+
124
+ def detach_hook(self, module):
125
+ for hook in self.hooks:
126
+ module = hook.detach_hook(module)
127
+ return module
128
+
129
+
130
+ def add_hook_to_module(module: nn.Module, hook: ModelHook, append: bool = False):
131
+ """
132
+ Adds a hook to a given module. This will rewrite the `forward` method of the module to include the hook; to remove
133
+ this behavior and restore the original `forward` method, use `remove_hook_from_module`.
134
+
135
+ <Tip warning={true}>
136
+
137
+ If the module already contains a hook, this will replace it with the new hook passed by default. To chain two hooks
138
+ together, pass `append=True`, so it chains the current and new hook into an instance of the `SequentialHook` class.
139
+
140
+ </Tip>
141
+
142
+ Args:
143
+ module (`torch.nn.Module`):
144
+ The module to attach a hook to.
145
+ hook (`ModelHook`):
146
+ The hook to attach.
147
+ append (`bool`, *optional*, defaults to `False`):
148
+ Whether the hook should be chained with an existing one (if module already contains a hook) or not.
149
+
150
+ Returns:
151
+ `torch.nn.Module`: The same module, with the hook attached (the module is modified in place, so the result can
152
+ be discarded).
153
+ """
154
+ if append and (getattr(module, "_hf_hook", None) is not None):
155
+ old_hook = module._hf_hook
156
+ remove_hook_from_module(module)
157
+ hook = SequentialHook(old_hook, hook)
158
+
159
+ if hasattr(module, "_hf_hook") and hasattr(module, "_old_forward"):
160
+ # If we already put some hook on this module, we replace it with the new one.
161
+ old_forward = module._old_forward
162
+ else:
163
+ old_forward = module.forward
164
+ module._old_forward = old_forward
165
+
166
+ module = hook.init_hook(module)
167
+ module._hf_hook = hook
168
+
169
+ def new_forward(module, *args, **kwargs):
170
+ args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)
171
+ if module._hf_hook.no_grad:
172
+ with torch.no_grad():
173
+ output = module._old_forward(*args, **kwargs)
174
+ else:
175
+ output = module._old_forward(*args, **kwargs)
176
+ return module._hf_hook.post_forward(module, output)
177
+
178
+ # Overriding a GraphModuleImpl forward freezes the forward call and later modifications on the graph will fail.
179
+ # Reference: https://pytorch.slack.com/archives/C3PDTEV8E/p1705929610405409
180
+ if "GraphModuleImpl" in str(type(module)):
181
+ module.__class__.forward = functools.update_wrapper(functools.partial(new_forward, module), old_forward)
182
+ else:
183
+ module.forward = functools.update_wrapper(functools.partial(new_forward, module), old_forward)
184
+
185
+ return module
186
+
187
+
188
+ def remove_hook_from_module(module: nn.Module, recurse=False):
189
+ """
190
+ Removes any hook attached to a module via `add_hook_to_module`.
191
+
192
+ Args:
193
+ module (`torch.nn.Module`): The module to remove the hook from.
194
+ recurse (`bool`, *optional*): Whether to remove the hooks recursively.
195
+
196
+ Returns:
197
+ `torch.nn.Module`: The same module, with the hook detached (the module is modified in place, so the result can
198
+ be discarded).
199
+ """
200
+
201
+ if hasattr(module, "_hf_hook"):
202
+ module._hf_hook.detach_hook(module)
203
+ delattr(module, "_hf_hook")
204
+
205
+ if hasattr(module, "_old_forward"):
206
+ # Overriding a GraphModuleImpl forward freezes the forward call and later modifications on the graph will fail.
207
+ # Reference: https://pytorch.slack.com/archives/C3PDTEV8E/p1705929610405409
208
+ if "GraphModuleImpl" in str(type(module)):
209
+ module.__class__.forward = module._old_forward
210
+ else:
211
+ module.forward = module._old_forward
212
+ delattr(module, "_old_forward")
213
+
214
+ # Remove accelerate added warning hooks from dispatch_model
215
+ for attr in _accelerate_added_attributes:
216
+ module.__dict__.pop(attr, None)
217
+
218
+ if recurse:
219
+ for child in module.children():
220
+ remove_hook_from_module(child, recurse)
221
+
222
+ return module
223
+
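Putting the two functions above together with the hypothetical `LoggingHook` sketched earlier (illustrative only):

import torch
import torch.nn as nn

layer = nn.Linear(4, 4)
add_hook_to_module(layer, LoggingHook())
_ = layer(torch.randn(2, 4))                            # prints the before/after messages
add_hook_to_module(layer, LoggingHook(), append=True)   # chains both hooks via SequentialHook
remove_hook_from_module(layer)                          # restores the original forward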
224
+
225
+ class AlignDevicesHook(ModelHook):
226
+ """
227
+ A generic `ModelHook` that ensures inputs and model weights are on the same device for the forward pass of the
228
+ associated module, potentially offloading the weights after the forward pass.
229
+
230
+ Args:
231
+ execution_device (`torch.device`, *optional*):
232
+ The device on which inputs and model weights should be placed before the forward pass.
233
+ offload (`bool`, *optional*, defaults to `False`):
234
+ Whether or not the weights should be offloaded after the forward pass.
235
+ io_same_device (`bool`, *optional*, defaults to `False`):
236
+ Whether or not the output should be placed on the same device as the input was.
237
+ weights_map (`Mapping[str, torch.Tensor]`, *optional*):
238
+ When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
239
+ offload_buffers (`bool`, *optional*, defaults to `False`):
240
+ Whether or not to include the associated module's buffers when offloading.
241
+ place_submodules (`bool`, *optional*, defaults to `False`):
242
+ Whether to place the submodules on `execution_device` during the `init_hook` event.
243
+ """
244
+
245
+ def __init__(
246
+ self,
247
+ execution_device: Optional[Union[int, str, torch.device]] = None,
248
+ offload: bool = False,
249
+ io_same_device: bool = False,
250
+ weights_map: Optional[Mapping] = None,
251
+ offload_buffers: bool = False,
252
+ place_submodules: bool = False,
253
+ skip_keys: Optional[Union[str, list[str]]] = None,
254
+ tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
255
+ ):
256
+ self.execution_device = execution_device
257
+ self.offload = offload
258
+ self.io_same_device = io_same_device
259
+ self.weights_map = weights_map
260
+ self.offload_buffers = offload_buffers
261
+ self.place_submodules = place_submodules
262
+ self.skip_keys = skip_keys
263
+
264
+ # Will contain the input device when `io_same_device=True`.
265
+ self.input_device = None
266
+ self.param_original_devices = {}
267
+ self.buffer_original_devices = {}
268
+ self.tied_params_names = set()
269
+
270
+ # The hook pre_forward/post_forward need to have knowledge of this dictionary, as with offloading we want to avoid duplicating memory
271
+ # for tied weights already loaded on the target execution device.
272
+ self.tied_params_map = tied_params_map
273
+
274
+ def __repr__(self):
275
+ return (
276
+ f"AlignDevicesHook(execution_device={self.execution_device}, offload={self.offload}, "
277
+ f"io_same_device={self.io_same_device}, offload_buffers={self.offload_buffers}, "
278
+ f"place_submodules={self.place_submodules}, skip_keys={repr(self.skip_keys)})"
279
+ )
280
+
281
+ def init_hook(self, module):
282
+ # In case the AlignDevicesHook is on meta device, ignore tied weights as data_ptr() is then always zero.
283
+ if self.execution_device == "meta" or self.execution_device == torch.device("meta"):
284
+ self.tied_params_map = None
285
+
286
+ if not self.offload and self.execution_device is not None:
287
+ for name, _ in named_module_tensors(module, recurse=self.place_submodules):
288
+ set_module_tensor_to_device(module, name, self.execution_device, tied_params_map=self.tied_params_map)
289
+ elif self.offload:
290
+ self.original_devices = {
291
+ name: param.device for name, param in named_module_tensors(module, recurse=self.place_submodules)
292
+ }
293
+ if self.weights_map is None:
294
+ self.weights_map = {
295
+ name: param.to("cpu")
296
+ for name, param in named_module_tensors(
297
+ module, include_buffers=self.offload_buffers, recurse=self.place_submodules
298
+ )
299
+ }
300
+ for name, _ in named_module_tensors(
301
+ module, include_buffers=self.offload_buffers, recurse=self.place_submodules, remove_non_persistent=True
302
+ ):
303
+ # When using disk offloading, we cannot rely on `weights_map[name].data_ptr()` as the reference pointer,
304
+ # as we have no guarantee that safetensors' `file.get_tensor()` will always give the same pointer.
305
+ # As we have no reliable way to track the shared data pointer of tied weights in this case, we use tied_params_names: List[str]
306
+ # to add on the fly pointers to `tied_params_map` in the pre_forward call.
307
+ if (
308
+ self.tied_params_map is not None
309
+ and recursive_getattr(module, name).data_ptr() in self.tied_params_map
310
+ ):
311
+ self.tied_params_names.add(name)
312
+
313
+ set_module_tensor_to_device(module, name, "meta")
314
+
315
+ if not self.offload_buffers and self.execution_device is not None:
316
+ for name, _ in module.named_buffers(recurse=self.place_submodules):
317
+ set_module_tensor_to_device(
318
+ module, name, self.execution_device, tied_params_map=self.tied_params_map
319
+ )
320
+ elif self.offload_buffers and self.execution_device is not None:
321
+ for name in get_non_persistent_buffers(module, recurse=self.place_submodules):
322
+ set_module_tensor_to_device(
323
+ module, name, self.execution_device, tied_params_map=self.tied_params_map
324
+ )
325
+
326
+ return module
327
+
328
+ def pre_forward(self, module, *args, **kwargs):
329
+ if self.io_same_device:
330
+ self.input_device = find_device([args, kwargs])
331
+ if self.offload:
332
+ self.tied_pointers_to_remove = set()
333
+
334
+ for name, _ in named_module_tensors(
335
+ module,
336
+ include_buffers=self.offload_buffers,
337
+ recurse=self.place_submodules,
338
+ remove_non_persistent=True,
339
+ ):
340
+ fp16_statistics = None
341
+ value = self.weights_map[name]
342
+ if "weight" in name and name.replace("weight", "SCB") in self.weights_map.keys():
343
+ if value.dtype == torch.int8:
344
+ fp16_statistics = self.weights_map[name.replace("weight", "SCB")]
345
+
346
+ # In case we are using offloading with tied weights, we need to keep track of the offloaded weights
347
+ # that are loaded on device at this point, as we will need to remove them as well from the dictionary
348
+ # self.tied_params_map in order to allow to free memory.
349
+ if name in self.tied_params_names and value.data_ptr() not in self.tied_params_map:
350
+ self.tied_params_map[value.data_ptr()] = {}
351
+
352
+ if (
353
+ value is not None
354
+ and self.tied_params_map is not None
355
+ and value.data_ptr() in self.tied_params_map
356
+ and self.execution_device not in self.tied_params_map[value.data_ptr()]
357
+ ):
358
+ self.tied_pointers_to_remove.add((value.data_ptr(), self.execution_device))
359
+
360
+ set_module_tensor_to_device(
361
+ module,
362
+ name,
363
+ self.execution_device,
364
+ value=value,
365
+ fp16_statistics=fp16_statistics,
366
+ tied_params_map=self.tied_params_map,
367
+ )
368
+
369
+ return send_to_device(args, self.execution_device), send_to_device(
370
+ kwargs, self.execution_device, skip_keys=self.skip_keys
371
+ )
372
+
373
+ def post_forward(self, module, output):
374
+ if self.offload:
375
+ for name, _ in named_module_tensors(
376
+ module,
377
+ include_buffers=self.offload_buffers,
378
+ recurse=self.place_submodules,
379
+ remove_non_persistent=True,
380
+ ):
381
+ set_module_tensor_to_device(module, name, "meta")
382
+ if type(module).__name__ == "Linear8bitLt":
383
+ module.state.SCB = None
384
+ module.state.CxB = None
385
+
386
+ # We may have loaded tied weights into self.tied_params_map (avoiding loading them several times in e.g. submodules): remove them from
387
+ # this dictionary to allow the garbage collector to do its job.
388
+ for value_pointer, device in self.tied_pointers_to_remove:
389
+ if isinstance(device, int):
390
+ if is_npu_available():
391
+ device = f"npu:{device}"
392
+ elif is_mlu_available():
393
+ device = f"mlu:{device}"
394
+ elif is_musa_available():
395
+ device = f"musa:{device}"
396
+ if device in self.tied_params_map[value_pointer]:
397
+ del self.tied_params_map[value_pointer][device]
398
+ self.tied_pointers_to_remove = set()
399
+ if self.io_same_device and self.input_device is not None:
400
+ output = send_to_device(output, self.input_device, skip_keys=self.skip_keys)
401
+
402
+ return output
403
+
404
+ def detach_hook(self, module):
405
+ if self.offload:
406
+ for name, device in self.original_devices.items():
407
+ if device != torch.device("meta"):
408
+ set_module_tensor_to_device(module, name, device, value=self.weights_map.get(name, None))
409
+ return module
410
+
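A minimal sketch of the offload path on CPU (toy module; in practice `dispatch_model` wires this up):

import torch
import torch.nn as nn

layer = nn.Linear(2, 2)
weights = {name: param.detach().clone() for name, param in layer.named_parameters()}
hook = AlignDevicesHook(execution_device="cpu", offload=True, weights_map=weights)
add_hook_to_module(layer, hook)

assert layer.weight.device.type == "meta"   # offloaded between calls
_ = layer(torch.randn(1, 2))                # weights restored just for the forward
assert layer.weight.device.type == "meta"   # and offloaded again afterwards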
411
+
412
+ def attach_execution_device_hook(
413
+ module: torch.nn.Module,
414
+ execution_device: Union[int, str, torch.device],
415
+ skip_keys: Optional[Union[str, list[str]]] = None,
416
+ preload_module_classes: Optional[list[str]] = None,
417
+ tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
418
+ ):
419
+ """
420
+ Recursively attaches `AlignDevicesHook` to all submodules of a given model to make sure they have the right
421
+ execution device
422
+
423
+ Args:
424
+ module (`torch.nn.Module`):
425
+ The module where we want to attach the hooks.
426
+ execution_device (`int`, `str` or `torch.device`):
427
+ The device on which inputs and model weights should be placed before the forward pass.
428
+ skip_keys (`str` or `List[str]`, *optional*):
429
+ A list of keys to ignore when moving inputs or outputs between devices.
430
+ preload_module_classes (`List[str]`, *optional*):
431
+ A list of classes whose instances should load all their weights (even in the submodules) at the beginning
432
+ of the forward. This should only be used for classes that have submodules which are registered but not
433
+ called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
434
+ `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
435
+ tied_params_map (Optional[Dict[int, Dict[torch.device, torch.Tensor]]], *optional*, defaults to `None`):
436
+ A map of data pointers to dictionaries of devices to already dispatched tied weights. For a given execution
437
+ device, this parameter is useful to reuse the first available pointer of a shared weight for all others,
438
+ instead of duplicating memory.
439
+ """
440
+ if not hasattr(module, "_hf_hook") and len(module.state_dict()) > 0:
441
+ add_hook_to_module(
442
+ module,
443
+ AlignDevicesHook(execution_device, skip_keys=skip_keys, tied_params_map=tied_params_map),
444
+ )
445
+
446
+ # Break the recursion if we get to a preload module.
447
+ if preload_module_classes is not None and module.__class__.__name__ in preload_module_classes:
448
+ return
449
+
450
+ for child in module.children():
451
+ attach_execution_device_hook(
452
+ child,
453
+ execution_device,
454
+ skip_keys=skip_keys,
455
+ preload_module_classes=preload_module_classes,
456
+ tied_params_map=tied_params_map,
457
+ )
458
+
459
+
460
+ def attach_align_device_hook(
461
+ module: torch.nn.Module,
462
+ execution_device: Optional[torch.device] = None,
463
+ offload: bool = False,
464
+ weights_map: Optional[Mapping] = None,
465
+ offload_buffers: bool = False,
466
+ module_name: str = "",
467
+ skip_keys: Optional[Union[str, list[str]]] = None,
468
+ preload_module_classes: Optional[list[str]] = None,
469
+ tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
470
+ ):
471
+ """
472
+ Recursively attaches `AlignDevicesHook` to all submodules of a given model that have direct parameters and/or
473
+ buffers.
474
+
475
+ Args:
476
+ module (`torch.nn.Module`):
477
+ The module where we want to attach the hooks.
478
+ execution_device (`torch.device`, *optional*):
479
+ The device on which inputs and model weights should be placed before the forward pass.
480
+ offload (`bool`, *optional*, defaults to `False`):
481
+ Whether or not the weights should be offloaded after the forward pass.
482
+ weights_map (`Mapping[str, torch.Tensor]`, *optional*):
483
+ When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
484
+ offload_buffers (`bool`, *optional*, defaults to `False`):
485
+ Whether or not to include the associated module's buffers when offloading.
486
+ module_name (`str`, *optional*, defaults to `""`):
487
+ The name of the module.
488
+ skip_keys (`str` or `List[str]`, *optional*):
489
+ A list of keys to ignore when moving inputs or outputs between devices.
490
+ preload_module_classes (`List[str]`, *optional*):
491
+ A list of classes whose instances should load all their weights (even in the submodules) at the beginning
492
+ of the forward. This should only be used for classes that have submodules which are registered but not
493
+ called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
494
+ `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
495
+ tied_params_map (Optional[Dict[int, Dict[torch.device, torch.Tensor]]], *optional*, defaults to `None`):
496
+ A map of data pointers to dictionaries of devices to already dispatched tied weights. For a given execution
497
+ device, this parameter is useful to reuse the first available pointer of a shared weight for all others,
498
+ instead of duplicating memory.
499
+ """
500
+ # Attach the hook on this module if it has any direct tensor.
501
+ directs = named_module_tensors(module)
502
+ full_offload = (
503
+ offload and preload_module_classes is not None and module.__class__.__name__ in preload_module_classes
504
+ )
505
+
506
+ if len(list(directs)) > 0 or full_offload:
507
+ if weights_map is not None:
508
+ prefix = f"{module_name}." if len(module_name) > 0 else ""
509
+ prefixed_weights_map = PrefixedDataset(weights_map, prefix)
510
+ else:
511
+ prefixed_weights_map = None
512
+ hook = AlignDevicesHook(
513
+ execution_device=execution_device,
514
+ offload=offload,
515
+ weights_map=prefixed_weights_map,
516
+ offload_buffers=offload_buffers,
517
+ place_submodules=full_offload,
518
+ skip_keys=skip_keys,
519
+ tied_params_map=tied_params_map,
520
+ )
521
+ add_hook_to_module(module, hook, append=True)
522
+
523
+ # We stop the recursion in case we hit the full offload.
524
+ if full_offload:
525
+ return
526
+
527
+ # Recurse on all children of the module.
528
+ for child_name, child in module.named_children():
529
+ child_name = f"{module_name}.{child_name}" if len(module_name) > 0 else child_name
530
+ attach_align_device_hook(
531
+ child,
532
+ execution_device=execution_device,
533
+ offload=offload,
534
+ weights_map=weights_map,
535
+ offload_buffers=offload_buffers,
536
+ module_name=child_name,
537
+ preload_module_classes=preload_module_classes,
538
+ skip_keys=skip_keys,
539
+ tied_params_map=tied_params_map,
540
+ )
541
+
542
+
543
+ def remove_hook_from_submodules(module: nn.Module):
544
+ """
545
+ Recursively removes all hooks attached on the submodules of a given model.
546
+
547
+ Args:
548
+ module (`torch.nn.Module`): The module on which to remove all hooks.
549
+ """
550
+ remove_hook_from_module(module)
551
+ for child in module.children():
552
+ remove_hook_from_submodules(child)
553
+
554
+
555
+ def attach_align_device_hook_on_blocks(
556
+ module: nn.Module,
557
+ execution_device: Optional[Union[torch.device, dict[str, torch.device]]] = None,
558
+ offload: Union[bool, dict[str, bool]] = False,
559
+ weights_map: Mapping = None,
560
+ offload_buffers: bool = False,
561
+ module_name: str = "",
562
+ skip_keys: Optional[Union[str, list[str]]] = None,
563
+ preload_module_classes: Optional[list[str]] = None,
564
+ tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
565
+ ):
566
+ """
567
+ Attaches `AlignDevicesHook` to all blocks of a given model as needed.
568
+
569
+ Args:
570
+ module (`torch.nn.Module`):
571
+ The module where we want to attach the hooks.
572
+ execution_device (`torch.device` or `Dict[str, torch.device]`, *optional*):
573
+ The device on which inputs and model weights should be placed before the forward pass. It can be one device
574
+ for the whole module, or a dictionary mapping module name to device.
575
+ offload (`bool`, *optional*, defaults to `False`):
576
+ Whether or not the weights should be offloaded after the forward pass. It can be one boolean for the whole
577
+ module, or a dictionary mapping module name to boolean.
578
+ weights_map (`Mapping[str, torch.Tensor]`, *optional*):
579
+ When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
580
+ offload_buffers (`bool`, *optional*, defaults to `False`):
581
+ Whether or not to include the associated module's buffers when offloading.
582
+ module_name (`str`, *optional*, defaults to `""`):
583
+ The name of the module.
584
+ skip_keys (`str` or `List[str]`, *optional*):
585
+ A list of keys to ignore when moving inputs or outputs between devices.
586
+ preload_module_classes (`List[str]`, *optional*):
587
+ A list of classes whose instances should load all their weights (even in the submodules) at the beginning
588
+ of the forward. This should only be used for classes that have submodules which are registered but not
589
+ called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
590
+ `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
591
+ tied_params_map (Optional[Dict[int, Dict[torch.device, torch.Tensor]]], *optional*, defaults to `None`):
592
+ A map of data pointers to dictionaries of devices to already dispatched tied weights. For a given execution
593
+ device, this parameter is useful to reuse the first available pointer of a shared weight for all others,
594
+ instead of duplicating memory.
595
+ """
596
+ # If one device and one offload, we've got one hook.
597
+ if not isinstance(execution_device, Mapping) and not isinstance(offload, dict):
598
+ if not offload:
599
+ hook = AlignDevicesHook(
600
+ execution_device=execution_device,
601
+ io_same_device=True,
602
+ skip_keys=skip_keys,
603
+ place_submodules=True,
604
+ tied_params_map=tied_params_map,
605
+ )
606
+ add_hook_to_module(module, hook)
607
+ else:
608
+ attach_align_device_hook(
609
+ module,
610
+ execution_device=execution_device,
611
+ offload=True,
612
+ weights_map=weights_map,
613
+ offload_buffers=offload_buffers,
614
+ module_name=module_name,
615
+ skip_keys=skip_keys,
616
+ tied_params_map=tied_params_map,
617
+ )
618
+ return
619
+
620
+ if not isinstance(execution_device, Mapping):
621
+ execution_device = {key: execution_device for key in offload.keys()}
622
+ if not isinstance(offload, Mapping):
623
+ offload = {key: offload for key in execution_device.keys()}
624
+
625
+ if module_name in execution_device and module_name in offload and not offload[module_name]:
626
+ hook = AlignDevicesHook(
627
+ execution_device=execution_device[module_name],
628
+ offload_buffers=offload_buffers,
629
+ io_same_device=(module_name == ""),
630
+ place_submodules=True,
631
+ skip_keys=skip_keys,
632
+ tied_params_map=tied_params_map,
633
+ )
634
+ add_hook_to_module(module, hook)
635
+ attach_execution_device_hook(
636
+ module, execution_device[module_name], skip_keys=skip_keys, tied_params_map=tied_params_map
637
+ )
638
+ elif module_name in execution_device and module_name in offload:
639
+ attach_align_device_hook(
640
+ module,
641
+ execution_device=execution_device[module_name],
642
+ offload=True,
643
+ weights_map=weights_map,
644
+ offload_buffers=offload_buffers,
645
+ module_name=module_name,
646
+ skip_keys=skip_keys,
647
+ preload_module_classes=preload_module_classes,
648
+ tied_params_map=tied_params_map,
649
+ )
650
+ if not hasattr(module, "_hf_hook"):
651
+ hook = AlignDevicesHook(
652
+ execution_device=execution_device[module_name],
653
+ io_same_device=(module_name == ""),
654
+ skip_keys=skip_keys,
655
+ tied_params_map=tied_params_map,
656
+ )
657
+ add_hook_to_module(module, hook)
658
+ attach_execution_device_hook(
659
+ module,
660
+ execution_device[module_name],
661
+ preload_module_classes=preload_module_classes,
662
+ skip_keys=skip_keys,
663
+ tied_params_map=tied_params_map,
664
+ )
665
+ elif module_name == "":
666
+ hook = AlignDevicesHook(
667
+ execution_device=execution_device.get(""),
668
+ io_same_device=True,
669
+ skip_keys=skip_keys,
670
+ tied_params_map=tied_params_map,
671
+ )
672
+ add_hook_to_module(module, hook)
673
+
674
+ for child_name, child in module.named_children():
675
+ child_name = f"{module_name}.{child_name}" if len(module_name) > 0 else child_name
676
+ attach_align_device_hook_on_blocks(
677
+ child,
678
+ execution_device=execution_device,
679
+ offload=offload,
680
+ weights_map=weights_map,
681
+ offload_buffers=offload_buffers,
682
+ module_name=child_name,
683
+ preload_module_classes=preload_module_classes,
684
+ skip_keys=skip_keys,
685
+ tied_params_map=tied_params_map,
686
+ )
687
+
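A hedged sketch of block-level dispatch for a toy two-block model (CPU stand-ins where a real setup would use GPU indices):

import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
# One execution device per top-level block; normally e.g. {"0": 0, "1": 1}.
attach_align_device_hook_on_blocks(
    model,
    execution_device={"0": "cpu", "1": "cpu"},
    offload={"0": False, "1": False},
)
assert hasattr(model[0], "_hf_hook") and hasattr(model[1], "_hf_hook")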
688
+
689
+ class CpuOffload(ModelHook):
690
+ """
691
+ Offloads a model on the CPU until its forward pass is called. The model will not be offloaded back to the CPU after
692
+ the forward, the user needs to call the `init_hook` method again for this.
693
+
694
+ Args:
695
+ execution_device(`str`, `int` or `torch.device`, *optional*):
696
+ The device on which the model should be executed. Will default to the MPS device if it's available, then
697
+ GPU 0 if there is a GPU, and finally to the CPU.
698
+ prev_module_hook (`UserCpuOffloadHook`, *optional*):
699
+ The hook sent back by [`cpu_offload_with_hook`] for a previous model in the pipeline you are running. If
700
+ passed, its offload method will be called just before the forward of the model to which this hook is
701
+ attached.
702
+ """
703
+
704
+ def __init__(
705
+ self,
706
+ execution_device: Optional[Union[str, int, torch.device]] = None,
707
+ prev_module_hook: Optional["UserCpuOffloadHook"] = None,
708
+ ):
709
+ self.prev_module_hook = prev_module_hook
710
+
711
+ self.execution_device = execution_device if execution_device is not None else PartialState().default_device
712
+
713
+ def init_hook(self, module):
714
+ return module.to("cpu")
715
+
716
+ def pre_forward(self, module, *args, **kwargs):
717
+ if self.prev_module_hook is not None and isinstance(self.prev_module_hook, UserCpuOffloadHook):
718
+ prev_module = self.prev_module_hook.model
719
+ prev_device = next(prev_module.parameters()).device
720
+
721
+ # Only offload the previous module if it is not already on CPU.
722
+ if prev_device != torch.device("cpu"):
723
+ self.prev_module_hook.offload()
724
+ clear_device_cache()
725
+
726
+ # If the current device is already the self.execution_device, we can skip the transfer.
727
+ current_device = next(module.parameters()).device
728
+ if current_device == self.execution_device:
729
+ return args, kwargs
730
+
731
+ module.to(self.execution_device)
732
+ return send_to_device(args, self.execution_device), send_to_device(kwargs, self.execution_device)
733
+
734
+
735
+ class UserCpuOffloadHook:
736
+ """
737
+ A simple hook grouping a model and a `ModelHook`, which provides easy APIs to call the init method of the hook
738
+ or remove it entirely.
739
+ """
740
+
741
+ def __init__(self, model, hook):
742
+ self.model = model
743
+ self.hook = hook
744
+
745
+ def offload(self):
746
+ self.hook.init_hook(self.model)
747
+
748
+ def remove(self):
749
+ remove_hook_from_module(self.model)
750
+
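A sketch of chaining offload hooks across a two-stage pipeline, mirroring what `cpu_offload_with_hook` in `big_modeling` builds (device choice hedged to CPU when no accelerator is present):

import torch
import torch.nn as nn

device = "cuda:0" if torch.cuda.is_available() else "cpu"
stage_1, stage_2 = nn.Linear(4, 4), nn.Linear(4, 4)

hook_1 = CpuOffload(execution_device=device)
add_hook_to_module(stage_1, hook_1)
user_hook_1 = UserCpuOffloadHook(stage_1, hook_1)

# stage_2 offloads stage_1 back to CPU just before its own forward.
hook_2 = CpuOffload(execution_device=device, prev_module_hook=user_hook_1)
add_hook_to_module(stage_2, hook_2)

out = stage_2(stage_1(torch.randn(1, 4)))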
751
+
752
+ class LayerwiseCastingHook(ModelHook):
753
+ r"""
754
+ A hook that casts the weights of a module to a high precision dtype for computation, and to a low precision dtype
755
+ for storage. This process may lead to quality loss in the output, but can significantly reduce the memory
756
+ footprint.
757
+ """
758
+
759
+ _is_stateful = False
760
+
761
+ def __init__(self, storage_dtype: torch.dtype, compute_dtype: torch.dtype, non_blocking: bool) -> None:
762
+ self.storage_dtype = storage_dtype
763
+ self.compute_dtype = compute_dtype
764
+ self.non_blocking = non_blocking
765
+
766
+ def init_hook(self, module: torch.nn.Module):
767
+ module.to(dtype=self.storage_dtype, non_blocking=self.non_blocking)
768
+ return module
769
+
770
+ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
771
+ module.to(dtype=self.compute_dtype, non_blocking=self.non_blocking)
772
+ return args, kwargs
773
+
774
+ def post_forward(self, module: torch.nn.Module, output):
775
+ module.to(dtype=self.storage_dtype, non_blocking=self.non_blocking)
776
+ return output
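A small sketch of layerwise casting (bfloat16 storage, float32 compute; illustrative only):

import torch
import torch.nn as nn

layer = nn.Linear(8, 8)
hook = LayerwiseCastingHook(storage_dtype=torch.bfloat16, compute_dtype=torch.float32, non_blocking=False)
add_hook_to_module(layer, hook)

assert layer.weight.dtype == torch.bfloat16   # stored in low precision
_ = layer(torch.randn(2, 8))                  # computed in float32
assert layer.weight.dtype == torch.bfloat16   # cast back after the forward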
pythonProject/.venv/Lib/site-packages/accelerate/inference.py ADDED
@@ -0,0 +1,184 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import math
15
+ from types import MethodType
16
+ from typing import Any, Optional, Union
17
+
18
+ from .state import PartialState
19
+ from .utils import (
20
+ calculate_maximum_sizes,
21
+ convert_bytes,
22
+ copy_tensor_to_devices,
23
+ ignorant_find_batch_size,
24
+ infer_auto_device_map,
25
+ is_pippy_available,
26
+ pad_input_tensors,
27
+ send_to_device,
28
+ )
29
+
30
+
31
+ def generate_device_map(model, num_processes: int = 1, no_split_module_classes=None, max_memory: dict = None):
32
+ """
33
+ Calculates the device map for `model` with an offset for PiPPy
34
+ """
35
+ if num_processes == 1:
36
+ return infer_auto_device_map(model, no_split_module_classes=no_split_module_classes, clean_result=False)
37
+ if max_memory is None:
38
+ model_size, shared = calculate_maximum_sizes(model)
39
+
40
+ # Split into `n` chunks for each GPU
41
+ memory = (model_size + shared[0]) / num_processes
42
+ memory = convert_bytes(memory)
43
+ value, ending = memory.split(" ")
44
+
45
+ # Add a chunk to deal with potential extra shared memory instances
46
+ memory = math.ceil(float(value)) * 1.1
47
+ memory = f"{memory} {ending}"
48
+ max_memory = {i: memory for i in range(num_processes)}
49
+ device_map = infer_auto_device_map(
50
+ model,
51
+ max_memory=max_memory,
52
+ no_split_module_classes=no_split_module_classes,
53
+ clean_result=False,
54
+ )
55
+ return device_map
56
+
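A worked sketch of the per-rank budget computed above (toy byte counts; the `convert_bytes` string handling is approximated):

import math

model_size, shared = 12 * 2**30, (0, [])                # 12 GiB of weights, none shared
num_processes = 4
per_rank = (model_size + shared[0]) / num_processes     # 3 GiB per rank
budget = math.ceil(per_rank / 2**30) * 1.1              # ~"3.3 GB" after the headroom factor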
57
+
58
+ def find_pippy_batch_size(args, kwargs):
59
+ found_batch_size = None
60
+ if args is not None:
61
+ for arg in args:
62
+ found_batch_size = ignorant_find_batch_size(arg)
63
+ if found_batch_size is not None:
64
+ break
65
+ if kwargs is not None and found_batch_size is None:
66
+ for kwarg in kwargs.values():
67
+ found_batch_size = ignorant_find_batch_size(kwarg)
68
+ if found_batch_size is not None:
69
+ break
70
+ return found_batch_size
71
+
72
+
73
+ def build_pipeline(model, split_points, args, kwargs, num_chunks):
74
+ """
75
+ Attaches the split points to the model based on `self.device_map` and generates a `PipelineStage`. Requires passing
76
+ in the `args` and `kwargs` the model expects, placed on the CPU.
77
+
78
+ Users can pass in a custom `num_chunks` as an optional hyperparameter. By default it will use
79
+ `AcceleratorState.num_processes`
80
+ """
81
+ # Note: We import here to reduce import time from general modules, and isolate outside dependencies
82
+ from torch.distributed.pipelining import ScheduleGPipe, SplitPoint, pipeline
83
+
84
+ # We need to annotate the split points in the model for PiPPy
85
+ state = PartialState()
86
+ split_spec = {split_point: SplitPoint.BEGINNING for split_point in split_points}
87
+ pipe = pipeline(
88
+ model,
89
+ mb_args=args,
90
+ mb_kwargs=kwargs,
91
+ split_spec=split_spec,
92
+ )
93
+ stage = pipe.build_stage(state.local_process_index, device=state.device)
94
+ schedule = ScheduleGPipe(stage, num_chunks)
95
+
96
+ return schedule
97
+
98
+
99
+ def pippy_forward(forward, num_chunks, gather_output, *args, **kwargs):
100
+ state = PartialState()
101
+ output = None
102
+
103
+ if state.num_processes == 1:
104
+ output = forward(*args, **kwargs)
105
+ elif state.is_local_main_process:
106
+ found_batch_size = find_pippy_batch_size(args, kwargs)
107
+ if found_batch_size is None:
108
+ raise ValueError("Could not find batch size from args or kwargs")
109
+ else:
110
+ if found_batch_size != num_chunks:
111
+ args = pad_input_tensors(args, found_batch_size, num_chunks)
112
+ kwargs = pad_input_tensors(kwargs, found_batch_size, num_chunks)
113
+ forward(*args, **kwargs)
114
+ elif state.is_last_process:
115
+ output = forward()
116
+ else:
117
+ forward()
118
+ if gather_output:
119
+ # Each node will get a copy of the full output which is only on the last GPU
120
+ output = copy_tensor_to_devices(output)
121
+ return output
122
+
123
+
124
+ def prepare_pippy(
125
+ model,
126
+ split_points: Optional[Union[str, list[str]]] = "auto",
127
+ no_split_module_classes: Optional[list[str]] = None,
128
+ example_args: Optional[tuple[Any]] = (),
129
+ example_kwargs: Optional[dict[str, Any]] = None,
130
+ num_chunks: Optional[int] = None,
131
+ gather_output: Optional[bool] = False,
132
+ ):
133
+ """
134
+ Wraps `model` for pipeline parallel inference.
135
+
136
+ Args:
137
+ model (`torch.nn.Module`):
138
+ A model we want to split for pipeline-parallel inference
139
+ split_points (`str` or `List[str]`, defaults to 'auto'):
140
+ How to generate the split points and chunk the model across each GPU. 'auto' will find the best balanced
141
+ split given any model. Otherwise, it should be a list of layer names in the model to split by.
142
+ no_split_module_classes (`List[str]`):
143
+ A list of class names for layers we don't want to be split.
144
+ example_args (tuple of model inputs):
145
+ The expected inputs for the model that uses order-based inputs for a *single process*. Recommended to use
146
+ this method if possible.
147
+ example_kwargs (dict of model inputs):
148
+ The expected inputs for the model that uses dictionary-based inputs for a *single process*. This is a
149
+ *highly* limiting structure that requires the same keys be present at *all* inference calls. Not
150
+ recommended unless the prior condition is true for all cases.
151
+ num_chunks (`int`, defaults to the number of available GPUs):
152
+ The number of different stages the Pipeline will have. By default it will assign one chunk per GPU, but
153
+ this can be tuned and played with. In general one should have num_chunks >= num_gpus.
154
+ gather_output (`bool`, defaults to `False`):
155
+ If `True`, the output from the last GPU (which holds the true outputs) is sent across to all GPUs.
156
+ """
157
+ if not is_pippy_available():
158
+ raise ImportError("Using `torch.distributed.pipelining` requires PyTorch 2.4.0 or later.")
159
+ state = PartialState()
160
+ example_args = send_to_device(example_args, "cpu")
161
+ example_kwargs = send_to_device(example_kwargs, "cpu")
162
+ if num_chunks is None:
163
+ num_chunks = state.num_processes
164
+ if split_points == "auto":
165
+ device_map = generate_device_map(model, num_chunks, no_split_module_classes=no_split_module_classes)
166
+ split_points = []
167
+ for i in range(1, num_chunks):
168
+ split_points.append(next(k for k, v in device_map.items() if v == i))
169
+ model.hf_split_points = split_points
170
+ stage = build_pipeline(model, split_points, example_args, example_kwargs, num_chunks)
171
+ model._original_forward = model.forward
172
+ model._original_call = model.__call__
173
+ model.pippy_stage = stage
174
+ model.hf_split_points = split_points
175
+
176
+ def forward(*args, **kwargs):
177
+ return pippy_forward(stage.step, num_chunks, gather_output, *args, **kwargs)
178
+
179
+ # To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
180
+ # Note: creates an infinite recursion loop with `generate`
181
+ model_forward = MethodType(forward, model)
182
+ forward.__wrapped__ = model_forward
183
+ model.forward = forward
184
+ return model
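A hedged usage sketch of `prepare_pippy` (model name and shapes are illustrative; assumes a multi-process launch, e.g. via `accelerate launch`):

import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
# The example batch is split into microbatches across the pipeline stages.
example_input = torch.randint(0, 1000, (2, 128))
model = prepare_pippy(model, example_args=(example_input,), gather_output=True)

with torch.no_grad():
    output = model(example_input)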
pythonProject/.venv/Lib/site-packages/accelerate/launchers.py ADDED
@@ -0,0 +1,306 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import sys
17
+ import tempfile
18
+
19
+ import torch
20
+
21
+ from .state import AcceleratorState, PartialState
22
+ from .utils import (
23
+ PrecisionType,
24
+ PrepareForLaunch,
25
+ are_libraries_initialized,
26
+ check_cuda_p2p_ib_support,
27
+ get_gpu_info,
28
+ is_mps_available,
29
+ is_torch_version,
30
+ patch_environment,
31
+ )
32
+ from .utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION
33
+
34
+
35
+ def test_launch():
36
+ "Verify a `PartialState` can be initialized."
37
+ _ = PartialState()
38
+
39
+
40
+ def notebook_launcher(
41
+ function,
42
+ args=(),
43
+ num_processes=None,
44
+ mixed_precision="no",
45
+ use_port="29500",
46
+ master_addr="127.0.0.1",
47
+ node_rank=0,
48
+ num_nodes=1,
49
+ rdzv_backend="static",
50
+ rdzv_endpoint="",
51
+ rdzv_conf=None,
52
+ rdzv_id="none",
53
+ max_restarts=0,
54
+ monitor_interval=0.1,
55
+ log_line_prefix_template=None,
56
+ ):
57
+ """
58
+ Launches a training function, using several processes or multiple nodes if it's possible in the current environment
59
+ (TPU with multiple cores for instance).
60
+
61
+ <Tip warning={true}>
62
+
63
+ To use this function, absolutely no calls to a device must have been made in the notebook session before calling it. If any
64
+ have been made, you will need to restart the notebook and make sure no cells use any device capability.
65
+
66
+ Setting `ACCELERATE_DEBUG_MODE="1"` in your environment will run a test before truly launching to ensure that none
67
+ of those calls have been made.
68
+
69
+ </Tip>
70
+
71
+ Args:
72
+ function (`Callable`):
73
+ The training function to execute. If it accepts arguments, the first argument should be the index of the
74
+ process run.
75
+ args (`Tuple`):
76
+ Tuple of arguments to pass to the function (it will receive `*args`).
77
+ num_processes (`int`, *optional*):
78
+ The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
79
+ the number of devices available otherwise.
80
+ mixed_precision (`str`, *optional*, defaults to `"no"`):
81
+ If `fp16` or `bf16`, will use mixed precision training on multi-device.
82
+ use_port (`str`, *optional*, defaults to `"29500"`):
83
+ The port to use to communicate between processes when launching a multi-device training.
84
+ master_addr (`str`, *optional*, defaults to `"127.0.0.1"`):
85
+ The address to use for communication between processes.
86
+ node_rank (`int`, *optional*, defaults to 0):
87
+ The rank of the current node.
88
+ num_nodes (`int`, *optional*, defaults to 1):
89
+ The number of nodes to use for training.
90
+ rdzv_backend (`str`, *optional*, defaults to `"static"`):
91
+ The rendezvous method to use, such as 'static' (the default) or 'c10d'
92
+ rdzv_endpoint (`str`, *optional*, defaults to `""`):
93
+ The endpoint of the rdzv sync. storage.
94
+ rdzv_conf (`Dict`, *optional*, defaults to `None`):
95
+ Additional rendezvous configuration.
96
+ rdzv_id (`str`, *optional*, defaults to `"none"`):
97
+ The unique run id of the job.
98
+ max_restarts (`int`, *optional*, defaults to 0):
99
+ The maximum number of restarts that the elastic agent will conduct on workers before failure.
100
+ monitor_interval (`float`, *optional*, defaults to 0.1):
101
+ The interval in seconds that is used by the elastic_agent as a period of monitoring workers.
102
+ log_line_prefix_template (`str`, *optional*, defaults to `None`):
103
+ The prefix template for elastic launch logging. Available from PyTorch 2.2.0.
104
+
105
+ Example:
106
+
107
+ ```python
108
+ # Assume this is defined in a Jupyter Notebook on an instance with two devices
109
+ from accelerate import notebook_launcher
110
+
111
+
112
+ def train(*args):
113
+ # Your training function here
114
+ ...
115
+
116
+
117
+ notebook_launcher(train, args=(arg1, arg2), num_processes=2, mixed_precision="fp16")
118
+ ```
119
+ """
120
+ # Are we in a google colab or a Kaggle Kernel?
121
+ in_colab = False
122
+ in_kaggle = False
123
+ if any(key.startswith("KAGGLE") for key in os.environ.keys()):
124
+ in_kaggle = True
125
+ elif "IPython" in sys.modules:
126
+ in_colab = "google.colab" in str(sys.modules["IPython"].get_ipython())
127
+
128
+ try:
129
+ mixed_precision = PrecisionType(mixed_precision.lower())
130
+ except ValueError:
131
+ raise ValueError(
132
+ f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
133
+ )
134
+
135
+ if (in_colab or in_kaggle) and (
136
+ (os.environ.get("TPU_NAME", None) is not None) or (os.environ.get("PJRT_DEVICE", "") == "TPU")
137
+ ):
138
+ # TPU launch
139
+ import torch_xla.distributed.xla_multiprocessing as xmp
140
+
141
+ if len(AcceleratorState._shared_state) > 0:
142
+ raise ValueError(
143
+ "To train on TPU in Colab or Kaggle Kernel, the `Accelerator` should only be initialized inside "
144
+ "your training function. Restart your notebook and make sure no cells initializes an "
145
+ "`Accelerator`."
146
+ )
147
+
148
+ launcher = PrepareForLaunch(function, distributed_type="XLA")
149
+ print("Launching a training on TPU cores.")
150
+ xmp.spawn(launcher, args=args, start_method="fork")
151
+ elif in_colab and get_gpu_info()[1] < 2:
152
+ # No need for a distributed launch otherwise as it's either CPU or one GPU.
153
+ if torch.cuda.is_available():
154
+ print("Launching training on one GPU.")
155
+ else:
156
+ print("Launching training on one CPU.")
157
+ function(*args)
158
+ else:
159
+ if num_processes is None:
160
+ raise ValueError(
161
+ "You have to specify the number of devices you would like to use, add `num_processes=...` to your call."
162
+ )
163
+ if node_rank >= num_nodes:
164
+ raise ValueError("The node_rank must be less than the number of nodes.")
165
+ if num_processes > 1:
166
+ # Multi-device launch
167
+ from torch.distributed.launcher.api import LaunchConfig, elastic_launch
168
+ from torch.multiprocessing import start_processes
169
+ from torch.multiprocessing.spawn import ProcessRaisedException
170
+
171
+ if len(AcceleratorState._shared_state) > 0:
172
+ raise ValueError(
173
+ "To launch a multi-device training from your notebook, the `Accelerator` should only be initialized "
174
+ "inside your training function. Restart your notebook and make sure no cells initializes an "
175
+ "`Accelerator`."
176
+ )
177
+ # Check for specific libraries known to initialize device that users constantly use
178
+ problematic_imports = are_libraries_initialized("bitsandbytes")
179
+ if len(problematic_imports) > 0:
180
+ err = (
181
+ "Could not start distributed process. Libraries known to initialize device upon import have been "
182
+ "imported already. Please keep these imports inside your training function to try and help with this:"
183
+ )
184
+ for lib_name in problematic_imports:
185
+ err += f"\n\t* `{lib_name}`"
186
+ raise RuntimeError(err)
187
+
188
+ patched_env = dict(
189
+ nproc=num_processes,
190
+ node_rank=node_rank,
191
+ world_size=num_nodes * num_processes,
192
+ master_addr=master_addr,
193
+ master_port=use_port,
194
+ mixed_precision=mixed_precision,
195
+ )
196
+
197
+ # Check for CUDA P2P and IB issues
198
+ if not check_cuda_p2p_ib_support():
199
+ patched_env["nccl_p2p_disable"] = "1"
200
+ patched_env["nccl_ib_disable"] = "1"
201
+
202
+ # torch.distributed will expect a few environment variable to be here. We set the ones common to each
203
+ # process here (the other ones will be set be the launcher).
204
+ with patch_environment(**patched_env):
205
+ # First dummy launch
206
+ device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
207
+ distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
208
+ if os.environ.get("ACCELERATE_DEBUG_MODE", "false").lower() == "true":
209
+ launcher = PrepareForLaunch(test_launch, distributed_type=distributed_type)
210
+ try:
211
+ start_processes(launcher, args=(), nprocs=num_processes, start_method="fork")
212
+ except ProcessRaisedException as e:
213
+ err = "An issue was found when verifying a stable environment for the notebook launcher."
214
+ if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
215
+ raise RuntimeError(
216
+ f"{err}"
217
+ "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
218
+ "Please review your imports and test them when running the `notebook_launcher()` to identify "
219
+ f"which one is problematic and causing {device_type.upper()} to be initialized."
220
+ ) from e
221
+ else:
222
+ raise RuntimeError(f"{err} The following error was raised: {e}") from e
223
+ # Now the actual launch
224
+ launcher = PrepareForLaunch(function, distributed_type=distributed_type)
225
+ print(f"Launching training on {num_processes} {device_type.upper()}s.")
226
+ try:
227
+ if rdzv_conf is None:
228
+ rdzv_conf = {}
229
+ if rdzv_backend == "static":
230
+ rdzv_conf["rank"] = node_rank
231
+ if not rdzv_endpoint:
232
+ rdzv_endpoint = f"{master_addr}:{use_port}"
233
+ launch_config_kwargs = dict(
234
+ min_nodes=num_nodes,
235
+ max_nodes=num_nodes,
236
+ nproc_per_node=num_processes,
237
+ run_id=rdzv_id,
238
+ rdzv_endpoint=rdzv_endpoint,
239
+ rdzv_backend=rdzv_backend,
240
+ rdzv_configs=rdzv_conf,
241
+ max_restarts=max_restarts,
242
+ monitor_interval=monitor_interval,
243
+ start_method="fork",
244
+ )
245
+ if is_torch_version(">=", ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION):
246
+ launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
247
+ elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
248
+ except ProcessRaisedException as e:
249
+ if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
250
+ raise RuntimeError(
251
+ f"{device_type.upper()} has been initialized before the `notebook_launcher` could create a forked subprocess. "
252
+ "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
253
+ "Please review your imports and test them when running the `notebook_launcher()` to identify "
254
+ f"which one is problematic and causing {device_type.upper()} to be initialized."
255
+ ) from e
256
+ else:
257
+ raise RuntimeError(f"An issue was found when launching the training: {e}") from e
258
+
259
+ else:
260
+ # No need for a distributed launch otherwise as it's either CPU, GPU, XPU or MPS.
261
+ if is_mps_available():
262
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
263
+ print("Launching training on MPS.")
264
+ elif torch.cuda.is_available():
265
+ print("Launching training on one GPU.")
266
+ elif torch.xpu.is_available():
267
+ print("Launching training on one XPU.")
268
+ else:
269
+ print("Launching training on CPU.")
270
+ function(*args)
271
+
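For orientation, a minimal sketch of driving `notebook_launcher` from a notebook cell; the `training_loop` body, its learning-rate argument, and the two-process count are illustrative assumptions, not part of this file:

```python
# Hypothetical notebook cell; num_processes=2 assumes a machine with two GPUs.
from accelerate import Accelerator, notebook_launcher


def training_loop(lr):
    # The Accelerator must be created *inside* the launched function,
    # per the AcceleratorState._shared_state guard above.
    accelerator = Accelerator(mixed_precision="fp16")
    accelerator.print(f"rank {accelerator.process_index} on {accelerator.device}, lr={lr}")


notebook_launcher(training_loop, args=(1e-3,), num_processes=2)
```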
+
+ def debug_launcher(function, args=(), num_processes=2):
+     """
+     Launches a training function using several processes on CPU for debugging purposes.
+
+     <Tip warning={true}>
+
+     This function is provided for internal testing and debugging, but it's not intended for real trainings. It will
+     only use the CPU.
+
+     </Tip>
+
+     Args:
+         function (`Callable`):
+             The training function to execute.
+         args (`Tuple`):
+             Tuple of arguments to pass to the function (it will receive `*args`).
+         num_processes (`int`, *optional*, defaults to 2):
+             The number of processes to use for training.
+     """
+     from torch.multiprocessing import start_processes
+
+     with tempfile.NamedTemporaryFile() as tmp_file:
+         # torch.distributed will expect a few environment variables to be here. We set the ones common to each
+         # process here (the other ones will be set by the launcher).
+         with patch_environment(
+             world_size=num_processes,
+             master_addr="127.0.0.1",
+             master_port="29500",
+             accelerate_mixed_precision="no",
+             accelerate_debug_rdv_file=tmp_file.name,
+             accelerate_use_cpu="yes",
+         ):
+             launcher = PrepareForLaunch(function, debug=True)
+             start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
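`debug_launcher` can be exercised the same way from a plain script; a minimal sketch (the `check_ranks` function is illustrative), which forks two CPU processes regardless of available accelerators:

```python
from accelerate import Accelerator, debug_launcher


def check_ranks():
    # The launcher exports ACCELERATE_USE_CPU=yes, so this runs on CPU.
    accelerator = Accelerator()
    accelerator.print(f"{accelerator.num_processes} CPU processes launched")


debug_launcher(check_ranks, num_processes=2)
```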
pythonProject/.venv/Lib/site-packages/accelerate/local_sgd.py ADDED
@@ -0,0 +1,106 @@
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import torch
+
+ from accelerate import Accelerator, DistributedType
+
+
+ class LocalSGD:
+     """
+     A helper class to support local SGD on top of Accelerator. It simply runs a given number of updates independently
+     on each device, and averages model weights once every `local_sgd_steps` steps.
+
+     It should be used only in the multi-GPU (or multi-CPU) setup without extensions such as DeepSpeed. In particular,
+     this is a simple implementation that cannot support scenarios such as model parallelism.
+
+
+     Although we are not aware of the true origins of this simple approach, the idea of local SGD is quite old and goes
+     back to at least:
+
+     Zhang, J., De Sa, C., Mitliagkas, I., & Ré, C. (2016). [Parallel SGD: When does averaging help?. arXiv preprint
+     arXiv:1606.07365.](https://arxiv.org/abs/1606.07365)
+
+     We credit the term Local SGD to the following paper (but there might be earlier references we are not aware of).
+
+     Stich, Sebastian Urban. ["Local SGD Converges Fast and Communicates Little." ICLR 2019-International Conference on
+     Learning Representations. No. CONF. 2019.](https://arxiv.org/abs/1805.09767)
+     """
+
+     def __enter__(self):
+         if self.enabled:
+             self.model_sync_obj = self.model.no_sync()
+             self.model_sync_obj.__enter__()
+
+         return self
+
+     def __exit__(self, type, value, tb):
+         if self.enabled:
+             # Average all models on exit
+             self._sync_and_avg_model_params()
+             self.model_sync_obj.__exit__(type, value, tb)
+
+     def __init__(self, accelerator: Accelerator, model: torch.nn.Module, local_sgd_steps: int, enabled: bool = True):
+         """
+         Constructor.
+
+         Args:
+             model (`torch.nn.Module`):
+                 The model whose parameters we need to average.
+             accelerator (`Accelerator`):
+                 Accelerator object.
+             local_sgd_steps (`int`):
+                 The number of local SGD steps (before model parameters are synchronized).
+             enabled (`bool`):
+                 Local SGD is disabled if this parameter is set to `False`.
+         """
+         if accelerator.distributed_type not in [
+             DistributedType.NO,
+             DistributedType.MULTI_CPU,
+             DistributedType.MULTI_GPU,
+             DistributedType.MULTI_XPU,
+             DistributedType.MULTI_MLU,
+             DistributedType.MULTI_HPU,
+             DistributedType.MULTI_SDAA,
+             DistributedType.MULTI_MUSA,
+             DistributedType.MULTI_NPU,
+         ]:
+             raise NotImplementedError("LocalSGD is supported only for CPUs and GPUs (no DeepSpeed or MegatronLM)")
+         self.enabled = enabled and accelerator.distributed_type != DistributedType.NO
+         self.num_steps = 0
+         if self.enabled:
+             self.accelerator = accelerator
+             self.model = model
+             self.local_sgd_steps = local_sgd_steps
+
+     def step(self):
+         """
+         This function makes a "step" and synchronizes model parameters if necessary.
+         """
+         self.num_steps += 1
+         if not self.enabled:
+             return
+
+         if self.num_steps % self.local_sgd_steps == 0:
+             self._sync_and_avg_model_params()
+
+     def _sync_and_avg_model_params(self):
+         """
+         Synchronize + Average model parameters across all GPUs
+         """
+
+         self.accelerator.wait_for_everyone()
+         with self.accelerator.autocast():
+             for param in self.model.parameters():
+                 param.data = self.accelerator.reduce(param.data, reduction="mean")
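A minimal runnable sketch of the intended loop, assuming a toy model and dataset (all names below are illustrative); on a single process `LocalSGD` silently disables itself, while under multi-GPU DDP every fourth `step()` averages parameters across ranks:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator
from accelerate.local_sgd import LocalSGD

accelerator = Accelerator()
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataloader = DataLoader(TensorDataset(torch.randn(64, 4), torch.randn(64, 1)), batch_size=8)
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

with LocalSGD(accelerator=accelerator, model=model, local_sgd_steps=4) as local_sgd:
    for x, y in dataloader:
        loss = torch.nn.functional.mse_loss(model(x), y)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        local_sgd.step()  # every 4th call triggers _sync_and_avg_model_params()
```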
pythonProject/.venv/Lib/site-packages/accelerate/logging.py ADDED
@@ -0,0 +1,125 @@
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import functools
+ import logging
+ import os
+
+ from .state import PartialState
+
+
+ class MultiProcessAdapter(logging.LoggerAdapter):
+     """
+     An adapter to assist with logging in multiprocess.
+
+     `log` takes in an additional `main_process_only` kwarg, which dictates whether it should be called on all processes
+     or only the main executed one. Default is `main_process_only=True`.
+
+     Does not require an `Accelerator` object to be created first.
+     """
+
+     @staticmethod
+     def _should_log(main_process_only):
+         "Check if log should be performed"
+         state = PartialState()
+         return not main_process_only or (main_process_only and state.is_main_process)
+
+     def log(self, level, msg, *args, **kwargs):
+         """
+         Delegates logger call after checking if we should log.
+
+         Accepts a new kwarg of `main_process_only`, which will dictate whether it will be logged across all processes
+         or only the main executed one. Default is `True` if not passed.
+
+         Also accepts "in_order", which if `True` makes the processes log one by one, in order. This is much easier to
+         read, but comes at the cost of sometimes needing to wait for the other processes. Default is `False` to not
+         break with the previous behavior.
+
+         `in_order` is ignored if `main_process_only` is passed.
+         """
+         if PartialState._shared_state == {}:
+             raise RuntimeError(
+                 "You must initialize the accelerate state by calling either `PartialState()` or `Accelerator()` before using the logging utility."
+             )
+         main_process_only = kwargs.pop("main_process_only", True)
+         in_order = kwargs.pop("in_order", False)
+         # set `stacklevel` to exclude ourself in `Logger.findCaller()` while respecting user's choice
+         kwargs.setdefault("stacklevel", 2)
+
+         if self.isEnabledFor(level):
+             if self._should_log(main_process_only):
+                 msg, kwargs = self.process(msg, kwargs)
+                 self.logger.log(level, msg, *args, **kwargs)
+
+             elif in_order:
+                 state = PartialState()
+                 for i in range(state.num_processes):
+                     if i == state.process_index:
+                         msg, kwargs = self.process(msg, kwargs)
+                         self.logger.log(level, msg, *args, **kwargs)
+                     state.wait_for_everyone()
+
+     @functools.lru_cache(None)
+     def warning_once(self, *args, **kwargs):
+         """
+         This method is identical to `logger.warning()`, but will emit the warning with the same message only once.
+
+         Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the
+         cache. The assumption here is that all warning messages are unique across the code. If they aren't, we would
+         need to switch to another type of cache that includes the caller frame information in the hashing function.
+         """
+         self.warning(*args, **kwargs)
+
+
+ def get_logger(name: str, log_level: str = None):
+     """
+     Returns a `logging.Logger` for `name` that can handle multiprocessing.
+
+     If a log should be called on all processes, pass `main_process_only=False`. If a log should be called on all
+     processes and in order, also pass `in_order=True`.
+
+     Args:
+         name (`str`):
+             The name for the logger, such as `__file__`
+         log_level (`str`, *optional*):
+             The log level to use. If not passed, will default to the `ACCELERATE_LOG_LEVEL` environment variable; if
+             that is also unset, the logger's level is left unchanged.
+
+     Example:
+
+     ```python
+     >>> from accelerate.logging import get_logger
+     >>> from accelerate import Accelerator
+
+     >>> logger = get_logger(__name__)
+
+     >>> accelerator = Accelerator()
+     >>> logger.info("My log", main_process_only=False)
+     >>> logger.debug("My log", main_process_only=True)
+
+     >>> logger = get_logger(__name__, log_level="DEBUG")
+     >>> logger.info("My log")
+     >>> logger.debug("My second log")
+
+     >>> array = ["a", "b", "c", "d"]
+     >>> letter_at_rank = array[accelerator.process_index]
+     >>> logger.info(letter_at_rank, in_order=True)
+     ```
+     """
+     if log_level is None:
+         log_level = os.environ.get("ACCELERATE_LOG_LEVEL", None)
+     logger = logging.getLogger(name)
+     if log_level is not None:
+         logger.setLevel(log_level.upper())
+         logger.root.setLevel(log_level.upper())
+     return MultiProcessAdapter(logger, {})
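A short sketch of typical usage; note that the accelerate state must be initialized (here via `Accelerator()`) before the first log call, as enforced in `log()` above:

```python
from accelerate import Accelerator
from accelerate.logging import get_logger

accelerator = Accelerator()  # initializes PartialState, required by MultiProcessAdapter.log
logger = get_logger(__name__, log_level="INFO")

logger.info("visible on the main process only")
logger.info("visible on every process", main_process_only=False)
logger.warning_once("emitted once per unique message")
```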
pythonProject/.venv/Lib/site-packages/accelerate/memory_utils.py ADDED
@@ -0,0 +1,22 @@
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import warnings
+
+
+ warnings.warn(
+     "memory_utils has been reorganized to utils.memory. Import `find_executable_batch_size` from the main `__init__`: "
+     "`from accelerate import find_executable_batch_size` to avoid this warning.",
+     FutureWarning,
+ )
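The replacement import the warning points at works as a decorator; a minimal sketch (the `train` body is illustrative):

```python
from accelerate import find_executable_batch_size


@find_executable_batch_size(starting_batch_size=128)
def train(batch_size):
    # On an out-of-memory error the decorator halves batch_size and retries.
    print(f"Trying batch size {batch_size}")


train()  # called without arguments; the decorator injects batch_size
```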
pythonProject/.venv/Lib/site-packages/accelerate/optimizer.py ADDED
@@ -0,0 +1,213 @@
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+
+ import torch
+
+ from .state import AcceleratorState, GradientState
+ from .utils import DistributedType, honor_type, is_lomo_available, is_torch_xla_available
+
+
+ if is_torch_xla_available():
+     import torch_xla.core.xla_model as xm
+     import torch_xla.runtime as xr
+
+
+ def move_to_device(state, device):
+     if isinstance(state, (list, tuple)):
+         return honor_type(state, (move_to_device(t, device) for t in state))
+     elif isinstance(state, dict):
+         return type(state)({k: move_to_device(v, device) for k, v in state.items()})
+     elif isinstance(state, torch.Tensor):
+         return state.to(device)
+     return state
+
+
+ class AcceleratedOptimizer(torch.optim.Optimizer):
+     """
+     Internal wrapper around a torch optimizer.
+
+     Conditionally will perform `step` and `zero_grad` if gradients should be synchronized when performing gradient
+     accumulation.
+
+     Args:
+         optimizer (`torch.optim.optimizer.Optimizer`):
+             The optimizer to wrap.
+         device_placement (`bool`, *optional*, defaults to `True`):
+             Whether or not the optimizer should handle device placement. If so, it will place the state dictionary of
+             `optimizer` on the right device.
+         scaler (`torch.amp.GradScaler` or `torch.cuda.amp.GradScaler`, *optional*):
+             The scaler to use in the step function if training with mixed precision.
+     """
+
+     def __init__(self, optimizer, device_placement=True, scaler=None):
+         self.optimizer = optimizer
+         self.scaler = scaler
+         self.accelerator_state = AcceleratorState()
+         self.gradient_state = GradientState()
+         self.device_placement = device_placement
+         self._is_overflow = False
+
+         if self.scaler is not None:
+             self._accelerate_step_called = False
+             self._optimizer_original_step_method = self.optimizer.step
+             self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)
+
+         # Handle device placement
+         if device_placement:
+             state_dict = self.optimizer.state_dict()
+             if self.accelerator_state.distributed_type == DistributedType.XLA:
+                 xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
+             else:
+                 state_dict = move_to_device(state_dict, self.accelerator_state.device)
+             self.optimizer.load_state_dict(state_dict)
+
+     @property
+     def state(self):
+         return self.optimizer.state
+
+     @state.setter
+     def state(self, state):
+         self.optimizer.state = state
+
+     @property
+     def param_groups(self):
+         return self.optimizer.param_groups
+
+     @param_groups.setter
+     def param_groups(self, param_groups):
+         self.optimizer.param_groups = param_groups
+
+     @property
+     def defaults(self):
+         return self.optimizer.defaults
+
+     @defaults.setter
+     def defaults(self, defaults):
+         self.optimizer.defaults = defaults
+
+     def add_param_group(self, param_group):
+         self.optimizer.add_param_group(param_group)
+
+     def load_state_dict(self, state_dict):
+         if self.accelerator_state.distributed_type == DistributedType.XLA and self.device_placement:
+             xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
+         self.optimizer.load_state_dict(state_dict)
+
+     def state_dict(self):
+         return self.optimizer.state_dict()
+
+     def zero_grad(self, set_to_none=None):
+         if self.gradient_state.sync_gradients:
+             accept_arg = "set_to_none" in inspect.signature(self.optimizer.zero_grad).parameters
+             if accept_arg:
+                 if set_to_none is None:
+                     set_to_none = True
+                 self.optimizer.zero_grad(set_to_none=set_to_none)
+             else:
+                 if set_to_none is not None:
+                     raise ValueError("`set_to_none` for `Optimizer.zero_grad` is not supported by this optimizer.")
+                 self.optimizer.zero_grad()
+
+     def train(self):
+         """
+         Sets the optimizer to "train" mode. Useful for optimizers like `schedule_free`.
+         """
+         if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
+             self.optimizer.train()
+         elif (
+             hasattr(self.optimizer, "optimizer")
+             and hasattr(self.optimizer.optimizer, "train")
+             and callable(self.optimizer.optimizer.train)
+         ):
+             # the DeepSpeed optimizer further wraps the optimizer
+             self.optimizer.optimizer.train()
+
+     def eval(self):
+         """
+         Sets the optimizer to "eval" mode. Useful for optimizers like `schedule_free`.
+         """
+         if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval):
+             self.optimizer.eval()
+
+     def step(self, closure=None):
+         if is_lomo_available():
+             from lomo_optim import AdaLomo, Lomo
+
+         if (
+             not self.gradient_state.is_xla_gradients_synced
+             and self.accelerator_state.distributed_type == DistributedType.XLA
+         ):
+             gradients = xm._fetch_gradients(self.optimizer)
+             xm.all_reduce("sum", gradients, scale=1.0 / xr.world_size())
+             self.gradient_state.is_xla_gradients_synced = True
+
+         if is_lomo_available():
+             # `step` should be a no-op for LOMO optimizers.
+             if isinstance(self.optimizer, (Lomo, AdaLomo)):
+                 return
+
+         if self.gradient_state.sync_gradients:
+             if self.scaler is not None:
+                 self.optimizer.step = self._optimizer_patched_step_method
+
+                 self.scaler.step(self.optimizer, closure)
+                 self.scaler.update()
+
+                 if not self._accelerate_step_called:
+                     # If the optimizer step was skipped, gradient overflow was detected.
+                     self._is_overflow = True
+                 else:
+                     self._is_overflow = False
+                 # Reset the step method to the original one
+                 self.optimizer.step = self._optimizer_original_step_method
+                 # Reset the indicator
+                 self._accelerate_step_called = False
+             else:
+                 self.optimizer.step(closure)
+             if self.accelerator_state.distributed_type == DistributedType.XLA:
+                 self.gradient_state.is_xla_gradients_synced = False
+
+     def _switch_parameters(self, parameters_map):
+         for param_group in self.optimizer.param_groups:
+             param_group["params"] = [parameters_map.get(p, p) for p in param_group["params"]]
+
+     @property
+     def step_was_skipped(self):
+         """Whether or not the optimizer step was skipped."""
+         return self._is_overflow
+
+     def __getstate__(self):
+         _ignored_keys = [
+             "_accelerate_step_called",
+             "_optimizer_original_step_method",
+             "_optimizer_patched_step_method",
+         ]
+         return {k: v for k, v in self.__dict__.items() if k not in _ignored_keys}
+
+     def __setstate__(self, state):
+         self.__dict__.update(state)
+         if self.scaler is not None:
+             self._accelerate_step_called = False
+             self._optimizer_original_step_method = self.optimizer.step
+             self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)
+
+
+ def patch_optimizer_step(accelerated_optimizer: AcceleratedOptimizer, method):
+     def patched_step(*args, **kwargs):
+         accelerated_optimizer._accelerate_step_called = True
+         return method(*args, **kwargs)
+
+     return patched_step
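In practice the wrapper is obtained through `Accelerator.prepare` rather than constructed directly; a minimal single-process sketch:

```python
import torch

from accelerate import Accelerator
from accelerate.optimizer import AcceleratedOptimizer

accelerator = Accelerator()
model = torch.nn.Linear(2, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)
assert isinstance(optimizer, AcceleratedOptimizer)

loss = model(torch.randn(1, 2)).sum()
accelerator.backward(loss)
optimizer.step()
print(optimizer.step_was_skipped)  # False unless a GradScaler detected overflow
optimizer.zero_grad()
```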
pythonProject/.venv/Lib/site-packages/accelerate/parallelism_config.py ADDED
@@ -0,0 +1,322 @@
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import warnings
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Optional, Union
+
+ from accelerate.utils.dataclasses import TorchContextParallelConfig, TorchTensorParallelConfig
+ from accelerate.utils.versions import is_torch_version
+
+
+ if TYPE_CHECKING:
+     from accelerate import Accelerator
+
+
+ @dataclass
+ class ParallelismConfig:
+     """
+     A dataclass to configure parallelisms applied to the model. Inspired by torchtitan's `ParallelDims`
+     https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py
+
+     Args:
+         dp_replicate_size (`int`, defaults to `1`):
+             The size of the data parallel group. If `dp_replicate_size` is set to 1, the data parallel replication
+             group will not be used.
+         dp_shard_size (`int`, defaults to `1`):
+             The size of the model shard group. If `dp_replicate_size > 1` and `tp_size > 1`, `dp_shard_size` must also
+             be greater than 1, as composing DDP + TP is currently not supported.
+         tp_size (`int`, defaults to `1`):
+             The size of the tensor parallel group. If `tp_size` is set to `1`, the tensor parallel group will not be
+             used.
+         cp_size (`int`, defaults to `1`):
+             The size of the context parallel group. Currently not supported, but reserved for future use and enabled
+             for downstream libraries.
+         tp_handler (`~utils.TorchTensorParallelConfig`, defaults to `None`):
+             The handler for the tensor parallel group.
+
+     You may obtain different distributed data parallel paradigms by configuring `dp_replicate_size` and `dp_shard_size`
+     together:
+         - `dp_replicate_size == 1` and `dp_shard_size > 1`, we obtain Fully Sharded Data Parallel (FSDP).
+         - `dp_replicate_size > 1` and `dp_shard_size > 1`, we obtain Hybrid Sharded Data Parallel (HSDP).
+         - `dp_replicate_size > 1` and `dp_shard_size == 1` is an invalid configuration; to use pure DP, use
+           `DistributedDataParallelKwargs` instead.
+     """
+
+     dp_replicate_size: int = None
+     dp_shard_size: int = None
+     tp_size: int = None
+     cp_size: int = None
+
+     # we use Union because we might support other x parallel plugins (i.e. deepspeed, etc)
+     tp_handler: Union[None, TorchTensorParallelConfig] = None
+     cp_handler: Union[None, TorchContextParallelConfig] = None
+
+     device_mesh = None
+
+     def __repr__(self):
+         return (
+             "ParallelismConfig(\n"
+             f"\tdp_replicate_size={self.dp_replicate_size},\n"
+             f"\tdp_shard_size={self.dp_shard_size},\n"
+             f"\ttp_size={self.tp_size},\n"
+             f"\tcp_size={self.cp_size},\n"
+             f"\ttotal_size={self.total_size},\n"
+             f"\ttp_handler={self.tp_handler},\n"
+             f"\tcp_handler={self.cp_handler})\n"
+         )
+
+     def to_json(self):
+         import copy
+
+         _non_serializable_fields = ["device_mesh"]
+
+         return copy.deepcopy(
+             {
+                 k: copy.deepcopy(v.__dict__) if hasattr(v, "__dict__") else v
+                 for k, v in self.__dict__.items()
+                 if k not in _non_serializable_fields
+             }
+         )
+
+     @property
+     def dp_dim_names(self):
+         """Names of enabled dimensions across which data parallelism is applied."""
+         dims = []
+         if self.dp_replicate_enabled:
+             dims += ["dp_replicate"]
+         if self.dp_shard_enabled:
+             dims += ["dp_shard"]
+         return dims
+
+     @property
+     def non_dp_dim_names(self):
+         """Names of enabled dimensions which will receive the same batch (non-data parallel dimensions)."""
+         dims = []
+         if self.tp_enabled:
+             dims += ["tp"]
+         if self.cp_enabled:
+             dims += ["cp"]
+         return dims
+
+     @property
+     def dp_shard_cp_dim_names(self):
+         """Names of enabled dimensions which will be flattened into a joint mesh across which the model is sharded in FSDP."""
+         dims = []
+         if self.dp_shard_enabled:
+             dims += ["dp_shard"]
+         if self.cp_enabled:
+             dims += ["cp"]
+         return dims
+
+     @property
+     def dp_cp_dim_names(self):
+         """Names of enabled dimensions across which loss should be averaged."""
+         dims = []
+         if self.dp_replicate_enabled:
+             dims += ["dp_replicate"]
+         if self.dp_shard_enabled:
+             dims += ["dp_shard"]
+         if self.cp_enabled:
+             dims += ["cp"]
+         return dims
+
+     @property
+     def fsdp_dim_names(self):
+         """Names of enabled dimensions across which FSDP is applied, including data parallel replication."""
+         dims = []
+         if self.dp_replicate_enabled:
+             dims += ["dp_replicate"]
+         dims += ["dp_shard_cp"]
+         return dims
+
+     @property
+     def total_size(self):
+         """The total size of the parallelism configuration, which is the product of all sizes."""
+         return self.dp_replicate_size * self.dp_shard_size * self.tp_size * self.cp_size
+
+     @property
+     def non_data_parallel_size(self):
+         """The size of the non-data parallel dimensions, which is the product of tensor and context parallel sizes."""
+         return self.tp_size * self.cp_size
+
+     @property
+     def data_parallel_size(self):
+         """The size of the data parallel dimensions, which is the product of data parallel replication and sharding sizes."""
+         return self.dp_replicate_size * self.dp_shard_size
+
+     @property
+     def dp_replicate_enabled(self):
+         """True if data parallel replication is enabled, i.e. `dp_replicate_size > 1`."""
+         return self.dp_replicate_size > 1
+
+     @property
+     def dp_shard_enabled(self):
+         """True if data parallel sharding is enabled, i.e. `dp_shard_size > 1`."""
+         return self.dp_shard_size > 1
+
+     @property
+     def tp_enabled(self):
+         """True if tensor parallelism is enabled, i.e. `tp_size > 1`."""
+         return self.tp_size > 1
+
+     @property
+     def cp_enabled(self):
+         """True if context parallelism is enabled, i.e. `cp_size > 1`."""
+         return self.cp_size > 1
+
+     @property
+     def active_mesh_dims(self):
+         """Names of all active mesh dimensions."""
+         return self.dp_dim_names + self.non_dp_dim_names
+
+     def build_device_mesh(self, device_type: str):
+         """Builds a device mesh for the given device type based on the parallelism configuration.
+         This method will also create required joint meshes (e.g. `dp_shard_cp`, `dp_cp`, `dp`).
+
+         Args:
+             device_type (`str`): The type of device for which to build the mesh, e.g. `"cuda"`.
+         """
+         if is_torch_version(">=", "2.2.0"):
+             from torch.distributed.device_mesh import init_device_mesh
+         else:
+             raise RuntimeError("Building a device_mesh requires torch>=2.2.0")
+
+         mesh = self._get_mesh()
+         if len(mesh) == 0:
+             return None
+         mesh_dim_names, mesh_shape = mesh
+         device_mesh = init_device_mesh(
+             device_type,
+             mesh_shape,
+             mesh_dim_names=mesh_dim_names,
+         )
+         if self.dp_dim_names:
+             device_mesh[self.dp_dim_names]._flatten("dp")
+         if self.dp_shard_cp_dim_names:
+             device_mesh[self.dp_shard_cp_dim_names]._flatten("dp_shard_cp")
+         if self.dp_cp_dim_names:
+             device_mesh[self.dp_cp_dim_names]._flatten("dp_cp")
+
+         return device_mesh
+
+     def get_device_mesh(self, device_type: Optional[str] = None):
+         if self.device_mesh is None:
+             if device_type is not None:
+                 self.device_mesh = self.build_device_mesh(device_type)
+             else:
+                 raise ValueError("You need to pass a device_type, e.g. `cuda`, to build the device mesh")
+         else:
+             if device_type is not None:
+                 if self.device_mesh.device_type != device_type:
+                     raise ValueError(
+                         f"The device_mesh is already created with device type {self.device_mesh.device_type}. However, you are trying to get a device mesh with device_type {device_type}. Please check if you correctly initialized your device_mesh"
+                     )
+         return self.device_mesh
+
+     def _get_mesh(self) -> tuple[tuple[str, ...], tuple[int, ...]]:
+         """Generate mesh dimension names and shape for torch.distributed.init_device_mesh()."""
+
+         # Build mesh dimensions dictionary
+         mesh_dims = {parallelism: self._sizes[parallelism] for parallelism in self.active_mesh_dims}
+
+         # Apply canonical ordering
+         mesh_order = ["dp_replicate", "dp_shard", "cp", "tp"]
+         sorted_items = sorted(
+             mesh_dims.items(),
+             key=lambda x: (mesh_order.index(x[0])),
+         )
+         return tuple(zip(*sorted_items))
+
+     def __post_init__(self):
+         # Basic size validation
+         if self.dp_replicate_size is None:
+             self.dp_replicate_size = int(os.environ.get("PARALLELISM_CONFIG_DP_REPLICATE_SIZE", "1"))
+         if self.dp_shard_size is None:
+             self.dp_shard_size = int(os.environ.get("PARALLELISM_CONFIG_DP_SHARD_SIZE", "1"))
+         if self.tp_size is None:
+             self.tp_size = int(os.environ.get("PARALLELISM_CONFIG_TP_SIZE", "1"))
+         if self.cp_size is None:
+             self.cp_size = int(os.environ.get("PARALLELISM_CONFIG_CP_SIZE", "1"))
+
+         if self.tp_size > 1:
+             if self.tp_handler is None:
+                 self.tp_handler = TorchTensorParallelConfig()
+
+         if self.cp_size > 1:
+             if self.cp_handler is None:
+                 self.cp_handler = TorchContextParallelConfig()
+
+         if self.dp_replicate_size < 1:
+             raise ValueError(f"dp_replicate_size must be at least 1, but got {self.dp_replicate_size}")
+         if self.dp_shard_size < 1:
+             raise ValueError(f"dp_shard_size must be at least 1, but got {self.dp_shard_size}")
+         if self.tp_size < 1:
+             raise ValueError(f"tp_size must be at least 1, but got {self.tp_size}")
+         if self.cp_size < 1:
+             raise ValueError(f"cp_size must be at least 1, but got {self.cp_size}")
+
+         if (self.tp_size > 1 or self.cp_size > 1) and self.dp_replicate_size > 1 and self.dp_shard_size == 1:
+             raise ValueError(
+                 "Tensor/Context parallelism (tp/cp_size > 1) cannot be used with pure data parallelism (dp_replicate_size > 1 and dp_shard_size == 1). "
+                 "Please set dp_shard_size > 1 and dp_replicate_size == 1 to compose FSDP + TP/CP for 2D parallel, "
+                 "or set dp_replicate_size > 1 and dp_shard_size > 1 to compose HSDP + TP/CP for 3D parallel."
+             )
+         self._sizes = {
+             "dp_replicate": self.dp_replicate_size,
+             "dp_shard": self.dp_shard_size,
+             "tp": self.tp_size,
+             "cp": self.cp_size,
+         }
+
+     def _set_size(self, parallelism: str, size: int):
+         assert parallelism in self._sizes.keys(), f"Parallelism must be one of {self._sizes.keys()}"
+         self._sizes[parallelism] = size
+         setattr(self, f"{parallelism}_size", size)
+
+     def _validate_accelerator(self, accelerator: "Accelerator"):
+         _warnings = set()
+         if not accelerator.multi_device and self.total_size == 1:
+             # No distributed setup, valid parallelism config
+             return
+
+         # We need this to ensure DDP works
+         if self.total_size == 1:
+             self._set_size("dp_replicate", accelerator.num_processes)
+
+         if self.total_size != accelerator.num_processes:
+             raise ValueError(
+                 f"ParallelismConfig total_size ({self.total_size}) does not match "
+                 f"num_processes ({accelerator.num_processes}). Please adjust dp_replicate_size/ "
+                 f"dp_shard_size/tp_size/cp_size."
+             )
+
+         if self.total_size > 1 and not (accelerator.is_fsdp2 or accelerator.multi_device):
+             raise ValueError(
+                 f"ParallelismConfig is only compatible with DistributedType.FSDP (version 2) or DistributedType.Multi{{Device}}, but got {accelerator.distributed_type}."
+             )
+
+         for parallelism, size in self._sizes.items():
+             if size == 1 and getattr(self, f"{parallelism}_handler", None) is not None:
+                 _warnings.add(
+                     f"ParallelismConfig.{parallelism}_handler is set, but {parallelism}_size is set to 1. This handler will be ignored."
+                 )
+
+         if _warnings and accelerator.is_main_process:
+             warnings.warn(
+                 "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings),
+                 UserWarning,
+             )
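A small sketch of the size and dimension bookkeeping this dataclass performs; constructing it requires no distributed initialization, so the properties can be inspected directly (an 8-process HSDP layout is assumed purely for illustration):

```python
from accelerate.parallelism_config import ParallelismConfig

# 2-way replication x 4-way sharding, no tensor/context parallelism.
pc = ParallelismConfig(dp_replicate_size=2, dp_shard_size=4)
print(pc.total_size)        # 8 -> must match Accelerator.num_processes
print(pc.active_mesh_dims)  # ['dp_replicate', 'dp_shard']
print(pc.fsdp_dim_names)    # ['dp_replicate', 'dp_shard_cp']
```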
pythonProject/.venv/Lib/site-packages/accelerate/scheduler.py ADDED
@@ -0,0 +1,98 @@
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # We ignore warnings about stepping the scheduler since we step it ourselves during gradient accumulation
+
+ import warnings
+
+ from .state import AcceleratorState, GradientState
+
+
+ warnings.filterwarnings("ignore", category=UserWarning, module="torch.optim.lr_scheduler")
+
+
+ class AcceleratedScheduler:
+     """
+     A wrapper around a learning rate scheduler that will only step when the optimizer(s) have a training step. Useful
+     to avoid making a scheduler step too fast when gradients overflowed and there was no training step (in mixed
+     precision training).
+
+     When performing gradient accumulation, scheduler lengths should not be changed accordingly; Accelerate will always
+     step the scheduler to account for it.
+
+     Args:
+         scheduler (`torch.optim.lr_scheduler._LRScheduler`):
+             The scheduler to wrap.
+         optimizers (one or a list of `torch.optim.Optimizer`):
+             The optimizers used.
+         step_with_optimizer (`bool`, *optional*, defaults to `True`):
+             Whether or not the scheduler should be stepped at each optimizer step.
+         split_batches (`bool`, *optional*, defaults to `False`):
+             Whether or not the dataloaders split one batch across the different processes (so batch size is the same
+             regardless of the number of processes) or create batches on each process (so batch size is the original
+             batch size multiplied by the number of processes).
+     """
+
+     def __init__(self, scheduler, optimizers, step_with_optimizer: bool = True, split_batches: bool = False):
+         self.scheduler = scheduler
+         self.optimizers = optimizers if isinstance(optimizers, (list, tuple)) else [optimizers]
+         self.split_batches = split_batches
+         self.step_with_optimizer = step_with_optimizer
+         self.gradient_state = GradientState()
+
+     def step(self, *args, **kwargs):
+         if not self.step_with_optimizer:
+             # No link between scheduler and optimizer -> just step
+             self.scheduler.step(*args, **kwargs)
+             return
+
+         # Otherwise, first make sure the optimizer was stepped.
+         if not self.gradient_state.sync_gradients:
+             if self.gradient_state.adjust_scheduler:
+                 self.scheduler._step_count += 1
+             return
+
+         for opt in self.optimizers:
+             if opt.step_was_skipped:
+                 return
+         if self.split_batches:
+             # Split batches -> the training dataloader batch size is not changed so one step per training step
+             self.scheduler.step(*args, **kwargs)
+         else:
+             # Otherwise the training dataloader batch size was multiplied by `num_processes`, so we need to do
+             # num_processes steps per training step
+             num_processes = AcceleratorState().num_processes
+             for _ in range(num_processes):
+                 # Special case when using OneCycle and `drop_last` was not used
+                 if hasattr(self.scheduler, "total_steps"):
+                     if self.scheduler._step_count <= self.scheduler.total_steps:
+                         self.scheduler.step(*args, **kwargs)
+                 else:
+                     self.scheduler.step(*args, **kwargs)
+
+     # Passthroughs
+     def get_last_lr(self):
+         return self.scheduler.get_last_lr()
+
+     def state_dict(self):
+         return self.scheduler.state_dict()
+
+     def load_state_dict(self, state_dict):
+         self.scheduler.load_state_dict(state_dict)
+
+     def get_lr(self):
+         return self.scheduler.get_lr()
+
+     def print_lr(self, *args, **kwargs):
+         return self.scheduler.print_lr(*args, **kwargs)
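The wrapper is normally produced by `Accelerator.prepare`; a single-process sketch showing that the scheduler only advances once the optimizer actually stepped:

```python
import torch

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)

loss = model(torch.randn(1, 2)).sum()
accelerator.backward(loss)
optimizer.step()
scheduler.step()  # steps `num_processes` times unless split_batches=True
print(scheduler.get_last_lr())
```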
pythonProject/.venv/Lib/site-packages/isympy.py ADDED
@@ -0,0 +1,342 @@
+ """
+ Python shell for SymPy.
+
+ This is just a normal Python shell (IPython shell if you have the
+ IPython package installed), that executes the following commands for
+ the user:
+
+     >>> from __future__ import division
+     >>> from sympy import *
+     >>> x, y, z, t = symbols('x y z t')
+     >>> k, m, n = symbols('k m n', integer=True)
+     >>> f, g, h = symbols('f g h', cls=Function)
+     >>> init_printing()
+
+ So starting 'isympy' is equivalent to starting Python (or IPython) and
+ executing the above commands by hand. It is intended for easy and quick
+ experimentation with SymPy. isympy is a good way to use SymPy as an
+ interactive calculator. If you have IPython and Matplotlib installed, then
+ interactive plotting is enabled by default.
+
+ COMMAND LINE OPTIONS
+ --------------------
+
+ -c CONSOLE, --console=CONSOLE
+
+     Use the specified shell (Python or IPython) as the console
+     backend instead of the default one (IPython if present, Python
+     otherwise), e.g.:
+
+         $isympy -c python
+
+     CONSOLE must be one of 'ipython' or 'python'
+
+ -p PRETTY, --pretty PRETTY
+
+     Setup pretty-printing in SymPy. When pretty-printing is enabled,
+     expressions can be printed with Unicode or ASCII. The default is
+     to use pretty-printing (with Unicode if the terminal supports it).
+     When this option is 'no', expressions will not be pretty-printed
+     and ASCII will be used:
+
+         $isympy -p no
+
+     PRETTY must be one of 'unicode', 'ascii', or 'no'
+
+ -t TYPES, --types=TYPES
+
+     Setup the ground types for the polys. By default, gmpy ground types
+     are used if gmpy2 or gmpy is installed, otherwise it falls back to python
+     ground types, which are a little bit slower. You can manually
+     choose python ground types even if gmpy is installed (e.g., for
+     testing purposes):
+
+         $isympy -t python
+
+     TYPES must be one of 'gmpy', 'gmpy1' or 'python'
+
+     Note that the ground type gmpy1 is primarily intended for testing; it
+     forces the use of gmpy version 1 even if gmpy2 is available.
+
+     This is the same as setting the environment variable
+     SYMPY_GROUND_TYPES to the given ground type (e.g.,
+     SYMPY_GROUND_TYPES='gmpy')
+
+     The ground types can be determined interactively from the variable
+     sympy.polys.domains.GROUND_TYPES.
+
+ -o ORDER, --order ORDER
+
+     Setup the ordering of terms for printing. The default is lex, which
+     orders terms lexicographically (e.g., x**2 + x + 1). You can choose
+     other orderings, such as rev-lex, which will use reverse
+     lexicographic ordering (e.g., 1 + x + x**2):
+
+         $isympy -o rev-lex
+
+     ORDER must be one of 'lex', 'rev-lex', 'grlex', 'rev-grlex',
+     'grevlex', 'rev-grevlex', 'old', or 'none'.
+
+     Note that for very large expressions, ORDER='none' may speed up
+     printing considerably but the terms will have no canonical order.
+
+ -q, --quiet
+
+     Print only Python's and SymPy's versions to stdout at startup.
+
+ -d, --doctest
+
+     Use the same format that should be used for doctests. This is
+     equivalent to -c python -p no.
+
+ -C, --no-cache
+
+     Disable the caching mechanism. Disabling the cache may slow certain
+     operations down considerably. This is useful for testing the cache,
+     or for benchmarking, as the cache can result in deceptive timings.
+
+     This is equivalent to setting the environment variable
+     SYMPY_USE_CACHE to 'no'.
+
+ -a, --auto-symbols (requires at least IPython 0.11)
+
+     Automatically create missing symbols. Normally, typing a name of a
+     Symbol that has not been instantiated first would raise NameError,
+     but with this option enabled, any undefined name will be
+     automatically created as a Symbol.
+
+     Note that this is intended only for interactive, calculator style
+     usage. In a script that uses SymPy, Symbols should be instantiated
+     at the top, so that it's clear what they are.
+
+     This will not override any names that are already defined, which
+     includes the single character letters represented by the mnemonic
+     QCOSINE (see the "Gotchas and Pitfalls" document in the
+     documentation). You can delete existing names by executing "del
+     name". If a name is defined, typing "'name' in dir()" will return True.
+
+     The Symbols that are created using this have default assumptions.
+     If you want to place assumptions on symbols, you should create them
+     using symbols() or var().
+
+     Finally, this only works in the top level namespace. So, for
+     example, if you define a function in isympy with an undefined
+     Symbol, it will not work.
+
+     See also the -i and -I options.
+
+ -i, --int-to-Integer (requires at least IPython 0.11)
+
+     Automatically wrap int literals with Integer. This makes it so that
+     things like 1/2 will come out as Rational(1, 2), rather than 0.5. This
+     works by preprocessing the source and wrapping all int literals with
+     Integer. Note that this will not change the behavior of int literals
+     assigned to variables, and it also won't change the behavior of functions
+     that return int literals.
+
+     If you want an int, you can wrap the literal in int(), e.g. int(3)/int(2)
+     gives 1.5 (with division imported from __future__).
+
+ -I, --interactive (requires at least IPython 0.11)
+
+     This is equivalent to --auto-symbols --int-to-Integer. Future options
+     designed for ease of interactive use may be added to this.
+
+ -D, --debug
+
+     Enable debugging output. This is the same as setting the
+     environment variable SYMPY_DEBUG to 'True'. The debug status is set
+     in the variable SYMPY_DEBUG within isympy.
+
+ -- IPython options
+
+     Additionally you can pass command line options directly to the IPython
+     interpreter (the standard Python shell is not supported). However you
+     need to add the '--' separator between the two types of options, e.g. the
+     startup banner option and the colors option. You need to enter the
+     options as required by the version of IPython that you are using, too:
+
+     in IPython 0.11,
+
+         $isympy -q -- --colors=NoColor
+
+     or in older versions of IPython,
+
+         $isympy -q -- -colors NoColor
+
+ See also isympy --help.
+ """
+
+ import os
+ import sys
+
+ # DO NOT IMPORT SYMPY HERE! Or the setting of the sympy environment variables
+ # by the command line will break.
+
+ def main() -> None:
+     from argparse import ArgumentParser, RawDescriptionHelpFormatter
+
+     VERSION = None
+     if '--version' in sys.argv:
+         # We cannot import sympy before this is run, because flags like -C and
+         # -t set environment variables that must be set before SymPy is
+         # imported. The only thing we need to import it for is to get the
+         # version, which only matters with the --version flag.
+         import sympy
+         VERSION = sympy.__version__
+
+     usage = 'isympy [options] -- [ipython options]'
+     parser = ArgumentParser(
+         usage=usage,
+         description=__doc__,
+         formatter_class=RawDescriptionHelpFormatter,
+     )
+
+     parser.add_argument('--version', action='version', version=VERSION)
+
+     parser.add_argument(
+         '-c', '--console',
+         dest='console',
+         action='store',
+         default=None,
+         choices=['ipython', 'python'],
+         metavar='CONSOLE',
+         help='select type of interactive session: ipython | python; defaults '
+         'to ipython if IPython is installed, otherwise python')
+
+     parser.add_argument(
+         '-p', '--pretty',
+         dest='pretty',
+         action='store',
+         default=None,
+         metavar='PRETTY',
+         choices=['unicode', 'ascii', 'no'],
+         help='setup pretty printing: unicode | ascii | no; defaults to '
+         'unicode printing if the terminal supports it, otherwise ascii')
+
+     parser.add_argument(
+         '-t', '--types',
+         dest='types',
+         action='store',
+         default=None,
+         metavar='TYPES',
+         choices=['gmpy', 'gmpy1', 'python'],
+         help='setup ground types: gmpy | gmpy1 | python; defaults to gmpy if gmpy2 '
+         'or gmpy is installed, otherwise python')
+
+     parser.add_argument(
+         '-o', '--order',
+         dest='order',
+         action='store',
+         default=None,
+         metavar='ORDER',
+         choices=['lex', 'grlex', 'grevlex', 'rev-lex', 'rev-grlex', 'rev-grevlex', 'old', 'none'],
+         help='setup ordering of terms: [rev-]lex | [rev-]grlex | [rev-]grevlex | old | none; defaults to lex')
+
+     parser.add_argument(
+         '-q', '--quiet',
+         dest='quiet',
+         action='store_true',
+         default=False,
+         help='print only version information at startup')
+
+     parser.add_argument(
+         '-d', '--doctest',
+         dest='doctest',
+         action='store_true',
+         default=False,
+         help='use the doctest format for output (you can just copy and paste it)')
+
+     parser.add_argument(
+         '-C', '--no-cache',
+         dest='cache',
+         action='store_false',
+         default=True,
+         help='disable caching mechanism')
+
+     parser.add_argument(
+         '-a', '--auto-symbols',
+         dest='auto_symbols',
+         action='store_true',
+         default=False,
+         help='automatically construct missing symbols')
+
+     parser.add_argument(
+         '-i', '--int-to-Integer',
+         dest='auto_int_to_Integer',
+         action='store_true',
+         default=False,
+         help="automatically wrap int literals with Integer")
+
+     parser.add_argument(
+         '-I', '--interactive',
+         dest='interactive',
+         action='store_true',
+         default=False,
+         help="equivalent to -a -i")
+
+     parser.add_argument(
+         '-D', '--debug',
+         dest='debug',
+         action='store_true',
+         default=False,
+         help='enable debugging output')
+
+     (options, ipy_args) = parser.parse_known_args()
+     if '--' in ipy_args:
+         ipy_args.remove('--')
+
+     if not options.cache:
+         os.environ['SYMPY_USE_CACHE'] = 'no'
+
+     if options.types:
+         os.environ['SYMPY_GROUND_TYPES'] = options.types
+
+     if options.debug:
+         os.environ['SYMPY_DEBUG'] = str(options.debug)
+
+     if options.doctest:
+         options.pretty = 'no'
+         options.console = 'python'
+
+     session = options.console
+
+     if session is not None:
+         ipython = session == 'ipython'
+     else:
+         try:
+             import IPython
+             ipython = True
+         except ImportError:
+             if not options.quiet:
+                 from sympy.interactive.session import no_ipython
+                 print(no_ipython)
+             ipython = False
+
+     args = {
+         'pretty_print': True,
+         'use_unicode': None,
+         'use_latex': None,
+         'order': None,
+         'argv': ipy_args,
+     }
+
+     if options.pretty == 'unicode':
+         args['use_unicode'] = True
+     elif options.pretty == 'ascii':
+         args['use_unicode'] = False
+     elif options.pretty == 'no':
+         args['pretty_print'] = False
+
+     if options.order is not None:
+         args['order'] = options.order
+
+     args['quiet'] = options.quiet
+     args['auto_symbols'] = options.auto_symbols or options.interactive
+     args['auto_int_to_Integer'] = options.auto_int_to_Integer or options.interactive
+
+     from sympy.interactive import init_session
+     init_session(ipython, **args)
+
+ if __name__ == "__main__":
+     main()
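For a plain script, the non-interactive equivalent of the preamble described in the module docstring (requires sympy to be installed):

```python
from sympy import Function, init_printing, symbols

x, y, z, t = symbols("x y z t")
k, m, n = symbols("k m n", integer=True)
f, g, h = symbols("f g h", cls=Function)
init_printing()
print((x + 1) ** 2)  # prints the unexpanded square, pretty-printed if possible
```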
pythonProject/.venv/Lib/site-packages/numpy-2.2.6-cp310-cp310-win_amd64.whl ADDED
File without changes
pythonProject/.venv/Lib/site-packages/typing_extensions.py ADDED
The diff for this file is too large to render. See raw diff
 
pythonProject/.venv/pyvenv.cfg ADDED
@@ -0,0 +1,3 @@
+ home = C:\Users\ADMIN\AppData\Local\Programs\Python\Python310
+ include-system-site-packages = false
+ version = 3.10.11