xiaoanyu123 commited on
Commit
29a7f0c
·
verified ·
1 Parent(s): fe17766

Add files using upload-large-folder tool

Browse files
Files changed (21) hide show
  1. .gitattributes +2 -0
  2. cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer.lib +3 -0
  3. cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer64_8.lib +3 -0
  4. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/__init__.cpython-310.pyc +0 -0
  5. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/examples.cpython-310.pyc +0 -0
  6. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/testing.cpython-310.pyc +0 -0
  7. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/training.cpython-310.pyc +0 -0
  8. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/__init__.cpython-310.pyc +0 -0
  9. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_cli.cpython-310.pyc +0 -0
  10. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_merge_weights.cpython-310.pyc +0 -0
  11. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_notebook.cpython-310.pyc +0 -0
  12. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_ops.cpython-310.pyc +0 -0
  13. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_script.cpython-310.pyc +0 -0
  14. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_sync.cpython-310.pyc +0 -0
  15. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/__init__.cpython-310.pyc +0 -0
  16. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_peak_memory_usage.cpython-310.pyc +0 -0
  17. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_pippy.cpython-310.pyc +0 -0
  18. pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_zero3_integration.cpython-310.pyc +0 -0
  19. pythonProject/.venv/Lib/site-packages/accelerate/utils/ao.py +140 -0
  20. pythonProject/.venv/Lib/site-packages/accelerate/utils/bnb.py +469 -0
  21. pythonProject/.venv/Lib/site-packages/distutils-precedence.pth +3 -0
.gitattributes CHANGED
@@ -38,3 +38,5 @@ VC_redist.x64.exe filter=lfs diff=lfs merge=lfs -text
38
  VC_redist.x86.exe filter=lfs diff=lfs merge=lfs -text
39
  python-3.10.11-amd64.exe filter=lfs diff=lfs merge=lfs -text
40
  cuda_11.8.0_522.06_windows.exe filter=lfs diff=lfs merge=lfs -text
 
 
 
38
  VC_redist.x86.exe filter=lfs diff=lfs merge=lfs -text
39
  python-3.10.11-amd64.exe filter=lfs diff=lfs merge=lfs -text
40
  cuda_11.8.0_522.06_windows.exe filter=lfs diff=lfs merge=lfs -text
41
+ cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer.lib filter=lfs diff=lfs merge=lfs -text
42
+ cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer64_8.lib filter=lfs diff=lfs merge=lfs -text
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer.lib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10b2c9aac7483dc9d3650f99cc7b2297c66b1c2eb4ec1963bdde2a2e4363ea20
3
+ size 153564
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer64_8.lib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10b2c9aac7483dc9d3650f99cc7b2297c66b1c2eb4ec1963bdde2a2e4363ea20
3
+ size 153564
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.65 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/examples.cpython-310.pyc ADDED
Binary file (5.21 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/testing.cpython-310.pyc ADDED
Binary file (29.4 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/training.cpython-310.pyc ADDED
Binary file (5.78 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (186 Bytes). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_cli.cpython-310.pyc ADDED
Binary file (624 Bytes). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_merge_weights.cpython-310.pyc ADDED
Binary file (4.96 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_notebook.cpython-310.pyc ADDED
Binary file (3.73 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_ops.cpython-310.pyc ADDED
Binary file (4.63 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_script.cpython-310.pyc ADDED
Binary file (23.5 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_sync.cpython-310.pyc ADDED
Binary file (9.23 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (200 Bytes). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_peak_memory_usage.cpython-310.pyc ADDED
Binary file (7.63 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_pippy.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_zero3_integration.cpython-310.pyc ADDED
Binary file (1.23 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/utils/ao.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Needed utilities for torchao FP8 training.
17
+ """
18
+
19
+ from functools import partial
20
+ from typing import TYPE_CHECKING, Callable, Optional
21
+
22
+ import torch
23
+
24
+ from .imports import is_torchao_available, torchao_required
25
+
26
+
27
+ if TYPE_CHECKING:
28
+ if is_torchao_available():
29
+ from torchao.float8.float8_linear import Float8LinearConfig
30
+
31
+
32
def find_first_last_linear_layers(model: torch.nn.Module):
    """
    Return the names of the first and the last `torch.nn.Linear` submodules of `model`.

    Used during FP8 training so that the first and last linear layers can be kept
    unquantized, which avoids numerical-stability issues.

    Ref: https://x.com/xariusrke/status/1826669142604141052

    Returns:
        `tuple[str | None, str | None]`: `(first_name, last_name)`, both `None` when the
        model contains no linear layers.
    """
    # Collect the linear-layer names in traversal order, then take the two ends.
    linear_names = [name for name, mod in model.named_modules() if isinstance(mod, torch.nn.Linear)]
    if not linear_names:
        return None, None
    return linear_names[0], linear_names[-1]
47
+
48
+
49
def filter_linear_layers(module, fqn: str, layers_to_filter: list[str]) -> bool:
    """
    Decide whether `module` may be converted to FP8.

    Returns `False` only for `torch.nn.Linear` layers that either have `in_features`
    or `out_features` not divisible by 16, or whose fully qualified name appears in
    `layers_to_filter`. Any other module (including non-linear ones) returns `True`.

    Args:
        module (`torch.nn.Module`):
            The module to check.
        fqn (`str`):
            The fully qualified name of the layer.
        layers_to_filter (`List[str]`):
            The list of layers to filter.
    """
    # Non-linear modules are never filtered out.
    if not isinstance(module, torch.nn.Linear):
        return True
    # FP8 kernels require both dimensions to be multiples of 16.
    dims_ok = module.in_features % 16 == 0 and module.out_features % 16 == 0
    return dims_ok and fqn not in layers_to_filter
70
+
71
+
72
def filter_first_and_last_linear_layers(module, fqn: str) -> bool:
    """
    A filter function which will filter out all linear layers except the first and last.

    <Tip>

    For stability reasons, we skip the first and last linear layers. Otherwise the model
    may not train or converge properly.

    </Tip>

    Args:
        module (`torch.nn.Module`):
            The module to check.
        fqn (`str`):
            The fully qualified name of the layer.
    """
    # Locate the boundary linear layers, then delegate the actual decision.
    skip_layers = list(find_first_last_linear_layers(module))
    return filter_linear_layers(module, fqn, layers_to_filter=skip_layers)
91
+
92
+
93
@torchao_required
def has_ao_layers(model: torch.nn.Module):
    """Return `True` if any submodule of `model` is a torchao `Float8Linear` layer."""
    from torchao.float8.float8_linear import Float8Linear

    # Short-circuits on the first Float8Linear found.
    return any(isinstance(submodule, Float8Linear) for submodule in model.modules())
101
+
102
+
103
@torchao_required
def convert_model_to_fp8_ao(
    model: torch.nn.Module,
    config: Optional["Float8LinearConfig"] = None,
    module_filter_func: Optional[Callable] = filter_first_and_last_linear_layers,
):
    """
    Converts all `nn.Linear` layers in the model (except the first and last) to torchao's `Float8Linear` layer inplace.

    Args:
        model (`torch.nn.Module`):
            The model to convert.
        config (`torchao.float8.Float8LinearConfig`, *optional*):
            The configuration for the FP8 training. Recommended to utilize
            `torchao.float8.recipe_name_to_linear_config` to generate this. In general, the default config should be
            sufficient (what is passed when set to `None`).
        module_filter_func (`Callable`, *optional*, defaults to `filter_first_and_last_linear_layers`):
            Optional function that must take in a module and layer name, and returns a boolean indicating whether the
            module should be converted to FP8. If explicitly passed as `None`, the first and last linear layers of the
            whole model are skipped via `filter_linear_layers`.

    Example:

    ```python
    from accelerate.utils.ao import convert_model_to_fp8_ao

    model = MyModel()
    model.to("cuda")
    convert_model_to_fp8_ao(model)

    model.train()
    ```
    """
    from torchao.float8 import convert_to_float8_training

    # When no filter is supplied, keep the model's first/last linear layers
    # unquantized for numerical stability (see filter_linear_layers).
    first_linear, last_linear = find_first_last_linear_layers(model)
    if module_filter_func is None:
        module_filter_func = partial(filter_linear_layers, layers_to_filter=[first_linear, last_linear])
    convert_to_float8_training(model, module_filter_fn=module_filter_func, config=config)
pythonProject/.venv/Lib/site-packages/accelerate/utils/bnb.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import logging
17
+ import os
18
+ from copy import deepcopy
19
+ from typing import Optional, Union
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+
24
+ from accelerate.utils.imports import (
25
+ is_4bit_bnb_available,
26
+ is_8bit_bnb_available,
27
+ )
28
+
29
+ from ..big_modeling import dispatch_model, init_empty_weights
30
+ from .dataclasses import BnbQuantizationConfig
31
+ from .modeling import (
32
+ find_tied_parameters,
33
+ get_balanced_memory,
34
+ infer_auto_device_map,
35
+ load_checkpoint_in_model,
36
+ offload_weight,
37
+ set_module_tensor_to_device,
38
+ )
39
+
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
def load_and_quantize_model(
    model: torch.nn.Module,
    bnb_quantization_config: BnbQuantizationConfig,
    weights_location: Optional[Union[str, os.PathLike]] = None,
    device_map: Optional[dict[str, Union[int, str, torch.device]]] = None,
    no_split_module_classes: Optional[list[str]] = None,
    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
    offload_folder: Optional[Union[str, os.PathLike]] = None,
    offload_state_dict: bool = False,
):
    """
    This function will quantize the input model with the associated config passed in `bnb_quantization_config`. If the
    model is in the meta device, we will load and dispatch the weights according to the `device_map` passed. If the
    model is already loaded, we will quantize the model and put the model on the GPU.

    Args:
        model (`torch.nn.Module`):
            Input model. The model can be already loaded or on the meta device
        bnb_quantization_config (`BnbQuantizationConfig`):
            The bitsandbytes quantization parameters
        weights_location (`str` or `os.PathLike`, *optional*):
            The folder weights_location to load. It can be:
            - a path to a file containing a whole model state dict
            - a path to a `.json` file containing the index to a sharded checkpoint
            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
            - a path to a folder containing a unique pytorch_model.bin file.
        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across device (for instance any layer that has a
            residual connection).
        max_memory (`Dict`, *optional*):
            A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
        offload_folder (`str` or `os.PathLike`, *optional*):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        offload_state_dict (`bool`, *optional*, defaults to `False`):
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit.

    Returns:
        `torch.nn.Module`: The quantized model

    Raises:
        ImportError: If 8-bit quantization is requested but the installed `bitsandbytes` does not support it.
        ValueError: If 4-bit quantization is requested but the installed `bitsandbytes` does not support it.
        RuntimeError: If no GPU/XPU is available, or if the model is on meta and `weights_location` is `None`.
    """

    load_in_4bit = bnb_quantization_config.load_in_4bit
    load_in_8bit = bnb_quantization_config.load_in_8bit

    # NOTE(review): the 8-bit path raises ImportError while the 4-bit path raises
    # ValueError for the same kind of failure — looks inconsistent; confirm intent.
    if load_in_8bit and not is_8bit_bnb_available():
        raise ImportError(
            "You have a version of `bitsandbytes` that is not compatible with 8bit quantization,"
            " make sure you have the latest version of `bitsandbytes` installed."
        )
    if load_in_4bit and not is_4bit_bnb_available():
        raise ValueError(
            "You have a version of `bitsandbytes` that is not compatible with 4bit quantization,"
            "make sure you have the latest version of `bitsandbytes` installed."
        )

    modules_on_cpu = []
    # custom device map: remember which modules the caller explicitly keeps on cpu/disk
    if isinstance(device_map, dict) and len(device_map.keys()) > 1:
        modules_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]

    # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
    if bnb_quantization_config.skip_modules is None:
        bnb_quantization_config.skip_modules = get_keys_to_not_convert(model)

    # add cpu modules to skip modules only for 4-bit modules
    if load_in_4bit:
        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
    modules_to_not_convert = bnb_quantization_config.skip_modules

    # We add the modules we want to keep in full precision
    if bnb_quantization_config.keep_in_fp32_modules is None:
        bnb_quantization_config.keep_in_fp32_modules = []
    keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules
    modules_to_not_convert.extend(keep_in_fp32_modules)

    # compatibility with peft: flags read by downstream libraries
    model.is_loaded_in_4bit = load_in_4bit
    model.is_loaded_in_8bit = load_in_8bit

    model_device = get_parameter_device(model)
    if model_device.type != "meta":
        # quantization of an already loaded model
        logger.warning(
            "It is not recommended to quantize a loaded model. "
            "The model should be instantiated under the `init_empty_weights` context manager."
        )
        model = replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert)
        # convert param to the right dtype
        dtype = bnb_quantization_config.torch_dtype
        for name, param in model.state_dict().items():
            if any(module_to_keep_in_fp32 in name for module_to_keep_in_fp32 in keep_in_fp32_modules):
                # NOTE(review): `Tensor.to` is not in-place and its result is discarded
                # on these three calls — the intended cast may not happen; confirm.
                param.to(torch.float32)
                if param.dtype != torch.float32:
                    name = name.replace(".weight", "").replace(".bias", "")
                    param = getattr(model, name, None)
                    if param is not None:
                        param.to(torch.float32)
            elif torch.is_floating_point(param):
                param.to(dtype)
        # Move the quantized model to an accelerator; bnb layers require one.
        if model_device.type == "cuda":
            model.cuda(torch.cuda.current_device())
            torch.cuda.empty_cache()
        elif torch.cuda.is_available():
            model.to(torch.cuda.current_device())
        elif torch.xpu.is_available():
            model.to(torch.xpu.current_device())
        else:
            raise RuntimeError("No GPU or Intel XPU found. A GPU or Intel XPU is needed for quantization.")
        logger.info(
            f"The model device type is {model_device.type}. However, gpu or intel xpu is needed for quantization."
            "We move the model to it."
        )
        return model

    elif weights_location is None:
        raise RuntimeError(
            f"`weights_location` needs to be the folder path containing the weights of the model, but we found {weights_location} "
        )

    else:
        # Meta-device path: swap in bnb layers while weights are still empty,
        # then load the checkpoint shard-by-shard onto the inferred device map.
        with init_empty_weights():
            model = replace_with_bnb_layers(
                model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert
            )
        device_map = get_quantized_model_device_map(
            model,
            bnb_quantization_config,
            device_map,
            max_memory=max_memory,
            no_split_module_classes=no_split_module_classes,
        )
        # NOTE(review): the default for `offload_state_dict` is `False`, so this branch
        # only fires when a caller explicitly passes `None` — confirm that is intended.
        if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
            offload_state_dict = True

        offload = any(x in list(device_map.values()) for x in ["cpu", "disk"])

        load_checkpoint_in_model(
            model,
            weights_location,
            device_map,
            dtype=bnb_quantization_config.torch_dtype,
            offload_folder=offload_folder,
            offload_state_dict=offload_state_dict,
            keep_in_fp32_modules=bnb_quantization_config.keep_in_fp32_modules,
            offload_8bit_bnb=load_in_8bit and offload,
        )
        return dispatch_model(model, device_map=device_map, offload_dir=offload_folder)
194
+
195
+
196
def get_quantized_model_device_map(
    model, bnb_quantization_config, device_map=None, max_memory=None, no_split_module_classes=None
):
    """
    Resolve the `device_map` used to dispatch a bitsandbytes-quantized model.

    Args:
        model (`torch.nn.Module`):
            The (already bnb-converted) model to dispatch.
        bnb_quantization_config (`BnbQuantizationConfig`):
            Quantization parameters; `skip_modules` stay in `torch_dtype` and
            `keep_in_fp32_modules` stay in `float32` when inferring the map.
        device_map (`str` or `Dict`, *optional*):
            An explicit mapping, or one of `"auto"`, `"balanced"`, `"balanced_low_0"`,
            `"sequential"`. Defaults to placing the whole model on the current GPU/XPU.
        max_memory (`Dict`, *optional*):
            Device identifier to maximum memory, forwarded to the map inference.
        no_split_module_classes (`List[str]`, *optional*):
            Layer class names that must never be split across devices.

    Returns:
        `Dict`: The resolved device map.

    Raises:
        RuntimeError: If `device_map` is `None` and no GPU/XPU is available.
        ValueError: If a string `device_map` is not a supported strategy, or if 4-bit
            quantized modules would be dispatched to CPU/disk.
    """
    if device_map is None:
        if torch.cuda.is_available():
            device_map = {"": torch.cuda.current_device()}
        elif torch.xpu.is_available():
            device_map = {"": torch.xpu.current_device()}
        else:
            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
        # Fixed log message: missing space after the period.
        logger.info("The device_map was not initialized. Setting device_map to `{'':torch.cuda.current_device()}`.")

    if isinstance(device_map, str):
        if device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
            raise ValueError(
                "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "
                "'sequential'."
            )

        # Skipped modules keep torch_dtype; fp32 modules override that afterwards.
        special_dtypes = {}
        special_dtypes.update(
            {
                name: bnb_quantization_config.torch_dtype
                for name, _ in model.named_parameters()
                if any(m in name for m in bnb_quantization_config.skip_modules)
            }
        )
        special_dtypes.update(
            {
                name: torch.float32
                for name, _ in model.named_parameters()
                if any(m in name for m in bnb_quantization_config.keep_in_fp32_modules)
            }
        )

        kwargs = {}
        kwargs["special_dtypes"] = special_dtypes
        kwargs["no_split_module_classes"] = no_split_module_classes
        kwargs["dtype"] = bnb_quantization_config.target_dtype

        # get max_memory for each device.
        if device_map != "sequential":
            max_memory = get_balanced_memory(
                model,
                low_zero=(device_map == "balanced_low_0"),
                max_memory=max_memory,
                **kwargs,
            )

        kwargs["max_memory"] = max_memory
        device_map = infer_auto_device_map(model, **kwargs)

    if isinstance(device_map, dict):
        # check if we don't have any quantized module on the cpu
        modules_not_to_convert = bnb_quantization_config.skip_modules + bnb_quantization_config.keep_in_fp32_modules

        device_map_without_some_modules = {
            key: device_map[key] for key in device_map.keys() if key not in modules_not_to_convert
        }
        for device in ["cpu", "disk"]:
            if device in device_map_without_some_modules.values():
                if bnb_quantization_config.load_in_4bit:
                    raise ValueError(
                        """
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in `torch_dtype`, you need to pass a custom `device_map` to
                        `load_and_quantize_model`. Check
                        https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization#offload-modules-to-cpu-and-disk
                        for more details.
                        """
                    )
                else:
                    # Fixed log message: duplicated word "are are".
                    logger.info(
                        "Some modules are offloaded to the CPU or the disk. Note that these modules will be converted to 8-bit"
                    )
        del device_map_without_some_modules
    return device_map
274
+
275
+
276
def replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=None, current_key_name=None):
    """
    Replace every eligible `torch.nn.Linear` in `model` with a `bnb.nn.Linear8bitLt` or
    `bnb.nn.Linear4bit` module from the `bitsandbytes` library, recursing through submodules.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`List[str]`):
            Names of the modules to not quantize convert. In practice we keep the `lm_head` in full precision for
            numerical stability reasons.
        current_key_name (`List[str]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (part of
            it) is not in the list of modules to not convert.
    """
    # Normalize the skip list, then delegate the recursive work to the private helper.
    skip_list = modules_to_not_convert if modules_to_not_convert is not None else []

    model, replaced_any = _replace_with_bnb_layers(model, bnb_quantization_config, skip_list, current_key_name)
    if not replaced_any:
        # Warn rather than fail: some architectures (e.g. gpt2's Conv1D) have no nn.Linear at all.
        logger.warning(
            "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
            " this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers."
            " Please double check your model architecture, or submit an issue on github if you think this is"
            " a bug."
        )
    return model
306
+
307
+
308
def _replace_with_bnb_layers(
    model,
    bnb_quantization_config,
    modules_to_not_convert=None,
    current_key_name=None,
):
    """
    Private method that wraps the recursion for module replacement.

    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
    """
    # bitsandbytes will initialize CUDA on import, so it needs to be imported lazily
    import bitsandbytes as bnb

    has_been_replaced = False
    for name, module in model.named_children():
        # `current_key_name` is threaded through the recursion as a mutable path stack;
        # each level appends its child name and pops it again before returning.
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)
        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            current_key_name_str = ".".join(current_key_name)
            proceed = True
            for key in modules_to_not_convert:
                # Skip when `key` matches the full dotted path exactly, or appears as a
                # dotted prefix/component of it (hence the `key + "."` check).
                if (
                    (key in current_key_name_str) and (key + "." in current_key_name_str)
                ) or key == current_key_name_str:
                    proceed = False
                    break
            if proceed:
                # Load bnb module with empty weight and replace the `nn.Linear` module
                if bnb_quantization_config.load_in_8bit:
                    bnb_module = bnb.nn.Linear8bitLt(
                        module.in_features,
                        module.out_features,
                        module.bias is not None,
                        has_fp16_weights=False,
                        threshold=bnb_quantization_config.llm_int8_threshold,
                    )
                elif bnb_quantization_config.load_in_4bit:
                    bnb_module = bnb.nn.Linear4bit(
                        module.in_features,
                        module.out_features,
                        module.bias is not None,
                        bnb_quantization_config.bnb_4bit_compute_dtype,
                        compress_statistics=bnb_quantization_config.bnb_4bit_use_double_quant,
                        quant_type=bnb_quantization_config.bnb_4bit_quant_type,
                    )
                else:
                    raise ValueError("load_in_8bit and load_in_4bit can't be both False")
                # Reuse the original tensors; quantized layers are frozen (no gradients).
                bnb_module.weight.data = module.weight.data
                if module.bias is not None:
                    bnb_module.bias.data = module.bias.data
                bnb_module.requires_grad_(False)
                setattr(model, name, bnb_module)
                has_been_replaced = True
        # Recurse into containers; a Linear child itself has no children, so the
        # replaced module is not revisited.
        if len(list(module.children())) > 0:
            _, _has_been_replaced = _replace_with_bnb_layers(
                module, bnb_quantization_config, modules_to_not_convert, current_key_name
            )
            has_been_replaced = has_been_replaced | _has_been_replaced
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced
372
+
373
+
374
def get_keys_to_not_convert(model):
    r"""
    Return the keys of the modules that should be kept in full precision when quantizing.

    For CausalLM-style models this keeps the `lm_head` (the model's last child module)
    unquantized for numerical stability; any tied weights are also kept untouched so that
    weight tying survives int8 conversion. Base models with no tied weights return an
    empty list.

    Parameters:
        model (`torch.nn.Module`):
            Input model
    """
    # Copy the model on the meta device so tied-weight detection never touches real weights.
    with init_empty_weights():
        tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager

    tied_params = find_tied_parameters(tied_model)
    # Accelerate < 0.18 returned a dict {key: [tied names]}; newer versions return a list of groups.
    if isinstance(tied_params, dict):
        tied_keys = [key for group in tied_params.values() for key in group] + list(tied_params.keys())
    else:
        tied_keys = [key for group in tied_params for key in group]

    # A base model (BertModel, GPT2Model, ...) declares `base_model_prefix` but does not
    # carry an attribute of that name.
    is_base_model = hasattr(model, "base_model_prefix") and not hasattr(model, model.base_model_prefix)

    # Base models with no tied weights need nothing kept in full precision.
    if not tied_keys and is_base_model:
        return []

    # Otherwise the model has an attached head: keep the last child module plus the tied keys.
    last_child_name = list(model.named_children())[-1][0]
    extra = {last_child_name} - set(tied_keys)
    untouched = list(set(tied_keys)) + list(extra)

    # Strip parameter suffixes so we return module names rather than parameter names.
    cleaned_names = []
    for key in untouched:
        for suffix in (".weight", ".bias"):
            if suffix in key:
                key = key.replace(suffix, "")
        cleaned_names.append(key)

    return cleaned_names
424
+
425
+
426
def has_4bit_bnb_layers(model):
    """Check if we have `bnb.nn.Linear4bit` layers inside our model"""
    # Docstring fixed: the previous version also claimed `bnb.nn.Linear8bitLt` was
    # detected, but only `Linear4bit` is checked (matching the function name).
    # bitsandbytes will initialize CUDA on import, so it needs to be imported lazily
    import bitsandbytes as bnb

    for m in model.modules():
        if isinstance(m, bnb.nn.Linear4bit):
            return True
    return False
435
+
436
+
437
def get_parameter_device(parameter: nn.Module):
    """Return the device of the first parameter of the given module."""
    first_param = next(parameter.parameters())
    return first_param.device
439
+
440
+
441
def quantize_and_offload_8bit(model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics):
    """
    Offload one 8-bit parameter (quantizing it first if needed) and replace it with a meta tensor.

    Args:
        model (`torch.nn.Module`): Model that owns the parameter.
        param (`torch.Tensor`): The parameter value to offload.
        param_name (`str`): Fully qualified name of the parameter inside `model`.
        new_dtype: dtype used when setting the tensor on the model.
        offload_folder (`str` or `os.PathLike`): Destination folder for the offloaded weights.
        offload_index (`Dict`): Offload index, updated by `offload_weight`.
        fp16_statistics (`torch.Tensor` or `None`): Pre-computed `SCB` quantization statistics.
            When provided, `param` is treated as already quantized and offloaded as-is.
    """
    # if it is not quantized, we quantize and offload the quantized weights and the SCB stats
    if fp16_statistics is None:
        # Setting the tensor on device 0 presumably triggers bnb's on-device
        # quantization (Int8Params) — TODO confirm against set_module_tensor_to_device.
        set_module_tensor_to_device(model, param_name, 0, dtype=new_dtype, value=param)
        tensor_name = param_name
        module = model
        # Walk the dotted path down to the module that directly owns the tensor.
        if "." in tensor_name:
            splits = tensor_name.split(".")
            for split in splits[:-1]:
                new_module = getattr(module, split)
                if new_module is None:
                    raise ValueError(f"{module} has no attribute {split}.")
                module = new_module
            tensor_name = splits[-1]
        # offload weights
        module._parameters[tensor_name].requires_grad = False
        offload_weight(module._parameters[tensor_name], param_name, offload_folder, index=offload_index)
        # Also offload the SCB quantization statistics attached by bitsandbytes, if present.
        if hasattr(module._parameters[tensor_name], "SCB"):
            offload_weight(
                module._parameters[tensor_name].SCB,
                param_name.replace("weight", "SCB"),
                offload_folder,
                index=offload_index,
            )
    else:
        # Already quantized: offload the weight and its provided statistics directly.
        offload_weight(param, param_name, offload_folder, index=offload_index)
        offload_weight(fp16_statistics, param_name.replace("weight", "SCB"), offload_folder, index=offload_index)

    # Free the in-memory copy by pointing the model at an empty meta tensor of the same size.
    set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype, value=torch.empty(*param.size()))
pythonProject/.venv/Lib/site-packages/distutils-precedence.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2638ce9e2500e572a5e0de7faed6661eb569d1b696fcba07b0dd223da5f5d224
3
+ size 151