Add files using upload-large-folder tool
- .gitattributes +2 -0
- cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer.lib +3 -0
- cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer64_8.lib +3 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/__init__.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/examples.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/testing.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/training.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/__init__.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_cli.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_merge_weights.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_notebook.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_ops.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_script.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_sync.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/__init__.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_peak_memory_usage.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_pippy.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_zero3_integration.cpython-310.pyc +0 -0
- pythonProject/.venv/Lib/site-packages/accelerate/utils/ao.py +140 -0
- pythonProject/.venv/Lib/site-packages/accelerate/utils/bnb.py +469 -0
- pythonProject/.venv/Lib/site-packages/distutils-precedence.pth +3 -0
.gitattributes CHANGED
@@ -38,3 +38,5 @@ VC_redist.x64.exe filter=lfs diff=lfs merge=lfs -text
 VC_redist.x86.exe filter=lfs diff=lfs merge=lfs -text
 python-3.10.11-amd64.exe filter=lfs diff=lfs merge=lfs -text
 cuda_11.8.0_522.06_windows.exe filter=lfs diff=lfs merge=lfs -text
+cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer.lib filter=lfs diff=lfs merge=lfs -text
+cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer64_8.lib filter=lfs diff=lfs merge=lfs -text
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer.lib ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10b2c9aac7483dc9d3650f99cc7b2297c66b1c2eb4ec1963bdde2a2e4363ea20
+size 153564
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer64_8.lib ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10b2c9aac7483dc9d3650f99cc7b2297c66b1c2eb4ec1963bdde2a2e4363ea20
+size 153564
pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.65 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/examples.cpython-310.pyc ADDED
Binary file (5.21 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/testing.cpython-310.pyc ADDED
Binary file (29.4 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/__pycache__/training.cpython-310.pyc ADDED
Binary file (5.78 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (186 Bytes)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_cli.cpython-310.pyc ADDED
Binary file (624 Bytes)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_merge_weights.cpython-310.pyc ADDED
Binary file (4.96 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_notebook.cpython-310.pyc ADDED
Binary file (3.73 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_ops.cpython-310.pyc ADDED
Binary file (4.63 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_script.cpython-310.pyc ADDED
Binary file (23.5 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/__pycache__/test_sync.cpython-310.pyc ADDED
Binary file (9.23 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (200 Bytes)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_peak_memory_usage.cpython-310.pyc ADDED
Binary file (7.63 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_pippy.cpython-310.pyc ADDED
Binary file (2.25 kB)

pythonProject/.venv/Lib/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_zero3_integration.cpython-310.pyc ADDED
Binary file (1.23 kB)
pythonProject/.venv/Lib/site-packages/accelerate/utils/ao.py ADDED
@@ -0,0 +1,140 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Needed utilities for torchao FP8 training.
+"""
+
+from functools import partial
+from typing import TYPE_CHECKING, Callable, Optional
+
+import torch
+
+from .imports import is_torchao_available, torchao_required
+
+
+if TYPE_CHECKING:
+    if is_torchao_available():
+        from torchao.float8.float8_linear import Float8LinearConfig
+
+
+def find_first_last_linear_layers(model: torch.nn.Module):
+    """
+    Finds the first and last linear layer names in a model.
+
+    This is needed during FP8 to avoid issues with instability by keeping the first and last layers unquantized.
+
+    Ref: https://x.com/xariusrke/status/1826669142604141052
+    """
+    first_linear, last_linear = None, None
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.Linear):
+            if first_linear is None:
+                first_linear = name
+            last_linear = name
+    return first_linear, last_linear
+
+
+def filter_linear_layers(module, fqn: str, layers_to_filter: list[str]) -> bool:
+    """
+    A function which checks that `module`:
+    - is a `torch.nn.Linear` layer
+    - has `in_features` and `out_features` divisible by 16
+    - is not part of `layers_to_filter`
+
+    Args:
+        module (`torch.nn.Module`):
+            The module to check.
+        fqn (`str`):
+            The fully qualified name of the layer.
+        layers_to_filter (`List[str]`):
+            The list of layers to filter.
+    """
+    if isinstance(module, torch.nn.Linear):
+        if module.in_features % 16 != 0 or module.out_features % 16 != 0:
+            return False
+    if fqn in layers_to_filter:
+        return False
+    return True
+
+
+def filter_first_and_last_linear_layers(module, fqn: str) -> bool:
+    """
+    A filter function which filters out all linear layers except the first and last.
+
+    <Tip>
+
+    For stability reasons, we skip the first and last linear layers. Otherwise the model may not train or converge
+    properly.
+
+    </Tip>
+
+    Args:
+        module (`torch.nn.Module`):
+            The module to check.
+        fqn (`str`):
+            The fully qualified name of the layer.
+    """
+    first_linear, last_linear = find_first_last_linear_layers(module)
+    return filter_linear_layers(module, fqn, layers_to_filter=[first_linear, last_linear])
+
+
+@torchao_required
+def has_ao_layers(model: torch.nn.Module):
+    from torchao.float8.float8_linear import Float8Linear
+
+    for name, module in model.named_modules():
+        if isinstance(module, Float8Linear):
+            return True
+    return False
+
+
+@torchao_required
+def convert_model_to_fp8_ao(
+    model: torch.nn.Module,
+    config: Optional["Float8LinearConfig"] = None,
+    module_filter_func: Optional[Callable] = filter_first_and_last_linear_layers,
+):
+    """
+    Converts all `nn.Linear` layers in the model (except the first and last) to torchao's `Float8Linear` layer
+    inplace.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to convert.
+        config (`torchao.float8.Float8LinearConfig`, *optional*):
+            The configuration for the FP8 training. Recommended to utilize
+            `torchao.float8.recipe_name_to_linear_config` to generate this. In general, the default config should be
+            sufficient (what is passed when set to `None`).
+        module_filter_func (`Callable`, *optional*, defaults to `filter_first_and_last_linear_layers`):
+            Optional function that must take in a module and layer name, and returns a boolean indicating whether the
+            module should be converted to FP8. See `filter_first_and_last_linear_layers` for an example.
+
+    Example:
+
+    ```python
+    from accelerate.utils.ao import convert_model_to_fp8_ao
+
+    model = MyModel()
+    model.to("cuda")
+    convert_model_to_fp8_ao(model)
+
+    model.train()
+    ```
+    """
+    from torchao.float8 import convert_to_float8_training
+
+    first_linear, last_linear = find_first_last_linear_layers(model)
+    if module_filter_func is None:
+        module_filter_func = partial(filter_linear_layers, layers_to_filter=[first_linear, last_linear])
+    convert_to_float8_training(model, module_filter_fn=module_filter_func, config=config)
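Reviewer note: a minimal sketch (not part of the commit) of how the ao.py filter helpers above compose before being handed to torchao's `convert_to_float8_training`. `TinyMLP` and the layer sizes are made up for illustration; only plain `torch` is required to run it.

```python
from functools import partial

import torch

from accelerate.utils.ao import filter_linear_layers, find_first_last_linear_layers


class TinyMLP(torch.nn.Module):  # hypothetical toy model
    def __init__(self):
        super().__init__()
        self.fc_in = torch.nn.Linear(32, 64)   # first linear layer
        self.fc_mid = torch.nn.Linear(64, 64)  # dims divisible by 16 -> FP8 candidate
        self.fc_out = torch.nn.Linear(64, 16)  # last linear layer


model = TinyMLP()
first, last = find_first_last_linear_layers(model)  # ('fc_in', 'fc_out')
keep_fp8 = partial(filter_linear_layers, layers_to_filter=[first, last])
for fqn, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        print(fqn, keep_fp8(module, fqn))  # only fc_mid prints True
```

Only `fc_mid` survives the filter: the first and last layers are excluded for stability, and any linear layer whose dimensions are not divisible by 16 would be rejected as well.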
pythonProject/.venv/Lib/site-packages/accelerate/utils/bnb.py ADDED
@@ -0,0 +1,469 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+from copy import deepcopy
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from accelerate.utils.imports import (
+    is_4bit_bnb_available,
+    is_8bit_bnb_available,
+)
+
+from ..big_modeling import dispatch_model, init_empty_weights
+from .dataclasses import BnbQuantizationConfig
+from .modeling import (
+    find_tied_parameters,
+    get_balanced_memory,
+    infer_auto_device_map,
+    load_checkpoint_in_model,
+    offload_weight,
+    set_module_tensor_to_device,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def load_and_quantize_model(
+    model: torch.nn.Module,
+    bnb_quantization_config: BnbQuantizationConfig,
+    weights_location: Union[str, os.PathLike] = None,
+    device_map: Optional[dict[str, Union[int, str, torch.device]]] = None,
+    no_split_module_classes: Optional[list[str]] = None,
+    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
+    offload_folder: Optional[Union[str, os.PathLike]] = None,
+    offload_state_dict: bool = False,
+):
+    """
+    This function will quantize the input model with the associated config passed in `bnb_quantization_config`. If
+    the model is on the meta device, we will load and dispatch the weights according to the `device_map` passed. If
+    the model is already loaded, we will quantize the model and put the model on the GPU.
+
+    Args:
+        model (`torch.nn.Module`):
+            Input model. The model can be already loaded or on the meta device.
+        bnb_quantization_config (`BnbQuantizationConfig`):
+            The bitsandbytes quantization parameters.
+        weights_location (`str` or `os.PathLike`):
+            The folder weights_location to load. It can be:
+            - a path to a file containing a whole model state dict
+            - a path to a `.json` file containing the index to a sharded checkpoint
+            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
+            - a path to a folder containing a unique pytorch_model.bin file.
+        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
+            A map that specifies where each submodule should go. It doesn't need to be refined to each
+            parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the same
+            device.
+        no_split_module_classes (`List[str]`, *optional*):
+            A list of layer class names that should never be split across devices (for instance any layer that has a
+            residual connection).
+        max_memory (`Dict`, *optional*):
+            A dictionary mapping device identifiers to maximum memory. Will default to the maximum memory available
+            if unset.
+        offload_folder (`str` or `os.PathLike`, *optional*):
+            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
+        offload_state_dict (`bool`, *optional*, defaults to `False`):
+            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid running out of CPU RAM
+            if the weight of the CPU state dict + the biggest shard does not fit.
+
+    Returns:
+        `torch.nn.Module`: The quantized model
+    """
+
+    load_in_4bit = bnb_quantization_config.load_in_4bit
+    load_in_8bit = bnb_quantization_config.load_in_8bit
+
+    if load_in_8bit and not is_8bit_bnb_available():
+        raise ImportError(
+            "You have a version of `bitsandbytes` that is not compatible with 8bit quantization,"
+            " make sure you have the latest version of `bitsandbytes` installed."
+        )
+    if load_in_4bit and not is_4bit_bnb_available():
+        raise ValueError(
+            "You have a version of `bitsandbytes` that is not compatible with 4bit quantization,"
+            " make sure you have the latest version of `bitsandbytes` installed."
+        )
+
+    modules_on_cpu = []
+    # custom device map
+    if isinstance(device_map, dict) and len(device_map.keys()) > 1:
+        modules_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
+
+    # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
+    if bnb_quantization_config.skip_modules is None:
+        bnb_quantization_config.skip_modules = get_keys_to_not_convert(model)
+
+    # add cpu modules to skip modules only for 4-bit modules
+    if load_in_4bit:
+        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
+    modules_to_not_convert = bnb_quantization_config.skip_modules
+
+    # We add the modules we want to keep in full precision
+    if bnb_quantization_config.keep_in_fp32_modules is None:
+        bnb_quantization_config.keep_in_fp32_modules = []
+    keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules
+    modules_to_not_convert.extend(keep_in_fp32_modules)
+
+    # compatibility with peft
+    model.is_loaded_in_4bit = load_in_4bit
+    model.is_loaded_in_8bit = load_in_8bit
+
+    model_device = get_parameter_device(model)
+    if model_device.type != "meta":
+        # quantization of an already loaded model
+        logger.warning(
+            "It is not recommended to quantize a loaded model. "
+            "The model should be instantiated under the `init_empty_weights` context manager."
+        )
+        model = replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert)
+        # convert param to the right dtype
+        dtype = bnb_quantization_config.torch_dtype
+        for name, param in model.state_dict().items():
+            if any(module_to_keep_in_fp32 in name for module_to_keep_in_fp32 in keep_in_fp32_modules):
+                param.to(torch.float32)
+                if param.dtype != torch.float32:
+                    name = name.replace(".weight", "").replace(".bias", "")
+                    param = getattr(model, name, None)
+                    if param is not None:
+                        param.to(torch.float32)
+            elif torch.is_floating_point(param):
+                param.to(dtype)
+        if model_device.type == "cuda":
+            model.cuda(torch.cuda.current_device())
+            torch.cuda.empty_cache()
+        elif torch.cuda.is_available():
+            model.to(torch.cuda.current_device())
+        elif torch.xpu.is_available():
+            model.to(torch.xpu.current_device())
+        else:
+            raise RuntimeError("No GPU or Intel XPU found. A GPU or Intel XPU is needed for quantization.")
+        logger.info(
+            f"The model device type is {model_device.type}. However, a GPU or Intel XPU is needed for quantization. "
+            "We move the model to it."
+        )
+        return model
+
+    elif weights_location is None:
+        raise RuntimeError(
+            f"`weights_location` needs to be the folder path containing the weights of the model, but we found {weights_location} "
+        )
+
+    else:
+        with init_empty_weights():
+            model = replace_with_bnb_layers(
+                model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert
+            )
+        device_map = get_quantized_model_device_map(
+            model,
+            bnb_quantization_config,
+            device_map,
+            max_memory=max_memory,
+            no_split_module_classes=no_split_module_classes,
+        )
+        if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
+            offload_state_dict = True
+
+        offload = any(x in list(device_map.values()) for x in ["cpu", "disk"])
+
+        load_checkpoint_in_model(
+            model,
+            weights_location,
+            device_map,
+            dtype=bnb_quantization_config.torch_dtype,
+            offload_folder=offload_folder,
+            offload_state_dict=offload_state_dict,
+            keep_in_fp32_modules=bnb_quantization_config.keep_in_fp32_modules,
+            offload_8bit_bnb=load_in_8bit and offload,
+        )
+        return dispatch_model(model, device_map=device_map, offload_dir=offload_folder)
+
+
+def get_quantized_model_device_map(
+    model, bnb_quantization_config, device_map=None, max_memory=None, no_split_module_classes=None
+):
+    if device_map is None:
+        if torch.cuda.is_available():
+            device_map = {"": torch.cuda.current_device()}
+        elif torch.xpu.is_available():
+            device_map = {"": torch.xpu.current_device()}
+        else:
+            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
+        logger.info("The device_map was not initialized. Setting device_map to `{'':torch.cuda.current_device()}`.")
+
+    if isinstance(device_map, str):
+        if device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
+            raise ValueError(
+                "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "
+                "'sequential'."
+            )
+
+        special_dtypes = {}
+        special_dtypes.update(
+            {
+                name: bnb_quantization_config.torch_dtype
+                for name, _ in model.named_parameters()
+                if any(m in name for m in bnb_quantization_config.skip_modules)
+            }
+        )
+        special_dtypes.update(
+            {
+                name: torch.float32
+                for name, _ in model.named_parameters()
+                if any(m in name for m in bnb_quantization_config.keep_in_fp32_modules)
+            }
+        )
+
+        kwargs = {}
+        kwargs["special_dtypes"] = special_dtypes
+        kwargs["no_split_module_classes"] = no_split_module_classes
+        kwargs["dtype"] = bnb_quantization_config.target_dtype
+
+        # get max_memory for each device.
+        if device_map != "sequential":
+            max_memory = get_balanced_memory(
+                model,
+                low_zero=(device_map == "balanced_low_0"),
+                max_memory=max_memory,
+                **kwargs,
+            )
+
+        kwargs["max_memory"] = max_memory
+        device_map = infer_auto_device_map(model, **kwargs)
+
+    if isinstance(device_map, dict):
+        # check if we don't have any quantized module on the cpu
+        modules_not_to_convert = bnb_quantization_config.skip_modules + bnb_quantization_config.keep_in_fp32_modules
+
+        device_map_without_some_modules = {
+            key: device_map[key] for key in device_map.keys() if key not in modules_not_to_convert
+        }
+        for device in ["cpu", "disk"]:
+            if device in device_map_without_some_modules.values():
+                if bnb_quantization_config.load_in_4bit:
+                    raise ValueError(
+                        """
+                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
+                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
+                        these modules in `torch_dtype`, you need to pass a custom `device_map` to
+                        `load_and_quantize_model`. Check
+                        https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization#offload-modules-to-cpu-and-disk
+                        for more details.
+                        """
+                    )
+                else:
+                    logger.info(
+                        "Some modules are offloaded to the CPU or the disk. Note that these modules will be converted to 8-bit"
+                    )
+        del device_map_without_some_modules
+    return device_map
+
+
+def replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=None, current_key_name=None):
+    """
+    A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bitLt` modules or by
+    `bnb.nn.Linear4bit` modules from the `bitsandbytes` library. The function will be run recursively and replace
+    `torch.nn.Linear` modules.
+
+    Parameters:
+        model (`torch.nn.Module`):
+            Input model or `torch.nn.Module` as the function is run recursively.
+        modules_to_not_convert (`List[str]`):
+            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical
+            stability reasons.
+        current_key_name (`List[str]`, *optional*):
+            An array to track the current key of the recursion. This is used to check whether the current key (part
+            of it) is not in the list of modules to not convert.
+    """
+
+    if modules_to_not_convert is None:
+        modules_to_not_convert = []
+
+    model, has_been_replaced = _replace_with_bnb_layers(
+        model, bnb_quantization_config, modules_to_not_convert, current_key_name
+    )
+    if not has_been_replaced:
+        logger.warning(
+            "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
+            " This can happen for some architectures such as gpt2 that use Conv1D instead of Linear layers."
+            " Please double check your model architecture, or submit an issue on GitHub if you think this is"
+            " a bug."
+        )
+    return model
+
+
+def _replace_with_bnb_layers(
+    model,
+    bnb_quantization_config,
+    modules_to_not_convert=None,
+    current_key_name=None,
+):
+    """
+    Private method that wraps the recursion for module replacement.
+
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
+    """
+    # bitsandbytes will initialize CUDA on import, so it needs to be imported lazily
+    import bitsandbytes as bnb
+
+    has_been_replaced = False
+    for name, module in model.named_children():
+        if current_key_name is None:
+            current_key_name = []
+        current_key_name.append(name)
+        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
+            # Check if the current key is not in the `modules_to_not_convert`
+            current_key_name_str = ".".join(current_key_name)
+            proceed = True
+            for key in modules_to_not_convert:
+                if (
+                    (key in current_key_name_str) and (key + "." in current_key_name_str)
+                ) or key == current_key_name_str:
+                    proceed = False
+                    break
+            if proceed:
+                # Load bnb module with empty weight and replace `nn.Linear` module
+                if bnb_quantization_config.load_in_8bit:
+                    bnb_module = bnb.nn.Linear8bitLt(
+                        module.in_features,
+                        module.out_features,
+                        module.bias is not None,
+                        has_fp16_weights=False,
+                        threshold=bnb_quantization_config.llm_int8_threshold,
+                    )
+                elif bnb_quantization_config.load_in_4bit:
+                    bnb_module = bnb.nn.Linear4bit(
+                        module.in_features,
+                        module.out_features,
+                        module.bias is not None,
+                        bnb_quantization_config.bnb_4bit_compute_dtype,
+                        compress_statistics=bnb_quantization_config.bnb_4bit_use_double_quant,
+                        quant_type=bnb_quantization_config.bnb_4bit_quant_type,
+                    )
+                else:
+                    raise ValueError("load_in_8bit and load_in_4bit can't be both False")
+                bnb_module.weight.data = module.weight.data
+                if module.bias is not None:
+                    bnb_module.bias.data = module.bias.data
+                bnb_module.requires_grad_(False)
+                setattr(model, name, bnb_module)
+                has_been_replaced = True
+        if len(list(module.children())) > 0:
+            _, _has_been_replaced = _replace_with_bnb_layers(
+                module, bnb_quantization_config, modules_to_not_convert, current_key_name
+            )
+            has_been_replaced = has_been_replaced | _has_been_replaced
+        # Remove the last key for recursion
+        current_key_name.pop(-1)
+    return model, has_been_replaced
+
+
+def get_keys_to_not_convert(model):
+    r"""
+    A utility function to get the keys of the modules to keep in full precision, if any. For example, for CausalLM
+    modules we may want to keep the lm_head in full precision for numerical stability reasons. For other
+    architectures, we want to keep the tied weights of the model. The function will return a list of the keys of the
+    modules to not convert in int8.
+
+    Parameters:
+        model (`torch.nn.Module`):
+            Input model
+    """
+    # Create a copy of the model
+    with init_empty_weights():
+        tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager
+
+    tied_params = find_tied_parameters(tied_model)
+    # For compatibility with Accelerate < 0.18
+    if isinstance(tied_params, dict):
+        tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
+    else:
+        tied_keys = sum(tied_params, [])
+    has_tied_params = len(tied_keys) > 0
+
+    # Check if it is a base model
+    is_base_model = False
+    if hasattr(model, "base_model_prefix"):
+        is_base_model = not hasattr(model, model.base_model_prefix)
+
+    # Ignore this for base models (BertModel, GPT2Model, etc.)
+    if (not has_tied_params) and is_base_model:
+        return []
+
+    # otherwise they have an attached head
+    list_modules = list(model.named_children())
+    list_last_module = [list_modules[-1][0]]
+
+    # add last module together with tied weights
+    intersection = set(list_last_module) - set(tied_keys)
+    list_untouched = list(set(tied_keys)) + list(intersection)
+
+    # remove ".weight" from the keys
+    names_to_remove = [".weight", ".bias"]
+    filtered_module_names = []
+    for name in list_untouched:
+        for name_to_remove in names_to_remove:
+            if name_to_remove in name:
+                name = name.replace(name_to_remove, "")
+        filtered_module_names.append(name)
+
+    return filtered_module_names
+
+
+def has_4bit_bnb_layers(model):
+    """Check if we have `bnb.nn.Linear4bit` layers inside our model"""
+    # bitsandbytes will initialize CUDA on import, so it needs to be imported lazily
+    import bitsandbytes as bnb
+
+    for m in model.modules():
+        if isinstance(m, bnb.nn.Linear4bit):
+            return True
+    return False
+
+
+def get_parameter_device(parameter: nn.Module):
+    return next(parameter.parameters()).device
+
+
+def quantize_and_offload_8bit(model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics):
+    # if it is not quantized, we quantize and offload the quantized weights and the SCB stats
+    if fp16_statistics is None:
+        set_module_tensor_to_device(model, param_name, 0, dtype=new_dtype, value=param)
+        tensor_name = param_name
+        module = model
+        if "." in tensor_name:
+            splits = tensor_name.split(".")
+            for split in splits[:-1]:
+                new_module = getattr(module, split)
+                if new_module is None:
+                    raise ValueError(f"{module} has no attribute {split}.")
+                module = new_module
+            tensor_name = splits[-1]
+        # offload weights
+        module._parameters[tensor_name].requires_grad = False
+        offload_weight(module._parameters[tensor_name], param_name, offload_folder, index=offload_index)
+        if hasattr(module._parameters[tensor_name], "SCB"):
+            offload_weight(
+                module._parameters[tensor_name].SCB,
+                param_name.replace("weight", "SCB"),
+                offload_folder,
+                index=offload_index,
+            )
+    else:
+        offload_weight(param, param_name, offload_folder, index=offload_index)
+        offload_weight(fp16_statistics, param_name.replace("weight", "SCB"), offload_folder, index=offload_index)
+
+    set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype, value=torch.empty(*param.size()))
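Reviewer note: a minimal sketch (not part of the commit) of the bnb.py entry point above. It assumes a CUDA GPU with `bitsandbytes` installed; `TinyMLP` and the checkpoint path `my_model.pt` are hypothetical.

```python
import torch

from accelerate import init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model


class TinyMLP(torch.nn.Module):  # hypothetical toy model
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(64, 64)
        self.lm_head = torch.nn.Linear(64, 16)


# Instantiate on the meta device so no real weights are allocated yet.
with init_empty_weights():
    empty_model = TinyMLP()

bnb_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold=6.0)

# Linear layers (minus skipped modules such as the head) are swapped for
# bnb.nn.Linear8bitLt, weights are loaded from disk, and the model is
# dispatched according to device_map.
quantized_model = load_and_quantize_model(
    empty_model,
    bnb_quantization_config=bnb_config,
    weights_location="my_model.pt",  # hypothetical state-dict checkpoint
    device_map="auto",
)
```

This mirrors the flow described in the Accelerate quantization guide linked from the error message in `get_quantized_model_device_map` above.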
pythonProject/.venv/Lib/site-packages/distutils-precedence.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2638ce9e2500e572a5e0de7faed6661eb569d1b696fcba07b0dd223da5f5d224
+size 151