diff --git a/.gitattributes b/.gitattributes
index 9e3bd09e030501d2c64e03fa740a7bda429ef395..4693aff96530df800098c4e9b1871bc63e6bdb1f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -422,3 +422,8 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
 .venv/lib/python3.11/site-packages/pkg_resources/_vendor/pyparsing/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/pkg_resources/__pycache__/__init__.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/pkg_resources/_vendor/more_itertools/__pycache__/more.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d9bad05b8e4114608cfe4b7881f53c15950b2f6
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations_tf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79ab2704fa66d3acaa99d707d2c479df3587f95f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations_tf.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/audio_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/audio_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3459b29826c2faa9acf5f652765be0b1e51cce1a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/audio_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/configuration_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/configuration_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c667cc4e9d668e3d6b3141ea9ba14280fe027074
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/configuration_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_graph_to_onnx.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_graph_to_onnx.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8fbce2e084be5af0857f70a01cdcd1aab57bdde
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_graph_to_onnx.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_pytorch_checkpoint_to_tf2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_pytorch_checkpoint_to_tf2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7fb7469ba8189787bcfa1e31fab7be4b7c2241d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_pytorch_checkpoint_to_tf2.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2d2bab39fd3b2b01cdc34903e9d16b286f90d0d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizer.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizers_checkpoints_to_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizers_checkpoints_to_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2fed5838e2a4f5ed75fba53981029b869842603
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizers_checkpoints_to_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_tf_hub_seq_to_seq_bert_to_pytorch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_tf_hub_seq_to_seq_bert_to_pytorch.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..141bf0d51b136f5c61694fcdea6154cbe9d4fb69
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_tf_hub_seq_to_seq_bert_to_pytorch.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/debug_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/debug_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3a5bc6cabdee64d381c9bd0662f57a19e0fa98e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/debug_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_check.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_check.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a009956be49c59013c7d78b250f1860eae88951
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_check.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_table.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_table.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d1e8cb806448a3f23b35ca8a3ac66a7b37d5f78
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_table.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/dynamic_module_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dynamic_module_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92c57f627ce1497a35fe53be7c181ebf17c73b25
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dynamic_module_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_sequence_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_sequence_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d296c8ca4285faa3de6c5efad9301f4ab66446b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_sequence_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9af08a16cebab55a3fc2f436f2ff845ce837e214
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/file_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/file_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c64f2e2d4dbe518c6d4dc9224b938e0b3951a77
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/file_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/hf_argparser.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hf_argparser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9578adbb990e6e181dc497e6b0de583e9313730c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hf_argparser.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/hyperparameter_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hyperparameter_search.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ff48710e220416b72e4350d9c21cbfa1ebf8897
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hyperparameter_search.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e31e613072f74b425e12941123db8a944d648d1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_base.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea1b73999c92b255cda0949585348addcf5e2e7f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7afc1a0f47d94d14a6e94bad2559d01660690cd0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/keras_callbacks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/keras_callbacks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7c9ac743fb1a5e30d183e17e729f7d837a30525
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/keras_callbacks.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_attn_mask_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_attn_mask_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed2a12331343972b986817d3cc2b7f6be97f1101
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_attn_mask_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flash_attention_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flash_attention_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34b3253111e3b7c97567adacb0aec5398ff18d1f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flash_attention_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_outputs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_outputs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26116465358b7b6e3c37c5b9573153967edae0da
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_outputs.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2f28468c292936578a68b686e1782ee0ce0d3e9
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01dd22599555236aec052843fe6f11bd48ee503a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d09469fb6408591ccae72b3fe43445be6a7d4122
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_rope_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_rope_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0b2aef97cb2fc5b0bb1e7fc722c2570024f102e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_rope_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_outputs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_outputs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acc0dc498cc36566b390d70fe8d64666a5568ade
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_outputs.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d9898c807b746620f266162e4236bc3b4bbe25b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81fbb6f9c25895771a98c78061f04cafb56039f2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:276838de4c287269109cf384c70ff8d5055e5b91bb854c5126112b913a4d0093
+size 281139
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0e2954725cc3cdc044268e8195500a9218106e5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization_tf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25698db85293361c9d9c07399f52f38b0cc51ea1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization_tf.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/processing_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/processing_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd1d6e5f9360e81850206ad451afb34889f6b4a2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/processing_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74d47bf059c9629624d58aa9ffb714f77f79d2be
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/safetensors_conversion.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/safetensors_conversion.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ce180b27a598c315e0949f4fb8c70554a793e78
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/safetensors_conversion.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/tf_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tf_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2edeacbdc1a481d219620ad29e8dbbdf9df472ea
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tf_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/time_series_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/time_series_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1aa90264517975c935e33d6edcac666abbb9804f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/time_series_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..944929db9c0ed24cbc39b54c5f79dd5422bcde3b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9324d315f5fba3c9676e0e5612ad7b9b86058e29
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49edaecbc672677e9fdfa7390871d5a377c9645f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa11f4d36296dff15a1f257f9e9bca3d856758140df110c8e3854e18618ab31
+size 267562
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_callback.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_callback.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8c6cb87839c0e223468eb759ea49d8036f7bef7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_callback.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_pt_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_pt_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c693201e6b5e2df4ef1aef9c0ba459b71606cb2f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_pt_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_seq2seq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_seq2seq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12c94ae24982c174222b9c848e7dad9f6f53ef2a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_seq2seq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..444e5f0331e3c7adfdb9fef2fe38edfcbbaa5699
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_seq2seq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_seq2seq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10d5b1613a0ed4c0b1e70397f4f168663357bcad
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_seq2seq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21aa97844bcc4e71c1c94a7cfa57b5994edb1f6f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22e1b8514136448dfbd2a040a495b78427db9a11049cc608c6f038ddfdaae233
+size 165048
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28379f98dc1b88091adeea2dc16fe296e7909bcc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3812724ea0a461913baef68de3e6f7e9e062a4f753b90a6876cb0fd235602b67
+size 155213
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27c49bd2c2b57b00cf3315ba63f649f0c77bf001
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8ba052ef8f2b2f60ed4f6bbd2d958a022502d424a55bbebd161c81d67b7c091
+size 215014
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__init__.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3409af4cd78c62daa258303472fa5a6345b090d5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .auto import AutoHfQuantizer, AutoQuantizationConfig
+from .base import HfQuantizer
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/auto.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/auto.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a1a8a5371a696ac9ecdb849d21529ae3d6a4f47
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/auto.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d00c3c47d948a981c0d5665526014acac4462ab3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/base.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_aqlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_aqlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9af983f50749b9406771a2323a2b10dfea5c834
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_aqlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_awq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_awq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5df94fbe4680766ee17ce9e406c5d3c904d29286
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_awq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_4bit.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_4bit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49caf1d4460f6c642a2300f2933b43a43d8b8362
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_4bit.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_8bit.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_8bit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8275421b966d4a7115f150d254f14f7fc7a3f5f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_8bit.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_eetq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_eetq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d112b23daff9426332eee20431bfabf7f61fc25
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_eetq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_fbgemm_fp8.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_fbgemm_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7d7c80480836e71b318e6fedaeaa32936195e7a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_fbgemm_fp8.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_higgs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_higgs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66762bad5fa76ed75f340ca92ac4ec73de0a8340
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_higgs.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_hqq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_hqq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..048ff232b46b9203a6898f16ff8e46c9ef4beb66
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_hqq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_quanto.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_quanto.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7265a0f53a156ee4ec767fe3d94e540af4179565
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_quanto.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizers_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizers_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68c9d2d9d8b017c94862601d62f27932e6498231
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizers_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5b51d038ab8bbe3231e05295fe3844dee0b1ca7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py
@@ -0,0 +1,197 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import Dict, Optional, Union
+
+from ..models.auto.configuration_auto import AutoConfig
+from ..utils.quantization_config import (
+    AqlmConfig,
+    AwqConfig,
+    BitNetConfig,
+    BitsAndBytesConfig,
+    CompressedTensorsConfig,
+    EetqConfig,
+    FbgemmFp8Config,
+    GPTQConfig,
+    HiggsConfig,
+    HqqConfig,
+    QuantizationConfigMixin,
+    QuantizationMethod,
+    QuantoConfig,
+    TorchAoConfig,
+    VptqConfig,
+)
+from .quantizer_aqlm import AqlmHfQuantizer
+from .quantizer_awq import AwqQuantizer
+from .quantizer_bitnet import BitNetHfQuantizer
+from .quantizer_bnb_4bit import Bnb4BitHfQuantizer
+from .quantizer_bnb_8bit import Bnb8BitHfQuantizer
+from .quantizer_compressed_tensors import CompressedTensorsHfQuantizer
+from .quantizer_eetq import EetqHfQuantizer
+from .quantizer_fbgemm_fp8 import FbgemmFp8HfQuantizer
+from .quantizer_gptq import GptqHfQuantizer
+from .quantizer_higgs import HiggsHfQuantizer
+from .quantizer_hqq import HqqHfQuantizer
+from .quantizer_quanto import QuantoHfQuantizer
+from .quantizer_torchao import TorchAoHfQuantizer
+from .quantizer_vptq import VptqHfQuantizer
+
+
+AUTO_QUANTIZER_MAPPING = {
+    "awq": AwqQuantizer,
+    "bitsandbytes_4bit": Bnb4BitHfQuantizer,
+    "bitsandbytes_8bit": Bnb8BitHfQuantizer,
+    "gptq": GptqHfQuantizer,
+    "aqlm": AqlmHfQuantizer,
+    "quanto": QuantoHfQuantizer,
+    "eetq": EetqHfQuantizer,
+    "higgs": HiggsHfQuantizer,
+    "hqq": HqqHfQuantizer,
+    "compressed-tensors": CompressedTensorsHfQuantizer,
+    "fbgemm_fp8": FbgemmFp8HfQuantizer,
+    "torchao": TorchAoHfQuantizer,
+    "bitnet": BitNetHfQuantizer,
+    "vptq": VptqHfQuantizer,
+}
+
+AUTO_QUANTIZATION_CONFIG_MAPPING = {
+    "awq": AwqConfig,
+    "bitsandbytes_4bit": BitsAndBytesConfig,
+    "bitsandbytes_8bit": BitsAndBytesConfig,
+    "eetq": EetqConfig,
+    "gptq": GPTQConfig,
+    "aqlm": AqlmConfig,
+    "quanto": QuantoConfig,
+    "hqq": HqqConfig,
+    "compressed-tensors": CompressedTensorsConfig,
+    "fbgemm_fp8": FbgemmFp8Config,
+    "higgs": HiggsConfig,
+    "torchao": TorchAoConfig,
+    "bitnet": BitNetConfig,
+    "vptq": VptqConfig,
+}
+
+
+class AutoQuantizationConfig:
+    """
+    The Auto-HF quantization config class that takes care of automatically dispatching to the correct
+    quantization config given a quantization config stored in a dictionary.
+    """
+
+    @classmethod
+    def from_dict(cls, quantization_config_dict: Dict):
+        quant_method = quantization_config_dict.get("quant_method", None)
+        # We need special care for bnb models to make sure everything stays backward compatible.
+        if quantization_config_dict.get("load_in_8bit", False) or quantization_config_dict.get("load_in_4bit", False):
+            suffix = "_4bit" if quantization_config_dict.get("load_in_4bit", False) else "_8bit"
+            quant_method = QuantizationMethod.BITS_AND_BYTES + suffix
+        elif quant_method is None:
+            raise ValueError(
+                "The model's quantization config from the arguments has no `quant_method` attribute. Make sure that the model has been correctly quantized."
+            )
+
+        if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING.keys():
+            raise ValueError(
+                f"Unknown quantization type, got {quant_method} - supported types are:"
+                f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
+            )
+
+        target_cls = AUTO_QUANTIZATION_CONFIG_MAPPING[quant_method]
+        return target_cls.from_dict(quantization_config_dict)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        if getattr(model_config, "quantization_config", None) is None:
+            raise ValueError(
+                f"Did not find a `quantization_config` in {pretrained_model_name_or_path}. Make sure that the model is correctly quantized."
+            )
+        quantization_config_dict = model_config.quantization_config
+        quantization_config = cls.from_dict(quantization_config_dict)
+        # Update with potential kwargs that are passed through from_pretrained.
+        quantization_config.update(**kwargs)
+        return quantization_config
+
+
+class AutoHfQuantizer:
+    """
+    The Auto-HF quantizer class that takes care of automatically instantiating the correct
+    `HfQuantizer` given the `QuantizationConfig`.
+    """
+
+    @classmethod
+    def from_config(cls, quantization_config: Union[QuantizationConfigMixin, Dict], **kwargs):
+        # Convert it to a QuantizationConfig if the q_config is a dict
+        if isinstance(quantization_config, dict):
+            quantization_config = AutoQuantizationConfig.from_dict(quantization_config)
+
+        quant_method = quantization_config.quant_method
+
+        # Again, we need special care for bnb as we have a single quantization config
+        # class for both 4-bit and 8-bit quantization
+        if quant_method == QuantizationMethod.BITS_AND_BYTES:
+            if quantization_config.load_in_8bit:
+                quant_method += "_8bit"
+            else:
+                quant_method += "_4bit"
+
+        if quant_method not in AUTO_QUANTIZER_MAPPING.keys():
+            raise ValueError(
+                f"Unknown quantization type, got {quant_method} - supported types are:"
+                f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
+            )
+
+        target_cls = AUTO_QUANTIZER_MAPPING[quant_method]
+        return target_cls(quantization_config, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        quantization_config = AutoQuantizationConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        return cls.from_config(quantization_config)
+
+    @classmethod
+    def merge_quantization_configs(
+        cls,
+        quantization_config: Union[dict, QuantizationConfigMixin],
+        quantization_config_from_args: Optional[QuantizationConfigMixin],
+    ):
+        """
+        Handles situations where both a quantization config from the args and a quantization config
+        from the model config are present.
+        """
+        if quantization_config_from_args is not None:
+            warning_msg = (
+                "You passed `quantization_config` or equivalent parameters to `from_pretrained` but the model you're loading"
+                " already has a `quantization_config` attribute. The `quantization_config` from the model will be used."
+            )
+        else:
+            warning_msg = ""
+
+        if isinstance(quantization_config, dict):
+            quantization_config = AutoQuantizationConfig.from_dict(quantization_config)
+
+        if (
+            isinstance(quantization_config, (GPTQConfig, AwqConfig, FbgemmFp8Config, CompressedTensorsConfig))
+            and quantization_config_from_args is not None
+        ):
+            # special case for GPTQ / AWQ / FbgemmFp8 config collision
+            loading_attr_dict = quantization_config_from_args.get_loading_attributes()
+            for attr, val in loading_attr_dict.items():
+                setattr(quantization_config, attr, val)
+
+            warning_msg += f" However, loading attributes (e.g. {list(loading_attr_dict.keys())}) will be overwritten with the ones you passed to `from_pretrained`. The rest will be ignored."
+
+        if warning_msg != "":
+            warnings.warn(warning_msg)
+
+        return quantization_config
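A minimal usage sketch of the dispatch logic in `auto.py` above (illustration only, not part of the diff): a raw config dict with no `quant_method` key takes the bitsandbytes back-compat branch in `AutoQuantizationConfig.from_dict` and resolves to the 4-bit quantizer. This assumes the transformers version vendored here; constructing the quantizer object does not itself require bitsandbytes to be installed.

```python
# Illustration only: how AutoHfQuantizer resolves a bare config dict, e.g.
# one read from a checkpoint's config.json.
from transformers.quantizers import AutoHfQuantizer
from transformers.utils.quantization_config import BitsAndBytesConfig

# `quant_method` is absent, so from_dict() derives "bitsandbytes_4bit"
# from the load_in_4bit flag before looking up AUTO_QUANTIZER_MAPPING.
config_dict = {"load_in_4bit": True}
quantizer = AutoHfQuantizer.from_config(config_dict)

print(type(quantizer).__name__)  # Bnb4BitHfQuantizer
print(isinstance(quantizer.quantization_config, BitsAndBytesConfig))  # True
```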
+ """ + + requires_calibration = False + required_packages = None + requires_parameters_quantization = False + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + self.quantization_config = quantization_config + + # -- Handle extra kwargs below -- + self.modules_to_not_convert = kwargs.pop("modules_to_not_convert", []) + self.pre_quantized = kwargs.pop("pre_quantized", True) + + if not self.pre_quantized and self.requires_calibration: + raise ValueError( + f"The quantization method {quantization_config.quant_method} does require the model to be pre-quantized." + f" You explicitly passed `pre_quantized=False` meaning your model weights are not quantized. Make sure to " + f"pass `pre_quantized=True` while knowing what you are doing." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + """ + Some quantization methods require to explicitly set the dtype of the model to a + target dtype. You need to override this method in case you want to make sure that behavior is + preserved + + Args: + torch_dtype (`torch.dtype`): + The input dtype that is passed in `from_pretrained` + """ + return torch_dtype + + def update_device_map(self, device_map: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + """ + Override this method if you want to pass a override the existing device map with a new + one. E.g. for bitsandbytes, since `accelerate` is a hard requirement, if no device_map is + passed, the device_map is set to `"auto"`` + + Args: + device_map (`Union[dict, str]`, *optional*): + The device_map that is passed through the `from_pretrained` method. + """ + return device_map + + def adjust_target_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + """ + Override this method if you want to adjust the `target_dtype` variable used in `from_pretrained` + to compute the device_map in case the device_map is a `str`. E.g. for bitsandbytes we force-set `target_dtype` + to `torch.int8` and for 4-bit we pass a custom enum `accelerate.CustomDtype.int4`. + + Args: + torch_dtype (`torch.dtype`, *optional*): + The torch_dtype that is used to compute the device_map. + """ + return torch_dtype + + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: + """ + Override this method if you want to adjust the `missing_keys`. + + Args: + missing_keys (`List[str]`, *optional*): + The list of missing keys in the checkpoint compared to the state dict of the model + """ + return missing_keys + + def update_expected_keys(self, model, expected_keys: List[str], loaded_keys: List[str]) -> List[str]: + """ + Override this method if you want to adjust the `update_expected_keys`. + + Args: + expected_keys (`List[str]`, *optional*): + The list of the expected keys in the initialized model. + loaded_keys (`List[str]`, *optional*): + The list of the loaded keys in the checkpoint. + """ + return expected_keys + + def get_special_dtypes_update(self, model, torch_dtype: "torch.dtype") -> Dict[str, "torch.dtype"]: + """ + returns dtypes for modules that are not quantized - used for the computation of the device_map in case + one passes a str as a device_map. The method will use the `modules_to_not_convert` that is modified + in `_process_model_before_weight_loading`. + + Args: + model (`~transformers.PreTrainedModel`): + The model to quantize + torch_dtype (`torch.dtype`): + The dtype passed in `from_pretrained` method. 
+ """ + + return { + name: torch_dtype + for name, _ in model.named_parameters() + if any(m in name for m in self.modules_to_not_convert) + } + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + """adjust max_memory argument for infer_auto_device_map() if extra memory is needed for quantization""" + return max_memory + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + """ + checks if a loaded state_dict component is part of quantized param + some validation; only defined if + requires_parameters_quantization == True for quantization methods that require to create a new parameters + for quantization. + """ + return False + + def create_quantized_param(self, *args, **kwargs) -> "torch.nn.Parameter": + """ + takes needed components from state_dict and creates quantized param; only applicable if + requires_parameters_quantization == True + """ + if not self.requires_parameters_quantization: + raise AttributeError( + f"`.create_quantized_param()` method is not supported by quantizer class {self.__class__.__name__}." + ) + + def validate_environment(self, *args, **kwargs): + """ + This method is used to potentially check for potential conflicts with arguments that are + passed in `from_pretrained`. You need to define it for all future quantizers that are integrated with transformers. + If no explicit check are needed, simply return nothing. + """ + return + + def preprocess_model(self, model: "PreTrainedModel", **kwargs): + """ + Setting model attributes and/or converting model before weights loading. At this point + the model should be initialized on the meta device so you can freely manipulate the skeleton + of the model in order to replace modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`. + + Args: + model (`~transformers.PreTrainedModel`): + The model to quantize + kwargs (`dict`, *optional*): + The keyword arguments that are passed along `_process_model_before_weight_loading`. + """ + model.is_quantized = True + model.quantization_method = self.quantization_config.quant_method + return self._process_model_before_weight_loading(model, **kwargs) + + def postprocess_model(self, model: "PreTrainedModel", **kwargs): + """ + Post-process the model post weights loading. + Make sure to override the abstract method `_process_model_after_weight_loading`. + + Args: + model (`~transformers.PreTrainedModel`): + The model to quantize + kwargs (`dict`, *optional*): + The keyword arguments that are passed along `_process_model_after_weight_loading`. + """ + return self._process_model_after_weight_loading(model, **kwargs) + + def dequantize(self, model): + """ + Potentially dequantize the model to retrive the original model, with some loss in accuracy / performance. + Note not all quantization schemes support this. + """ + model = self._dequantize(model) + + # Delete quantizer and quantization config + del model.hf_quantizer + del model.config.quantization_config + del model.config._pre_quantization_dtype + model.is_quantized = False + + return model + + def _dequantize(self, model): + raise NotImplementedError( + f"{self.quantization_config.quant_method} has no implementation of `dequantize`, please raise an issue on GitHub." 
+ ) + + @property + def is_qat_trainable(self) -> bool: + """Flag indicating whether the quantized model can carry out quantization aware training""" + return False + + @abstractmethod + def _process_model_before_weight_loading(self, model, **kwargs): ... + + @abstractmethod + def _process_model_after_weight_loading(self, model, **kwargs): ... + + @abstractmethod + def is_serializable(self, safe_serialization=None): ... + + @property + @abstractmethod + def is_trainable(self): ... diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py new file mode 100644 index 0000000000000000000000000000000000000000..9d1d6f7e89f1e9aa2454a1fd56c81524660f9079 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py @@ -0,0 +1,97 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Optional + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..integrations import replace_with_aqlm_linear +from ..utils import is_accelerate_available, is_aqlm_available, is_torch_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class AqlmHfQuantizer(HfQuantizer): + """ + Quantizer of the AQLM method. Enables the loading of prequantized models. + """ + + requires_calibration = True + required_packages = ["aqlm"] + optimum_quantizer = None + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Using `aqlm` quantization requires Accelerate: `pip install accelerate`") + + if not is_aqlm_available(): + raise ImportError("Using `aqlm` quantization requires AQLM: `pip install aqlm[gpu,cpu]`") + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + if torch.cuda.is_available(): + torch_dtype = torch.float16 + logger.info( + "CUDA available. Assuming AQLM inference on GPU and loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually." + ) + else: + torch_dtype = torch.float32 + logger.info( + "CUDA is unavailable. Assuming AQLM inference on CPU and loading the model in `torch.float32`. To overwrite it, set `torch_dtype` manually." 
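To make the `HfQuantizer` contract above concrete, here is a hypothetical minimal subclass (illustration only, not part of the diff). It shows exactly which abstract hooks a new quantization backend must implement; the class name and its no-op behavior are invented for the example.

```python
# Hypothetical minimal HfQuantizer subclass: the smallest surface a new
# quantization backend must provide on top of the abstract base class.
from transformers.quantizers.base import HfQuantizer


class NoOpQuantizer(HfQuantizer):
    """Does nothing; exists only to show the required overrides."""

    requires_calibration = False

    def validate_environment(self, *args, **kwargs):
        return  # no extra packages to check for

    def _process_model_before_weight_loading(self, model, **kwargs):
        # A real backend would swap nn.Linear modules for quantized layers
        # here, while the model still lives on the meta device.
        return model

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self):
        return False
```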
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1d6f7e89f1e9aa2454a1fd56c81524660f9079
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py
@@ -0,0 +1,97 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+from typing import TYPE_CHECKING, Optional
+
+from packaging import version
+
+from .base import HfQuantizer
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from ..integrations import replace_with_aqlm_linear
+from ..utils import is_accelerate_available, is_aqlm_available, is_torch_available, logging
+from ..utils.quantization_config import QuantizationConfigMixin
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+class AqlmHfQuantizer(HfQuantizer):
+    """
+    Quantizer of the AQLM method. Enables the loading of prequantized models.
+    """
+
+    requires_calibration = True
+    required_packages = ["aqlm"]
+    optimum_quantizer = None
+
+    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+        self.quantization_config = quantization_config
+
+    def validate_environment(self, *args, **kwargs):
+        if not is_accelerate_available():
+            raise ImportError("Using `aqlm` quantization requires Accelerate: `pip install accelerate`")
+
+        if not is_aqlm_available():
+            raise ImportError("Using `aqlm` quantization requires AQLM: `pip install aqlm[gpu,cpu]`")
+
+    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
+        if torch_dtype is None:
+            if torch.cuda.is_available():
+                torch_dtype = torch.float16
+                logger.info(
+                    "CUDA available. Assuming AQLM inference on GPU and loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually."
+                )
+            else:
+                torch_dtype = torch.float32
+                logger.info(
+                    "CUDA is unavailable. Assuming AQLM inference on CPU and loading the model in `torch.float32`. To overwrite it, set `torch_dtype` manually."
+                )
+        return torch_dtype
+
+    def _process_model_before_weight_loading(
+        self,
+        model: "PreTrainedModel",
+        **kwargs,
+    ):
+        replace_with_aqlm_linear(
+            model,
+            quantization_config=self.quantization_config,
+            linear_weights_not_to_quantize=self.quantization_config.linear_weights_not_to_quantize,
+        )
+        model.config.quantization_config = self.quantization_config
+
+    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        return model
+
+    @property
+    def is_trainable(self, model: Optional["PreTrainedModel"] = None):
+        aqlm_supports_training = version.parse(importlib.metadata.version("aqlm")) >= version.parse("1.0.2")
+        if aqlm_supports_training:
+            return True
+        else:
+            logger.warning(
+                f"Currently installed `aqlm` version ({importlib.metadata.version('aqlm')}) doesn't support training. If you wish to train a quantized model, please update `aqlm` with `pip install aqlm>=1.0.2`"
+            )
+            return False
+
+    def is_serializable(self, safe_serialization=None):
+        return True
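A short illustration of the dtype defaulting in `AqlmHfQuantizer.update_torch_dtype` above (hypothetical usage, not part of the diff; assumes torch is installed, but the aqlm package is not needed just to exercise this method, since environment validation happens separately):

```python
# Illustration only: AQLM derives a default torch_dtype from the hardware
# when the caller does not pin one explicitly.
import torch
from transformers.quantizers.quantizer_aqlm import AqlmHfQuantizer
from transformers.utils.quantization_config import AqlmConfig

quantizer = AqlmHfQuantizer(AqlmConfig())
dtype = quantizer.update_torch_dtype(None)
# float16 on a CUDA machine, float32 on CPU-only hosts
assert dtype in (torch.float16, torch.float32)
```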
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_awq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_awq.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a756b23a07e74253aeff1f22499f777126c2e6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_awq.py
@@ -0,0 +1,152 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib.metadata
+from typing import TYPE_CHECKING
+
+from packaging import version
+
+from .base import HfQuantizer
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from ..utils import is_accelerate_available, is_auto_awq_available, is_torch_available, logging
+from ..utils.quantization_config import AWQLinearVersion
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+class AwqQuantizer(HfQuantizer):
+    """
+    4-bit quantization for Activation-aware Weight Quantization (AWQ) (https://arxiv.org/abs/2306.00978)
+    """
+
+    # AWQ requires data calibration - we support only inference
+    requires_calibration = True
+
+    required_packages = ["awq", "accelerate"]
+
+    def __init__(self, quantization_config, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+
+    def validate_environment(self, device_map, **kwargs):
+        if not is_auto_awq_available():
+            raise ImportError("Loading an AWQ quantized model requires the auto-awq library (`pip install autoawq`)")
+
+        if not is_accelerate_available():
+            raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)")
+
+        if self.quantization_config.version == AWQLinearVersion.GEMM and not torch.cuda.is_available():
+            logger.warning_once("No CUDA found, replacing GEMM with the IPEX version to support non-CUDA AWQ models.")
+            self.quantization_config.version = AWQLinearVersion.IPEX
+
+        if self.quantization_config.version == AWQLinearVersion.IPEX:
+            if version.parse(importlib.metadata.version("autoawq")) < version.parse("0.2.6"):
+                raise RuntimeError(
+                    "To use the IPEX backend, you need autoawq>=0.2.6. Please install the latest version or install from source."
+                )
+            if device_map is None:
+                logger.warning_once(
+                    "You have loaded an AWQ model without setting device_map; please set it to 'cpu', 'xpu' or 'auto'"
+                )
+            elif isinstance(device_map, dict) and "disk" in device_map.values():
+                raise ValueError(
+                    "You are attempting to load an IPEX version AWQ model with a device_map that contains a disk device."
+                    " This is not supported. Please make sure only cpu and xpu are in the device_map."
+                )
+        else:
+            if not torch.cuda.is_available():
+                raise RuntimeError(
+                    "A GPU is required to run AWQ quantized models. You can use the IPEX version of AWQ if you have an Intel CPU"
+                )
+
+            if device_map is None:
+                logger.warning_once(
+                    "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set "
+                    "your model on a GPU device in order to run your model."
+                )
+            elif device_map is not None:
+                if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
+                    raise ValueError(
+                        "You are attempting to load an AWQ model with a device_map that contains a CPU or disk device."
+                        " This is not supported. Please remove the CPU or disk device from the device_map."
+                    )
+
+    def update_torch_dtype(self, torch_dtype):
+        if torch_dtype is None:
+            torch_dtype = torch.float16
+            logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.")
+        elif torch_dtype != torch.float16:
+            logger.warning("We suggest you set `torch_dtype=torch.float16` for better efficiency with AWQ.")
+        return torch_dtype
+
+    def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        from ..integrations import get_keys_to_not_convert, replace_quantization_scales, replace_with_awq_linear
+
+        self.modules_to_not_convert = get_keys_to_not_convert(model)
+
+        if self.quantization_config.modules_to_not_convert is not None:
+            self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)
+
+        model, has_been_replaced = replace_with_awq_linear(
+            model, quantization_config=self.quantization_config, modules_to_not_convert=self.modules_to_not_convert
+        )
+
+        model = replace_quantization_scales(model, model.config.model_type)
+
+        if not has_been_replaced:
+            logger.warning(
+                "You are loading an AWQ model but no linear modules were found in your model."
+                " Please double check your model architecture, or submit an issue on github if you think this is a bug."
+            )
+
+    def _process_model_after_weight_loading(self, model, **kwargs):
+        if self.quantization_config.do_fuse:
+            from ..integrations import fuse_awq_modules
+
+            model = fuse_awq_modules(model, self.quantization_config)
+            model._awq_is_fused = True  # TODO: consider storing this flag in model.config instead
+
+        if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
+            from ..integrations import post_init_awq_exllama_modules
+
+            model = post_init_awq_exllama_modules(model, self.quantization_config.exllama_config)
+
+        if self.quantization_config.version == AWQLinearVersion.IPEX:
+            from ..integrations import post_init_awq_ipex_modules
+
+            model = post_init_awq_ipex_modules(model)
+
+    def is_serializable(self, safe_serialization=None):
+        # AWQ through auto-awq has always been serializable, except if the model is fused.
+        if self.quantization_config.do_fuse:
+            logger.warning("You cannot save an AWQ model that uses fused modules!")
+            return False
+
+        if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
+            logger.warning("You cannot save an AWQ model that uses the Exllama backend!")
+            return False
+
+        return True
+
+    @property
+    def is_trainable(self):
+        # AWQ supports PEFT fine-tuning from version 0.2.0
+        MIN_AWQ_VERSION_FOR_PEFT = "0.2.0"
+        return version.parse(importlib.metadata.version("autoawq")) >= version.parse(MIN_AWQ_VERSION_FOR_PEFT)
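A sketch of what `AwqQuantizer.update_torch_dtype` above enforces (hypothetical usage, not part of the diff): AWQ kernels are fp16-oriented, so `None` becomes `torch.float16`, and any other dtype is kept but triggers an efficiency warning. Assumes torch is installed; the quantizer object itself can be constructed without autoawq.

```python
# Illustration only: AWQ's dtype policy.
import torch
from transformers.quantizers.quantizer_awq import AwqQuantizer
from transformers.utils.quantization_config import AwqConfig

quantizer = AwqQuantizer(AwqConfig(bits=4))
assert quantizer.update_torch_dtype(None) == torch.float16
# Logs a warning but respects the caller's choice:
assert quantizer.update_torch_dtype(torch.bfloat16) == torch.bfloat16
```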
+from typing import TYPE_CHECKING, Dict, List, Union + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_torch_available, logging + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class BitNetHfQuantizer(HfQuantizer): + """ + 1.58-bit quantization from the BitNet quantization method: + Before loading: converts the linear layers into BitLinear layers. + + Check out the paper introducing this method: https://arxiv.org/pdf/2402.17764 + """ + + requires_parameters_quantization = False + requires_calibration = True + + required_packages = ["accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Loading a BitNet quantized model requires accelerate (`pip install accelerate`)") + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Loading ternary weights from tf/flax is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + if not torch.cuda.is_available(): + logger.warning_once( + "You don't have a GPU available to load the model; inference will be slow because of weight unpacking" + ) + return + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded a BitNet model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model." + ) + elif device_map is not None: + if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load a BitNet model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." 
+ ) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_bitnet_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_bitnet_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + target_dtype = torch.int8 + return target_dtype + + def is_serializable(self, safe_serialization=None): + return True + + @property + def is_trainable(self) -> bool: + return False diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_4bit.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_4bit.py new file mode 100644 index 0000000000000000000000000000000000000000..8657bda166254df45217519c5119f7eff3f1566e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_4bit.py @@ -0,0 +1,362 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
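The adjust_max_memory override in BitNetHfQuantizer above implements a simple head-room rule: every device budget is scaled to 90%, presumably so that temporary buffers created while unpacking ternary weights still fit. A minimal sketch of the same rule, with illustrative byte budgets:

# Illustrative budgets; the 0.90 factor mirrors adjust_max_memory above.
max_memory = {0: 24 * 1024**3, "cpu": 64 * 1024**3}  # device -> bytes available
adjusted = {key: val * 0.90 for key, val in max_memory.items()}
assert adjusted[0] == 24 * 1024**3 * 0.90  # 10% reserved on every device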
+import importlib +from functools import cached_property +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from packaging import version + +from .base import HfQuantizer +from .quantizers_utils import get_module_from_name + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import ( + ACCELERATE_MIN_VERSION, + is_accelerate_available, + is_bitsandbytes_available, + is_torch_available, + is_torch_npu_available, + is_torch_xpu_available, + logging, +) + + +if is_torch_available(): + import torch + + from ..pytorch_utils import Conv1D + +logger = logging.get_logger(__name__) + + +class Bnb4BitHfQuantizer(HfQuantizer): + """ + 4-bit quantization from bitsandbytes.py quantization method: + before loading: converts transformer layers into Linear4bit during loading: load 16bit weight and pass to the + layer object after: quantizes individual weights in Linear4bit into 4bit at the first .cuda() call + saving: + from state dict, as usual; saves weights and `quant_state` components + loading: + need to locate `quant_state` components and pass to Param4bit constructor + """ + + use_keep_in_fp32_modules = True + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["bitsandbytes", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + if self.quantization_config.llm_int8_skip_modules is not None: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError( + f"Using `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" + ) + if not is_bitsandbytes_available(): + raise ImportError( + "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" + ) + + from ..integrations import validate_bnb_backend_availability + from ..utils import is_bitsandbytes_multi_backend_available + + bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() + validate_bnb_backend_availability(raise_exception=True) + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Converting into 4-bit or 8-bit weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + device_map = kwargs.get("device_map", None) + if ( + device_map is not None + and isinstance(device_map, dict) + and not self.quantization_config.llm_int8_enable_fp32_cpu_offload + ): + device_map_without_lm_head = { + key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert + } + if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled: + pass + elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): + raise ValueError( + "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " + "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " + "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to " + "`from_pretrained`. Check " + "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu " + "for more details. 
" + ) + + if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.39.0"): + raise ValueError( + "You have a version of `bitsandbytes` that is not compatible with 4bit inference and training" + " make sure you have the latest version of `bitsandbytes` installed" + ) + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"): + from accelerate.utils import CustomDtype + + if target_dtype != torch.int8: + logger.info("target_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantization") + return CustomDtype.INT4 + else: + raise ValueError( + "You are using `device_map='auto'` on a 4bit loaded version of the model. To automatically compute" + " the appropriate device map, you should upgrade your `accelerate` library," + "`pip install --upgrade accelerate` or install it from source to support fp4 auto device map" + "calculation. You may encounter unexpected behavior, or pass your own device map" + ) + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module._parameters.get(tensor_name, None), bnb.nn.Params4bit): + # Add here check for loaded components' dtypes once serialization is implemented + return True + elif isinstance(module, bnb.nn.Linear4bit) and tensor_name == "bias": + # bias could be loaded by regular set_module_tensor_to_device() from accelerate, + # but it would wrongly use uninitialized weight there. + return True + else: + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + combines logic from _load_state_dict_into_meta_model and .integrations.bitsandbytes.py::set_module_quantized_tensor_to_device() + """ + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + + old_value = getattr(module, tensor_name) + + # `torch.Tensor.to()` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)). + if isinstance(target_device, int) and is_torch_npu_available(): + target_device = f"npu:{target_device}" + if tensor_name == "bias": + if param_value is None: + new_value = old_value.to(target_device) + else: + new_value = param_value.to(target_device) + + new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad) + module._parameters[tensor_name] = new_value + return + + if not isinstance(module._parameters[tensor_name], bnb.nn.Params4bit): + raise ValueError("this function only loads `Linear4bit components`") + if ( + old_value.device == torch.device("meta") + and target_device not in ["meta", torch.device("meta")] + and param_value is None + ): + raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") + + # construct `new_value` for the module._parameters[tensor_name]: + if self.pre_quantized: + # 4bit loading. 
Collecting components for restoring quantized weight + # This can be expanded to make a universal call for any quantized weight loading + + if not self.is_serializable: + raise ValueError( + "Detected int4 weights but the version of bitsandbytes is not compatible with int4 serialization. " + "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." + ) + + if (param_name + ".quant_state.bitsandbytes__fp4" not in state_dict) and ( + param_name + ".quant_state.bitsandbytes__nf4" not in state_dict + ): + raise ValueError( + f"Supplied state dict for {param_name} does not contain `bitsandbytes__*` and possibly other `quantized_stats` components." + ) + + quantized_stats = {} + for k, v in state_dict.items(): + if param_name + "." in k: + quantized_stats[k] = v + if unexpected_keys is not None and k in unexpected_keys: + unexpected_keys.remove(k) + + param_kwargs = {} + if self.is_bnb_supports_quant_storage_module: + param_kwargs["module"] = module + + new_value = bnb.nn.Params4bit.from_prequantized( + data=param_value, + quantized_stats=quantized_stats, + requires_grad=False, + device=target_device, + **param_kwargs, + ) + else: + new_value = param_value.to("cpu") + + # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization. + # Since weights are saved in the correct "orientation", we skip transposing when loading. + if issubclass(module.source_cls, Conv1D): + new_value = new_value.T + + kwargs = old_value.__dict__ + new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device) + + module._parameters[tensor_name] = new_value + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.adjust_max_memory + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + # need more space for buffers that are created during quantization + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.update_torch_dtype + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + # We force the `dtype` to be float16, this is a requirement from `bitsandbytes` + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to " + "requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. " + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.float16 to remove this warning.", + torch_dtype, + ) + torch_dtype = torch.float16 + return torch_dtype + + def update_device_map(self, device_map): + if device_map is None: + if torch.cuda.is_available(): + device_map = {"": torch.cuda.current_device()} + elif is_torch_npu_available(): + device_map = {"": f"npu:{torch.npu.current_device()}"} + elif is_torch_xpu_available(): + device_map = {"": f"xpu:{torch.xpu.current_device()}"} + else: + device_map = {"": "cpu"} + logger.info( + "The device_map was not initialized. " + f"Setting device_map to {device_map}. 
" + "If you want to use the model for inference, please set device_map ='auto' " + ) + return device_map + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer._process_model_before_weight_loading + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear + + llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload + + # We keep some modules such as the lm_head in their original dtype for numerical stability reasons + if self.quantization_config.llm_int8_skip_modules is None: + self.modules_to_not_convert = get_keys_to_not_convert(model) + else: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk` + if isinstance(device_map, dict) and len(device_map.keys()) > 1: + keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]] + + if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload: + raise ValueError( + "If you want to offload some keys to `cpu` or `disk`, you need to set " + "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be " + " converted to 8-bit but kept in 32-bit." + ) + self.modules_to_not_convert.extend(keys_on_cpu) + + model = replace_with_bnb_linear( + model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config + ) + # TODO: consider bringing replace_with_bnb_linear() code from ..integrations/bitsandbyter.py to here + + model.config.quantization_config = self.quantization_config + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer._process_model_after_weight_loading with 8bit->4bit + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + model.is_loaded_in_4bit = True + model.is_4bit_serializable = self.is_serializable() + return model + + def is_serializable(self, safe_serialization=None): + _is_4bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.41.3") + + if not _is_4bit_serializable: + logger.warning( + "You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. " + "If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed." 
+ ) + return False + + return True + + @cached_property + def is_bnb_supports_quant_storage_module(self) -> bool: + """ + determines if the current version of bitsandbytes supports + the `module` parameter in `Params4bit.from_prequantized` + """ + return version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.43.3") + + @property + def is_trainable(self) -> bool: + return True + + def _dequantize(self, model): + from ..integrations import dequantize_and_replace + + model = dequantize_and_replace( + model, self.modules_to_not_convert, quantization_config=self.quantization_config + ) + return model diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_8bit.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_8bit.py new file mode 100644 index 0000000000000000000000000000000000000000..093d612b914cefbb9e472bfa4f68f42d914ff480 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_8bit.py @@ -0,0 +1,310 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import ( + ACCELERATE_MIN_VERSION, + is_accelerate_available, + is_bitsandbytes_available, + is_torch_available, + is_torch_xpu_available, + logging, +) +from .quantizers_utils import get_module_from_name + + +if is_torch_available(): + import torch + + from ..pytorch_utils import Conv1D + +logger = logging.get_logger(__name__) + + +class Bnb8BitHfQuantizer(HfQuantizer): + """ + 8-bit quantization from bitsandbytes quantization method: + before loading: converts transformer layers into Linear8bitLt during loading: load 16bit weight and pass to the + layer object after: quantizes individual weights in Linear8bitLt into 8bit at the first .cuda() call + saving: + from state dict, as usual; saves weights and 'SCB' component + loading: + need to locate SCB component and pass to the Linear8bitLt object + """ + + use_keep_in_fp32_modules = True + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["bitsandbytes", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + if self.quantization_config.llm_int8_skip_modules is not None: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError( + f"Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" + ) + if not is_bitsandbytes_available(): + raise ImportError( + "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" + ) + + from 
..integrations import validate_bnb_backend_availability + from ..utils import is_bitsandbytes_multi_backend_available + + bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() + validate_bnb_backend_availability(raise_exception=True) + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Converting into 4-bit or 8-bit weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + device_map = kwargs.get("device_map", None) + if ( + device_map is not None + and isinstance(device_map, dict) + and not self.quantization_config.llm_int8_enable_fp32_cpu_offload + ): + device_map_without_lm_head = { + key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert + } + if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled: + pass + elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): + raise ValueError( + "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " + "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " + "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to " + "`from_pretrained`. Check " + "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu " + "for more details. " + ) + + if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.2"): + raise ValueError( + "You have a version of `bitsandbytes` that is not compatible with 8bit inference and training" + " make sure you have the latest version of `bitsandbytes` installed" + ) + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + # need more space for buffers that are created during quantization + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + # We force the `dtype` to be float16, this is a requirement from `bitsandbytes` + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to " + "requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. " + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.float16 to remove this warning.", + torch_dtype, + ) + torch_dtype = torch.float16 + return torch_dtype + + def update_device_map(self, device_map): + if device_map is None: + if torch.cuda.is_available(): + device_map = {"": torch.cuda.current_device()} + elif is_torch_xpu_available(): + device_map = {"": f"xpu:{torch.xpu.current_device()}"} + else: + device_map = {"": "cpu"} + logger.info( + "The device_map was not initialized. " + f"Setting device_map to {device_map}. 
" + "If you want to use the model for inference, please set device_map ='auto' " + ) + return device_map + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if target_dtype != torch.int8: + logger.info("target_dtype {target_dtype} is replaced by `torch.int8` for 8-bit BnB quantization") + return torch.int8 + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module._parameters.get(tensor_name, None), bnb.nn.Int8Params): + if self.pre_quantized: + if param_name.replace("weight", "SCB") not in state_dict.keys(): + raise ValueError("Missing quantization component `SCB`") + if param_value.dtype != torch.int8: + raise ValueError( + f"Incompatible dtype `{param_value.dtype}` when loading 8-bit prequantized weight. Expected `torch.int8`." + ) + return True + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + combines logic from _load_state_dict_into_meta_model and .integrations.bitsandbytes.py::set_module_quantized_tensor_to_device() + needs aux items from state dicts, if found - removes them from unexpected_keys + """ + import bitsandbytes as bnb + + fp16_statistics_key = param_name.replace("weight", "SCB") + fp16_weights_format_key = param_name.replace("weight", "weight_format") + + fp16_statistics = state_dict.get(fp16_statistics_key, None) + fp16_weights_format = state_dict.get(fp16_weights_format_key, None) + + module, tensor_name = get_module_from_name(model, param_name) + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + + old_value = getattr(module, tensor_name) + + if not isinstance(module._parameters[tensor_name], bnb.nn.Int8Params): + raise ValueError(f"Parameter `{tensor_name}` should only be a `bnb.nn.Int8Params` instance.") + if ( + old_value.device == torch.device("meta") + and target_device not in ["meta", torch.device("meta")] + and param_value is None + ): + raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") + + new_value = param_value.to("cpu") + if self.pre_quantized and not self.is_serializable(): + raise ValueError( + "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. " + "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." + ) + + # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization. + # Since weights are saved in the correct "orientation", we skip transposing when loading. 
+ if issubclass(module.source_cls, Conv1D): + if fp16_statistics is None: + new_value = new_value.T + + kwargs = old_value.__dict__ + new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(target_device) + + module._parameters[tensor_name] = new_value + if fp16_statistics is not None: + setattr(module.weight, "SCB", fp16_statistics.to(target_device)) + if unexpected_keys is not None: + unexpected_keys.remove(fp16_statistics_key) + + # We just need to pop the `weight_format` keys from the state dict to remove unneeded + # messages. The correct format is correctly retrieved during the first forward pass. + if fp16_weights_format is not None and unexpected_keys is not None: + unexpected_keys.remove(fp16_weights_format_key) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + model.is_loaded_in_8bit = True + model.is_8bit_serializable = self.is_serializable() + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear + + llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload + + # We keep some modules such as the lm_head in their original dtype for numerical stability reasons + if self.quantization_config.llm_int8_skip_modules is None: + self.modules_to_not_convert = get_keys_to_not_convert(model) + else: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk` + if isinstance(device_map, dict) and len(device_map.keys()) > 1: + keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]] + + if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload: + raise ValueError( + "If you want to offload some keys to `cpu` or `disk`, you need to set " + "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be " + " converted to 8-bit but kept in 32-bit." + ) + self.modules_to_not_convert.extend(keys_on_cpu) + + model = replace_with_bnb_linear( + model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config + ) + # TODO: consider bringing replace_with_bnb_linear() code from ..integrations/bitsandbytes.py to here + + model.config.quantization_config = self.quantization_config + + def is_serializable(self, safe_serialization=None): + _bnb_supports_8bit_serialization = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse( + "0.37.2" + ) + + if not _bnb_supports_8bit_serialization: + logger.warning( + "You are calling `save_pretrained` on an 8-bit converted model, but your `bitsandbytes` version doesn't support it. " + "If you want to save 8-bit models, make sure to have `bitsandbytes>0.37.2` installed. You will most likely face errors or" + " unexpected behaviours." 
+ ) + return False + + return True + + @property + def is_trainable(self) -> bool: + return version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.37.0") + + def _dequantize(self, model): + from ..integrations import dequantize_and_replace + + model = dequantize_and_replace( + model, self.modules_to_not_convert, quantization_config=self.quantization_config + ) + return model diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_compressed_tensors.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_compressed_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..7d208087bbbfece0a4bb47238a773b23e3dbcd77 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_compressed_tensors.py @@ -0,0 +1,131 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +from ..utils import is_compressed_tensors_available, is_torch_available, logging +from ..utils.quantization_config import CompressedTensorsConfig +from .base import HfQuantizer + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class CompressedTensorsHfQuantizer(HfQuantizer): + """ + Quantizer for the compressed_tensors package. Loads and restores models to + quantized state with compressed_tensors + """ + + requires_calibration = True + required_packages = ["compressed_tensors"] + + def __init__(self, quantization_config: CompressedTensorsConfig, **kwargs): + super().__init__(quantization_config, **kwargs) + + if not is_compressed_tensors_available(): + raise ImportError( + "Using `compressed_tensors` quantized models requires the compressed-tensors library: " + "`pip install compressed-tensors`" + ) + + from compressed_tensors.compressors import ModelCompressor + + self.compressor = ModelCompressor.from_compression_config(quantization_config) + self.run_compressed = quantization_config.run_compressed + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_compressed_tensors_available(): + raise ImportError( + "Using `compressed_tensors` quantized models requires the compressed-tensors library: " + "`pip install compressed-tensors`" + ) + if not is_torch_available(): + # torch already should be installed as part of compressed tensors + raise ImportError("torch is required for using compressed-tensors quantization") + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + logger.info("Loading model using torch.float16 for compressed-tensors quantization") + torch_dtype = torch.float16 + elif torch_dtype != torch.float16: + logger.info( + "We suggest you to set `torch_dtype=torch.float16` for better efficiency with compressed_tensors." 
+ ) + return torch_dtype + + def _process_model_before_weight_loading(self, model, **kwargs): + from compressed_tensors.quantization import apply_quantization_config + + ct_quantization_config = self.compressor.quantization_config + + if self.run_compressed and self.is_quantization_compressed: + apply_quantization_config(model, ct_quantization_config, run_compressed=True) + elif not self.is_quantization_compressed: + apply_quantization_config(model, ct_quantization_config) + + def _process_model_after_weight_loading(self, model, **kwargs): + """Decompress the loaded model if necessary - needed for QAT""" + + if (self.is_quantization_compressed and not self.run_compressed) or self.is_sparsification_compressed: + config = kwargs.get("config", None) + cache_path = config._name_or_path + + if not os.path.exists(cache_path): + from transformers.utils import cached_file + + config_file_path = cached_file(cache_path, "config.json") + cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) + + if self.is_quantization_compressed and not self.run_compressed: + from compressed_tensors.quantization import QuantizationStatus + + self.compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN + self.compressor.decompress(model_path=cache_path, model=model) + + @property + def is_quantization_compressed(self): + from compressed_tensors.quantization import QuantizationStatus + + return ( + self.quantization_config.quantization_config is not None + and self.quantization_config.quantization_config.quantization_status == QuantizationStatus.COMPRESSED + ) + + @property + def is_sparsification_compressed(self): + from compressed_tensors.config.base import CompressionFormat + + return ( + self.quantization_config.sparsity_config is not None + and self.quantization_config.sparsity_config.format != CompressionFormat.dense.value + ) + + @property + def is_trainable(self): + return True + + def is_qat_trainable(self) -> bool: + """Loaded models can carry out quantization-aware training""" + # models need to be decompressed to carry out QAT + return not self.run_compressed or not self.is_quantization_compressed + + def is_serializable(self, safe_serialization=None) -> bool: + """Models quantized using compressed tensors can be saved to disk""" + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_eetq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_eetq.py new file mode 100644 index 0000000000000000000000000000000000000000..7dfce75c373ad7c1411e99f0d3536bc0c475bfd1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_eetq.py @@ -0,0 +1,183 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
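As with the other prequantized formats, the CompressedTensorsHfQuantizer above is driven by the quantization_config embedded in the checkpoint; a usage sketch (the repo id below is hypothetical):

from transformers import AutoModelForCausalLM

# Loading a compressed-tensors checkpoint; the config ships inside the repo.
model = AutoModelForCausalLM.from_pretrained("some-org/llama-w4a16-compressed")
# If the config sets run_compressed=False, _process_model_after_weight_loading()
# decompresses the weights, which is what makes the model QAT-trainable
# (see is_qat_trainable() above).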
+from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_eetq_available, is_torch_available, logging +from .quantizers_utils import get_module_from_name + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class EetqHfQuantizer(HfQuantizer): + """ + 8-bit quantization from EETQ quantization method: + before loading: converts transformer layers into W8A16Linear during loading: load 16bit weight and pass to the + layer object after: quantizes individual weights in W8A16Linear into 8bit at the first .cuda() call + """ + + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["eetq", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_eetq_available(): + raise ImportError( + "Using `eetq` 8-bit quantization requires eetq. " + "Please install the latest version of eetq from: https://github.com/NetEase-FuXi/EETQ" + ) + + try: + import eetq # noqa: F401 + except ImportError as exc: + if "shard_checkpoint" in str(exc): + # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed + # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34. + # TODO: Update message once eetq releases a fix + raise ImportError( + "You are using a version of EETQ that is incompatible with the current transformers version. " + "Either downgrade transformers to <= v4.46.3 or, if available, upgrade EETQ to > v1.0.0." + ) from exc + else: + raise + + if not is_accelerate_available(): + raise ImportError("Loading an EETQ quantized model requires accelerate (`pip install accelerate`)") + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Converting into 8-bit weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + if not torch.cuda.is_available(): + raise RuntimeError("No GPU found. A GPU is needed for quantization.") + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded an EETQ model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model." + ) + elif device_map is not None: + if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load an EETQ model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.float16 + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to " + "requirements of `eetq` to enable model loading in 8-bit. 
" + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.float16 to remove this warning.", + torch_dtype, + ) + elif torch_dtype != torch.float16: + logger.info("We suggest you to set `torch_dtype=torch.float16` for better efficiency with EETQ.") + return torch_dtype + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + from eetq import EetqLinear + + module, tensor_name = get_module_from_name(model, param_name) + + if isinstance(module, EetqLinear): + if self.pre_quantized or tensor_name == "bias": + if tensor_name == "weight" and param_value.dtype != torch.int8: + raise ValueError("Expect quantized weights but got an unquantized weight") + return False + else: + if tensor_name == "weight_scale": + raise ValueError("Expect unquantized weights but got a quantized weight_scale") + return True + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + quantizes weights into qweight and weight_scales + """ + from eetq import quantize_and_preprocess_weights + + module, tensor_name = get_module_from_name(model, param_name) + new_value, weight_scale = quantize_and_preprocess_weights(param_value) + + module._buffers[tensor_name] = new_value.to(target_device) + module.register("weight_scales", weight_scale.to(target_device)) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_eetq_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_eetq_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + model.config.quantization_config = self.quantization_config + + def is_serializable(self, safe_serialization=None): + return True + + @property + def is_trainable(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.py new file mode 100644 index 0000000000000000000000000000000000000000..07d5ce87ef6cc16977762cf8671b8fba7fe24f8c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.py @@ -0,0 +1,204 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging +from .quantizers_utils import get_module_from_name + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class FbgemmFp8HfQuantizer(HfQuantizer): + """ + FP8 quantization using fbgemm kernels + """ + + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["fbgemm-gpu", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_torch_available() or version.parse(importlib.metadata.version("torch")) < version.parse("2.1.0"): + raise ImportError( + "Using fbgemm fp8 quantization requires torch >= 2.1.0. " + "Please install the latest version of torch (`pip install --upgrade torch`)" + ) + if not is_fbgemm_gpu_available(): + raise ImportError( + "Using fbgemm fp8 quantization requires the fbgemm-gpu library. " + "Please install the latest version of fbgemm-gpu by following: https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries" + ) + + if not is_accelerate_available("0.32.2"): + raise ImportError( + "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)" + ) + + if not torch.cuda.is_available(): + raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU") + + compute_capability = torch.cuda.get_device_capability() + major, minor = compute_capability + if major < 9: + raise ValueError( + "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)" + ) + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model. To remove this warning, pass device_map='cuda'. " + ) + elif device_map is not None: + if ( + not self.pre_quantized + and isinstance(device_map, dict) + and ("cpu" in device_map.values() or "disk" in device_map.values()) + ): + raise ValueError( + "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. " + "This is not supported when the model is quantized on the fly. " + "Please use a quantized checkpoint or remove the CPU or disk device from the device_map." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.bfloat16 + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to " + "requirements of `fbgemm-gpu` to enable model loading in fp8. " + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.bfloat16 to remove this warning.", + torch_dtype, + ) + elif torch_dtype == torch.float16: + raise ValueError( + "You cannot use FP8 with torch_dtype=torch.float16. " + "We recommend you pass torch_dtype=torch.bfloat16 instead." + ) + return torch_dtype + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + from ..integrations import FbgemmFp8Linear + + module, tensor_name = get_module_from_name(model, param_name) + + if isinstance(module, FbgemmFp8Linear): + if self.pre_quantized or tensor_name == "bias": + if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn: + raise ValueError("Expect quantized weights but got an unquantized weight") + return False + else: + if tensor_name == "weight_scale": + raise ValueError("Expect unquantized weights but got a quantized weight_scale") + return True + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + Quantizes weights into weight and weight_scale + """ + new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value) + + module, tensor_name = get_module_from_name(model, param_name) + module._buffers[tensor_name] = new_value.to(target_device) + # to have the right output shape -> (out_features, 1) + module._buffers["weight_scale"] = weight_scale.view(weight_scale.shape[0], 1).to(target_device) + + if unexpected_keys is not None and param_name in unexpected_keys: + unexpected_keys.remove(param_name) + del param_name + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_fbgemm_fp8_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_fbgemm_fp8_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + model.config.quantization_config = self.quantization_config + + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: + from ..integrations import FbgemmFp8Linear + + not_missing_keys = [] + for name, module in model.named_modules(): + if isinstance(module, FbgemmFp8Linear): + for missing in missing_keys: + if ( + (name in missing or name in f"{prefix}.{missing}") + and not missing.endswith(".weight") + and not missing.endswith(".bias") + ): + not_missing_keys.append(missing) + return [k for k in missing_keys if k not in not_missing_keys] + + def is_serializable(self, safe_serialization=None): + return True + + @property + def is_trainable(self) -> bool: + return False diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_gptq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_gptq.py new file mode 100644 index 0000000000000000000000000000000000000000..d47a2ba79cb60da847000f13a1de0527703ce0e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_gptq.py @@ -0,0 +1,101 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Optional + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_auto_gptq_available, is_optimum_available, is_torch_available, logging +from ..utils.quantization_config import GPTQConfig, QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class GptqHfQuantizer(HfQuantizer): + """ + Quantizer of the GPTQ method - for GPTQ the quantizer supports calibration of the model through the + `auto_gptq` package. Quantization is done under the hood for users if they load a non-prequantized model. + """ + + requires_calibration = False + required_packages = ["optimum", "auto_gptq"] + optimum_quantizer = None + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + + if not is_optimum_available(): + raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") + from optimum.gptq import GPTQQuantizer + + self.optimum_quantizer = GPTQQuantizer.from_dict(self.quantization_config.to_dict_optimum()) + + def validate_environment(self, *args, **kwargs): + if not is_optimum_available(): + raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") + + if not is_auto_gptq_available(): + raise ImportError( + "Loading a GPTQ quantized model requires the auto-gptq library (`pip install auto-gptq`)" + ) + + gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") + if not gptq_supports_cpu and not torch.cuda.is_available(): + raise RuntimeError("A GPU is required to quantize or run a quantized model.") + elif version.parse(importlib.metadata.version("auto_gptq")) < version.parse("0.4.2"): + raise ImportError( + "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq`" + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.float16 + logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.") + elif torch_dtype != torch.float16: + logger.info("We suggest you set `torch_dtype=torch.float16` for better efficiency with GPTQ.") + return torch_dtype + + def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs): + if model.__class__.main_input_name != "input_ids": + raise RuntimeError("We can only quantize pure text models.") + + if self.pre_quantized: + model = self.optimum_quantizer.convert_model(model) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + if self.pre_quantized: + model = self.optimum_quantizer.post_init_model(model) + else: + if self.quantization_config.tokenizer is None: + self.quantization_config.tokenizer = model.name_or_path + + self.optimum_quantizer.quantize_model(model, self.quantization_config.tokenizer) + model.config.quantization_config = GPTQConfig.from_dict(self.optimum_quantizer.to_dict()) + + @property + def is_trainable(self, model: Optional["PreTrainedModel"] = None): + return True + + def is_serializable(self, safe_serialization=None): + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_higgs.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_higgs.py new file mode 100644 index 0000000000000000000000000000000000000000..f33e2f21e98fd8f9636e040a0c4f3f37992b574c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_higgs.py @@ -0,0 +1,232 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from .base import HfQuantizer +from .quantizers_utils import get_module_from_name + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_flute_available, is_hadamard_available, is_torch_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +def get_num_sms_from_device(device): + target_device_cc = torch.cuda.get_device_capability(device=device) + if target_device_cc == (8, 6): + return 84 + elif target_device_cc == (8, 0): + return 108 + elif target_device_cc == (8, 9): + return 128 + else: + raise NotImplementedError( + f"Device capability {target_device_cc} is not supported by FLUTE (yet?). To verify your device capability, check out https://developer.nvidia.com/cuda-gpus" + ) + + +class HiggsHfQuantizer(HfQuantizer): + """ + Quantizer of the HIGGS method. Enables the loading of prequantized models and in-flight quantization of full-precision models. 
+ """ + + requires_calibration = False + requires_parameters_quantization = True + required_packages = ["flute-kernel", "fast_hadamard_transform"] + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, device_map, **kwargs): + if not torch.cuda.is_available(): + raise NotImplementedError("HIGGS quantization is only supported on GPU. Please use a different quantizer.") + + if not is_accelerate_available(): + raise ImportError("Using `higgs` quantization requires Accelerate: `pip install accelerate`") + + if not is_flute_available(): + raise ImportError("Using `higgs` quantization requires FLUTE: `pip install flute-kernel>=0.3.0`") + + if not is_hadamard_available(): + raise ImportError( + "Using `higgs` quantization requires fast_hadamard_transform: `pip install fast_hadamard_transform`" + ) + + if device_map is None: + raise ValueError( + "You are attempting to load a HIGGS model without setting device_map." + " Please set device_map comprised of 'cuda' devices." + ) + elif isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + logger.info("`torch_dtype` is None. Setting `torch_dtype=torch.float16` for FLUTE compatibility.") + torch_dtype = torch.float16 + elif torch_dtype != torch.float16 and torch_dtype != torch.bfloat16: + raise ValueError( + f"Invalid `torch_dtype` {torch_dtype}. HIGGS quantization only supports `torch_dtype=torch.float16` or `torch_dtype=torch.bfloat16`." 
+ ) + + return torch_dtype + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + from ..integrations import quantize_with_higgs + + """ + Quantizes weights into weight and weight_scale + """ + flute_dict = quantize_with_higgs( + param_value.to(target_device), + self.quantization_config.bits, + self.quantization_config.p, + self.quantization_config.group_size, + self.quantization_config.hadamard_size, + ) + + del param_value + + module, tensor_name = get_module_from_name(model, param_name) + for key, value in flute_dict.items(): + if key in module._parameters: + module._parameters[key] = torch.nn.Parameter(value, requires_grad=False) + elif key in module._buffers: + module._buffers[key] = torch.nn.Buffer(value) + else: + raise ValueError(f"Unexpected key {key} in module {module}") + + if unexpected_keys is not None and param_name in unexpected_keys: + unexpected_keys.remove(param_name) + + module.num_sms_packed = torch.nn.Parameter( + torch.tensor(get_num_sms_from_device(target_device), device=target_device, dtype=torch.int32), + requires_grad=False, + ) + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + **kwargs, + ): + from ..integrations import replace_with_higgs_linear + + replace_with_higgs_linear( + model, + quantization_config=self.quantization_config, + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + import flute.utils + + from ..integrations import HiggsLinear + + flute_workspaces = {} + for name, module in model.named_modules(): + if isinstance(module, HiggsLinear): + # Every HiggsLinear needs a "workspace": a buffer for the unpacking operation. + # This buffer needs to be on the same device as the weights, but can be reused across modules otherwise. + if module.weight.device not in flute_workspaces: + flute_workspaces[module.weight.device] = flute.utils.make_workspace_streamk( + device=module.weight.device + ) + module.workspace = flute_workspaces[module.weight.device] + + # FLUTE weights are packed in a way that is optimized for a specific number of SMs (GPU streaming multiprocessors). + # If the model is loaded on a different device than the one it was saved on, we need to repack the weights. 
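+                # The packed layout depends on the SM count of the GPU the weights were packed on
+                # (see get_num_sms_from_device above); create_quantized_param() records that count in
+                # the `num_sms_packed` parameter, which is compared against the current device here.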
+                if module.num_sms_packed.item() != get_num_sms_from_device(module.weight.device):
+                    new_device = module.weight.device
+                    new_num_sms = get_num_sms_from_device(new_device)
+                    module.weight.data = flute.utils.pack(
+                        flute.utils.unpack(
+                            weight=module.weight.data,
+                            scales=module.scales.data,
+                            workspace=module.workspace,
+                            num_bits=module.num_bits,
+                            group_size=module.group_size,
+                            num_sms_packed=module.num_sms_packed.item(),
+                        ).T.contiguous(),
+                        module.num_bits,
+                        module.group_size,
+                    )
+                    module.num_sms_packed = torch.nn.Parameter(
+                        torch.tensor(new_num_sms, device=new_device, dtype=torch.int32),
+                        requires_grad=False,
+                    )
+
+    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
+        from ..integrations import HiggsLinear
+
+        not_missing_keys = []
+        for name, module in model.named_modules():
+            if isinstance(module, HiggsLinear):
+                for missing in missing_keys:
+                    if (
+                        (name in missing or name in f"{prefix}.{missing}")
+                        and not missing.endswith(".weight")
+                        and not missing.endswith(".bias")
+                    ):
+                        not_missing_keys.append(missing)
+        return [k for k in missing_keys if k not in not_missing_keys]
+
+    @property
+    def is_trainable(self, model: Optional["PreTrainedModel"] = None):
+        return False
+
+    def is_serializable(self, safe_serialization=None):
+        return True
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        from ..integrations import HiggsLinear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+        if isinstance(module, HiggsLinear) and tensor_name == "weight" and param_value.dtype != torch.int16:
+            # Only quantize weights of HiggsLinear modules that are not already quantized
+            return True
+        else:
+            return False
+
+    def _dequantize(self, model):
+        from ..integrations import dequantize_higgs
+
+        model = dequantize_higgs(model)
+        return model
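For reference, the HIGGS flow above is driven entirely through `from_pretrained`: `_process_model_before_weight_loading` swaps in `HiggsLinear` modules, `create_quantized_param` quantizes each weight on the target GPU, and the post-load pass wires up FLUTE workspaces. A minimal sketch of in-flight quantization, assuming a transformers build that exports `HiggsConfig` (the checkpoint id and the `bits` value are illustrative):

    # Sketch only: assumes HiggsConfig is exported by this transformers build;
    # the checkpoint id is illustrative.
    import torch
    from transformers import AutoModelForCausalLM, HiggsConfig

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.1-8B",                # illustrative checkpoint
        quantization_config=HiggsConfig(bits=4),  # routes loading through HiggsHfQuantizer
        torch_dtype=torch.float16,                # the default update_torch_dtype() picks anyway
        device_map="auto",                        # must resolve to CUDA devices only
    )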
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_hqq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_hqq.py
new file mode 100644
index 0000000000000000000000000000000000000000..775fea8f4901e6116d796bca491e1bab5d74f46d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_hqq.py
@@ -0,0 +1,296 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Any, Dict, List
+
+from ..integrations import prepare_for_hqq_linear
+from ..utils import is_accelerate_available, is_hqq_available, is_torch_available, logging
+from .base import HfQuantizer
+from .quantizers_utils import get_module_from_name
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+
+if is_accelerate_available():
+    from accelerate.hooks import remove_hook_from_module
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+# Finds the parent of a node module named "name"
+def find_parent(model, name):
+    module_tree = name.split(".")[:-1]
+    parent = model
+    for m in module_tree:
+        parent = parent._modules[m]
+    return parent
+
+
+class HqqHfQuantizer(HfQuantizer):
+    """
+    HQQ quantizer base HF class.
+    nn.Linear modules are first tagged with quant_config in _process_model_before_weight_loading().
+    The actual quantization and offloading to the GPU are done in check_quantized_param().
+    """
+
+    use_keep_in_fp32_modules = False
+    requires_parameters_quantization = True
+    requires_calibration = False
+    required_packages = ["hqq"]
+
+    def __init__(self, quantization_config, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+        self.torch_dtype = None
+        self.using_multi_gpu = False
+
+    def validate_environment(self, *args, **kwargs):
+        if not (is_hqq_available()):
+            raise ImportError(
+                "A valid HQQ version (>=0.2.1) is not available. Please follow the instructions to install it: `https://github.com/mobiusml/hqq/`."
+            )
+
+        if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
+            raise ValueError(
+                "Converting weights from tf/flax is currently not supported, please make"
+                " sure the weights are in PyTorch format."
+            )
+
+        if not torch.cuda.is_available():
+            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
+
+        if self.torch_dtype is None:
+            if "torch_dtype" in kwargs:
+                self.torch_dtype = kwargs["torch_dtype"]
+            else:
+                self.torch_dtype = torch.float32
+                logger.info("Setting torch_dtype to torch.float32 as the default value since it was not specified.")
+
+        device_map = kwargs.get("device_map", None)
+        if isinstance(device_map, dict):
+            if "cpu" in device_map.values() or "disk" in device_map.values():
+                raise ValueError(
+                    "You are attempting to use an HQQ model with a device_map that contains a CPU or disk device."
+                    " This is not supported. Please remove the CPU or disk device from the device_map."
+                )
+            else:
+                self.using_multi_gpu = len(set(device_map.values())) > 1
+
+    def update_missing_keys(
+        self, model: "PreTrainedModel", missing_keys: List[str], prefix: str, **kwargs
+    ) -> List[str]:
+        if self.pre_quantized:
+            return [key for key in missing_keys if ("weight" not in key)]
+        else:
+            return missing_keys
+
+    # Adds missing keys for HQQLinear modules that are loaded, but the model was initialized with torch.nn.Linear
+    def update_expected_keys(
+        self, model: "PreTrainedModel", expected_keys: List[str], loaded_keys: List[str]
+    ) -> List[str]:
+        if not self.pre_quantized:
+            return expected_keys
+
+        # Collects all quantizable (linear) layers
+        def _find_hqq_quantizable_layers(model, layers):
+            for name, module in model.named_children():
+                if isinstance(module, (torch.nn.Linear)):
+                    layers.add(module.name)
+                _find_hqq_quantizable_layers(module, layers)
+
+        new_keys = set(expected_keys)
+        if is_hqq_available():
+            from hqq.core.quantize import HQQLinear
+
+            # Name modules
+            for name, module in model.named_modules():
+                module.name = name
+
+            # Valid modules are Linear layers that have an HQQLinear state_dict. We ignore skip_modules and any layers with plain Linear state_dict() params
+            _valid_modules = set()
+            _find_hqq_quantizable_layers(model, _valid_modules)
+            _valid_modules -= set(model.config.quantization_config["skip_modules"])
+
+            # Append new expected layers based on _ref_keys
+            _ref_keys = HQQLinear(
+                linear_layer=None, quant_config=None, compute_dtype=torch.float16, device="cpu"
+            ).state_dict_keys() - {"bias"}
+
+            # Clean-up
+            _rm_keys = set()
+            for key in new_keys:
+                if any(_module in key for _module in _valid_modules):
+                    _rm_keys.add(key)
+            new_keys -= _rm_keys
+            # At this point, new_keys contains all the keys of the layers that are NOT HQQLinear or torch.nn.Linear
+
+            # Re-populate Linear/HQQLinear
+            for _module in _valid_modules:
+                if _module + ".weight" in loaded_keys:
+                    new_keys.add(_module + ".weight")
+                else:
+                    new_keys.update({_module + "." + _ref_key for _ref_key in _ref_keys})
+                if _module + ".bias" in loaded_keys:
+                    new_keys.add(_module + ".bias")
+
+        return list(new_keys)
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        if is_hqq_available():
+            from hqq.core.quantize import HQQLinear
+        module, tensor_name = get_module_from_name(model, param_name)
+
+        if self.pre_quantized:
+            return (
+                (isinstance(module, torch.nn.Linear) or isinstance(module, HQQLinear))
+                and tensor_name != "weight"
+                and tensor_name != "bias"
+            )
+        else:
+            return isinstance(module, torch.nn.Linear) and tensor_name == "weight"
+
+    def create_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        target_device: "torch.device",
+        state_dict: Dict[str, Any],
+        unexpected_keys: List[str],
+    ):
+        """
+        Each nn.Linear layer is processed here.
+        We first check if the corresponding module state_dict already contains HQQ-quantized parameters.
+        If not, we create a temp linear layer with the module state_dict params and use it for quantization
+        """
+
+        if is_hqq_available():
+            from hqq.core.quantize import HQQLinear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+        layer_name = ".".join(param_name.split(".")[:-1])
+        parent_module = find_parent(model, layer_name)
+        node = layer_name.split(".")[-1]
+
+        # set module state_dict
+        module_state_dict = {}
+        for k, v in state_dict.items():
+            if layer_name + "." in k:
+                module_state_dict[k.split(".")[-1]] = v
+                if unexpected_keys is not None and k in unexpected_keys:
+                    unexpected_keys.remove(k)
+
+        if self.pre_quantized:
+            if isinstance(module, HQQLinear):
+                return
+            else:
+                hqq_layer = HQQLinear(
+                    linear_layer=None,
+                    quant_config=None,
+                    compute_dtype=self.torch_dtype,
+                    device=target_device,
+                )
+
+                hqq_layer.load_state_dict(module_state_dict)
+
+                if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
+                    hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)
+
+                if self.using_multi_gpu:
+                    hqq_layer = self._patch_layer_for_multigpu(hqq_layer)
+
+                setattr(parent_module, node, hqq_layer)
+
+                # cleanup
+                del module.__dict__, module
+                torch.cuda.empty_cache()
+                return
+
+        # Step 1: populate module with weight/bias from module state dict
+        for key in module_state_dict:
+            setattr(module, key, torch.nn.Parameter(module_state_dict[key]))
+
+        # Step 2: Replace module with either HQQLinear or move it to device. We do this via setattr on the parent,
+        # as doing it on the module directly doesn't work.
+        if hasattr(module, "quant_config"):
+            hqq_layer = HQQLinear(
+                module,
+                module.quant_config,
+                compute_dtype=self.torch_dtype,
+                device=target_device,
+                del_orig=True,
+            )
+
+            if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
+                hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)
+
+            if self.using_multi_gpu:
+                hqq_layer = self._patch_layer_for_multigpu(hqq_layer)
+
+            setattr(parent_module, node, hqq_layer)
+
+        else:
+            module = module.to(dtype=self.torch_dtype, device=target_device)
+            setattr(parent_module, node, module)
+
+        torch.cuda.empty_cache()
+
+    # Removes the accelerate hook and uses a simpler forward pass. Otherwise, this breaks with multi-gpu
+    def _patch_layer_for_multigpu(self, hqq_layer):
+        hqq_layer = remove_hook_from_module(hqq_layer)
+
+        def forward_with_device(self, x):
+            out = torch.matmul(x.to(self.device), self.dequantize().t())
+            if self.bias is not None:
+                out += self.bias
+            return out
+
+        hqq_layer.forward = lambda x: forward_with_device(hqq_layer, x)
+        return hqq_layer
+
+    def _process_model_before_weight_loading(
+        self,
+        model: "PreTrainedModel",
+        device_map,
+        keep_in_fp32_modules: List[str] = None,
+        **kwargs,
+    ):
+        keep_in_fp32_modules = keep_in_fp32_modules if keep_in_fp32_modules is not None else []
+
+        # Add the corresponding quant_config to each valid module. This allows us to do the actual nn.Linear -> HQQLinear conversion in create_quantized_param().
+        # prepare_for_hqq_linear() also sets the right quantization config inside the model (model.config.quantization_config) and the layers (hqq_layer.quant_config)
+        model = prepare_for_hqq_linear(model, quantization_config=self.quantization_config)
+
+    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        model.is_hqq_quantized = True
+        model.is_hqq_serializable = self.is_serializable()
+        return model
+
+    def is_serializable(self, safe_serialization=None):
+        return True
+
+    @property
+    def is_trainable(self) -> bool:
+        return True
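The HQQ quantizer above quantizes lazily, one nn.Linear at a time, as weights stream in through create_quantized_param(). A minimal sketch of on-the-fly HQQ quantization, assuming `HqqConfig` is exported by this transformers build (the checkpoint id and the nbits/group_size values are illustrative):

    # Sketch only: HqqConfig is assumed available; the checkpoint id is illustrative.
    import torch
    from transformers import AutoModelForCausalLM, HqqConfig

    quant_config = HqqConfig(nbits=4, group_size=64)
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",            # illustrative checkpoint
        torch_dtype=torch.float16,      # becomes self.torch_dtype in the quantizer
        device_map="cuda",              # a GPU is required (see validate_environment)
        quantization_config=quant_config,
    )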
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_quanto.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_quanto.py
new file mode 100644
index 0000000000000000000000000000000000000000..230e8efe15067290be354e3da5a10f3f0d79248c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_quanto.py
@@ -0,0 +1,208 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from packaging import version
+
+from .base import HfQuantizer
+from .quantizers_utils import get_module_from_name
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from ..utils import (
+    is_accelerate_available,
+    is_optimum_quanto_available,
+    is_torch_available,
+    logging,
+)
+from ..utils.quantization_config import QuantoConfig
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+class QuantoHfQuantizer(HfQuantizer):
+    """
+    Quantizer for the quanto library
+    """
+
+    required_packages = ["quanto", "accelerate"]
+    requires_parameters_quantization = True
+    requires_calibration = False
+
+    def __init__(self, quantization_config: QuantoConfig, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+        self.post_init()
+
+    def post_init(self):
+        r"""
+        Safety checker
+        """
+        if self.quantization_config.activations is not None and not self.pre_quantized:
+            raise ValueError(
+                "We don't support quantizing the activations with the transformers library."
+                " Use the quanto library for more complex use cases such as activation quantization, calibration and quantization-aware training."
+            )
+
+    def validate_environment(self, *args, **kwargs):
+        if not is_optimum_quanto_available():
+            raise ImportError(
+                "Loading an optimum-quanto quantized model requires the optimum-quanto library (`pip install optimum-quanto`)"
+            )
+        if not is_accelerate_available():
+            raise ImportError(
+                "Loading an optimum-quanto quantized model requires the accelerate library (`pip install accelerate`)"
+            )
+
+    def update_device_map(self, device_map):
+        if device_map is None:
+            device_map = {"": "cpu"}
+            logger.info(
+                "The device_map was not initialized. "
+                "Setting device_map to {'':'cpu'}. "
+                "If you want to use the model for inference, please set device_map='auto'"
+            )
+        return device_map
+
+    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
+        if torch_dtype is None:
+            logger.info("You did not specify `torch_dtype` in `from_pretrained`. Setting it to `torch.float32`.")
+            torch_dtype = torch.float32
+        return torch_dtype
+
+    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
+        if is_optimum_quanto_available():
+            from optimum.quanto import QModuleMixin
+
+        not_missing_keys = []
+        for name, module in model.named_modules():
+            if isinstance(module, QModuleMixin):
+                for missing in missing_keys:
+                    if (
+                        (name in missing or name in f"{prefix}.{missing}")
+                        and not missing.endswith(".weight")
+                        and not missing.endswith(".bias")
+                    ):
+                        not_missing_keys.append(missing)
+        return [k for k in missing_keys if k not in not_missing_keys]
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        """
+        Check if a parameter needs to be quantized.
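+        Only the (unfrozen) weights of QModuleMixin modules qualify; biases, and weights that
+        are about to be offloaded to the CPU, are left as-is.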
+ """ + if is_optimum_quanto_available(): + from optimum.quanto import QModuleMixin + + device_map = kwargs.get("device_map", None) + param_device = kwargs.get("param_device", None) + # we don't quantize the model if the module is going to be offloaded to the cpu + if device_map is not None and param_device is not None: + device_map_values = set(device_map.values()) + if param_device == "cpu" and len(device_map_values) > 1: + if not (device_map_values == {"cpu"} or device_map_values == {"cpu", "disk"}): + return False + + module, tensor_name = get_module_from_name(model, param_name) + # We only quantize the weights and the bias is not quantized. + if isinstance(module, QModuleMixin) and "weight" in tensor_name: + # if the weights are quantized, don't need to recreate it again with `create_quantized_param` + return not module.frozen + else: + return False + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + *args, + **kwargs, + ): + """ + Create the quantized parameter by calling .freeze() after setting it to the module. + """ + from accelerate.utils import set_module_tensor_to_device + + set_module_tensor_to_device(model, param_name, target_device, param_value) + module, _ = get_module_from_name(model, param_name) + module.freeze() + module.weight.requires_grad = False + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.27.0"): + from accelerate.utils import CustomDtype + + mapping = { + "int8": torch.int8, + "float8": CustomDtype.FP8, + "int4": CustomDtype.INT4, + "int2": CustomDtype.INT2, + } + target_dtype = mapping[self.quantization_config.weights] + return target_dtype + else: + raise ValueError( + "You are using `device_map='auto'` on an optimum-quanto quantized model. To automatically compute" + " the appropriate device map, you should upgrade your `accelerate` library," + "`pip install --upgrade accelerate` or install it from source." 
+ ) + + def _process_model_before_weight_loading( + self, model: "PreTrainedModel", keep_in_fp32_modules: List[str] = [], **kwargs + ): + from ..integrations import get_keys_to_not_convert, replace_with_quanto_layers + + # We keep some modules such as the lm_head in their original dtype for numerical stability reasons + if self.quantization_config.modules_to_not_convert is None: + self.modules_to_not_convert = get_keys_to_not_convert(model) + else: + self.modules_to_not_convert = self.quantization_config.modules_to_not_convert + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + model, _ = replace_with_quanto_layers( + model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model, **kwargs): + return model + + @property + def is_trainable(self, model: Optional["PreTrainedModel"] = None): + return True + + def is_serializable(self, safe_serialization=None): + return False diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py new file mode 100644 index 0000000000000000000000000000000000000000..bcc9c57dfa006d0444db68b28962d079e48c1ec5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py @@ -0,0 +1,231 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
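For context, the quanto path above freezes weights one parameter at a time via create_quantized_param(). A minimal weight-only sketch using `QuantoConfig` (the checkpoint id is illustrative):

    # Sketch only: the checkpoint id is illustrative.
    from transformers import AutoModelForCausalLM, QuantoConfig

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",                               # illustrative checkpoint
        quantization_config=QuantoConfig(weights="int8"),  # activations must stay None here
        device_map="auto",                                 # requires accelerate (validated above)
    )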
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcc9c57dfa006d0444db68b28962d079e48c1ec5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py
@@ -0,0 +1,231 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import types
+from typing import TYPE_CHECKING, Union
+
+from packaging import version
+
+from .base import HfQuantizer
+from .quantizers_utils import get_module_from_name
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from typing import Any, Dict, List
+
+from ..utils import is_torch_available, is_torchao_available, logging
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+
+logger = logging.get_logger(__name__)
+
+
+# Finds the parent of a node module named "name"
+def find_parent(model, name):
+    module_tree = name.split(".")[:-1]
+    parent = model
+    for m in module_tree:
+        parent = parent._modules[m]
+    return parent
+
+
+def _quantization_type(weight):
+    from torchao.dtypes import AffineQuantizedTensor
+    from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor
+
+    if isinstance(weight, AffineQuantizedTensor):
+        return f"{weight.__class__.__name__}({weight._quantization_type()})"
+
+    if isinstance(weight, LinearActivationQuantizedTensor):
+        return f"{weight.__class__.__name__}(activation={weight.input_quant_func}, weight={_quantization_type(weight.original_weight_tensor)})"
+
+
+def _linear_extra_repr(self):
+    weight = _quantization_type(self.weight)
+    if weight is None:
+        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None"
+    else:
+        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}"
+
+
+class TorchAoHfQuantizer(HfQuantizer):
+    """
+    Quantizer for torchao: https://github.com/pytorch/ao/
+    """
+
+    requires_parameters_quantization = True
+    requires_calibration = False
+    required_packages = ["torchao"]
+
+    def __init__(self, quantization_config, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+
+    def validate_environment(self, *args, **kwargs):
+        if not is_torchao_available():
+            raise ImportError("Loading a torchao quantized model requires the torchao library (`pip install torchao`)")
+
+        self.offload = False
+        device_map = kwargs.get("device_map", None)
+        if isinstance(device_map, dict):
+            if "cpu" in device_map.values() or "disk" in device_map.values():
+                if self.pre_quantized:
+                    raise ValueError(
+                        "You are attempting to perform cpu/disk offload with a pre-quantized torchao model."
+                        " This is not supported yet. Please remove the CPU or disk device from the device_map."
+                    )
+                else:
+                    self.offload = True
+        if self.pre_quantized:
+            weights_only = kwargs.get("weights_only", None)
+            if weights_only:
+                torch_version = version.parse(importlib.metadata.version("torch"))
+                if torch_version < version.parse("2.5.0"):
+                    raise RuntimeError(
+                        f"In order to use a torchao pre-quantized model, you need torch>=2.5.0. However, the current version is {torch_version}."
+                        f" You can also load with `weights_only=False` in `from_pretrained` if you don't want to update torch"
+                    )
+
+    def update_torch_dtype(self, torch_dtype):
+        if self.quantization_config.quant_type == "int4_weight_only":
+            if torch_dtype is not None and torch_dtype != torch.bfloat16:
+                logger.warning_once(
+                    f"You are setting torch_dtype to {torch_dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. Please set torch_dtype to bfloat16."
+                )
+            if torch_dtype is None:
+                logger.warning_once(
+                    "Setting torch_dtype to torch.bfloat16 for int4_weight_only quantization since only bfloat16 is supported right now. Please set torch_dtype=torch.bfloat16 to remove this warning."
+                )
+                torch_dtype = torch.bfloat16
+        if self.quantization_config.quant_type == "int8_dynamic_activation_int8_weight":
+            if torch_dtype is None:
+                logger.info(
+                    "Setting torch_dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no torch_dtype was specified in from_pretrained"
+                )
+                # we need to set the torch_dtype, otherwise we have dtype mismatch when performing the quantized linear op
+                torch_dtype = torch.float32
+        return torch_dtype
+
+    def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
+        if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"):
+            from accelerate.utils import CustomDtype
+
+            map_to_target_dtype = {
+                "int4_weight_only": CustomDtype.INT4,
+                "int8_weight_only": torch.int8,
+                "int8_dynamic_activation_int8_weight": torch.int8,
+            }
+            return map_to_target_dtype[self.quantization_config.quant_type]
+        else:
+            raise ValueError(
+                "You are using `device_map='auto'` on a torchao quantized model. To automatically compute"
+                " the appropriate device map, you should upgrade your `accelerate` library with "
+                "`pip install --upgrade accelerate`"
+            )
+
+    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
+        # need more space for the quantization parameters (e.g. scale). Tested with int4 wo and group size = 128
+        max_memory = {key: val * 0.9 for key, val in max_memory.items()}
+        return max_memory
+
+    def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        from ..integrations import get_keys_to_not_convert
+
+        self.modules_to_not_convert = get_keys_to_not_convert(model)
+
+        if self.quantization_config.modules_to_not_convert is not None:
+            self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)
+
+        return
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        param_device = kwargs.pop("param_device", None)
+        # check if the param_name is not in self.modules_to_not_convert
+        if any((key + "." in param_name) or (key == param_name) for key in self.modules_to_not_convert):
+            return False
+        elif param_device == "cpu" and self.offload:
+            # We don't quantize weights that we offload
+            return False
+        else:
+            # we only quantize the weight of nn.Linear
+            module, tensor_name = get_module_from_name(model, param_name)
+            return isinstance(module, torch.nn.Linear) and (tensor_name == "weight")
+
+    def create_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        target_device: "torch.device",
+        state_dict: Dict[str, Any],
+        unexpected_keys: List[str],
+    ):
+        """
+        Each nn.Linear layer that needs to be quantized is processed here.
+        First, we set the value of the weight tensor, then we move it to the target device. Finally, we quantize the module.
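+        For pre-quantized checkpoints the serialized tensor subclass is moved to the target device
+        as-is; otherwise torchao's quantize_() is applied to the module in place.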
+ """ + from torchao.quantization import quantize_ + + module, tensor_name = get_module_from_name(model, param_name) + + if self.pre_quantized: + module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device)) + if isinstance(module, nn.Linear): + module.extra_repr = types.MethodType(_linear_extra_repr, module) + else: + module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) + quantize_(module, self.quantization_config.get_apply_tensor_subclass()) + + def _process_model_after_weight_loading(self, model, **kwargs): + """No process required for torchao quantized model""" + return + + def is_serializable(self, safe_serialization=None): + if safe_serialization: + logger.warning( + "torchao quantized model does not support safe serialization, " + "please set `safe_serialization` to False" + ) + return False + _is_torchao_serializable = version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse( + "0.25.0" + ) + if not _is_torchao_serializable: + logger.warning("torchao quantized model is only serializable after huggingface_hub >= 0.25.0 ") + if self.offload and self.quantization_config.modules_to_not_convert is None: + logger.warning( + "The model contains offloaded modules and these modules are not quantized. We don't recommend saving the model as we won't be able to reload them." + "If you want to specify modules to not quantize, please specify modules_to_not_convert in the quantization_config." + ) + return False + return _is_torchao_serializable + + @property + def is_trainable(self): + supported_quant_types_for_training = [ + "int8_weight_only", + "int8_dynamic_activation_int8_weight", + ] + return self.quantization_config.quant_type in supported_quant_types_for_training diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_vptq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_vptq.py new file mode 100644 index 0000000000000000000000000000000000000000..1672c3ebc5a7d35d7c788671340eaca9762c356e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_vptq.py @@ -0,0 +1,98 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING, Optional + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_torch_available, is_vptq_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class VptqHfQuantizer(HfQuantizer): + """ + Quantizer of the VPTQ method. Enables the loading of prequantized models. 
+ """ + + requires_calibration = True + required_packages = ["vptq"] + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Using `vptq` quantization requires Accelerate: `pip install accelerate`") + + if not is_vptq_available(): + raise ImportError("Using `vptq` quantization requires VPTQ>=0.0.4: `pip install -U vptq`") + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + if torch.cuda.is_available(): + torch_dtype = torch.float16 + logger.info( + "CUDA available. Assuming VPTQ inference on GPU and loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually." + ) + else: + import vptq + + device_availability = getattr(vptq, "device_availability", lambda device: False) + if device_availability("cpu") is True: + raise RuntimeError("No GPU found. Please wait for the next release of VPTQ to use CPU inference") + torch_dtype = torch.float32 + logger.info("No GPU found. Assuming VPTQ inference on CPU and loading the model in `torch.float32`.") + return torch_dtype + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + **kwargs, + ): + """ + we don't have param like modules_to_not_convert to indicate which layers should not be quantized + because `quantization_config` include the layers that should be quantized + """ + from ..integrations import replace_with_vptq_linear + + modules_to_not_convert = kwargs.get("modules_to_not_convert", []) + ( + self.quantization_config.modules_to_not_convert or [] + ) + + replace_with_vptq_linear( + model, + quantization_config=self.quantization_config, + modules_to_not_convert=modules_to_not_convert, + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + @property + def is_trainable(self, model: Optional["PreTrainedModel"] = None): + return False + + def is_serializable(self, safe_serialization=None): + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizers_utils.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizers_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae287bf251b51337b8588b2e0176178316e7e96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizers_utils.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Tuple + + +def get_module_from_name(module, tensor_name: str) -> Tuple[Any, str]: + if "." 
in tensor_name: + splits = tensor_name.split(".") + for split in splits[:-1]: + new_module = getattr(module, split) + if new_module is None: + raise ValueError(f"{module} has no attribute {split}.") + module = new_module + tensor_name = splits[-1] + return module, tensor_name
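As an illustration, get_module_from_name() walks plain attribute access down to the module that owns the tensor and returns the (owning module, local tensor name) pair that the quantizers above rely on. A quick sketch on a toy model (the layer layout is made up):

    # Sketch only: toy model to show the (owning module, local tensor name) contract.
    import torch

    model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.Linear(8, 2))
    module, tensor_name = get_module_from_name(model, "1.weight")
    assert module is model[1] and tensor_name == "weight"  # "1" resolved, "weight" left local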