diff --git a/.gitattributes b/.gitattributes
index 9e3bd09e030501d2c64e03fa740a7bda429ef395..4693aff96530df800098c4e9b1871bc63e6bdb1f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -422,3 +422,8 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
 .venv/lib/python3.11/site-packages/pkg_resources/_vendor/pyparsing/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/pkg_resources/__pycache__/__init__.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/pkg_resources/_vendor/more_itertools/__pycache__/more.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d9bad05b8e4114608cfe4b7881f53c15950b2f6
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations_tf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79ab2704fa66d3acaa99d707d2c479df3587f95f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/activations_tf.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/audio_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/audio_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3459b29826c2faa9acf5f652765be0b1e51cce1a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/audio_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/configuration_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/configuration_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c667cc4e9d668e3d6b3141ea9ba14280fe027074
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/configuration_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_graph_to_onnx.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_graph_to_onnx.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8fbce2e084be5af0857f70a01cdcd1aab57bdde
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_graph_to_onnx.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_pytorch_checkpoint_to_tf2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_pytorch_checkpoint_to_tf2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7fb7469ba8189787bcfa1e31fab7be4b7c2241d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_pytorch_checkpoint_to_tf2.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2d2bab39fd3b2b01cdc34903e9d16b286f90d0d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizer.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizers_checkpoints_to_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizers_checkpoints_to_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2fed5838e2a4f5ed75fba53981029b869842603
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_slow_tokenizers_checkpoints_to_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_tf_hub_seq_to_seq_bert_to_pytorch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_tf_hub_seq_to_seq_bert_to_pytorch.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..141bf0d51b136f5c61694fcdea6154cbe9d4fb69
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/convert_tf_hub_seq_to_seq_bert_to_pytorch.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/debug_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/debug_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3a5bc6cabdee64d381c9bd0662f57a19e0fa98e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/debug_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_check.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_check.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a009956be49c59013c7d78b250f1860eae88951
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_check.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_table.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_table.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d1e8cb806448a3f23b35ca8a3ac66a7b37d5f78
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dependency_versions_table.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/dynamic_module_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dynamic_module_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92c57f627ce1497a35fe53be7c181ebf17c73b25
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/dynamic_module_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_sequence_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_sequence_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d296c8ca4285faa3de6c5efad9301f4ab66446b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_sequence_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9af08a16cebab55a3fc2f436f2ff845ce837e214
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/feature_extraction_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/file_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/file_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c64f2e2d4dbe518c6d4dc9224b938e0b3951a77
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/file_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/hf_argparser.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hf_argparser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9578adbb990e6e181dc497e6b0de583e9313730c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hf_argparser.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/hyperparameter_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hyperparameter_search.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ff48710e220416b72e4350d9c21cbfa1ebf8897
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/hyperparameter_search.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e31e613072f74b425e12941123db8a944d648d1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_base.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea1b73999c92b255cda0949585348addcf5e2e7f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7afc1a0f47d94d14a6e94bad2559d01660690cd0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/keras_callbacks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/keras_callbacks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7c9ac743fb1a5e30d183e17e729f7d837a30525
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/keras_callbacks.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_attn_mask_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_attn_mask_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed2a12331343972b986817d3cc2b7f6be97f1101
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_attn_mask_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flash_attention_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flash_attention_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34b3253111e3b7c97567adacb0aec5398ff18d1f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flash_attention_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_outputs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_outputs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26116465358b7b6e3c37c5b9573153967edae0da
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_outputs.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2f28468c292936578a68b686e1782ee0ce0d3e9
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01dd22599555236aec052843fe6f11bd48ee503a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_flax_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d09469fb6408591ccae72b3fe43445be6a7d4122
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_rope_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_rope_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0b2aef97cb2fc5b0bb1e7fc722c2570024f102e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_rope_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_outputs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_outputs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acc0dc498cc36566b390d70fe8d64666a5568ade
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_outputs.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d9898c807b746620f266162e4236bc3b4bbe25b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81fbb6f9c25895771a98c78061f04cafb56039f2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:276838de4c287269109cf384c70ff8d5055e5b91bb854c5126112b913a4d0093
+size 281139
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0e2954725cc3cdc044268e8195500a9218106e5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization_tf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25698db85293361c9d9c07399f52f38b0cc51ea1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/optimization_tf.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/processing_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/processing_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd1d6e5f9360e81850206ad451afb34889f6b4a2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/processing_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/pytorch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/pytorch_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74d47bf059c9629624d58aa9ffb714f77f79d2be
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/pytorch_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/safetensors_conversion.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/safetensors_conversion.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ce180b27a598c315e0949f4fb8c70554a793e78
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/safetensors_conversion.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/tf_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tf_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2edeacbdc1a481d219620ad29e8dbbdf9df472ea
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tf_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/time_series_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/time_series_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1aa90264517975c935e33d6edcac666abbb9804f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/time_series_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..944929db9c0ed24cbc39b54c5f79dd5422bcde3b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9324d315f5fba3c9676e0e5612ad7b9b86058e29
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49edaecbc672677e9fdfa7390871d5a377c9645f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa11f4d36296dff15a1f257f9e9bca3d856758140df110c8e3854e18618ab31
+size 267562
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_callback.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_callback.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8c6cb87839c0e223468eb759ea49d8036f7bef7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_callback.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_pt_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_pt_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c693201e6b5e2df4ef1aef9c0ba459b71606cb2f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_pt_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_seq2seq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_seq2seq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12c94ae24982c174222b9c848e7dad9f6f53ef2a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_seq2seq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..444e5f0331e3c7adfdb9fef2fe38edfcbbaa5699
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/trainer_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_seq2seq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_seq2seq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10d5b1613a0ed4c0b1e70397f4f168663357bcad
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_seq2seq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21aa97844bcc4e71c1c94a7cfa57b5994edb1f6f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/logits_process.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22e1b8514136448dfbd2a040a495b78427db9a11049cc608c6f038ddfdaae233
+size 165048
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28379f98dc1b88091adeea2dc16fe296e7909bcc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3812724ea0a461913baef68de3e6f7e9e062a4f753b90a6876cb0fd235602b67
+size 155213
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27c49bd2c2b57b00cf3315ba63f649f0c77bf001
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/utils.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8ba052ef8f2b2f60ed4f6bbd2d958a022502d424a55bbebd161c81d67b7c091
+size 215014
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__init__.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3409af4cd78c62daa258303472fa5a6345b090d5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .auto import AutoHfQuantizer, AutoQuantizationConfig
+from .base import HfQuantizer
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/auto.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/auto.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a1a8a5371a696ac9ecdb849d21529ae3d6a4f47
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/auto.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d00c3c47d948a981c0d5665526014acac4462ab3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/base.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_aqlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_aqlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9af983f50749b9406771a2323a2b10dfea5c834
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_aqlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_awq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_awq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5df94fbe4680766ee17ce9e406c5d3c904d29286
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_awq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_4bit.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_4bit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49caf1d4460f6c642a2300f2933b43a43d8b8362
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_4bit.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_8bit.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_8bit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8275421b966d4a7115f150d254f14f7fc7a3f5f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_bnb_8bit.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_eetq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_eetq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d112b23daff9426332eee20431bfabf7f61fc25
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_eetq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_fbgemm_fp8.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_fbgemm_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7d7c80480836e71b318e6fedaeaa32936195e7a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_fbgemm_fp8.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_higgs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_higgs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66762bad5fa76ed75f340ca92ac4ec73de0a8340
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_higgs.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_hqq.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_hqq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..048ff232b46b9203a6898f16ff8e46c9ef4beb66
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_hqq.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_quanto.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_quanto.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7265a0f53a156ee4ec767fe3d94e540af4179565
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizer_quanto.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizers_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizers_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68c9d2d9d8b017c94862601d62f27932e6498231
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/quantizers/__pycache__/quantizers_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5b51d038ab8bbe3231e05295fe3844dee0b1ca7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py
@@ -0,0 +1,197 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import Dict, Optional, Union
+
+from ..models.auto.configuration_auto import AutoConfig
+from ..utils.quantization_config import (
+    AqlmConfig,
+    AwqConfig,
+    BitNetConfig,
+    BitsAndBytesConfig,
+    CompressedTensorsConfig,
+    EetqConfig,
+    FbgemmFp8Config,
+    GPTQConfig,
+    HiggsConfig,
+    HqqConfig,
+    QuantizationConfigMixin,
+    QuantizationMethod,
+    QuantoConfig,
+    TorchAoConfig,
+    VptqConfig,
+)
+from .quantizer_aqlm import AqlmHfQuantizer
+from .quantizer_awq import AwqQuantizer
+from .quantizer_bitnet import BitNetHfQuantizer
+from .quantizer_bnb_4bit import Bnb4BitHfQuantizer
+from .quantizer_bnb_8bit import Bnb8BitHfQuantizer
+from .quantizer_compressed_tensors import CompressedTensorsHfQuantizer
+from .quantizer_eetq import EetqHfQuantizer
+from .quantizer_fbgemm_fp8 import FbgemmFp8HfQuantizer
+from .quantizer_gptq import GptqHfQuantizer
+from .quantizer_higgs import HiggsHfQuantizer
+from .quantizer_hqq import HqqHfQuantizer
+from .quantizer_quanto import QuantoHfQuantizer
+from .quantizer_torchao import TorchAoHfQuantizer
+from .quantizer_vptq import VptqHfQuantizer
+
+
+AUTO_QUANTIZER_MAPPING = {
+    "awq": AwqQuantizer,
+    "bitsandbytes_4bit": Bnb4BitHfQuantizer,
+    "bitsandbytes_8bit": Bnb8BitHfQuantizer,
+    "gptq": GptqHfQuantizer,
+    "aqlm": AqlmHfQuantizer,
+    "quanto": QuantoHfQuantizer,
+    "eetq": EetqHfQuantizer,
+    "higgs": HiggsHfQuantizer,
+    "hqq": HqqHfQuantizer,
+    "compressed-tensors": CompressedTensorsHfQuantizer,
+    "fbgemm_fp8": FbgemmFp8HfQuantizer,
+    "torchao": TorchAoHfQuantizer,
+    "bitnet": BitNetHfQuantizer,
+    "vptq": VptqHfQuantizer,
+}
+
+AUTO_QUANTIZATION_CONFIG_MAPPING = {
+    "awq": AwqConfig,
+    "bitsandbytes_4bit": BitsAndBytesConfig,
+    "bitsandbytes_8bit": BitsAndBytesConfig,
+    "eetq": EetqConfig,
+    "gptq": GPTQConfig,
+    "aqlm": AqlmConfig,
+    "quanto": QuantoConfig,
+    "hqq": HqqConfig,
+    "compressed-tensors": CompressedTensorsConfig,
+    "fbgemm_fp8": FbgemmFp8Config,
+    "higgs": HiggsConfig,
+    "torchao": TorchAoConfig,
+    "bitnet": BitNetConfig,
+    "vptq": VptqConfig,
+}
+
+
+class AutoQuantizationConfig:
+    """
+    The Auto-HF quantization config class that takes care of automatically dispatching to the correct
+    quantization config given a quantization config stored in a dictionary.
+    """
+
+    @classmethod
+    def from_dict(cls, quantization_config_dict: Dict):
+        quant_method = quantization_config_dict.get("quant_method", None)
+        # We need special care for bnb models to make sure everything stays backward compatible.
+        if quantization_config_dict.get("load_in_8bit", False) or quantization_config_dict.get("load_in_4bit", False):
+            suffix = "_4bit" if quantization_config_dict.get("load_in_4bit", False) else "_8bit"
+            quant_method = QuantizationMethod.BITS_AND_BYTES + suffix
+        elif quant_method is None:
+            raise ValueError(
+                "The model's quantization config from the arguments has no `quant_method` attribute. Make sure that the model has been correctly quantized."
+            )
+
+        if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING.keys():
+            raise ValueError(
+                f"Unknown quantization type, got {quant_method} - supported types are:"
+                f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
+            )
+
+        target_cls = AUTO_QUANTIZATION_CONFIG_MAPPING[quant_method]
+        return target_cls.from_dict(quantization_config_dict)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        if getattr(model_config, "quantization_config", None) is None:
+            raise ValueError(
+                f"Did not find a `quantization_config` in {pretrained_model_name_or_path}. Make sure that the model is correctly quantized."
+            )
+        quantization_config_dict = model_config.quantization_config
+        quantization_config = cls.from_dict(quantization_config_dict)
+        # Update with potential kwargs that are passed through from_pretrained.
+        quantization_config.update(**kwargs)
+        return quantization_config
+
+
+class AutoHfQuantizer:
+    """
+    The Auto-HF quantizer class that takes care of automatically instantiating the correct
+    `HfQuantizer` given the `QuantizationConfig`.
+    """
+
+    @classmethod
+    def from_config(cls, quantization_config: Union[QuantizationConfigMixin, Dict], **kwargs):
+        # Convert it to a QuantizationConfig if the q_config is a dict
+        if isinstance(quantization_config, dict):
+            quantization_config = AutoQuantizationConfig.from_dict(quantization_config)
+
+        quant_method = quantization_config.quant_method
+
+        # Again, we need special care for bnb as we have a single quantization config
+        # class for both 4-bit and 8-bit quantization
+        if quant_method == QuantizationMethod.BITS_AND_BYTES:
+            if quantization_config.load_in_8bit:
+                quant_method += "_8bit"
+            else:
+                quant_method += "_4bit"
+
+        if quant_method not in AUTO_QUANTIZER_MAPPING.keys():
+            raise ValueError(
+                f"Unknown quantization type, got {quant_method} - supported types are:"
+                f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
+            )
+
+        target_cls = AUTO_QUANTIZER_MAPPING[quant_method]
+        return target_cls(quantization_config, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        quantization_config = AutoQuantizationConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        return cls.from_config(quantization_config)
+
+    @classmethod
+    def merge_quantization_configs(
+        cls,
+        quantization_config: Union[dict, QuantizationConfigMixin],
+        quantization_config_from_args: Optional[QuantizationConfigMixin],
+    ):
+        """
+        Handles situations where both a quantization config from the args and a quantization config
+        from the model config are present.
+        """
+        if quantization_config_from_args is not None:
+            warning_msg = (
+                "You passed `quantization_config` or equivalent parameters to `from_pretrained` but the model you're loading"
+                " already has a `quantization_config` attribute. The `quantization_config` from the model will be used."
+            )
+        else:
+            warning_msg = ""
+
+        if isinstance(quantization_config, dict):
+            quantization_config = AutoQuantizationConfig.from_dict(quantization_config)
+
+        if (
+            isinstance(quantization_config, (GPTQConfig, AwqConfig, FbgemmFp8Config, CompressedTensorsConfig))
+            and quantization_config_from_args is not None
+        ):
+            # special case for GPTQ / AWQ / FbgemmFp8 config collision
+            loading_attr_dict = quantization_config_from_args.get_loading_attributes()
+            for attr, val in loading_attr_dict.items():
+                setattr(quantization_config, attr, val)
+
+            warning_msg += f" However, loading attributes (e.g. {list(loading_attr_dict.keys())}) will be overwritten with the ones you passed to `from_pretrained`. The rest will be ignored."
+
+        if warning_msg != "":
+            warnings.warn(warning_msg)
+
+        return quantization_config
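A minimal usage sketch of the dispatch logic in `auto.py` above (illustration only, not part of the diff): a raw config dict with no `quant_method` key takes the bitsandbytes back-compat branch in `AutoQuantizationConfig.from_dict` and resolves to the 4-bit quantizer. This assumes the transformers version vendored here; constructing the quantizer object does not itself require bitsandbytes to be installed.

```python
# Illustration only: how AutoHfQuantizer resolves a bare config dict, e.g.
# one read from a checkpoint's config.json.
from transformers.quantizers import AutoHfQuantizer
from transformers.utils.quantization_config import BitsAndBytesConfig

# `quant_method` is absent, so from_dict() derives "bitsandbytes_4bit"
# from the load_in_4bit flag before looking up AUTO_QUANTIZER_MAPPING.
config_dict = {"load_in_4bit": True}
quantizer = AutoHfQuantizer.from_config(config_dict)

print(type(quantizer).__name__)  # Bnb4BitHfQuantizer
print(isinstance(quantizer.quantization_config, BitsAndBytesConfig))  # True
```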
+ """ + + requires_calibration = False + required_packages = None + requires_parameters_quantization = False + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + self.quantization_config = quantization_config + + # -- Handle extra kwargs below -- + self.modules_to_not_convert = kwargs.pop("modules_to_not_convert", []) + self.pre_quantized = kwargs.pop("pre_quantized", True) + + if not self.pre_quantized and self.requires_calibration: + raise ValueError( + f"The quantization method {quantization_config.quant_method} does require the model to be pre-quantized." + f" You explicitly passed `pre_quantized=False` meaning your model weights are not quantized. Make sure to " + f"pass `pre_quantized=True` while knowing what you are doing." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + """ + Some quantization methods require to explicitly set the dtype of the model to a + target dtype. You need to override this method in case you want to make sure that behavior is + preserved + + Args: + torch_dtype (`torch.dtype`): + The input dtype that is passed in `from_pretrained` + """ + return torch_dtype + + def update_device_map(self, device_map: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + """ + Override this method if you want to pass a override the existing device map with a new + one. E.g. for bitsandbytes, since `accelerate` is a hard requirement, if no device_map is + passed, the device_map is set to `"auto"`` + + Args: + device_map (`Union[dict, str]`, *optional*): + The device_map that is passed through the `from_pretrained` method. + """ + return device_map + + def adjust_target_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + """ + Override this method if you want to adjust the `target_dtype` variable used in `from_pretrained` + to compute the device_map in case the device_map is a `str`. E.g. for bitsandbytes we force-set `target_dtype` + to `torch.int8` and for 4-bit we pass a custom enum `accelerate.CustomDtype.int4`. + + Args: + torch_dtype (`torch.dtype`, *optional*): + The torch_dtype that is used to compute the device_map. + """ + return torch_dtype + + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: + """ + Override this method if you want to adjust the `missing_keys`. + + Args: + missing_keys (`List[str]`, *optional*): + The list of missing keys in the checkpoint compared to the state dict of the model + """ + return missing_keys + + def update_expected_keys(self, model, expected_keys: List[str], loaded_keys: List[str]) -> List[str]: + """ + Override this method if you want to adjust the `update_expected_keys`. + + Args: + expected_keys (`List[str]`, *optional*): + The list of the expected keys in the initialized model. + loaded_keys (`List[str]`, *optional*): + The list of the loaded keys in the checkpoint. + """ + return expected_keys + + def get_special_dtypes_update(self, model, torch_dtype: "torch.dtype") -> Dict[str, "torch.dtype"]: + """ + returns dtypes for modules that are not quantized - used for the computation of the device_map in case + one passes a str as a device_map. The method will use the `modules_to_not_convert` that is modified + in `_process_model_before_weight_loading`. + + Args: + model (`~transformers.PreTrainedModel`): + The model to quantize + torch_dtype (`torch.dtype`): + The dtype passed in `from_pretrained` method. 
+ """ + + return { + name: torch_dtype + for name, _ in model.named_parameters() + if any(m in name for m in self.modules_to_not_convert) + } + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + """adjust max_memory argument for infer_auto_device_map() if extra memory is needed for quantization""" + return max_memory + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + """ + checks if a loaded state_dict component is part of quantized param + some validation; only defined if + requires_parameters_quantization == True for quantization methods that require to create a new parameters + for quantization. + """ + return False + + def create_quantized_param(self, *args, **kwargs) -> "torch.nn.Parameter": + """ + takes needed components from state_dict and creates quantized param; only applicable if + requires_parameters_quantization == True + """ + if not self.requires_parameters_quantization: + raise AttributeError( + f"`.create_quantized_param()` method is not supported by quantizer class {self.__class__.__name__}." + ) + + def validate_environment(self, *args, **kwargs): + """ + This method is used to potentially check for potential conflicts with arguments that are + passed in `from_pretrained`. You need to define it for all future quantizers that are integrated with transformers. + If no explicit check are needed, simply return nothing. + """ + return + + def preprocess_model(self, model: "PreTrainedModel", **kwargs): + """ + Setting model attributes and/or converting model before weights loading. At this point + the model should be initialized on the meta device so you can freely manipulate the skeleton + of the model in order to replace modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`. + + Args: + model (`~transformers.PreTrainedModel`): + The model to quantize + kwargs (`dict`, *optional*): + The keyword arguments that are passed along `_process_model_before_weight_loading`. + """ + model.is_quantized = True + model.quantization_method = self.quantization_config.quant_method + return self._process_model_before_weight_loading(model, **kwargs) + + def postprocess_model(self, model: "PreTrainedModel", **kwargs): + """ + Post-process the model post weights loading. + Make sure to override the abstract method `_process_model_after_weight_loading`. + + Args: + model (`~transformers.PreTrainedModel`): + The model to quantize + kwargs (`dict`, *optional*): + The keyword arguments that are passed along `_process_model_after_weight_loading`. + """ + return self._process_model_after_weight_loading(model, **kwargs) + + def dequantize(self, model): + """ + Potentially dequantize the model to retrive the original model, with some loss in accuracy / performance. + Note not all quantization schemes support this. + """ + model = self._dequantize(model) + + # Delete quantizer and quantization config + del model.hf_quantizer + del model.config.quantization_config + del model.config._pre_quantization_dtype + model.is_quantized = False + + return model + + def _dequantize(self, model): + raise NotImplementedError( + f"{self.quantization_config.quant_method} has no implementation of `dequantize`, please raise an issue on GitHub." 
+ ) + + @property + def is_qat_trainable(self) -> bool: + """Flag indicating whether the quantized model can carry out quantization aware training""" + return False + + @abstractmethod + def _process_model_before_weight_loading(self, model, **kwargs): ... + + @abstractmethod + def _process_model_after_weight_loading(self, model, **kwargs): ... + + @abstractmethod + def is_serializable(self, safe_serialization=None): ... + + @property + @abstractmethod + def is_trainable(self): ... diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py new file mode 100644 index 0000000000000000000000000000000000000000..9d1d6f7e89f1e9aa2454a1fd56c81524660f9079 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py @@ -0,0 +1,97 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Optional + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..integrations import replace_with_aqlm_linear +from ..utils import is_accelerate_available, is_aqlm_available, is_torch_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class AqlmHfQuantizer(HfQuantizer): + """ + Quantizer of the AQLM method. Enables the loading of prequantized models. + """ + + requires_calibration = True + required_packages = ["aqlm"] + optimum_quantizer = None + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Using `aqlm` quantization requires Accelerate: `pip install accelerate`") + + if not is_aqlm_available(): + raise ImportError("Using `aqlm` quantization requires AQLM: `pip install aqlm[gpu,cpu]`") + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + if torch.cuda.is_available(): + torch_dtype = torch.float16 + logger.info( + "CUDA available. Assuming AQLM inference on GPU and loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually." + ) + else: + torch_dtype = torch.float32 + logger.info( + "CUDA is unavailable. Assuming AQLM inference on CPU and loading the model in `torch.float32`. To overwrite it, set `torch_dtype` manually." 
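To make the `HfQuantizer` contract above concrete, here is a hypothetical minimal subclass (illustration only, not part of the diff). It shows exactly which abstract hooks a new quantization backend must implement; the class name and its no-op behavior are invented for the example.

```python
# Hypothetical minimal HfQuantizer subclass: the smallest surface a new
# quantization backend must provide on top of the abstract base class.
from transformers.quantizers.base import HfQuantizer


class NoOpQuantizer(HfQuantizer):
    """Does nothing; exists only to show the required overrides."""

    requires_calibration = False

    def validate_environment(self, *args, **kwargs):
        return  # no extra packages to check for

    def _process_model_before_weight_loading(self, model, **kwargs):
        # A real backend would swap nn.Linear modules for quantized layers
        # here, while the model still lives on the meta device.
        return model

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self):
        return False
```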
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1d6f7e89f1e9aa2454a1fd56c81524660f9079
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_aqlm.py
@@ -0,0 +1,97 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+from typing import TYPE_CHECKING, Optional
+
+from packaging import version
+
+from .base import HfQuantizer
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from ..integrations import replace_with_aqlm_linear
+from ..utils import is_accelerate_available, is_aqlm_available, is_torch_available, logging
+from ..utils.quantization_config import QuantizationConfigMixin
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+class AqlmHfQuantizer(HfQuantizer):
+    """
+    Quantizer of the AQLM method. Enables the loading of prequantized models.
+    """
+
+    requires_calibration = True
+    required_packages = ["aqlm"]
+    optimum_quantizer = None
+
+    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+        self.quantization_config = quantization_config
+
+    def validate_environment(self, *args, **kwargs):
+        if not is_accelerate_available():
+            raise ImportError("Using `aqlm` quantization requires Accelerate: `pip install accelerate`")
+
+        if not is_aqlm_available():
+            raise ImportError("Using `aqlm` quantization requires AQLM: `pip install aqlm[gpu,cpu]`")
+
+    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
+        if torch_dtype is None:
+            if torch.cuda.is_available():
+                torch_dtype = torch.float16
+                logger.info(
+                    "CUDA available. Assuming AQLM inference on GPU and loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually."
+                )
+            else:
+                torch_dtype = torch.float32
+                logger.info(
+                    "CUDA is unavailable. Assuming AQLM inference on CPU and loading the model in `torch.float32`. To overwrite it, set `torch_dtype` manually."
+                )
+        return torch_dtype
+
+    def _process_model_before_weight_loading(
+        self,
+        model: "PreTrainedModel",
+        **kwargs,
+    ):
+        replace_with_aqlm_linear(
+            model,
+            quantization_config=self.quantization_config,
+            linear_weights_not_to_quantize=self.quantization_config.linear_weights_not_to_quantize,
+        )
+        model.config.quantization_config = self.quantization_config
+
+    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        return model
+
+    @property
+    def is_trainable(self, model: Optional["PreTrainedModel"] = None):
+        aqlm_supports_training = version.parse(importlib.metadata.version("aqlm")) >= version.parse("1.0.2")
+        if aqlm_supports_training:
+            return True
+        else:
+            logger.warning(
+                f"Currently installed `aqlm` version ({importlib.metadata.version('aqlm')}) doesn't support training. If you wish to train a quantized model, please update `aqlm` with `pip install aqlm>=1.0.2`"
+            )
+            return False
+
+    def is_serializable(self, safe_serialization=None):
+        return True
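A short illustration of the dtype defaulting in `AqlmHfQuantizer.update_torch_dtype` above (hypothetical usage, not part of the diff; assumes torch is installed, but the aqlm package is not needed just to exercise this method, since environment validation happens separately):

```python
# Illustration only: AQLM derives a default torch_dtype from the hardware
# when the caller does not pin one explicitly.
import torch
from transformers.quantizers.quantizer_aqlm import AqlmHfQuantizer
from transformers.utils.quantization_config import AqlmConfig

quantizer = AqlmHfQuantizer(AqlmConfig())
dtype = quantizer.update_torch_dtype(None)
# float16 on a CUDA machine, float32 on CPU-only hosts
assert dtype in (torch.float16, torch.float32)
```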
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_awq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_awq.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a756b23a07e74253aeff1f22499f777126c2e6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_awq.py
@@ -0,0 +1,152 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib.metadata
+from typing import TYPE_CHECKING
+
+from packaging import version
+
+from .base import HfQuantizer
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from ..utils import is_accelerate_available, is_auto_awq_available, is_torch_available, logging
+from ..utils.quantization_config import AWQLinearVersion
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+class AwqQuantizer(HfQuantizer):
+    """
+    4-bit quantization for Activation-aware Weight Quantization (AWQ) (https://arxiv.org/abs/2306.00978)
+    """
+
+    # AWQ requires data calibration - we support only inference
+    requires_calibration = True
+
+    required_packages = ["awq", "accelerate"]
+
+    def __init__(self, quantization_config, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+
+    def validate_environment(self, device_map, **kwargs):
+        if not is_auto_awq_available():
+            raise ImportError("Loading an AWQ quantized model requires the auto-awq library (`pip install autoawq`)")
+
+        if not is_accelerate_available():
+            raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)")
+
+        if self.quantization_config.version == AWQLinearVersion.GEMM and not torch.cuda.is_available():
+            logger.warning_once("No CUDA found, replacing GEMM with the IPEX version to support non-CUDA AWQ models.")
+            self.quantization_config.version = AWQLinearVersion.IPEX
+
+        if self.quantization_config.version == AWQLinearVersion.IPEX:
+            if version.parse(importlib.metadata.version("autoawq")) < version.parse("0.2.6"):
+                raise RuntimeError(
+                    "To use the IPEX backend, you need autoawq>=0.2.6. Please install the latest version or install from source."
+                )
+            if device_map is None:
+                logger.warning_once(
+                    "You have loaded an AWQ model without setting device_map; please set it to 'cpu', 'xpu' or 'auto'"
+                )
+            elif isinstance(device_map, dict) and "disk" in device_map.values():
+                raise ValueError(
+                    "You are attempting to load an IPEX version AWQ model with a device_map that contains a disk device."
+                    " This is not supported. Please make sure only cpu and xpu are in the device_map."
+                )
+        else:
+            if not torch.cuda.is_available():
+                raise RuntimeError(
+                    "A GPU is required to run AWQ quantized models. You can use the IPEX version of AWQ if you have an Intel CPU"
+                )
+
+            if device_map is None:
+                logger.warning_once(
+                    "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set "
+                    "your model on a GPU device in order to run your model."
+                )
+            elif device_map is not None:
+                if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
+                    raise ValueError(
+                        "You are attempting to load an AWQ model with a device_map that contains a CPU or disk device."
+                        " This is not supported. Please remove the CPU or disk device from the device_map."
+                    )
+
+    def update_torch_dtype(self, torch_dtype):
+        if torch_dtype is None:
+            torch_dtype = torch.float16
+            logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.")
+        elif torch_dtype != torch.float16:
+            logger.warning("We suggest you set `torch_dtype=torch.float16` for better efficiency with AWQ.")
+        return torch_dtype
+
+    def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        from ..integrations import get_keys_to_not_convert, replace_quantization_scales, replace_with_awq_linear
+
+        self.modules_to_not_convert = get_keys_to_not_convert(model)
+
+        if self.quantization_config.modules_to_not_convert is not None:
+            self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)
+
+        model, has_been_replaced = replace_with_awq_linear(
+            model, quantization_config=self.quantization_config, modules_to_not_convert=self.modules_to_not_convert
+        )
+
+        model = replace_quantization_scales(model, model.config.model_type)
+
+        if not has_been_replaced:
+            logger.warning(
+                "You are loading an AWQ model but no linear modules were found in your model."
+                " Please double check your model architecture, or submit an issue on github if you think this is a bug."
+            )
+
+    def _process_model_after_weight_loading(self, model, **kwargs):
+        if self.quantization_config.do_fuse:
+            from ..integrations import fuse_awq_modules
+
+            model = fuse_awq_modules(model, self.quantization_config)
+            model._awq_is_fused = True  # TODO: consider storing this flag in model.config instead
+
+        if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
+            from ..integrations import post_init_awq_exllama_modules
+
+            model = post_init_awq_exllama_modules(model, self.quantization_config.exllama_config)
+
+        if self.quantization_config.version == AWQLinearVersion.IPEX:
+            from ..integrations import post_init_awq_ipex_modules
+
+            model = post_init_awq_ipex_modules(model)
+
+    def is_serializable(self, safe_serialization=None):
+        # AWQ through auto-awq has always been serializable, except if the model is fused.
+        if self.quantization_config.do_fuse:
+            logger.warning("You cannot save an AWQ model that uses fused modules!")
+            return False
+
+        if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
+            logger.warning("You cannot save an AWQ model that uses the Exllama backend!")
+            return False
+
+        return True
+
+    @property
+    def is_trainable(self):
+        # AWQ supports PEFT fine-tuning from version 0.2.0
+        MIN_AWQ_VERSION_FOR_PEFT = "0.2.0"
+        return version.parse(importlib.metadata.version("autoawq")) >= version.parse(MIN_AWQ_VERSION_FOR_PEFT)
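A sketch of what `AwqQuantizer.update_torch_dtype` above enforces (hypothetical usage, not part of the diff): AWQ kernels are fp16-oriented, so `None` becomes `torch.float16`, and any other dtype is kept but triggers an efficiency warning. Assumes torch is installed; the quantizer object itself can be constructed without autoawq.

```python
# Illustration only: AWQ's dtype policy.
import torch
from transformers.quantizers.quantizer_awq import AwqQuantizer
from transformers.utils.quantization_config import AwqConfig

quantizer = AwqQuantizer(AwqConfig(bits=4))
assert quantizer.update_torch_dtype(None) == torch.float16
# Logs a warning but respects the caller's choice:
assert quantizer.update_torch_dtype(torch.bfloat16) == torch.bfloat16
```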
+from typing import TYPE_CHECKING, Dict, List, Union + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_torch_available, logging + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class BitNetHfQuantizer(HfQuantizer): + """ + 1.58-bit quantization from the BitNet quantization method: + Before loading: converts the linear layers into BitLinear layers. + + Check out the paper introducing this method: https://arxiv.org/pdf/2402.17764 + """ + + requires_parameters_quantization = False + requires_calibration = True + + required_packages = ["accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Loading a BitNet quantized model requires accelerate (`pip install accelerate`)") + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Loading ternary weights from tf/flax is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + if not torch.cuda.is_available(): + logger.warning_once( + "You don't have a GPU available to load the model; inference will be slow because of weight unpacking" + ) + return + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded a BitNet model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model." + ) + elif device_map is not None: + if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load a BitNet model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." 
+ ) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_bitnet_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_bitnet_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + target_dtype = torch.int8 + return target_dtype + + def is_serializable(self, safe_serialization=None): + return True + + @property + def is_trainable(self) -> bool: + return False diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_4bit.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_4bit.py new file mode 100644 index 0000000000000000000000000000000000000000..8657bda166254df45217519c5119f7eff3f1566e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_4bit.py @@ -0,0 +1,362 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
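The adjust_max_memory override in BitNetHfQuantizer above implements a simple head-room rule: every device budget is scaled to 90%, presumably so that temporary buffers created while unpacking ternary weights still fit. A minimal sketch of the same rule, with illustrative byte budgets:

# Illustrative budgets; the 0.90 factor mirrors adjust_max_memory above.
max_memory = {0: 24 * 1024**3, "cpu": 64 * 1024**3}  # device -> bytes available
adjusted = {key: val * 0.90 for key, val in max_memory.items()}
assert adjusted[0] == 24 * 1024**3 * 0.90  # 10% reserved on every device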
+import importlib +from functools import cached_property +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from packaging import version + +from .base import HfQuantizer +from .quantizers_utils import get_module_from_name + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import ( + ACCELERATE_MIN_VERSION, + is_accelerate_available, + is_bitsandbytes_available, + is_torch_available, + is_torch_npu_available, + is_torch_xpu_available, + logging, +) + + +if is_torch_available(): + import torch + + from ..pytorch_utils import Conv1D + +logger = logging.get_logger(__name__) + + +class Bnb4BitHfQuantizer(HfQuantizer): + """ + 4-bit quantization from bitsandbytes.py quantization method: + before loading: converts transformer layers into Linear4bit during loading: load 16bit weight and pass to the + layer object after: quantizes individual weights in Linear4bit into 4bit at the first .cuda() call + saving: + from state dict, as usual; saves weights and `quant_state` components + loading: + need to locate `quant_state` components and pass to Param4bit constructor + """ + + use_keep_in_fp32_modules = True + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["bitsandbytes", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + if self.quantization_config.llm_int8_skip_modules is not None: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError( + f"Using `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" + ) + if not is_bitsandbytes_available(): + raise ImportError( + "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" + ) + + from ..integrations import validate_bnb_backend_availability + from ..utils import is_bitsandbytes_multi_backend_available + + bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() + validate_bnb_backend_availability(raise_exception=True) + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Converting into 4-bit or 8-bit weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + device_map = kwargs.get("device_map", None) + if ( + device_map is not None + and isinstance(device_map, dict) + and not self.quantization_config.llm_int8_enable_fp32_cpu_offload + ): + device_map_without_lm_head = { + key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert + } + if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled: + pass + elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): + raise ValueError( + "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " + "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " + "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to " + "`from_pretrained`. Check " + "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu " + "for more details. 
" + ) + + if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.39.0"): + raise ValueError( + "You have a version of `bitsandbytes` that is not compatible with 4bit inference and training" + " make sure you have the latest version of `bitsandbytes` installed" + ) + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"): + from accelerate.utils import CustomDtype + + if target_dtype != torch.int8: + logger.info("target_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantization") + return CustomDtype.INT4 + else: + raise ValueError( + "You are using `device_map='auto'` on a 4bit loaded version of the model. To automatically compute" + " the appropriate device map, you should upgrade your `accelerate` library," + "`pip install --upgrade accelerate` or install it from source to support fp4 auto device map" + "calculation. You may encounter unexpected behavior, or pass your own device map" + ) + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module._parameters.get(tensor_name, None), bnb.nn.Params4bit): + # Add here check for loaded components' dtypes once serialization is implemented + return True + elif isinstance(module, bnb.nn.Linear4bit) and tensor_name == "bias": + # bias could be loaded by regular set_module_tensor_to_device() from accelerate, + # but it would wrongly use uninitialized weight there. + return True + else: + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + combines logic from _load_state_dict_into_meta_model and .integrations.bitsandbytes.py::set_module_quantized_tensor_to_device() + """ + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + + old_value = getattr(module, tensor_name) + + # `torch.Tensor.to()` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)). + if isinstance(target_device, int) and is_torch_npu_available(): + target_device = f"npu:{target_device}" + if tensor_name == "bias": + if param_value is None: + new_value = old_value.to(target_device) + else: + new_value = param_value.to(target_device) + + new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad) + module._parameters[tensor_name] = new_value + return + + if not isinstance(module._parameters[tensor_name], bnb.nn.Params4bit): + raise ValueError("this function only loads `Linear4bit components`") + if ( + old_value.device == torch.device("meta") + and target_device not in ["meta", torch.device("meta")] + and param_value is None + ): + raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") + + # construct `new_value` for the module._parameters[tensor_name]: + if self.pre_quantized: + # 4bit loading. 
Collecting components for restoring quantized weight + # This can be expanded to make a universal call for any quantized weight loading + + if not self.is_serializable: + raise ValueError( + "Detected int4 weights but the version of bitsandbytes is not compatible with int4 serialization. " + "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." + ) + + if (param_name + ".quant_state.bitsandbytes__fp4" not in state_dict) and ( + param_name + ".quant_state.bitsandbytes__nf4" not in state_dict + ): + raise ValueError( + f"Supplied state dict for {param_name} does not contain `bitsandbytes__*` and possibly other `quantized_stats` components." + ) + + quantized_stats = {} + for k, v in state_dict.items(): + if param_name + "." in k: + quantized_stats[k] = v + if unexpected_keys is not None and k in unexpected_keys: + unexpected_keys.remove(k) + + param_kwargs = {} + if self.is_bnb_supports_quant_storage_module: + param_kwargs["module"] = module + + new_value = bnb.nn.Params4bit.from_prequantized( + data=param_value, + quantized_stats=quantized_stats, + requires_grad=False, + device=target_device, + **param_kwargs, + ) + else: + new_value = param_value.to("cpu") + + # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization. + # Since weights are saved in the correct "orientation", we skip transposing when loading. + if issubclass(module.source_cls, Conv1D): + new_value = new_value.T + + kwargs = old_value.__dict__ + new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device) + + module._parameters[tensor_name] = new_value + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.adjust_max_memory + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + # need more space for buffers that are created during quantization + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.update_torch_dtype + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + # We force the `dtype` to be float16, this is a requirement from `bitsandbytes` + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to " + "requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. " + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.float16 to remove this warning.", + torch_dtype, + ) + torch_dtype = torch.float16 + return torch_dtype + + def update_device_map(self, device_map): + if device_map is None: + if torch.cuda.is_available(): + device_map = {"": torch.cuda.current_device()} + elif is_torch_npu_available(): + device_map = {"": f"npu:{torch.npu.current_device()}"} + elif is_torch_xpu_available(): + device_map = {"": f"xpu:{torch.xpu.current_device()}"} + else: + device_map = {"": "cpu"} + logger.info( + "The device_map was not initialized. " + f"Setting device_map to {device_map}. 
" + "If you want to use the model for inference, please set device_map ='auto' " + ) + return device_map + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer._process_model_before_weight_loading + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear + + llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload + + # We keep some modules such as the lm_head in their original dtype for numerical stability reasons + if self.quantization_config.llm_int8_skip_modules is None: + self.modules_to_not_convert = get_keys_to_not_convert(model) + else: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk` + if isinstance(device_map, dict) and len(device_map.keys()) > 1: + keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]] + + if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload: + raise ValueError( + "If you want to offload some keys to `cpu` or `disk`, you need to set " + "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be " + " converted to 8-bit but kept in 32-bit." + ) + self.modules_to_not_convert.extend(keys_on_cpu) + + model = replace_with_bnb_linear( + model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config + ) + # TODO: consider bringing replace_with_bnb_linear() code from ..integrations/bitsandbyter.py to here + + model.config.quantization_config = self.quantization_config + + # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer._process_model_after_weight_loading with 8bit->4bit + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + model.is_loaded_in_4bit = True + model.is_4bit_serializable = self.is_serializable() + return model + + def is_serializable(self, safe_serialization=None): + _is_4bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.41.3") + + if not _is_4bit_serializable: + logger.warning( + "You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. " + "If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed." 
+ ) + return False + + return True + + @cached_property + def is_bnb_supports_quant_storage_module(self) -> bool: + """ + determines if the current version of bitsandbytes supports + the `module` parameter in `Params4bit.from_prequantized` + """ + return version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.43.3") + + @property + def is_trainable(self) -> bool: + return True + + def _dequantize(self, model): + from ..integrations import dequantize_and_replace + + model = dequantize_and_replace( + model, self.modules_to_not_convert, quantization_config=self.quantization_config + ) + return model diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_8bit.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_8bit.py new file mode 100644 index 0000000000000000000000000000000000000000..093d612b914cefbb9e472bfa4f68f42d914ff480 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_bnb_8bit.py @@ -0,0 +1,310 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import ( + ACCELERATE_MIN_VERSION, + is_accelerate_available, + is_bitsandbytes_available, + is_torch_available, + is_torch_xpu_available, + logging, +) +from .quantizers_utils import get_module_from_name + + +if is_torch_available(): + import torch + + from ..pytorch_utils import Conv1D + +logger = logging.get_logger(__name__) + + +class Bnb8BitHfQuantizer(HfQuantizer): + """ + 8-bit quantization from bitsandbytes quantization method: + before loading: converts transformer layers into Linear8bitLt during loading: load 16bit weight and pass to the + layer object after: quantizes individual weights in Linear8bitLt into 8bit at the first .cuda() call + saving: + from state dict, as usual; saves weights and 'SCB' component + loading: + need to locate SCB component and pass to the Linear8bitLt object + """ + + use_keep_in_fp32_modules = True + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["bitsandbytes", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + if self.quantization_config.llm_int8_skip_modules is not None: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError( + f"Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" + ) + if not is_bitsandbytes_available(): + raise ImportError( + "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" + ) + + from 
..integrations import validate_bnb_backend_availability + from ..utils import is_bitsandbytes_multi_backend_available + + bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() + validate_bnb_backend_availability(raise_exception=True) + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Converting into 4-bit or 8-bit weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + device_map = kwargs.get("device_map", None) + if ( + device_map is not None + and isinstance(device_map, dict) + and not self.quantization_config.llm_int8_enable_fp32_cpu_offload + ): + device_map_without_lm_head = { + key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert + } + if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled: + pass + elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): + raise ValueError( + "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " + "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " + "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to " + "`from_pretrained`. Check " + "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu " + "for more details. " + ) + + if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.2"): + raise ValueError( + "You have a version of `bitsandbytes` that is not compatible with 8bit inference and training" + " make sure you have the latest version of `bitsandbytes` installed" + ) + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + # need more space for buffers that are created during quantization + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + # We force the `dtype` to be float16, this is a requirement from `bitsandbytes` + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to " + "requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. " + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.float16 to remove this warning.", + torch_dtype, + ) + torch_dtype = torch.float16 + return torch_dtype + + def update_device_map(self, device_map): + if device_map is None: + if torch.cuda.is_available(): + device_map = {"": torch.cuda.current_device()} + elif is_torch_xpu_available(): + device_map = {"": f"xpu:{torch.xpu.current_device()}"} + else: + device_map = {"": "cpu"} + logger.info( + "The device_map was not initialized. " + f"Setting device_map to {device_map}. 
" + "If you want to use the model for inference, please set device_map ='auto' " + ) + return device_map + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if target_dtype != torch.int8: + logger.info("target_dtype {target_dtype} is replaced by `torch.int8` for 8-bit BnB quantization") + return torch.int8 + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module._parameters.get(tensor_name, None), bnb.nn.Int8Params): + if self.pre_quantized: + if param_name.replace("weight", "SCB") not in state_dict.keys(): + raise ValueError("Missing quantization component `SCB`") + if param_value.dtype != torch.int8: + raise ValueError( + f"Incompatible dtype `{param_value.dtype}` when loading 8-bit prequantized weight. Expected `torch.int8`." + ) + return True + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + combines logic from _load_state_dict_into_meta_model and .integrations.bitsandbytes.py::set_module_quantized_tensor_to_device() + needs aux items from state dicts, if found - removes them from unexpected_keys + """ + import bitsandbytes as bnb + + fp16_statistics_key = param_name.replace("weight", "SCB") + fp16_weights_format_key = param_name.replace("weight", "weight_format") + + fp16_statistics = state_dict.get(fp16_statistics_key, None) + fp16_weights_format = state_dict.get(fp16_weights_format_key, None) + + module, tensor_name = get_module_from_name(model, param_name) + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + + old_value = getattr(module, tensor_name) + + if not isinstance(module._parameters[tensor_name], bnb.nn.Int8Params): + raise ValueError(f"Parameter `{tensor_name}` should only be a `bnb.nn.Int8Params` instance.") + if ( + old_value.device == torch.device("meta") + and target_device not in ["meta", torch.device("meta")] + and param_value is None + ): + raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") + + new_value = param_value.to("cpu") + if self.pre_quantized and not self.is_serializable(): + raise ValueError( + "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. " + "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." + ) + + # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization. + # Since weights are saved in the correct "orientation", we skip transposing when loading. 
+ if issubclass(module.source_cls, Conv1D): + if fp16_statistics is None: + new_value = new_value.T + + kwargs = old_value.__dict__ + new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(target_device) + + module._parameters[tensor_name] = new_value + if fp16_statistics is not None: + setattr(module.weight, "SCB", fp16_statistics.to(target_device)) + if unexpected_keys is not None: + unexpected_keys.remove(fp16_statistics_key) + + # We just need to pop the `weight_format` keys from the state dict to remove unneeded + # messages. The correct format is correctly retrieved during the first forward pass. + if fp16_weights_format is not None and unexpected_keys is not None: + unexpected_keys.remove(fp16_weights_format_key) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + model.is_loaded_in_8bit = True + model.is_8bit_serializable = self.is_serializable() + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear + + llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload + + # We keep some modules such as the lm_head in their original dtype for numerical stability reasons + if self.quantization_config.llm_int8_skip_modules is None: + self.modules_to_not_convert = get_keys_to_not_convert(model) + else: + self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk` + if isinstance(device_map, dict) and len(device_map.keys()) > 1: + keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]] + + if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload: + raise ValueError( + "If you want to offload some keys to `cpu` or `disk`, you need to set " + "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be " + " converted to 8-bit but kept in 32-bit." + ) + self.modules_to_not_convert.extend(keys_on_cpu) + + model = replace_with_bnb_linear( + model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config + ) + # TODO: consider bringing replace_with_bnb_linear() code from ..integrations/bitsandbytes.py to here + + model.config.quantization_config = self.quantization_config + + def is_serializable(self, safe_serialization=None): + _bnb_supports_8bit_serialization = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse( + "0.37.2" + ) + + if not _bnb_supports_8bit_serialization: + logger.warning( + "You are calling `save_pretrained` on an 8-bit converted model, but your `bitsandbytes` version doesn't support it. " + "If you want to save 8-bit models, make sure to have `bitsandbytes>0.37.2` installed. You will most likely face errors or" + " unexpected behaviours." 
+ ) + return False + + return True + + @property + def is_trainable(self) -> bool: + return version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.37.0") + + def _dequantize(self, model): + from ..integrations import dequantize_and_replace + + model = dequantize_and_replace( + model, self.modules_to_not_convert, quantization_config=self.quantization_config + ) + return model diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_compressed_tensors.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_compressed_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..7d208087bbbfece0a4bb47238a773b23e3dbcd77 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_compressed_tensors.py @@ -0,0 +1,131 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +from ..utils import is_compressed_tensors_available, is_torch_available, logging +from ..utils.quantization_config import CompressedTensorsConfig +from .base import HfQuantizer + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class CompressedTensorsHfQuantizer(HfQuantizer): + """ + Quantizer for the compressed_tensors package. Loads and restores models to + quantized state with compressed_tensors + """ + + requires_calibration = True + required_packages = ["compressed_tensors"] + + def __init__(self, quantization_config: CompressedTensorsConfig, **kwargs): + super().__init__(quantization_config, **kwargs) + + if not is_compressed_tensors_available(): + raise ImportError( + "Using `compressed_tensors` quantized models requires the compressed-tensors library: " + "`pip install compressed-tensors`" + ) + + from compressed_tensors.compressors import ModelCompressor + + self.compressor = ModelCompressor.from_compression_config(quantization_config) + self.run_compressed = quantization_config.run_compressed + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_compressed_tensors_available(): + raise ImportError( + "Using `compressed_tensors` quantized models requires the compressed-tensors library: " + "`pip install compressed-tensors`" + ) + if not is_torch_available(): + # torch already should be installed as part of compressed tensors + raise ImportError("torch is required for using compressed-tensors quantization") + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + logger.info("Loading model using torch.float16 for compressed-tensors quantization") + torch_dtype = torch.float16 + elif torch_dtype != torch.float16: + logger.info( + "We suggest you to set `torch_dtype=torch.float16` for better efficiency with compressed_tensors." 
+ ) + return torch_dtype + + def _process_model_before_weight_loading(self, model, **kwargs): + from compressed_tensors.quantization import apply_quantization_config + + ct_quantization_config = self.compressor.quantization_config + + if self.run_compressed and self.is_quantization_compressed: + apply_quantization_config(model, ct_quantization_config, run_compressed=True) + elif not self.is_quantization_compressed: + apply_quantization_config(model, ct_quantization_config) + + def _process_model_after_weight_loading(self, model, **kwargs): + """Decompress the loaded model if necessary - needed for QAT""" + + if (self.is_quantization_compressed and not self.run_compressed) or self.is_sparsification_compressed: + config = kwargs.get("config", None) + cache_path = config._name_or_path + + if not os.path.exists(cache_path): + from transformers.utils import cached_file + + config_file_path = cached_file(cache_path, "config.json") + cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) + + if self.is_quantization_compressed and not self.run_compressed: + from compressed_tensors.quantization import QuantizationStatus + + self.compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN + self.compressor.decompress(model_path=cache_path, model=model) + + @property + def is_quantization_compressed(self): + from compressed_tensors.quantization import QuantizationStatus + + return ( + self.quantization_config.quantization_config is not None + and self.quantization_config.quantization_config.quantization_status == QuantizationStatus.COMPRESSED + ) + + @property + def is_sparsification_compressed(self): + from compressed_tensors.config.base import CompressionFormat + + return ( + self.quantization_config.sparsity_config is not None + and self.quantization_config.sparsity_config.format != CompressionFormat.dense.value + ) + + @property + def is_trainable(self): + return True + + def is_qat_trainable(self) -> bool: + """Loaded models can carry out quantization-aware training""" + # models need to be decompressed to carry out QAT + return not self.run_compressed or not self.is_quantization_compressed + + def is_serializable(self, safe_serialization=None) -> bool: + """Models quantized using compressed tensors can be saved to disk""" + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_eetq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_eetq.py new file mode 100644 index 0000000000000000000000000000000000000000..7dfce75c373ad7c1411e99f0d3536bc0c475bfd1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_eetq.py @@ -0,0 +1,183 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
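As with the other prequantized formats, the CompressedTensorsHfQuantizer above is driven by the quantization_config embedded in the checkpoint; a usage sketch (the repo id below is hypothetical):

from transformers import AutoModelForCausalLM

# Loading a compressed-tensors checkpoint; the config ships inside the repo.
model = AutoModelForCausalLM.from_pretrained("some-org/llama-w4a16-compressed")
# If the config sets run_compressed=False, _process_model_after_weight_loading()
# decompresses the weights, which is what makes the model QAT-trainable
# (see is_qat_trainable() above).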
+from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_eetq_available, is_torch_available, logging +from .quantizers_utils import get_module_from_name + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class EetqHfQuantizer(HfQuantizer): + """ + 8-bit quantization from EETQ quantization method: + before loading: converts transformer layers into W8A16Linear during loading: load 16bit weight and pass to the + layer object after: quantizes individual weights in W8A16Linear into 8bit at the first .cuda() call + """ + + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["eetq", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_eetq_available(): + raise ImportError( + "Using `eetq` 8-bit quantization requires eetq. " + "Please install the latest version of eetq from: https://github.com/NetEase-FuXi/EETQ" + ) + + try: + import eetq # noqa: F401 + except ImportError as exc: + if "shard_checkpoint" in str(exc): + # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed + # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34. + # TODO: Update message once eetq releases a fix + raise ImportError( + "You are using a version of EETQ that is incompatible with the current transformers version. " + "Either downgrade transformers to <= v4.46.3 or, if available, upgrade EETQ to > v1.0.0." + ) from exc + else: + raise + + if not is_accelerate_available(): + raise ImportError("Loading an EETQ quantized model requires accelerate (`pip install accelerate`)") + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Converting into 8-bit weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + if not torch.cuda.is_available(): + raise RuntimeError("No GPU found. A GPU is needed for quantization.") + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded an EETQ model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model." + ) + elif device_map is not None: + if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load an EETQ model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.float16 + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to " + "requirements of `eetq` to enable model loading in 8-bit. 
" + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.float16 to remove this warning.", + torch_dtype, + ) + elif torch_dtype != torch.float16: + logger.info("We suggest you to set `torch_dtype=torch.float16` for better efficiency with EETQ.") + return torch_dtype + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + from eetq import EetqLinear + + module, tensor_name = get_module_from_name(model, param_name) + + if isinstance(module, EetqLinear): + if self.pre_quantized or tensor_name == "bias": + if tensor_name == "weight" and param_value.dtype != torch.int8: + raise ValueError("Expect quantized weights but got an unquantized weight") + return False + else: + if tensor_name == "weight_scale": + raise ValueError("Expect unquantized weights but got a quantized weight_scale") + return True + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + quantizes weights into qweight and weight_scales + """ + from eetq import quantize_and_preprocess_weights + + module, tensor_name = get_module_from_name(model, param_name) + new_value, weight_scale = quantize_and_preprocess_weights(param_value) + + module._buffers[tensor_name] = new_value.to(target_device) + module.register("weight_scales", weight_scale.to(target_device)) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_eetq_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_eetq_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + model.config.quantization_config = self.quantization_config + + def is_serializable(self, safe_serialization=None): + return True + + @property + def is_trainable(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.py new file mode 100644 index 0000000000000000000000000000000000000000..07d5ce87ef6cc16977762cf8671b8fba7fe24f8c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.py @@ -0,0 +1,204 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging +from .quantizers_utils import get_module_from_name + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class FbgemmFp8HfQuantizer(HfQuantizer): + """ + FP8 quantization using fbgemm kernels + """ + + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["fbgemm-gpu", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_torch_available() or version.parse(importlib.metadata.version("torch")) < version.parse("2.1.0"): + raise ImportError( + "Using fbgemm fp8 quantization requires torch >= 2.1.0. " + "Please install the latest version of torch (`pip install --upgrade torch`)" + ) + if not is_fbgemm_gpu_available(): + raise ImportError( + "Using fbgemm fp8 quantization requires the fbgemm-gpu library. " + "Please install the latest version of fbgemm-gpu by following: https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries" + ) + + if not is_accelerate_available("0.32.2"): + raise ImportError( + "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)" + ) + + if not torch.cuda.is_available(): + raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU") + + compute_capability = torch.cuda.get_device_capability() + major, minor = compute_capability + if major < 9: + raise ValueError( + "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)" + ) + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model. To remove this warning, pass device_map='cuda'. " + ) + elif device_map is not None: + if ( + not self.pre_quantized + and isinstance(device_map, dict) + and ("cpu" in device_map.values() or "disk" in device_map.values()) + ): + raise ValueError( + "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. " + "This is not supported when the model is quantized on the fly. " + "Please use a quantized checkpoint or remove the CPU or disk device from the device_map." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.bfloat16 + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to " + "requirements of `fbgemm-gpu` to enable model loading in fp8. " + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.bfloat16 to remove this warning.", + torch_dtype, + ) + elif torch_dtype == torch.float16: + raise ValueError( + "You cannot use FP8 with torch_dtype=torch.float16. " + "We recommend you pass torch_dtype=torch.bfloat16 instead." + ) + return torch_dtype + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + from ..integrations import FbgemmFp8Linear + + module, tensor_name = get_module_from_name(model, param_name) + + if isinstance(module, FbgemmFp8Linear): + if self.pre_quantized or tensor_name == "bias": + if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn: + raise ValueError("Expect quantized weights but got an unquantized weight") + return False + else: + if tensor_name == "weight_scale": + raise ValueError("Expect unquantized weights but got a quantized weight_scale") + return True + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + Quantizes weights into weight and weight_scale + """ + new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value) + + module, tensor_name = get_module_from_name(model, param_name) + module._buffers[tensor_name] = new_value.to(target_device) + # to have the right output shape -> (out_features, 1) + module._buffers["weight_scale"] = weight_scale.view(weight_scale.shape[0], 1).to(target_device) + + if unexpected_keys is not None and param_name in unexpected_keys: + unexpected_keys.remove(param_name) + del param_name + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_fbgemm_fp8_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_fbgemm_fp8_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + model.config.quantization_config = self.quantization_config + + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: + from ..integrations import FbgemmFp8Linear + + not_missing_keys = [] + for name, module in model.named_modules(): + if isinstance(module, FbgemmFp8Linear): + for missing in missing_keys: + if ( + (name in missing or name in f"{prefix}.{missing}") + and not missing.endswith(".weight") + and not missing.endswith(".bias") + ): + not_missing_keys.append(missing) + return [k for k in missing_keys if k not in not_missing_keys] + + def is_serializable(self, safe_serialization=None): + return True + + @property + def is_trainable(self) -> bool: + return False diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_gptq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_gptq.py new file mode 100644 index 0000000000000000000000000000000000000000..d47a2ba79cb60da847000f13a1de0527703ce0e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_gptq.py @@ -0,0 +1,101 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import TYPE_CHECKING, Optional + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_auto_gptq_available, is_optimum_available, is_torch_available, logging +from ..utils.quantization_config import GPTQConfig, QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class GptqHfQuantizer(HfQuantizer): + """ + Quantizer of the GPTQ method - for GPTQ the quantizer supports calibration of the model through the + `auto_gptq` package. Quantization is done under the hood for users if they load a non-prequantized model. + """ + + requires_calibration = False + required_packages = ["optimum", "auto_gptq"] + optimum_quantizer = None + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + + if not is_optimum_available(): + raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") + from optimum.gptq import GPTQQuantizer + + self.optimum_quantizer = GPTQQuantizer.from_dict(self.quantization_config.to_dict_optimum()) + + def validate_environment(self, *args, **kwargs): + if not is_optimum_available(): + raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") + + if not is_auto_gptq_available(): + raise ImportError( + "Loading a GPTQ quantized model requires the auto-gptq library (`pip install auto-gptq`)" + ) + + gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") + if not gptq_supports_cpu and not torch.cuda.is_available(): + raise RuntimeError("A GPU is required to quantize or run a quantized model.") + elif version.parse(importlib.metadata.version("auto_gptq")) < version.parse("0.4.2"): + raise ImportError( + "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq`" + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.float16 + logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.") + elif torch_dtype != torch.float16: + logger.info("We suggest you set `torch_dtype=torch.float16` for better efficiency with GPTQ.") + return torch_dtype + + def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs): + if model.__class__.main_input_name != "input_ids": + raise RuntimeError("We can only quantize pure text models.") + + if self.pre_quantized: + model = self.optimum_quantizer.convert_model(model) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + if self.pre_quantized: + model = self.optimum_quantizer.post_init_model(model) + else: + if self.quantization_config.tokenizer is None: + self.quantization_config.tokenizer = model.name_or_path + + self.optimum_quantizer.quantize_model(model, self.quantization_config.tokenizer) + model.config.quantization_config = GPTQConfig.from_dict(self.optimum_quantizer.to_dict()) + + @property + def is_trainable(self, model: Optional["PreTrainedModel"] = None): + return True + + def is_serializable(self, safe_serialization=None): + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_higgs.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_higgs.py new file mode 100644 index 0000000000000000000000000000000000000000..f33e2f21e98fd8f9636e040a0c4f3f37992b574c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_higgs.py @@ -0,0 +1,232 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from .base import HfQuantizer +from .quantizers_utils import get_module_from_name + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_flute_available, is_hadamard_available, is_torch_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +def get_num_sms_from_device(device): + target_device_cc = torch.cuda.get_device_capability(device=device) + if target_device_cc == (8, 6): + return 84 + elif target_device_cc == (8, 0): + return 108 + elif target_device_cc == (8, 9): + return 128 + else: + raise NotImplementedError( + f"Device capability {target_device_cc} is not supported by FLUTE (yet?). To verify your device capability, check out https://developer.nvidia.com/cuda-gpus" + ) + + +class HiggsHfQuantizer(HfQuantizer): + """ + Quantizer of the HIGGS method. Enables the loading of prequantized models and in-flight quantization of full-precision models. 
+ """ + + requires_calibration = False + requires_parameters_quantization = True + required_packages = ["flute-kernel", "fast_hadamard_transform"] + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, device_map, **kwargs): + if not torch.cuda.is_available(): + raise NotImplementedError("HIGGS quantization is only supported on GPU. Please use a different quantizer.") + + if not is_accelerate_available(): + raise ImportError("Using `higgs` quantization requires Accelerate: `pip install accelerate`") + + if not is_flute_available(): + raise ImportError("Using `higgs` quantization requires FLUTE: `pip install flute-kernel>=0.3.0`") + + if not is_hadamard_available(): + raise ImportError( + "Using `higgs` quantization requires fast_hadamard_transform: `pip install fast_hadamard_transform`" + ) + + if device_map is None: + raise ValueError( + "You are attempting to load a HIGGS model without setting device_map." + " Please set device_map comprised of 'cuda' devices." + ) + elif isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + logger.info("`torch_dtype` is None. Setting `torch_dtype=torch.float16` for FLUTE compatibility.") + torch_dtype = torch.float16 + elif torch_dtype != torch.float16 and torch_dtype != torch.bfloat16: + raise ValueError( + f"Invalid `torch_dtype` {torch_dtype}. HIGGS quantization only supports `torch_dtype=torch.float16` or `torch_dtype=torch.bfloat16`." 
+ ) + + return torch_dtype + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + from ..integrations import quantize_with_higgs + + """ + Quantizes weights into weight and weight_scale + """ + flute_dict = quantize_with_higgs( + param_value.to(target_device), + self.quantization_config.bits, + self.quantization_config.p, + self.quantization_config.group_size, + self.quantization_config.hadamard_size, + ) + + del param_value + + module, tensor_name = get_module_from_name(model, param_name) + for key, value in flute_dict.items(): + if key in module._parameters: + module._parameters[key] = torch.nn.Parameter(value, requires_grad=False) + elif key in module._buffers: + module._buffers[key] = torch.nn.Buffer(value) + else: + raise ValueError(f"Unexpected key {key} in module {module}") + + if unexpected_keys is not None and param_name in unexpected_keys: + unexpected_keys.remove(param_name) + + module.num_sms_packed = torch.nn.Parameter( + torch.tensor(get_num_sms_from_device(target_device), device=target_device, dtype=torch.int32), + requires_grad=False, + ) + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + **kwargs, + ): + from ..integrations import replace_with_higgs_linear + + replace_with_higgs_linear( + model, + quantization_config=self.quantization_config, + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + import flute.utils + + from ..integrations import HiggsLinear + + flute_workspaces = {} + for name, module in model.named_modules(): + if isinstance(module, HiggsLinear): + # Every HiggsLinear needs a "workspace": a buffer for the unpacking operation. + # This buffer needs to be on the same device as the weights, but can be reused across modules otherwise. + if module.weight.device not in flute_workspaces: + flute_workspaces[module.weight.device] = flute.utils.make_workspace_streamk( + device=module.weight.device + ) + module.workspace = flute_workspaces[module.weight.device] + + # FLUTE weights are packed in a way that is optimized for a specific number of SMs (GPU streaming multiprocessors). + # If the model is loaded on a different device than the one it was saved on, we need to repack the weights. 
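+                # The packed layout depends on the SM count of the GPU the weights were packed on
+                # (see get_num_sms_from_device above); create_quantized_param() records that count in
+                # the `num_sms_packed` parameter, which is compared against the current device here.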
+                if module.num_sms_packed.item() != get_num_sms_from_device(module.weight.device):
+                    new_device = module.weight.device
+                    new_num_sms = get_num_sms_from_device(new_device)
+                    module.weight.data = flute.utils.pack(
+                        flute.utils.unpack(
+                            weight=module.weight.data,
+                            scales=module.scales.data,
+                            workspace=module.workspace,
+                            num_bits=module.num_bits,
+                            group_size=module.group_size,
+                            num_sms_packed=module.num_sms_packed.item(),
+                        ).T.contiguous(),
+                        module.num_bits,
+                        module.group_size,
+                    )
+                    module.num_sms_packed = torch.nn.Parameter(
+                        torch.tensor(new_num_sms, device=new_device, dtype=torch.int32),
+                        requires_grad=False,
+                    )
+
+    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
+        from ..integrations import HiggsLinear
+
+        not_missing_keys = []
+        for name, module in model.named_modules():
+            if isinstance(module, HiggsLinear):
+                for missing in missing_keys:
+                    if (
+                        (name in missing or name in f"{prefix}.{missing}")
+                        and not missing.endswith(".weight")
+                        and not missing.endswith(".bias")
+                    ):
+                        not_missing_keys.append(missing)
+        return [k for k in missing_keys if k not in not_missing_keys]
+
+    @property
+    def is_trainable(self, model: Optional["PreTrainedModel"] = None):
+        return False
+
+    def is_serializable(self, safe_serialization=None):
+        return True
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        from ..integrations import HiggsLinear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+        if isinstance(module, HiggsLinear) and tensor_name == "weight" and param_value.dtype != torch.int16:
+            # Only quantize weights of HiggsLinear modules that are not already quantized
+            return True
+        else:
+            return False
+
+    def _dequantize(self, model):
+        from ..integrations import dequantize_higgs
+
+        model = dequantize_higgs(model)
+        return model
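For reference, the HIGGS flow above is driven entirely through `from_pretrained`: `_process_model_before_weight_loading` swaps in `HiggsLinear` modules, `create_quantized_param` quantizes each weight on the target GPU, and the post-load pass wires up FLUTE workspaces. A minimal sketch of in-flight quantization, assuming a transformers build that exports `HiggsConfig` (the checkpoint id and the `bits` value are illustrative):

    # Sketch only: assumes HiggsConfig is exported by this transformers build;
    # the checkpoint id is illustrative.
    import torch
    from transformers import AutoModelForCausalLM, HiggsConfig

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.1-8B",                # illustrative checkpoint
        quantization_config=HiggsConfig(bits=4),  # routes loading through HiggsHfQuantizer
        torch_dtype=torch.float16,                # the default update_torch_dtype() picks anyway
        device_map="auto",                        # must resolve to CUDA devices only
    )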
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_hqq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_hqq.py
new file mode 100644
index 0000000000000000000000000000000000000000..775fea8f4901e6116d796bca491e1bab5d74f46d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_hqq.py
@@ -0,0 +1,296 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Any, Dict, List
+
+from ..integrations import prepare_for_hqq_linear
+from ..utils import is_accelerate_available, is_hqq_available, is_torch_available, logging
+from .base import HfQuantizer
+from .quantizers_utils import get_module_from_name
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+
+if is_accelerate_available():
+    from accelerate.hooks import remove_hook_from_module
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+# Finds the parent of a node module named "name"
+def find_parent(model, name):
+    module_tree = name.split(".")[:-1]
+    parent = model
+    for m in module_tree:
+        parent = parent._modules[m]
+    return parent
+
+
+class HqqHfQuantizer(HfQuantizer):
+    """
+    HQQ quantizer base HF class.
+    nn.Linear modules are first tagged with quant_config in _process_model_before_weight_loading().
+    The actual quantization and offloading to the GPU are done in check_quantized_param().
+    """
+
+    use_keep_in_fp32_modules = False
+    requires_parameters_quantization = True
+    requires_calibration = False
+    required_packages = ["hqq"]
+
+    def __init__(self, quantization_config, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+        self.torch_dtype = None
+        self.using_multi_gpu = False
+
+    def validate_environment(self, *args, **kwargs):
+        if not (is_hqq_available()):
+            raise ImportError(
+                "A valid HQQ version (>=0.2.1) is not available. Please follow the instructions to install it: `https://github.com/mobiusml/hqq/`."
+            )
+
+        if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
+            raise ValueError(
+                "Converting weights from tf/flax is currently not supported, please make"
+                " sure the weights are in PyTorch format."
+            )
+
+        if not torch.cuda.is_available():
+            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
+
+        if self.torch_dtype is None:
+            if "torch_dtype" in kwargs:
+                self.torch_dtype = kwargs["torch_dtype"]
+            else:
+                self.torch_dtype = torch.float32
+                logger.info("Setting torch_dtype to torch.float32 as the default value since it was not specified.")
+
+        device_map = kwargs.get("device_map", None)
+        if isinstance(device_map, dict):
+            if "cpu" in device_map.values() or "disk" in device_map.values():
+                raise ValueError(
+                    "You are attempting to use an HQQ model with a device_map that contains a CPU or disk device."
+                    " This is not supported. Please remove the CPU or disk device from the device_map."
+                )
+            else:
+                self.using_multi_gpu = len(set(device_map.values())) > 1
+
+    def update_missing_keys(
+        self, model: "PreTrainedModel", missing_keys: List[str], prefix: str, **kwargs
+    ) -> List[str]:
+        if self.pre_quantized:
+            return [key for key in missing_keys if ("weight" not in key)]
+        else:
+            return missing_keys
+
+    # Adds missing keys for HQQLinear modules that are loaded, but the model was initialized with torch.nn.Linear
+    def update_expected_keys(
+        self, model: "PreTrainedModel", expected_keys: List[str], loaded_keys: List[str]
+    ) -> List[str]:
+        if not self.pre_quantized:
+            return expected_keys
+
+        # Collects all quantizable (linear) layers
+        def _find_hqq_quantizable_layers(model, layers):
+            for name, module in model.named_children():
+                if isinstance(module, (torch.nn.Linear)):
+                    layers.add(module.name)
+                _find_hqq_quantizable_layers(module, layers)
+
+        new_keys = set(expected_keys)
+        if is_hqq_available():
+            from hqq.core.quantize import HQQLinear
+
+            # Name modules
+            for name, module in model.named_modules():
+                module.name = name
+
+            # Valid modules are Linear layers that have an HQQLinear state_dict. We ignore skip_modules and any layers with plain Linear state_dict() params
+            _valid_modules = set()
+            _find_hqq_quantizable_layers(model, _valid_modules)
+            _valid_modules -= set(model.config.quantization_config["skip_modules"])
+
+            # Append new expected layers based on _ref_keys
+            _ref_keys = HQQLinear(
+                linear_layer=None, quant_config=None, compute_dtype=torch.float16, device="cpu"
+            ).state_dict_keys() - {"bias"}
+
+            # Clean-up
+            _rm_keys = set()
+            for key in new_keys:
+                if any(_module in key for _module in _valid_modules):
+                    _rm_keys.add(key)
+            new_keys -= _rm_keys
+            # At this point, new_keys contains all the keys of the layers that are NOT HQQLinear or torch.nn.Linear
+
+            # Re-populate Linear/HQQLinear
+            for _module in _valid_modules:
+                if _module + ".weight" in loaded_keys:
+                    new_keys.add(_module + ".weight")
+                else:
+                    new_keys.update({_module + "." + _ref_key for _ref_key in _ref_keys})
+                if _module + ".bias" in loaded_keys:
+                    new_keys.add(_module + ".bias")
+
+        return list(new_keys)
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        if is_hqq_available():
+            from hqq.core.quantize import HQQLinear
+        module, tensor_name = get_module_from_name(model, param_name)
+
+        if self.pre_quantized:
+            return (
+                (isinstance(module, torch.nn.Linear) or isinstance(module, HQQLinear))
+                and tensor_name != "weight"
+                and tensor_name != "bias"
+            )
+        else:
+            return isinstance(module, torch.nn.Linear) and tensor_name == "weight"
+
+    def create_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        target_device: "torch.device",
+        state_dict: Dict[str, Any],
+        unexpected_keys: List[str],
+    ):
+        """
+        Each nn.Linear layer is processed here.
+        We first check if the corresponding module state_dict already contains HQQ-quantized parameters.
+        If not, we create a temp linear layer with the module state_dict params and use it for quantization
+        """
+
+        if is_hqq_available():
+            from hqq.core.quantize import HQQLinear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+        layer_name = ".".join(param_name.split(".")[:-1])
+        parent_module = find_parent(model, layer_name)
+        node = layer_name.split(".")[-1]
+
+        # set module state_dict
+        module_state_dict = {}
+        for k, v in state_dict.items():
+            if layer_name + "." in k:
+                module_state_dict[k.split(".")[-1]] = v
+                if unexpected_keys is not None and k in unexpected_keys:
+                    unexpected_keys.remove(k)
+
+        if self.pre_quantized:
+            if isinstance(module, HQQLinear):
+                return
+            else:
+                hqq_layer = HQQLinear(
+                    linear_layer=None,
+                    quant_config=None,
+                    compute_dtype=self.torch_dtype,
+                    device=target_device,
+                )
+
+                hqq_layer.load_state_dict(module_state_dict)
+
+                if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
+                    hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)
+
+                if self.using_multi_gpu:
+                    hqq_layer = self._patch_layer_for_multigpu(hqq_layer)
+
+                setattr(parent_module, node, hqq_layer)
+
+                # cleanup
+                del module.__dict__, module
+                torch.cuda.empty_cache()
+                return
+
+        # Step 1: populate module with weight/bias from module state dict
+        for key in module_state_dict:
+            setattr(module, key, torch.nn.Parameter(module_state_dict[key]))
+
+        # Step 2: Replace module with either HQQLinear or move it to device. We do this via setattr on the parent,
+        # as doing it on the module directly doesn't work.
+        if hasattr(module, "quant_config"):
+            hqq_layer = HQQLinear(
+                module,
+                module.quant_config,
+                compute_dtype=self.torch_dtype,
+                device=target_device,
+                del_orig=True,
+            )
+
+            if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
+                hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)
+
+            if self.using_multi_gpu:
+                hqq_layer = self._patch_layer_for_multigpu(hqq_layer)
+
+            setattr(parent_module, node, hqq_layer)
+
+        else:
+            module = module.to(dtype=self.torch_dtype, device=target_device)
+            setattr(parent_module, node, module)
+
+        torch.cuda.empty_cache()
+
+    # Removes the accelerate hook and uses a simpler forward pass. Otherwise, this breaks with multi-gpu
+    def _patch_layer_for_multigpu(self, hqq_layer):
+        hqq_layer = remove_hook_from_module(hqq_layer)
+
+        def forward_with_device(self, x):
+            out = torch.matmul(x.to(self.device), self.dequantize().t())
+            if self.bias is not None:
+                out += self.bias
+            return out
+
+        hqq_layer.forward = lambda x: forward_with_device(hqq_layer, x)
+        return hqq_layer
+
+    def _process_model_before_weight_loading(
+        self,
+        model: "PreTrainedModel",
+        device_map,
+        keep_in_fp32_modules: List[str] = None,
+        **kwargs,
+    ):
+        keep_in_fp32_modules = keep_in_fp32_modules if keep_in_fp32_modules is not None else []
+
+        # Add the corresponding quant_config to each valid module. This allows us to do the actual nn.Linear -> HQQLinear conversion in create_quantized_param().
+        # prepare_for_hqq_linear() also sets the right quantization config inside the model (model.config.quantization_config) and the layers (hqq_layer.quant_config)
+        model = prepare_for_hqq_linear(model, quantization_config=self.quantization_config)
+
+    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        model.is_hqq_quantized = True
+        model.is_hqq_serializable = self.is_serializable()
+        return model
+
+    def is_serializable(self, safe_serialization=None):
+        return True
+
+    @property
+    def is_trainable(self) -> bool:
+        return True
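The HQQ quantizer above quantizes lazily, one nn.Linear at a time, as weights stream in through create_quantized_param(). A minimal sketch of on-the-fly HQQ quantization, assuming `HqqConfig` is exported by this transformers build (the checkpoint id and the nbits/group_size values are illustrative):

    # Sketch only: HqqConfig is assumed available; the checkpoint id is illustrative.
    import torch
    from transformers import AutoModelForCausalLM, HqqConfig

    quant_config = HqqConfig(nbits=4, group_size=64)
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",            # illustrative checkpoint
        torch_dtype=torch.float16,      # becomes self.torch_dtype in the quantizer
        device_map="cuda",              # a GPU is required (see validate_environment)
        quantization_config=quant_config,
    )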
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_quanto.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_quanto.py
new file mode 100644
index 0000000000000000000000000000000000000000..230e8efe15067290be354e3da5a10f3f0d79248c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_quanto.py
@@ -0,0 +1,208 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from packaging import version
+
+from .base import HfQuantizer
+from .quantizers_utils import get_module_from_name
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from ..utils import (
+    is_accelerate_available,
+    is_optimum_quanto_available,
+    is_torch_available,
+    logging,
+)
+from ..utils.quantization_config import QuantoConfig
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+class QuantoHfQuantizer(HfQuantizer):
+    """
+    Quantizer for the quanto library
+    """
+
+    required_packages = ["quanto", "accelerate"]
+    requires_parameters_quantization = True
+    requires_calibration = False
+
+    def __init__(self, quantization_config: QuantoConfig, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+        self.post_init()
+
+    def post_init(self):
+        r"""
+        Safety checker
+        """
+        if self.quantization_config.activations is not None and not self.pre_quantized:
+            raise ValueError(
+                "We don't support quantizing the activations with the transformers library."
+                " Use the quanto library for more complex use cases such as activation quantization, calibration and quantization-aware training."
+            )
+
+    def validate_environment(self, *args, **kwargs):
+        if not is_optimum_quanto_available():
+            raise ImportError(
+                "Loading an optimum-quanto quantized model requires the optimum-quanto library (`pip install optimum-quanto`)"
+            )
+        if not is_accelerate_available():
+            raise ImportError(
+                "Loading an optimum-quanto quantized model requires the accelerate library (`pip install accelerate`)"
+            )
+
+    def update_device_map(self, device_map):
+        if device_map is None:
+            device_map = {"": "cpu"}
+            logger.info(
+                "The device_map was not initialized. "
+                "Setting device_map to {'':'cpu'}. "
+                "If you want to use the model for inference, please set device_map='auto'"
+            )
+        return device_map
+
+    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
+        if torch_dtype is None:
+            logger.info("You did not specify `torch_dtype` in `from_pretrained`. Setting it to `torch.float32`.")
+            torch_dtype = torch.float32
+        return torch_dtype
+
+    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
+        if is_optimum_quanto_available():
+            from optimum.quanto import QModuleMixin
+
+        not_missing_keys = []
+        for name, module in model.named_modules():
+            if isinstance(module, QModuleMixin):
+                for missing in missing_keys:
+                    if (
+                        (name in missing or name in f"{prefix}.{missing}")
+                        and not missing.endswith(".weight")
+                        and not missing.endswith(".bias")
+                    ):
+                        not_missing_keys.append(missing)
+        return [k for k in missing_keys if k not in not_missing_keys]
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        """
+        Check if a parameter needs to be quantized.
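+        Only the (unfrozen) weights of QModuleMixin modules qualify; biases, and weights that
+        are about to be offloaded to the CPU, are left as-is.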
+ """ + if is_optimum_quanto_available(): + from optimum.quanto import QModuleMixin + + device_map = kwargs.get("device_map", None) + param_device = kwargs.get("param_device", None) + # we don't quantize the model if the module is going to be offloaded to the cpu + if device_map is not None and param_device is not None: + device_map_values = set(device_map.values()) + if param_device == "cpu" and len(device_map_values) > 1: + if not (device_map_values == {"cpu"} or device_map_values == {"cpu", "disk"}): + return False + + module, tensor_name = get_module_from_name(model, param_name) + # We only quantize the weights and the bias is not quantized. + if isinstance(module, QModuleMixin) and "weight" in tensor_name: + # if the weights are quantized, don't need to recreate it again with `create_quantized_param` + return not module.frozen + else: + return False + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + *args, + **kwargs, + ): + """ + Create the quantized parameter by calling .freeze() after setting it to the module. + """ + from accelerate.utils import set_module_tensor_to_device + + set_module_tensor_to_device(model, param_name, target_device, param_value) + module, _ = get_module_from_name(model, param_name) + module.freeze() + module.weight.requires_grad = False + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.27.0"): + from accelerate.utils import CustomDtype + + mapping = { + "int8": torch.int8, + "float8": CustomDtype.FP8, + "int4": CustomDtype.INT4, + "int2": CustomDtype.INT2, + } + target_dtype = mapping[self.quantization_config.weights] + return target_dtype + else: + raise ValueError( + "You are using `device_map='auto'` on an optimum-quanto quantized model. To automatically compute" + " the appropriate device map, you should upgrade your `accelerate` library," + "`pip install --upgrade accelerate` or install it from source." 
+ ) + + def _process_model_before_weight_loading( + self, model: "PreTrainedModel", keep_in_fp32_modules: List[str] = [], **kwargs + ): + from ..integrations import get_keys_to_not_convert, replace_with_quanto_layers + + # We keep some modules such as the lm_head in their original dtype for numerical stability reasons + if self.quantization_config.modules_to_not_convert is None: + self.modules_to_not_convert = get_keys_to_not_convert(model) + else: + self.modules_to_not_convert = self.quantization_config.modules_to_not_convert + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + model, _ = replace_with_quanto_layers( + model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model, **kwargs): + return model + + @property + def is_trainable(self, model: Optional["PreTrainedModel"] = None): + return True + + def is_serializable(self, safe_serialization=None): + return False diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py new file mode 100644 index 0000000000000000000000000000000000000000..bcc9c57dfa006d0444db68b28962d079e48c1ec5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py @@ -0,0 +1,231 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
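For context, the quanto path above freezes weights one parameter at a time via create_quantized_param(). A minimal weight-only sketch using `QuantoConfig` (the checkpoint id is illustrative):

    # Sketch only: the checkpoint id is illustrative.
    from transformers import AutoModelForCausalLM, QuantoConfig

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",                               # illustrative checkpoint
        quantization_config=QuantoConfig(weights="int8"),  # activations must stay None here
        device_map="auto",                                 # requires accelerate (validated above)
    )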
diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcc9c57dfa006d0444db68b28962d079e48c1ec5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_torchao.py
@@ -0,0 +1,231 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import types
+from typing import TYPE_CHECKING, Union
+
+from packaging import version
+
+from .base import HfQuantizer
+from .quantizers_utils import get_module_from_name
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+from typing import Any, Dict, List
+
+from ..utils import is_torch_available, is_torchao_available, logging
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+
+logger = logging.get_logger(__name__)
+
+
+# Finds the parent of a node module named "name"
+def find_parent(model, name):
+    module_tree = name.split(".")[:-1]
+    parent = model
+    for m in module_tree:
+        parent = parent._modules[m]
+    return parent
+
+
+def _quantization_type(weight):
+    from torchao.dtypes import AffineQuantizedTensor
+    from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor
+
+    if isinstance(weight, AffineQuantizedTensor):
+        return f"{weight.__class__.__name__}({weight._quantization_type()})"
+
+    if isinstance(weight, LinearActivationQuantizedTensor):
+        return f"{weight.__class__.__name__}(activation={weight.input_quant_func}, weight={_quantization_type(weight.original_weight_tensor)})"
+
+
+def _linear_extra_repr(self):
+    weight = _quantization_type(self.weight)
+    if weight is None:
+        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None"
+    else:
+        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}"
+
+
+class TorchAoHfQuantizer(HfQuantizer):
+    """
+    Quantizer for torchao: https://github.com/pytorch/ao/
+    """
+
+    requires_parameters_quantization = True
+    requires_calibration = False
+    required_packages = ["torchao"]
+
+    def __init__(self, quantization_config, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+
+    def validate_environment(self, *args, **kwargs):
+        if not is_torchao_available():
+            raise ImportError("Loading a torchao quantized model requires the torchao library (`pip install torchao`)")
+
+        self.offload = False
+        device_map = kwargs.get("device_map", None)
+        if isinstance(device_map, dict):
+            if "cpu" in device_map.values() or "disk" in device_map.values():
+                if self.pre_quantized:
+                    raise ValueError(
+                        "You are attempting to perform cpu/disk offload with a pre-quantized torchao model."
+                        " This is not supported yet. Please remove the CPU or disk device from the device_map."
+                    )
+                else:
+                    self.offload = True
+        if self.pre_quantized:
+            weights_only = kwargs.get("weights_only", None)
+            if weights_only:
+                torch_version = version.parse(importlib.metadata.version("torch"))
+                if torch_version < version.parse("2.5.0"):
+                    raise RuntimeError(
+                        f"In order to use a torchao pre-quantized model, you need torch>=2.5.0. However, the current version is {torch_version}."
+                        f" You can also load with `weights_only=False` in `from_pretrained` if you don't want to update torch"
+                    )
+
+    def update_torch_dtype(self, torch_dtype):
+        if self.quantization_config.quant_type == "int4_weight_only":
+            if torch_dtype is not None and torch_dtype != torch.bfloat16:
+                logger.warning_once(
+                    f"You are setting torch_dtype to {torch_dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. Please set torch_dtype to bfloat16."
+                )
+            if torch_dtype is None:
+                logger.warning_once(
+                    "Setting torch_dtype to torch.bfloat16 for int4_weight_only quantization since only bfloat16 is supported right now. Please set torch_dtype=torch.bfloat16 to remove this warning."
+                )
+                torch_dtype = torch.bfloat16
+        if self.quantization_config.quant_type == "int8_dynamic_activation_int8_weight":
+            if torch_dtype is None:
+                logger.info(
+                    "Setting torch_dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no torch_dtype was specified in from_pretrained"
+                )
+                # we need to set the torch_dtype, otherwise we have dtype mismatch when performing the quantized linear op
+                torch_dtype = torch.float32
+        return torch_dtype
+
+    def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
+        if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"):
+            from accelerate.utils import CustomDtype
+
+            map_to_target_dtype = {
+                "int4_weight_only": CustomDtype.INT4,
+                "int8_weight_only": torch.int8,
+                "int8_dynamic_activation_int8_weight": torch.int8,
+            }
+            return map_to_target_dtype[self.quantization_config.quant_type]
+        else:
+            raise ValueError(
+                "You are using `device_map='auto'` on a torchao quantized model. To automatically compute"
+                " the appropriate device map, you should upgrade your `accelerate` library with "
+                "`pip install --upgrade accelerate`"
+            )
+
+    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
+        # need more space for the quantization parameters (e.g. scale). Tested with int4 wo and group size = 128
+        max_memory = {key: val * 0.9 for key, val in max_memory.items()}
+        return max_memory
+
+    def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        from ..integrations import get_keys_to_not_convert
+
+        self.modules_to_not_convert = get_keys_to_not_convert(model)
+
+        if self.quantization_config.modules_to_not_convert is not None:
+            self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)
+
+        return
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        param_device = kwargs.pop("param_device", None)
+        # check if the param_name is not in self.modules_to_not_convert
+        if any((key + "." in param_name) or (key == param_name) for key in self.modules_to_not_convert):
+            return False
+        elif param_device == "cpu" and self.offload:
+            # We don't quantize weights that we offload
+            return False
+        else:
+            # we only quantize the weight of nn.Linear
+            module, tensor_name = get_module_from_name(model, param_name)
+            return isinstance(module, torch.nn.Linear) and (tensor_name == "weight")
+
+    def create_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        target_device: "torch.device",
+        state_dict: Dict[str, Any],
+        unexpected_keys: List[str],
+    ):
+        """
+        Each nn.Linear layer that needs to be quantized is processed here.
+        First, we set the value of the weight tensor, then we move it to the target device. Finally, we quantize the module.
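+        For pre-quantized checkpoints the serialized tensor subclass is moved to the target device
+        as-is; otherwise torchao's quantize_() is applied to the module in place.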
+ """ + from torchao.quantization import quantize_ + + module, tensor_name = get_module_from_name(model, param_name) + + if self.pre_quantized: + module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device)) + if isinstance(module, nn.Linear): + module.extra_repr = types.MethodType(_linear_extra_repr, module) + else: + module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) + quantize_(module, self.quantization_config.get_apply_tensor_subclass()) + + def _process_model_after_weight_loading(self, model, **kwargs): + """No process required for torchao quantized model""" + return + + def is_serializable(self, safe_serialization=None): + if safe_serialization: + logger.warning( + "torchao quantized model does not support safe serialization, " + "please set `safe_serialization` to False" + ) + return False + _is_torchao_serializable = version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse( + "0.25.0" + ) + if not _is_torchao_serializable: + logger.warning("torchao quantized model is only serializable after huggingface_hub >= 0.25.0 ") + if self.offload and self.quantization_config.modules_to_not_convert is None: + logger.warning( + "The model contains offloaded modules and these modules are not quantized. We don't recommend saving the model as we won't be able to reload them." + "If you want to specify modules to not quantize, please specify modules_to_not_convert in the quantization_config." + ) + return False + return _is_torchao_serializable + + @property + def is_trainable(self): + supported_quant_types_for_training = [ + "int8_weight_only", + "int8_dynamic_activation_int8_weight", + ] + return self.quantization_config.quant_type in supported_quant_types_for_training diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_vptq.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_vptq.py new file mode 100644 index 0000000000000000000000000000000000000000..1672c3ebc5a7d35d7c788671340eaca9762c356e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_vptq.py @@ -0,0 +1,98 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING, Optional + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_torch_available, is_vptq_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class VptqHfQuantizer(HfQuantizer): + """ + Quantizer of the VPTQ method. Enables the loading of prequantized models. 
+ """ + + requires_calibration = True + required_packages = ["vptq"] + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Using `vptq` quantization requires Accelerate: `pip install accelerate`") + + if not is_vptq_available(): + raise ImportError("Using `vptq` quantization requires VPTQ>=0.0.4: `pip install -U vptq`") + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + if torch.cuda.is_available(): + torch_dtype = torch.float16 + logger.info( + "CUDA available. Assuming VPTQ inference on GPU and loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually." + ) + else: + import vptq + + device_availability = getattr(vptq, "device_availability", lambda device: False) + if device_availability("cpu") is True: + raise RuntimeError("No GPU found. Please wait for the next release of VPTQ to use CPU inference") + torch_dtype = torch.float32 + logger.info("No GPU found. Assuming VPTQ inference on CPU and loading the model in `torch.float32`.") + return torch_dtype + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + **kwargs, + ): + """ + we don't have param like modules_to_not_convert to indicate which layers should not be quantized + because `quantization_config` include the layers that should be quantized + """ + from ..integrations import replace_with_vptq_linear + + modules_to_not_convert = kwargs.get("modules_to_not_convert", []) + ( + self.quantization_config.modules_to_not_convert or [] + ) + + replace_with_vptq_linear( + model, + quantization_config=self.quantization_config, + modules_to_not_convert=modules_to_not_convert, + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + @property + def is_trainable(self, model: Optional["PreTrainedModel"] = None): + return False + + def is_serializable(self, safe_serialization=None): + return True diff --git a/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizers_utils.py b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizers_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae287bf251b51337b8588b2e0176178316e7e96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/quantizers/quantizers_utils.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Tuple + + +def get_module_from_name(module, tensor_name: str) -> Tuple[Any, str]: + if "." 
in tensor_name: + splits = tensor_name.split(".") + for split in splits[:-1]: + new_module = getattr(module, split) + if new_module is None: + raise ValueError(f"{module} has no attribute {split}.") + module = new_module + tensor_name = splits[-1] + return module, tensor_name
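As an illustration, get_module_from_name() walks plain attribute access down to the module that owns the tensor and returns the (owning module, local tensor name) pair that the quantizers above rely on. A quick sketch on a toy model (the layer layout is made up):

    # Sketch only: toy model to show the (owning module, local tensor name) contract.
    import torch

    model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.Linear(8, 2))
    module, tensor_name = get_module_from_name(model, "1.weight")
    assert module is model[1] and tensor_name == "weight"  # "1" resolved, "weight" left local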