|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gc |
|
|
import tempfile |
|
|
import unittest |
|
|
|
|
|
import pytest |
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
from accelerate.utils.memory import clear_device_cache |
|
|
from parameterized import parameterized |
|
|
from torch import nn |
|
|
from transformers import ( |
|
|
AutoImageProcessor, |
|
|
AutoModelForCausalLM, |
|
|
AutoModelForImageClassification, |
|
|
AutoModelForSeq2SeqLM, |
|
|
AutoModelForSequenceClassification, |
|
|
AutoModelForTokenClassification, |
|
|
AutoTokenizer, |
|
|
BitsAndBytesConfig, |
|
|
LlamaForCausalLM, |
|
|
WhisperForConditionalGeneration, |
|
|
) |
|
|
from transformers.pytorch_utils import Conv1D |
|
|
|
|
|
from peft import ( |
|
|
AdaLoraConfig, |
|
|
AdaptionPromptConfig, |
|
|
BOFTConfig, |
|
|
HRAConfig, |
|
|
IA3Config, |
|
|
LNTuningConfig, |
|
|
LoHaConfig, |
|
|
LoKrConfig, |
|
|
LoraConfig, |
|
|
OFTConfig, |
|
|
PeftModel, |
|
|
RandLoraConfig, |
|
|
RoadConfig, |
|
|
TaskType, |
|
|
VBLoRAConfig, |
|
|
VeraConfig, |
|
|
get_peft_model, |
|
|
prepare_model_for_kbit_training, |
|
|
) |
|
|
from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available |
|
|
from peft.tuners.lora.config import LoraRuntimeConfig |
|
|
from peft.utils import infer_device |
|
|
|
|
|
from .testing_utils import ( |
|
|
device_count, |
|
|
load_cat_image, |
|
|
require_bitsandbytes, |
|
|
require_deterministic_for_xpu, |
|
|
require_non_cpu, |
|
|
require_torch_multi_accelerator, |
|
|
) |
|
|
|
|
|
|
|
|
if is_bnb_available(): |
|
|
import bitsandbytes as bnb |
|
|
|
|
|
from peft.tuners.ia3 import Linear8bitLt as IA3Linear8bitLt |
|
|
from peft.tuners.lora import Linear8bitLt as LoraLinear8bitLt |
|
|
from peft.tuners.randlora import Linear8bitLt as RandLoraLinear8bitLt |
|
|
from peft.tuners.road import Linear8bitLt as RoadLinear8bitLt |
|
|
from peft.tuners.vera import Linear8bitLt as VeraLinear8bitLt |
|
|
|
|
|
if is_bnb_4bit_available(): |
|
|
from peft.tuners.ia3 import Linear4bit as IA3Linear4bit |
|
|
from peft.tuners.lora import Linear4bit as LoraLinear4bit |
|
|
from peft.tuners.randlora import Linear4bit as RandLoraLinear4bit |
|
|
from peft.tuners.road import Linear4bit as RoadLinear4bit |
|
|
from peft.tuners.vera import Linear4bit as VeraLinear4bit |
|
|
|
|
|
|
|
|
@require_non_cpu |
|
|
class PeftGPUCommonTests(unittest.TestCase): |
|
|
r""" |
|
|
A common tester to run common operations that are performed on GPU such as generation, loading in 8bit, etc. |
|
|
""" |
|
|
|
|
|
def setUp(self): |
|
|
self.seq2seq_model_id = "google/flan-t5-base" |
|
|
self.causal_lm_model_id = "facebook/opt-350m" |
|
|
self.audio_model_id = "openai/whisper-large" |
|
|
self.device = infer_device() |
|
|
|
|
|
def tearDown(self): |
|
|
r""" |
|
|
Efficient mechanism to free GPU memory after each test. Based on |
|
|
https://github.com/huggingface/transformers/issues/21094 |
|
|
""" |
|
|
clear_device_cache(garbage_collection=True) |
|
|
gc.collect() |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_lora_bnb_8bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 8bit quantization using LoRA works as expected |
|
|
""" |
|
|
whisper_8bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
opt_8bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_8bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_lora_config = LoraConfig( |
|
|
r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" |
|
|
) |
|
|
|
|
|
opt_lora_config = LoraConfig( |
|
|
r=16, |
|
|
lora_alpha=32, |
|
|
target_modules=["q_proj", "v_proj"], |
|
|
lora_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none") |
|
|
|
|
|
flan_8bit = get_peft_model(flan_8bit, flan_lora_config) |
|
|
assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear8bitLt) |
|
|
|
|
|
opt_8bit = get_peft_model(opt_8bit, opt_lora_config) |
|
|
assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt) |
|
|
|
|
|
whisper_8bit = get_peft_model(whisper_8bit, config) |
|
|
assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_vera_bnb_8bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 8bit quantization using VeRA works as expected |
|
|
""" |
|
|
whisper_8bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
opt_8bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_8bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_vera_config = VeraConfig( |
|
|
r=16, target_modules=["q", "v"], vera_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" |
|
|
) |
|
|
|
|
|
opt_vera_config = VeraConfig( |
|
|
r=16, |
|
|
target_modules=["q_proj", "v_proj"], |
|
|
vera_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = VeraConfig(r=32, target_modules=["q_proj", "v_proj"], vera_dropout=0.05, bias="none") |
|
|
|
|
|
flan_8bit = get_peft_model(flan_8bit, flan_vera_config) |
|
|
assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, VeraLinear8bitLt) |
|
|
|
|
|
opt_8bit = get_peft_model(opt_8bit, opt_vera_config) |
|
|
assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear8bitLt) |
|
|
|
|
|
whisper_8bit = get_peft_model(whisper_8bit, config) |
|
|
assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear8bitLt) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_randlora_bnb_8bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 8bit quantization using RandLora works as expected |
|
|
""" |
|
|
whisper_8bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
opt_8bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_8bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_randlora_config = RandLoraConfig( |
|
|
r=16, target_modules=["q", "v"], randlora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" |
|
|
) |
|
|
|
|
|
opt_randlora_config = RandLoraConfig( |
|
|
r=10, |
|
|
target_modules=["q_proj", "v_proj"], |
|
|
randlora_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = RandLoraConfig(r=5, target_modules=["q_proj", "v_proj"], randlora_dropout=0.05, bias="none") |
|
|
|
|
|
flan_8bit = get_peft_model(flan_8bit, flan_randlora_config) |
|
|
assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RandLoraLinear8bitLt) |
|
|
|
|
|
opt_8bit = get_peft_model(opt_8bit, opt_randlora_config) |
|
|
assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear8bitLt) |
|
|
|
|
|
whisper_8bit = get_peft_model(whisper_8bit, config) |
|
|
assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear8bitLt) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_ia3_bnb_8bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 8bit quantization using IA3 works as expected |
|
|
""" |
|
|
whisper_8bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
opt_8bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_8bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") |
|
|
|
|
|
opt_ia3_config = IA3Config( |
|
|
target_modules=["q_proj", "v_proj", "fc2"], |
|
|
feedforward_modules=["fc2"], |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"]) |
|
|
|
|
|
flan_8bit = get_peft_model(flan_8bit, flan_ia3_config) |
|
|
assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, IA3Linear8bitLt) |
|
|
|
|
|
opt_8bit = get_peft_model(opt_8bit, opt_ia3_config) |
|
|
assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear8bitLt) |
|
|
|
|
|
whisper_8bit = get_peft_model(whisper_8bit, config) |
|
|
assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear8bitLt) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_road_bnb_8bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 8bit quantization using Road works as expected |
|
|
""" |
|
|
whisper_8bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
opt_8bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_8bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
|
|
|
flan_road_config = RoadConfig(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") |
|
|
|
|
|
opt_road_config = RoadConfig( |
|
|
target_modules=["q_proj", "v_proj", "fc2"], |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = RoadConfig(target_modules=["q_proj", "v_proj", "fc2"]) |
|
|
|
|
|
flan_8bit = get_peft_model(flan_8bit, flan_road_config) |
|
|
assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RoadLinear8bitLt) |
|
|
|
|
|
opt_8bit = get_peft_model(opt_8bit, opt_road_config) |
|
|
assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear8bitLt) |
|
|
|
|
|
whisper_8bit = get_peft_model(whisper_8bit, config) |
|
|
assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear8bitLt) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
@parameterized.expand(["4bit", "8bit"]) |
|
|
def test_lora_bnb_quantization_from_pretrained_safetensors(self, quantization): |
|
|
r""" |
|
|
Tests that the bnb quantization using LoRA works as expected with safetensors weights. |
|
|
""" |
|
|
model_id = "facebook/opt-350m" |
|
|
peft_model_id = "ybelkada/test-st-lora" |
|
|
kwargs = {"device_map": "auto"} |
|
|
if quantization == "4bit": |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True) |
|
|
else: |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
model = PeftModel.from_pretrained(model, peft_model_id) |
|
|
|
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
model.load_adapter(peft_model_id, "adapter2") |
|
|
model.set_adapter("adapter2") |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A |
|
|
assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
@parameterized.expand(["4bit", "8bit"]) |
|
|
def test_adalora_bnb_quantization_from_pretrained_safetensors(self, quantization): |
|
|
r""" |
|
|
Tests that the bnb quantization using AdaLora works as expected with safetensors weights. |
|
|
""" |
|
|
model_id = "facebook/opt-350m" |
|
|
kwargs = {"device_map": "auto"} |
|
|
if quantization == "4bit": |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True) |
|
|
else: |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
config = AdaLoraConfig(task_type=TaskType.CAUSAL_LM, total_step=1) |
|
|
peft_model = get_peft_model(model, config) |
|
|
peft_model = prepare_model_for_kbit_training(peft_model) |
|
|
peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir: |
|
|
peft_model.save_pretrained(tmp_dir) |
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
model = PeftModel.from_pretrained(model, tmp_dir) |
|
|
model = prepare_model_for_kbit_training(peft_model) |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
model.load_adapter(tmp_dir, "adapter2") |
|
|
model.set_adapter("adapter2") |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A |
|
|
assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
@parameterized.expand(["4bit", "8bit"]) |
|
|
def test_vera_bnb_quantization_from_pretrained_safetensors(self, quantization): |
|
|
r""" |
|
|
Tests that the bnb quantization using VeRA works as expected with safetensors weights. |
|
|
""" |
|
|
model_id = "facebook/opt-350m" |
|
|
kwargs = {"device_map": "auto"} |
|
|
if quantization == "4bit": |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True) |
|
|
else: |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
config = VeraConfig(task_type=TaskType.CAUSAL_LM) |
|
|
peft_model = get_peft_model(model, config) |
|
|
peft_model = prepare_model_for_kbit_training(peft_model) |
|
|
peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir: |
|
|
peft_model.save_pretrained(tmp_dir) |
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
model = PeftModel.from_pretrained(model, tmp_dir) |
|
|
model = prepare_model_for_kbit_training(model) |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
model.load_adapter(tmp_dir, "adapter2") |
|
|
model.set_adapter("adapter2") |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.vera_A |
|
|
assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.vera_A |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
@parameterized.expand(["4bit", "8bit"]) |
|
|
def test_randlora_bnb_quantization_from_pretrained_safetensors(self, quantization): |
|
|
r""" |
|
|
Tests that the bnb quantization using RandLora works as expected with safetensors weights. |
|
|
""" |
|
|
model_id = "facebook/opt-350m" |
|
|
kwargs = {"device_map": "auto"} |
|
|
if quantization == "4bit": |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True) |
|
|
else: |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
config = RandLoraConfig(task_type=TaskType.CAUSAL_LM) |
|
|
peft_model = get_peft_model(model, config) |
|
|
peft_model = prepare_model_for_kbit_training(peft_model) |
|
|
peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir: |
|
|
peft_model.save_pretrained(tmp_dir) |
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
model = PeftModel.from_pretrained(model, tmp_dir) |
|
|
model = prepare_model_for_kbit_training(model) |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
model.load_adapter(tmp_dir, "adapter2") |
|
|
model.set_adapter("adapter2") |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.randlora_A |
|
|
assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.randlora_A |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
@parameterized.expand(["4bit", "8bit"]) |
|
|
def test_ia3_bnb_quantization_from_pretrained_safetensors(self, quantization): |
|
|
r""" |
|
|
Tests that the bnb quantization using IA³ works as expected with safetensors weights. |
|
|
""" |
|
|
model_id = "facebook/opt-350m" |
|
|
kwargs = {"device_map": "auto"} |
|
|
if quantization == "4bit": |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True) |
|
|
else: |
|
|
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
config = IA3Config(task_type=TaskType.CAUSAL_LM) |
|
|
peft_model = get_peft_model(model, config) |
|
|
peft_model = prepare_model_for_kbit_training(peft_model) |
|
|
peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir: |
|
|
peft_model.save_pretrained(tmp_dir) |
|
|
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) |
|
|
model = PeftModel.from_pretrained(model, tmp_dir) |
|
|
model = prepare_model_for_kbit_training(model) |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
model.load_adapter(tmp_dir, "adapter2") |
|
|
model.set_adapter("adapter2") |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0)) |
|
|
|
|
|
|
|
|
assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.ia3_l |
|
|
assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.ia3_l |
|
|
|
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_lora_gptq_quantization_from_pretrained_safetensors(self): |
|
|
r""" |
|
|
Tests that the autogptq quantization using LoRA works as expected with safetensors weights. |
|
|
""" |
|
|
from transformers import GPTQConfig |
|
|
|
|
|
model_id = "marcsun13/opt-350m-gptq-4bit" |
|
|
quantization_config = GPTQConfig(bits=4, use_exllama=False) |
|
|
kwargs = { |
|
|
"pretrained_model_name_or_path": model_id, |
|
|
"torch_dtype": torch.float16, |
|
|
"device_map": "auto", |
|
|
"quantization_config": quantization_config, |
|
|
} |
|
|
model = AutoModelForCausalLM.from_pretrained(**kwargs) |
|
|
model = prepare_model_for_kbit_training(model) |
|
|
|
|
|
config = LoraConfig(task_type="CAUSAL_LM") |
|
|
peft_model = get_peft_model(model, config) |
|
|
peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) |
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir: |
|
|
peft_model.save_pretrained(tmp_dir) |
|
|
model = AutoModelForCausalLM.from_pretrained(**kwargs) |
|
|
model = PeftModel.from_pretrained(model, tmp_dir) |
|
|
model = prepare_model_for_kbit_training(model) |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) |
|
|
|
|
|
|
|
|
model.load_adapter(tmp_dir, "adapter2") |
|
|
model.set_adapter("adapter2") |
|
|
model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) |
|
|
|
|
|
|
|
|
assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A |
|
|
assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_lora_bnb_4bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 4bit quantization using LoRA works as expected |
|
|
""" |
|
|
whisper_4bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
opt_4bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_lora_config = LoraConfig( |
|
|
r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" |
|
|
) |
|
|
|
|
|
opt_lora_config = LoraConfig( |
|
|
r=16, |
|
|
lora_alpha=32, |
|
|
target_modules=["q_proj", "v_proj"], |
|
|
lora_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none") |
|
|
|
|
|
flan_4bit = get_peft_model(flan_4bit, flan_lora_config) |
|
|
assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear4bit) |
|
|
|
|
|
opt_4bit = get_peft_model(opt_4bit, opt_lora_config) |
|
|
assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit) |
|
|
|
|
|
whisper_4bit = get_peft_model(whisper_4bit, config) |
|
|
assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_vera_bnb_4bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 4bit quantization using VeRA works as expected |
|
|
""" |
|
|
whisper_4bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
opt_4bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_vera_config = VeraConfig( |
|
|
r=16, target_modules=["q", "v"], vera_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" |
|
|
) |
|
|
|
|
|
opt_vera_config = VeraConfig( |
|
|
r=16, |
|
|
target_modules=["q_proj", "v_proj"], |
|
|
vera_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = VeraConfig(r=32, target_modules=["q_proj", "v_proj"], vera_dropout=0.05, bias="none") |
|
|
|
|
|
flan_4bit = get_peft_model(flan_4bit, flan_vera_config) |
|
|
assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, VeraLinear4bit) |
|
|
|
|
|
opt_4bit = get_peft_model(opt_4bit, opt_vera_config) |
|
|
assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear4bit) |
|
|
|
|
|
whisper_4bit = get_peft_model(whisper_4bit, config) |
|
|
assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear4bit) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_randlora_bnb_4bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 4bit quantization using RandLoRA works as expected |
|
|
""" |
|
|
whisper_4bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
opt_4bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_randlora_config = RandLoraConfig( |
|
|
r=16, target_modules=["q", "v"], randlora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" |
|
|
) |
|
|
|
|
|
opt_randlora_config = RandLoraConfig( |
|
|
r=16, |
|
|
target_modules=["q_proj", "v_proj"], |
|
|
randlora_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = RandLoraConfig(r=32, target_modules=["q_proj", "v_proj"], randlora_dropout=0.05, bias="none") |
|
|
|
|
|
flan_4bit = get_peft_model(flan_4bit, flan_randlora_config) |
|
|
assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RandLoraLinear4bit) |
|
|
|
|
|
opt_4bit = get_peft_model(opt_4bit, opt_randlora_config) |
|
|
assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear4bit) |
|
|
|
|
|
whisper_4bit = get_peft_model(whisper_4bit, config) |
|
|
assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear4bit) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_ia3_bnb_4bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 4bit quantization using IA3 works as expected |
|
|
""" |
|
|
whisper_4bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
opt_4bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") |
|
|
|
|
|
opt_ia3_config = IA3Config( |
|
|
target_modules=["q_proj", "v_proj", "fc2"], |
|
|
feedforward_modules=["fc2"], |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"]) |
|
|
|
|
|
flan_4bit = get_peft_model(flan_4bit, flan_ia3_config) |
|
|
assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, IA3Linear4bit) |
|
|
|
|
|
opt_4bit = get_peft_model(opt_4bit, opt_ia3_config) |
|
|
assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear4bit) |
|
|
|
|
|
whisper_4bit = get_peft_model(whisper_4bit, config) |
|
|
assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear4bit) |
|
|
|
|
|
@require_bitsandbytes |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_road_bnb_4bit_quantization(self): |
|
|
r""" |
|
|
Test that tests if the 4bit quantization using IA3 works as expected |
|
|
""" |
|
|
whisper_4bit = WhisperForConditionalGeneration.from_pretrained( |
|
|
self.audio_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
opt_4bit = AutoModelForCausalLM.from_pretrained( |
|
|
self.causal_lm_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, |
|
|
device_map="auto", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
flan_road_config = RoadConfig(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") |
|
|
|
|
|
opt_road_config = RoadConfig( |
|
|
target_modules=["q_proj", "v_proj", "fc2"], |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
config = RoadConfig(target_modules=["q_proj", "v_proj", "fc2"]) |
|
|
|
|
|
flan_4bit = get_peft_model(flan_4bit, flan_road_config) |
|
|
assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RoadLinear4bit) |
|
|
|
|
|
opt_4bit = get_peft_model(opt_4bit, opt_road_config) |
|
|
assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear4bit) |
|
|
|
|
|
whisper_4bit = get_peft_model(whisper_4bit, config) |
|
|
assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear4bit) |
|
|
|
|
|
@pytest.mark.multi_gpu_tests |
|
|
@require_torch_multi_accelerator |
|
|
def test_lora_causal_lm_multi_gpu_inference(self): |
|
|
r""" |
|
|
Test if LORA can be used for inference on multiple GPUs. |
|
|
""" |
|
|
lora_config = LoraConfig( |
|
|
r=16, |
|
|
lora_alpha=32, |
|
|
target_modules=["q_proj", "v_proj"], |
|
|
lora_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, device_map="balanced") |
|
|
tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) |
|
|
|
|
|
assert set(model.hf_device_map.values()) == set(range(device_count)) |
|
|
|
|
|
model = get_peft_model(model, lora_config) |
|
|
assert isinstance(model, PeftModel) |
|
|
|
|
|
dummy_input = "This is a dummy input:" |
|
|
input_ids = tokenizer(dummy_input, return_tensors="pt").input_ids.to(self.device) |
|
|
|
|
|
|
|
|
_ = model.generate(input_ids=input_ids) |
|
|
|
|
|
@require_torch_multi_accelerator |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_lora_seq2seq_lm_multi_gpu_inference(self): |
|
|
r""" |
|
|
Test if LORA can be used for inference on multiple GPUs - 8bit version. |
|
|
""" |
|
|
lora_config = LoraConfig( |
|
|
r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" |
|
|
) |
|
|
|
|
|
model = AutoModelForSeq2SeqLM.from_pretrained( |
|
|
self.seq2seq_model_id, device_map="balanced", quantization_config=BitsAndBytesConfig(load_in_8bit=True) |
|
|
) |
|
|
tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) |
|
|
|
|
|
assert set(model.hf_device_map.values()) == set(range(device_count)) |
|
|
|
|
|
model = get_peft_model(model, lora_config) |
|
|
assert isinstance(model, PeftModel) |
|
|
assert isinstance(model.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear8bitLt) |
|
|
|
|
|
dummy_input = "This is a dummy input:" |
|
|
input_ids = tokenizer(dummy_input, return_tensors="pt").input_ids.to(self.device) |
|
|
|
|
|
|
|
|
_ = model.generate(input_ids=input_ids) |
|
|
|
|
|
@require_torch_multi_accelerator |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_adaption_prompt_8bit(self): |
|
|
model = LlamaForCausalLM.from_pretrained( |
|
|
"trl-internal-testing/tiny-random-LlamaForCausalLM", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
torch_dtype=torch.float16, |
|
|
device_map="auto", |
|
|
) |
|
|
|
|
|
model = prepare_model_for_kbit_training(model) |
|
|
|
|
|
config = AdaptionPromptConfig( |
|
|
adapter_len=10, |
|
|
adapter_layers=2, |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
_ = model(random_input) |
|
|
|
|
|
@require_torch_multi_accelerator |
|
|
@pytest.mark.multi_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_adaption_prompt_4bit(self): |
|
|
model = LlamaForCausalLM.from_pretrained( |
|
|
"trl-internal-testing/tiny-random-LlamaForCausalLM", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
torch_dtype=torch.float16, |
|
|
device_map="auto", |
|
|
) |
|
|
|
|
|
model = prepare_model_for_kbit_training(model) |
|
|
|
|
|
config = AdaptionPromptConfig( |
|
|
adapter_len=10, |
|
|
adapter_layers=2, |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
_ = model(random_input) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_print_4bit_expected(self): |
|
|
EXPECTED_TRAINABLE_PARAMS = 294912 |
|
|
EXPECTED_ALL_PARAMS = 125534208 |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
trainable_params, all_params = model.get_nb_trainable_parameters() |
|
|
|
|
|
assert trainable_params == EXPECTED_TRAINABLE_PARAMS |
|
|
assert all_params == EXPECTED_ALL_PARAMS |
|
|
|
|
|
|
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=True, |
|
|
) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
trainable_params, all_params = model.get_nb_trainable_parameters() |
|
|
|
|
|
assert trainable_params == EXPECTED_TRAINABLE_PARAMS |
|
|
assert all_params == EXPECTED_ALL_PARAMS |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_modules_to_save_grad(self): |
|
|
model_id = "bigscience/bloomz-560m" |
|
|
|
|
|
model = AutoModelForSequenceClassification.from_pretrained( |
|
|
model_id, |
|
|
quantization_config=BitsAndBytesConfig(load_in_4bit=True), |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
|
|
|
model = prepare_model_for_kbit_training(model) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=16, |
|
|
lora_alpha=16, |
|
|
lora_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="SEQ_CLS", |
|
|
) |
|
|
|
|
|
peft_model = get_peft_model(model, config) |
|
|
|
|
|
lm_head = peft_model.base_model.model.score |
|
|
original_module = lm_head.original_module |
|
|
modules_to_save = lm_head.modules_to_save.default |
|
|
|
|
|
inputs = torch.randn(1024).to(model.device) |
|
|
o1 = lm_head(inputs) |
|
|
o1.mean().backward() |
|
|
|
|
|
assert modules_to_save.weight.requires_grad is True |
|
|
assert original_module.weight.grad is None |
|
|
assert modules_to_save.weight.grad is not None |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_8bit_merge_lora(self): |
|
|
torch.manual_seed(1000) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_before_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_and_unload() |
|
|
with torch.inference_mode(): |
|
|
out_after_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 1e-3 |
|
|
rtol = 1 |
|
|
assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) |
|
|
assert isinstance(model, PeftModel) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear8bitLt) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear8bitLt) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_8bit_merge_and_disable_lora(self): |
|
|
torch.manual_seed(1000) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_before = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_adapter() |
|
|
with model.disable_adapter(): |
|
|
with torch.inference_mode(): |
|
|
out_after = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 1e-3 |
|
|
rtol = 1 |
|
|
assert not torch.allclose(out_base, out_before, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_base, out_after, atol=atol, rtol=rtol) |
|
|
assert isinstance(model, PeftModel) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear8bitLt) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_8bit_merge_lora_with_bias(self): |
|
|
|
|
|
torch.manual_seed(0) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
) |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
lora_bias=True, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_before_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_and_unload() |
|
|
with torch.inference_mode(): |
|
|
out_after_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 1e-3 |
|
|
rtol = 1 |
|
|
assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_4bit_merge_lora(self): |
|
|
torch.manual_seed(3000) |
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=False, |
|
|
bnb_4bit_compute_dtype=torch.float32, |
|
|
) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_before_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_and_unload() |
|
|
with torch.inference_mode(): |
|
|
out_after_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
|
|
|
atol = 0.01 |
|
|
rtol = 10 |
|
|
assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) |
|
|
assert isinstance(model, PeftModel) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear4bit) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear4bit) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_4bit_merge_and_disable_lora(self): |
|
|
torch.manual_seed(3000) |
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=False, |
|
|
bnb_4bit_compute_dtype=torch.float32, |
|
|
) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_before = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_adapter() |
|
|
with model.disable_adapter(): |
|
|
with torch.inference_mode(): |
|
|
out_after = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 0.01 |
|
|
rtol = 10 |
|
|
assert not torch.allclose(out_base, out_before, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_base, out_after, atol=atol, rtol=rtol) |
|
|
assert isinstance(model, PeftModel) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_4bit_merge_lora_with_bias(self): |
|
|
|
|
|
torch.manual_seed(3000) |
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=False, |
|
|
bnb_4bit_compute_dtype=torch.float32, |
|
|
) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
lora_bias=True, |
|
|
) |
|
|
model = get_peft_model(model, config) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_before_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_and_unload() |
|
|
with torch.inference_mode(): |
|
|
out_after_merge = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
|
|
|
atol = 0.01 |
|
|
rtol = 10 |
|
|
assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_4bit_lora_mixed_adapter_batches_lora(self): |
|
|
|
|
|
torch.manual_seed(3000) |
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=False, |
|
|
bnb_4bit_compute_dtype=torch.float32, |
|
|
) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") |
|
|
|
|
|
inputs = tokenizer( |
|
|
[ |
|
|
"Hello, my dog is cute", |
|
|
"Hello, my cat is awesome", |
|
|
"Hello, my fish is great", |
|
|
"Salut, mon chien est mignon", |
|
|
"Salut, mon chat est génial", |
|
|
"Salut, mon poisson est super", |
|
|
"Hallo, mein Hund ist süß", |
|
|
"Hallo, meine Katze ist toll", |
|
|
"Hallo, mein Fisch ist großartig", |
|
|
], |
|
|
return_tensors="pt", |
|
|
padding=True, |
|
|
).to(model.device) |
|
|
with torch.inference_mode(): |
|
|
out_base = model(**inputs).logits |
|
|
|
|
|
config0 = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model = get_peft_model(model, config0).eval() |
|
|
with torch.inference_mode(): |
|
|
out_adapter0 = model(**inputs).logits |
|
|
|
|
|
config1 = LoraConfig( |
|
|
r=16, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model.add_adapter("adapter1", config1) |
|
|
model.set_adapter("adapter1") |
|
|
with torch.inference_mode(): |
|
|
out_adapter1 = model(**inputs).logits |
|
|
|
|
|
atol, rtol = 3e-5, 1e-5 |
|
|
|
|
|
assert len(out_base) >= 3 |
|
|
assert len(out_base) == len(out_adapter0) == len(out_adapter1) |
|
|
assert not torch.allclose(out_base, out_adapter0, atol=atol, rtol=rtol) |
|
|
assert not torch.allclose(out_base, out_adapter1, atol=atol, rtol=rtol) |
|
|
assert not torch.allclose(out_adapter0, out_adapter1, atol=atol, rtol=rtol) |
|
|
|
|
|
|
|
|
adapters = ["__base__", "default", "adapter1"] |
|
|
adapter_names = [adapters[i % 3] for i in (range(9))] |
|
|
with torch.inference_mode(): |
|
|
out_mixed = model(**inputs, adapter_names=adapter_names).logits |
|
|
|
|
|
assert torch.allclose(out_base[::3], out_mixed[::3], atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_adapter0[1::3], out_mixed[1::3], atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_adapter1[2::3], out_mixed[2::3], atol=atol, rtol=rtol) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_8bit_lora_mixed_adapter_batches_lora(self): |
|
|
|
|
|
|
|
|
torch.manual_seed(3000) |
|
|
bnb_config = BitsAndBytesConfig(load_in_8bit=True) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") |
|
|
|
|
|
inputs = tokenizer( |
|
|
[ |
|
|
"Hello, my dog is cute", |
|
|
"Hello, my cat is awesome", |
|
|
"Hello, my fish is great", |
|
|
"Salut, mon chien est mignon", |
|
|
"Salut, mon chat est génial", |
|
|
"Salut, mon poisson est super", |
|
|
"Hallo, mein Hund ist süß", |
|
|
"Hallo, meine Katze ist toll", |
|
|
"Hallo, mein Fisch ist großartig", |
|
|
], |
|
|
return_tensors="pt", |
|
|
padding=True, |
|
|
).to(model.device) |
|
|
with torch.inference_mode(): |
|
|
out_base = F.softmax(model(**inputs).logits, dim=-1) |
|
|
|
|
|
config0 = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model = get_peft_model(model, config0).eval() |
|
|
with torch.inference_mode(): |
|
|
out_adapter0 = F.softmax(model(**inputs).logits, dim=-1) |
|
|
|
|
|
config1 = LoraConfig( |
|
|
r=16, |
|
|
init_lora_weights=False, |
|
|
) |
|
|
model.add_adapter("adapter1", config1) |
|
|
model.set_adapter("adapter1") |
|
|
with torch.inference_mode(): |
|
|
out_adapter1 = F.softmax(model(**inputs).logits, dim=-1) |
|
|
|
|
|
atol = 0.01 |
|
|
rtol = 0.5 |
|
|
|
|
|
assert len(out_base) >= 3 |
|
|
assert len(out_base) == len(out_adapter0) == len(out_adapter1) |
|
|
assert not torch.allclose(out_base, out_adapter0, atol=atol, rtol=rtol) |
|
|
assert not torch.allclose(out_base, out_adapter1, atol=atol, rtol=rtol) |
|
|
assert not torch.allclose(out_adapter0, out_adapter1, atol=atol, rtol=rtol) |
|
|
|
|
|
|
|
|
adapters = ["__base__", "default", "adapter1"] |
|
|
adapter_names = [adapters[i % 3] for i in (range(9))] |
|
|
with torch.inference_mode(): |
|
|
out_mixed = F.softmax(model(**inputs, adapter_names=adapter_names).logits, dim=-1) |
|
|
|
|
|
assert torch.allclose(out_base[::3], out_mixed[::3], atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_adapter0[1::3], out_mixed[1::3], atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_adapter1[2::3], out_mixed[2::3], atol=atol, rtol=rtol) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_serialization_shared_tensors(self): |
|
|
model_checkpoint = "roberta-base" |
|
|
peft_config = LoraConfig( |
|
|
task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all" |
|
|
) |
|
|
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=11).to(self.device) |
|
|
model = get_peft_model(model, peft_config) |
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir: |
|
|
model.save_pretrained(tmp_dir, safe_serialization=True) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_deterministic_for_xpu |
|
|
@require_bitsandbytes |
|
|
def test_4bit_dora_inference(self): |
|
|
|
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=False, |
|
|
bnb_4bit_compute_dtype=torch.float32, |
|
|
) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
|
|
|
torch.manual_seed(0) |
|
|
config_lora = LoraConfig(r=8, init_lora_weights=False, use_dora=False) |
|
|
model = get_peft_model(model, config_lora).eval() |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
logits_lora = model(random_input).logits |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
torch.manual_seed(0) |
|
|
config_dora = LoraConfig(r=8, init_lora_weights=False, use_dora=True) |
|
|
model = get_peft_model(model, config_dora).eval() |
|
|
|
|
|
logits_dora = model(random_input).logits |
|
|
|
|
|
assert torch.allclose(logits_lora, logits_dora) |
|
|
|
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_deterministic_for_xpu |
|
|
@require_bitsandbytes |
|
|
def test_8bit_dora_inference(self): |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
|
|
|
torch.manual_seed(0) |
|
|
config_lora = LoraConfig(r=8, init_lora_weights=False, use_dora=False) |
|
|
model = get_peft_model(model, config_lora).eval() |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
logits_lora = model(random_input).logits |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
torch.manual_seed(0) |
|
|
config_dora = LoraConfig(r=8, init_lora_weights=False, use_dora=True) |
|
|
model = get_peft_model(model, config_dora).eval() |
|
|
|
|
|
logits_dora = model(random_input).logits |
|
|
|
|
|
assert torch.allclose(logits_lora, logits_dora) |
|
|
|
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear8bitLt) |
|
|
assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_4bit_dora_merging(self): |
|
|
|
|
|
torch.manual_seed(0) |
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=False, |
|
|
bnb_4bit_compute_dtype=torch.float32, |
|
|
) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"trl-internal-testing/tiny-random-LlamaForCausalLM", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
use_dora=True, |
|
|
) |
|
|
model = get_peft_model(model, config).eval() |
|
|
|
|
|
|
|
|
|
|
|
for name, module in model.named_modules(): |
|
|
if isinstance(module, LoraLinear4bit): |
|
|
module.lora_magnitude_vector["default"].weight = torch.nn.Parameter( |
|
|
10 * torch.rand_like(module.lora_magnitude_vector["default"].weight) |
|
|
) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_dora = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_adapter() |
|
|
out_merged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.unmerge_adapter() |
|
|
out_unmerged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model = model.merge_and_unload() |
|
|
out_unloaded = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 1e-5 |
|
|
rtol = 1e-3 |
|
|
|
|
|
assert not torch.allclose(out_base, out_dora, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_dora, out_merged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_dora, out_unmerged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_dora, out_unloaded, atol=atol, rtol=rtol) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_8bit_dora_merging(self): |
|
|
|
|
|
torch.manual_seed(0) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = LoraConfig( |
|
|
r=8, |
|
|
init_lora_weights=False, |
|
|
use_dora=True, |
|
|
) |
|
|
model = get_peft_model(model, config).eval() |
|
|
|
|
|
|
|
|
|
|
|
for name, module in model.named_modules(): |
|
|
if isinstance(module, LoraLinear8bitLt): |
|
|
module.lora_magnitude_vector["default"].weight = torch.nn.Parameter( |
|
|
10 * torch.rand_like(module.lora_magnitude_vector["default"].weight) |
|
|
) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_dora = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_adapter() |
|
|
out_merged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.unmerge_adapter() |
|
|
out_unmerged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model = model.merge_and_unload() |
|
|
out_unloaded = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 1e-3 |
|
|
rtol = 1 |
|
|
|
|
|
assert not torch.allclose(out_base, out_dora, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_dora, out_merged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_dora, out_unmerged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_dora, out_unloaded, atol=atol, rtol=rtol) |
|
|
|
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_dora_ephemeral_gpu_offload(self): |
|
|
torch.manual_seed(0) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
|
|
|
config = LoraConfig( |
|
|
r=128, |
|
|
init_lora_weights=False, |
|
|
use_dora=True, |
|
|
runtime_config=LoraRuntimeConfig( |
|
|
ephemeral_gpu_offload=True |
|
|
), |
|
|
) |
|
|
peft_model = get_peft_model(model, config).eval() |
|
|
|
|
|
assert peft_model.peft_config["default"].runtime_config.ephemeral_gpu_offload |
|
|
|
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir: |
|
|
peft_model.save_pretrained(tmp_dir) |
|
|
|
|
|
|
|
|
peft_model_cpu = PeftModel.from_pretrained( |
|
|
model, |
|
|
tmp_dir, |
|
|
device_map={"": "cpu"}, |
|
|
).eval() |
|
|
|
|
|
|
|
|
assert not peft_model_cpu.peft_config["default"].runtime_config.ephemeral_gpu_offload |
|
|
|
|
|
|
|
|
peft_model_ego = PeftModel.from_pretrained( |
|
|
model, |
|
|
tmp_dir, |
|
|
device_map={"": "cpu"}, |
|
|
ephemeral_gpu_offload=True, |
|
|
).eval() |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
with torch.inference_mode(): |
|
|
out_peft_model_cpu = F.softmax(peft_model_cpu(random_input).logits, dim=-1) |
|
|
out_peft_model_ego = F.softmax(peft_model_ego(random_input).logits, dim=-1) |
|
|
|
|
|
|
|
|
assert torch.allclose(out_peft_model_cpu, out_peft_model_ego) |
|
|
|
|
|
@require_torch_multi_accelerator |
|
|
@pytest.mark.multi_gpu_tests |
|
|
def test_dora_ephemeral_gpu_offload_multigpu(self): |
|
|
torch.manual_seed(0) |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
|
|
|
config = LoraConfig( |
|
|
r=16, |
|
|
init_lora_weights=False, |
|
|
use_dora=True, |
|
|
runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), |
|
|
) |
|
|
peft_model = get_peft_model(model, config).eval() |
|
|
|
|
|
layer = peft_model.base_model.model.model.decoder.layers[0].self_attn.v_proj |
|
|
lora_A, lora_B = layer.lora_A, layer.lora_B |
|
|
|
|
|
possible_combinations = ["cpu", self.device, f"{self.device}:0", f"{self.device}:1"] |
|
|
adapter_name = layer.active_adapter[0] |
|
|
for device_A in possible_combinations: |
|
|
la = lora_A.to(device_A) |
|
|
for device_B in possible_combinations: |
|
|
lb = lora_B.to(device_B) |
|
|
layer.lora_A, layer.lora_B = la, lb |
|
|
layer.lora_variant[adapter_name].init(layer, adapter_name=adapter_name) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_8bit_road_merging(self): |
|
|
|
|
|
torch.manual_seed(0) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
quantization_config=BitsAndBytesConfig(load_in_8bit=True), |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
config = RoadConfig( |
|
|
init_weights=False, |
|
|
) |
|
|
model = get_peft_model(model, config).eval() |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_road = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.merge_adapter() |
|
|
out_merged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.unmerge_adapter() |
|
|
out_unmerged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model = model.merge_and_unload() |
|
|
out_unloaded = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 1e-3 |
|
|
rtol = 1 |
|
|
|
|
|
assert not torch.allclose(out_base, out_road, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_road, out_merged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_road, out_unmerged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(out_road, out_unloaded, atol=atol, rtol=rtol) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
@require_bitsandbytes |
|
|
def test_4bit_road_merging(self): |
|
|
|
|
|
torch.manual_seed(0) |
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_use_double_quant=False, |
|
|
bnb_4bit_compute_dtype=torch.float32, |
|
|
) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"trl-internal-testing/tiny-random-LlamaForCausalLM", |
|
|
quantization_config=bnb_config, |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
|
|
|
out_base = model(random_input).logits |
|
|
probs_base = F.softmax(out_base, dim=-1) |
|
|
|
|
|
config = RoadConfig( |
|
|
init_weights=False, |
|
|
group_size=4, |
|
|
) |
|
|
model = get_peft_model(model, config).eval() |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out_road = model(random_input).logits |
|
|
probs_road = F.softmax(out_road, dim=-1) |
|
|
|
|
|
model.merge_adapter() |
|
|
probs_merged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model.unmerge_adapter() |
|
|
probs_unmerged = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
model = model.merge_and_unload() |
|
|
probs_unloaded = F.softmax(model(random_input).logits, dim=-1) |
|
|
|
|
|
atol = 1e-5 |
|
|
rtol = 1e-3 |
|
|
|
|
|
|
|
|
assert not torch.allclose(out_base, out_road, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(probs_road, probs_merged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(probs_road, probs_unmerged, atol=atol, rtol=rtol) |
|
|
assert torch.allclose(probs_road, probs_unloaded, atol=atol, rtol=rtol) |
|
|
|
|
|
def test_apply_GS_hra_inference(self): |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
|
|
|
torch.manual_seed(0) |
|
|
config_hra = HRAConfig(r=8, init_weights=True, apply_GS=False) |
|
|
model = get_peft_model(model, config_hra).eval() |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
logits_hra = model(random_input).logits |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
torch_dtype=torch.float32, |
|
|
) |
|
|
torch.manual_seed(0) |
|
|
config_hra_GS = HRAConfig(r=8, init_weights=True, apply_GS=True) |
|
|
model = get_peft_model(model, config_hra_GS) |
|
|
|
|
|
logits_hra_GS = model(random_input).logits |
|
|
|
|
|
assert not torch.allclose(logits_hra, logits_hra_GS) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_apply_GS_hra_conv2d_inference(self): |
|
|
|
|
|
model_id = "microsoft/resnet-18" |
|
|
image_processor = AutoImageProcessor.from_pretrained(model_id) |
|
|
image = load_cat_image() |
|
|
data = image_processor(image, return_tensors="pt") |
|
|
|
|
|
model = AutoModelForImageClassification.from_pretrained(model_id).eval() |
|
|
torch.manual_seed(0) |
|
|
config_hra = HRAConfig(r=8, init_weights=True, target_modules=["convolution"], apply_GS=False) |
|
|
model = get_peft_model(model, config_hra).eval() |
|
|
|
|
|
logits_hra = model(**data).logits |
|
|
|
|
|
model = AutoModelForImageClassification.from_pretrained(model_id).eval() |
|
|
torch.manual_seed(0) |
|
|
config_hra_GS = HRAConfig(r=8, init_weights=True, target_modules=["convolution"], apply_GS=True) |
|
|
model = get_peft_model(model, config_hra_GS) |
|
|
|
|
|
logits_hra_GS = model(**data).logits |
|
|
|
|
|
assert not torch.allclose(logits_hra, logits_hra_GS) |
|
|
|
|
|
@require_non_cpu |
|
|
@pytest.mark.single_gpu_tests |
|
|
def test_r_odd_hra_inference(self): |
|
|
|
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"facebook/opt-125m", |
|
|
torch_dtype=torch.float32, |
|
|
).eval() |
|
|
|
|
|
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) |
|
|
|
|
|
torch.manual_seed(0) |
|
|
logits = model(random_input).logits |
|
|
|
|
|
config_hra = HRAConfig(r=7, init_weights=True, apply_GS=False) |
|
|
model = get_peft_model(model, config_hra).eval() |
|
|
logits_hra = model(random_input).logits |
|
|
|
|
|
assert not torch.allclose(logits, logits_hra) |
|
|
|
|
|
|
|
|
@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="test requires a GPU or XPU") |
|
|
@pytest.mark.single_gpu_tests |
|
|
class TestSameAdapterDifferentDevices: |
|
|
device = infer_device() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture |
|
|
def mlp(self): |
|
|
class MLP(nn.Module): |
|
|
def __init__(self, bias=True): |
|
|
super().__init__() |
|
|
self.lin0 = nn.Linear(8, 32, bias=bias) |
|
|
self.lin1 = nn.Linear(32, 2, bias=bias) |
|
|
|
|
|
return MLP() |
|
|
|
|
|
@pytest.fixture |
|
|
def emb_conv1d(self): |
|
|
class ModelEmbConv1D(nn.Module): |
|
|
def __init__(self, emb_size=100): |
|
|
super().__init__() |
|
|
self.emb = nn.Embedding(emb_size, 5) |
|
|
self.conv1d = Conv1D(1, 5) |
|
|
|
|
|
return ModelEmbConv1D() |
|
|
|
|
|
@pytest.fixture |
|
|
def conv2d(self): |
|
|
class ModelConv2D(nn.Module): |
|
|
def __init__(self): |
|
|
super().__init__() |
|
|
self.conv2d = nn.Conv2d(5, 10, 3) |
|
|
|
|
|
return ModelConv2D() |
|
|
|
|
|
def test_lora_one_target_add_new_adapter_does_not_change_device(self, mlp): |
|
|
config = LoraConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.lora_A.cpu() |
|
|
model.lin0.lora_B.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.lin0.lora_B.default.weight.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.lin0.lora_B.default.weight.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.lora_A.other.weight.device.type == self.device |
|
|
assert model.lin0.lora_B.other.weight.device.type == self.device |
|
|
|
|
|
def test_lora_multiple_targets_add_new_adapater_does_not_change_device(self, mlp): |
|
|
|
|
|
config = LoraConfig(target_modules=["lin0", "lin1"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
|
|
|
model.lin1.lora_A.cpu() |
|
|
model.lin1.lora_B.cpu() |
|
|
|
|
|
|
|
|
assert model.lin1.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.lin1.lora_B.default.weight.device.type == "cpu" |
|
|
assert model.lin1.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.lora_A.default.weight.device.type == self.device |
|
|
assert model.lin0.lora_B.default.weight.device.type == self.device |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin1.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.lin1.lora_B.default.weight.device.type == "cpu" |
|
|
assert model.lin1.base_layer.weight.device.type == self.device |
|
|
|
|
|
assert model.lin0.lora_A.default.weight.device.type == self.device |
|
|
assert model.lin0.lora_B.default.weight.device.type == self.device |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.lora_A.other.weight.device.type == self.device |
|
|
assert model.lin0.lora_B.other.weight.device.type == self.device |
|
|
assert model.lin1.lora_A.other.weight.device.type == self.device |
|
|
assert model.lin1.lora_B.other.weight.device.type == self.device |
|
|
|
|
|
def test_lora_embedding_target_add_new_adapter_does_not_change_device(self, emb_conv1d): |
|
|
|
|
|
config = LoraConfig(target_modules=["emb"]) |
|
|
model = get_peft_model(emb_conv1d, config) |
|
|
model = model.to(self.device) |
|
|
model.emb.lora_embedding_A.cpu() |
|
|
model.emb.lora_embedding_B.cpu() |
|
|
|
|
|
|
|
|
assert model.emb.lora_embedding_A.default.device.type == "cpu" |
|
|
assert model.emb.lora_embedding_B.default.device.type == "cpu" |
|
|
assert model.emb.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.emb.lora_embedding_A.default.device.type == "cpu" |
|
|
assert model.emb.lora_embedding_B.default.device.type == "cpu" |
|
|
|
|
|
assert model.emb.weight.device.type == self.device |
|
|
assert model.emb.lora_embedding_A.other.device.type == self.device |
|
|
assert model.emb.lora_embedding_B.other.device.type == self.device |
|
|
|
|
|
def test_lora_conv1d_target_add_new_adapter_does_not_change_device(self, emb_conv1d): |
|
|
|
|
|
config = LoraConfig(target_modules=["conv1d"]) |
|
|
model = get_peft_model(emb_conv1d, config) |
|
|
model = model.to(self.device) |
|
|
model.conv1d.lora_A.cpu() |
|
|
model.conv1d.lora_B.cpu() |
|
|
|
|
|
|
|
|
assert model.conv1d.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.conv1d.lora_B.default.weight.device.type == "cpu" |
|
|
assert model.conv1d.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.conv1d.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.conv1d.lora_B.default.weight.device.type == "cpu" |
|
|
|
|
|
assert model.conv1d.weight.device.type == self.device |
|
|
assert model.conv1d.lora_A.other.weight.device.type == self.device |
|
|
assert model.conv1d.lora_B.other.weight.device.type == self.device |
|
|
|
|
|
def test_lora_dora_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = LoraConfig(target_modules=["lin0"], use_dora=True) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.lora_A.cpu() |
|
|
model.lin0.lora_B.cpu() |
|
|
model.lin0.lora_magnitude_vector.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.lin0.lora_B.default.weight.device.type == "cpu" |
|
|
assert model.lin0.lora_magnitude_vector.default.weight.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.lora_A.default.weight.device.type == "cpu" |
|
|
assert model.lin0.lora_B.default.weight.device.type == "cpu" |
|
|
assert model.lin0.lora_magnitude_vector.default.weight.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.lora_A.other.weight.device.type == self.device |
|
|
assert model.lin0.lora_B.other.weight.device.type == self.device |
|
|
assert model.lin0.lora_magnitude_vector.other.weight.device.type == self.device |
|
|
|
|
|
def test_adalora_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
|
|
|
config = AdaLoraConfig(target_modules=["lin0"], inference_mode=True, total_step=1) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.lora_A.cpu() |
|
|
model.lin0.lora_E.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.lora_A.default.device.type == "cpu" |
|
|
assert model.lin0.lora_E.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.lora_A.default.device.type == "cpu" |
|
|
assert model.lin0.lora_E.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.lora_A.other.device.type == self.device |
|
|
assert model.lin0.lora_E.other.device.type == self.device |
|
|
|
|
|
def test_boft_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = BOFTConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.boft_R.cpu() |
|
|
model.lin0.boft_s.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.boft_R.default.device.type == "cpu" |
|
|
assert model.lin0.boft_s.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.boft_R.default.device.type == "cpu" |
|
|
assert model.lin0.boft_s.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.boft_R.other.device.type == self.device |
|
|
assert model.lin0.boft_s.other.device.type == self.device |
|
|
|
|
|
def test_ia3_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.ia3_l.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.ia3_l.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.ia3_l.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.ia3_l.other.device.type == self.device |
|
|
|
|
|
@pytest.mark.xfail(reason="LN Tuning handling of multiple adapters may not be correct", strict=True) |
|
|
def test_ln_tuning_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = LNTuningConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.ln_tuning_layers.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.ln_tuning_layers.default.weight.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.ln_tuning_layers.default.weight.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.ln_tuning_layers.other.weight.device.type == self.device |
|
|
|
|
|
def test_loha_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = LoHaConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.hada_w1_a.cpu() |
|
|
model.lin0.hada_w2_b.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.hada_w1_a.default.device.type == "cpu" |
|
|
assert model.lin0.hada_w2_b.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.hada_w1_a.default.device.type == "cpu" |
|
|
assert model.lin0.hada_w2_b.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.hada_w1_a.other.device.type == self.device |
|
|
assert model.lin0.hada_w2_b.other.device.type == self.device |
|
|
|
|
|
def test_lokr_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = LoKrConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.lokr_w1.cpu() |
|
|
model.lin0.lokr_w2.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.lokr_w1.default.device.type == "cpu" |
|
|
assert model.lin0.lokr_w2.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.lokr_w1.default.device.type == "cpu" |
|
|
assert model.lin0.lokr_w2.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.lokr_w1.other.device.type == self.device |
|
|
assert model.lin0.lokr_w2.other.device.type == self.device |
|
|
|
|
|
def test_oft_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = OFTConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.oft_R.default.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.oft_R.default.weight.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.oft_R.default.weight.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.oft_R.other.weight.device.type == self.device |
|
|
|
|
|
def test_vera_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = VeraConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.vera_A.cpu() |
|
|
model.lin0.vera_lambda_d.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.vera_A.default.device.type == "cpu" |
|
|
assert model.lin0.vera_lambda_d.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.vera_A.default.device.type == "cpu" |
|
|
assert model.lin0.vera_lambda_d.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.vera_A.other.device.type == self.device |
|
|
assert model.lin0.vera_lambda_d.other.device.type == self.device |
|
|
|
|
|
def test_randlora_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = RandLoraConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.randlora_A.cpu() |
|
|
model.lin0.randlora_lambda.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.randlora_A.default.device.type == "cpu" |
|
|
assert model.lin0.randlora_lambda.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.randlora_A.default.device.type == "cpu" |
|
|
assert model.lin0.randlora_lambda.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.randlora_A.other.device.type == self.device |
|
|
assert model.lin0.randlora_lambda.other.device.type == self.device |
|
|
|
|
|
def test_vblora_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = VBLoRAConfig(target_modules=["lin0"], vector_length=2) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.vblora_logits_A.cpu() |
|
|
model.lin0.vblora_logits_B.cpu() |
|
|
model.lin0.vblora_vector_bank.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.vblora_logits_A.default.device.type == "cpu" |
|
|
assert model.lin0.vblora_logits_B.default.device.type == "cpu" |
|
|
assert model.lin0.vblora_vector_bank.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.vblora_logits_A.default.device.type == "cpu" |
|
|
assert model.lin0.vblora_logits_B.default.device.type == "cpu" |
|
|
assert model.lin0.vblora_vector_bank.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.vblora_logits_A.other.device.type == self.device |
|
|
assert model.lin0.vblora_logits_B.other.device.type == self.device |
|
|
assert model.lin0.vblora_vector_bank.other.device.type == self.device |
|
|
|
|
|
def test_hra_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = HRAConfig(target_modules=["lin0"]) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.hra_u.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.hra_u.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.hra_u.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.hra_u.other.device.type == self.device |
|
|
|
|
|
def test_road_add_new_adapter_does_not_change_device(self, mlp): |
|
|
|
|
|
config = RoadConfig(target_modules=["lin0"], group_size=2) |
|
|
model = get_peft_model(mlp, config) |
|
|
model = model.to(self.device) |
|
|
model.lin0.road_theta.cpu() |
|
|
|
|
|
|
|
|
assert model.lin0.road_theta.default.device.type == "cpu" |
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
|
|
|
model.add_adapter("other", config) |
|
|
|
|
|
assert model.lin0.road_theta.default.device.type == "cpu" |
|
|
|
|
|
assert model.lin0.base_layer.weight.device.type == self.device |
|
|
assert model.lin0.road_theta.other.device.type == self.device |
|
|
|