"""LoRA SFT fine-tuning of a vision-language model with Unsloth + TRL: setup phase."""
from unsloth import FastVisionModel  # FastLanguageModel for LLMs
import torch
from datasets import load_dataset
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from transformers import TextStreamer
import datetime

# Timestamp suffix keeps each run's output/checkpoint directories unique.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Load the locally pre-trained checkpoint in 16-bit for LoRA fine-tuning.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name="/home/rzhong/project/unsloth/model_pretrain_20250301_113944",
    # NOTE(review): the checkpoint was reportedly already 4-bit quantized; whether
    # load_in_4bit should also be True is untested per the original author's note.
    load_in_4bit=False,  # False => 16-bit LoRA (uses more memory)
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
    # Unsloth supports ~4x context fine-tuning; the base model supports 8192,
    # so 2048 suffices here.
    max_seq_length=2048,
    # A100 supports bfloat16, which reduces memory use; default is None,
    # torch.float16 is the alternative.
    dtype=torch.bfloat16,
)

# Attach LoRA adapters to both the vision and language towers.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,       # False to skip vision layers
    finetune_language_layers=True,     # False to skip language layers
    finetune_attention_modules=True,   # False to skip attention layers
    finetune_mlp_modules=True,         # False to skip MLP layers
    r=16,             # larger rank => more capacity, but may overfit
    lora_alpha=16,    # recommended: alpha >= r
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is supported but disabled
    loftq_config=None,  # LoftQ not used
    # target_modules = "all-linear",  # optional alternative to the explicit list
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

dataset = load_dataset(
    "/home/share/rzhong/dataset/google-landmark/dataset_4/dataset_file",
    split="train",
)
print(dataset)

instruction = "描述这张图片。"
# instruction = "Write the LaTeX representation for this image."
def convert_to_conversation(sample):
    """Wrap one dataset row as a chat-format example.

    The user turn carries the fixed prompt plus the row's image; the
    assistant turn carries the row's caption text.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["text"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}


# Materialize the whole dataset in chat format up front.
converted_dataset = [convert_to_conversation(sample) for sample in dataset]
print(converted_dataset[0])

FastVisionModel.for_training(model)  # switch the model into training mode

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),  # required for vision SFT
    train_dataset=converted_dataset,
    args=SFTConfig(
        per_device_train_batch_size=2,
        # Originally 4. Increasing this acts like a larger batch size without
        # extra memory cost and smooths the loss curve.
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # max_steps = None,
        num_train_epochs=10,  # set this instead of max_steps for full training runs
        learning_rate=5e-5,   # values tried: 2e-4, 1e-4, 5e-5, 2e-5
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=f"outputs_pretrain_sft_{timestamp}",
        report_to="none",  # e.g. "wandb" for Weights & Biases
        # The items below are REQUIRED for vision fine-tuning:
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    ),
)

trainer_stats = trainer.train()

# Persist the LoRA adapter and tokenizer locally under a timestamped directory.
model.save_pretrained(f"lora_model_pretrain_sft_{timestamp}")
tokenizer.save_pretrained(f"lora_model_pretrain_sft_{timestamp}")