import torch from datasets import load_dataset, load_metric from transformers import ( AutoProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer, ) from PIL import Image # --- 1. CONFIGURATION --- MODEL_NAME = "Salesforce/blip-image-captioning-base" DATASET_ID = "lambdalabs/pokemon-blip-captions" # Replace with COCO or your specialized dataset OUTPUT_DIR = "./blip-image-captioning-finetuned" NUM_TRAIN_EPOCHS = 3 BATCH_SIZE = 16 # --- 2. LOAD PROCESSOR AND MODEL --- # The processor handles both image feature extraction and text tokenization processor = AutoProcessor.from_pretrained(MODEL_NAME) model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME) # --- 3. LOAD & PREPARE DATASET --- print(f"Loading dataset: {DATASET_ID}") ds = load_dataset(DATASET_ID) # We'll use the 'train' split and split it further for a validation set ds = ds['train'].train_test_split(test_size=0.1) train_ds = ds['train'] eval_ds = ds['test'] # Set the maximum sequence length for the captions max_caption_length = 50 def preprocess_data(examples): """Tokenizes captions and processes images.""" # Process images and captions together # BLIP processor handles image resizing, normalization, and text tokenization inputs = processor( images=[image.convert("RGB") for image in examples["image"]], text=examples["text"], padding="max_length", max_length=max_caption_length, truncation=True, return_tensors="pt" ) # The labels for Causal Language Modeling are the input tokens shifted right # The tokenizer includes BOS/EOS tokens which are essential here inputs["labels"] = inputs["input_ids"] # Delete the original image data since the processor has converted it to pixel_values del inputs["input_ids"] del inputs["attention_mask"] return inputs # Apply the preprocessing function to the dataset print("Applying preprocessing to the dataset...") # set_transform is highly efficient as it applies the function on-the-fly train_ds.set_transform(preprocess_data) eval_ds.set_transform(preprocess_data) # --- 4. TRAINING SETUP (Trainer API) --- # Define evaluation metric (often BLEU or ROUGE, but WER is common for generation) # Note: For simplicity, we skip complex metric computation in this basic script. training_args = TrainingArguments( output_dir=OUTPUT_DIR, num_train_epochs=NUM_TRAIN_EPOCHS, per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE, learning_rate=5e-5, evaluation_strategy="epoch", logging_dir=f"{OUTPUT_DIR}/logs", logging_steps=100, save_strategy="epoch", load_best_model_at_end=True, fp16=torch.cuda.is_available(), # Use mixed precision if a GPU is available push_to_hub=True, # Set this to True to push the model to the Hugging Face Hub! hub_model_id=f"YOUR_HUGGINGFACE_USERNAME/blip-finetuned-{DATASET_ID.split('/')[-1]}", # Customize this ) trainer = Trainer( model=model, args=training_args, train_dataset=train_ds, eval_dataset=eval_ds, tokenizer=processor.tokenizer, # Pass the tokenizer for the Trainer to use ) # --- 5. START TRAINING --- print("Starting training...") trainer.train() print("Training complete! Pushing model to Hub...") # --- 6. SAVE & PUSH TO HUB --- trainer.save_model(OUTPUT_DIR) # The push_to_hub=True in TrainingArguments automatically handles the final push. # You will need to log in to your Hugging Face account via the command line # (huggingface-cli login) or in a notebook (from huggingface_hub import notebook_login; notebook_login()).