updated
- .gitignore +2 -0
- .python-version +2 -0
- train.py +75 -0
.gitignore
ADDED
venv
my_blip_computer_thoughts/
.python-version
ADDED
3.9.21
train.py
ADDED
"""
train.py

A complete example of fine-tuning BLIP on 'agentsea/computer-thoughts' for captioning.
All processing happens in the collate function, which keeps things simple and avoids shape mismatches.
"""

import torch
from datasets import load_dataset, Image as HFImage
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    TrainingArguments,
    Trainer,
)

# 1. Load the dataset
dataset = load_dataset("agentsea/computer-thoughts")

# 2. Rename "image_before" -> "image" and cast to HFImage so it decodes to a PIL Image
dataset = dataset.rename_column("image_before", "image")
dataset = dataset.cast_column("image", HFImage())

# 3. Create a small subset for the demo (just 5 examples). Remove this to train on the full data.
train_subset = dataset["train"].select(range(5))

# 4. Load the BLIP image-captioning model (large checkpoint) and its processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# 5. Define a collate_fn that transforms images + text on the fly
def collate_fn(examples):
    # `examples` is a list of dicts, each with keys such as:
    # 'task', 'image', 'image_after', 'action', 'thought', 'bad_thought', 'subtask', 'bad_subtask'.
    # We'll use 'image' (PIL) as the input and 'subtask' (string) as the caption.
    images = [ex["image"] for ex in examples]  # PIL images
    texts = [ex["subtask"] for ex in examples]  # or whichever text column you want

    inputs = processor(images=images, text=texts, return_tensors="pt", padding=True)

    # Add labels so the model can compute a cross-entropy loss.
    # For a basic approach: labels = input_ids
    inputs["labels"] = inputs["input_ids"].clone()

    return inputs

# 6. Define training arguments
training_args = TrainingArguments(
    output_dir="./my_blip_computer_thoughts",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    logging_steps=5,
    save_steps=20,
    save_total_limit=2,
    remove_unused_columns=False,  # important when custom columns are in the dataset
)

# 7. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,  # or dataset["train"] for the full set
    data_collator=collate_fn,
)

# 8. Train
trainer.train()

# 9. Push the final model + processor to the Hugging Face Hub
# (Make sure you're logged in: huggingface-cli login)
model.push_to_hub("zeddotes/blip-computer-thoughts")
processor.push_to_hub("zeddotes/blip-computer-thoughts")

print("Done training and pushed model to zeddotes/blip-computer-thoughts!")
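Once the push succeeds, the fine-tuned checkpoint can be loaded straight from the Hub for inference. A minimal sketch (the screenshot filename and generation length here are illustrative, not part of the commit):

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("zeddotes/blip-computer-thoughts")
model = BlipForConditionalGeneration.from_pretrained("zeddotes/blip-computer-thoughts")

# Caption a screenshot the same way the training images were processed
image = Image.open("screenshot.png").convert("RGB")  # hypothetical input file
inputs = processor(images=image, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(out[0], skip_special_tokens=True))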