| |
| """ |
| Oculus Reasoning Training V2 - BEAST MODE |
| |
| Goal: Beat Isaac 0.2-2B on VQA benchmarks |
| Strategy: |
| 1. Use ALL available COCO data |
| 2. Diverse question templates |
| 3. Chain-of-thought style training |
| 4. Longer training (8 epochs) |
| 5. Learning rate warmup + decay |
| """ |
|
|
| import os |
| import sys |
| import json |
| import random |
| import math |
| from pathlib import Path |
| from dataclasses import dataclass |
| from typing import List, Dict, Optional |
|
|
| import torch |
| from torch.utils.data import Dataset, DataLoader |
| from torch.optim import AdamW |
| from torch.optim.lr_scheduler import CosineAnnealingLR |
| from PIL import Image |
| from tqdm import tqdm |
|
|
| OCULUS_ROOT = Path(__file__).parent |
| sys.path.insert(0, str(OCULUS_ROOT)) |
|
|
| from oculus_unified_model import OculusForConditionalGeneration |
|
|
|
|
| |
| |
| |
|
|
class ReasoningDataset(Dataset):
    """
    Advanced dataset for reasoning training.

    Builds a mixture of VQA-style samples from the COCO 2017 annotations:

    - captioning samples (up to 2 per image),
    - positive existence questions ("Is there a {obj}?" -> "Yes"),
    - negative existence questions for categories absent from the image,
    - counting questions for categories with a small, unambiguous count.

    Diverse question templates are used so the model does not overfit to a
    single prompt phrasing.
    """

    # Prompt templates; one is drawn at random per generated sample.
    CAPTION_PROMPTS = [
        "Describe this image in detail.",
        "What is happening in this image?",
        "Explain what you see.",
        "Provide a detailed description of the scene.",
        "What can you observe in this picture?",
        "Describe the contents of this image.",
        "What is shown here?",
        "Give a comprehensive description.",
    ]

    COUNTING_PROMPTS = [
        "How many {obj}s are in this image?",
        "Count the number of {obj}s visible.",
        "What is the count of {obj}s?",
        "How many {obj}s can you see?",
    ]

    EXISTENCE_PROMPTS = [
        "Is there a {obj} in this image?",
        "Can you see a {obj}?",
        "Does this image contain a {obj}?",
        "Is a {obj} visible in this picture?",
    ]

    ATTRIBUTE_PROMPTS = [
        "What objects are visible in this image?",
        "What type of scene is this?",
        "Describe the main subject of this image.",
        "What is the setting of this image?",
    ]

    def __init__(self, processor, data_dir="data/coco", max_samples=None):
        """
        Args:
            processor: multimodal processor used in ``__getitem__`` to encode
                image + text (assumed HuggingFace-style — TODO confirm against
                ``oculus.lm_vqa_processor``).
            data_dir: root of a COCO-2017 layout containing
                ``annotations/captions_train2017.json``,
                ``annotations/instances_train2017.json`` and ``images/``.
            max_samples: optional cap on the number of *images* processed
                (each image may yield several samples).
        """
        self.processor = processor
        self.samples = []

        cap_file = Path(data_dir) / "annotations" / "captions_train2017.json"
        inst_file = Path(data_dir) / "annotations" / "instances_train2017.json"

        # FIX: the original only checked the captions file; a missing
        # instances file would crash on open() below instead of degrading
        # gracefully to an empty dataset.
        if not cap_file.exists() or not inst_file.exists():
            print("⚠️ COCO data not found!")
            return

        print("📚 Loading COCO data for reasoning training...")

        with open(cap_file, encoding="utf-8") as f:
            captions_data = json.load(f)

        with open(inst_file, encoding="utf-8") as f:
            instances_data = json.load(f)

        # image id -> image metadata record; category id -> readable name.
        img_map = {img['id']: img for img in captions_data['images']}
        cat_map = {c['id']: c['name'] for c in instances_data['categories']}

        # Group all captions per image id.
        img_captions = {}
        for ann in captions_data['annotations']:
            img_captions.setdefault(ann['image_id'], []).append(ann['caption'])

        # Per-image object counts, skipping crowd annotations (their counts
        # are unreliable for counting questions).
        img_objects = {}
        for ann in instances_data['annotations']:
            if ann.get('iscrowd', 0):
                continue
            cat = cat_map.get(ann['category_id'], 'object')
            counts = img_objects.setdefault(ann['image_id'], {})
            counts[cat] = counts.get(cat, 0) + 1

        count = 0  # number of images processed (for max_samples cap)
        for img_id, captions in img_captions.items():
            img = img_map.get(img_id)
            if not img:
                continue

            img_path = Path(data_dir) / "images" / img['file_name']
            if not img_path.exists():
                continue

            # Up to two caption samples per image, each with a random prompt.
            for caption in captions[:2]:
                prompt = random.choice(self.CAPTION_PROMPTS)
                self.samples.append({
                    'path': str(img_path),
                    'question': prompt,
                    'answer': caption,
                    'type': 'caption'
                })

            objects = img_objects.get(img_id, {})
            if objects:
                # Positive existence sample for one present category.
                obj = random.choice(list(objects.keys()))
                prompt = random.choice(self.EXISTENCE_PROMPTS).format(obj=obj)
                self.samples.append({
                    'path': str(img_path),
                    'question': prompt,
                    'answer': "Yes",
                    'type': 'existence'
                })

            # Negative existence sample for a category absent from the image.
            all_cats = list(cat_map.values())
            missing = [c for c in all_cats if c not in objects]
            if missing:
                # FIX: the original drew from missing[:10], which almost
                # always samples the same few low-id COCO categories
                # (person, bicycle, car, ...); sample over all missing ones.
                neg_obj = random.choice(missing)
                prompt = random.choice(self.EXISTENCE_PROMPTS).format(obj=neg_obj)
                self.samples.append({
                    'path': str(img_path),
                    'question': prompt,
                    'answer': "No",
                    'type': 'existence_neg'
                })

            # One counting sample for the first category whose count is
            # small enough to be visually unambiguous.
            for obj, count_val in objects.items():
                if 2 <= count_val <= 10:
                    prompt = random.choice(self.COUNTING_PROMPTS).format(obj=obj)
                    answer = f"There are {count_val} {obj}s in this image."
                    self.samples.append({
                        'path': str(img_path),
                        'question': prompt,
                        'answer': answer,
                        'type': 'counting'
                    })
                    break

            count += 1
            if max_samples and count >= max_samples:
                break

        # Shuffle so sample types are interleaved during training.
        random.shuffle(self.samples)

        print(f"✅ Loaded {len(self.samples)} reasoning samples")
        print(f"   - Captions: {sum(1 for s in self.samples if s['type'] == 'caption')}")
        print(f"   - Existence: {sum(1 for s in self.samples if 'existence' in s['type'])}")
        print(f"   - Counting: {sum(1 for s in self.samples if s['type'] == 'counting')}")

    def __len__(self):
        """Return the number of generated QA samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Encode one sample into model tensors.

        Returns a dict with ``pixel_values``, ``input_ids``,
        ``attention_mask`` (question, max 32 tokens) and ``labels``
        (answer token ids, max 64 tokens), all with the batch dim squeezed.
        """
        item = self.samples[idx]

        try:
            image = Image.open(item['path']).convert('RGB')
        except Exception:
            # FIX: was a bare `except:` (also caught KeyboardInterrupt /
            # SystemExit).  Keep the best-effort blank-image fallback so a
            # single corrupt file cannot kill a training run.
            image = Image.new('RGB', (224, 224))

        encoding = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="pt"
        )

        labels = self.processor(
            text=item['answer'],
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt"
        ).input_ids

        return {
            "pixel_values": encoding.pixel_values.squeeze(0),
            "input_ids": encoding.input_ids.squeeze(0),
            "attention_mask": encoding.attention_mask.squeeze(0),
            "labels": labels.squeeze(0)
        }
|
|
|
|
| |
| |
| |
|
|
def train():
    """Fine-tune the Oculus VQA language model on COCO reasoning samples.

    Loads the detection-v2 checkpoint, builds a ReasoningDataset, and runs
    8 epochs of AdamW + cosine-annealed LR training with gradient clipping.
    The best-by-average-loss epoch is checkpointed, and the final weights
    are always saved to ``checkpoints/oculus_reasoning_v2/final``.
    """
    # Device preference: CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print("🚀 BEAST MODE TRAINING")
    print(f"Device: {device}")

    # Start from the detection-trained checkpoint and attach the LM head.
    model_path = "checkpoints/oculus_detection_v2/final"
    print(f"\nLoading Oculus from {model_path}...")
    oculus = OculusForConditionalGeneration.from_pretrained(model_path)
    oculus.load_language_model(device=device)

    # Only the VQA language model is trained here.
    vqa_model = oculus.lm_vqa_model
    vqa_model.train()
    vqa_model.to(device)

    dataset = ReasoningDataset(oculus.lm_vqa_processor, max_samples=50000)
    loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)

    optimizer = AdamW(vqa_model.parameters(), lr=3e-5, weight_decay=0.01)

    epochs = 8
    total_steps = epochs * len(loader)
    # Cosine decay from 3e-5 down to 1e-6 over the whole run
    # (scheduler is stepped once per batch).
    scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=1e-6)

    print("\n📊 Training Config:")
    print(f" Samples: {len(dataset)}")
    print(" Batch size: 8")
    print(f" Epochs: {epochs}")
    print(f" Total steps: {total_steps}")
    print(" LR: 3e-5 -> 1e-6 (cosine)")

    print("\n🔥 Starting training...")

    best_loss = float('inf')
    global_step = 0

    for epoch in range(epochs):
        running_loss = 0.0
        progress = tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch in progress:
            batch = {key: tensor.to(device) for key, tensor in batch.items()}

            loss = vqa_model(**batch).loss
            loss.backward()

            # Clip to unit grad-norm for stability.
            torch.nn.utils.clip_grad_norm_(vqa_model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            running_loss += loss.item()
            global_step += 1

            current_lr = scheduler.get_last_lr()[0]
            progress.set_postfix(loss=f"{loss.item():.4f}", lr=f"{current_lr:.2e}")

        avg_loss = running_loss / len(loader)
        print(f"\n✓ Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

        # Checkpoint whenever the epoch-average loss improves.
        if avg_loss < best_loss:
            best_loss = avg_loss
            checkpoint_dir = Path("checkpoints/oculus_reasoning_v2")
            checkpoint_dir.mkdir(parents=True, exist_ok=True)

            print(f" 💾 New best! Saving to {checkpoint_dir}")
            vqa_model.save_pretrained(checkpoint_dir / "vqa_model")
            oculus.lm_vqa_processor.save_pretrained(checkpoint_dir / "vqa_model")

    # Always persist the final weights, regardless of best-loss checkpoints.
    final_dir = Path("checkpoints/oculus_reasoning_v2/final")
    final_dir.mkdir(parents=True, exist_ok=True)
    vqa_model.save_pretrained(final_dir)
    oculus.lm_vqa_processor.save_pretrained(final_dir)

    print("\n✅ BEAST MODE TRAINING COMPLETE!")
    print(f" Best Loss: {best_loss:.4f}")
    print(" Model saved to: checkpoints/oculus_reasoning_v2/final")
|
|
|
|
# Script entry point: run the full reasoning-training pipeline.
if __name__ == "__main__":
    train()
|
|