kobiakor15 committed on
Commit
989f87b
·
verified ·
1 Parent(s): 3ab6ebf

Upload training/train_detection_extended.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training/train_detection_extended.py +485 -0
training/train_detection_extended.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCULUS Extended Detection Training
4
+
5
+ Longer training with more data for better detection accuracy.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import time
12
+ import random
13
+ from pathlib import Path
14
+ from dataclasses import dataclass
15
+ from typing import List, Dict, Tuple, Optional
16
+
17
+ import numpy as np
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from torch.utils.data import Dataset, DataLoader
22
+ from PIL import Image
23
+
24
+ OCULUS_ROOT = Path(__file__).parent
25
+ sys.path.insert(0, str(OCULUS_ROOT))
26
+
27
+ from oculus_unified_model import OculusForConditionalGeneration, OculusConfig
28
+
29
+
30
@dataclass
class ExtendedTrainingConfig:
    """Configuration for the extended detection training run.

    Field names, ordering, and defaults are the public interface consumed by
    `ExtendedTrainer` and `main()`; do not reorder or rename them.
    """

    # --- Dataset locations ---
    data_dir: str = "data/coco"                                     # COCO root directory
    annotations_file: str = "annotations/instances_train2017.json"  # relative to data_dir
    images_subdir: str = "images"                                   # relative to data_dir

    # --- Optimization schedule (the "extended" run: more data, more steps) ---
    batch_size: int = 1
    learning_rate: float = 3e-4
    num_epochs: int = 5
    warmup_steps: int = 200
    max_samples: int = 8000  # cap on the number of indexed training images

    # --- Checkpoint to fine-tune from ---
    checkpoint_path: str = "checkpoints/oculus_detection/final"

    # --- Checkpoint output ---
    save_every: int = 500  # optimizer steps between periodic saves
    checkpoint_dir: str = "checkpoints/oculus_detection_v2"

    # --- Console logging cadence ---
    log_every: int = 50  # optimizer steps between progress lines
54
+
55
+
56
class COCODetectionDataset:
    """COCO Detection dataset.

    Indexes COCO-format instance annotations into per-image samples. Each
    sample is a dict with:
      - 'image_path': absolute/relative path string to the image file
      - 'boxes':  list of [x1, y1, x2, y2] normalized to [0, 1]
      - 'labels': list of contiguous category indices (0..79 for full COCO)
      - 'width'/'height': original image size in pixels

    Images are only checked for existence here; decoding happens at train time.
    """

    # Reference list of the 80 COCO class names, ordered by the contiguous
    # label indices produced below (unused internally; exposed for consumers).
    COCO_CLASSES = [
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
        'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
        'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
        'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
        'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
        'toothbrush'
    ]

    def __init__(self, data_dir: str, annotations_file: str, images_subdir: str,
                 max_samples: Optional[int] = None):
        """Build the sample index.

        Args:
            data_dir: Root directory of the COCO data.
            annotations_file: Instances JSON path, relative to ``data_dir``.
            images_subdir: Image directory name under ``data_dir``.
            max_samples: Stop indexing after this many images (None = no cap).

        Raises:
            FileNotFoundError / json.JSONDecodeError: if the annotations file
            is missing or malformed.
        """
        self.data_dir = Path(data_dir)
        self.images_dir = self.data_dir / images_subdir

        annotations_path = self.data_dir / annotations_file
        print(f" Loading annotations from {annotations_path}...")

        with open(annotations_path) as f:
            coco_data = json.load(f)

        # COCO category ids are sparse (1..90 with gaps); remap them to
        # contiguous indices in the file's category order.
        self.cat_id_to_idx = {cat['id']: i for i, cat in enumerate(coco_data['categories'])}

        # Group annotations by image id so we make a single pass over images.
        img_to_anns: Dict[int, list] = {}
        for ann in coco_data['annotations']:
            img_to_anns.setdefault(ann['image_id'], []).append(ann)

        self.samples = []
        for img_info in coco_data['images']:
            img_id = img_info['id']
            if img_id not in img_to_anns:
                continue  # image has no annotations at all

            img_path = self.images_dir / img_info['file_name']
            if not img_path.exists():
                continue  # partial downloads are tolerated

            boxes = []
            labels = []
            for ann in img_to_anns[img_id]:
                # Skip crowd regions and annotations without a box.
                if 'bbox' not in ann or ann.get('iscrowd', 0):
                    continue

                # COCO bboxes are [x, y, w, h] in pixels; convert to
                # normalized [x1, y1, x2, y2] and clamp to the image.
                x, y, w, h = ann['bbox']
                x1 = x / img_info['width']
                y1 = y / img_info['height']
                x2 = (x + w) / img_info['width']
                y2 = (y + h) / img_info['height']

                x1, y1, x2, y2 = max(0, x1), max(0, y1), min(1, x2), min(1, y2)

                # Drop degenerate boxes (zero width/height after clamping):
                # they make meaningless IoU/regression targets downstream.
                if x2 <= x1 or y2 <= y1:
                    continue

                boxes.append([x1, y1, x2, y2])
                labels.append(self.cat_id_to_idx[ann['category_id']])

            # Only keep images that still have at least one usable box.
            if boxes:
                self.samples.append({
                    'image_path': str(img_path),
                    'boxes': boxes,
                    'labels': labels,
                    'width': img_info['width'],
                    'height': img_info['height']
                })

            if max_samples and len(self.samples) >= max_samples:
                break

        print(f" Loaded {len(self.samples):,} images with detections")

    def __len__(self):
        """Number of indexed images (not annotations)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the raw sample dict for image ``idx``."""
        return self.samples[idx]
142
+
143
+
144
class ExtendedTrainer:
    """Extended trainer with better loss functions.

    Fine-tunes only the detection and point heads (plus an on-demand vision
    adapter) on top of a frozen vision encoder and projector. Trains one
    image at a time (batch_size is effectively 1).

    Fixes vs. the previous version:
      - the per-epoch IoU-loss accumulator read the key 'giou_loss', but
        compute_loss() emits 'iou_loss', so the epoch stat was always 0;
      - train_step no longer swallows sample failures silently;
      - the LR schedule guards against division by zero for warmup_steps=0
        or tiny datasets.
    """

    def __init__(self, config: ExtendedTrainingConfig):
        self.config = config

        print("\n" + "=" * 60)
        print("🎯 OCULUS EXTENDED DETECTION TRAINER")
        print("=" * 60)

        self._load_model()
        self._load_dataset()
        self._create_optimizer()

        self.checkpoint_dir = Path(config.checkpoint_dir)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

    def _load_model(self):
        """Load model with trained projector and heads.

        Prefers resuming from an existing V2 checkpoint so repeated runs
        continue training rather than restarting from the V1 heads.
        """
        print("\n[Loading Model]")

        # Try to resume from V2 checkpoint first
        v2_checkpoint = Path("checkpoints/oculus_detection_v2/final")
        if v2_checkpoint.exists():
            print(f" ✨ Resuming from V2 checkpoint: {v2_checkpoint}")
            checkpoint_path = v2_checkpoint
        else:
            checkpoint_path = OCULUS_ROOT / self.config.checkpoint_path

        self.model = OculusForConditionalGeneration.from_pretrained(checkpoint_path)

        # Load existing detection heads if present (fresh heads otherwise).
        heads_path = checkpoint_path / "heads.pth"
        if heads_path.exists():
            # NOTE(review): torch.load unpickles arbitrary objects — only load
            # checkpoints from trusted sources.
            heads = torch.load(heads_path)
            self.model.detection_head.load_state_dict(heads['detection'])
            self.model.point_head.load_state_dict(heads['point'])
            print(" ✓ Loaded pre-trained detection heads")

        # Load vision encoders
        self.model.vision_encoder.load_encoders()

        # Freeze vision encoder and projector — only the heads learn.
        for param in self.model.vision_encoder.parameters():
            param.requires_grad = False
        for param in self.model.projector.parameters():
            param.requires_grad = False

        # Detection heads are trainable
        for param in self.model.detection_head.parameters():
            param.requires_grad = True
        for param in self.model.point_head.parameters():
            param.requires_grad = True

        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in self.model.parameters())
        print(f" ✓ Trainable: {trainable:,} / {total:,} parameters")

    def _load_dataset(self):
        """Load COCO detection dataset."""
        print("\n[Loading Dataset]")
        self.dataset = COCODetectionDataset(
            self.config.data_dir,
            self.config.annotations_file,
            self.config.images_subdir,
            max_samples=self.config.max_samples
        )

    def _create_optimizer(self):
        """Create the AdamW optimizer and a warmup + linear-decay scheduler."""
        print("\n[Optimizer]")

        params = list(self.model.detection_head.parameters()) + \
                 list(self.model.point_head.parameters())

        self.optimizer = torch.optim.AdamW(params, lr=self.config.learning_rate, weight_decay=0.01)

        # One optimizer step per sample, so total steps = epochs * dataset size.
        total_steps = self.config.num_epochs * len(self.dataset)
        # Guard against division by zero for warmup_steps=0 or tiny datasets.
        warmup_steps = max(1, self.config.warmup_steps)
        decay_steps = max(1, total_steps - warmup_steps)

        def lr_lambda(step):
            # Linear warmup, then linear decay with a 10% LR floor.
            if step < warmup_steps:
                return step / warmup_steps
            return max(0.1, 1.0 - (step - warmup_steps) / decay_steps)

        self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda)

        print(f" ✓ AdamW (lr={self.config.learning_rate}) + scheduler")

    def _compute_iou(self, box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor:
        """Compute IoU between two boxes [x1, y1, x2, y2], clamped to [0, 1]."""
        # Intersection rectangle corners.
        x1 = torch.max(box1[0], box2[0])
        y1 = torch.max(box1[1], box2[1])
        x2 = torch.min(box1[2], box2[2])
        y2 = torch.min(box1[3], box2[3])

        inter_w = torch.clamp(x2 - x1, min=0)
        inter_h = torch.clamp(y2 - y1, min=0)
        inter_area = inter_w * inter_h

        # Clamp areas so degenerate/inverted boxes cannot produce a zero or
        # negative union (which would blow up the division below).
        area1 = torch.clamp((box1[2] - box1[0]) * (box1[3] - box1[1]), min=1e-8)
        area2 = torch.clamp((box2[2] - box2[0]) * (box2[3] - box2[1]), min=1e-8)

        union_area = area1 + area2 - inter_area + 1e-8
        iou = inter_area / union_area

        return torch.clamp(iou, min=0.0, max=1.0)

    def compute_loss(
        self,
        vision_tokens: torch.Tensor,
        target_boxes: List[List[float]],
        target_labels: List[int]
    ) -> Tuple[torch.Tensor, Dict]:
        """Compute detection loss with IoU and classification.

        Each ground-truth box is greedily matched (by IoU, no-grad) to its
        best-overlapping predicted token; per-match we combine cross-entropy,
        Smooth-L1 box regression, and (1 - IoU).

        Returns:
            (total_loss, metrics) where metrics holds the averaged
            'cls_loss', 'box_loss', 'iou_loss' floats and 'num_matches'.
        """
        cls_logits, box_preds = self.model.detection_head(vision_tokens)

        num_tokens = vision_tokens.shape[1]

        # requires_grad=True keeps the totals differentiable even before any
        # match is accumulated (backward on the bare leaf is then a no-op).
        total_cls_loss = torch.tensor(0.0, requires_grad=True)
        total_box_loss = torch.tensor(0.0, requires_grad=True)
        total_iou_loss = torch.tensor(0.0, requires_grad=True)
        num_matches = 0

        for gt_idx, (gt_box, gt_label) in enumerate(zip(target_boxes, target_labels)):
            gt_box_t = torch.tensor(gt_box, dtype=torch.float32)
            gt_label_t = torch.tensor([gt_label], dtype=torch.long)

            pred_boxes = box_preds[0]  # [num_tokens, 4]

            # Find best matching prediction using IoU (matching itself must
            # not contribute gradients, hence no_grad).
            with torch.no_grad():
                ious = []
                for j in range(num_tokens):
                    iou = self._compute_iou(pred_boxes[j], gt_box_t)
                    ious.append(float(iou.detach()))
                best_idx = int(np.argmax(ious))

            # Classification loss
            cls_loss = F.cross_entropy(
                cls_logits[0, best_idx:best_idx+1],
                gt_label_t,
                label_smoothing=0.1
            )

            # Box regression loss (Smooth L1)
            box_loss = F.smooth_l1_loss(pred_boxes[best_idx], gt_box_t)

            # IoU loss (1 - IoU)
            iou = self._compute_iou(pred_boxes[best_idx], gt_box_t)
            iou_loss = 1.0 - iou

            total_cls_loss = total_cls_loss + cls_loss
            total_box_loss = total_box_loss + box_loss
            total_iou_loss = total_iou_loss + iou_loss
            num_matches += 1

        if num_matches > 0:
            total_cls_loss = total_cls_loss / num_matches
            total_box_loss = total_box_loss / num_matches
            total_iou_loss = total_iou_loss / num_matches

        # Combined loss: box regression weighted hardest, then IoU, then cls.
        total_loss = total_cls_loss + 5.0 * total_box_loss + 2.0 * total_iou_loss

        return total_loss, {
            'cls_loss': float(total_cls_loss.detach()),
            'box_loss': float(total_box_loss.detach()),
            'iou_loss': float(total_iou_loss.detach()),
            'num_matches': num_matches
        }

    def train_step(self, sample: Dict) -> Tuple[float, Dict]:
        """Single training step on one dataset sample.

        Returns (loss, metrics); (0.0, {}) signals a skipped/failed sample.
        """
        self.optimizer.zero_grad()

        try:
            image = Image.open(sample['image_path']).convert('RGB')

            # Frozen encoder: no gradients needed through feature extraction.
            with torch.no_grad():
                vision_features = self.model.vision_encoder(image)

            actual_dim = vision_features.shape[-1]
            expected_dim = self.model.config.fused_vision_dim

            # Lazily create a linear adapter if the encoder output width does
            # not match the projector's input, and register its parameters
            # with the optimizer so the adapter is trained too.
            if actual_dim != expected_dim:
                if self.model.vision_adapter is None:
                    self.model.vision_adapter = nn.Linear(actual_dim, expected_dim)
                    nn.init.xavier_uniform_(self.model.vision_adapter.weight)
                    nn.init.zeros_(self.model.vision_adapter.bias)
                    self.optimizer.add_param_group({
                        'params': self.model.vision_adapter.parameters()
                    })

                vision_features = self.model.vision_adapter(vision_features)

            vision_tokens = self.model.projector(vision_features)

            loss, metrics = self.compute_loss(
                vision_tokens,
                sample['boxes'],
                sample['labels']
            )

            if loss.requires_grad:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()
                self.scheduler.step()

            return float(loss.detach()), metrics

        except Exception as e:
            # Best-effort: skip unreadable/corrupt samples, but surface the
            # failure instead of swallowing it silently.
            print(f" ⚠️ Skipping sample {sample.get('image_path', '?')}: {e}")
            return 0.0, {}

    def save_checkpoint(self, step: int, loss: float, is_final: bool = False):
        """Save heads (+ adapter) and copy the projector/config alongside.

        Args:
            step: Global step number (used for the checkpoint dir name).
            loss: Loss recorded into state.json for bookkeeping.
            is_final: Write to "final" instead of a step-numbered directory.
        """
        if is_final:
            checkpoint_path = self.checkpoint_dir / "final"
        else:
            checkpoint_path = self.checkpoint_dir / f"step_{step:06d}"

        checkpoint_path.mkdir(parents=True, exist_ok=True)

        torch.save({
            'detection': self.model.detection_head.state_dict(),
            'point': self.model.point_head.state_dict(),
            'adapter': self.model.vision_adapter.state_dict() if self.model.vision_adapter else None,
        }, checkpoint_path / "heads.pth")

        # Copy the frozen projector weights and model config from the source
        # checkpoint so each saved directory is self-contained.
        import shutil
        src_projector = OCULUS_ROOT / self.config.checkpoint_path / "projector.npz"
        src_config = OCULUS_ROOT / self.config.checkpoint_path / "config.json"
        if src_projector.exists():
            shutil.copy(src_projector, checkpoint_path / "projector.npz")
        if src_config.exists():
            shutil.copy(src_config, checkpoint_path / "config.json")

        state = {'step': step, 'loss': loss}
        with open(checkpoint_path / "state.json", "w") as f:
            json.dump(state, f, indent=2)

        print(f" 💾 Checkpoint: {checkpoint_path}")

    def train(self):
        """Main training loop.

        Shuffles sample indices each epoch, steps per sample, logs/saves on
        the configured cadences, and writes a final checkpoint at the end.

        Returns:
            Path to the final checkpoint directory.
        """
        print("\n" + "=" * 60)
        print("🚀 STARTING EXTENDED TRAINING")
        print("=" * 60)
        print(f" Dataset: {len(self.dataset):,} samples")
        print(f" Epochs: {self.config.num_epochs}")
        print(f" Learning rate: {self.config.learning_rate}")

        global_step = 0
        best_loss = float('inf')  # tracked for bookkeeping at save points
        start_time = time.time()

        for epoch in range(self.config.num_epochs):
            print(f"\n📚 Epoch {epoch + 1}/{self.config.num_epochs}")
            print("-" * 40)

            indices = list(range(len(self.dataset)))
            random.shuffle(indices)

            epoch_loss = 0
            epoch_cls = 0
            epoch_box = 0
            epoch_iou = 0
            num_batches = 0

            for i, idx in enumerate(indices):
                sample = self.dataset[idx]

                loss, metrics = self.train_step(sample)

                # train_step returns exactly 0.0 for skipped samples.
                if loss == 0:
                    continue

                epoch_loss += loss
                epoch_cls += metrics.get('cls_loss', 0)
                epoch_box += metrics.get('box_loss', 0)
                # BUGFIX: compute_loss emits 'iou_loss' (the old code read
                # 'giou_loss', so this stat was always 0).
                epoch_iou += metrics.get('iou_loss', 0)
                num_batches += 1
                global_step += 1

                if global_step % self.config.log_every == 0:
                    elapsed = time.time() - start_time
                    avg_loss = epoch_loss / num_batches
                    lr = self.scheduler.get_last_lr()[0]
                    print(f" Step {global_step:5d} | Loss: {loss:.4f} | Avg: {avg_loss:.4f} | "
                          f"Cls: {metrics.get('cls_loss', 0):.3f} | Box: {metrics.get('box_loss', 0):.3f} | "
                          f"IoU: {metrics.get('iou_loss', 0):.3f} | LR: {lr:.6f} | {elapsed:.0f}s")

                if global_step % self.config.save_every == 0:
                    self.save_checkpoint(global_step, loss)
                    if loss < best_loss:
                        best_loss = loss

            avg_epoch_loss = epoch_loss / max(num_batches, 1)
            print(f"\n ✓ Epoch {epoch + 1} | Avg: {avg_epoch_loss:.4f} | "
                  f"Cls: {epoch_cls/max(num_batches,1):.3f} | "
                  f"Box: {epoch_box/max(num_batches,1):.3f} | "
                  f"IoU: {epoch_iou/max(num_batches,1):.3f}")

        print("\n" + "=" * 60)
        print("💾 Saving Final Model")
        print("=" * 60)

        self.save_checkpoint(global_step, avg_epoch_loss, is_final=True)

        print(f"✅ Training complete! Model: {self.checkpoint_dir / 'final'}")
        return self.checkpoint_dir / "final"
460
+
461
+
462
def main():
    """Entry point: run extended detection training, then the benchmarks."""
    config = ExtendedTrainingConfig(
        data_dir="data/coco",
        max_samples=5000,  # smaller than the 8000 default for this run
        num_epochs=4,
        learning_rate=3e-4,
        save_every=500,
        log_every=50,
    )

    trainer = ExtendedTrainer(config)
    model_path = trainer.train()

    # Run benchmarks after training
    print("\n" + "=" * 60)
    print("📊 RUNNING BENCHMARKS")
    print("=" * 60)

    # Benchmarks are best-effort: a missing eval harness must not crash the
    # process after the (expensive) training run has already completed.
    try:
        from eval_benchmarks import run_benchmarks
    except ImportError as e:
        print(f"⚠️ Skipping benchmarks (eval_benchmarks unavailable: {e})")
        return

    run_benchmarks(str(model_path), benchmarks=['coco', 'counting', 'vqa'])
482
+
483
+
484
# Script entry point: guard so that importing this module for its classes
# does not trigger a training run.
if __name__ == "__main__":
    main()