Upload 10 files
Browse files
- constants.py +30 -0
- dataset.py +349 -0
- decoder_language_model.py +165 -0
- finetune_lm_head_ce_loss.py +418 -0
- infer.py +504 -0
- model_components.py +163 -0
- train.py +264 -0
- train_stage_2.py +267 -0
- utils.py +57 -0
- vision_language_model.py +400 -0
constants.py
ADDED
@@ -0,0 +1,30 @@
import torch

IMAGE_SIZE = 512
PATCH_SIZE = 16
HIDDEN_DIM = 256
CONTEXT_LENGTH = 1536
TEXT_LENGTH = 512  # Max length for *target* sequence (coords)
PROMPT_LENGTH = 64  # Max length for *prompt* sequence (description) - adjust as needed
DROPOUT = 0.1
NUM_HEADS = 8
NUM_LAYERS = 12  # Keep a moderate number of layers
BATCH_SIZE = 16
LEARNING_RATE = 1e-3  # A lower LR may be needed with contrastive loss
DTYPE = torch.float32  # torch.bfloat16 created some instability - why?
GRAD_ACCUMULATION_STEPS = 16
IMAGE_MEAN = [0.485, 0.456, 0.406]
IMAGE_STD = [0.229, 0.224, 0.225]
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
IMAGE_LOCATION = "./images/"
NUM_BINS = 32
SHARED_EMBED_DIM = 256  # Dimension for the contrastive space
MAX_POINTS = 10  # Maximum number of points per image to handle

# Training loop constants
NUM_EPOCHS = 400  # Desired number of epochs
LOGGING_STEPS = 1  # Log every N optimization steps
MAX_GRAD_NORM = 1.0
LAMBDA_CONTRASTIVE = 2  # Weight for contrastive loss - tune this
LAMBDA_REGRESSION = 2  # Works but noisy
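A quick back-of-envelope check of these constants (not part of the upload), assuming the usual ViT patch count of (img_size / patch_size)², which is how the finetuning script below also computes the combined sequence length: the worst-case sequence of image patches plus prompt plus target can exceed CONTEXT_LENGTH, which is why the decoder's forward pass carries a truncation branch.

# Context-budget sanity check for the constants above (illustrative only).
IMAGE_SIZE, PATCH_SIZE = 512, 16
PROMPT_LENGTH, TEXT_LENGTH, CONTEXT_LENGTH = 64, 512, 1536

num_patches = (IMAGE_SIZE // PATCH_SIZE) ** 2             # 32 * 32 = 1024 patch tokens
worst_case = num_patches + PROMPT_LENGTH + TEXT_LENGTH    # 1024 + 64 + 512 = 1600
print(worst_case > CONTEXT_LENGTH)                        # True -> long samples get truncated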
dataset.py
ADDED
@@ -0,0 +1,349 @@
from tqdm.auto import tqdm
from constants import *
from utils import *
import pickle
import os
import torch
import torch.nn.functional as F
from PIL import Image
from torch.utils.data import Dataset, DataLoader

def format_point_text(points):
    # Handles any number of points, including zero
    text = "<result_start>"
    for point in points:
        # Clamp point coordinates (given in percent, [0, 100]) to the valid pixel range
        px = min(max(int(point.get('x', 50) * IMAGE_SIZE / 100), 0), IMAGE_SIZE - 1)  # .get for safety
        py = min(max(int(point.get('y', 50) * IMAGE_SIZE / 100), 0), IMAGE_SIZE - 1)
        x_bin = min(px // (IMAGE_SIZE // NUM_BINS), NUM_BINS - 1)
        y_bin = min(py // (IMAGE_SIZE // NUM_BINS), NUM_BINS - 1)
        text += f"<pointx_start><coord_bin_{x_bin}><pointx_end><pointy_start><coord_bin_{y_bin}><pointy_end>"
    text += "<result_end>" + tokenizer.eos_token
    return text
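# Worked example (hypothetical input): with IMAGE_SIZE=512 and NUM_BINS=32,
# a point {'x': 50, 'y': 25} (percent coordinates) maps to pixel (256, 128);
# the bin width is 512 // 32 = 16, so the bins are (16, 8), and the function yields:
#   "<result_start><pointx_start><coord_bin_16><pointx_end>"
#   "<pointy_start><coord_bin_8><pointy_end><result_end>" + tokenizer.eos_token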
def format_data_for_training(sample):
    """Format a data sample for training, handling 0 to MAX_POINTS continuous coordinates."""
    try:
        # Check that 'points' exists and is a list; otherwise treat it as 0 points
        sample_points = sample.get('points', [])
        if not isinstance(sample_points, list):
            print(f"Warning: Invalid 'points' type for {sample.get('image_url', 'N/A')}. Treating as 0 points.")
            sample_points = []

        # Limit the number of points processed
        points_to_process = sample_points[:MAX_POINTS]
        num_points = len(points_to_process)

        # Load the image - this is where most memory is used
        image_path = f"{IMAGE_LOCATION}{sample['image_url']}"

        # Check that the file exists before attempting to open it
        if not os.path.exists(image_path):
            print(f"Warning: Image not found: {image_path}. Skipping.")
            return None

        # Open the image with error handling
        try:
            image = Image.open(image_path)
            # Convert grayscale to RGB if needed
            if image.mode != 'RGB':
                image = image.convert('RGB')
            image_tensor = image_to_tensor(image)
            # Explicitly delete the PIL image to free memory
            del image
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            return None

        # Process the text with memory efficiency in mind
        prompt_text = f"<point_start>{sample['label']}<point_end>"
        # format_point_text correctly handles an empty points_to_process list
        target_text = format_point_text(points_to_process)

        # Tokenize with explicit max lengths
        prompt_tokens = tokenizer(prompt_text, return_tensors="pt", max_length=PROMPT_LENGTH,
                                  truncation=True, padding=False)
        target_tokens = tokenizer(target_text, return_tensors="pt", max_length=TEXT_LENGTH,
                                  truncation=True, padding=False)

        # Check for empty tokens after tokenization
        if prompt_tokens.input_ids.numel() == 0 or target_tokens.input_ids.numel() == 0:
            print(f"Warning: Empty tokens after tokenization for {sample.get('image_url', 'N/A')}. Skipping.")
            return None

        # --- Handle multiple continuous coordinates with padding (num_points == 0 is handled correctly) ---
        continuous_coords_list = []
        for point in points_to_process:  # This loop won't run if num_points is 0
            coord_x = min(max(point.get('x', 50) / 100.0, 0.0), 1.0)
            coord_y = min(max(point.get('y', 50) / 100.0, 0.0), 1.0)
            continuous_coords_list.append([coord_x, coord_y])

        # Pad the coordinates and create a mask; with no points, build an
        # all-padding tensor of the right shape.
        if num_points == 0:
            padded_coords = torch.full((MAX_POINTS, 2), -1.0)
            coords_mask = torch.zeros(MAX_POINTS)
        else:
            coords_tensor = torch.tensor(continuous_coords_list, dtype=torch.float32)
            padding_needed = MAX_POINTS - num_points
            padded_coords = F.pad(coords_tensor, (0, 0, 0, padding_needed), value=-1.0)
            coords_mask = torch.cat([torch.ones(num_points, dtype=torch.float32),
                                     torch.zeros(padding_needed, dtype=torch.float32)])

        # Create and return the formatted sample
        return {
            "image": image_tensor,
            "prompt_ids": prompt_tokens.input_ids[0],
            "target_ids": target_tokens.input_ids[0],
            "continuous_coords": padded_coords,
            "coords_mask": coords_mask,
            "num_points": num_points,
            "label": sample['label'],
            "image_url": sample['image_url']
        }
    except FileNotFoundError:
        print(f"Warning: Image not found: {sample.get('image_url', 'N/A')}. Skipping.")
        return None
    except Exception as e:
        print(f"Error formatting sample ({sample.get('image_url', 'N/A')}): {e}. Skipping.")
        import traceback
        traceback.print_exc()
        return None


class PointDataset(Dataset):
    def __init__(self, data_path="active_point_dataset.pkl", split="train", test_size=1000):
        with open(data_path, "rb") as f:
            raw_data = pickle.load(f)

        # Keep samples with 0 to MAX_POINTS points; check isinstance first so a
        # non-list 'points' value cannot crash the len() call.
        original_count = len(raw_data)
        raw_data = [sample for sample in raw_data
                    if isinstance(sample.get('points', []), list)
                    and 0 <= len(sample.get('points', [])) <= MAX_POINTS]
        filtered_count = len(raw_data)
        print(f"Original raw data size: {original_count}")
        print(f"Filtered raw data to {filtered_count} samples with 0 to {MAX_POINTS} points.")

        total_samples = len(raw_data)
        if total_samples == 0:
            raise ValueError("No samples left after filtering. Check the data or MAX_POINTS.")

        if total_samples <= test_size:
            print(f"Warning: Dataset size {total_samples} <= test_size {test_size}.")
            test_size = max(1, int(total_samples * 0.2)) if total_samples > 1 else 0
        train_end = total_samples - test_size
        print(f"Dataset: {total_samples} total (0 to {MAX_POINTS} points), {train_end} train, {test_size} test")

        # Split using the actual train/test counts
        if split == "train":
            if train_end <= 0:
                print("Warning: No samples allocated for the training split.")
            self.raw_data = raw_data[:train_end]
        elif split == "test":
            if test_size <= 0:
                print("Warning: No samples allocated for the test split.")
            self.raw_data = raw_data[train_end:]
        else:
            raise ValueError("split must be 'train' or 'test'")

        # Do NOT preprocess the data here - just store the raw records.
        # This is the key choice: images are not all loaded at once.
        print(f"Dataset initialized with {len(self.raw_data)} samples for {split}")

        # Optional: cache recently processed items to speed up repeated access.
        # Eviction is oldest-inserted-first (FIFO), not true LRU.
        self.cache_size = 8000  # Adjust based on memory constraints
        self.cache = {}

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, idx):
        # Return the cached item if present
        if idx in self.cache:
            return self.cache[idx]

        # Process the sample on demand
        sample = self.raw_data[idx]
        formatted = format_data_for_training(sample)

        # If processing failed, try the following samples (with wrapping)
        if formatted is None:
            next_idx = (idx + 1) % len(self.raw_data)

            # Cap the attempts to prevent an infinite loop if all samples are invalid
            attempts = 0
            while formatted is None and attempts < min(10, len(self.raw_data)):
                sample = self.raw_data[next_idx]
                formatted = format_data_for_training(sample)
                next_idx = (next_idx + 1) % len(self.raw_data)
                attempts += 1

            # If there is still no valid sample, fall back to a dummy sample
            if formatted is None:
                print(f"Warning: Failed to find a valid sample after {attempts} attempts")
                formatted = self._create_dummy_sample()

        # Update the cache, evicting the oldest entry (first key) when full
        if len(self.cache) >= self.cache_size:
            if self.cache:
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]

        self.cache[idx] = formatted
        return formatted

    def _create_dummy_sample(self):
        """Creates a minimal valid sample when all else fails."""
        # Empty image tensor
        image_tensor = torch.zeros(3, IMAGE_SIZE, IMAGE_SIZE)

        # Minimal tokens
        prompt_text = "<point_start>dummy<point_end>"
        target_text = "<result_start><result_end>" + tokenizer.eos_token

        prompt_tokens = tokenizer(prompt_text, return_tensors="pt").input_ids[0]
        target_tokens = tokenizer(target_text, return_tensors="pt").input_ids[0]

        # Empty coordinates
        padded_coords = torch.full((MAX_POINTS, 2), -1.0)
        coords_mask = torch.zeros(MAX_POINTS)

        return {
            "image": image_tensor,
            "prompt_ids": prompt_tokens,
            "target_ids": target_tokens,
            "continuous_coords": padded_coords,
            "coords_mask": coords_mask,
            "num_points": 0,
            "label": "dummy",
            "image_url": "none"
        }

    @staticmethod
    def collate_fn(batch):
        # Drop failed samples; padded coords and masks are stacked below
        batch = [item for item in batch if item is not None]
        if not batch:
            return None

        images = torch.stack([item['image'] for item in batch]).to(DTYPE)

        # --- Pad prompt IDs ---
        max_prompt_len = max(item['prompt_ids'].size(0) for item in batch)
        prompt_ids_padded, prompt_attention_mask = [], []
        for item in batch:
            ids, pad_len = item['prompt_ids'], max_prompt_len - item['prompt_ids'].size(0)
            prompt_ids_padded.append(torch.cat([ids, torch.full((pad_len,), tokenizer.pad_token_id, dtype=torch.long)]))
            prompt_attention_mask.append(torch.cat([torch.ones_like(ids, dtype=torch.long), torch.zeros(pad_len, dtype=torch.long)]))
        prompt_ids = torch.stack(prompt_ids_padded)
        prompt_attention_mask = torch.stack(prompt_attention_mask)

        # --- Pad target IDs & create generative (next-token) targets ---
        max_target_len = max(item['target_ids'].size(0) for item in batch)
        target_ids_padded, target_attention_mask, generative_targets = [], [], []
        for item in batch:
            ids, pad_len = item['target_ids'], max_target_len - item['target_ids'].size(0)
            padded_ids = torch.cat([ids, torch.full((pad_len,), tokenizer.pad_token_id, dtype=torch.long)])
            target_ids_padded.append(padded_ids)
            mask = torch.cat([torch.ones_like(ids, dtype=torch.long), torch.zeros(pad_len, dtype=torch.long)])
            target_attention_mask.append(mask)
            # Shift left by one for next-token prediction; ignored positions are -100
            targets = torch.full_like(padded_ids, -100)
            if ids.size(0) > 1:
                targets[:ids.size(0) - 1] = ids[1:]
            if ids.numel() > 0 and ids[-1] == tokenizer.eos_token_id:
                if ids.size(0) > 1:
                    targets[ids.size(0) - 1] = tokenizer.eos_token_id
                else:
                    targets[0] = -100
            generative_targets.append(targets)
        target_ids = torch.stack(target_ids_padded)
        target_attention_mask = torch.stack(target_attention_mask)
        generative_targets = torch.stack(generative_targets)

        # --- Stack continuous coords and masks ---
        continuous_coords = torch.stack([item['continuous_coords'] for item in batch])
        coords_mask = torch.stack([item['coords_mask'] for item in batch])
        num_points = [item['num_points'] for item in batch]

        labels = [item['label'] for item in batch]
        image_urls = [item.get('image_url', '') for item in batch]

        return {
            'image': images,
            'prompt_ids': prompt_ids,
            'prompt_attention_mask': prompt_attention_mask,
            'target_ids': target_ids,
            'target_attention_mask': target_attention_mask,
            'generative_targets': generative_targets,
            'continuous_coords': continuous_coords,
            'coords_mask': coords_mask,
            'num_points': num_points,
            'label': labels,
            'image_url': image_urls
        }
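# Worked example of the next-token shift in collate_fn above (hypothetical ids, EOS = 2, PAD = 0):
#   padded_ids = [5, 7, 9, 2, 0]      target ids ending in EOS, then one pad token
#   targets    = [7, 9, 2, 2, -100]   shifted left by one; the EOS position predicts
#                                     EOS again, and pad positions are ignored (-100)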
def create_train_dataloader(batch_size=BATCH_SIZE, num_workers=0, prefetch_factor=2):
    """Create the training dataloader with memory-efficient settings.

    Args:
        batch_size: Number of samples per batch.
        num_workers: Number of worker processes for data loading.
        prefetch_factor: Number of batches to prefetch per worker.

    Returns:
        DataLoader instance, or None if the dataset is empty.
    """
    dataset = PointDataset(split="train")
    if len(dataset) == 0:
        return None

    # Configure the DataLoader for memory efficiency
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=PointDataset.collate_fn,
        pin_memory=True,  # Speeds up CPU-to-GPU transfer
        num_workers=num_workers,
        prefetch_factor=prefetch_factor if num_workers > 0 else None,  # Only valid with workers
        persistent_workers=num_workers > 0,  # Keep workers alive between epochs
        drop_last=False  # Don't drop the last incomplete batch
    )

def create_test_dataloader(batch_size=BATCH_SIZE, num_workers=0, prefetch_factor=2):
    """Create the test dataloader with memory-efficient settings.

    Args:
        batch_size: Number of samples per batch.
        num_workers: Number of worker processes for data loading.
        prefetch_factor: Number of batches to prefetch per worker.

    Returns:
        DataLoader instance, or None if the dataset is empty.
    """
    dataset = PointDataset(split="test")
    if len(dataset) == 0:
        print("Warning: Test dataset is empty. Returning None.")
        return None

    # Same memory settings as the train loader, but no shuffling
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=PointDataset.collate_fn,
        pin_memory=True,
        num_workers=num_workers,
        prefetch_factor=prefetch_factor if num_workers > 0 else None,
        persistent_workers=num_workers > 0,
        drop_last=False
    )
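A minimal usage sketch for these factories (not part of the upload), assuming active_point_dataset.pkl and the ./images/ directory exist and that utils.image_to_tensor returns a (3, 512, 512) tensor:

from dataset import create_train_dataloader

loader = create_train_dataloader(batch_size=4, num_workers=0)
batch = next(iter(loader))
print(batch['image'].shape)               # torch.Size([4, 3, 512, 512])
print(batch['continuous_coords'].shape)   # torch.Size([4, 10, 2]), i.e. (MAX_POINTS, 2) per sample
print(batch['generative_targets'].shape)  # same shape as batch['target_ids']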
decoder_language_model.py
ADDED
@@ -0,0 +1,165 @@
from model_components import Block
from constants import *
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils import tokenizer, vocab_size

class DecoderLanguageModel(nn.Module):
    """
    Transformer decoder language model with an optional coordinate regression head.
    Processes a combined sequence of embeddings.
    Outputs logits for token prediction and, optionally, regressed coordinates (for MAX_POINTS).
    """
    def __init__(self, n_embd=HIDDEN_DIM, vocab_size=vocab_size, num_heads=NUM_HEADS,
                 n_layer=NUM_LAYERS, max_context=CONTEXT_LENGTH, dropout=DROPOUT):
        super().__init__()
        # --- Input embeddings ---
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(max_context, n_embd)
        self.dropout = nn.Dropout(dropout)

        # --- Transformer blocks ---
        self.blocks = nn.ModuleList([
            Block(n_embd, num_heads, dropout, is_decoder=True)
            for _ in range(n_layer)
        ])

        # --- Final layer norm ---
        self.ln_f = nn.LayerNorm(n_embd)

        # --- Output heads ---
        # 1. Head for token classification
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # 2. Head for direct coordinate regression (predicting MAX_POINTS * 2 values)
        self.regression_head = nn.Sequential(
            nn.Linear(n_embd, n_embd // 2),
            nn.GELU(),
            nn.Linear(n_embd // 2, MAX_POINTS * 2),  # Output MAX_POINTS * (x, y)
            nn.Sigmoid()  # Output activation in [0, 1]
        )
        # --- End output heads ---

        self.n_embd = n_embd
        self.max_context = max_context
        # Tie the token embedding weights to the LM head
        self.token_embedding_table.weight = self.lm_head.weight
        self.apply(self._init_weights)
        print(f"DecoderLanguageModel initialized with {n_layer} layers.")

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, combined_embeds, attention_mask=None, targets=None):
        """
        Forward pass for training or inference where the loss is calculated.
        Regression output is handled *outside* this module, by the VLM.
        """
        # --- Input validation & processing ---
        if combined_embeds.ndim != 3:
            raise ValueError(f"DecoderLM received non-3D combined_embeds! Shape: {combined_embeds.shape}")
        B, T, C = combined_embeds.shape
        if T > self.max_context:
            # Context truncation: keep the most recent max_context positions
            print(f"WARNING (Decoder forward): Input sequence length {T} > max context {self.max_context}. Truncating.")
            combined_embeds = combined_embeds[:, -self.max_context:, :]
            if attention_mask is not None:
                attention_mask = attention_mask[:, -self.max_context:]
            if targets is not None:
                targets = targets[:, -self.max_context:]
            T = self.max_context

        # --- Positional encoding ---
        pos = torch.arange(0, T, dtype=torch.long, device=combined_embeds.device)
        pos = pos.clamp(max=self.position_embedding_table.num_embeddings - 1)
        pos_emb = self.position_embedding_table(pos)  # Shape: (T, C)
        x = combined_embeds + pos_emb.unsqueeze(0)
        x = self.dropout(x)

        # --- Transformer blocks ---
        for block in self.blocks:
            x = block(x, attention_mask=attention_mask)

        # --- Final layer norm ---
        x_norm = self.ln_f(x)  # Shape: (B, T, C) - passed out for the VLM's regression head

        # --- Classification head output ---
        logits = self.lm_head(x_norm)  # Shape: (B, T, vocab_size)

        # --- Classification loss calculation ---
        class_loss = None
        if targets is not None:
            try:
                class_loss = F.cross_entropy(
                    logits.view(-1, logits.size(-1)),
                    targets.view(-1),
                    ignore_index=-100
                )
                if torch.isnan(class_loss):
                    print("Warning: class_loss is NaN.")
                    class_loss = None
            except Exception as e:
                print(f"Error calculating cross_entropy: {e}")
                print(f"Logits shape: {logits.shape}, Targets shape: {targets.shape}")
                class_loss = None

        # Return logits, class_loss, and the final normalized hidden states
        return logits, class_loss, x_norm

    # --- Generation method (used if the VLM needs this class to generate from token IDs) ---
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Autoregressive generation from starting token IDs.
        NOTE: This version doesn't handle combined embeddings directly.
        The VisionLanguageModel should ideally use a method like
        generate_from_embeddings, or implement the loop externally.
        """
        self.eval()
        for _ in range(max_new_tokens):
            # --- Context management: crop idx if longer than the context length ---
            idx_cond = idx if idx.size(1) <= self.max_context else idx[:, -self.max_context:]

            # --- Forward pass ---
            tok_embeds = self.token_embedding_table(idx_cond)  # (B, T, C)
            pos = torch.arange(0, idx_cond.size(1), dtype=torch.long, device=idx.device)
            pos = pos.clamp(max=self.max_context - 1)
            pos_emb = self.position_embedding_table(pos).unsqueeze(0)  # (1, T, C)
            x = self.dropout(tok_embeds + pos_emb)
            # Pass through the blocks (no padding mask needed for a single sequence)
            for block in self.blocks:
                x = block(x, attention_mask=None)  # The causal mask is internal to the block/head
            # Final layer norm and head for the last token only
            x = self.ln_f(x[:, -1:, :])  # (B, 1, C)
            logits = self.lm_head(x)     # (B, 1, V)
            logits = logits.squeeze(1)   # (B, V)

            # --- Sampling ---
            logits = logits / temperature
            if top_k is not None and top_k > 0:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the sampled token
            idx = torch.cat((idx, idx_next), dim=1)

            # Stop once every sequence has produced EOS
            if hasattr(tokenizer, 'eos_token_id') and (idx_next == tokenizer.eos_token_id).all():
                break
        self.train()
        return idx
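A minimal smoke test for the decoder (not part of the upload), assuming model_components.Block matches the (n_embd, num_heads, dropout, is_decoder) constructor and the block(x, attention_mask=...) call used above:

import torch
from decoder_language_model import DecoderLanguageModel

model = DecoderLanguageModel()
embeds = torch.randn(2, 10, 256)      # (B, T, n_embd); 256 = HIDDEN_DIM
logits, loss, hidden = model(embeds)  # no targets -> loss is None
print(logits.shape, hidden.shape)     # (2, 10, vocab_size) and (2, 10, 256)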
finetune_lm_head_ce_loss.py
ADDED
@@ -0,0 +1,418 @@
# finetune_lm_head_ce_loss.py
# Usage: python finetune_lm_head_ce_loss.py --pretrained_model_path model_regression_multi_stage_2_11.pth

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR  # Cosine decay for fine-tuning
from tqdm.auto import tqdm
import wandb
from datetime import datetime
import numpy as np
import argparse
import os
import math
import traceback  # For detailed error printing
from constants import *

try:
    # get_tokenizer should define the global tokenizer and vocab_size
    from utils import get_tokenizer, tokenizer, vocab_size, tensor_to_image, image_to_tensor
    if 'tokenizer' not in globals() or 'vocab_size' not in globals():
        print("Initializing tokenizer...")
        tokenizer, vocab_size = get_tokenizer()
except ImportError:
    print("Error: Could not import required functions/variables from utils.py.")
    exit()
except NameError:
    print("Error: tokenizer or vocab_size not defined after importing utils.")
    exit()
except Exception as e:
    print(f"Error during utils import or tokenizer init: {e}")
    exit()

try:
    # The dataset must handle 0 points and the MAX_POINTS filter;
    # collate_fn must return all necessary keys, including 'generative_targets'.
    from dataset import create_train_dataloader, create_test_dataloader, PointDataset
except ImportError:
    print("Error: Could not import from dataset.py.")
    exit()

try:
    # VisionLanguageModel.__init__ must match the one used in the training script;
    # DecoderLanguageModel etc. must also be available.
    from vision_language_model import VisionLanguageModel
except ImportError:
    print("Error: Could not import VisionLanguageModel from vision_language_model.py.")
    exit()

# --- Fine-tuning specific arguments ---
parser = argparse.ArgumentParser(description="Re-initialize and fine-tune the LM head using ONLY classification loss.")
parser.add_argument("--pretrained_model_path", type=str, required=True, help="Path to the pre-trained model state_dict (.pth file).")
parser.add_argument("--output_model_path", type=str, default="model_lm_reinit_ce_finetuned.pth", help="Path to save the fine-tuned model.")
parser.add_argument("--ft_epochs", type=int, default=10, help="Number of epochs for fine-tuning.")
parser.add_argument("--ft_lr", type=float, default=5e-4, help="Learning rate for fine-tuning.")
parser.add_argument("--ft_batch_size", type=int, default=BATCH_SIZE, help="Batch size for fine-tuning.")
parser.add_argument("--ft_grad_accum", type=int, default=GRAD_ACCUMULATION_STEPS, help="Gradient accumulation steps.")
parser.add_argument("--ft_log_steps", type=int, default=1, help="Logging frequency.")
parser.add_argument("--train_final_ln", action='store_true', help="Also train the final LayerNorm (ln_f) before the lm_head.")
parser.add_argument("--wandb_project", type=str, default="point-lm-head-reinit-ce-finetune", help="WandB project name.")


if __name__ == "__main__":
    args = parser.parse_args()

    # Use constants/args consistently
    FT_BATCH_SIZE = args.ft_batch_size
    FT_GRAD_ACCUM = args.ft_grad_accum
    FT_LOG_STEPS = args.ft_log_steps

    print(f"Using device: {DEVICE}")
    print(f"Re-initializing and fine-tuning the LM head (and final LN: {args.train_final_ln})")
    print("Using ONLY classification (cross-entropy) loss")
    print(f"Pretrained model: {args.pretrained_model_path}")
    print(f"Output model: {args.output_model_path}")
    print(f"Epochs: {args.ft_epochs}, LR: {args.ft_lr}, Batch size: {FT_BATCH_SIZE}, Grad accum: {FT_GRAD_ACCUM}")

    # --- Load the model definition ---
    print("Loading model definition...")
    try:
        # Use parameters consistent with the pre-trained model's architecture
        model_args = {
            'n_embd': HIDDEN_DIM, 'vocab_size': vocab_size, 'img_size': IMAGE_SIZE, 'patch_size': PATCH_SIZE,
            'num_heads': NUM_HEADS, 'num_blks_vit': NUM_LAYERS, 'num_blks_dec': NUM_LAYERS,
            'emb_dropout': 0.0, 'blk_dropout': 0.0, 'max_context': CONTEXT_LENGTH,
            'shared_embed_dim': SHARED_EMBED_DIM,
            # Auxiliary losses are disabled for this fine-tuning stage
            'lambda_contrastive': 0.0,
            'lambda_regression': 0.0,
            'max_points': MAX_POINTS
        }
        model = VisionLanguageModel(**model_args).to(DEVICE)
    except Exception as e:
        print(f"Error initializing model structure: {e}")
        exit()

    # --- Load the pre-trained weights ---
    print(f"Loading pre-trained weights from: {args.pretrained_model_path}")
    try:
        # strict=False in case parameter names differ slightly
        model.load_state_dict(torch.load(args.pretrained_model_path, map_location=DEVICE, weights_only=True), strict=False)
        print("Pre-trained weights loaded successfully.")
    except FileNotFoundError:
        print(f"Error: Pre-trained model file not found at {args.pretrained_model_path}")
        exit()
    except Exception as e:
        print(f"Error loading model state_dict: {e}")
        exit()

    # --- Reinitialize the LM head ---
    print("Reinitializing LM head...")
    model.decoder.lm_head.reset_parameters()
    # Explicitly re-tie the embedding weights AFTER reinitialization
    model.decoder.token_embedding_table.weight = model.decoder.lm_head.weight
    print("LM head reinitialized and weights explicitly retied.")

    # --- Freeze/unfreeze parameters (done ONCE before the loop) ---
    print("Setting requires_grad flags...")
    params_to_optimize = []
    trainable_param_names = []
    total_params = sum(p.numel() for p in model.parameters())

    for param in model.parameters():
        param.requires_grad = False  # Freeze everything first

    print("\nParameters explicitly marked as trainable:")
    for name, param in model.decoder.lm_head.named_parameters():
        param.requires_grad = True
        params_to_optimize.append(param)
        trainable_param_names.append(f"decoder.lm_head.{name}")

    if args.train_final_ln:
        for name, param in model.decoder.ln_f.named_parameters():
            param.requires_grad = True
            params_to_optimize.append(param)
            trainable_param_names.append(f"decoder.ln_f.{name}")

    # --- Create the optimizer over exactly this parameter list ---
    print("\nParameters passed to optimizer:")
    for name in trainable_param_names:
        print(f"- {name}")
    trainable_params_count = sum(p.numel() for p in params_to_optimize)
    print(f"\nTotal parameters: {total_params}")
    print(f"Trainable parameters (optimizer target): {trainable_params_count} ({100 * trainable_params_count / total_params:.2f}%)")

    # Verification print
    print("\nVerification: all parameters with requires_grad=True:")
    actual_trainable_count = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            is_in_optimize_list = any(p is param for p in params_to_optimize)
            print(f"- {name} (Requires grad: {param.requires_grad}, In optimizer list: {is_in_optimize_list})")
            actual_trainable_count += param.numel()
    print(f"Actual trainable count (incl. tied): {actual_trainable_count}")

    if not params_to_optimize:
        print("Error: No parameters collected for the optimizer.")
        exit()
    optimizer = torch.optim.AdamW(params_to_optimize, lr=args.ft_lr, betas=(0.9, 0.95), weight_decay=0.1)
    print("Optimizer created.")

    # --- Dataloaders & scheduler ---
    print("Creating dataloaders...")
    train_loader = create_train_dataloader(batch_size=FT_BATCH_SIZE, num_workers=4)
    test_loader = create_test_dataloader(batch_size=FT_BATCH_SIZE, num_workers=2)
    if train_loader is None:
        exit("Training loader failed to initialize.")
    test_loader_has_data = test_loader and len(test_loader.dataset) > 0
    scheduler = None
    if train_loader and len(train_loader) > 0:
        steps_per_epoch = (len(train_loader) // FT_GRAD_ACCUM) + (1 if len(train_loader) % FT_GRAD_ACCUM != 0 else 0)
        total_steps = steps_per_epoch * args.ft_epochs
        print(f"Fine-tuning: total estimated optimization steps: {total_steps}")
        scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=args.ft_lr / 10)
    else:
        print("Warning: Train loader empty. Cannot set up the scheduler.")

    # --- Wandb setup ---
    wandb_enabled = False
    try:
        wandb.init(
            project=args.wandb_project,
            name=f"lm-head-reinit-ce-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
            config={"fine_tuning_lr": args.ft_lr, "fine_tuning_epochs": args.ft_epochs, "batch_size": FT_BATCH_SIZE,
                    "grad_accum": FT_GRAD_ACCUM, "pretrained_model": args.pretrained_model_path,
                    "train_final_ln": args.train_final_ln, "loss": "Classification Only"})
        wandb_enabled = True
    except Exception as e:
        print(f"Wandb initialization failed: {e}.")

    # --- Fine-tuning loop ---
    print("Starting LM head re-init fine-tuning with classification loss...")
    torch.autograd.set_detect_anomaly(True)
    step_counter = 0
    optimizer.zero_grad()

    for epoch in range(args.ft_epochs):
        model.train()  # Set dropout/layernorm layers to train mode

        epoch_class_loss_accum = 0.0
        valid_batches_accum = 0
        pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"FT Epoch {epoch+1}/{args.ft_epochs}", leave=False)

        for batch_idx, batch in pbar:
            if batch is None:
                continue
            # --- Unpack the data ---
            try:
                images = batch['image'].to(DEVICE, non_blocking=True).to(DTYPE)
                prompt_ids = batch['prompt_ids'].to(DEVICE, non_blocking=True)
                prompt_attention_mask = batch['prompt_attention_mask'].to(DEVICE, non_blocking=True)
                target_ids = batch['target_ids'].to(DEVICE, non_blocking=True)
                target_attention_mask = batch['target_attention_mask'].to(DEVICE, non_blocking=True)
                generative_targets = batch['generative_targets'].to(DEVICE, non_blocking=True)  # Needed for the loss
                # Auxiliary inputs may be None; the model forward should handle that
                continuous_coords = batch.get('continuous_coords')
                coords_mask = batch.get('coords_mask')
                num_points_list = batch.get('num_points')
                if continuous_coords is not None:
                    continuous_coords = continuous_coords.to(DEVICE, non_blocking=True)
                if coords_mask is not None:
                    coords_mask = coords_mask.to(DEVICE, non_blocking=True)
            except KeyError as e:
                print(f"KeyError unpacking batch: {e}")
                continue
            except Exception as e:
                print(f"Error unpacking batch: {e}")
                continue

            # --- Forward pass ---
            # Run the full model normally; autograd respects the requires_grad flags.
            try:
                # Only the logits output is needed from the main model call
                logits, _, _, _, _, _, *_ = model(
                    img_array=images, prompt_ids=prompt_ids, prompt_attention_mask=prompt_attention_mask,
                    target_ids=target_ids, target_attention_mask=target_attention_mask,
                    generative_targets=generative_targets,  # The model may use these internally
                    continuous_coords=continuous_coords, coords_mask=coords_mask,
                )
                if logits is None or not torch.isfinite(logits).all():
                    print(f"!!! ERROR: NaN/Inf/None detected in logits. Skipping batch {batch_idx}. !!!")
                    optimizer.zero_grad()
                    continue
            except Exception as e:
                print(f"!!! ERROR during forward pass: {e} !!!")
                traceback.print_exc()
                optimizer.zero_grad()
                continue

            # --- Calculate the classification loss EXTERNALLY ---
            loss_to_backward = None
            try:
                # Get the batch size and vocab size from the logits
                B, T_logits, V = logits.shape

                # --- Prepare PADDED targets for the external CE loss ---
                # Pad generative_targets to match T_logits
                B_targ, T_target_orig = generative_targets.shape
                N_img = model.num_patches
                T_prompt = prompt_ids.shape[1]
                T_combined_expected = N_img + T_prompt + T_target_orig  # Expected full length

                if T_logits != T_combined_expected:
                    # Handle potential truncation due to the context length
                    print(f"Warning: Logits length {T_logits} != expected combined length {T_combined_expected}. Adjusting targets.")
                    T_target_in_logits = max(0, T_logits - (N_img + T_prompt))
                    generative_targets_sliced = generative_targets[:, :T_target_in_logits]
                    combined_class_targets = torch.cat([
                        torch.full((B, T_logits - T_target_in_logits), -100, dtype=torch.long, device=DEVICE),
                        generative_targets_sliced
                    ], dim=1)
                else:
                    # Pad generative_targets normally: image and prompt positions are ignored
                    combined_class_targets = torch.cat([
                        torch.full((B, N_img + T_prompt), -100, dtype=torch.long, device=DEVICE),
                        generative_targets
                    ], dim=1)

                # Verify the shapes before the loss calculation
                if logits.shape[1] != combined_class_targets.shape[1]:
                    raise ValueError(f"Shape mismatch before CE loss! Logits T={logits.shape[1]}, Targets T={combined_class_targets.shape[1]}")

                # Calculate the loss on the logits that require grad and the padded targets
                loss_to_backward = F.cross_entropy(
                    logits.view(-1, V),               # Shape (B * T_logits, V)
                    combined_class_targets.view(-1),  # Shape (B * T_logits)
                    ignore_index=-100
                )

                if not torch.isfinite(loss_to_backward):
                    print(f"Warning: NaN/Inf detected in calculated class_loss ({loss_to_backward}).")
                    loss_to_backward = None
            except Exception as e:
                print(f"Error calculating external CE loss: {e}")
                loss_to_backward = None

            # Check the loss before backward
            if loss_to_backward is None:
                print(f"Warning: Skipping batch {batch_idx} due to invalid loss calculation.")
                optimizer.zero_grad()
                continue

            # --- Verification ---
            if loss_to_backward.grad_fn is None:
                print(f"!!! ERROR: loss_to_backward (value: {loss_to_backward.item()}) has no grad_fn! Batch {batch_idx} !!!")
                optimizer.zero_grad()
                continue

            # Accumulate for logging
            epoch_class_loss_accum += loss_to_backward.item()
            valid_batches_accum += 1
            scaled_loss = loss_to_backward / FT_GRAD_ACCUM

            # --- Backward pass ---
            try:
                scaled_loss.backward()
            except RuntimeError as e:
                print(f"!!! RUNTIME ERROR in backward: {e} !!!")
                optimizer.zero_grad()
                continue

            # --- Gradient accumulation step ---
            if (batch_idx + 1) % FT_GRAD_ACCUM == 0 or (batch_idx + 1) == len(train_loader):
                # Check/clip the gradients of the OPTIMIZED parameters
                found_non_finite_grad = False
                for p in params_to_optimize:
                    if p.grad is not None and not torch.isfinite(p.grad).all():
                        print("!!! WARNING: NaN/Inf gradient BEFORE step. Skipping step. !!!")
                        found_non_finite_grad = True
                        break
                if found_non_finite_grad:
                    optimizer.zero_grad()
                    continue

                grad_norm = torch.nn.utils.clip_grad_norm_(params_to_optimize, MAX_GRAD_NORM)
                if not torch.isfinite(grad_norm):
                    print(f"!!! WARNING: Grad norm NaN/Inf ({grad_norm.item()}) AFTER clipping. Skipping step. !!!")
                    optimizer.zero_grad()
                    continue

                optimizer.step()
                if scheduler:
                    scheduler.step()
                optimizer.zero_grad()
                step_counter += 1

                # --- Logging ---
                if step_counter % FT_LOG_STEPS == 0 and valid_batches_accum > 0:
                    avg_class_loss = epoch_class_loss_accum / valid_batches_accum
                    current_lr = optimizer.param_groups[0]['lr']
                    # --- Test evaluation (class loss only) ---
                    test_class_loss_val = float('nan')
                    if test_loader_has_data:
                        model.eval()
                        with torch.no_grad():
                            try:
                                test_batch = next(iter(test_loader))
                                if test_batch:
                                    # Unpack the test data needed for a forward pass -> logits
                                    t_images = test_batch['image'].to(DEVICE).to(DTYPE)
                                    t_p_ids = test_batch['prompt_ids'].to(DEVICE)
                                    t_p_mask = test_batch['prompt_attention_mask'].to(DEVICE)
                                    t_t_ids = test_batch['target_ids'].to(DEVICE)
                                    t_t_mask = test_batch['target_attention_mask'].to(DEVICE)
                                    t_gen_targets = test_batch['generative_targets'].to(DEVICE)  # Needed for the external CE calc
                                    # Other args may be None if the model handles it
                                    t_cont_coords = test_batch.get('continuous_coords')
                                    t_coords_mask = test_batch.get('coords_mask')
                                    t_num_pts = test_batch.get('num_points')
                                    if t_cont_coords is not None:
                                        t_cont_coords = t_cont_coords.to(DEVICE)
                                    if t_coords_mask is not None:
                                        t_coords_mask = t_coords_mask.to(DEVICE)

                                    # Run forward just to get the logits
                                    logits_t, _, _, _, _, _, *_ = model(
                                        t_images, t_p_ids, t_p_mask, t_t_ids, t_t_mask, t_gen_targets,
                                        t_cont_coords, t_coords_mask
                                    )

                                    # Calculate the CE loss externally for logging
                                    if logits_t is not None and t_gen_targets is not None:
                                        try:
                                            # Prepare padded targets matching the logits_t shape
                                            B_test, T_logits_t, V_test = logits_t.shape
                                            _, T_target_orig_t = t_gen_targets.shape
                                            N_img_test = model.num_patches
                                            T_prompt_test = t_p_ids.shape[1]
                                            T_combined_expected_t = N_img_test + T_prompt_test + T_target_orig_t

                                            if T_logits_t != T_combined_expected_t:
                                                T_target_in_logits_t = max(0, T_logits_t - (N_img_test + T_prompt_test))
                                                generative_targets_sliced_t = t_gen_targets[:, :T_target_in_logits_t]
                                                combined_class_targets_t = torch.cat([
                                                    torch.full((B_test, T_logits_t - T_target_in_logits_t), -100, dtype=torch.long, device=DEVICE),
                                                    generative_targets_sliced_t
                                                ], dim=1)
                                            else:
                                                combined_class_targets_t = torch.cat([
                                                    torch.full((B_test, N_img_test + T_prompt_test), -100, dtype=torch.long, device=DEVICE),
                                                    t_gen_targets
                                                ], dim=1)

                                            if logits_t.shape[1] != combined_class_targets_t.shape[1]:
                                                raise ValueError("Shape mismatch in test CE!")

                                            t_class_loss = F.cross_entropy(logits_t.view(-1, V_test), combined_class_targets_t.view(-1), ignore_index=-100)
                                            test_class_loss_val = t_class_loss.item() if torch.isfinite(t_class_loss) else float('nan')
                                        except Exception as e_ce_test:
                                            print(f"Error in test CE: {e_ce_test}")
                            except StopIteration:
                                print("Info: Test loader exhausted during logging.")
                            except Exception as e:
                                print(f"Error during test eval: {e}")
                        model.train()  # Set back to train mode

                    # Log data
                    log_data = {
                        "train/class_loss": avg_class_loss,
                        "test/class_loss": test_class_loss_val,
                        "epoch": epoch + ((batch_idx + 1) / len(train_loader)),
                        "step": step_counter,
                        "learning_rate": current_lr,
                        "gradient_norm": grad_norm.item() if torch.is_tensor(grad_norm) else float('nan'),
                    }
                    pbar.set_postfix({"lr": f"{current_lr:.2e}", "cls_loss": f"{avg_class_loss:.4f}", "gnorm": f"{log_data['gradient_norm']:.3f}"})
                    if wandb_enabled:
                        wandb.log(log_data, step=step_counter)

                    # Reset the accumulators
                    epoch_class_loss_accum = 0.0
                    valid_batches_accum = 0

        # --- End of epoch ---
        print(f"\nFT Epoch {epoch+1}/{args.ft_epochs} completed.")
        # Optional: save a checkpoint periodically
        if (epoch + 1) % 5 == 0 or (epoch + 1) == args.ft_epochs:
            chkpt_path = args.output_model_path.replace(".pth", f"_epoch{epoch+1}.pth")
            try:
                torch.save(model.state_dict(), chkpt_path)
                print(f"Checkpoint saved to: {chkpt_path}")
            except Exception as e:
                print(f"Error saving checkpoint: {e}")

    # --- End of fine-tuning ---
    print("\nLM head fine-tuning with CE loss completed!")
    try:
        torch.save(model.state_dict(), args.output_model_path)
        print(f"Fine-tuned model saved to: {args.output_model_path}")
    except Exception as e:
        print(f"Error saving fine-tuned model: {e}")

    if wandb_enabled:
        wandb.finish()
    torch.autograd.set_detect_anomaly(False)  # Disable anomaly detection
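The external CE loss above hinges on masking every image-patch and prompt position with -100 so that only the target span contributes to cross-entropy. A tiny self-contained illustration with hypothetical sizes:

import torch

N_img, T_prompt, B = 4, 2, 1                 # hypothetical patch/prompt lengths
gen_targets = torch.tensor([[11, 12, 13]])   # shifted next-token ids for the target span
combined = torch.cat([
    torch.full((B, N_img + T_prompt), -100, dtype=torch.long),
    gen_targets,
], dim=1)
print(combined)  # tensor([[-100, -100, -100, -100, -100, -100, 11, 12, 13]])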
infer.py
ADDED
@@ -0,0 +1,504 @@
| 1 |
+
from constants import *
|
| 2 |
+
from utils import image_to_tensor, tokenizer, tensor_to_image, vocab_size, tokenizer
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from PIL import ImageDraw, Image
|
| 6 |
+
from dataset import create_test_dataloader
|
| 7 |
+
from vision_language_model import VisionLanguageModel
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
model = VisionLanguageModel(
|
| 11 |
+
n_embd=HIDDEN_DIM,
|
| 12 |
+
vocab_size=vocab_size,
|
| 13 |
+
img_size=IMAGE_SIZE,
|
| 14 |
+
patch_size=PATCH_SIZE,
|
| 15 |
+
num_heads=NUM_HEADS,
|
| 16 |
+
num_blks_vit=NUM_LAYERS, # Or specific value for ViT layers
|
| 17 |
+
num_blks_dec=NUM_LAYERS, # Or specific value for Decoder layers
|
| 18 |
+
emb_dropout=DROPOUT,
|
| 19 |
+
blk_dropout=DROPOUT,
|
| 20 |
+
max_context=CONTEXT_LENGTH,
|
| 21 |
+
shared_embed_dim=SHARED_EMBED_DIM,
|
| 22 |
+
lambda_contrastive=LAMBDA_CONTRASTIVE,
|
| 23 |
+
lambda_regression=LAMBDA_REGRESSION # Pass the regression weight
|
| 24 |
+
).to(DEVICE)
|
| 25 |
+
|
| 26 |
+
MODEL_PATH = "model_regression_multi_first_100.pth" # "model_regression_multi_16.pth"
|
| 27 |
+
|
| 28 |
+
if DEVICE == "cuda":
|
| 29 |
+
model.load_state_dict(torch.load(MODEL_PATH, weights_only=True))
|
| 30 |
+
else:
|
| 31 |
+
model.load_state_dict(torch.load(MODEL_PATH, weights_only=True, map_location=torch.device('cpu')))
|
| 32 |
+
model.eval()
|
| 33 |
+
|
| 34 |
+
def generate_sample_from_image_text(
|
| 35 |
+
model,
|
| 36 |
+
image_path,
|
| 37 |
+
prompt_label,
|
| 38 |
+
tokenizer,
|
| 39 |
+
device,
|
| 40 |
+
max_new_tokens=70,
|
| 41 |
+
temperature=0.8,
|
| 42 |
+
top_k=10,
|
| 43 |
+
output_path="generated_output.png"
|
| 44 |
+
):
|
| 45 |
+
"""
|
| 46 |
+
Generates a prediction for an image and prompt text and saves it to a file.
|
| 47 |
+
Generation loop is implemented *within* this function.
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
model: The trained VisionLanguageModel.
|
| 51 |
+
image_path: Path to the input image.
|
| 52 |
+
prompt_label: Text prompt/label to use.
|
| 53 |
+
tokenizer: The tokenizer used for training.
|
| 54 |
+
device: The computation device ('cuda' or 'cpu').
|
| 55 |
+
max_new_tokens (int): Max tokens to generate after the prompt.
|
| 56 |
+
temperature (float): Softmax temperature for sampling.
|
| 57 |
+
top_k (int): K for top-k sampling (0 or None to disable).
|
| 58 |
+
output_path (str): Path where to save the output image.
|
| 59 |
+
|
| 60 |
+
Returns:
|
| 61 |
+
None. Saves the image with prompt and generated output to a file.
|
| 62 |
+
"""
|
| 63 |
+
model.eval() # Set the model to evaluation mode
|
| 64 |
+
|
| 65 |
+
try:
|
| 66 |
+
with torch.no_grad(): # No need to track gradients during inference
|
| 67 |
+
# --- 1. Prepare Initial Inputs ---
|
| 68 |
+
# Load and process image
|
| 69 |
+
image = Image.open(image_path)
|
| 70 |
+
image_tensor = image_to_tensor(image).unsqueeze(0).to(device) # Add batch dim
|
| 71 |
+
|
| 72 |
+
# Tokenize prompt
|
| 73 |
+
prompt_text = f"<point_start>{prompt_label}<point_end>"
|
| 74 |
+
prompt_tokens = tokenizer(prompt_text, return_tensors="pt", truncation=True, padding=False)
|
| 75 |
+
prompt_ids = prompt_tokens.input_ids.to(device)
|
| 76 |
+
prompt_attention_mask = prompt_tokens.attention_mask.to(device)
|
| 77 |
+
B = 1 # We are processing one sample at a time
|
| 78 |
+
|
| 79 |
+
print(f"--- Generating Sample (Manual Loop) ---")
|
| 80 |
+
print(f"Original Label/Prompt Hint: {prompt_label}")
|
| 81 |
+
print(f"Input Prompt Tokens Decoded: {prompt_text}")
|
| 82 |
+
|
| 83 |
+
# --- 2. Pre-compute Image & Prompt Embeddings (Part of VLM Forward Logic) ---
|
| 84 |
+
image_embeds_raw = model.vision_encoder(image_tensor) # (1, N_img, C)
|
| 85 |
+
image_embeds_decoder = model.multimodal_projector(image_embeds_raw) # (1, N_img, C)
|
| 86 |
+
prompt_embeds_decoder = model.decoder.token_embedding_table(prompt_ids) # (1, T_prompt, C)
|
| 87 |
+
|
| 88 |
+
result_start_token_id = tokenizer.encode("<result_start>", add_special_tokens=False)[0]
|
| 89 |
+
result_start_embed = model.decoder.token_embedding_table(
|
| 90 |
+
torch.tensor([[result_start_token_id]], device=device) # Shape (1, 1, C)
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
# The initial sequence fed to the decoder blocks consists of image + prompt
|
| 94 |
+
current_embeds = torch.cat([
|
| 95 |
+
image_embeds_decoder,
|
| 96 |
+
prompt_embeds_decoder,
|
| 97 |
+
result_start_embed # Add the embedding for the first expected output token
|
| 98 |
+
], dim=1)
|
| 99 |
+
generated_ids = [] # Store newly generated IDs
|
| 100 |
+
|
| 101 |
+
# --- 3. Autoregressive Generation Loop ---
|
| 102 |
+
for _ in range(max_new_tokens):
|
| 103 |
+
T_current = current_embeds.shape[1]
|
| 104 |
+
|
| 105 |
+
# Truncate if necessary (keep recent context)
|
| 106 |
+
if T_current > model.decoder.max_context: # Access max_context from decoder
|
| 107 |
+
print(f"Warning: Truncating context from {T_current} to {model.decoder.max_context}")
|
| 108 |
+
current_embeds = current_embeds[:, -model.decoder.max_context:, :]
|
| 109 |
+
T_current = model.decoder.max_context
|
| 110 |
+
|
| 111 |
+
# Prepare positional embeddings for current length
|
| 112 |
+
pos = torch.arange(0, T_current, dtype=torch.long, device=device)
|
| 113 |
+
pos = pos.clamp(max=model.decoder.max_context - 1) # Clamp indices
|
| 114 |
+
pos_emb = model.decoder.position_embedding_table(pos).unsqueeze(0) # (1, T_current, C)
|
| 115 |
+
x = current_embeds + pos_emb
|
| 116 |
+
|
| 117 |
+
# Create attention mask (all ones, causal handles future)
|
| 118 |
+
# Note: We don't need padding mask here as we handle one sequence without padding
|
| 119 |
+
attention_mask = torch.ones(B, T_current, device=device, dtype=torch.long)
|
| 120 |
+
|
| 121 |
+
# Pass through Decoder Blocks
|
| 122 |
+
for block in model.decoder.blocks:
|
| 123 |
+
# We assume the block forward takes (x, attention_mask)
|
| 124 |
+
x = block(x, attention_mask=attention_mask)
|
| 125 |
+
|
| 126 |
+
# Final Layer Norm and LM Head for the *last* token prediction
|
| 127 |
+
x = model.decoder.ln_f(x[:, -1:, :]) # (B, 1, C) -> (1, 1, C)
|
| 128 |
+
logits = model.decoder.lm_head(x) # (B, 1, V) -> (1, 1, V)
|
| 129 |
+
logits = logits.squeeze(1) # (B, V) -> (1, V)
|
| 130 |
+
|
| 131 |
+
# Sampling
|
| 132 |
+
logits = logits / temperature
|
| 133 |
+
if top_k is not None and top_k > 0:
|
| 134 |
+
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
| 135 |
+
logits[logits < v[:, [-1]]] = -float('Inf')
|
| 136 |
+
|
| 137 |
+
probs = F.softmax(logits, dim=-1)
|
| 138 |
+
# idx_next = torch.multinomial(probs, num_samples=1) # (1, 1) # test distribution
|
| 139 |
+
idx_next = torch.argmax(logits, dim=-1, keepdim=True) # test deterministic
|
| 140 |
+
|
| 141 |
+
# Store generated ID
|
| 142 |
+
generated_ids.append(idx_next)
|
| 143 |
+
|
| 144 |
+
# Stop if EOS token is generated
|
| 145 |
+
if idx_next.item() == tokenizer.eos_token_id:
|
| 146 |
+
print("EOS token generated.")
|
| 147 |
+
break
|
| 148 |
+
|
| 149 |
+
# Prepare for next iteration: Append embedding of new token
|
| 150 |
+
next_token_embed = model.decoder.token_embedding_table(idx_next) # (1, 1, C)
|
| 151 |
+
current_embeds = torch.cat([current_embeds, next_token_embed], dim=1) # Append along sequence dim
|
| 152 |
+
|
| 153 |
+
# --- 4. Combine and Decode Results ---
|
| 154 |
+
if generated_ids:
|
| 155 |
+
generated_ids_tensor = torch.cat(generated_ids, dim=1) # (1, T_generated)
|
| 156 |
+
initial_target_ids = torch.tensor([[result_start_token_id]], device=device)
|
| 157 |
+
full_generated_sequence_ids = torch.cat([prompt_ids, initial_target_ids, generated_ids_tensor], dim=1)
|
| 158 |
+
else:
|
| 159 |
+
full_generated_sequence_ids = prompt_ids # Nothing was generated
|
| 160 |
+
|
| 161 |
+
full_decoded_text = tokenizer.decode(full_generated_sequence_ids[0], skip_special_tokens=False)
|
| 162 |
+
print(f"\nFull Generated Sequence (Manual Loop):\n{full_decoded_text}")
|
| 163 |
+
|
| 164 |
+
# --- 5. Save visualization to file ---
|
| 165 |
+
save_coords_visualization(
|
| 166 |
+
image_tensor=image_tensor[0], # Remove batch dim for visualization
|
| 167 |
+
full_decoded_text=full_decoded_text,
|
| 168 |
+
tokenizer=tokenizer,
|
| 169 |
+
image_size=IMAGE_SIZE, # Assumes IMAGE_SIZE is globally defined
|
| 170 |
+
num_bins=NUM_BINS, # Assumes NUM_BINS is globally defined
|
| 171 |
+
output_path=output_path
|
| 172 |
+
)
|
| 173 |
+
print(f"Visualization saved to: {output_path}")
|
| 174 |
+
|
| 175 |
+
except Exception as e:
|
| 176 |
+
print(f"An error occurred during sample generation: {e}")
|
| 177 |
+
import traceback
|
| 178 |
+
traceback.print_exc()
|
| 179 |
+
|
| 180 |
+
def generate_sample_from_test_loader(
|
| 181 |
+
model,
|
| 182 |
+
test_loader,
|
| 183 |
+
tokenizer,
|
| 184 |
+
device,
|
| 185 |
+
max_new_tokens=70,
|
| 186 |
+
temperature=0.8,
|
| 187 |
+
top_k=10,
|
| 188 |
+
output_path="generated_output.png",
|
| 189 |
+
TEST_BATCH=8,
|
| 190 |
+
TEST_IDX=1
|
| 191 |
+
):
|
| 192 |
+
"""
|
| 193 |
+
Generates a prediction for one sample from the test loader and saves it to a file.
|
| 194 |
+
Generation loop is implemented *within* this function.
|
| 195 |
+
|
| 196 |
+
Args:
|
| 197 |
+
model: The trained VisionLanguageModel.
|
| 198 |
+
test_loader: DataLoader for the test set.
|
| 199 |
+
tokenizer: The tokenizer used for training.
|
| 200 |
+
device: The computation device ('cuda' or 'cpu').
|
| 201 |
+
max_new_tokens (int): Max tokens to generate after the prompt.
|
| 202 |
+
temperature (float): Softmax temperature for sampling.
|
| 203 |
+
top_k (int): K for top-k sampling (0 or None to disable).
|
| 204 |
+
output_path (str): Path where to save the output image.
|
| 205 |
+
|
| 206 |
+
Returns:
|
| 207 |
+
None. Saves the image with prompt and generated output to a file.
|
| 208 |
+
"""
|
| 209 |
+
|
| 210 |
+
if not test_loader or len(test_loader.dataset) == 0:
|
| 211 |
+
print("Test loader is empty or not available.")
|
| 212 |
+
return
|
| 213 |
+
|
| 214 |
+
model.eval() # Set the model to evaluation mode
|
| 215 |
+
|
| 216 |
+
try:
|
| 217 |
+
# Get a single batch from the test loader
|
| 218 |
+
with torch.no_grad(): # No need to track gradients during inference
|
| 219 |
+
my_iter = iter(test_loader)
|
| 220 |
+
for i in range(TEST_BATCH):
|
| 221 |
+
_ = next(my_iter)
|
| 222 |
+
batch = next(my_iter)
|
| 223 |
+
|
| 224 |
+
if batch is None:
|
| 225 |
+
print("Test loader yielded an empty batch.")
|
| 226 |
+
return
|
| 227 |
+
if batch['image'].shape[0] == 0:
|
| 228 |
+
print("Test loader yielded a batch with 0 items.")
|
| 229 |
+
return
|
| 230 |
+
|
| 231 |
+
# --- 1. Prepare Initial Inputs ---
|
| 232 |
+
image_tensor = batch['image'][TEST_IDX:TEST_IDX+1].to(device) # (1, 3, H, W)
|
| 233 |
+
prompt_ids = batch['prompt_ids'][TEST_IDX:TEST_IDX+1].to(device) # (1, T_prompt)
|
| 234 |
+
prompt_attention_mask = batch['prompt_attention_mask'][TEST_IDX:TEST_IDX+1].to(device) # (1, T_prompt)
|
| 235 |
+
label = batch['label'][TEST_IDX]
|
| 236 |
+
B = 1 # We are processing one sample at a time
|
| 237 |
+
|
| 238 |
+
print(f"--- Generating Sample (Manual Loop) ---")
|
| 239 |
+
print(f"Original Label/Prompt Hint: {label}")
|
| 240 |
+
prompt_text = tokenizer.decode(prompt_ids[0], skip_special_tokens=False)
|
| 241 |
+
print(f"Input Prompt Tokens Decoded: {prompt_text}")
|
| 242 |
+
|
| 243 |
+
# --- 2. Pre-compute Image & Prompt Embeddings (Part of VLM Forward Logic) ---
|
| 244 |
+
image_embeds_raw = model.vision_encoder(image_tensor) # (1, N_img, C)
|
| 245 |
+
image_embeds_decoder = model.multimodal_projector(image_embeds_raw) # (1, N_img, C)
|
| 246 |
+
prompt_embeds_decoder = model.decoder.token_embedding_table(prompt_ids) # (1, T_prompt, C)
|
| 247 |
+
|
| 248 |
+
result_start_token_id = tokenizer.encode("<result_start>", add_special_tokens=False)[0]
|
| 249 |
+
result_start_embed = model.decoder.token_embedding_table(
|
| 250 |
+
torch.tensor([[result_start_token_id]], device=device) # Shape (1, 1, C)
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
# The initial sequence fed to the decoder blocks consists of image + prompt
|
| 254 |
+
current_embeds = torch.cat([
|
| 255 |
+
image_embeds_decoder,
|
| 256 |
+
prompt_embeds_decoder,
|
| 257 |
+
result_start_embed # Add the embedding for the first expected output token
|
| 258 |
+
], dim=1)
|
| 259 |
+
# current_embeds = torch.cat([image_embeds_decoder, prompt_embeds_decoder], dim=1) # (1, T_initial, C)
|
| 260 |
+
generated_ids = [] # Store newly generated IDs
|
| 261 |
+
|
| 262 |
+
# --- 3. Autoregressive Generation Loop ---
|
| 263 |
+
for _ in range(max_new_tokens):
|
| 264 |
+
T_current = current_embeds.shape[1]
|
| 265 |
+
|
| 266 |
+
# Truncate if necessary (keep recent context)
|
| 267 |
+
if T_current > model.decoder.max_context: # Access max_context from decoder
|
| 268 |
+
print(f"Warning: Truncating context from {T_current} to {model.decoder.max_context}")
|
| 269 |
+
current_embeds = current_embeds[:, -model.decoder.max_context:, :]
|
| 270 |
+
T_current = model.decoder.max_context
|
| 271 |
+
|
| 272 |
+
# Prepare positional embeddings for current length
|
| 273 |
+
pos = torch.arange(0, T_current, dtype=torch.long, device=device)
|
| 274 |
+
pos = pos.clamp(max=model.decoder.max_context - 1) # Clamp indices
|
| 275 |
+
pos_emb = model.decoder.position_embedding_table(pos).unsqueeze(0) # (1, T_current, C)
|
| 276 |
+
x = current_embeds + pos_emb
|
| 277 |
+
|
| 278 |
+
# Create attention mask (all ones, causal handles future)
|
| 279 |
+
# Note: We don't need padding mask here as we handle one sequence without padding
|
| 280 |
+
attention_mask = torch.ones(B, T_current, device=device, dtype=torch.long)
|
| 281 |
+
|
| 282 |
+
# Pass through Decoder Blocks
|
| 283 |
+
for block in model.decoder.blocks:
|
| 284 |
+
# We assume the block forward takes (x, attention_mask)
|
| 285 |
+
x = block(x, attention_mask=attention_mask)
|
| 286 |
+
|
| 287 |
+
# Final Layer Norm and LM Head for the *last* token prediction
|
| 288 |
+
x = model.decoder.ln_f(x[:, -1:, :]) # (B, 1, C) -> (1, 1, C)
|
| 289 |
+
logits = model.decoder.lm_head(x) # (B, 1, V) -> (1, 1, V)
|
| 290 |
+
logits = logits.squeeze(1) # (B, V) -> (1, V)
|
| 291 |
+
|
| 292 |
+
# Sampling
|
| 293 |
+
logits = logits / temperature
|
| 294 |
+
if top_k is not None and top_k > 0:
|
| 295 |
+
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
| 296 |
+
logits[logits < v[:, [-1]]] = -float('Inf')
|
| 297 |
+
|
| 298 |
+
probs = F.softmax(logits, dim=-1)
|
| 299 |
+
# idx_next = torch.multinomial(probs, num_samples=1) # (1, 1) # test distribution
|
| 300 |
+
idx_next = torch.argmax(logits, dim=-1, keepdim=True) # test deterministic
|
| 301 |
+
|
| 302 |
+
# Store generated ID
|
| 303 |
+
generated_ids.append(idx_next)
|
| 304 |
+
|
| 305 |
+
# Stop if EOS token is generated
|
| 306 |
+
if idx_next.item() == tokenizer.eos_token_id:
|
| 307 |
+
print("EOS token generated.")
|
| 308 |
+
break
|
| 309 |
+
|
| 310 |
+
# Prepare for next iteration: Append embedding of new token
|
| 311 |
+
next_token_embed = model.decoder.token_embedding_table(idx_next) # (1, 1, C)
|
| 312 |
+
current_embeds = torch.cat([current_embeds, next_token_embed], dim=1) # Append along sequence dim
|
| 313 |
+
|
| 314 |
+
# --- 4. Combine and Decode Results ---
|
| 315 |
+
if generated_ids:
|
| 316 |
+
generated_ids_tensor = torch.cat(generated_ids, dim=1) # (1, T_generated)
|
| 317 |
+
initial_target_ids = torch.tensor([[result_start_token_id]], device=device)
|
| 318 |
+
full_generated_sequence_ids = torch.cat([prompt_ids, initial_target_ids, generated_ids_tensor], dim=1)
|
| 319 |
+
else:
|
| 320 |
+
full_generated_sequence_ids = prompt_ids # Nothing was generated
|
| 321 |
+
|
| 322 |
+
full_decoded_text = tokenizer.decode(full_generated_sequence_ids[0], skip_special_tokens=False)
|
| 323 |
+
print(f"\nFull Generated Sequence (Manual Loop):\n{full_decoded_text}")
|
| 324 |
+
|
| 325 |
+
# --- 5. Save visualization to file ---
|
| 326 |
+
save_coords_visualization(
|
| 327 |
+
image_tensor=image_tensor[0], # Remove batch dim for visualization
|
| 328 |
+
full_decoded_text=full_decoded_text,
|
| 329 |
+
tokenizer=tokenizer,
|
| 330 |
+
image_size=IMAGE_SIZE, # Assumes IMAGE_SIZE is globally defined
|
| 331 |
+
num_bins=NUM_BINS, # Assumes NUM_BINS is globally defined
|
| 332 |
+
output_path=output_path
|
| 333 |
+
)
|
| 334 |
+
print(f"Visualization saved to: {output_path}")
|
| 335 |
+
|
| 336 |
+
except StopIteration:
|
| 337 |
+
print("Test loader is exhausted.")
|
| 338 |
+
except Exception as e:
|
| 339 |
+
print(f"An error occurred during sample generation: {e}")
|
| 340 |
+
import traceback
|
| 341 |
+
traceback.print_exc()
|
| 342 |
+
|
| 343 |
+
def parse_coordinate_tokens(text, tokenizer, num_bins):
|
| 344 |
+
"""
|
| 345 |
+
Parses generated text to extract coordinate bin tokens.
|
| 346 |
+
|
| 347 |
+
Args:
|
| 348 |
+
text (str): The decoded output text from the model.
|
| 349 |
+
tokenizer: The tokenizer.
|
| 350 |
+
num_bins (int): The number of coordinate bins used.
|
| 351 |
+
|
| 352 |
+
Returns:
|
| 353 |
+
list[tuple(int, int)]: A list of (x_bin, y_bin) tuples, or None if parsing fails.
|
| 354 |
+
"""
|
| 355 |
+
coords = []
|
| 356 |
+
try:
|
| 357 |
+
# Basic parsing - look for the pattern
|
| 358 |
+
x_start_token = "<pointx_start>"
|
| 359 |
+
x_end_token = "<pointx_end>"
|
| 360 |
+
y_start_token = "<pointy_start>"
|
| 361 |
+
y_end_token = "<pointy_end>"
|
| 362 |
+
result_end_token = "<result_end>"
|
| 363 |
+
|
| 364 |
+
# Find where the actual results start
|
| 365 |
+
try:
|
| 366 |
+
start_index = text.index("<result_start>") + len("<result_start>")
|
| 367 |
+
except ValueError:
|
| 368 |
+
print("Warning: <result_start> not found in generated text.")
|
| 369 |
+
return None
|
| 370 |
+
|
| 371 |
+
# Find where results end
|
| 372 |
+
try:
|
| 373 |
+
end_index = text.index(result_end_token, start_index)
|
| 374 |
+
except ValueError:
|
| 375 |
+
end_index = len(text) # Use end of string if <result_end> is missing
|
| 376 |
+
print(f"Warning: {result_end_token} not found. Parsing until end of string.")
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
current_pos = start_index
|
| 380 |
+
while current_pos < end_index:
|
| 381 |
+
# Find next X coordinate
|
| 382 |
+
x_start_idx = text.find(x_start_token, current_pos)
|
| 383 |
+
if x_start_idx == -1 or x_start_idx >= end_index: break # No more x points found
|
| 384 |
+
x_start_idx += len(x_start_token)
|
| 385 |
+
|
| 386 |
+
x_end_idx = text.find(x_end_token, x_start_idx)
|
| 387 |
+
if x_end_idx == -1 or x_end_idx >= end_index: break # Malformed
|
| 388 |
+
|
| 389 |
+
x_token_str = text[x_start_idx:x_end_idx].strip()
|
| 390 |
+
|
| 391 |
+
# Find next Y coordinate (must follow X)
|
| 392 |
+
y_start_idx = text.find(y_start_token, x_end_idx)
|
| 393 |
+
if y_start_idx == -1 or y_start_idx >= end_index: break # No corresponding y point
|
| 394 |
+
y_start_idx += len(y_start_token)
|
| 395 |
+
|
| 396 |
+
y_end_idx = text.find(y_end_token, y_start_idx)
|
| 397 |
+
if y_end_idx == -1 or y_end_idx >= end_index: break # Malformed
|
| 398 |
+
|
| 399 |
+
y_token_str = text[y_start_idx:y_end_idx].strip()
|
| 400 |
+
|
| 401 |
+
x_token_str = x_token_str[:-1]
|
| 402 |
+
y_token_str = y_token_str[:-1]
|
| 403 |
+
|
| 404 |
+
# Convert token strings to bin numbers
|
| 405 |
+
try:
|
| 406 |
+
x_bin = int(x_token_str.split("_")[-1])
|
| 407 |
+
y_bin = int(y_token_str.split("_")[-1])
|
| 408 |
+
if 0 <= x_bin < num_bins and 0 <= y_bin < num_bins:
|
| 409 |
+
coords.append((x_bin, y_bin))
|
| 410 |
+
else:
|
| 411 |
+
print(f"Warning: Parsed bin indices out of range ({x_bin}, {y_bin}). Skipping.")
|
| 412 |
+
except (ValueError, IndexError):
|
| 413 |
+
print(f"Warning: Could not parse bins from tokens '{x_token_str}', '{y_token_str}'. Skipping.")
|
| 414 |
+
|
| 415 |
+
# Move search position past the found Y token
|
| 416 |
+
current_pos = y_end_idx + len(y_end_token)
|
| 417 |
+
|
| 418 |
+
return coords if coords else None
|
| 419 |
+
|
| 420 |
+
except Exception as e:
|
| 421 |
+
print(f"Error during coordinate parsing: {e}")
|
| 422 |
+
return None
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
def save_coords_visualization(image_tensor, full_decoded_text, tokenizer, image_size, num_bins, output_path):
|
| 426 |
+
"""Parses coords, draws them on the image, and saves to a file."""
|
| 427 |
+
parsed_bins = parse_coordinate_tokens(full_decoded_text, tokenizer, num_bins)
|
| 428 |
+
|
| 429 |
+
# Convert tensor to PIL image for drawing
|
| 430 |
+
try:
|
| 431 |
+
pil_image = tensor_to_image(image_tensor.cpu()) # Ensure tensor is on CPU
|
| 432 |
+
except Exception as e:
|
| 433 |
+
print(f"Error converting tensor to image: {e}")
|
| 434 |
+
# Create a placeholder image if conversion fails
|
| 435 |
+
pil_image = Image.new('RGB', (image_size, image_size), color='white')
|
| 436 |
+
draw = ImageDraw.Draw(pil_image)
|
| 437 |
+
draw.text((10, 10), "Image conversion failed", fill="black")
|
| 438 |
+
pil_image.save(output_path)
|
| 439 |
+
return
|
| 440 |
+
|
| 441 |
+
draw = ImageDraw.Draw(pil_image)
|
| 442 |
+
radius = 5 # Radius of the drawn point
|
| 443 |
+
|
| 444 |
+
if parsed_bins:
|
| 445 |
+
print(f"\nParsed Coordinate Bins: {parsed_bins}")
|
| 446 |
+
bin_size_pixels = image_size / num_bins
|
| 447 |
+
for x_bin, y_bin in parsed_bins:
|
| 448 |
+
# Calculate center of the bin in pixels
|
| 449 |
+
center_x = (x_bin + 0.5) * bin_size_pixels
|
| 450 |
+
center_y = (y_bin + 0.5) * bin_size_pixels
|
| 451 |
+
|
| 452 |
+
# Draw a circle
|
| 453 |
+
bbox = [center_x - radius, center_y - radius, center_x + radius, center_y + radius]
|
| 454 |
+
draw.ellipse(bbox, outline="red", width=3)
|
| 455 |
+
# Optional: Draw bin boundaries for debugging
|
| 456 |
+
# draw.rectangle([x_bin*bin_size_pixels, y_bin*bin_size_pixels, (x_bin+1)*bin_size_pixels, (y_bin+1)*bin_size_pixels], outline="blue", width=1)
|
| 457 |
+
|
| 458 |
+
# Add a text label with the coordinates at the top of the image
|
| 459 |
+
coord_text = f"Generated Point(s): {parsed_bins}"
|
| 460 |
+
draw.text((10, 10), coord_text, fill="red")
|
| 461 |
+
else:
|
| 462 |
+
print("\nCould not parse valid coordinates from the generated text.")
|
| 463 |
+
# Add a text label indicating no coordinates were found
|
| 464 |
+
draw.text((10, 10), "No Coordinates Parsed", fill="red")
|
| 465 |
+
|
| 466 |
+
# Save the image to file
|
| 467 |
+
pil_image.save(output_path)
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
import argparse
|
| 471 |
+
|
| 472 |
+
# --- Example Usage ---
|
| 473 |
+
# python infer.py --image ./data/test_images/image_1.png --prompt "a red apple"
|
| 474 |
+
if __name__ == "__main__":
|
| 475 |
+
parser = argparse.ArgumentParser()
|
| 476 |
+
parser.add_argument('--image', type=str, help='Path to input image')
|
| 477 |
+
parser.add_argument('--prompt', type=str, help='Prompt label for generation')
|
| 478 |
+
args = parser.parse_args()
|
| 479 |
+
if args.image and args.prompt:
|
| 480 |
+
# Use image and prompt based generation
|
| 481 |
+
if 'model' in locals() and 'tokenizer' in locals():
|
| 482 |
+
generate_sample_from_image_text(
|
| 483 |
+
model=model,
|
| 484 |
+
image_path=args.image,
|
| 485 |
+
prompt_label=args.prompt,
|
| 486 |
+
tokenizer=tokenizer,
|
| 487 |
+
device=DEVICE,
|
| 488 |
+
output_path="model_prediction.png"
|
| 489 |
+
)
|
| 490 |
+
else:
|
| 491 |
+
print("Please ensure 'model' and 'tokenizer' are loaded before running generation.")
|
| 492 |
+
else:
|
| 493 |
+
# Use test loader based generation
|
| 494 |
+
if 'model' in locals() and 'test_loader' in locals() and 'tokenizer' in locals():
|
| 495 |
+
test_loader = create_test_dataloader(batch_size=2, num_workers=0)
|
| 496 |
+
generate_sample_from_test_loader(
|
| 497 |
+
model=model,
|
| 498 |
+
test_loader=test_loader,
|
| 499 |
+
tokenizer=tokenizer,
|
| 500 |
+
device=DEVICE,
|
| 501 |
+
output_path="model_prediction.png"
|
| 502 |
+
)
|
| 503 |
+
else:
|
| 504 |
+
print("Please ensure 'model', 'test_loader', and 'tokenizer' are loaded before running generation.")
|
model_components.py
ADDED
@@ -0,0 +1,163 @@
from constants import *
import torch
import torch.nn as nn
import torch.nn.functional as F

class PatchEmbeddings(nn.Module):
    def __init__(self, patch_size=PATCH_SIZE, hidden_dim=HIDDEN_DIM):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=3, out_channels=hidden_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, X):
        X = self.conv(X) # (B, C, H/P, W/P)
        X = X.flatten(2) # (B, C, N) where N = (H/P)*(W/P)
        X = X.transpose(1, 2) # (B, N, C)
        return X

class Head(nn.Module):
    def __init__(self, n_embd, head_size, dropout=DROPOUT, is_decoder=False):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.is_decoder = is_decoder
        # causal mask is registered persistent=False so it's not saved in state_dict
        if self.is_decoder:
            self.register_buffer("bias", torch.tril(torch.ones(CONTEXT_LENGTH, CONTEXT_LENGTH, dtype=torch.bool))
                                 .view(1, CONTEXT_LENGTH, CONTEXT_LENGTH), persistent=False)


    def forward(self, x, attention_mask=None):
        B, T, C = x.shape
        # print(f"B = {B} T={T}, C={C}")
        k = self.key(x) # (B, T, hs)
        q = self.query(x) # (B, T, hs)
        v = self.value(x) # (B, T, hs)

        # Compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * (k.size(-1)**-0.5) # (B, T, hs) @ (B, hs, T) -> (B, T, T)

        if self.is_decoder:
            # Apply causal mask
            # Ensure the mask is sliced correctly if T < CONTEXT_LENGTH
            causal_mask = self.bias[:, :T, :T]
            wei = wei.masked_fill(causal_mask == 0, float('-inf'))

        if attention_mask is not None:
            # Apply padding mask (for text tokens)
            # attention_mask shape: (B, T_combined) -> needs expansion
            # Expand mask: (B, T) -> (B, 1, 1, T) or (B, 1, T, T) depending on what needs masking
            # Mask where attention_mask is 0
            # attention_mask shape: (B, T) == (B, T_key)
            # Expand mask to align with wei's key dimension for broadcasting across queries
            # Target shape for mask: [B, 1, T_key]
            # print(f"attn mask = {attention_mask.shape}")
            # print(f"wei shape = {wei.shape}")
            mask = attention_mask.unsqueeze(1) # Shape [B, 1, T]
            # Apply mask using broadcasting rules. masked_fill condition needs to be broadcastable to wei [B, T_query, T_key]
            # (mask == 0) gives a boolean tensor of shape [B, 1, T]
            # This broadcasts correctly: dim 2 (T vs T) matches, dim 1 (1 vs T) broadcasts 1->T, dim 0 (B vs B) matches.
            wei = wei.masked_fill(mask == 0, float('-inf'))


        # Apply softmax
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        # Perform weighted aggregation of values
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        # print(f"out shape = {out.shape}")
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, n_embd, num_heads=NUM_HEADS, dropout=DROPOUT, is_decoder=False):
        super().__init__()
        assert n_embd % num_heads == 0
        head_size = n_embd // num_heads
        self.heads = nn.ModuleList([
            Head(n_embd, head_size, dropout, is_decoder)
            for _ in range(num_heads)
        ])
        self.proj = nn.Linear(n_embd, n_embd) # n_embd = num_heads * head_size
        self.dropout = nn.Dropout(dropout)
        self.is_decoder = is_decoder # Store is_decoder status

    def forward(self, x, attention_mask=None):
        # Pass attention_mask only if it's a decoder block dealing with combined sequence
        out = torch.cat([h(x, attention_mask=attention_mask if self.is_decoder else None) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd, dropout=DROPOUT):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(), # Changed from ReLU to GELU, common in transformers
            nn.Linear(4 * n_embd, n_embd), # Projection back to residual stream
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, num_heads=NUM_HEADS, dropout=DROPOUT, is_decoder=False):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = MultiHeadAttention(n_embd, num_heads, dropout, is_decoder)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ffn = FeedForward(n_embd, dropout)
        self.is_decoder = is_decoder # Store is_decoder status

    def forward(self, x, attention_mask=None):
        # Pass attention_mask only if it's a decoder block
        # print(f"is decoder = {self.is_decoder} input shape = {x.shape}")
        x = x + self.attn(self.ln1(x), attention_mask=attention_mask if self.is_decoder else None)
        x = x + self.ffn(self.ln2(x))
        # print(f"output shape = {x.shape}")
        return x

class ViT(nn.Module):
    def __init__(self, img_size=IMAGE_SIZE, patch_size=PATCH_SIZE, num_hiddens=HIDDEN_DIM,
                 num_heads=NUM_HEADS, num_blks=NUM_LAYERS, emb_dropout=DROPOUT, blk_dropout=DROPOUT):
        super().__init__()
        self.patch_embedding = PatchEmbeddings(patch_size, num_hiddens)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, num_hiddens))
        num_patches = (img_size // patch_size) ** 2
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, num_hiddens) * 0.02) # Smaller init
        self.dropout = nn.Dropout(emb_dropout)
        # ViT blocks are NOT decoders (no causal mask)
        self.blocks = nn.ModuleList([Block(num_hiddens, num_heads, blk_dropout, is_decoder=False) for _ in range(num_blks)])
        self.layer_norm = nn.LayerNorm(num_hiddens) # Final LN

    def forward(self, X):
        x = self.patch_embedding(X) # (B, N, C)
        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) # (B, 1, C)
        x = torch.cat((cls_tokens, x), dim=1) # (B, N+1, C)
        # Add positional embedding
        x = x + self.pos_embedding # Uses broadcasting
        x = self.dropout(x)
        for block in self.blocks:
            # ViT blocks don't need attention_mask
            x = block(x)
        x = self.layer_norm(x) # Apply final layer norm
        return x

class MultiModalProjector(nn.Module):
    # Projects image embedding dim to text embedding dim
    def __init__(self, image_embed_dim=HIDDEN_DIM, text_embed_dim=HIDDEN_DIM, dropout=DROPOUT):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(image_embed_dim, text_embed_dim * 4), # Intermediate expansion
            nn.GELU(),
            nn.Linear(text_embed_dim * 4, text_embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
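A shape sanity check for the vision tower defined above; the expected sequence length is the patch count plus the CLS token, and the projector preserves the shape because the image and text widths are both HIDDEN_DIM in this configuration:

# Sketch: verify ViT / MultiModalProjector output shapes under the constants.py defaults.
import torch
from constants import IMAGE_SIZE, PATCH_SIZE, HIDDEN_DIM
from model_components import ViT, MultiModalProjector

vit, proj = ViT(), MultiModalProjector()
dummy = torch.randn(2, 3, IMAGE_SIZE, IMAGE_SIZE)              # (B, 3, 512, 512)
feats = vit(dummy)                                             # CLS token + (512/16)^2 patches
assert feats.shape == (2, (IMAGE_SIZE // PATCH_SIZE) ** 2 + 1, HIDDEN_DIM)  # (2, 1025, 256)
assert proj(feats).shape == feats.shape                        # projection keeps (B, N+1, C)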
train.py
ADDED
@@ -0,0 +1,264 @@
from constants import *
from dataset import create_train_dataloader, create_test_dataloader
from vision_language_model import VisionLanguageModel
from utils import *
from datetime import datetime
import wandb
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
from tqdm.auto import tqdm

print(f"Using device: {DEVICE}")
print(f"Vocab size: {vocab_size}")

# --- Initialize Model ---
# Ensure lambda_regression is passed during initialization
model = VisionLanguageModel(
    n_embd=HIDDEN_DIM,
    vocab_size=vocab_size,
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    num_heads=NUM_HEADS,
    num_blks_vit=NUM_LAYERS, # Or specific value for ViT layers
    num_blks_dec=NUM_LAYERS, # Or specific value for Decoder layers
    emb_dropout=DROPOUT,
    blk_dropout=DROPOUT,
    max_context=CONTEXT_LENGTH,
    shared_embed_dim=SHARED_EMBED_DIM,
    lambda_contrastive=LAMBDA_CONTRASTIVE,
    lambda_regression=LAMBDA_REGRESSION # Pass the regression weight
).to(DEVICE)

# --- Optimizer ---
# Optimizer will automatically include all model parameters, including the new regression head
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.95), weight_decay=0.1)

# --- Dataloaders ---
# Ensure these functions now return 'continuous_coords' in the batch dictionary
train_loader = create_train_dataloader(batch_size=BATCH_SIZE, num_workers=2) # Use num_workers=0 for easier debugging first
test_loader = create_test_dataloader(batch_size=BATCH_SIZE, num_workers=2)
if train_loader is None: exit("Training loader failed to initialize.")
test_loader_has_data = test_loader and len(test_loader.dataset) > 0

# --- LR Scheduler ---
if train_loader and len(train_loader) > 0:
    steps_per_epoch = (len(train_loader) // GRAD_ACCUMULATION_STEPS) + (1 if len(train_loader) % GRAD_ACCUMULATION_STEPS != 0 else 0)
    total_steps = steps_per_epoch * NUM_EPOCHS
    # Adjust warmup steps if total steps are very low
    warmup_steps = min(max(1, total_steps // 10), 10000) # Ensure at least 1, max 10k warmup
    print(f"Total estimated optimization steps: {total_steps}, Warmup steps: {warmup_steps}")
    lr_scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, total_steps=total_steps, pct_start=warmup_steps/total_steps if total_steps > 0 else 0.1)
else:
    print("Warning: Train loader empty. Using constant LR.")
    total_steps = 0; warmup_steps = 0
    lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1.0)

# --- Wandb Setup ---
try:
    wandb.init(
        # project="point-language-model-dualhead", # Suggest new project name
        project="point-language-model-regression-vast",
        name=f"point-vlm-dual-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
        config={ # Add new hyperparameters
            "image_size": IMAGE_SIZE, "patch_size": PATCH_SIZE, "hidden_dim": HIDDEN_DIM,
            "context_length": CONTEXT_LENGTH, "dropout": DROPOUT,
            "num_heads": NUM_HEADS, "num_layers": NUM_LAYERS, "batch_size": BATCH_SIZE,
            "learning_rate": LEARNING_RATE, "grad_accum_steps": GRAD_ACCUMULATION_STEPS,
            "shared_embed_dim": SHARED_EMBED_DIM, "lambda_contrastive": LAMBDA_CONTRASTIVE,
            "lambda_regression": LAMBDA_REGRESSION, # Log regression weight
            "architecture": "VisionLanguageModel (Dual Head)", "optimizer": "AdamW",
            "num_epochs": NUM_EPOCHS, "total_steps": total_steps, "warmup_steps": warmup_steps
        }
    )
    wandb_enabled = True
    # Watch model gradients and parameters
    # wandb.watch(model, log="all", log_freq=LOGGING_STEPS * GRAD_ACCUMULATION_STEPS)
except Exception as e:
    print(f"Wandb initialization failed: {e}. Running without wandb.")
    wandb_enabled = False

# --- Training Loop ---
print("Starting training with Classification + Contrastive + Regression Loss (Multi-Point)...")
step_counter = 0
optimizer.zero_grad()

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_total_loss_accum = 0.0
    epoch_class_loss_accum = 0.0
    epoch_con_loss_accum = 0.0
    epoch_reg_loss_accum = 0.0
    batches_since_log = 0
    valid_batches_accum = 0 # Count batches with valid loss for averaging

    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{NUM_EPOCHS}", leave=False)

    for batch_idx, batch in pbar:
        if batch is None: continue

        # --- Unpack Batch Data ---
        try:
            images = batch['image'].to(DEVICE, non_blocking=True).to(DTYPE)
            prompt_ids = batch['prompt_ids'].to(DEVICE, non_blocking=True)
            prompt_attention_mask = batch['prompt_attention_mask'].to(DEVICE, non_blocking=True)
            target_ids = batch['target_ids'].to(DEVICE, non_blocking=True)
            target_attention_mask = batch['target_attention_mask'].to(DEVICE, non_blocking=True)
            generative_targets = batch['generative_targets'].to(DEVICE, non_blocking=True)
            continuous_coords = batch['continuous_coords'].to(DEVICE, non_blocking=True) # Padded
            coords_mask = batch['coords_mask'].to(DEVICE, non_blocking=True) # Mask
        except KeyError as e:
            print(f"Error: Missing key {e} in batch. Check dataloader and collate_fn.")
            continue

        # Clamp logit_scale
        with torch.no_grad():
            model.logit_scale.clamp_(0, torch.log(torch.tensor(100.0)))

        # --- Forward Pass ---
        # Model now returns potentially NaN scalar tensors for individual losses if invalid
        logits, reg_output, total_loss, class_loss_s, contrastive_loss_s, regression_loss_s = model(
            img_array=images,
            prompt_ids=prompt_ids,
            prompt_attention_mask=prompt_attention_mask,
            target_ids=target_ids,
            target_attention_mask=target_attention_mask,
            generative_targets=generative_targets,
            continuous_coords=continuous_coords,
            coords_mask=coords_mask # Pass mask for regression loss calculation
        )

        # --- Loss Handling & Accumulation ---
        # Check for invalid total loss before backward pass
        if total_loss is None or not torch.isfinite(total_loss):
            print(f"Warning: Invalid total_loss ({total_loss}) detected at Epoch {epoch+1}, Batch {batch_idx}. Skipping backward/step.")
            optimizer.zero_grad() # Reset gradients for safety if loss is invalid
            continue # Skip this batch for optimization step

        # Scale loss for gradient accumulation
        scaled_loss = total_loss / GRAD_ACCUMULATION_STEPS

        # Accumulate valid loss components for logging
        # Check if the scalar tensor is finite before adding its item()
        if torch.isfinite(total_loss):
            epoch_total_loss_accum += total_loss.item()
            valid_batches_accum += 1 # Increment count of batches contributing to average loss
        if torch.isfinite(class_loss_s):
            epoch_class_loss_accum += class_loss_s.item()
        if torch.isfinite(contrastive_loss_s):
            epoch_con_loss_accum += contrastive_loss_s.item()
        if torch.isfinite(regression_loss_s):
            epoch_reg_loss_accum += regression_loss_s.item()
        batches_since_log += 1

        # --- Backward Pass ---
        try:
            scaled_loss.backward()
        except Exception as e:
            print(f"Error during backward pass: {e}. Skipping step.")
            optimizer.zero_grad() # Reset gradients if backward failed
            continue

        # --- Gradient Accumulation Step ---
        if (batch_idx + 1) % GRAD_ACCUMULATION_STEPS == 0 or (batch_idx + 1) == len(train_loader):
            # Clip gradients
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

            # Check for non-finite gradients before stepping
            all_finite = True
            for p in model.parameters():
                if p.grad is not None and not torch.isfinite(p.grad).all():
                    all_finite = False
                    break
            if not all_finite:
                print(f"Warning: Non-finite gradients detected at step {step_counter}. Skipping optimizer step.")
                optimizer.zero_grad()
                continue # Skip optimizer step and scheduler step

            # Optimizer step
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            step_counter += 1

            # --- Logging ---
            if step_counter % LOGGING_STEPS == 0 and valid_batches_accum > 0: # Use valid_batches_accum
                # Calculate average losses over the logging period using valid batch count
                avg_total_loss = epoch_total_loss_accum / valid_batches_accum
                avg_class_loss = epoch_class_loss_accum / valid_batches_accum
                avg_con_loss = epoch_con_loss_accum / valid_batches_accum
                avg_reg_loss = epoch_reg_loss_accum / valid_batches_accum
                current_lr = optimizer.param_groups[0]['lr']

                # --- Test Evaluation (Needs modification to handle mask) ---
                test_class_loss_val = float('nan')
                test_con_loss_val = float('nan')
                test_reg_loss_val = float('nan')
                if test_loader_has_data:
                    model.eval()
                    with torch.no_grad():
                        try:
                            test_batch = next(iter(test_loader))
                            if test_batch:
                                t_images = test_batch['image'].to(DEVICE).to(DTYPE)
                                t_p_ids = test_batch['prompt_ids'].to(DEVICE)
                                t_p_mask = test_batch['prompt_attention_mask'].to(DEVICE)
                                t_t_ids = test_batch['target_ids'].to(DEVICE)
                                t_t_mask = test_batch['target_attention_mask'].to(DEVICE)
                                t_gen_targets = test_batch['generative_targets'].to(DEVICE)
                                t_cont_coords = test_batch['continuous_coords'].to(DEVICE) # Padded
                                t_coords_mask = test_batch['coords_mask'].to(DEVICE) # Mask

                                _, _, _, t_class_loss, t_con_loss, t_reg_loss = model(
                                    t_images, t_p_ids, t_p_mask, t_t_ids, t_t_mask,
                                    t_gen_targets, t_cont_coords, t_coords_mask # Pass mask
                                )
                                # Use .item() only if the tensor is finite
                                test_class_loss_val = t_class_loss.item() if torch.isfinite(t_class_loss) else float('nan')
                                test_con_loss_val = t_con_loss.item() if torch.isfinite(t_con_loss) else float('nan')
                                test_reg_loss_val = t_reg_loss.item() if torch.isfinite(t_reg_loss) else float('nan')
                            # ... (rest of exception handling) ...
                        except StopIteration: print("Info: Test loader exhausted during logging.")
                        except KeyError as e: print(f"Error: Missing key {e} in test batch.")
                        except Exception as e: print(f"Error during test evaluation: {e}")
                    model.train()

                # Prepare data for logging
                log_data = {
                    "train/total_loss": avg_total_loss,
                    "train/class_loss": avg_class_loss,
                    "train/contrastive_loss": avg_con_loss,
                    "train/regression_loss": avg_reg_loss,
                    "test/class_loss": test_class_loss_val,
                    "test/contrastive_loss": test_con_loss_val,
                    "test/regression_loss": test_reg_loss_val,
                    "epoch": epoch + ((batch_idx + 1) / len(train_loader)),
                    "step": step_counter,
                    "learning_rate": current_lr,
                    "gradient_norm": grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm,
                    "logit_scale": model.logit_scale.exp().item()
                }
                # Update progress bar
                pbar.set_postfix({
                    "lr": f"{current_lr:.2e}", "loss": f"{avg_total_loss:.3f}",
                    "cls": f"{avg_class_loss:.3f}", "con": f"{avg_con_loss:.3f}",
                    "reg": f"{avg_reg_loss:.3f}", "gnorm": f"{log_data['gradient_norm']:.2f}"
                })
                if wandb_enabled: wandb.log(log_data)

                # Reset accumulators
                epoch_total_loss_accum, epoch_class_loss_accum, epoch_con_loss_accum, epoch_reg_loss_accum = 0.0, 0.0, 0.0, 0.0
                batches_since_log = 0
                valid_batches_accum = 0 # Reset valid batch count

    # --- End of Epoch ---
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS} completed.")
    # Optional: Add end-of-epoch evaluation or model saving here
    if epoch % 5 == 0:
        torch.save(model.state_dict(), f"model_regression_multi_{epoch+1}.pth")

# --- End of Training ---
print("\nTraining completed!")
if wandb_enabled:
    wandb.finish()
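A worked check of the accumulation arithmetic feeding OneCycleLR above: each optimizer step sees BATCH_SIZE x GRAD_ACCUMULATION_STEPS samples, and steps_per_epoch rounds up so a trailing partial accumulation window still counts as a step. The dataset size is a hypothetical placeholder, not taken from the upload:

# Sketch: effective batch and scheduler step count (num_train_samples is assumed).
BATCH_SIZE, GRAD_ACCUMULATION_STEPS, NUM_EPOCHS = 16, 16, 400   # constants.py defaults
num_train_samples = 10_000                                      # hypothetical
batches_per_epoch = -(-num_train_samples // BATCH_SIZE)         # ceil division -> 625
effective_batch = BATCH_SIZE * GRAD_ACCUMULATION_STEPS          # 256 samples per optimizer step
steps_per_epoch = (batches_per_epoch // GRAD_ACCUMULATION_STEPS
                   + (1 if batches_per_epoch % GRAD_ACCUMULATION_STEPS else 0))  # 39 full + 1 partial = 40
total_steps = steps_per_epoch * NUM_EPOCHS                      # 16000, as passed to OneCycleLR
warmup_steps = min(max(1, total_steps // 10), 10_000)           # 1600
print(effective_batch, steps_per_epoch, total_steps, warmup_steps)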
train_stage_2.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from constants import *
|
| 2 |
+
from dataset import create_train_dataloader, create_test_dataloader
|
| 3 |
+
from vision_language_model import VisionLanguageModel
|
| 4 |
+
from utils import *
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
import wandb
|
| 7 |
+
import torch
|
| 8 |
+
import torch.optim as optim
|
| 9 |
+
from torch.optim.lr_scheduler import OneCycleLR
|
| 10 |
+
from tqdm.auto import tqdm
|
| 11 |
+
|
| 12 |
+
print(f"Using device: {DEVICE}")
|
| 13 |
+
print(f"Vocab size: {vocab_size}")
|
| 14 |
+
|
| 15 |
+
# --- Initialize Model ---
|
| 16 |
+
# Ensure lambda_regression is passed during initialization
|
| 17 |
+
model = VisionLanguageModel(
|
| 18 |
+
n_embd=HIDDEN_DIM,
|
| 19 |
+
vocab_size=vocab_size,
|
| 20 |
+
img_size=IMAGE_SIZE,
|
| 21 |
+
patch_size=PATCH_SIZE,
|
| 22 |
+
num_heads=NUM_HEADS,
|
| 23 |
+
num_blks_vit=NUM_LAYERS, # Or specific value for ViT layers
|
| 24 |
+
num_blks_dec=NUM_LAYERS, # Or specific value for Decoder layers
|
| 25 |
+
emb_dropout=0.0,
|
| 26 |
+
blk_dropout=0.0,
|
| 27 |
+
max_context=CONTEXT_LENGTH,
|
| 28 |
+
shared_embed_dim=SHARED_EMBED_DIM,
|
| 29 |
+
lambda_contrastive=LAMBDA_CONTRASTIVE,
|
| 30 |
+
lambda_regression=LAMBDA_REGRESSION # Pass the regression weight
|
| 31 |
+
).to(DEVICE)
|
| 32 |
+
|
| 33 |
+
NUM_EPOCHS = 100
|
| 34 |
+
model.load_state_dict(torch.load("model_regression_multi_16.pth", weights_only=True)) # we ran till 15 before it over fitted with higher learning rate
|
| 35 |
+
|
| 36 |
+
# --- Optimizer ---
|
| 37 |
+
# Optimizer will automatically include all model parameters, including the new regression head
|
| 38 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1) # lower learning rate for second stage
|
| 39 |
+
|
| 40 |
+
# --- Dataloaders ---
|
| 41 |
+
# Ensure these functions now return 'continuous_coords' in the batch dictionary
|
| 42 |
+
train_loader = create_train_dataloader(batch_size=BATCH_SIZE, num_workers=2) # Use num_workers=0 for easier debugging first
|
| 43 |
+
test_loader = create_test_dataloader(batch_size=BATCH_SIZE, num_workers=2)
|
| 44 |
+
if train_loader is None: exit("Training loader failed to initialize.")
|
| 45 |
+
test_loader_has_data = test_loader and len(test_loader.dataset) > 0
|
| 46 |
+
|
| 47 |
+
# --- LR Scheduler ---
|
| 48 |
+
if train_loader and len(train_loader) > 0:
|
| 49 |
+
steps_per_epoch = (len(train_loader) // GRAD_ACCUMULATION_STEPS) + (1 if len(train_loader) % GRAD_ACCUMULATION_STEPS != 0 else 0)
|
| 50 |
+
total_steps = steps_per_epoch * NUM_EPOCHS
|
| 51 |
+
# Adjust warmup steps if total steps are very low
|
| 52 |
+
warmup_steps = min(max(1, total_steps // 10), 10000) # Ensure at least 1, max 10k warmup
|
| 53 |
+
print(f"Total estimated optimization steps: {total_steps}, Warmup steps: {warmup_steps}")
|
| 54 |
+
lr_scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, total_steps=total_steps, pct_start=warmup_steps/total_steps if total_steps > 0 else 0.1)
|
| 55 |
+
else:
|
| 56 |
+
print("Warning: Train loader empty. Using constant LR.")
|
| 57 |
+
total_steps = 0; warmup_steps = 0
|
| 58 |
+
lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1.0)
|
| 59 |
+
|
| 60 |
+
# --- Wandb Setup ---
|
| 61 |
+
try:
|
| 62 |
+
wandb.init(
|
| 63 |
+
# project="point-language-model-dualhead", # Suggest new project name
|
| 64 |
+
project="point-language-model-regression-vast",
|
| 65 |
+
name=f"point-vlm-dual-stage-2-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
|
| 66 |
+
config={ # Add new hyperparameters
|
| 67 |
+
"image_size": IMAGE_SIZE, "patch_size": PATCH_SIZE, "hidden_dim": HIDDEN_DIM,
|
| 68 |
+
"context_length": CONTEXT_LENGTH, "dropout": DROPOUT,
|
| 69 |
+
"num_heads": NUM_HEADS, "num_layers": NUM_LAYERS, "batch_size": BATCH_SIZE,
|
| 70 |
+
"learning_rate": LEARNING_RATE, "grad_accum_steps": GRAD_ACCUMULATION_STEPS,
|
| 71 |
+
"shared_embed_dim": SHARED_EMBED_DIM, "lambda_contrastive": LAMBDA_CONTRASTIVE,
|
| 72 |
+
"lambda_regression": LAMBDA_REGRESSION, # Log regression weight
|
| 73 |
+
"architecture": "VisionLanguageModel (Dual Head)", "optimizer": "AdamW",
|
| 74 |
+
"num_epochs": NUM_EPOCHS, "total_steps": total_steps, "warmup_steps": warmup_steps
|
| 75 |
+
}
|
| 76 |
+
)
|
| 77 |
+
wandb_enabled = True
|
| 78 |
+
# Watch model gradients and parameters
|
| 79 |
+
# wandb.watch(model, log="all", log_freq=LOGGING_STEPS * GRAD_ACCUMULATION_STEPS)
|
| 80 |
+
except Exception as e:
|
| 81 |
+
print(f"Wandb initialization failed: {e}. Running without wandb.")
|
| 82 |
+
wandb_enabled = False
|
| 83 |
+
|

# --- Training Loop ---
print("Starting training with Classification + Contrastive + Regression Loss (Multi-Point)...")
step_counter = 0
optimizer.zero_grad()

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_total_loss_accum = 0.0
    epoch_class_loss_accum = 0.0
    epoch_con_loss_accum = 0.0
    epoch_reg_loss_accum = 0.0
    batches_since_log = 0
    valid_batches_accum = 0  # Count batches with a valid loss, for averaging

    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{NUM_EPOCHS}", leave=False)

    for batch_idx, batch in pbar:
        if batch is None:
            continue

        # --- Unpack Batch Data ---
        try:
            images = batch['image'].to(DEVICE, non_blocking=True).to(DTYPE)
            prompt_ids = batch['prompt_ids'].to(DEVICE, non_blocking=True)
            prompt_attention_mask = batch['prompt_attention_mask'].to(DEVICE, non_blocking=True)
            target_ids = batch['target_ids'].to(DEVICE, non_blocking=True)
            target_attention_mask = batch['target_attention_mask'].to(DEVICE, non_blocking=True)
            generative_targets = batch['generative_targets'].to(DEVICE, non_blocking=True)
            continuous_coords = batch['continuous_coords'].to(DEVICE, non_blocking=True)  # Padded (B, MAX_POINTS, 2)
            coords_mask = batch['coords_mask'].to(DEVICE, non_blocking=True)  # Valid-point mask (B, MAX_POINTS)
        except KeyError as e:
            print(f"Error: Missing key {e} in batch. Check dataloader and collate_fn.")
            continue

        # Clamp logit_scale so the contrastive temperature stays bounded
        with torch.no_grad():
            model.logit_scale.clamp_(0, torch.log(torch.tensor(100.0)))

        # --- Forward Pass ---
        # The model returns possibly-NaN scalar tensors for the individual losses when they are invalid
        logits, reg_output, total_loss, class_loss_s, contrastive_loss_s, regression_loss_s = model(
            img_array=images,
            prompt_ids=prompt_ids,
            prompt_attention_mask=prompt_attention_mask,
            target_ids=target_ids,
            target_attention_mask=target_attention_mask,
            generative_targets=generative_targets,
            continuous_coords=continuous_coords,
            coords_mask=coords_mask  # Pass mask for regression loss calculation
        )

        # --- Loss Handling & Accumulation ---
        # Check for an invalid total loss before the backward pass
        if total_loss is None or not torch.isfinite(total_loss):
            print(f"Warning: Invalid total_loss ({total_loss}) detected at Epoch {epoch+1}, Batch {batch_idx}. Skipping backward/step.")
            optimizer.zero_grad()  # Reset gradients for safety if loss is invalid
            continue  # Skip this batch for the optimization step

        # Scale loss for gradient accumulation
        scaled_loss = total_loss / GRAD_ACCUMULATION_STEPS

        # Accumulate valid loss components for logging;
        # only call .item() on scalar tensors that are finite
        if torch.isfinite(total_loss):
            epoch_total_loss_accum += total_loss.item()
            valid_batches_accum += 1  # This batch contributes to the running average
        if torch.isfinite(class_loss_s):
            epoch_class_loss_accum += class_loss_s.item()
        if torch.isfinite(contrastive_loss_s):
            epoch_con_loss_accum += contrastive_loss_s.item()
        if torch.isfinite(regression_loss_s):
            epoch_reg_loss_accum += regression_loss_s.item()
        batches_since_log += 1

        # --- Backward Pass ---
        try:
            scaled_loss.backward()
        except Exception as e:
            print(f"Error during backward pass: {e}. Skipping step.")
            optimizer.zero_grad()  # Reset gradients if backward failed
            continue
        # --- Gradient Accumulation Step ---
        if (batch_idx + 1) % GRAD_ACCUMULATION_STEPS == 0 or (batch_idx + 1) == len(train_loader):
            # Clip gradients
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

            # Check for non-finite gradients before stepping
            all_finite = True
            for p in model.parameters():
                if p.grad is not None and not torch.isfinite(p.grad).all():
                    all_finite = False
                    break
            if not all_finite:
                print(f"Warning: Non-finite gradients detected at step {step_counter}. Skipping optimizer step.")
                optimizer.zero_grad()
                continue  # Skip optimizer step and scheduler step

            # Optimizer step
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            step_counter += 1

            # --- Logging ---
            if step_counter % LOGGING_STEPS == 0 and valid_batches_accum > 0:
                # Average losses over the logging period, using only batches with a valid loss
                avg_total_loss = epoch_total_loss_accum / valid_batches_accum
                avg_class_loss = epoch_class_loss_accum / valid_batches_accum
                avg_con_loss = epoch_con_loss_accum / valid_batches_accum
                avg_reg_loss = epoch_reg_loss_accum / valid_batches_accum
                current_lr = optimizer.param_groups[0]['lr']

                # --- Test Evaluation (single batch, mask-aware) ---
                # Quick health check only: next(iter(test_loader)) re-creates the
                # iterator each time, so without shuffling this scores the same
                # first test batch on every log step, not a full evaluation pass.
                test_class_loss_val = float('nan')
                test_con_loss_val = float('nan')
                test_reg_loss_val = float('nan')
                if test_loader_has_data:
                    model.eval()
                    with torch.no_grad():
                        try:
                            test_batch = next(iter(test_loader))
                            if test_batch:
                                t_images = test_batch['image'].to(DEVICE).to(DTYPE)
                                t_p_ids = test_batch['prompt_ids'].to(DEVICE)
                                t_p_mask = test_batch['prompt_attention_mask'].to(DEVICE)
                                t_t_ids = test_batch['target_ids'].to(DEVICE)
                                t_t_mask = test_batch['target_attention_mask'].to(DEVICE)
                                t_gen_targets = test_batch['generative_targets'].to(DEVICE)
                                t_cont_coords = test_batch['continuous_coords'].to(DEVICE)  # Padded
                                t_coords_mask = test_batch['coords_mask'].to(DEVICE)  # Mask

                                _, _, _, t_class_loss, t_con_loss, t_reg_loss = model(
                                    t_images, t_p_ids, t_p_mask, t_t_ids, t_t_mask,
                                    t_gen_targets, t_cont_coords, t_coords_mask  # Pass mask
                                )
                                # Use .item() only if the tensor is finite
                                test_class_loss_val = t_class_loss.item() if torch.isfinite(t_class_loss) else float('nan')
                                test_con_loss_val = t_con_loss.item() if torch.isfinite(t_con_loss) else float('nan')
                                test_reg_loss_val = t_reg_loss.item() if torch.isfinite(t_reg_loss) else float('nan')
                        except StopIteration:
                            print("Info: Test loader exhausted during logging.")
                        except KeyError as e:
                            print(f"Error: Missing key {e} in test batch.")
                        except Exception as e:
                            print(f"Error during test evaluation: {e}")
                    model.train()

                # Prepare data for logging
                log_data = {
                    "train/total_loss": avg_total_loss,
                    "train/class_loss": avg_class_loss,
                    "train/contrastive_loss": avg_con_loss,
                    "train/regression_loss": avg_reg_loss,
                    "test/class_loss": test_class_loss_val,
                    "test/contrastive_loss": test_con_loss_val,
                    "test/regression_loss": test_reg_loss_val,
                    "epoch": epoch + ((batch_idx + 1) / len(train_loader)),
                    "step": step_counter,
                    "learning_rate": current_lr,
                    "gradient_norm": grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm,
                    "logit_scale": model.logit_scale.exp().item()
                }
                # Update progress bar
                pbar.set_postfix({
                    "lr": f"{current_lr:.2e}", "loss": f"{avg_total_loss:.3f}",
                    "cls": f"{avg_class_loss:.3f}", "con": f"{avg_con_loss:.3f}",
                    "reg": f"{avg_reg_loss:.3f}", "gnorm": f"{log_data['gradient_norm']:.2f}"
                })
                if wandb_enabled:
                    wandb.log(log_data)

                # Reset accumulators
                epoch_total_loss_accum, epoch_class_loss_accum, epoch_con_loss_accum, epoch_reg_loss_accum = 0.0, 0.0, 0.0, 0.0
                batches_since_log = 0
                valid_batches_accum = 0  # Reset valid batch count
    # --- End of Epoch ---
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS} completed.")
    # Optional: add end-of-epoch evaluation here. Checkpoints are written every
    # 5 epochs (at epochs 0, 5, 10, ..., saved under the 1-indexed epoch number).
    if epoch % 5 == 0:
        torch.save(model.state_dict(), f"model_regression_multi_stage_2_{epoch+1}.pth")

# --- End of Training ---
print("\nTraining completed!")
if wandb_enabled:
    wandb.finish()
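# Resuming from a checkpoint (illustrative; standard PyTorch state_dict loading,
# with the filename following the save pattern above, e.g. the epoch-1 file):
#   state = torch.load("model_regression_multi_stage_2_1.pth", map_location=DEVICE)
#   model.load_state_dict(state)
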
utils.py
ADDED
@@ -0,0 +1,57 @@
from constants import *
from transformers import AutoTokenizer
import torch
import numpy as np
from PIL import Image
from torchvision import transforms


def get_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    point_tokens = [f"coord_bin_{i}" for i in range(0, NUM_BINS)]
    new_tokens = [
        "<point_start>", "<point_end>", "<result_start>",
        "<result_end>", "<pointx_start>", "<pointx_end>",
        "<pointy_start>", "<pointy_end>",
        *point_tokens
    ]
    tokenizer.add_tokens(new_tokens)
    # Ensure a pad token is set (GPT-2 does not have one by default)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Or reuse eos_token if preferred
        # tokenizer.pad_token_id = tokenizer.eos_token_id  # Alternative: pad with EOS

    print(f"Tokenizer pad token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
    print(f"Tokenizer EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")

    # Check that the pad token ID is valid
    if tokenizer.pad_token_id is None:
        raise ValueError("Tokenizer pad token ID is not set!")

    return tokenizer, len(tokenizer)

def image_to_tensor(image, image_size=IMAGE_SIZE):
    if image.mode != 'RGB':
        image = image.convert('RGB')
    # No rotation/flip augmentation for now, to avoid having to recompute the
    # point coordinates after the transform; this can be added later.
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGE_MEAN, std=IMAGE_STD)
    ])
    return transform(image)

def tensor_to_image(tensor):
    tensor = tensor.clone().detach()
    if tensor.is_cuda:
        tensor = tensor.cpu()
    # Undo the normalization applied in image_to_tensor
    mean = torch.tensor(IMAGE_MEAN).view(3, 1, 1)
    std = torch.tensor(IMAGE_STD).view(3, 1, 1)
    tensor = tensor * std + mean
    tensor = torch.clamp(tensor, 0, 1)
    image_np = tensor.numpy().transpose(1, 2, 0)
    image_np = (image_np * 255).astype(np.uint8)
    return Image.fromarray(image_np)

tokenizer, vocab_size = get_tokenizer()  # Initialize tokenizer globally
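
# Illustrative usage (assumes this file's tokenizer; the exact target string
# format lives in dataset.py, so the sample below is hypothetical):
if __name__ == "__main__":
    sample = ("<result_start><point_start><pointx_start>coord_bin_3<pointx_end>"
              "<pointy_start>coord_bin_17<pointy_end><point_end><result_end>")
    ids = tokenizer.encode(sample, add_special_tokens=False)
    # Each added marker and coord_bin_* token maps to a single ID
    print(f"{len(ids)} tokens -> {ids}")
    print(tokenizer.decode(ids))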

vision_language_model.py
ADDED
@@ -0,0 +1,400 @@
from model_components import ViT, MultiModalProjector
from decoder_language_model import DecoderLanguageModel
from constants import *
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils import tokenizer, vocab_size


class VisionLanguageModel(nn.Module):
    """
    Vision-language model integrating ViT, projector, contrastive loss, and a
    decoder with classification and regression heads.
    Handles multiple points via padded regression targets and a masked loss.
    """
    def __init__(self,
                 n_embd=HIDDEN_DIM,
                 vocab_size=vocab_size,
                 img_size=IMAGE_SIZE,
                 patch_size=PATCH_SIZE,
                 num_heads=NUM_HEADS,
                 num_blks_vit=NUM_LAYERS,
                 num_blks_dec=NUM_LAYERS,
                 emb_dropout=DROPOUT,
                 blk_dropout=DROPOUT,
                 max_context=CONTEXT_LENGTH,
                 shared_embed_dim=SHARED_EMBED_DIM,
                 lambda_contrastive=LAMBDA_CONTRASTIVE,
                 lambda_regression=LAMBDA_REGRESSION,  # Use the updated constant
                 max_points=MAX_POINTS  # Maximum number of points per image
                 ):
        super().__init__()

        # --- Vision Backbone ---
        self.vision_encoder = ViT(
            img_size=img_size,
            patch_size=patch_size,
            num_hiddens=n_embd,  # Assumes ViT output dim matches the decoder embed dim
            num_heads=num_heads,
            num_blks=num_blks_vit,
            emb_dropout=emb_dropout,
            blk_dropout=blk_dropout
        )

        # --- Multimodal Components ---
        self.multimodal_projector = MultiModalProjector(
            image_embed_dim=n_embd,  # Input from ViT
            text_embed_dim=n_embd,   # Output matches decoder dim
            dropout=emb_dropout
        )
        self.image_contrastive_head = nn.Linear(n_embd, shared_embed_dim, bias=False)
        self.text_contrastive_head = nn.Linear(n_embd, shared_embed_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.log(torch.tensor(1 / 0.07)))

        # --- Text Decoder ---
        # DecoderLanguageModel has a regression head outputting MAX_POINTS * 2 values
        self.decoder = DecoderLanguageModel(
            n_embd=n_embd,
            vocab_size=vocab_size,
            num_heads=num_heads,
            n_layer=num_blks_dec,
            max_context=max_context,
            dropout=blk_dropout  # Use block dropout for decoder consistency
        )

        # --- Store Configuration ---
        self.n_embd = n_embd
        self.vocab_size = vocab_size
        self.num_patches = (img_size // patch_size) ** 2 + 1
        self.lambda_contrastive = lambda_contrastive
        self.lambda_regression = lambda_regression
        self.max_points = max_points

        self._resize_embeddings_if_needed(self.vocab_size)
        print("VisionLanguageModel initialized.")


    def _resize_embeddings_if_needed(self, current_vocab_size):
        """ Resizes decoder token embeddings if the vocab size changed after init. """
        decoder_embedding_size = self.decoder.token_embedding_table.num_embeddings
        if decoder_embedding_size != current_vocab_size:
            print(f"Resizing VLM decoder token embeddings from {decoder_embedding_size} to {current_vocab_size}")
            # Freeze original weights before replacing the layers
            self.decoder.token_embedding_table.weight.requires_grad = False
            self.decoder.lm_head.weight.requires_grad = False
            # Create new layers
            new_embedding = nn.Embedding(current_vocab_size, self.n_embd).to(DEVICE)
            new_lm_head = nn.Linear(self.n_embd, current_vocab_size, bias=False).to(DEVICE)
            # Assign new layers
            self.decoder.token_embedding_table = new_embedding
            self.decoder.lm_head = new_lm_head
            # Re-tie weights
            self.decoder.token_embedding_table.weight = self.decoder.lm_head.weight
            print("VLM decoder embeddings resized and weights retied.")

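    # Note (illustrative check): after retying, the embedding table and lm_head
    # share a single weight tensor, so
    #   model.decoder.lm_head.weight is model.decoder.token_embedding_table.weight
    # should hold, and a gradient update to either layer updates both.
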
    def _calculate_contrastive_loss(self, image_features, text_features):
        """ Calculates the symmetric InfoNCE loss. """
        # Assumes features are already projected to shared_embed_dim:
        # image_features: (B, E), text_features: (B, E)

        # Normalize features
        image_features = F.normalize(image_features, dim=-1)
        text_features = F.normalize(text_features, dim=-1)

        # Cosine similarity as logits (scaled by the learnable temperature)
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        # Calculate the symmetric cross-entropy loss
        labels = torch.arange(len(logits_per_image), device=logits_per_image.device)
        loss_i = F.cross_entropy(logits_per_image, labels)
        loss_t = F.cross_entropy(logits_per_text, labels)
        contrastive_loss = (loss_i + loss_t) / 2.0

        # Handle potential NaNs
        if torch.isnan(contrastive_loss):
            print("Warning: Contrastive loss is NaN.")
            return None  # The caller treats None as "skip this loss term"

        return contrastive_loss

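    # Shape walk-through (illustrative): with a batch of B = 4 image-prompt pairs,
    # logits_per_image is (4, 4) and labels = [0, 1, 2, 3] marks the diagonal
    # (image i with text i) as the positive pair, so each row is a 4-way
    # classification over the in-batch texts; averaging with the transposed
    # direction keeps image-to-text and text-to-image symmetric.
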
    def forward(self,
                img_array,
                prompt_ids,
                prompt_attention_mask,
                target_ids,
                target_attention_mask,
                generative_targets=None,
                continuous_coords=None,  # Expects shape (B, MAX_POINTS, 2), padded
                coords_mask=None         # Mask of valid points, shape (B, MAX_POINTS)
                ):
        """
        Main forward pass for training. Calculates the combined loss, with a
        masked regression loss over the valid points.
        """

        # --- 1. Encode Image ---
        image_embeds_raw = self.vision_encoder(img_array)  # (B, N_img, C)
        B, N_img, C_img = image_embeds_raw.shape
        img_cls_token = image_embeds_raw[:, 0]

        # --- 2. Contrastive Loss Path ---
        image_features_contrast = self.image_contrastive_head(img_cls_token)
        with torch.no_grad():  # no_grad for efficiency; prompt embeddings are not trained via the contrastive path
            prompt_text_embeds_contrast = self.decoder.token_embedding_table(prompt_ids)
            prompt_lengths = prompt_attention_mask.sum(dim=1)
            last_token_indices = (prompt_lengths - 1).clamp(min=0)
            gather_indices = last_token_indices.view(B, 1, 1).expand(-1, -1, C_img)
            prompt_last_token_embed = prompt_text_embeds_contrast.gather(1, gather_indices).squeeze(1)
        text_features_contrast = self.text_contrastive_head(prompt_last_token_embed)
        contrastive_loss = self._calculate_contrastive_loss(image_features_contrast, text_features_contrast)

        # --- 3. Generative / Regression Path ---
        image_embeds_decoder = self.multimodal_projector(image_embeds_raw)
        prompt_embeds_decoder = self.decoder.token_embedding_table(prompt_ids)
        target_embeds_decoder = self.decoder.token_embedding_table(target_ids)
        B, T_prompt, C = prompt_embeds_decoder.shape
        B, T_target, _ = target_embeds_decoder.shape

        # Prepare the combined input sequence and attention mask for the decoder
        combined_embeds = torch.cat([
            image_embeds_decoder, prompt_embeds_decoder, target_embeds_decoder
        ], dim=1)
        combined_attention_mask = torch.cat([
            torch.ones(B, N_img, dtype=torch.long, device=DEVICE),
            prompt_attention_mask,
            target_attention_mask
        ], dim=1)
        T_combined = combined_embeds.shape[1]

        # Prepare combined targets for the classification loss
        combined_class_targets = None
        if generative_targets is not None:
            combined_class_targets = torch.cat([
                torch.full((B, N_img + T_prompt), -100, dtype=torch.long, device=DEVICE),
                generative_targets
            ], dim=1)

        # --- Pass through Decoder ---
        logits, class_loss, x_norm = self.decoder(
            combined_embeds,
            attention_mask=combined_attention_mask,
            targets=combined_class_targets
        )
        # x_norm shape: (B, T_combined, C)

        # --- Calculate Regression Output & Loss (handles multiple points) ---
        regression_loss = None
        regression_output = None
        num_valid_coords = None  # Defensive init so the warning below cannot hit a NameError
        if continuous_coords is not None and coords_mask is not None and x_norm is not None:
            # Strategy: use the hidden state of the token *before* <result_end> (or <eos>).
            # This single state predicts the coordinates for *all* MAX_POINTS.
            target_lengths = target_attention_mask.sum(dim=1)  # Length of actual target tokens (B,)
            # Index relative to the start of the *target sequence*: length - 2 (token before <eos>/<result_end>)
            relative_target_idx = (target_lengths - 2).clamp(min=0)
            # Absolute index into the combined sequence's hidden states (x_norm)
            absolute_idx = N_img + T_prompt + relative_target_idx
            absolute_idx = absolute_idx.clamp(max=T_combined - 1)  # Clamp index

            # Gather the hidden states at these specific indices
            gather_indices_reg = absolute_idx.view(B, 1, 1).expand(-1, -1, C)
            try:
                hidden_state_for_regression = x_norm.gather(1, gather_indices_reg).squeeze(1)  # (B, C)
                # Pass through the regression head
                regression_output_flat = self.decoder.regression_head(hidden_state_for_regression)  # (B, MAX_POINTS * 2)
                # Reshape to (B, MAX_POINTS, 2)
                regression_output = regression_output_flat.view(B, self.max_points, 2)

                # --- Calculate MASKED regression loss (L1, mean absolute error) ---
                loss_per_coord = F.l1_loss(regression_output, continuous_coords, reduction='none')  # (B, MAX_POINTS, 2)
                # Apply the mask ((B, MAX_POINTS), broadcast to (B, MAX_POINTS, 2))
                masked_loss = loss_per_coord * coords_mask.unsqueeze(-1)
                # Sum over valid points and coordinates, divide by the number of valid coordinates
                num_valid_coords = coords_mask.sum() * 2  # Total number of valid x, y values in the batch
                if num_valid_coords > 0:
                    regression_loss = masked_loss.sum() / num_valid_coords
                else:
                    regression_loss = torch.tensor(0.0, device=DEVICE)  # No valid points in the batch

                if torch.isnan(regression_loss):
                    print("Warning: Regression loss is NaN.")
                    regression_loss = torch.tensor(0.0, device=DEVICE, requires_grad=True)  # Replace NaN with a zero tensor

            except Exception as e:
                print(f"Error during regression calculation: {e}")
                print(f"x_norm shape: {x_norm.shape}, absolute_idx: {absolute_idx}")
                regression_loss = None
                regression_output = None  # Ensure output is None if an error occurs

        # --- 4. Combine All Losses ---
        # Starts as a plain zero tensor; adding any valid loss below makes it differentiable
        total_loss = torch.tensor(0.0, device=DEVICE)
        # Add valid losses with their respective weights
        loss_log = {}
        if class_loss is not None and torch.isfinite(class_loss):
            total_loss += class_loss  # Weight of 1.0 assumed
            loss_log["class_loss"] = class_loss.item()
        else:
            # If class_loss is None or NaN/Inf, don't add it; log NaN
            loss_log["class_loss"] = float('nan')
            print(f"Warning: Invalid class_loss ({class_loss})")

        if contrastive_loss is not None and torch.isfinite(contrastive_loss):
            total_loss += self.lambda_contrastive * contrastive_loss
            loss_log["contrastive_loss"] = contrastive_loss.item()
        else:
            loss_log["contrastive_loss"] = float('nan')
            print(f"Warning: Invalid contrastive_loss ({contrastive_loss})")

        if regression_loss is not None and torch.isfinite(regression_loss):
            total_loss += self.lambda_regression * regression_loss
            loss_log["regression_loss"] = regression_loss.item()
        else:
            loss_log["regression_loss"] = float('nan')
            # Don't warn if the loss was intentionally set to 0 because no points were valid
            if regression_loss is not None and not (regression_loss == 0.0 and num_valid_coords == 0):
                print(f"Warning: Invalid regression_loss ({regression_loss})")

        # Handle the case where the total loss becomes NaN/Inf
        if not torch.isfinite(total_loss):
            print(f"Warning: Total loss became non-finite ({total_loss}). Setting to zero; the training loop also skips the optimizer step for invalid losses.")
            total_loss = torch.tensor(0.0, device=DEVICE, requires_grad=True)

        # Use the loss_log dictionary for clearer logging later
        class_loss_val = loss_log["class_loss"]
        contrastive_loss_val = loss_log["contrastive_loss"]
        regression_loss_val = loss_log["regression_loss"]

        # Return all relevant outputs (detached scalar tensors for loss logging)
        return logits, regression_output, total_loss, \
               torch.tensor(class_loss_val), torch.tensor(contrastive_loss_val), torch.tensor(regression_loss_val)

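    # Masked-L1 walk-through (illustrative): with MAX_POINTS = 2 and a sample that
    # has one real point, coords_mask = [[1., 0.]]; loss_per_coord is (1, 2, 2),
    # multiplying by the unsqueezed mask zeroes the padded point's two entries,
    # and num_valid_coords = 1 * 2 = 2, so the mean is taken over the real (x, y)
    # values only; padded points contribute nothing to the gradient.
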
    # --- Generation Method ---
    @torch.no_grad()  # Ensure no gradients are computed during generation
    def generate(self, img_array, idx_prompt, max_new_tokens,
                 temperature=1.0, top_k=None,
                 force_result_start=True  # Option to manually add <result_start>
                 ):
        """
        Generates token sequences autoregressively from an image and a prompt,
        using the classification head (lm_head).

        Args:
            img_array (torch.Tensor): Input image tensor (B, 3, H, W). B should be 1 for this implementation.
            idx_prompt (torch.Tensor): Input prompt token IDs (B, T_prompt).
            max_new_tokens (int): Maximum number of new tokens to generate.
            temperature (float): Softmax temperature. 1.0 leaves the logits unchanged; lower values
                sharpen the distribution; 0.0 selects greedy decoding.
            top_k (int | None): If set, restricts sampling to the top K most likely tokens.
            force_result_start (bool): If True, appends the <result_start> embedding
                after the prompt before starting the generation loop.

        Returns:
            torch.Tensor: Generated sequence IDs, including the prompt (B, T_prompt + T_generated).
        """
        self.eval()  # Ensure model is in eval mode
        B = img_array.shape[0]
        if B > 1:
            # This simplified generation loop assumes B=1 for clarity; batched
            # generation would need careful handling of EOS and padding within the loop
            print("Warning: Generation function currently assumes batch size B=1.")
            # Process only the first item for now
            img_array = img_array[:1]
            idx_prompt = idx_prompt[:1]
            B = 1

        # --- 1. Prepare Initial Embeddings ---
        image_embeds_raw = self.vision_encoder(img_array)
        image_embeds_decoder = self.multimodal_projector(image_embeds_raw)
        prompt_embeds_decoder = self.decoder.token_embedding_table(idx_prompt)

        # Initial sequence for the decoder loop
        current_embeds = torch.cat([image_embeds_decoder, prompt_embeds_decoder], dim=1)
        generated_ids_list = []  # Newly generated IDs, collected as a list

        # Manually add <result_start> if forced
        if force_result_start:
            try:
                result_start_token_id = tokenizer.encode("<result_start>", add_special_tokens=False)[0]
                result_start_embed = self.decoder.token_embedding_table(
                    torch.tensor([[result_start_token_id]], device=DEVICE)
                )
                current_embeds = torch.cat([current_embeds, result_start_embed], dim=1)
                # Also store this token ID, since we added it to the sequence
                generated_ids_list.append(torch.tensor([[result_start_token_id]], device=DEVICE))
            except Exception as e:
                print(f"Warning: Could not encode or add <result_start>: {e}")

        # --- 2. Autoregressive Loop ---
        for _ in range(max_new_tokens):
            T_current = current_embeds.shape[1]

            # Context truncation
            if T_current > self.decoder.max_context:
                current_embeds = current_embeds[:, -self.decoder.max_context:, :]
                T_current = self.decoder.max_context

            # Prepare inputs for the decoder blocks
            pos = torch.arange(0, T_current, dtype=torch.long, device=DEVICE)
            pos = pos.clamp(max=self.decoder.max_context - 1)
            pos_emb = self.decoder.position_embedding_table(pos).unsqueeze(0)
            x = current_embeds + pos_emb
            attention_mask = torch.ones(B, T_current, device=DEVICE, dtype=torch.long)  # No padding needed

            # Pass through decoder blocks
            for block in self.decoder.blocks:
                x = block(x, attention_mask=attention_mask)

            # Get logits for the last token
            x = self.decoder.ln_f(x[:, -1:, :])  # (B, 1, C)
            logits = self.decoder.lm_head(x)  # (B, 1, V)
            logits = logits.squeeze(1)  # (B, V)
            if temperature > 0:
                # Apply temperature; dividing by 0 would produce NaNs, so
                # temperature=0 skips this and decodes greedily below
                logits = logits / temperature

            # --- Sampling / Decoding ---
            # Optional: top-k filtering
            if top_k is not None and top_k > 0:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')  # Mask everything below the k-th logit

            # Get probabilities
            probs = F.softmax(logits, dim=-1)

            # Sample the next token ID (greedy when temperature is 0 or top_k == 1)
            if temperature == 0.0 or top_k == 1:
                idx_next = torch.argmax(probs, dim=-1, keepdim=True)
            else:
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the generated token ID
            generated_ids_list.append(idx_next)

            # Stop if EOS is generated
            if hasattr(tokenizer, 'eos_token_id') and idx_next.item() == tokenizer.eos_token_id:
                break

            # Prepare for the next iteration
            next_token_embed = self.decoder.token_embedding_table(idx_next)
            current_embeds = torch.cat([current_embeds, next_token_embed], dim=1)

        # --- 3. Combine results ---
        if generated_ids_list:
            generated_ids_tensor = torch.cat(generated_ids_list, dim=1)  # (B, T_generated)
            full_sequence_ids = torch.cat([idx_prompt, generated_ids_tensor], dim=1)
        else:
            full_sequence_ids = idx_prompt  # Return only the prompt if nothing was generated

        self.train()  # Set model back to training mode
        return full_sequence_ids
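
if __name__ == "__main__":
    # Minimal smoke test (illustrative; assumes model_components and the decoder
    # match the shapes configured above). The prompt text is hypothetical.
    model = VisionLanguageModel().to(DEVICE).to(DTYPE)
    dummy_img = torch.randn(1, 3, IMAGE_SIZE, IMAGE_SIZE, device=DEVICE, dtype=DTYPE)
    prompt = torch.tensor([tokenizer.encode("point to the red mug", add_special_tokens=False)], device=DEVICE)
    out_ids = model.generate(dummy_img, prompt, max_new_tokens=8, temperature=0.0)  # Greedy decoding
    print(tokenizer.decode(out_ids[0].tolist()))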