#!/usr/bin/env python3
"""
CSIRO Image2Biomass Prediction - Kaggle Training Notebook
==========================================================

This is a self-contained training notebook for Kaggle.
Run this on Kaggle GPU to train models, then use the inference notebook to submit.

Key features:
- DINOv2-Base + ConvNeXt-Large ensemble
- 5-fold stratified CV
- Log-transformed targets
- Label Distribution Smoothing
- Weighted SmoothL1 loss + consistency regularization
- Heavy D4 augmentations for pasture images
- Gradient checkpointing for memory efficiency
"""

# ============================================================
# 1. Setup & Configuration
# ============================================================

import os
import sys
import json
import time
import random
import subprocess
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.cuda.amp import GradScaler, autocast
from PIL import Image
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')

# Install dependencies.
# `sys.executable -m pip` guarantees the packages land in the interpreter that
# is running this script (a bare `pip` on PATH may belong to another Python).
# check=False keeps the install best-effort: if it fails (e.g. no internet),
# the imports below decide whether the run can continue.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-q', 'timm', 'albumentations'],
    check=False,
)

import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2

# ============================================================
# 2.
# Configuration
# ============================================================

class CFG:
    """Class-level settings for the whole training run.

    The class is used purely as a namespace: attributes are read as
    ``CFG.NAME`` and the class itself is never instantiated.
    """

    # Paths - adjust for your Kaggle setup
    COMPETITION = 'csiro-biomass'
    DATA_DIR = Path(f'/kaggle/input/{COMPETITION}')
    OUTPUT_DIR = Path('/kaggle/working')

    # Model
    BACKBONE = 'vit_base_patch14_dinov2.lvd142m'  # DINOv2-Base
    # BACKBONE = 'convnext_large.fb_in22k_ft_in1k'  # ConvNeXt-Large
    IMG_SIZE = 224
    HIDDEN_DIM = 512
    DROPOUT = 0.3
    SEPARATE_HEADS = False
    GRAD_CHECKPOINTING = True

    # Training
    EPOCHS = 30
    BATCH_SIZE = 16  # Adjust based on GPU memory
    BACKBONE_LR = 3e-5
    HEAD_LR = 1e-3
    MIN_LR = 1e-7
    WEIGHT_DECAY = 1e-2
    WARMUP_RATIO = 0.05
    MAX_GRAD_NORM = 1.0
    GRAD_ACCUM = 2  # Effective batch = BATCH_SIZE * GRAD_ACCUM
    PATIENCE = 8

    # Augmentation
    AUG_STRENGTH = 'medium'
    LOG_TRANSFORM = True

    # LDS
    USE_LDS = True
    LDS_BINS = 100
    LDS_KERNEL_SIZE = 5
    LDS_SIGMA = 2.0

    # Loss
    MSE_WEIGHT = 0.0
    CONSISTENCY_WEIGHT = 0.1

    # CV
    N_FOLDS = 5
    TRAIN_FOLDS = [0, 1, 2, 3, 4]  # Which folds to train

    # Misc
    SEED = 42
    NUM_WORKERS = 2
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


# Target configuration: column order here fixes the order of the model outputs,
# and TARGET_WEIGHTS[i] is the loss/metric weight of TARGET_COLS[i].
TARGET_COLS = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']
TARGET_WEIGHTS = [0.1, 0.1, 0.1, 0.2, 0.5]

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and, if present, CUDA) RNGs."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(CFG.SEED)

print(f"Device: {CFG.DEVICE}")
if CFG.DEVICE == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# ============================================================
# 3.
# Data Loading & EDA
# ============================================================

def _first_existing(paths):
    """Return the first path in *paths* that exists on disk, or None."""
    return next((p for p in paths if p.exists()), None)


# Auto-detect data directory structure
if not CFG.DATA_DIR.exists():
    # Try alternative paths
    _fallback_dir = _first_existing(
        Path(p) for p in (
            '/kaggle/input/csiro-image2biomass-prediction',
            '/kaggle/input/csiro-biomass',
            '/kaggle/input/csiro-image2biomass',
            './data',
        )
    )
    if _fallback_dir is not None:
        CFG.DATA_DIR = _fallback_dir

print(f"Data directory: {CFG.DATA_DIR}")
_dir_listing = os.listdir(CFG.DATA_DIR) if CFG.DATA_DIR.exists() else 'NOT FOUND'
print(f"Files: {_dir_listing}")

# Load CSVs (either capitalisation of the file name is accepted)
train_csv = _first_existing(CFG.DATA_DIR / name for name in ('train.csv', 'Train.csv'))
test_csv = _first_existing(CFG.DATA_DIR / name for name in ('test.csv', 'Test.csv'))

train_df = None if train_csv is None else pd.read_csv(train_csv)
test_df = None if test_csv is None else pd.read_csv(test_csv)

# Find image directories
train_img_dir = _first_existing(
    CFG.DATA_DIR / sub for sub in ('train_images', 'train', 'images/train'))
test_img_dir = _first_existing(
    CFG.DATA_DIR / sub for sub in ('test_images', 'test', 'images/test'))

if train_df is not None:
    print(f"\nTrain shape: {train_df.shape}")
    print(f"Train columns: {list(train_df.columns)}")
    print(f"\nTarget statistics:")
    # Only report targets that are actually present in the CSV.
    for col in (c for c in TARGET_COLS if c in train_df.columns):
        s = train_df[col]
        print(f" {col}: mean={s.mean():.2f}, median={s.median():.2f}, "
              f"std={s.std():.2f}, min={s.min():.2f}, max={s.max():.2f}, "
              f"zeros={100*(s==0).mean():.1f}%")

if test_df is not None:
    print(f"\nTest shape: {test_df.shape}")
    print(f"Test columns: {list(test_df.columns)}")

# ============================================================
# 4.
# Dataset & Augmentations
# ============================================================

class BiomassDataset(Dataset):
    """Image dataset yielding dicts with the image tensor and optional targets.

    Args:
        image_dir: Directory that holds the image files.
        df: DataFrame with one row per sample. The image id is read from the
            ``image_id`` column when present, otherwise from the row label
            (after the index has been reset to 0..n-1).
        targets: Optional DataFrame containing ``TARGET_COLS``; it is indexed
            positionally, so it must be in the same row order as ``df``.
        transform: Optional albumentations-style callable invoked as
            ``transform(image=ndarray)['image']``.
        log_transform: If True, targets are returned as ``log1p(value)``.
        use_ndvi: If True and ``df`` has an ``NDVI`` column, the value is
            returned under the ``'ndvi'`` key.
    """

    # Extensions probed, in order, when looking for ``<image_id><ext>``.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG')

    def __init__(self, image_dir, df, targets=None, transform=None,
                 log_transform=True, use_ndvi=False):
        self.image_dir = Path(image_dir)
        self.df = df.reset_index(drop=True)
        self.targets = targets
        self.transform = transform
        self.log_transform = log_transform
        self.use_ndvi = use_ndvi

    def __len__(self):
        return len(self.df)

    def _resolve_image_path(self, img_id):
        """Return the file path for *img_id*.

        Tries the known extensions first, then any file whose name starts with
        the id. If nothing is found, ``<image_dir>/<img_id>.jpg`` is returned so
        that opening it raises a FileNotFoundError naming the expected file.
        """
        for ext in self._IMAGE_EXTENSIONS:
            candidate = self.image_dir / f"{img_id}{ext}"
            if candidate.exists():
                return candidate
        # Sorted so the chosen file does not depend on directory listing order.
        matches = sorted(self.image_dir.glob(f"{img_id}*"))
        return matches[0] if matches else self.image_dir / f"{img_id}.jpg"

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_id = row['image_id'] if 'image_id' in row.index else row.name

        img_path = self._resolve_image_path(img_id)
        # Context manager closes the underlying file handle deterministically
        # instead of leaving it to garbage collection in every worker.
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img.convert('RGB'))

        if self.transform:
            img_tensor = self.transform(image=img)['image']
        else:
            # HWC uint8 -> CHW float in [0, 1]
            img_tensor = torch.tensor(img.transpose(2, 0, 1), dtype=torch.float32) / 255.0

        result = {'image': img_tensor, 'image_id': str(img_id)}

        if self.use_ndvi and 'NDVI' in self.df.columns:
            result['ndvi'] = torch.tensor(float(row['NDVI']), dtype=torch.float32)

        if self.targets is not None:
            target_values = self.targets.iloc[idx][TARGET_COLS].values.astype(np.float32)
            if self.log_transform:
                target_values = np.log1p(target_values)
            result['targets'] = torch.tensor(target_values, dtype=torch.float32)

        return result


def _hole_range(img_size, low_frac, high_frac):
    """Pixel (min, max) hole size for CoarseDropout as fractions of *img_size*."""
    return (int(img_size * low_frac), int(img_size * high_frac))


def get_transforms(img_size, is_train=True, aug_strength='medium'):
    """Build the albumentations pipeline for training or evaluation.

    Args:
        img_size: Side length of the square output image.
        is_train: If False, return the deterministic resize + center-crop
            pipeline and ignore ``aug_strength``.
        aug_strength: ``'light'`` or ``'medium'``; any other value selects the
            heavy pipeline.

    Returns:
        An ``A.Compose`` ending in ImageNet normalisation and ``ToTensorV2``.
    """
    # Every pipeline ends with the same normalisation + tensor conversion.
    finalize = [A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD), ToTensorV2()]

    if not is_train:
        resized = int(img_size * 1.14)
        return A.Compose([
            A.Resize(height=resized, width=resized),
            A.CenterCrop(height=img_size, width=img_size),
            *finalize,
        ])

    flips = [A.HorizontalFlip(p=0.5), A.VerticalFlip(p=0.5)]

    if aug_strength == 'light':
        return A.Compose([
            A.RandomResizedCrop(size=(img_size, img_size), scale=(0.7, 1.0)),
            *flips,
            *finalize,
        ])

    # Medium and heavy both add 90-degree rotations and a transpose on top of
    # the flips.
    rotations = [A.RandomRotate90(p=0.5), A.Transpose(p=0.5)]

    if aug_strength == 'medium':
        return A.Compose([
            A.RandomResizedCrop(size=(img_size, img_size), scale=(0.5, 1.0)),
            *flips,
            *rotations,
            A.RandomBrightnessContrast(0.2, 0.2, p=0.5),
            A.HueSaturationValue(15, 25, 15, p=0.4),
            A.OneOf([A.GaussianBlur(blur_limit=(3, 5)),
                     A.MotionBlur(blur_limit=5)], p=0.15),
            A.CoarseDropout(num_holes_range=(1, 4),
                            hole_height_range=_hole_range(img_size, 0.05, 0.15),
                            hole_width_range=_hole_range(img_size, 0.05, 0.15),
                            p=0.2),
            *finalize,
        ])

    # heavy
    return A.Compose([
        A.RandomResizedCrop(size=(img_size, img_size), scale=(0.4, 1.0)),
        *flips,
        *rotations,
        A.RandomBrightnessContrast(0.3, 0.3, p=0.7),
        A.HueSaturationValue(20, 30, 20, p=0.5),
        A.RandomGamma((80, 120), p=0.3),
        A.OneOf([A.GaussianBlur((3, 7)),
                 A.MotionBlur(blur_limit=7)], p=0.2),
        A.CoarseDropout(num_holes_range=(1, 8),
                        hole_height_range=_hole_range(img_size, 0.05, 0.2),
                        hole_width_range=_hole_range(img_size, 0.05, 0.2),
                        p=0.3),
        *finalize,
    ])

# ============================================================
# 5.
# Model
# ============================================================

class BiomassModel(nn.Module):
    """timm backbone followed by an MLP regression head (or one head per target).

    Args:
        backbone_name: Model name passed to ``timm.create_model``.
        num_targets: Number of regression outputs.
        hidden_dim: Width of the first hidden layer of the head(s).
        dropout: Base dropout rate; later head layers use scaled-down rates.
        pretrained: Forwarded to ``timm.create_model``.
        img_size: Forwarded to timm only when the backbone name contains
            ``'vit'`` or ``'dinov2'``.
        use_ndvi: If True, a scalar NDVI value is embedded to 64 features and
            concatenated to the backbone features.
        separate_heads: If True, build one small head per target instead of a
            single shared head.
        grad_checkpointing: Enable gradient checkpointing when the backbone
            exposes ``set_grad_checkpointing``.
    """

    def __init__(self, backbone_name, num_targets=5, hidden_dim=512, dropout=0.3,
                 pretrained=True, img_size=224, use_ndvi=False,
                 separate_heads=False, grad_checkpointing=False):
        super().__init__()
        self.use_ndvi = use_ndvi
        self.separate_heads = separate_heads

        # num_classes=0 asks timm for pooled features without a classifier.
        kwargs = {'pretrained': pretrained, 'num_classes': 0}
        if 'vit' in backbone_name or 'dinov2' in backbone_name:
            kwargs['img_size'] = img_size

        self.backbone = timm.create_model(backbone_name, **kwargs)
        feat_dim = self.backbone.num_features

        if grad_checkpointing and hasattr(self.backbone, 'set_grad_checkpointing'):
            self.backbone.set_grad_checkpointing(True)

        if use_ndvi:
            self.ndvi_embed = nn.Sequential(nn.Linear(1, 32), nn.GELU(), nn.Linear(32, 64))
            feat_dim += 64

        # NOTE: layer order inside the Sequentials determines state_dict keys;
        # keep it stable so saved checkpoints remain loadable.
        if separate_heads:
            self.heads = nn.ModuleList([
                nn.Sequential(
                    nn.LayerNorm(feat_dim),
                    nn.Dropout(dropout),
                    nn.Linear(feat_dim, hidden_dim),
                    nn.GELU(),
                    nn.Dropout(dropout * 0.5),
                    nn.Linear(hidden_dim, 1),
                )
                for _ in range(num_targets)
            ])
        else:
            self.head = nn.Sequential(
                nn.LayerNorm(feat_dim),
                nn.Dropout(dropout),
                nn.Linear(feat_dim, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout * 0.5),
                nn.Linear(hidden_dim, hidden_dim // 2),
                nn.GELU(),
                nn.Dropout(dropout * 0.3),
                nn.Linear(hidden_dim // 2, num_targets),
            )

    def forward(self, x, ndvi=None):
        """Return predictions of shape ``(batch, num_targets)``.

        Args:
            x: Image batch fed to the backbone.
            ndvi: Per-sample NDVI values, one scalar per sample; required when
                the model was built with ``use_ndvi=True``.

        Raises:
            ValueError: If the model uses NDVI but ``ndvi`` is None. Without
                this check the head would fail later with an opaque
                feature-size mismatch.
        """
        features = self.backbone(x)
        if self.use_ndvi:
            if ndvi is None:
                raise ValueError(
                    "BiomassModel was built with use_ndvi=True but forward() "
                    "received ndvi=None"
                )
            features = torch.cat([features, self.ndvi_embed(ndvi.unsqueeze(-1))], dim=-1)

        if self.separate_heads:
            return torch.cat([h(features) for h in self.heads], dim=-1)
        return self.head(features)

    def get_param_groups(self, backbone_lr, head_lr):
        """Return optimizer param groups: backbone at one LR, everything else at another."""
        return [
            {'params': self.backbone.parameters(), 'lr': backbone_lr},
            {'params': [p for n, p in self.named_parameters() if 'backbone' not in n],
             'lr': head_lr},
        ]

# ============================================================
# 6.
# Loss Functions
# ============================================================

class WeightedSmoothL1Loss(nn.Module):
    """SmoothL1 loss with a fixed weight per target column.

    Args:
        target_weights: One weight per output column (list, tuple, array or
            tensor). Defaults to the module-level ``TARGET_WEIGHTS``.
        beta: SmoothL1 transition point, forwarded to ``F.smooth_l1_loss``.
    """

    def __init__(self, target_weights=None, beta=1.0):
        super().__init__()
        self.beta = beta
        # `is None` (not `or`) so array/tensor weights do not trigger an
        # ambiguous-truth-value error.
        w = TARGET_WEIGHTS if target_weights is None else target_weights
        self.register_buffer(
            'weights', torch.as_tensor(w, dtype=torch.float32).detach().clone())

    def forward(self, pred, target):
        """Return the mean over all elements of the column-weighted SmoothL1 loss."""
        loss = F.smooth_l1_loss(pred, target, beta=self.beta, reduction='none')
        return (loss * self.weights.unsqueeze(0)).mean()


class CombinedLoss(nn.Module):
    """Weighted SmoothL1 plus optional weighted MSE and a consistency penalty.

    The consistency penalty ties the sum of output columns 0, 1 and 2 to output
    column 4 (with ``TARGET_COLS`` ordering: the three components vs. the total).

    Args:
        mse_weight: Multiplier of the weighted MSE term (0 disables it).
        consistency_weight: Multiplier of the consistency term (0 disables it).
        target_weights: Per-column weights; defaults to ``TARGET_WEIGHTS``.
        log_space: Whether predictions are in ``log1p`` space. ``None`` (the
            default) follows ``CFG.LOG_TRANSFORM``, which is what the dataset
            uses to encode the targets.
    """

    def __init__(self, mse_weight=0.0, consistency_weight=0.1,
                 target_weights=None, log_space=None):
        super().__init__()
        self.smoothl1 = WeightedSmoothL1Loss(target_weights)
        self.mse_weight = mse_weight
        self.consistency_weight = consistency_weight
        if log_space is None:
            log_space = CFG.LOG_TRANSFORM
        self.log_space = bool(log_space)

    def forward(self, pred, target):
        """Return the scalar training loss for a batch of predictions."""
        loss = self.smoothl1(pred, target)

        if self.mse_weight > 0:
            # Reuse the registered buffer instead of rebuilding the weight
            # tensor on every call.
            mse = ((pred - target) ** 2 * self.smoothl1.weights.unsqueeze(0)).mean()
            loss = loss + self.mse_weight * mse

        if self.consistency_weight > 0:
            if self.log_space:
                # log1p(a) + log1p(b) + log1p(c) != log1p(a + b + c), so the
                # components must be mapped back to the original scale before
                # summing; the sum is then re-encoded for comparison with the
                # log-space total. float() keeps expm1 out of fp16 range
                # limits; the clamp guards log1p against negative sums.
                components = torch.expm1(pred[:, :3].float()).clamp_min(0.0)
                comp_sum = torch.log1p(components.sum(dim=1))
                total = pred[:, 4].float()
            else:
                comp_sum = pred[:, 0] + pred[:, 1] + pred[:, 2]
                total = pred[:, 4]
            loss = loss + self.consistency_weight * F.mse_loss(comp_sum, total)

        return loss


# ============================================================
# 7. LDS Weights
# ============================================================

def get_lds_weights(labels, bins=100, kernel_size=5, sigma=2.0):
    """Return per-sample weights inversely proportional to smoothed label density.

    Args:
        labels: 1-D array of labels, or a 2-D array whose last column is used
            (Dry_Total_g when columns follow ``TARGET_COLS``).
        bins: Number of histogram bins over the label range.
        kernel_size: Number of taps of the Gaussian smoothing kernel.
        sigma: Standard deviation used to build the kernel.

    Returns:
        Array of positive weights, normalised to have mean 1.
    """
    from scipy.ndimage import convolve1d

    labels = np.asarray(labels)
    if labels.ndim > 1:
        labels = labels[:, -1]  # Dry_Total_g

    hist, edges = np.histogram(labels, bins=bins)

    # Gaussian kernel sampled on [-3, 3] and normalised to sum to 1.
    kernel = np.exp(-np.linspace(-3, 3, kernel_size) ** 2 / (2 * sigma ** 2))
    kernel /= kernel.sum()
    smoothed = convolve1d(hist.astype(float), kernel, mode='reflect')

    # Look up each label's smoothed density at the bin centres; the epsilon
    # avoids division by zero in empty regions.
    centers = (edges[:-1] + edges[1:]) / 2
    weights = 1.0 / (np.interp(labels, centers, smoothed) + 1e-8)
    return weights / weights.mean()

# ============================================================
# 8.
# Metrics
# ============================================================

def compute_weighted_r2(preds, targets, weights=None):
    """Return a single R² over all targets, weighting each column's samples.

    Args:
        preds: Array of shape ``(n_samples, n_targets)``.
        targets: Array with the same shape as ``preds``.
        weights: One weight per target column; defaults to ``TARGET_WEIGHTS``.

    Returns:
        ``1 - SS_res / SS_tot`` as a Python float, where both sums are weighted
        and the total sum of squares is taken around the weighted global mean.
    """
    if weights is None:
        weights = TARGET_WEIGHTS

    # float64 throughout so half/single precision inputs cannot overflow or
    # lose precision inside the sums.
    preds = np.asarray(preds, dtype=np.float64)
    targets = np.asarray(targets, dtype=np.float64)
    col_w = np.asarray(weights, dtype=np.float64)[:preds.shape[1]]
    w = np.broadcast_to(col_w, targets.shape)

    wmean = np.sum(w * targets) / np.sum(w)
    ss_res = np.sum(w * (targets - preds) ** 2)
    ss_tot = np.sum(w * (targets - wmean) ** 2)
    return float(1.0 - ss_res / (ss_tot + 1e-8))


def compute_per_target_r2(preds, targets):
    """Return ``{target_name: unweighted R²}`` for every column in ``TARGET_COLS``."""
    preds = np.asarray(preds, dtype=np.float64)
    targets = np.asarray(targets, dtype=np.float64)
    results = {}
    for i, name in enumerate(TARGET_COLS):
        ss_res = np.sum((targets[:, i] - preds[:, i]) ** 2)
        ss_tot = np.sum((targets[:, i] - targets[:, i].mean()) ** 2)
        results[name] = float(1.0 - ss_res / (ss_tot + 1e-8))
    return results


# ============================================================
# 9. Training Loop
# ============================================================

def _batch_to_device(batch, device):
    """Move the image, target and (optional) NDVI tensors of a batch to *device*."""
    images = batch['image'].to(device)
    targets = batch['targets'].to(device)
    ndvi = batch.get('ndvi', None)
    if ndvi is not None:
        ndvi = ndvi.to(device)
    return images, targets, ndvi


def train_one_fold(fold, train_df, val_df, train_targets, val_targets, train_img_dir):
    """Train one CV fold and save the best checkpoint.

    Args:
        fold: Fold index, used for logging and the output sub-directory.
        train_df / val_df: Sample tables for the two splits.
        train_targets / val_targets: Target tables aligned row-by-row with the
            corresponding sample tables.
        train_img_dir: Directory containing the images of both splits.

    Returns:
        The best validation weighted R² reached (computed on the original
        target scale, with predictions clipped at zero).

    Side effects:
        Writes ``<CFG.OUTPUT_DIR>/fold_<fold>/best_model.pth`` whenever the
        validation R² improves, and prints per-epoch progress.
    """
    print(f"\n{'='*60}")
    print(f"FOLD {fold}")
    print(f"Train: {len(train_df)}, Val: {len(val_df)}")
    print(f"{'='*60}")

    device = torch.device(CFG.DEVICE)
    # Mixed precision is only used on CUDA; on CPU autocast and the scaler are
    # created disabled and act as no-ops, so a single code path serves both.
    use_amp = device.type == 'cuda'

    # Datasets
    train_ds = BiomassDataset(
        train_img_dir, train_df, train_targets,
        transform=get_transforms(CFG.IMG_SIZE, True, CFG.AUG_STRENGTH),
        log_transform=CFG.LOG_TRANSFORM,
    )
    val_ds = BiomassDataset(
        train_img_dir, val_df, val_targets,
        transform=get_transforms(CFG.IMG_SIZE, False),
        log_transform=CFG.LOG_TRANSFORM,
    )

    # LDS sampler: rare target values are drawn more often.
    if CFG.USE_LDS:
        sample_weights = get_lds_weights(
            train_targets[TARGET_COLS].values,
            CFG.LDS_BINS, CFG.LDS_KERNEL_SIZE, CFG.LDS_SIGMA)
        sampler = WeightedRandomSampler(sample_weights, len(train_ds), replacement=True)
        train_loader = DataLoader(train_ds, batch_size=CFG.BATCH_SIZE, sampler=sampler,
                                  num_workers=CFG.NUM_WORKERS, pin_memory=True,
                                  drop_last=True)
    else:
        train_loader = DataLoader(train_ds, batch_size=CFG.BATCH_SIZE, shuffle=True,
                                  num_workers=CFG.NUM_WORKERS, pin_memory=True,
                                  drop_last=True)

    val_loader = DataLoader(val_ds, batch_size=CFG.BATCH_SIZE * 2, shuffle=False,
                            num_workers=CFG.NUM_WORKERS, pin_memory=True)

    # Model
    model = BiomassModel(
        CFG.BACKBONE, 5, CFG.HIDDEN_DIM, CFG.DROPOUT, True, CFG.IMG_SIZE,
        separate_heads=CFG.SEPARATE_HEADS,
        grad_checkpointing=CFG.GRAD_CHECKPOINTING,
    ).to(device)
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Optimizer & Scheduler
    optimizer = torch.optim.AdamW(
        model.get_param_groups(CFG.BACKBONE_LR, CFG.HEAD_LR),
        weight_decay=CFG.WEIGHT_DECAY,
    )

    # The scheduler is stepped once per optimizer step. An incomplete
    # accumulation window at the end of an epoch never steps, so the count is
    # floor(batches / GRAD_ACCUM) per epoch. The max() guards keep the
    # schedulers valid for very small datasets.
    steps_per_epoch = len(train_loader) // CFG.GRAD_ACCUM
    n_steps = max(2, steps_per_epoch * CFG.EPOCHS)
    warmup_steps = max(1, int(n_steps * CFG.WARMUP_RATIO))
    warmup = torch.optim.lr_scheduler.LinearLR(
        optimizer, start_factor=0.01, total_iters=warmup_steps)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=max(1, n_steps - warmup_steps), eta_min=CFG.MIN_LR)
    scheduler = torch.optim.lr_scheduler.SequentialLR(
        optimizer, [warmup, cosine], milestones=[warmup_steps])

    loss_fn = CombinedLoss(CFG.MSE_WEIGHT, CFG.CONSISTENCY_WEIGHT).to(device)
    scaler = GradScaler(enabled=use_amp)

    # Training
    best_r2 = -float('inf')
    patience = 0
    save_path = CFG.OUTPUT_DIR / f'fold_{fold}'
    save_path.mkdir(parents=True, exist_ok=True)

    for epoch in range(1, CFG.EPOCHS + 1):
        t0 = time.time()

        # Train
        model.train()
        # Drop gradients left over from an incomplete accumulation window of
        # the previous epoch so they do not leak into this epoch's first step.
        optimizer.zero_grad(set_to_none=True)
        running_loss = 0.0
        n_samples = 0

        for batch_idx, batch in enumerate(train_loader):
            images, targets, ndvi = _batch_to_device(batch, device)

            with autocast(enabled=use_amp, dtype=torch.float16):
                preds = model(images, ndvi)
                # Divide so the accumulated gradient equals that of the mean
                # loss over the effective batch.
                loss = loss_fn(preds, targets) / CFG.GRAD_ACCUM

            scaler.scale(loss).backward()

            if (batch_idx + 1) % CFG.GRAD_ACCUM == 0:
                # Unscale first so clipping sees true gradient magnitudes.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), CFG.MAX_GRAD_NORM)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
                scheduler.step()

            running_loss += loss.item() * CFG.GRAD_ACCUM * images.size(0)
            n_samples += images.size(0)

        train_loss = running_loss / max(n_samples, 1)

        # Validate
        model.eval()
        val_preds, val_targets_arr = [], []
        val_loss = 0.0
        val_n = 0

        with torch.no_grad():
            for batch in val_loader:
                images, targets, ndvi = _batch_to_device(batch, device)

                with autocast(enabled=use_amp, dtype=torch.float16):
                    preds = model(images, ndvi)
                    loss = loss_fn(preds, targets)

                val_loss += loss.item() * images.size(0)
                val_n += images.size(0)
                # Under autocast the predictions are fp16; cast to fp32 before
                # NumPy so expm1 and the R² sums neither overflow nor lose
                # precision.
                val_preds.append(preds.float().cpu().numpy())
                val_targets_arr.append(targets.float().cpu().numpy())

        val_loss /= max(val_n, 1)
        val_preds = np.concatenate(val_preds)
        val_targets_arr = np.concatenate(val_targets_arr)

        # Inverse transform for metric
        if CFG.LOG_TRANSFORM:
            val_preds_orig = np.expm1(val_preds)
            val_targets_orig = np.expm1(val_targets_arr)
        else:
            val_preds_orig = val_preds
            val_targets_orig = val_targets_arr
        val_preds_orig = np.clip(val_preds_orig, 0, None)

        r2 = compute_weighted_r2(val_preds_orig, val_targets_orig)
        per_r2 = compute_per_target_r2(val_preds_orig, val_targets_orig)

        elapsed = time.time() - t0
        lr = optimizer.param_groups[0]['lr']

        print(f"Epoch {epoch:02d}/{CFG.EPOCHS} | "
              f"train_loss={train_loss:.4f} | val_loss={val_loss:.4f} | "
              f"R²={r2:.4f} | lr={lr:.2e} | {elapsed:.0f}s")
        for name, val in per_r2.items():
            print(f" {name}: {val:.4f}")

        # Save best
        if r2 > best_r2:
            best_r2 = r2
            patience = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'weighted_r2': best_r2,
                'per_target_r2': per_r2,
                'args': {
                    'backbone': CFG.BACKBONE,
                    'img_size': CFG.IMG_SIZE,
                    'hidden_dim': CFG.HIDDEN_DIM,
                    'dropout': CFG.DROPOUT,
                    'separate_heads': CFG.SEPARATE_HEADS,
                    'log_transform': CFG.LOG_TRANSFORM,
                    'use_ndvi': False,
                },
            }, save_path / 'best_model.pth')
            print(f" *** New best R²={best_r2:.4f} ***")
        else:
            patience += 1
            if patience >= CFG.PATIENCE:
                print(f"Early stopping at epoch {epoch}")
                break

    print(f"\nFold {fold} best R²: {best_r2:.4f}")
    return best_r2


# ============================================================
# 10. K-Fold Training
# ============================================================

# Defined up-front so the summary below works even when training is skipped.
fold_scores = []

if train_df is not None and train_img_dir is not None:
    targets = train_df[TARGET_COLS].copy()
    # Stratify on quantile bins of the total biomass so every fold covers the
    # full target range.
    bins = pd.qcut(targets['Dry_Total_g'], q=min(10, CFG.N_FOLDS),
                   labels=False, duplicates='drop')

    kf = StratifiedKFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.SEED)

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_df, bins)):
        if fold_idx not in CFG.TRAIN_FOLDS:
            continue

        fold_train_df = train_df.iloc[train_idx]
        fold_val_df = train_df.iloc[val_idx]
        fold_train_targets = targets.iloc[train_idx]
        fold_val_targets = targets.iloc[val_idx]

        score = train_one_fold(
            fold_idx, fold_train_df, fold_val_df,
            fold_train_targets, fold_val_targets,
            str(train_img_dir),
        )
        fold_scores.append(score)

    print(f"\n{'='*60}")
    print(f"All folds: {[f'{s:.4f}' for s in fold_scores]}")
    if fold_scores:
        print(f"Mean R²: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
    print(f"{'='*60}")
else:
    print("Training data not found! Check DATA_DIR setting.")

# ============================================================
# 11. Save training info
# ============================================================

training_info = {
    'backbone': CFG.BACKBONE,
    'img_size': CFG.IMG_SIZE,
    'n_folds': CFG.N_FOLDS,
    'fold_scores': [float(s) for s in fold_scores],
    'mean_r2': float(np.mean(fold_scores)) if fold_scores else 0,
}

# The output directory may not exist yet when no fold was trained.
CFG.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
with open(CFG.OUTPUT_DIR / 'training_info.json', 'w') as f:
    json.dump(training_info, f, indent=2)

print("\nTraining complete! Models saved to:", CFG.OUTPUT_DIR)
print("Next step: Use inference.py or the inference notebook to generate submission.csv")