csiro-image2biomass / kaggle_train_notebook.py
notRaphael's picture
Upload kaggle_train_notebook.py with huggingface_hub
bb14e48 verified
#!/usr/bin/env python3
"""
CSIRO Image2Biomass Prediction - Kaggle Training Notebook
==========================================================
This is a self-contained training notebook for Kaggle.
Run this on Kaggle GPU to train models, then use the inference notebook to submit.
Key features:
- DINOv2-Base or ConvNeXt-Large backbone, selected via CFG.BACKBONE (one backbone is trained per run)
- 5-fold stratified CV
- Log-transformed targets
- Label Distribution Smoothing
- Weighted SmoothL1 loss + consistency regularization
- Heavy D4 augmentations for pasture images
- Gradient checkpointing for memory efficiency
"""
# ============================================================
# 1. Setup & Configuration
# ============================================================
import os
import sys
import json
import time
import random
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.cuda.amp import GradScaler, autocast
from PIL import Image
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
# Install dependencies
os.system('pip install -q timm albumentations')
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
# ============================================================
# 2. Configuration
# ============================================================
class CFG:
    """Static configuration for the training run; read everywhere as CFG.<NAME>."""

    # --- Paths (adjust for your Kaggle setup) ---
    COMPETITION = 'csiro-biomass'
    DATA_DIR = Path('/kaggle/input') / COMPETITION
    OUTPUT_DIR = Path('/kaggle/working')

    # --- Model ---
    # DINOv2-Base by default; swap in the commented name for ConvNeXt-Large:
    #   'convnext_large.fb_in22k_ft_in1k'
    BACKBONE = 'vit_base_patch14_dinov2.lvd142m'
    IMG_SIZE = 224
    HIDDEN_DIM = 512
    DROPOUT = 0.3
    SEPARATE_HEADS = False
    GRAD_CHECKPOINTING = True

    # --- Optimisation ---
    EPOCHS = 30
    BATCH_SIZE = 16          # adjust based on GPU memory
    GRAD_ACCUM = 2           # effective batch = BATCH_SIZE * GRAD_ACCUM
    BACKBONE_LR = 3e-5
    HEAD_LR = 1e-3
    MIN_LR = 1e-7
    WEIGHT_DECAY = 1e-2
    WARMUP_RATIO = 0.05
    MAX_GRAD_NORM = 1.0
    PATIENCE = 8             # epochs without improvement before early stopping

    # --- Augmentation / target transform ---
    AUG_STRENGTH = 'medium'
    LOG_TRANSFORM = True

    # --- Label Distribution Smoothing ---
    USE_LDS = True
    LDS_BINS = 100
    LDS_KERNEL_SIZE = 5
    LDS_SIGMA = 2.0

    # --- Loss ---
    MSE_WEIGHT = 0.0
    CONSISTENCY_WEIGHT = 0.1

    # --- Cross-validation ---
    N_FOLDS = 5
    TRAIN_FOLDS = [0, 1, 2, 3, 4]   # which folds to train

    # --- Misc ---
    SEED = 42
    NUM_WORKERS = 2
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Target configuration: each regression target paired with its weight in the
# loss and in the weighted R² metric.
_TARGET_SPEC = (
    ('Dry_Green_g', 0.1),
    ('Dry_Dead_g', 0.1),
    ('Dry_Clover_g', 0.1),
    ('GDM_g', 0.2),
    ('Dry_Total_g', 0.5),
)
TARGET_COLS = [name for name, _ in _TARGET_SPEC]
TARGET_WEIGHTS = [weight for _, weight in _TARGET_SPEC]

# Per-channel statistics passed to A.Normalize.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        # Seed every visible GPU as well.
        torch.cuda.manual_seed_all(seed)
set_seed(CFG.SEED)
print(f"Device: {CFG.DEVICE}")
if CFG.DEVICE == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# ============================================================
# 3. Data Loading & EDA
# ============================================================


def _first_existing(candidates):
    """Return the first path in *candidates* that exists on disk, else None."""
    return next((path for path in candidates if path.exists()), None)


# Auto-detect the data directory: fall back to known alternative locations
# when the configured one is absent.
if not CFG.DATA_DIR.exists():
    _fallback_dir = _first_existing(
        Path(location) for location in (
            '/kaggle/input/csiro-image2biomass-prediction',
            '/kaggle/input/csiro-biomass',
            '/kaggle/input/csiro-image2biomass',
            './data',
        )
    )
    if _fallback_dir is not None:
        CFG.DATA_DIR = _fallback_dir

print(f"Data directory: {CFG.DATA_DIR}")
print(f"Files: {os.listdir(CFG.DATA_DIR) if CFG.DATA_DIR.exists() else 'NOT FOUND'}")

# Locate and load the CSVs (either capitalisation of the file name).
train_csv = _first_existing(CFG.DATA_DIR / name for name in ('train.csv', 'Train.csv'))
test_csv = _first_existing(CFG.DATA_DIR / name for name in ('test.csv', 'Test.csv'))
train_df = pd.read_csv(train_csv) if train_csv else None
test_df = pd.read_csv(test_csv) if test_csv else None

# Locate the image directories.
train_img_dir = _first_existing(
    CFG.DATA_DIR / sub for sub in ('train_images', 'train', 'images/train'))
test_img_dir = _first_existing(
    CFG.DATA_DIR / sub for sub in ('test_images', 'test', 'images/test'))

if train_df is not None:
    print(f"\nTrain shape: {train_df.shape}")
    print(f"Train columns: {list(train_df.columns)}")
    print(f"\nTarget statistics:")
    for col in TARGET_COLS:
        if col not in train_df.columns:
            continue
        s = train_df[col]
        print(f" {col}: mean={s.mean():.2f}, median={s.median():.2f}, "
              f"std={s.std():.2f}, min={s.min():.2f}, max={s.max():.2f}, "
              f"zeros={100*(s==0).mean():.1f}%")

if test_df is not None:
    print(f"\nTest shape: {test_df.shape}")
    print(f"Test columns: {list(test_df.columns)}")
# ============================================================
# 4. Dataset & Augmentations
# ============================================================
class BiomassDataset(Dataset):
    """Image dataset yielding dicts with an image tensor and optional targets.

    Each item is a dict with keys:
        'image'    - transformed image tensor (or a raw CHW float tensor scaled
                     to [0, 1] when no transform is given),
        'image_id' - the image identifier as a string,
        'ndvi'     - only when ``use_ndvi`` is set and the frame has an 'NDVI' column,
        'targets'  - only when ``targets`` is given; log1p-transformed when
                     ``log_transform`` is true.

    Args:
        image_dir: Directory containing the image files.
        df: DataFrame with one row per image; the id is read from its
            'image_id' column, falling back to the (reset) row index.
        targets: Optional DataFrame, positionally aligned with ``df``, holding
            the TARGET_COLS columns.
        transform: Optional callable invoked as ``transform(image=ndarray)['image']``.
        log_transform: Apply ``np.log1p`` to the targets.
        use_ndvi: Emit the 'NDVI' column value alongside the image.
    """

    # Extensions probed, in order, before falling back to a prefix glob.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG')

    def __init__(self, image_dir, df, targets=None, transform=None,
                 log_transform=True, use_ndvi=False):
        self.image_dir = Path(image_dir)
        self.df = df.reset_index(drop=True)
        self.targets = targets
        self.transform = transform
        self.log_transform = log_transform
        self.use_ndvi = use_ndvi
        # image id -> resolved path, so the filesystem is probed once per
        # image instead of on every __getitem__ call.
        self._path_cache = {}

    def __len__(self):
        return len(self.df)

    def _resolve_image_path(self, img_id):
        """Return the file path for *img_id*.

        Raises:
            FileNotFoundError: if no matching file exists in ``image_dir``.
        """
        cached = self._path_cache.get(img_id)
        if cached is not None:
            return cached

        resolved = None
        for ext in self._IMAGE_EXTENSIONS:
            candidate = self.image_dir / f"{img_id}{ext}"
            if candidate.exists():
                resolved = candidate
                break
        if resolved is None:
            # Prefix fallback; sorted so the choice does not depend on the
            # directory listing order.
            matches = sorted(self.image_dir.glob(f"{img_id}*"))
            if matches:
                resolved = matches[0]
        if resolved is None:
            raise FileNotFoundError(
                f"No image found for id {img_id!r} in {self.image_dir}")

        self._path_cache[img_id] = resolved
        return resolved

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_id = row['image_id'] if 'image_id' in row.index else row.name

        # Context manager closes the underlying file deterministically.
        with Image.open(self._resolve_image_path(img_id)) as handle:
            img = np.array(handle.convert('RGB'))

        if self.transform:
            img_tensor = self.transform(image=img)['image']
        else:
            img_tensor = torch.tensor(img.transpose(2, 0, 1), dtype=torch.float32) / 255.0

        result = {'image': img_tensor, 'image_id': str(img_id)}

        if self.use_ndvi and 'NDVI' in self.df.columns:
            result['ndvi'] = torch.tensor(float(row['NDVI']), dtype=torch.float32)

        if self.targets is not None:
            target_values = self.targets.iloc[idx][TARGET_COLS].values.astype(np.float32)
            if self.log_transform:
                target_values = np.log1p(target_values)
            result['targets'] = torch.tensor(target_values, dtype=torch.float32)

        return result
def get_transforms(img_size, is_train=True, aug_strength='medium'):
    """Build the albumentations pipeline for training or evaluation.

    Args:
        img_size: Side length of the square output image.
        is_train: When false, return the deterministic resize + center-crop
            pipeline and ignore ``aug_strength``.
        aug_strength: 'light' or 'medium'; any other value selects the
            heaviest pipeline.

    Returns:
        An ``A.Compose`` ending in Normalize + ToTensorV2.
    """
    finalize = [A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD), ToTensorV2()]

    if not is_train:
        enlarged = int(img_size * 1.14)
        return A.Compose([
            A.Resize(height=enlarged, width=enlarged),
            A.CenterCrop(height=img_size, width=img_size),
            *finalize,
        ])

    square = (img_size, img_size)
    flips = [A.HorizontalFlip(p=0.5), A.VerticalFlip(p=0.5)]

    if aug_strength == 'light':
        return A.Compose([
            A.RandomResizedCrop(size=square, scale=(0.7, 1.0)),
            *flips,
            *finalize,
        ])

    # Flips + 90-degree rotations + transpose, shared by the two stronger modes.
    d4 = [*flips, A.RandomRotate90(p=0.5), A.Transpose(p=0.5)]

    def hole_range(max_fraction):
        """Dropout hole size range, from 5% of the image side up to *max_fraction*."""
        return (int(img_size * 0.05), int(img_size * max_fraction))

    if aug_strength == 'medium':
        return A.Compose([
            A.RandomResizedCrop(size=square, scale=(0.5, 1.0)),
            *d4,
            A.RandomBrightnessContrast(0.2, 0.2, p=0.5),
            A.HueSaturationValue(15, 25, 15, p=0.4),
            A.OneOf([A.GaussianBlur(blur_limit=(3, 5)), A.MotionBlur(blur_limit=5)], p=0.15),
            A.CoarseDropout(num_holes_range=(1, 4),
                            hole_height_range=hole_range(0.15),
                            hole_width_range=hole_range(0.15), p=0.2),
            *finalize,
        ])

    # heavy
    return A.Compose([
        A.RandomResizedCrop(size=square, scale=(0.4, 1.0)),
        *d4,
        A.RandomBrightnessContrast(0.3, 0.3, p=0.7),
        A.HueSaturationValue(20, 30, 20, p=0.5),
        A.RandomGamma((80, 120), p=0.3),
        A.OneOf([A.GaussianBlur((3, 7)), A.MotionBlur(blur_limit=7)], p=0.2),
        A.CoarseDropout(num_holes_range=(1, 8),
                        hole_height_range=hole_range(0.2),
                        hole_width_range=hole_range(0.2), p=0.3),
        *finalize,
    ])
# ============================================================
# 5. Model
# ============================================================
class BiomassModel(nn.Module):
    """timm backbone followed by an MLP regression head (or one head per target).

    Args:
        backbone_name: Model name passed to ``timm.create_model``.
        num_targets: Number of regression outputs.
        hidden_dim: Width of the first hidden layer of the head(s).
        dropout: Base dropout rate; later head layers use scaled-down rates.
        pretrained: Forwarded to ``timm.create_model``.
        img_size: Forwarded to timm only when the backbone name contains
            'vit' or 'dinov2'.
        use_ndvi: Concatenate a 64-d embedding of a scalar NDVI input to the
            backbone features.
        separate_heads: Use one small head per target instead of a shared head.
        grad_checkpointing: Enable gradient checkpointing when the backbone
            exposes ``set_grad_checkpointing``.
    """

    def __init__(self, backbone_name, num_targets=5, hidden_dim=512,
                 dropout=0.3, pretrained=True, img_size=224,
                 use_ndvi=False, separate_heads=False, grad_checkpointing=False):
        super().__init__()
        self.use_ndvi = use_ndvi
        self.separate_heads = separate_heads

        # num_classes=0 makes timm return pooled features instead of logits.
        kwargs = {'pretrained': pretrained, 'num_classes': 0}
        if 'vit' in backbone_name or 'dinov2' in backbone_name:
            kwargs['img_size'] = img_size
        self.backbone = timm.create_model(backbone_name, **kwargs)
        feat_dim = self.backbone.num_features

        if grad_checkpointing and hasattr(self.backbone, 'set_grad_checkpointing'):
            self.backbone.set_grad_checkpointing(True)

        if use_ndvi:
            self.ndvi_embed = nn.Sequential(nn.Linear(1, 32), nn.GELU(), nn.Linear(32, 64))
            feat_dim += 64

        if separate_heads:
            self.heads = nn.ModuleList([
                nn.Sequential(
                    nn.LayerNorm(feat_dim), nn.Dropout(dropout),
                    nn.Linear(feat_dim, hidden_dim), nn.GELU(),
                    nn.Dropout(dropout * 0.5), nn.Linear(hidden_dim, 1),
                ) for _ in range(num_targets)
            ])
        else:
            self.head = nn.Sequential(
                nn.LayerNorm(feat_dim), nn.Dropout(dropout),
                nn.Linear(feat_dim, hidden_dim), nn.GELU(),
                nn.Dropout(dropout * 0.5),
                nn.Linear(hidden_dim, hidden_dim // 2), nn.GELU(),
                nn.Dropout(dropout * 0.3),
                nn.Linear(hidden_dim // 2, num_targets),
            )

    def forward(self, x, ndvi=None):
        """Return predictions with one column per target.

        Args:
            x: Image batch fed to the backbone.
            ndvi: One scalar per sample; required when the model was built
                with ``use_ndvi=True``.

        Raises:
            ValueError: if the model uses NDVI but ``ndvi`` is None.
        """
        features = self.backbone(x)
        if self.use_ndvi:
            if ndvi is None:
                # The heads were sized for features + NDVI embedding, so going
                # on without it would only fail later with a shape mismatch.
                raise ValueError("Model was built with use_ndvi=True but no ndvi was passed")
            features = torch.cat([features, self.ndvi_embed(ndvi.unsqueeze(-1))], dim=-1)
        if self.separate_heads:
            return torch.cat([h(features) for h in self.heads], dim=-1)
        return self.head(features)

    def get_param_groups(self, backbone_lr, head_lr):
        """Return optimizer param groups: backbone at one LR, the rest at another."""
        backbone_params = list(self.backbone.parameters())
        # Match on the 'backbone.' prefix rather than a substring so a head
        # parameter whose name merely contains 'backbone' is not dropped.
        other_params = [p for n, p in self.named_parameters()
                        if not n.startswith('backbone.')]
        return [
            {'params': backbone_params, 'lr': backbone_lr},
            {'params': other_params, 'lr': head_lr},
        ]
# ============================================================
# 6. Loss Functions
# ============================================================
class WeightedSmoothL1Loss(nn.Module):
    """Smooth-L1 loss with a fixed weight per target column.

    Args:
        target_weights: Sequence, ndarray or tensor with one weight per target.
            ``None`` (or an empty sequence) selects the module-level
            TARGET_WEIGHTS.
        beta: Transition point between the quadratic and linear regimes,
            forwarded to ``F.smooth_l1_loss``.
    """

    def __init__(self, target_weights=None, beta=1.0):
        super().__init__()
        self.beta = beta
        # Explicit None check: `target_weights or ...` raises on ndarray and
        # tensor inputs because their truth value is ambiguous.
        weights = None
        if target_weights is not None:
            weights = torch.as_tensor(target_weights, dtype=torch.float32).detach().clone()
        if weights is None or weights.numel() == 0:
            weights = torch.tensor(TARGET_WEIGHTS, dtype=torch.float32)
        # Registered as a buffer so it follows the module across devices.
        self.register_buffer('weights', weights)

    def forward(self, pred, target):
        """Return the mean over all elements of the per-column weighted loss."""
        loss = F.smooth_l1_loss(pred, target, beta=self.beta, reduction='none')
        return (loss * self.weights.unsqueeze(0)).mean()
class CombinedLoss(nn.Module):
    """Weighted Smooth-L1, plus optional weighted MSE and a consistency penalty.

    The consistency term ties the sum of the first three outputs (the green,
    dead and clover components per TARGET_COLS) to the fifth output (total).
    That identity is additive in gram space only, so when the model predicts
    log1p-transformed targets the components are mapped back with ``expm1``
    before being summed, and the sum is compared to the total in log1p space.

    Args:
        mse_weight: Multiplier of the weighted MSE term (0 disables it).
        consistency_weight: Multiplier of the consistency term (0 disables it).
        log_space: Whether predictions are in log1p space. ``None`` follows
            ``CFG.LOG_TRANSFORM`` when a module-level ``CFG`` exists, and
            otherwise means raw space.
    """

    def __init__(self, mse_weight=0.0, consistency_weight=0.1, log_space=None):
        super().__init__()
        self.smoothl1 = WeightedSmoothL1Loss()
        self.mse_weight = mse_weight
        self.consistency_weight = consistency_weight
        if log_space is None:
            # Looked up lazily so existing two-argument call sites pick up the
            # notebook's target transform without being changed.
            cfg = globals().get('CFG')
            log_space = bool(getattr(cfg, 'LOG_TRANSFORM', False))
        self.log_space = log_space

    def forward(self, pred, target):
        """Return the scalar combined loss for predictions shaped (batch, targets)."""
        loss = self.smoothl1(pred, target)

        if self.mse_weight > 0:
            # Reuse the registered weight buffer instead of building a new
            # tensor on every call.
            weights = self.smoothl1.weights.to(pred.device)
            mse = ((pred - target) ** 2 * weights.unsqueeze(0)).mean()
            loss = loss + self.mse_weight * mse

        if self.consistency_weight > 0:
            total = pred[:, 4].float()
            if self.log_space:
                # Back to grams (clamped at 0) to sum, then to log1p space so
                # the penalty is on the same scale as the main loss.
                components = torch.expm1(pred[:, :3].float()).clamp_min(0.0)
                comp_sum = torch.log1p(components.sum(dim=1))
            else:
                comp_sum = (pred[:, 0] + pred[:, 1] + pred[:, 2]).float()
            loss = loss + self.consistency_weight * F.mse_loss(comp_sum, total)

        return loss
# ============================================================
# 7. LDS Weights
# ============================================================
def get_lds_weights(labels, bins=100, kernel_size=5, sigma=2.0):
    """Compute inverse-density sample weights from a smoothed label histogram.

    Args:
        labels: Array-like of labels. A 2-D input is reduced to its last
            column (Dry_Total_g in the TARGET_COLS ordering).
        bins: Number of histogram bins.
        kernel_size: Number of taps of the smoothing kernel.
        sigma: Width parameter of the Gaussian-shaped kernel, which is sampled
            on ``linspace(-3, 3, kernel_size)``.

    Returns:
        1-D float array with one weight per label, normalised to mean 1.
        Empty input yields an empty array.
    """
    from scipy.ndimage import convolve1d

    # asarray lets lists and pandas objects through as well as ndarrays.
    labels = np.asarray(labels, dtype=float)
    if labels.ndim > 1:
        labels = labels[:, -1]  # Dry_Total_g
    if labels.size == 0:
        # Nothing to weight; avoids the NaN mean of an empty array below.
        return np.empty(0, dtype=float)

    hist, edges = np.histogram(labels, bins=bins)
    kernel = np.exp(-np.linspace(-3, 3, kernel_size) ** 2 / (2 * sigma ** 2))
    kernel /= kernel.sum()
    smoothed = convolve1d(hist.astype(float), kernel, mode='reflect')

    # Read the smoothed density at each label by interpolating between bin centers.
    centers = (edges[:-1] + edges[1:]) / 2
    weights = 1.0 / (np.interp(labels, centers, smoothed) + 1e-8)
    return weights / weights.mean()
# ============================================================
# 8. Metrics
# ============================================================
def compute_weighted_r2(preds, targets, weights=None):
    """Compute one weighted R² pooled over every sample and target column.

    All (sample, target) cells are treated as one population in which each
    cell carries its column's weight; the baseline is the weighted mean of all
    target cells.

    Args:
        preds: Array-like of shape (n_samples, n_targets).
        targets: Array-like with the same shape as ``preds``.
        weights: One weight per target column (extra trailing entries are
            ignored); defaults to TARGET_WEIGHTS.

    Returns:
        The weighted R² as a Python float.

    Raises:
        ValueError: if fewer weights than target columns are supplied.
    """
    if weights is None:
        weights = TARGET_WEIGHTS

    preds = np.asarray(preds, dtype=np.float64)
    targets = np.asarray(targets, dtype=np.float64)
    n_targets = preds.shape[1]

    col_weights = np.asarray(weights, dtype=np.float64)[:n_targets]
    if col_weights.shape[0] < n_targets:
        raise ValueError(
            f"Expected at least {n_targets} weights, got {col_weights.shape[0]}")

    # Broadcast the per-column weights over rows instead of building Python
    # lists element by element.
    cell_weights = np.broadcast_to(col_weights, targets.shape)
    wmean = np.sum(cell_weights * targets) / np.sum(cell_weights)
    ss_res = np.sum(cell_weights * (targets - preds) ** 2)
    ss_tot = np.sum(cell_weights * (targets - wmean) ** 2)
    # The epsilon keeps the ratio finite when the targets are all equal.
    return float(1.0 - ss_res / (ss_tot + 1e-8))
def compute_per_target_r2(preds, targets):
    """Return a dict mapping each TARGET_COLS name to its unweighted R².

    Column ``i`` of ``preds`` and ``targets`` is taken to correspond to
    ``TARGET_COLS[i]``.
    """
    def column_r2(col):
        observed = targets[:, col]
        residual = np.sum((observed - preds[:, col]) ** 2)
        spread = np.sum((observed - observed.mean()) ** 2)
        # The epsilon keeps the ratio finite for a constant target column.
        return 1.0 - residual / (spread + 1e-8)

    return {name: column_r2(col) for col, name in enumerate(TARGET_COLS)}
# ============================================================
# 9. Training Loop
# ============================================================
def _unpack_batch(batch, device):
    """Move a collated batch to *device* and return (images, targets, ndvi-or-None)."""
    images = batch['image'].to(device)
    targets = batch['targets'].to(device)
    ndvi = batch.get('ndvi')
    if ndvi is not None:
        ndvi = ndvi.to(device)
    return images, targets, ndvi


def _train_epoch(model, loader, loss_fn, optimizer, scheduler, scaler, use_amp, device):
    """Run one training epoch with gradient accumulation; return the mean sample loss."""
    model.train()
    running_loss = 0.0
    n_samples = 0
    n_batches = len(loader)
    for batch_idx, batch in enumerate(loader):
        images, targets, ndvi = _unpack_batch(batch, device)
        with autocast(enabled=use_amp, dtype=torch.float16):
            preds = model(images, ndvi)
            loss = loss_fn(preds, targets) / CFG.GRAD_ACCUM
        scaler.scale(loss).backward()

        # Also step on the last batch so a partial accumulation group is
        # applied now instead of leaking into the next epoch's first step.
        is_last_batch = (batch_idx + 1) == n_batches
        if (batch_idx + 1) % CFG.GRAD_ACCUM == 0 or is_last_batch:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), CFG.MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

        # Undo the accumulation scaling so the logged value is a per-sample loss.
        running_loss += loss.item() * CFG.GRAD_ACCUM * images.size(0)
        n_samples += images.size(0)
    return running_loss / n_samples


def _evaluate(model, loader, loss_fn, use_amp, device):
    """Run validation; return (mean loss, predictions, targets) as float32 arrays."""
    model.eval()
    pred_chunks, target_chunks = [], []
    loss_sum = 0.0
    n_seen = 0
    with torch.no_grad():
        for batch in loader:
            images, targets, ndvi = _unpack_batch(batch, device)
            with autocast(enabled=use_amp, dtype=torch.float16):
                preds = model(images, ndvi)
                loss = loss_fn(preds, targets)
            loss_sum += loss.item() * images.size(0)
            n_seen += images.size(0)
            # Under autocast the outputs can be float16; upcast before NumPy so
            # expm1 and the metric do not run at half precision.
            pred_chunks.append(preds.float().cpu().numpy())
            target_chunks.append(targets.float().cpu().numpy())
    return loss_sum / n_seen, np.concatenate(pred_chunks), np.concatenate(target_chunks)


def train_one_fold(fold, train_df, val_df, train_targets, val_targets, train_img_dir):
    """Train one cross-validation fold and save its best checkpoint.

    The checkpoint with the highest validation weighted R² (measured on
    original-scale values, predictions clipped at 0) is written to
    ``CFG.OUTPUT_DIR / f'fold_{fold}' / 'best_model.pth'``. Training stops
    early after ``CFG.PATIENCE`` epochs without improvement.

    Args:
        fold: Fold index, used for logging and the output directory name.
        train_df: Training rows for this fold.
        val_df: Validation rows for this fold.
        train_targets: Targets positionally aligned with ``train_df``.
        val_targets: Targets positionally aligned with ``val_df``.
        train_img_dir: Directory holding the training images.

    Returns:
        The best validation weighted R² as a Python float.

    Raises:
        ValueError: if either loader would yield no batches.
    """
    print(f"\n{'='*60}")
    print(f"FOLD {fold}")
    print(f"Train: {len(train_df)}, Val: {len(val_df)}")
    print(f"{'='*60}")
    device = torch.device(CFG.DEVICE)
    use_amp = device.type == 'cuda'

    # Datasets
    train_ds = BiomassDataset(
        train_img_dir, train_df, train_targets,
        transform=get_transforms(CFG.IMG_SIZE, True, CFG.AUG_STRENGTH),
        log_transform=CFG.LOG_TRANSFORM,
    )
    val_ds = BiomassDataset(
        train_img_dir, val_df, val_targets,
        transform=get_transforms(CFG.IMG_SIZE, False),
        log_transform=CFG.LOG_TRANSFORM,
    )

    # LDS sampler: oversample rare label values instead of plain shuffling.
    loader_kwargs = dict(batch_size=CFG.BATCH_SIZE, num_workers=CFG.NUM_WORKERS,
                         pin_memory=True, drop_last=True)
    if CFG.USE_LDS:
        sample_weights = get_lds_weights(
            train_targets[TARGET_COLS].values, CFG.LDS_BINS, CFG.LDS_KERNEL_SIZE, CFG.LDS_SIGMA)
        sampler = WeightedRandomSampler(sample_weights, len(train_ds), replacement=True)
        train_loader = DataLoader(train_ds, sampler=sampler, **loader_kwargs)
    else:
        train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_ds, batch_size=CFG.BATCH_SIZE * 2, shuffle=False,
                            num_workers=CFG.NUM_WORKERS, pin_memory=True)

    # Fail early with a clear message rather than dividing by zero later.
    if len(train_loader) == 0:
        raise ValueError(
            f"Fold {fold}: {len(train_ds)} training samples yield no full batch "
            f"of size {CFG.BATCH_SIZE}")
    if len(val_ds) == 0:
        raise ValueError(f"Fold {fold}: validation set is empty")

    # Model
    model = BiomassModel(
        CFG.BACKBONE, len(TARGET_COLS), CFG.HIDDEN_DIM, CFG.DROPOUT, True, CFG.IMG_SIZE,
        separate_heads=CFG.SEPARATE_HEADS, grad_checkpointing=CFG.GRAD_CHECKPOINTING,
    ).to(device)
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Optimizer & Scheduler
    optimizer = torch.optim.AdamW(
        model.get_param_groups(CFG.BACKBONE_LR, CFG.HEAD_LR),
        weight_decay=CFG.WEIGHT_DECAY,
    )
    # Ceil division: the trailing partial accumulation group also takes a step.
    steps_per_epoch = -(-len(train_loader) // CFG.GRAD_ACCUM)
    n_steps = steps_per_epoch * CFG.EPOCHS
    # Keep both phases at least one step long so tiny runs give a valid schedule.
    warmup_steps = max(1, int(n_steps * CFG.WARMUP_RATIO))
    cosine_steps = max(1, n_steps - warmup_steps)
    warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=warmup_steps)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cosine_steps, eta_min=CFG.MIN_LR)
    scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, [warmup, cosine], milestones=[warmup_steps])

    loss_fn = CombinedLoss(CFG.MSE_WEIGHT, CFG.CONSISTENCY_WEIGHT).to(device)
    # A disabled scaler/autocast is a pass-through, so one code path serves
    # both GPU (mixed precision) and CPU (full precision).
    scaler = GradScaler(enabled=use_amp)

    # Training
    best_r2 = -float('inf')
    patience = 0
    save_path = CFG.OUTPUT_DIR / f'fold_{fold}'
    save_path.mkdir(parents=True, exist_ok=True)

    for epoch in range(1, CFG.EPOCHS + 1):
        t0 = time.time()
        train_loss = _train_epoch(
            model, train_loader, loss_fn, optimizer, scheduler, scaler, use_amp, device)
        val_loss, val_preds, val_targets_arr = _evaluate(
            model, val_loader, loss_fn, use_amp, device)

        # Inverse transform for metric
        if CFG.LOG_TRANSFORM:
            val_preds_orig = np.expm1(val_preds)
            val_targets_orig = np.expm1(val_targets_arr)
        else:
            val_preds_orig = val_preds
            val_targets_orig = val_targets_arr
        val_preds_orig = np.clip(val_preds_orig, 0, None)

        # Plain floats keep NumPy scalar objects out of the checkpoint and out
        # of the returned score.
        r2 = float(compute_weighted_r2(val_preds_orig, val_targets_orig))
        per_r2 = {name: float(val) for name, val in
                  compute_per_target_r2(val_preds_orig, val_targets_orig).items()}

        elapsed = time.time() - t0
        lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch:02d}/{CFG.EPOCHS} | "
              f"train_loss={train_loss:.4f} | val_loss={val_loss:.4f} | "
              f"R²={r2:.4f} | lr={lr:.2e} | {elapsed:.0f}s")
        for name, val in per_r2.items():
            print(f" {name}: {val:.4f}")

        # Save best
        if r2 > best_r2:
            best_r2 = r2
            patience = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'weighted_r2': best_r2,
                'per_target_r2': per_r2,
                'args': {
                    'backbone': CFG.BACKBONE,
                    'img_size': CFG.IMG_SIZE,
                    'hidden_dim': CFG.HIDDEN_DIM,
                    'dropout': CFG.DROPOUT,
                    'separate_heads': CFG.SEPARATE_HEADS,
                    'log_transform': CFG.LOG_TRANSFORM,
                    'use_ndvi': False,
                },
            }, save_path / 'best_model.pth')
            print(f" *** New best R²={best_r2:.4f} ***")
        else:
            patience += 1
            if patience >= CFG.PATIENCE:
                print(f"Early stopping at epoch {epoch}")
                break

    print(f"\nFold {fold} best R²: {best_r2:.4f}")
    return best_r2
# ============================================================
# 10. K-Fold Training
# ============================================================
# Defined up front so the training summary below never has to probe the
# namespace for it.
fold_scores = []

if train_df is not None and train_img_dir is not None:
    targets = train_df[TARGET_COLS].copy()
    # Stratify on quantile bins of the total biomass so every fold sees a
    # similar target distribution.
    bins = pd.qcut(targets['Dry_Total_g'], q=min(10, CFG.N_FOLDS), labels=False, duplicates='drop')
    kf = StratifiedKFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.SEED)

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_df, bins)):
        if fold_idx not in CFG.TRAIN_FOLDS:
            continue
        fold_train_df = train_df.iloc[train_idx]
        fold_val_df = train_df.iloc[val_idx]
        fold_train_targets = targets.iloc[train_idx]
        fold_val_targets = targets.iloc[val_idx]
        score = train_one_fold(
            fold_idx, fold_train_df, fold_val_df,
            fold_train_targets, fold_val_targets, str(train_img_dir),
        )
        # Plain floats keep the JSON dump below independent of NumPy types.
        fold_scores.append(float(score))

    print(f"\n{'='*60}")
    print(f"All folds: {[f'{s:.4f}' for s in fold_scores]}")
    if fold_scores:
        # Skipped when no fold was selected: the mean of nothing is NaN.
        print(f"Mean R²: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
    print(f"{'='*60}")
else:
    print("Training data not found! Check DATA_DIR setting.")

# ============================================================
# 11. Save training info
# ============================================================
training_info = {
    'backbone': CFG.BACKBONE,
    'img_size': CFG.IMG_SIZE,
    'n_folds': CFG.N_FOLDS,
    'fold_scores': fold_scores,
    'mean_r2': float(np.mean(fold_scores)) if fold_scores else 0,
}
# The output directory may not exist yet when no fold has been trained.
CFG.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
with open(CFG.OUTPUT_DIR / 'training_info.json', 'w') as f:
    json.dump(training_info, f, indent=2)

if fold_scores:
    print("\nTraining complete! Models saved to:", CFG.OUTPUT_DIR)
    print("Next step: Use inference.py or the inference notebook to generate submission.csv")
else:
    # Do not announce success when nothing was trained.
    print("\nNo folds were trained; training_info.json was written without scores.")