"""
DINOv3 fine-tuning for script classification
==============================================

Progressive finetuning with page-level train/val/test split
Runs on three preprocessed variants:
    - whole page /
    - patches color /
    - patches_clahe/

Usage:
    #Exp1: whole page
    python finetune_dinov3.py --data_dir ./Data/output/whole_page --experiment whole_page
    
    #Exp2: patches_color
    python finetune_dinov3.py --data_dir ./Data/output/patches_color --experiment patches_color

    #Exp3: CLAHE_patches
    python finetune_dinov3.py --data_dir ./Data/output/patches_clahe --experiment patches_clahe

Outputs (under --output_dir/<experiment>/):
    best_<stage_slug>.pt   — best val macro-F1 per stage
    history_stage_{a,b,c}.json — per-epoch metrics per stage
    training_history_stage_{a,b,c}.png — curves per stage
    final_model.pt — weights chosen by best val across stages + test_metrics metadata
    results.json, confusion_matrix.*, training_history.png (full run)

Requirements:
    pip install torch torchvision transformers scikit-learn matplotlib seaborn
        # DINOv3 requires transformers >= 4.56.0
    # If not available: pip install --upgrade git+https://github.com/huggingface/transformers.git
"""
import os 
import re 
import json
import argparse
import random
from pathlib import Path
from collections import Counter, defaultdict 
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp import GradScaler
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms
from PIL import Image
from sklearn.metrics import (classification_report, confusion_matrix, f1_score, accuracy_score )

try:
    from transformers import AutoImageProcessor, AutoModel
except ImportError:
    raise ImportError("transformers >= 4.56.0 required for DINOv3.\n"
        "Install: pip install --upgrade git+https://github.com/huggingface/transformers.git"
    )
# =====================
# CONFIG 
# =====================

# Hugging Face model id of the pretrained DINOv3 backbone (also used to load the image processor).
DINOV3_MODEL_ID = "facebook/dinov3-vits16-pretrain-lvd1689m"
# NOTE(review): not referenced in the visible code — DINOv3Classifier reads
# hidden_size from the backbone config instead. Presumably the CLS embedding width.
EMBEDDING_DIM = 384 
# Lower-case file suffixes accepted as images when scanning the class folders.
VALID_EXT = {'.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp', '.webp'}
# Default seed for the page-level split and for RNG initialisation in main().
SEED = 42

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# ======================
# Page-level splitting
# ======================

def get_page_name(filepath):
    """
    Return the page identifier a (patch) file belongs to.

    The identifier is the file stem with a trailing ``_p<digits>`` patch
    suffix removed, so every patch cut from one page maps to the same name:
        'manuscript001_p3.png' → 'manuscript001'
        'manuscript001.png'    → 'manuscript001'

    Callers use this name to keep all patches of a page in the same split.
    """
    return re.sub(r'_p\d+$', '', Path(filepath).stem)


def normalize_label_key(label: str) -> str:
    """Return *label* lower-cased, with each run of non-alphanumerics collapsed to ``_``.

    Leading and trailing underscores are stripped, so e.g. ``"Old Latin"``
    and ``"old-latin"`` resolve to the same manifest key.
    """
    lowered = label.lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered)
    return collapsed.strip('_')


def load_exclusion_manifest(manifest_path: str):
    """
    Read the per-class page exclusions from a JSON manifest.

    The file must contain a JSON object mapping class label -> list of page
    IDs. Entries whose value is not a list are ignored. IDs are converted to
    strings and stripped; blank IDs are dropped.

    Returns:
        dict mapping normalized class label -> set of page-ID strings.
        Empty when no path is given or the file does not exist (a notice is
        printed in the latter case).

    Raises:
        ValueError: if the top-level JSON value is not an object.
    """
    manifest = {}
    if not manifest_path:
        return manifest

    path = Path(manifest_path)
    if not path.is_file():
        print(f"  Exclusion manifest not found, skipping exclusions: {path}")
        return manifest

    with path.open("r", encoding="utf-8") as handle:
        raw = json.load(handle)
    if not isinstance(raw, dict):
        raise ValueError(f"Exclusion manifest must be a JSON object: {path}")

    for label, ids in raw.items():
        if isinstance(ids, list):
            stripped_ids = (str(page_id).strip() for page_id in ids)
            manifest[normalize_label_key(str(label))] = {pid for pid in stripped_ids if pid}
    return manifest


def create_page_level(data_dir, val_ratio=0.15, test_ratio=0.15, seed=SEED, excluded_pages_by_label=None):
    """
    Split at the PAGE level, not the image/patch level.
    All patches from one page go into the same split.

    ``data_dir`` must contain one sub-directory per class; files whose suffix
    is in ``VALID_EXT`` are grouped by ``get_page_name``. Pages listed for a
    class in ``excluded_pages_by_label`` (keyed by normalized label) are
    skipped. Pages are shuffled per class and assigned test -> val -> train.

    Every class keeps at least one training page: for classes with very few
    pages the held-out counts are reduced (test is filled before val) and a
    notice is printed, so no class ends up in the label map without any
    training data.

    Note: calls ``set_seed(seed)``, i.e. re-seeds the global RNGs.

    Returns:
        splits: dict with 'train', 'val', 'test' keys,
                each value is a list of (filepath, label) tuples
        label_to_idx: dict mapping label strings to integers
        idx_to_label: inverse mapping of label_to_idx
        skipped_by_label: dict label -> number of files skipped via exclusions
    """
    set_seed(seed)
    data_dir = Path(data_dir)
    excluded_pages_by_label = excluded_pages_by_label or {}

    # label -> page name -> list of file paths
    class_pages = defaultdict(lambda: defaultdict(list))
    skipped_by_label = Counter()

    for cls_dir in sorted(data_dir.iterdir()):
        if not cls_dir.is_dir() or cls_dir.name.startswith('.'):
            continue
        label = cls_dir.name
        excluded_pages = excluded_pages_by_label.get(normalize_label_key(label), set())
        for img_path in sorted(cls_dir.iterdir()):
            if img_path.suffix.lower() not in VALID_EXT:
                continue
            page = get_page_name(str(img_path))
            if page in excluded_pages:
                skipped_by_label[label] += 1
                continue
            class_pages[label][page].append(str(img_path))

    # Create label mapping
    labels = sorted(class_pages)
    label_to_idx = {label: idx for idx, label in enumerate(labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}

    # Split pages per class (stratified)
    splits = {'train': [], 'val': [], 'test': []}

    for label in labels:
        pages = list(class_pages[label].keys())
        random.shuffle(pages)

        n_pages = len(pages)
        wanted_test = max(1, int(n_pages * test_ratio))
        wanted_val = max(1, int(n_pages * val_ratio))

        # Reserve one page for training before handing pages to test/val.
        held_out_budget = max(0, n_pages - 1)
        n_test = min(wanted_test, held_out_budget)
        n_val = min(wanted_val, held_out_budget - n_test)
        if (n_test, n_val) != (wanted_test, wanted_val):
            print(
                f"  Class '{label}' has only {n_pages} page(s): "
                f"using {n_pages - n_test - n_val} train / {n_val} val / {n_test} test"
            )

        pages_by_split = {
            'test': pages[:n_test],
            'val': pages[n_test:n_test + n_val],
            'train': pages[n_test + n_val:],
        }
        for split_name in ('train', 'val', 'test'):
            for page in pages_by_split[split_name]:
                splits[split_name].extend(
                    (fpath, label) for fpath in class_pages[label][page]
                )

    return splits, label_to_idx, idx_to_label, dict(skipped_by_label)

class ScriptDataset(Dataset):
    """
    Image dataset over (filepath, label) samples.

    Each item is loaded as RGB, optionally augmented, then passed through the
    Hugging Face image processor. ``__getitem__`` returns
    ``(pixel_values, label_idx)`` where ``pixel_values`` is the processor
    output with the batch dimension squeezed away.

    Args:
        samples: list of (filepath, label_str) tuples.
        label_to_idx: dict mapping label strings to integer class indices.
        processor: callable image processor (``processor(images=..., return_tensors="pt")``).
        augment: enable the document-aware training augmentations.
    """

    def __init__(self, samples, label_to_idx, processor, augment = False):
        self.samples = samples
        self.label_to_idx = label_to_idx
        self.processor = processor
        self.augment = augment

        # Document-aware augmentation.
        # The pipeline runs on the float tensor produced by ToTensor (values in
        # [0, 1]), so the rotation fill for "white paper" must be 1.0. A fill of
        # 255 would inject far out-of-range values into the image and distort
        # the colour jitter that follows.
        if augment:
            self.aug_transform = transforms.Compose([
                transforms.RandomRotation(degrees=5, fill=1.0),
                transforms.ColorJitter(brightness=0.2, contrast=0.2),
                transforms.RandomResizedCrop(224, scale=(0.7, 1.0), ratio=(0.9, 1.1)),
                transforms.RandomErasing(p=0.1, scale=(0.02, 0.08)),
            ])
        else:
            self.aug_transform = None

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        file_path, label_str = self.samples[idx]

        # Load image; the context manager closes the file handle right after
        # decoding instead of leaving that to garbage collection.
        with Image.open(file_path) as raw:
            img = raw.convert('RGB')

        if self.aug_transform is not None and self.augment:
            # RandomErasing only works on tensors, hence the round trip.
            img = transforms.ToTensor()(img)
            img = self.aug_transform(img)
            img = transforms.ToPILImage()(img)

        # Process with DINOv3 processor (resize, normalize)
        inputs = self.processor(images=img, return_tensors="pt")
        pixel_values = inputs['pixel_values'].squeeze(0)

        label_idx = self.label_to_idx[label_str]

        return pixel_values, label_idx

class DINOv3Classifier(nn.Module):
    """
    Pretrained backbone (loaded via ``AutoModel``) plus an MLP classification head.

    Only the first token of ``last_hidden_state`` (the CLS token) is fed to
    the head; the remaining tokens are not used.

    Head layout: LayerNorm → Dropout → Linear(hidden_size, 128) → GELU →
    Dropout → Linear(128, num_classes).

    The backbone is frozen on construction; use ``unfreeze_last_n_blocks``
    to make its last blocks trainable.
    """

    def __init__(self, model_id, num_classes, dropout=0.1):
        super().__init__()

        # Pretrained backbone; its config tells us the embedding width.
        self.backbone = AutoModel.from_pretrained(model_id)
        width = self.backbone.config.hidden_size

        head_layers = [
            nn.LayerNorm(width),
            nn.Dropout(dropout),
            nn.Linear(width, 128),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes),
        ]
        self.head = nn.Sequential(*head_layers)

        self.freeze_backbone()

    @staticmethod
    def _set_requires_grad(module, flag):
        """Set ``requires_grad`` on every parameter of *module*."""
        for tensor in module.parameters():
            tensor.requires_grad = flag

    def freeze_backbone(self):
        """Freeze all backbone parameters."""
        self._set_requires_grad(self.backbone, False)

    def unfreeze_last_n_blocks(self, n):
        """
        Freeze the backbone, then unfreeze its last ``n`` transformer blocks
        and the final norm layer (if the backbone exposes one).

        Raises:
            AttributeError: if no block list is found on the backbone.
        """
        self.freeze_backbone()
        backbone = self.backbone

        # Block list lives at backbone.model.layer (HF DINOv3ViTModel) or
        # backbone.encoder.layer (ViT/BERT-style); first match wins.
        blocks = None
        for container_name in ("model", "encoder"):
            container = getattr(backbone, container_name, None)
            if container is not None and hasattr(container, "layer"):
                blocks = container.layer
                break
        if blocks is None:
            raise AttributeError(
                "Backbone has no recognizable transformer blocks "
                "(expected .model.layer for DINOv3 or .encoder.layer for ViT/BERT)."
            )

        n_blocks = len(blocks)
        first_trainable = max(0, n_blocks - n)
        for block_idx in range(first_trainable, n_blocks):
            self._set_requires_grad(blocks[block_idx], True)

        # Final normalisation: backbone.norm preferred, backbone.layernorm otherwise.
        for norm_name in ("norm", "layernorm"):
            if hasattr(backbone, norm_name):
                self._set_requires_grad(getattr(backbone, norm_name), True)
                break

    def forward(self, pixel_values):
        """Return class logits for a batch of preprocessed images."""
        hidden = self.backbone(pixel_values=pixel_values).last_hidden_state
        # CLS token is the first token of the sequence.
        return self.head(hidden[:, 0, :])
 

# ====================================
# Training
# ====================================

def get_class_weights(samples, label_to_idx, device):
    """Return inverse-frequency class weights as a tensor on *device*.

    Weight of a class is ``total / (num_classes * count)``. A class missing
    from *samples* is treated as having a count of 1.
    """
    num_classes = len(label_to_idx)
    freq = Counter(lbl for _, lbl in samples)
    n_samples = sum(freq.values())

    weights = torch.zeros(num_classes, device=device)
    for lbl, cls_idx in label_to_idx.items():
        occurrences = max(freq.get(lbl, 1), 1)
        weights[cls_idx] = n_samples / (num_classes * occurrences)
    return weights

def get_weighted_sampler(samples, label_to_idx):
    """Build a WeightedRandomSampler that draws classes with equal probability.

    Each sample is weighted by ``total / count_of_its_class`` and
    ``len(samples)`` indices are drawn with replacement.
    ``label_to_idx`` is accepted for signature symmetry but not used.
    """
    freq = Counter(lbl for _, lbl in samples)
    n_samples = sum(freq.values())
    per_sample = [n_samples / freq[lbl] for _, lbl in samples]
    return WeightedRandomSampler(per_sample, len(samples), replacement=True)

def train_one_epoch(model, loader, criterion, optimizer, device, scaler=None):
    """
    Train for one epoch with optional mixed precision.

    Args:
        model: module mapping an image batch to logits.
        loader: iterable of (images, labels) batches.
        criterion: loss taking (logits, labels).
        optimizer: optimizer over the trainable parameters.
        device: device the batches are moved to.
        scaler: gradient scaler; when given, the forward pass runs under CUDA
            float16 autocast and the scaler drives backward/step.

    Returns:
        (mean loss per sample, accuracy) over the epoch. Both are 0.0 when the
        loader yields no samples (instead of raising ZeroDivisionError),
        matching the empty-input guard used by ``evaluate``.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (images, labels) in enumerate(loader):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        if scaler:
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                logits = model(images)
                loss = criterion(logits, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(images)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch mean is per sample.
        total_loss += loss.item() * images.size(0)
        predicted = logits.argmax(dim=1)
        correct += predicted.eq(labels).sum().item()
        total += labels.size(0)

        if (batch_idx + 1) % 50 == 0:
            print(f" batch {batch_idx+1}/{len(loader)} | "
                  f"loss: {loss.item():.4f} | acc: {correct/total:.3f}")

    denom = max(total, 1)
    return total_loss / denom, correct / denom


def _stage_checkpoint_slug(stage_name: str) -> str:
    """Stable filename fragment (no spaces/colons) for checkpoint paths."""
    s = re.sub(r"[^a-z0-9]+", "_", stage_name.lower())
    return re.sub(r"_+", "_", s).strip("_")


@torch.no_grad()
def evaluate(model, loader, criterion, device, idx_to_label=None):
    """
    Run the model over *loader* in eval mode without gradients.

    Returns:
        metrics: dict with 'loss' (mean per sample), 'accuracy', 'macro_f1'
            and 'weighted_f1'.
        preds: predicted class index per sample.
        labels: true class index per sample.
        probs: softmax probability vector per sample.
    All lists follow the loader's iteration order. ``idx_to_label`` is
    accepted but not used.
    """
    model.eval()
    loss_sum = 0.0
    n_seen = 0
    preds, targets, prob_rows = [], [], []

    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        logits = model(images)

        batch_n = images.size(0)
        loss_sum += criterion(logits, labels).item() * batch_n
        n_seen += batch_n

        prob_rows.extend(torch.softmax(logits, dim=1).cpu().numpy().tolist())
        preds.extend(logits.argmax(dim=1).cpu().numpy().tolist())
        targets.extend(labels.cpu().numpy().tolist())

    metrics = {
        "loss": float(loss_sum / max(n_seen, 1)),
        "accuracy": float(accuracy_score(targets, preds)),
        "macro_f1": float(f1_score(targets, preds, average="macro", zero_division=0)),
        "weighted_f1": float(f1_score(targets, preds, average="weighted", zero_division=0)),
    }
    return metrics, preds, targets, prob_rows


def evaluate_page_level(samples, probs, label_to_idx, idx_to_label):
    """
    Aggregate patch-level probabilities to page-level predictions.

    Probabilities of all samples belonging to one page are averaged and the
    arg-max of the mean is the page prediction. Pages are grouped per
    (class label, page name), matching ``create_page_level``, so pages that
    share a file stem across different classes are evaluated separately
    rather than merged under whichever label was seen last. Such colliding
    pages are reported as ``"<label>/<page>"``; all other pages keep their
    plain page name.

    Args:
        samples: list of (filepath, label_str) for the evaluated split.
        probs: list of per-sample probability vectors (same order as samples).
        label_to_idx: dict mapping label strings to class indices.
        idx_to_label: accepted for signature symmetry; not used.

    Returns:
        dict with 'metrics' (accuracy, macro_f1, weighted_f1, num_pages,
        num_samples), 'pages' (sorted page identifiers), 'page_true',
        'page_pred' (aligned with 'pages') and 'page_avg_probs'
        (page identifier -> mean probability list).

    Raises:
        ValueError: if samples and probs differ in length.
    """
    if len(samples) != len(probs):
        raise ValueError(
            f"samples/probs length mismatch: {len(samples)} != {len(probs)}"
        )

    # (label, page name) -> list of probability vectors
    grouped = defaultdict(list)
    for (filepath, label_str), p in zip(samples, probs):
        key = (label_str, get_page_name(filepath))
        grouped[key].append(np.asarray(p, dtype=np.float32))

    # Only disambiguate identifiers when a page name occurs under several labels.
    name_counts = Counter(page for _, page in grouped)
    key_by_page_id = {}
    for label_str, page in grouped:
        page_id = page if name_counts[page] == 1 else f"{label_str}/{page}"
        key_by_page_id[page_id] = (label_str, page)

    pages_sorted = sorted(key_by_page_id)
    all_page_true = []
    all_page_pred = []
    page_avg_probs = {}

    for page_id in pages_sorted:
        key = key_by_page_id[page_id]
        avg_probs = np.mean(grouped[key], axis=0)
        all_page_true.append(int(label_to_idx[key[0]]))
        all_page_pred.append(int(np.argmax(avg_probs)))
        page_avg_probs[page_id] = avg_probs.tolist()

    acc = accuracy_score(all_page_true, all_page_pred)
    macro_f1 = f1_score(all_page_true, all_page_pred, average="macro", zero_division=0)
    weighted_f1 = f1_score(all_page_true, all_page_pred, average="weighted", zero_division=0)

    metrics = {
        "accuracy": float(acc),
        "macro_f1": float(macro_f1),
        "weighted_f1": float(weighted_f1),
        "num_pages": int(len(pages_sorted)),
        "num_samples": int(len(samples)),
    }

    return {
        "metrics": metrics,
        "pages": pages_sorted,
        "page_true": all_page_true,
        "page_pred": all_page_pred,
        "page_avg_probs": page_avg_probs,
    }


#============================
# Progressive fine-tuning
#============================

def run_stage(model, train_loader, val_loader, criterion, device, stage_name, lr_backbone, lr_head, epochs, output_dir, idx_to_label, use_amp=True):
    """
    Run one stage of progressive fine-tuning.

    Trains the currently trainable backbone parameters (if any) and the head
    with separate learning rates under AdamW + cosine annealing, evaluates on
    the validation loader after every epoch and checkpoints the epoch with the
    best validation macro-F1 to ``output_dir / f"best_{slug}.pt"``.

    The first completed epoch is always checkpointed, even when its macro-F1
    is 0, so the checkpoint file the caller loads afterwards exists and
    belongs to this run whenever ``epochs >= 1``.

    Args:
        stage_name: human-readable title; also determines the checkpoint slug.
        lr_backbone / lr_head: learning rates of the two parameter groups.
        epochs: number of epochs (also the cosine schedule length).
        output_dir: ``Path`` of the directory receiving the checkpoint.
        idx_to_label: accepted for signature symmetry; not used here.
        use_amp: use a gradient scaler / autocast when running on CUDA.

    Returns:
        (history, best_val_f1): per-epoch metric dicts and the best validation
        macro-F1 (0 when no epoch ran).
    """

    print(f"\n{'='*60}")
    print(f" {stage_name}")
    print(f"{'='*60}")

    # Set up optimizer with different LRs for backbone and head
    param_groups = []

    backbone_params = [p for p in model.backbone.parameters() if p.requires_grad]
    head_params = list(model.head.parameters())

    if backbone_params:
        param_groups.append({'params': backbone_params, 'lr': lr_backbone})
        print(f" Backbone params (trainable): {sum(p.numel() for p in backbone_params):,}")

    param_groups.append({'params': head_params, 'lr': lr_head})
    print(f" Head params: {sum(p.numel() for p in head_params):,}")
    print(f" LR backbone: {lr_backbone}, LR head: {lr_head}")
    print(f" Epochs: {epochs}")

    optimizer = torch.optim.AdamW(param_groups, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    scaler = torch.amp.GradScaler() if use_amp and device.type == 'cuda' else None

    slug = _stage_checkpoint_slug(stage_name)
    checkpoint_path = output_dir / f'best_{slug}.pt'

    best_val_f1 = 0
    best_epoch = 0  # 0 means "nothing saved yet"
    history = []

    for epoch in range(epochs):
        print(f"\n Epoch {epoch+1}/{epochs}")

        train_loss, train_acc = train_one_epoch(
                model, train_loader, criterion, optimizer, device, scaler
        )

        val_metrics, _, _, _ = evaluate(model, val_loader, criterion, device)

        scheduler.step()

        print(f" Train loss: {train_loss:.4f} | acc: {train_acc:.3f}")
        print(f" Val loss: {val_metrics['loss']:.4f} | "
              f"acc: {val_metrics['accuracy']:.3f} | "
              f"macro-F1: {val_metrics['macro_f1']:.3f}")

        history.append({
              'epoch': epoch + 1,
              'train_loss': train_loss,
              'train_acc': train_acc,
              'val_macro_f1': val_metrics['macro_f1'],
              'val_loss': val_metrics['loss'],
              'val_accuracy': val_metrics['accuracy'],
        })

        # Save best model (always use slug path so load paths in main() match).
        # `best_epoch == 0` forces a save after the first epoch: a strict `>`
        # against the initial 0 would otherwise never fire when macro-F1 is 0.
        if best_epoch == 0 or val_metrics['macro_f1'] > best_val_f1:
            best_val_f1 = val_metrics['macro_f1']
            best_epoch = epoch + 1
            torch.save({
                'model_state_dict': model.state_dict(),
                'epoch': epoch + 1,
                'val_macro_f1': best_val_f1,
                'val_accuracy': val_metrics['accuracy'],
                'stage_name': stage_name,
                'stage_slug': slug,
            }, checkpoint_path)
            print(f" * New best! Saved to {checkpoint_path}")

    print(f"\n {stage_name} complete. Best: epoch {best_epoch}, macro-F1: {best_val_f1:.3f}")
    return history, best_val_f1

# ==========================
# MAIN
# ==========================
            
def _torch_load(path):
    try:
        return torch.load(path, weights_only=False)
    except TypeError:
        return torch.load(path)


def _save_stage_history_json(output_dir: Path, stage_key: str, history: list) -> None:
    """Write one JSON file per training stage (loss / val metrics per epoch)."""
    path = output_dir / f'history_{stage_key}.json'
    with open(path, 'w') as f:
        json.dump(history, f, indent=2, default=str)
    print(f"  Stage history saved: {path}")


def _plot_stage_history(output_dir: Path, stage_key: str, history: list, experiment: str) -> None:
    """Save train loss + val macro-F1 curves for a single stage."""
    if not history:
        return
    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt

        epochs = [h['epoch'] for h in history]
        train_loss = [h['train_loss'] for h in history]
        val_f1 = [h['val_macro_f1'] for h in history]

        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        axes[0].plot(epochs, train_loss, 'b-')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Train loss')
        axes[0].set_title(f'{stage_key} — train loss')

        axes[1].plot(epochs, val_f1, 'g-')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('Val macro-F1')
        axes[1].set_title(f'{stage_key} — validation')

        fig.suptitle(f'{experiment} / {stage_key}')
        plt.tight_layout()
        out_path = output_dir / f'training_history_{stage_key}.png'
        plt.savefig(out_path, dpi=150)
        plt.close()
        print(f"  Stage plot saved: {out_path}")
    except Exception as e:
        print(f"  (Skipping stage plot for {stage_key}: {e})")


def _save_stage_artifacts(output_dir: Path, stage_key: str, history: list, experiment: str) -> None:
    """Persist one stage's history: first as JSON, then as a curve plot."""
    _save_stage_history_json(output_dir=output_dir, stage_key=stage_key, history=history)
    _plot_stage_history(
        output_dir=output_dir,
        stage_key=stage_key,
        history=history,
        experiment=experiment,
    )


def _load_stage_checkpoint(model, output_dir, stage_name):
    """Load the best checkpoint of *stage_name* into *model*; return False if none exists."""
    ckpt_path = output_dir / f"best_{_stage_checkpoint_slug(stage_name)}.pt"
    if not ckpt_path.is_file():
        print(f"  No checkpoint found for '{stage_name}'; continuing with current weights")
        return False
    model.load_state_dict(_torch_load(ckpt_path)['model_state_dict'])
    return True


def _write_confusion_csv(path, matrix, names):
    """Write a confusion matrix as CSV: header row of names, then one labelled row per true class."""
    import csv

    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([''] + list(names))
        for name, row in zip(names, matrix.tolist()):
            writer.writerow([name] + row)


def _plot_confusion_matrix(path, matrix, names, title, cmap):
    """Render a confusion-matrix heatmap to *path* (best effort); return True on success."""
    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        print("  (matplotlib/seaborn not available, skipping plot)")
        return False

    try:
        fig, ax = plt.subplots(figsize=(14, 12))
        sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap,
                    xticklabels=names, yticklabels=names, ax=ax)
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        ax.set_title(title)
        plt.tight_layout()
        plt.savefig(path, dpi=150)
        plt.close(fig)
    except Exception as e:
        print(f"  (Skipping plot {path}: {e})")
        return False
    return True


def _plot_run_history(path, all_history, experiment):
    """Plot train loss and val macro-F1 over all stages on a continuous epoch axis (best effort)."""
    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        print("  (matplotlib not available, skipping training history plot)")
        return False

    try:
        all_epochs = []
        all_train_loss = []
        all_val_f1 = []
        offset = 0  # epochs completed by earlier stages

        for stage_history in all_history.values():
            for entry in stage_history:
                all_epochs.append(entry['epoch'] + offset)
                all_train_loss.append(entry['train_loss'])
                all_val_f1.append(entry['val_macro_f1'])
            offset += len(stage_history)

        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        axes[0].plot(all_epochs, all_train_loss, 'b-')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Train Loss')
        axes[0].set_title('Training Loss')

        axes[1].plot(all_epochs, all_val_f1, 'g-')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('Macro F1')
        axes[1].set_title('Validation Macro-F1')

        plt.suptitle(f'{experiment} — Progressive Fine-Tuning')
        plt.tight_layout()
        plt.savefig(path, dpi=150)
        plt.close(fig)
    except Exception as e:
        print(f"  (Skipping training history plot: {e})")
        return False
    return True


def main():
    """
    Command-line entry point: run the full progressive fine-tuning pipeline.

    Steps: parse arguments, build the page-level split (honouring the
    exclusion manifest), train Stage A (head only), Stage B (last 2 blocks)
    and optionally Stage C (last 4 blocks), pick the checkpoint with the best
    validation macro-F1 among the stages run in THIS invocation, evaluate it
    on the test split at sample and page level, and write checkpoints,
    metrics, confusion matrices and plots under ``<output_dir>/<experiment>/``.

    Raises:
        RuntimeError: if no training images are found or no checkpoint was
            produced by any stage.
    """
    parser = argparse.ArgumentParser(description="Fine-tune DINO ViT-S")
    parser.add_argument(
        "--data_dir", type=str, required=True,
        help="Path to processed data (e.g., ./Data/output/whole_page)",
    )
    parser.add_argument(
        "--experiment", type=str, required=True,
        choices=["whole_page", "patches_color", "patches_clahe"],
        help="Which experiment variant",
    )
    parser.add_argument("--output_dir", type=str, default="./results",
                        help="Where to save checkpoints and results")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size (reduce if OOM)")
    parser.add_argument("--epochs_a", type=int, default=20,
                        help="Epochs for Stage A (head only)")
    parser.add_argument("--epochs_b", type=int, default=10,
                        help="Epochs for Stage B (last 2 blocks)")
    parser.add_argument("--epochs_c", type=int, default=10,
                        help="Epochs for Stage C (last 4 blocks)")
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--no_amp", action="store_true",
                        help="Disable mixed precision")
    parser.add_argument("--skip_stage_c", action="store_true",
                        help="Skip Stage C (last 4 blocks)")
    parser.add_argument(
        "--exclude_manifest",
        type=str,
        default="./benchmark_page_ids.json",
        help="Optional class->page_ids JSON; excluded pages are skipped during split creation",
    )
    args = parser.parse_args()

    stage_a_name = "Stage A: Head only"
    stage_b_name = "Stage B: Last 2 blocks"
    stage_c_name = "Stage C: Last 4 blocks"

    set_seed(SEED)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    output_dir = Path(args.output_dir) / args.experiment
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"  DINOv3 ViT-S Fine-Tuning")
    print(f"  {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    print(f"  Experiment:  {args.experiment}")
    print(f"  Data dir:    {args.data_dir}")
    print(f"  Device:      {device}")
    print(f"  Batch size:  {args.batch_size}")
    print(f"  AMP:         {not args.no_amp}")
    print(f"  Exclusions:  {args.exclude_manifest}")

    # Page level split
    print(f"\n Creating page level split")
    excluded_pages_by_label = load_exclusion_manifest(args.exclude_manifest)
    excluded_label_count = len(excluded_pages_by_label)
    excluded_id_count = sum(len(v) for v in excluded_pages_by_label.values())
    if excluded_label_count:
        print(f"  Loaded exclusions: {excluded_label_count} labels, {excluded_id_count} page IDs")
    splits, label_to_idx, idx_to_label, skipped_by_label = create_page_level(
        args.data_dir,
        excluded_pages_by_label=excluded_pages_by_label,
    )
    num_classes = len(label_to_idx)

    # Fail early with a clear message instead of deep inside the DataLoader.
    if num_classes == 0 or not splits['train']:
        raise RuntimeError(
            f"No training images found under {args.data_dir}; "
            "expected one sub-directory per class containing image files."
        )

    print(f"  Classes: {num_classes}")
    print(f"  Train: {len(splits['train'])} | Val: {len(splits['val'])} | Test: {len(splits['test'])}")
    if skipped_by_label:
        print("\n  Skipped excluded files by class:")
        for label, count in sorted(skipped_by_label.items()):
            print(f"    {label:<20s} {count:>6d}")

    # Print per-class split counts
    for split_name in ['train', 'val', 'test']:
        counts = Counter(label for _, label in splits[split_name])
        print(f"\n  {split_name}:")
        for label in sorted(counts.keys()):
            print(f"    {label:<20s} {counts[label]:>6d}")

    # Save splits for reproducibility (the file lists themselves, not only counts)
    splits_info = {
        split_name: [[fp, label] for fp, label in samples]
        for split_name, samples in splits.items()
    }
    with open(output_dir / 'splits.json', 'w') as f:
        json.dump({
            'label_to_idx': label_to_idx,
            'idx_to_label': {str(k): v for k, v in idx_to_label.items()},
            'split_counts': {
                name: dict(Counter(l for _, l in samples))
                for name, samples in splits.items()
            },
            'exclude_manifest': str(args.exclude_manifest),
            'excluded_label_count': excluded_label_count,
            'excluded_page_id_count': excluded_id_count,
            'skipped_excluded_files_by_class': dict(skipped_by_label),
            'splits': splits_info,
        }, f, indent=2)

    print(f"Loading DINOv3 processor: {DINOV3_MODEL_ID}")
    processor = AutoImageProcessor.from_pretrained(DINOV3_MODEL_ID)

    train_dataset = ScriptDataset(splits['train'], label_to_idx, processor, augment=True)
    val_dataset = ScriptDataset(splits['val'], label_to_idx, processor, augment=False)
    test_dataset = ScriptDataset(splits['test'], label_to_idx, processor, augment=False)

    pin_memory = device.type == 'cuda'
    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, pin_memory=pin_memory,
    )
    val_loader = DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.num_workers, pin_memory=pin_memory,
    )
    test_loader = DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.num_workers, pin_memory=pin_memory,
    )

    print(f"\n  Building DINOv3 classifier ({num_classes} classes)...")
    model = DINOv3Classifier(DINOV3_MODEL_ID, num_classes, dropout=0.1)
    model = model.to(device)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  Total params:     {total_params:,}")
    print(f"  Trainable params: {trainable_params:,} (head only)")

    class_weights = get_class_weights(splits['train'], label_to_idx, device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    use_amp = not args.no_amp and device.type == 'cuda'
    all_history = {}
    stages_run = []  # stage names trained in this invocation, in order

    # Stage A: Head only (backbone frozen)
    model.freeze_backbone()
    history_a, _ = run_stage(
            model, train_loader, val_loader, criterion, device,
            stage_name=stage_a_name,
            lr_backbone=0, lr_head=1e-3,
            epochs=args.epochs_a, output_dir=output_dir,
            idx_to_label=idx_to_label, use_amp=use_amp,
    )
    stages_run.append(stage_a_name)
    all_history['stage_a'] = history_a
    _save_stage_artifacts(output_dir, 'stage_a', history_a, args.experiment)

    # Stage B: continue from the best Stage A weights, last 2 blocks trainable
    _load_stage_checkpoint(model, output_dir, stage_a_name)
    model.unfreeze_last_n_blocks(2)
    history_b, _ = run_stage(
            model, train_loader, val_loader, criterion, device,
            stage_name=stage_b_name,
            lr_backbone=1e-5, lr_head=1e-3,
            epochs=args.epochs_b, output_dir=output_dir,
            idx_to_label=idx_to_label, use_amp=use_amp,
    )
    stages_run.append(stage_b_name)
    all_history['stage_b'] = history_b
    _save_stage_artifacts(output_dir, 'stage_b', history_b, args.experiment)

    if not args.skip_stage_c:
        # Stage C: continue from the best Stage B weights, last 4 blocks trainable
        _load_stage_checkpoint(model, output_dir, stage_b_name)
        model.unfreeze_last_n_blocks(4)
        history_c, _ = run_stage(
                model, train_loader, val_loader, criterion, device,
                stage_name=stage_c_name,
                lr_backbone=5e-6, lr_head=5e-4,
                epochs=args.epochs_c, output_dir=output_dir,
                idx_to_label=idx_to_label, use_amp=use_amp,
        )
        stages_run.append(stage_c_name)
        all_history['stage_c'] = history_c
        _save_stage_artifacts(output_dir, 'stage_c', history_c, args.experiment)

    # Final evaluation on test set
    print(f"\n{'='*60}")
    print(f" FINAL TEST EVALUATION")
    print(f"{'='*60}")

    # Only consider checkpoints of stages trained in this run; a plain glob
    # would also pick up stale best_*.pt files left by earlier runs
    # (e.g. an old Stage C checkpoint when --skip_stage_c is given now).
    best_f1 = 0.0
    best_ckpt = None
    best_state = None
    for name in stages_run:
        ckpt_path = output_dir / f"best_{_stage_checkpoint_slug(name)}.pt"
        if not ckpt_path.is_file():
            continue
        ckpt = _torch_load(ckpt_path)
        score = float(ckpt.get('val_macro_f1', 0.0))
        if best_ckpt is None or score > best_f1:
            best_f1 = score
            best_ckpt = ckpt_path
            best_state = ckpt['model_state_dict']

    if best_ckpt is None:
        raise RuntimeError("No checkpoint found under output_dir; cannot run test evaluation.")

    print(f" Loading best checkpoint: {best_ckpt} (val F1: {best_f1:.3f})")
    model.load_state_dict(best_state)

    test_metrics, test_preds, test_labels, test_probs = evaluate(
            model, test_loader, criterion, device, idx_to_label
    )
    page_eval = evaluate_page_level(
        splits['test'],
        test_probs,
        label_to_idx=label_to_idx,
        idx_to_label=idx_to_label,
    )
    page_metrics = page_eval["metrics"]

    # Canonical weights for this experiment (same as loaded best val checkpoint, after test eval)
    final_model_path = output_dir / 'final_model.pt'
    torch.save(
        {
            'model_state_dict': model.state_dict(),
            'experiment': args.experiment,
            'model_id': DINOV3_MODEL_ID,
            'num_classes': num_classes,
            'label_to_idx': label_to_idx,
            'source_val_checkpoint': str(best_ckpt),
            'val_macro_f1_at_selection': float(best_f1),
            'test_metrics': test_metrics,
            'page_test_metrics': page_metrics,
        },
        final_model_path,
    )
    print(f"\n  Final model (for deployment / comparison) saved: {final_model_path}")

    print(f"\n Test accuracy: {test_metrics['accuracy']:.3f}")
    print(f" Test macro-F1: {test_metrics['macro_f1']:.3f}")
    print(f" Test weighted-F1: {test_metrics['weighted_f1']:.3f}")
    print(f" Page accuracy: {page_metrics['accuracy']:.3f} "
          f"| Page macro-F1: {page_metrics['macro_f1']:.3f} "
          f"| Pages: {page_metrics['num_pages']}")

    # Passing the full label list keeps reports and matrices num_classes wide
    # even when a class is absent from the test labels and predictions;
    # otherwise target_names / the CSV headers would not match their size.
    class_indices = list(range(num_classes))
    target_names = [idx_to_label[i] for i in class_indices]

    # Classification reports
    report = classification_report(
            test_labels, test_preds, labels=class_indices,
            target_names=target_names, zero_division=0
    )
    print(f"\n{report}")
    page_report = classification_report(
        page_eval["page_true"], page_eval["page_pred"], labels=class_indices,
        target_names=target_names, zero_division=0
    )

    # Confusion matrices
    cm = confusion_matrix(test_labels, test_preds, labels=class_indices)
    page_cm = confusion_matrix(
        page_eval["page_true"], page_eval["page_pred"], labels=class_indices
    )

    # Save everything
    results = {
            'experiment': args.experiment,
            'model': DINOV3_MODEL_ID,
            'num_classes': num_classes,
            'best_val_checkpoint': str(best_ckpt),
            'val_macro_f1_at_selection': float(best_f1),
            'final_model_path': str(final_model_path),
            'test_metrics': test_metrics,
            'page_test_metrics': page_metrics,
            'history': all_history,
            'confusion_matrix': cm.tolist(),
            'page_confusion_matrix': page_cm.tolist(),
            'label_to_idx': label_to_idx,
            'classification_report': report,
            'page_classification_report': page_report,
    }

    with open(output_dir / 'results.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)

    # Save confusion matrices as CSV (stdlib csv: no extra dependency that
    # could fail after training has already finished)
    _write_confusion_csv(output_dir / 'confusion_matrix.csv', cm, target_names)
    _write_confusion_csv(output_dir / 'page_confusion_matrix.csv', page_cm, target_names)

    # Plot confusion matrix
    cm_png = output_dir / 'confusion_matrix.png'
    if _plot_confusion_matrix(
        cm_png, cm, target_names,
        f'Confusion Matrix — {args.experiment} (macro-F1: {test_metrics["macro_f1"]:.3f})',
        'Blues',
    ):
        print(f"\n  Confusion matrix saved: {cm_png}")

    # Plot page-level confusion matrix
    page_cm_png = output_dir / 'page_confusion_matrix.png'
    if _plot_confusion_matrix(
        page_cm_png, page_cm, target_names,
        f'Page Confusion Matrix — {args.experiment} '
        f'(macro-F1: {page_metrics["macro_f1"]:.3f})',
        'Greens',
    ):
        print(f"  Page confusion matrix saved: {page_cm_png}")

    # Plot training history
    history_png = output_dir / 'training_history.png'
    if _plot_run_history(history_png, all_history, args.experiment):
        print(f"  Training history saved: {history_png}")

    print(f"\n{'='*60}")
    print(f"  All results saved to: {output_dir}")
    print(f"{'='*60}\n")

# Run the pipeline only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()