chenchangliu committed on
Commit
a38d768
·
verified ·
1 Parent(s): 3ece5b1

Upload code_EffNet_train_best.py

Browse files
Files changed (1) hide show
  1. code_EffNet_train_best.py +423 -0
code_EffNet_train_best.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ # Two-head EfficientNet classifier (multi-task) with:
5
+ # - Dataset: reads image crops from DATA_ROOT/{train,val,test} and labels from sidecar .txt files
6
+ # (each .txt contains: "species_id state_id")
7
+ # - Augmentation (train only): resize to IMG_SIZE, random horizontal flip, random 0/90/180/270 rotation,
8
+ # mild ColorJitter (lighting/camera variation), and small translate/scale jitter via RandomAffine
9
+ # - Transfer learning: EfficientNet-B0 pretrained backbone shared by two classification heads
10
+ # (species head: NUM_SPECIES classes, state head: NUM_STATES classes)
11
+ # - Optimization: AdamW with separate learning rates for backbone vs heads (LR_BACKBONE, LR_HEADS)
12
+ # - Warm-up: freeze backbone for the first FREEZE_EPOCHS epochs, then unfreeze and fine-tune end-to-end
13
+ # - LR schedule: CosineAnnealingLR applied only after unfreezing (T_max = EPOCHS - FREEZE_EPOCHS)
14
+ # - Logging: W&B (self-hosted) logs per-head losses/accuracies, combined accuracy, and current LR values
15
+ # - Checkpointing: saves best.pt (by combined val accuracy = mean of two head accuracies) and last.pt
16
+
17
+ ### To prevent overfitting after 15/20 epochs:
18
+ # - Added: label smoothing to prevent overfitting: ce = nn.CrossEntropyLoss(label_smoothing=0.05)
19
+ # - RandomErasing applied AFTER normalization, because it expects a tensor
20
+ # - Increased dropout
21
+ # - Increased FREEZE_EPOCHS
22
+ # - Reduced color augmentation, use very small numbers
23
+ # - Reduced LR_HEADS
24
+ # - Try freezing batch norm
25
+
26
+ from pathlib import Path
27
+ import os
28
+
29
+ import torch
30
+ import torch.nn as nn
31
+ from torch.utils.data import Dataset, DataLoader
32
+ from torchvision.io import read_image
33
+ from torchvision import transforms
34
+ from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
35
+
36
+ import wandb
37
+
38
+
39
# ------------------ CONFIG ------------------
DATA_ROOT = Path("LTN_crop_twohead")  # root folder containing train/ val/ test/ splits
NUM_SPECIES = 12  # number of classes for the species head
NUM_STATES = 4    # number of classes for the state head

EPOCHS = 150  # total training epochs (warm-up + fine-tuning)
BATCH = 32    # mini-batch size for both train and val loaders

LR_BACKBONE = 3e-5  # small LR for the pretrained EfficientNet trunk
LR_HEADS = 2e-4     # slightly reduced to reduce overfitting
WEIGHT_DECAY = 1e-2  # AdamW weight decay, applied to all param groups

FREEZE_EPOCHS = 15  # freeze backbone initially; heads-only warm-up phase

IMG_SIZE = 224  # square input resolution fed to the network
WORKERS = 1     # DataLoader worker processes

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# --------------------------------------------
59
### Freeze batchNorm
def set_bn_eval(m: nn.Module) -> None:
    """Switch a BatchNorm module to eval mode so its running stats stay frozen.

    Intended for ``model.apply(set_bn_eval)`` right after ``model.train()``:
    BN weights keep training, but running mean/var stop updating. Generalized
    from BatchNorm2d to all BatchNorm variants (1d/2d/3d/Sync) so the freeze
    holds for any backbone; EfficientNet-B0 itself only contains BatchNorm2d,
    so behavior for this model is unchanged.

    Non-BatchNorm modules are left untouched.
    """
    if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
        m.eval()
###-----
64
+
65
class RandomRotate90:
    """Rotate a CHW tensor by a uniformly random multiple of 90 degrees."""

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        # Draw the number of quarter-turns uniformly from {0, 1, 2, 3}.
        quarter_turns = torch.randint(0, 4, (1,)).item()
        # dims=[1, 2] rotates the spatial plane (H, W) of a CHW tensor.
        return torch.rot90(x, quarter_turns, dims=[1, 2])
+
71
+
72
class TwoHeadCrops(Dataset):
    """
    Dataset of image crops with two labels per image (species id + state id).

    Expects layout:
        LTN_crop_twohead/{train,val,test}/.../*.jpg
        LTN_crop_twohead/{train,val,test}/.../*.txt   (contains: "species_id state_id")

    The folder name is ignored. Labels come from the .txt next to each image.
    Files under hidden directories, non-image files, and images without a
    sibling label file are all skipped.
    """
    def __init__(self, root: Path, split: str):
        # Collect image files that have a matching label sidecar.
        img_exts = {".jpg", ".jpeg", ".png"}
        paths = []

        for p in (root / split).rglob("*"):
            if p.is_dir():
                continue
            if p.suffix.lower() not in img_exts:
                continue
            # Skip anything inside a hidden path component (e.g. checkpoints).
            if any(part.startswith(".") for part in p.parts):
                continue
            # A crop is only usable if its "species_id state_id" sidecar exists.
            if not p.with_suffix(".txt").exists():
                continue
            paths.append(p)

        self.img_paths = sorted(paths)  # sorted for deterministic ordering
        if not self.img_paths:
            raise RuntimeError(f"No images found under: {root / split}")

        # Augmentation only for training split
        if split == "train":
            self.tfm = transforms.Compose([
                transforms.Resize((IMG_SIZE, IMG_SIZE), antialias=True),
                transforms.RandomHorizontalFlip(p=0.5),
                RandomRotate90(),  # random 0/90/180/270-degree rotation
                # Mild geometric jitter, applied to half the samples.
                transforms.RandomApply([
                    transforms.RandomAffine(
                        degrees=0,               # no arbitrary angle
                        translate=(0.02, 0.02),  # small shift
                        scale=(0.95, 1.05),      # small zoom
                        shear=None,
                    )
                ], p=0.5),

                # Very mild photometric jitter (lighting/camera variation).
                transforms.ColorJitter(
                    brightness=0.05,
                    contrast=0.02,
                    saturation=0.02,
                    hue=0.01,
                ),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
                # RandomErasing runs AFTER Normalize because it expects a
                # tensor. 20% of images affected; erased area is 1-5% of
                # the image, aspect ratio between 0.5 and 2.
                transforms.RandomErasing(
                    p=0.20,
                    scale=(0.01, 0.05),
                    ratio=(0.5, 2.0),
                    value=0
                ),
            ])
        else:
            # Validation/test: deterministic resize + normalize only.
            self.tfm = transforms.Compose([
                transforms.Resize((IMG_SIZE, IMG_SIZE), antialias=True),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
            ])

    def __len__(self) -> int:
        return len(self.img_paths)

    def __getitem__(self, i: int):
        p = self.img_paths[i]
        x = read_image(str(p)).float() / 255.0  # CHW in [0..1]
        x = self.tfm(x)

        # Sidecar label file: "species_id state_id" (two whitespace-separated ints).
        lab = p.with_suffix(".txt").read_text(encoding="utf-8").strip().split()
        species_id = int(lab[0])
        state_id = int(lab[1])

        return x, torch.tensor(species_id, dtype=torch.long), torch.tensor(state_id, dtype=torch.long)
149
+
150
+
151
class EffNetTwoHead(nn.Module):
    """Shared EfficientNet-B0 trunk feeding two independent linear heads.

    The species head predicts ``num_species`` classes and the state head
    ``num_states`` classes from the same pooled backbone embedding.
    """

    def __init__(self, num_species: int, num_states: int, pretrained: bool = True):
        super().__init__()
        weights = EfficientNet_B0_Weights.DEFAULT if pretrained else None
        backbone = efficientnet_b0(weights=weights)

        # Reuse the torchvision feature extractor and pooling stages as-is.
        self.features = backbone.features
        self.pool = backbone.avgpool

        feat_dim = backbone.classifier[1].in_features
        self.drop = nn.Dropout(0.3)  # regularization before both heads

        # One linear classifier per task, sharing the pooled embedding.
        self.head_species = nn.Linear(feat_dim, num_species)
        self.head_state = nn.Linear(feat_dim, num_states)

    def forward(self, x: torch.Tensor):
        """Return ``(species_logits, state_logits)`` for a batch of images."""
        embedding = self.drop(torch.flatten(self.pool(self.features(x)), 1))
        return self.head_species(embedding), self.head_state(embedding)
172
+
173
+
174
@torch.no_grad()
def eval_one_epoch(model: nn.Module, loader: DataLoader, ce):
    """
    Run one full evaluation pass over ``loader``.

    Returns a 5-tuple:
        (total_loss, species_loss, state_loss, species_acc, state_acc)
    where losses are per-sample means over the whole loader and accuracies
    are fractions in [0, 1]. Tensors are moved to the module-level DEVICE.
    """
    model.eval()

    sum_total = 0.0
    sum_species = 0.0
    sum_state = 0.0
    hits_species = 0
    hits_state = 0
    seen = 0

    for images, y_species, y_state in loader:
        images = images.to(DEVICE, non_blocking=True)
        y_species = y_species.to(DEVICE, non_blocking=True)
        y_state = y_state.to(DEVICE, non_blocking=True)

        logits_species, logits_state = model(images)
        l_species = ce(logits_species, y_species)
        l_state = ce(logits_state, y_state)

        # Weight every batch by its size so means are per-sample.
        batch = images.size(0)
        seen += batch
        sum_species += float(l_species.item()) * batch
        sum_state += float(l_state.item()) * batch
        sum_total += float((l_species + l_state).item()) * batch

        hits_species += int((logits_species.argmax(1) == y_species).sum().item())
        hits_state += int((logits_state.argmax(1) == y_state).sum().item())

    denom = max(1, seen)  # guard against an empty loader
    return (
        sum_total / denom,
        sum_species / denom,
        sum_state / denom,
        hits_species / denom,
        hits_state / denom,
    )
217
+
218
+
219
def main():
    """Train the two-head EfficientNet.

    Phases: heads-only warm-up with a frozen backbone for FREEZE_EPOCHS,
    then end-to-end fine-tuning with cosine LR decay (and frozen BatchNorm
    running stats). Logs per-head metrics to W&B and writes best.pt
    (selected by combined validation accuracy) and last.pt.
    """
    # W&B setup (self-hosted server); a pre-set WANDB_BASE_URL wins.
    os.environ.setdefault("WANDB_BASE_URL", "http://k8s.tu-ilmenau.de:31020")

    run = wandb.init(
        project="EffNetCls",
        entity="mase-students",
        config={
            "epochs": EPOCHS,
            "batch": BATCH,
            "lr_backbone": LR_BACKBONE,
            "lr_heads": LR_HEADS,
            "weight_decay": WEIGHT_DECAY,
            "freeze_epochs": FREEZE_EPOCHS,
            "img_size": IMG_SIZE,
        },
    )

    # Data: train split gets augmentation, val split only resize + normalize.
    train_ds = TwoHeadCrops(DATA_ROOT, "train")
    val_ds = TwoHeadCrops(DATA_ROOT, "val")

    train_loader = DataLoader(
        train_ds,
        batch_size=BATCH,
        shuffle=True,
        num_workers=WORKERS,
        pin_memory=True,
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=BATCH,
        shuffle=False,
        num_workers=WORKERS,
        pin_memory=True,
    )

    # Model
    model = EffNetTwoHead(NUM_SPECIES, NUM_STATES, pretrained=True).to(DEVICE)

    # Freeze backbone initially (heads learn first, then fine-tune backbone)
    for p in model.features.parameters():
        p.requires_grad = False

    # Optimizer with separate LR groups for backbone vs heads.
    # NOTE(review): the pool (avgpool) and drop (Dropout) groups presumably
    # contribute no trainable parameters — harmless but redundant; confirm.
    opt = torch.optim.AdamW(
        [
            {"params": model.features.parameters(), "lr": LR_BACKBONE},
            {"params": model.pool.parameters(), "lr": LR_BACKBONE},
            {"params": model.drop.parameters(), "lr": LR_BACKBONE},
            {"params": model.head_species.parameters(), "lr": LR_HEADS},
            {"params": model.head_state.parameters(), "lr": LR_HEADS},
        ],
        weight_decay=WEIGHT_DECAY,
    )

    # Cosine decay sized for the post-warm-up phase only (T_max =
    # EPOCHS - FREEZE_EPOCHS); it is stepped exclusively after the backbone
    # is unfrozen (see the training loop below).
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        opt,
        T_max=EPOCHS - FREEZE_EPOCHS,
        eta_min=1e-6,
    )

    ce_train = nn.CrossEntropyLoss(label_smoothing=0.05)  # apply label smoothing only for training
    ce_val = nn.CrossEntropyLoss()

    # Save best by combined accuracy; always save last at end
    best_acc = -1.0

    for epoch in range(1, EPOCHS + 1):

        # Unfreeze backbone after warm-up
        if epoch == FREEZE_EPOCHS + 1:
            for p in model.features.parameters():
                p.requires_grad = True
            print(f"[epoch {epoch:03d}] Backbone unfrozen")

        # ---- Train one epoch ----
        model.train()
        # Freeze BatchNorm running statistics after unfreezing the backbone
        # (BN weights still train; only running mean/var stop updating).
        if epoch > FREEZE_EPOCHS:
            model.apply(set_bn_eval)

        # Running sums weighted by batch size so the means are per-sample.
        loss_sum_total = 0.0
        loss_sum_sp = 0.0
        loss_sum_st = 0.0
        n = 0

        correct_sp = 0
        correct_st = 0

        for x, ysp, yst in train_loader:
            x = x.to(DEVICE, non_blocking=True)
            ysp = ysp.to(DEVICE, non_blocking=True)
            yst = yst.to(DEVICE, non_blocking=True)

            opt.zero_grad(set_to_none=True)
            lsp, lst = model(x)

            # per-head losses, summed with equal weight for the joint update
            loss_sp = ce_train(lsp, ysp)
            loss_st = ce_train(lst, yst)

            loss = loss_sp + loss_st

            loss.backward()
            opt.step()

            bs = x.size(0)
            loss_sum_total += float(loss.item()) * bs
            loss_sum_sp += float(loss_sp.item()) * bs
            loss_sum_st += float(loss_st.item()) * bs
            n += bs

            correct_sp += int((lsp.argmax(1) == ysp).sum().item())
            correct_st += int((lst.argmax(1) == yst).sum().item())

        train_loss = loss_sum_total / max(1, n)
        train_loss_sp = loss_sum_sp / max(1, n)
        train_loss_st = loss_sum_st / max(1, n)
        train_acc_sp = correct_sp / max(1, n)
        train_acc_st = correct_st / max(1, n)

        # ---- Validate ----
        val_loss, val_loss_sp, val_loss_st, val_acc_sp, val_acc_st = eval_one_epoch(model, val_loader, ce_val)

        # Step the schedule only after unfreeze; avoids "wasting" cosine
        # decay while the backbone is frozen.
        if epoch > FREEZE_EPOCHS:
            scheduler.step()

        # Read current LRs after scheduler.step()
        # (group 0 = backbone features, last group = state head).
        lr_backbone = opt.param_groups[0]["lr"]
        lr_heads = opt.param_groups[-1]["lr"]

        # Model-selection metric: unweighted mean of the two head accuracies.
        combined_acc = 0.5 * (val_acc_sp + val_acc_st)

        # ---- Print per-epoch summary ----
        print(
            f"epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} (sp={train_loss_sp:.4f}, st={train_loss_st:.4f}) | "
            f"train_acc_sp={train_acc_sp:.3f} | train_acc_st={train_acc_st:.3f} | "
            f"val_loss={val_loss:.4f} (sp={val_loss_sp:.4f}, st={val_loss_st:.4f}) | "
            f"val_acc_sp={val_acc_sp:.3f} | val_acc_st={val_acc_st:.3f} | "
            f"val_acc_combined={combined_acc:.3f} | "
            f"lr_backbone={lr_backbone:.6f} | lr_heads={lr_heads:.6f}"
        )

        # ---- W&B logging ----
        wandb.log({
            "epoch": epoch,

            "train/loss_total": train_loss,
            "train/loss_species": train_loss_sp,
            "train/loss_state": train_loss_st,
            "train/acc_species": train_acc_sp,
            "train/acc_state": train_acc_st,

            "val/loss_total": val_loss,
            "val/loss_species": val_loss_sp,
            "val/loss_state": val_loss_st,
            "val/acc_species": val_acc_sp,
            "val/acc_state": val_acc_st,
            "val/acc_combined": combined_acc,

            "lr/backbone": lr_backbone,
            "lr/heads": lr_heads,
        })

        # ---- Save best checkpoint by combined accuracy ----
        if combined_acc > best_acc:
            best_acc = combined_acc
            torch.save(
                {
                    "model": model.state_dict(),
                    "epoch": epoch,
                    "best_acc": best_acc,
                    "val_acc_species": val_acc_sp,
                    "val_acc_state": val_acc_st,
                    "val_acc_combined": combined_acc,
                    "num_species": NUM_SPECIES,
                    "num_states": NUM_STATES,
                    "img_size": IMG_SIZE,
                },
                "best.pt",
            )

    # Always save last checkpoint (records the configured final epoch)
    torch.save(
        {
            "model": model.state_dict(),
            "epoch": EPOCHS,
            "num_species": NUM_SPECIES,
            "num_states": NUM_STATES,
            "img_size": IMG_SIZE,
        },
        "last.pt",
    )

    run.finish()
    print("Done. Saved best.pt and last.pt")


if __name__ == "__main__":
    main()