berkamphoon commited on
Commit
a75a576
·
verified ·
1 Parent(s): 79a35b5

Training in progress, epoch 6

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35e16b795535298f0a3ed8b3ed4a1367d9eca1b89855759fafd6526288bb86ef
3
  size 6127553104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b47a69a2a5f6aae43a6f6092b082ca0afcb41231d53e7ffd363c9e163d04746
3
  size 6127553104
log.txt CHANGED
@@ -1,29 +1,29 @@
1
- AUC: 0.8361
2
- Sensitivity at Specificity 80%: 0.6733
3
- Sensitivity at Specificity 85%: 0.5966
4
- Sensitivity at Specificity 90%: 0.5227
5
- Sensitivity at Specificity 95%: 0.4119
6
  ##############################
7
  Sex Group AUC
8
- Sex 0: 0.8212
9
- Sex 1: 0.8581
10
- ES-AUC Sex: 0.8358
11
  ##############################
12
  Race Group AUC
13
- Race Asian: 0.6105
14
- Race Black or African American: 0.9798
15
- Race White: 0.8419
16
- Race Other or Unknown: 0.7765
17
- ES-AUC Race: 0.8325
18
  ##############################
19
  Ethnic Group AUC
20
- Ethnic 0: 0.8374
21
- Ethnic 1: 0.7869
22
- Ethnic Unknown or Not Reported: 0.8220
23
- ES-AUC Ethnic: 0.8355
24
  ##############################
25
  Language Group AUC
26
- Language English: 0.8354
27
- Language Spanish: 0.8283
28
- Language Other or Unknown: 0.8521
29
- ES-AUC Language: 0.8359
 
1
+ AUC: 0.8415
2
+ Sensitivity at Specificity 80%: 0.7188
3
+ Sensitivity at Specificity 85%: 0.6278
4
+ Sensitivity at Specificity 90%: 0.5256
5
+ Sensitivity at Specificity 95%: 0.4148
6
  ##############################
7
  Sex Group AUC
8
+ Sex 0: 0.8426
9
+ Sex 1: 0.8472
10
+ ES-AUC Sex: 0.8414
11
  ##############################
12
  Race Group AUC
13
+ Race Asian: 0.7143
14
+ Race Black or African American: 1.0000
15
+ Race White: 0.8407
16
+ Race Other or Unknown: 0.8420
17
+ ES-AUC Race: 0.8391
18
  ##############################
19
  Ethnic Group AUC
20
+ Ethnic 0: 0.8457
21
+ Ethnic 1: 0.8248
22
+ Ethnic Unknown or Not Reported: 0.7278
23
+ ES-AUC Ethnic: 0.8404
24
  ##############################
25
  Language Group AUC
26
+ Language English: 0.8373
27
+ Language Spanish: 0.9066
28
+ Language Other or Unknown: 0.8520
29
+ ES-AUC Language: 0.8408
runs/Aug21_10-48-46_meedgxh100a/events.out.tfevents.1755787728.meedgxh100a.2323190.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3fc5cb272d60e081d8ae134b3416e75e6c9a30b17b56ce9982ac33e8fc81b69
3
- size 19441
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c93ad5672908e684ae1945d49192862d333e26d69c9806267dec22484de2f2de
3
+ size 21751
train_medgemma_focalft_final_amd_copy.py ADDED
@@ -0,0 +1,695 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 특정 체크포인트로 inference
2
+ # python train.py --task amd --name my_exp --checkpoint checkpoint-938
3
+
4
+ # 평가만 실행 (최신 체크포인트 자동 선택)
5
+ # python train.py --task amd --name my_exp --eval_only
6
+
7
+ # 특정 체크포인트로 평가만 실행
8
+ # python train.py --task amd --name my_exp --checkpoint checkpoint-500 --eval_only
9
+
10
+ # 기존 방식 (훈련 후 최신 체크포인트로 평가)
11
+ # python train.py --task amd --name my_exp
12
+
13
+ from __future__ import division, print_function
14
+
15
+ # Standard library imports
16
+ import os
17
+ import os.path as osp
18
+ import random
19
+ import argparse
20
+ import logging
21
+ import shutil
22
+
23
+ # Third-party imports
24
+ from tqdm import tqdm
25
+ from PIL import Image
26
+ import numpy as np
27
+ import torch
28
+ import torch.backends.cudnn as cudnn
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+ from sklearn.metrics import roc_auc_score
32
+ from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig
33
+ from peft import LoraConfig, get_peft_model, PeftModel
34
+ from trl import SFTConfig, SFTTrainer
35
+ from torch.utils.data import Subset
36
+ import wandb
37
+
38
+ # Local imports
39
+ from utils import compute_es_auc, compute_group_auc, compute_es_auc_multi
40
+
41
# ==================== CONSTANTS ====================
# Global RNG seed consumed by setup_reproducibility().
SEED = 42

# Group categories for bias analysis
GROUPS = [
    ['0', '1'],  # Sex
    ["Asian", "Black or African American", "White", "Other or Unknown"],  # Race
    ["0", "1", "Unknown or Not Reported"],  # Ethnicity
    ["English", "Spanish", "Other or Unknown"]  # Language
]

# Mapping dictionaries for demographic data
# NOTE(review): RACEMAP / ETHNICMAP / LANGUAGEMAP are not referenced anywhere
# in this file's visible code — presumably consumed by the `utils` helpers;
# confirm before removing.
RACEMAP = {
    "Asian": 1,
    "White": 2,
    "Other or Unknown": 3,
    "Black or African American": 4
}

ETHNICMAP = {
    "0": 0,
    "1": 1,
    "Unknown or Not Reported": 2
}

LANGUAGEMAP = {
    "English": 0,
    "Spanish": 1,
    "Other or Unknown": 2,
}

# ==================== TASK-SPECIFIC CONFIGURATIONS ====================
# 'task_idx' is a NEGATIVE column index into each raw data row (see
# create_subset / run_inference); 'pos_weight'/'neg_weight' feed
# WeightedCELossFromCausalLM; the rest are SFTConfig hyper-parameters.
TASK_CONFIGS = {
    'dr': {
        'task_idx': -3,
        'disease_name': 'Diabetic Retinopathy',
        'num_epochs': 15,
        'learning_rate': 5e-4,
        'pos_weight': 0.75,
        'neg_weight': 0.25,
        'batch_size': 8,
        'lr_scheduler': 'linear'
    },
    'amd': {
        'task_idx': -2,
        'disease_name': 'Aged Macular Degeneration',
        'num_epochs': 8,
        'learning_rate': 5e-4,
        'pos_weight': 0.75,
        'neg_weight': 0.25,
        'batch_size': 8,
        'lr_scheduler': 'linear'
    },
    'glaucoma': {
        'task_idx': -1,
        'disease_name': 'Glaucoma',
        'num_epochs': 12,
        'learning_rate': 7e-4,
        'pos_weight': 0.8,
        'neg_weight': 0.2,
        'batch_size': 6,
        'lr_scheduler': 'cosine'
    }
}
105
+
106
+ # ==================== SETUP FUNCTIONS ====================
107
def setup_reproducibility():
    """Seed every RNG used by the pipeline so runs are repeatable."""
    # One seed for Python, NumPy, and all torch CPU/CUDA generators.
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(SEED)

    # Force deterministic cuDNN kernels and disable autotuning so repeated
    # runs pick identical convolution algorithms.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
117
+
118
+
119
def setup_logging(exp_name):
    """Create a logger that writes to the console and to ``<exp_name>/log.txt``.

    Any pre-existing log file is deleted so each run starts fresh.

    Args:
        exp_name: experiment directory (must already exist).

    Returns:
        A ``logging.Logger`` at INFO level with one stream and one file handler.
    """
    log_path = os.path.join(exp_name, "log.txt")

    if osp.isfile(log_path):
        os.remove(log_path)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # Fix: the original attached a fresh console + file handler on every call,
    # so calling setup_logging() more than once (e.g. train then eval in one
    # process) duplicated every log line. Drop stale handlers first.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    # Console handler
    logger.addHandler(logging.StreamHandler())

    # File handler
    logger.addHandler(logging.FileHandler(log_path))

    return logger
138
+
139
+
140
+ # ==================== DATA PROCESSING ====================
141
def mask_until_after_assistant(labels: torch.Tensor, tokenizer, assistant_token_ids: list):
    """Mask every token up to and including the assistant marker with -100.

    For each row of ``labels``, find the first occurrence of the
    ``assistant_token_ids`` subsequence and set all positions through the end
    of that subsequence to -100 (the cross-entropy ignore index), so loss is
    only computed on the assistant's reply. Rows without the marker are left
    unchanged. ``tokenizer`` is unused but kept for interface compatibility.

    Args:
        labels: (B, L) integer tensor of token ids; modified in place.
        tokenizer: unused.
        assistant_token_ids: token-id sequence that marks the assistant turn.

    Returns:
        The (modified) ``labels`` tensor.
    """
    marker_len = len(assistant_token_ids)
    # Fix: hoisted out of the loops — the original rebuilt this tensor for
    # every (row, offset) pair, i.e. O(B*L) redundant allocations.
    marker = torch.tensor(assistant_token_ids, device=labels.device)
    for i in range(labels.size(0)):
        for j in range(labels.size(1) - marker_len + 1):
            if torch.equal(labels[i, j:j + marker_len], marker):
                labels[i, :j + marker_len] = -100  # Mask through the marker
                break
    return labels
150
+
151
+
152
def collate_fn(examples):
    """Collate image+chat examples into a padded batch with loss-masked labels."""
    # Render each chat transcript to plain text and resize each RGB image.
    chat_texts = [
        processor.apply_chat_template(
            ex["messages"], add_generation_prompt=False, tokenize=False
        ).strip()
        for ex in examples
    ]
    image_lists = [
        [ex["image"].convert("RGB").resize((IM_SIZE, IM_SIZE))]
        for ex in examples
    ]

    # Tokenize text and preprocess images together, padding to batch max.
    batch = processor(text=chat_texts, images=image_lists, return_tensors="pt", padding=True)

    # Labels start as a copy of the inputs; ignored positions become -100.
    labels = batch["input_ids"].clone()

    boi_token = processor.tokenizer.special_tokens_map["boi_token"]
    image_token_id = [processor.tokenizer.convert_tokens_to_ids(boi_token)]

    # Never compute loss on padding, the begin-of-image marker, or token 262144.
    for ignored in (processor.tokenizer.pad_token_id, image_token_id, 262144):
        labels[labels == ignored] = -100

    # Also mask the prompt (everything through the assistant marker) and the
    # final token of every row.
    labels = mask_until_after_assistant(labels, processor.tokenizer, ASST_ID)
    labels[:, -1] = -100

    batch["labels"] = labels
    return batch
193
+
194
+
195
def format_data(sample, task_idx, disease_name, system_message, img_root_path):
    """Build one supervised chat example (system/user/assistant) from a raw row."""
    # Column task_idx holds the binary label string; '0.0' means negative.
    is_negative = sample[task_idx] == '0.0'
    answer = 'negative' if is_negative else 'positive'
    prompt = f"Please diagnose whether the {disease_name} exist or not based on the given image.\n"

    return {
        # sample[1] is the image path relative to img_root_path.
        "image": Image.open(os.path.join(img_root_path, sample[1])),
        "label": 0 if is_negative else 1,
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": system_message}]},
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ]},
            {"role": "assistant", "content": [{"type": "text", "text": answer}]}
        ]
    }
213
+
214
+
215
def format_data_for_inference(sample, task_idx, disease_name, system_message, img_root_path):
    """Build an inference-time chat example (no assistant turn) plus group metadata.

    ``task_idx`` is unused here but kept so the signature mirrors format_data.
    """
    prompt = f"Please diagnose whether the {disease_name} exist or not based on the given image."

    system_turn = {"role": "system", "content": [{"type": "text", "text": system_message}]}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            # Trailing newline matches the training prompt formatting.
            {"type": "text", "text": prompt + "\n"},
        ],
    }

    return {
        # sample[1] is the image path relative to img_root_path.
        "image": Image.open(os.path.join(img_root_path, sample[1])),
        "messages": [system_turn, user_turn],
        # Trailing columns (demographics + labels) ride along for evaluation.
        "groups": sample[2:],
    }
231
+
232
+
233
def create_subset(data, task_idx, train=True):
    """Split rows into (negatives, positives), downsampling the negatives.

    The negative:positive ratio is task- and split-specific. For AMD the raw
    grade column is rewritten IN PLACE to a binary '0.0'/'1.0' label (only
    grade '3.0' counts as positive).

    Args:
        data: iterable of mutable rows; column ``task_idx`` holds the label.
        task_idx: -1 glaucoma, -2 AMD, -3 DR.
        train: selects the training vs validation sampling ratio.

    Returns:
        (sampled_negatives, positives)

    Raises:
        ValueError: for any other ``task_idx``.
    """
    if task_idx == -1:  # Glaucoma: anything != '0.0' is positive
        pos = [s for s in data if s[task_idx] != '0.0']
        neg = [s for s in data if s[task_idx] == '0.0']
        ratio = 10 if train else 5
        return random.sample(neg, ratio * len(pos)), pos

    if task_idx == -2:  # AMD: binarize grades; only '3.0' is positive
        neg, pos = [], []
        for s in data:
            if s[task_idx] in ['3.0']:
                s[task_idx] = '1.0'
                pos.append(s)
            else:
                s[task_idx] = '0.0'
                neg.append(s)

        num_sample = len(pos)
        if train:
            print(f"AMD - Number of positive samples: {num_sample}")
            return random.sample(neg, 10 * num_sample), pos
        return random.sample(neg, 1 * num_sample), pos

    if task_idx == -3:  # DR: anything != '0.0' is positive
        pos = [s for s in data if s[task_idx] != '0.0']
        neg = [s for s in data if s[task_idx] == '0.0']
        ratio = 5 if train else 10
        return random.sample(neg, ratio * len(pos)), pos

    raise ValueError(f"Unsupported task_idx: {task_idx}")
278
+
279
+
280
+ # ==================== MODEL COMPONENTS ====================
281
class WeightedCELossFromCausalLM(nn.Module):
    """Token-level cross-entropy with extra weight on the answer tokens.

    Up-weights the first 'positive' sub-token (module-level POS_ID) and
    down-weights the 'negative' one (NEG_ID) to counter class imbalance.
    """

    def __init__(self, pos_weight=1.5, neg_weight=0.5, ignore_index=-100):
        super().__init__()
        self.pos_weight = pos_weight
        self.neg_weight = neg_weight
        self.ignore_index = ignore_index

    def forward(self, logits, labels):
        """Return the mean weighted CE over next-token predictions.

        Args:
            logits: (B, L, V) model outputs.
            labels: (B, L) targets; positions equal to ignore_index are skipped.
        """
        # Standard causal shift: position t predicts token t+1.
        vocab = logits.size(-1)
        flat_logits = logits[..., :-1, :].contiguous().view(-1, vocab)
        flat_labels = labels[..., 1:].contiguous().view(-1)

        # Per-token loss; reduction happens after weighting.
        per_token = F.cross_entropy(
            flat_logits, flat_labels,
            ignore_index=self.ignore_index, reduction='none'
        )

        # Re-weight only the 'positive'/'negative' answer tokens.
        token_weights = torch.ones_like(per_token)
        token_weights[flat_labels == POS_ID[0]] = self.pos_weight
        token_weights[flat_labels == NEG_ID[0]] = self.neg_weight

        # Average only over non-ignored positions.
        keep = flat_labels != self.ignore_index
        return (per_token[keep] * token_weights[keep]).mean()
324
+
325
+
326
class CustomSFTTrainer(SFTTrainer):
    """SFTTrainer variant: task-weighted CE loss plus token-accuracy logging."""

    def __init__(self, task_config, *args, **kwargs):
        # Stash pos/neg loss weights before trl's own initialization runs.
        self.task_config = task_config
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Forward pass + weighted loss; also records token counts and accuracy."""
        mode = "train" if self.model.training else "eval"
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs["labels"]

        # Class-imbalance-aware loss with task-specific weights.
        weighted_ce = WeightedCELossFromCausalLM(
            pos_weight=self.task_config['pos_weight'],
            neg_weight=self.task_config['neg_weight']
        )
        loss = weighted_ce(logits, labels)

        # Accumulate the global number of training tokens across processes.
        if mode == "train":
            if "attention_mask" in inputs:
                num_tokens_in_batch = self.accelerator.gather_for_metrics(
                    inputs["attention_mask"].sum()
                ).sum().item()
            elif "position_ids" in inputs:
                local_num_tokens = torch.tensor(
                    inputs["position_ids"].size(1),
                    device=inputs["position_ids"].device
                )
                num_tokens_in_batch = self.accelerator.gather_for_metrics(
                    local_num_tokens
                ).sum().item()
            else:
                raise ValueError("Expected 'attention_mask' or 'position_ids' in inputs.")

            self._total_train_tokens += num_tokens_in_batch

        self._metrics[mode]["num_tokens"] = [self._total_train_tokens]

        # Mean token accuracy over non-masked label positions.
        if "labels" in inputs and not self.args.use_liger_kernel:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = inputs["labels"][..., 1:].contiguous()

            predictions = shift_logits.argmax(dim=-1)
            mask = shift_labels != -100
            correct_predictions = (predictions == shift_labels) & mask

            # Gather across processes before reducing.
            correct_tokens = self.accelerator.gather_for_metrics(correct_predictions.sum())
            total_tokens = self.accelerator.gather_for_metrics(mask.sum())

            total_sum = total_tokens.sum()
            accuracy = (correct_tokens.sum() / total_sum).item() if total_sum > 0 else 0.0
            self._metrics[mode]["mean_token_accuracy"].append(accuracy)

        return (loss, outputs) if return_outputs else loss
385
+
386
+
387
+ # ==================== MAIN EXECUTION ====================
388
def setup_model_and_processor(model_id):
    """Load the causal-LM in 4-bit NF4 quantization together with its processor.

    The tokenizer pads on the right so answer tokens line up at the sequence
    end during training.

    Args:
        model_id: Hugging Face model identifier.

    Returns:
        (model, processor) tuple.
    """
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        attn_implementation="eager",
        torch_dtype=torch.bfloat16,
        device_map="auto",
        quantization_config=quant_config,
    )

    processor = AutoProcessor.from_pretrained(model_id)
    processor.tokenizer.padding_side = "right"

    return model, processor
409
+
410
+
411
def run_inference(model, processor, val_dataset, task_idx, logger):
    """Evaluate the model on ``val_dataset`` and log overall/group-wise AUCs.

    Runs single-example batches; each image is scored as
    sigmoid(logit('positive') - logit('negative')) at the final position.
    Logs overall AUC, equity-scaled AUC, and per-group AUCs for sex, race,
    ethnicity and language.

    Args:
        model: causal-LM (already on device, eval-ready).
        processor: matching processor/tokenizer.
        val_dataset: list of examples from format_data_for_inference.
        task_idx: negative label-column index into example['groups'].
        logger: logger from setup_logging.
    """
    batch_size = 1
    model.eval()

    preds, targets = [], []
    infos = {'sex': [], 'race': [], 'ethnic': [], 'language': []}

    for i in tqdm(range(0, len(val_dataset), batch_size), desc="Running inference"):
        batch = val_dataset[i:i + batch_size]

        # Render prompts and resize images.
        texts, images = [], []
        for example in batch:
            text = processor.apply_chat_template(
                example["messages"], add_generation_prompt=True, tokenize=False
            ).strip()
            texts.append(text)
            images.append([example["image"].convert("RGB").resize((IM_SIZE, IM_SIZE))])

        with torch.no_grad():
            # Match training-time formatting, where the prompt ends in a newline.
            texts[0] += "\n"
            inputs = processor(
                text=texts, images=images,
                return_tensors="pt", padding=True
            ).to(model.device)

            outputs = model(**inputs, output_hidden_states=False, return_dict=True)
            logits = outputs.logits

            # Score = sigmoid of the (positive - negative) logit margin at the
            # last position. (Removed the unused decoded-token local.)
            probs = torch.sigmoid(logits[0, -1, POS_ID] - logits[0, -1, NEG_ID])

        # Binary target: anything other than '0.0' is positive. For AMD the
        # grade column was already binarized in create_subset.
        target_value = batch[0]['groups'][task_idx]
        target = 0.0 if target_value == '0.0' else 1.0

        preds.append(probs.detach().cpu().item())
        targets.append(target)
        # groups[0..3] = sex, race, ethnicity, language (scalar .item() values).
        infos['sex'].append(batch[0]['groups'][0].item())
        infos['race'].append(batch[0]['groups'][1].item())
        infos['ethnic'].append(batch[0]['groups'][2].item())
        infos['language'].append(batch[0]['groups'][3].item())

    # Compute and log metrics
    targets, preds = np.array(targets), np.array(preds)
    auc_score = roc_auc_score(targets, preds)
    logger.info(f"AUC: {auc_score:.4f}")

    compute_es_auc(targets, preds, logger)

    # Consistency fix: reuse the module-level GROUPS constant instead of a
    # duplicated inline copy of the same category lists.
    for group, labels in zip(['Sex', 'Race', 'Ethnic', 'Language'], GROUPS):
        compute_group_auc(
            targets, preds, infos[group.lower()], labels,
            group, logger, auc_score, False
        )
491
+
492
+
493
if __name__ == '__main__':
    # ==================== ARGUMENT PARSING ====================
    parser = argparse.ArgumentParser(description='Medical Image Classification Training')
    parser.add_argument("--task", required=True, choices=['amd', 'dr', 'glaucoma'],
                        help='Medical task: amd, dr, or glaucoma')
    parser.add_argument("--name", required=True, help='Experiment name')
    parser.add_argument("--use_subset", action='store_true',
                        help='Use balanced subset of data')
    parser.add_argument("--checkpoint", type=str, default=None,
                        help='Specific checkpoint to use for inference (e.g., checkpoint-938)')
    parser.add_argument("--eval_only", action='store_true',
                        help='Only run evaluation without training')
    args = parser.parse_args()

    # ==================== SETUP ====================
    setup_reproducibility()

    # Get task-specific configuration
    task_config = TASK_CONFIGS[args.task]
    task_idx = task_config['task_idx']
    disease_name = task_config['disease_name']

    print(f"Task: {args.task.upper()}")
    print(f"Disease: {disease_name}")
    print(f"Epochs: {task_config['num_epochs']}")
    print(f"Learning Rate: {task_config['learning_rate']}")
    print(f"Batch Size: {task_config['batch_size']}")
    print(f"LR Scheduler: {task_config['lr_scheduler']}")
    print("=" * 50)

    # System message for the model
    system_message = f"""You are an expert AI in ophthalmology.
Your primary role is to provide accurate, reliable, and up-to-date medical knowledge based on credible sources.
You must follow these guidelines:
1. Be accurate, concise, and clinically relevant.
2. Use proper medical terms.
3. Avoid overexplaining unless requested.
4. Tone: confident, professional, precise.
Do not include any explanation or thought.
If {disease_name} is present, answer exactly 'positive'. Otherwise answer 'negative'."""

    # ==================== DATA LOADING ====================
    img_root_path = '/PHShome/sy1081/exeye/data'
    train_dataset_raw = np.load('/PHShome/sy1081/exeye/data/train_final.npy')
    val_dataset_raw = np.load('/PHShome/sy1081/exeye/data/val_final.npy')

    # Create subsets; sum(..., []) flattens the (negatives, positives) pair
    # returned by create_subset into one list.
    train_dataset_raw = sum(create_subset(train_dataset_raw, task_idx, train=True), [])
    val_dataset_raw = sum(create_subset(val_dataset_raw, task_idx, train=False), [])

    # Format datasets
    train_dataset = [
        format_data(s, task_idx, disease_name, system_message, img_root_path)
        for s in tqdm(train_dataset_raw, desc="Formatting training data")
    ]
    random.shuffle(train_dataset)

    val_dataset = [
        format_data_for_inference(s, task_idx, disease_name, system_message, img_root_path)
        for s in tqdm(val_dataset_raw, desc="Formatting validation data")
    ]

    print("=" * 50)
    print(f"Dataset sizes | Train: {len(train_dataset)} | Val: {len(val_dataset)}")
    print("=" * 50)

    # ==================== MODEL SETUP ====================
    model_id = "google/medgemma-27b-it"
    model, processor = setup_model_and_processor(model_id)

    # Get token IDs. These are module-level globals consumed by
    # WeightedCELossFromCausalLM, collate_fn and run_inference.
    POS_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("positive"))
    NEG_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("negative"))
    ASST_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("model\n"))

    # Image side length used by collate_fn and run_inference.
    IM_SIZE = 512

    # LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=8,
        lora_dropout=0.05,
        r=16,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
        modules_to_save=["lm_head", "embed_tokens"],
    )

    # ==================== EXPERIMENT SETUP ====================
    exp_name = f"{model_id.split('/')[-1]}-{args.name}"

    # Determine phase and load model if exists
    if args.eval_only or args.checkpoint:
        # Evaluation mode or specific checkpoint specified
        if args.checkpoint:
            checkpoint_path = os.path.join(exp_name, args.checkpoint)
            if not os.path.exists(checkpoint_path):
                raise ValueError(f"Specified checkpoint {checkpoint_path} does not exist")
            print(f"Loading specified checkpoint: {args.checkpoint}")
        else:
            # Find the latest checkpoint automatically for eval_only
            if not os.path.exists(exp_name):
                raise ValueError(f"Experiment directory {exp_name} does not exist")
            checkpoints = [d for d in os.listdir(exp_name) if d.startswith("checkpoint-")]
            if not checkpoints:
                print("No checkpoint found, loading base experiment...")
                checkpoint_path = exp_name
            else:
                # Sort by checkpoint number
                latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))[-1]
                checkpoint_path = os.path.join(exp_name, latest_checkpoint)
                print(f"Loading latest checkpoint: {latest_checkpoint}")

        model = PeftModel.from_pretrained(model, checkpoint_path)
        phase = "eval"
        logger = setup_logging(exp_name)

    elif os.path.exists(exp_name):
        print("Loading trained PEFT weights...")
        # Find the latest checkpoint automatically
        checkpoints = [d for d in os.listdir(exp_name) if d.startswith("checkpoint-")]
        if checkpoints:
            # Sort by checkpoint number
            latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))[-1]
            print(f"Loading from {latest_checkpoint}")
            model = PeftModel.from_pretrained(model, exp_name + f"/{latest_checkpoint}")
        else:
            print("No checkpoint found, loading base experiment...")
            model = PeftModel.from_pretrained(model, exp_name)
        phase = "val"
        logger = setup_logging(exp_name)
    else:
        print("Initializing new LoRA model...")
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()
        phase = "train"
        os.makedirs(exp_name, exist_ok=True)

    # Task-specific training configuration
    # NOTE(review): push_to_hub=True requires Hugging Face auth — confirm in CI.
    training_args = SFTConfig(
        output_dir=exp_name,
        num_train_epochs=task_config['num_epochs'],
        per_device_train_batch_size=task_config['batch_size'],
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        optim="adamw_torch_fused",
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="steps",
        eval_steps=10000,
        learning_rate=task_config['learning_rate'],
        bf16=True,
        max_grad_norm=1.0,
        warmup_ratio=0.03,
        lr_scheduler_type=task_config['lr_scheduler'],
        push_to_hub=True,
        report_to="tensorboard",
        gradient_checkpointing_kwargs={"use_reentrant": False},
        dataset_kwargs={"skip_prepare_dataset": True},
        remove_unused_columns=False,
        label_names=["labels"],
    )

    # Initialize wandb with task-specific project name
    wandb.init(
        project=f"{exp_name}-{args.task.upper()}-Project",
        name=f"{exp_name}-{args.task}",
        config=dict(training_args.to_dict(), **task_config)
    )

    # ==================== TRAINER SETUP ====================
    trainer = CustomSFTTrainer(
        task_config=task_config,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collate_fn,
        peft_config=peft_config,
        processing_class=processor.tokenizer,
    )

    # Copy source code for reproducibility
    # NOTE(review): copies the canonical script path, not necessarily this
    # file — confirm the source path stays in sync.
    if phase in ["val", "eval"]:
        shutil.copy(
            "/PHShome/sy1081/exeye/train_medgemma_focalft_final.py",
            os.path.join(exp_name, f"train_medgemma_focalft_final_{args.task}_copy.py")
        )

    # ==================== TRAINING ====================
    if phase == 'train':
        print(f"Starting {args.task.upper()} training with task-specific configuration...")
        trainer.train()
        trainer.save_model(training_args.output_dir)
        logger = setup_logging(exp_name)

    # ==================== EVALUATION ====================
    # NOTE(review): a fresh 'train' run never reaches this branch (phase stays
    # 'train'); per the header usage notes, re-run the script to evaluate.
    if phase in ["val", "eval"]:
        print(f"Starting {args.task.upper()} evaluation...")
        if args.checkpoint:
            print(f"Using checkpoint: {args.checkpoint}")
        run_inference(model, processor, val_dataset, task_idx, logger)