Training in progress, epoch 0

Files changed:
- .gitattributes +1 -0
- README.md +58 -0
- adapter_config.json +47 -0
- adapter_model.safetensors +3 -0
- added_tokens.json +3 -0
- runs/Jul15_11-02-42_seribizon/events.out.tfevents.1752591767.seribizon.4176318.0 +3 -0
- special_tokens_map.json +33 -0
- tokenizer.json +3 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
- train_medgemma_ft_copy.py +420 -0
- training_args.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,58 @@
+---
+base_model: google/medgemma-4b-it
+library_name: transformers
+model_name: medgemma-4b-it-dr5
+tags:
+- generated_from_trainer
+- sft
+- trl
+licence: license
+---
+
+# Model Card for medgemma-4b-it-dr5
+
+This model is a fine-tuned version of [google/medgemma-4b-it](https://huggingface.co/google/medgemma-4b-it).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="berkamphoon/medgemma-4b-it-dr5", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yoon307-kaist/medgemma-4b-it-dr5-Project/runs/gtg6ozbb)
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.19.0
+- Transformers: 4.51.3
+- Pytorch: 2.5.0
+- Datasets: 3.6.0
+- Tokenizers: 0.21.1
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@misc{vonwerra2022trl,
+    title = {{TRL: Transformer Reinforcement Learning}},
+    author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+    year = 2020,
+    journal = {GitHub repository},
+    publisher = {GitHub},
+    howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
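Worth noting for the quick start above: this commit ships a LoRA adapter (`adapter_model.safetensors`), not merged full weights, so the adapter can also be attached explicitly to the base model with PEFT. A minimal sketch, assuming the `peft` and `transformers` packages are installed; the repo ids are the ones named in this commit:

```python
# Minimal sketch: attach this repo's LoRA adapter to the base MedGemma model.
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from peft import PeftModel

base_id = "google/medgemma-4b-it"
adapter_id = "berkamphoon/medgemma-4b-it-dr5"

model = AutoModelForImageTextToText.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(base_id)
model = PeftModel.from_pretrained(model, adapter_id)  # loads adapter_model.safetensors
model.eval()
```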
adapter_config.json
ADDED
@@ -0,0 +1,47 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/medgemma-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [
+    "lm_head",
+    "embed_tokens"
+  ],
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "fc1",
+    "out_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "fc2",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
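For reference, this JSON corresponds roughly to the PEFT `LoraConfig` sketched below. This is a reconstruction from the fields above, not the exact object the training script built: the script passes `target_modules="all-linear"`, which PEFT resolved to the explicit module list recorded here.

```python
# Sketch: the LoraConfig implied by adapter_config.json above.
from peft import LoraConfig

peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # "all-linear" in the training script expands to this explicit list
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "fc1", "fc2", "out_proj",
    ],
    # lm_head and embed_tokens are saved fully trained, which is why the
    # adapter file is ~2.8 GB rather than a few tens of MB
    modules_to_save=["lm_head", "embed_tokens"],
)
```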
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d42fc8e20b081bdc403e82f0abfbfc247cfa3b40dc005dad28b8ca245f84feb9
+size 2839124552
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "<image_soft_token>": 262144
+}
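The single added token is the image placeholder whose id the training script hard-codes when masking the loss (`labels[labels == 262144] = -100` in `collate_fn` below). A quick sanity check, assuming the tokenizer shipped in this repo:

```python
# Sketch: verify the added <image_soft_token> maps to the id 262144 that
# collate_fn masks out of the loss.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("berkamphoon/medgemma-4b-it-dr5")
assert tok.convert_tokens_to_ids("<image_soft_token>") == 262144
```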
runs/Jul15_11-02-42_seribizon/events.out.tfevents.1752591767.seribizon.4176318.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba5dfd3fd51e2c00b8b34e24788fd50aae2bfeea01ec7fdbc9b0e9191445b793
+size 8559
special_tokens_map.json
ADDED
@@ -0,0 +1,33 @@
+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
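The `boi_token` entry is the one `collate_fn` in the training script looks up via `special_tokens_map` to mask the begin-of-image token from the loss. A quick check, assuming the base processor:

```python
# Sketch: resolve the begin-of-image token that collate_fn masks.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("google/medgemma-4b-it")
boi = processor.tokenizer.special_tokens_map["boi_token"]  # "<start_of_image>"
print(boi, processor.tokenizer.convert_tokens_to_ids(boi))
```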
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ebf1915455f8237564395182c49e3c685cfe3533b3d50ec6d49ce65ec43c32e
+size 33384723
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074
tokenizer_config.json
ADDED
The diff for this file is too large to render.
train_medgemma_ft_copy.py
ADDED
@@ -0,0 +1,420 @@
+from __future__ import division, print_function
+
+# === Base ===
+import os
+import os.path as osp
+import random
+import argparse
+import logging
+from tqdm import tqdm
+from matplotlib import pyplot as plt
+import pdb
+from PIL import Image
+import shutil
+
+# === DL ===
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+
+# === Custom ===
+import tools.imutils as imutils
+import tools.utils as utils
+import tools.pyutils as pyutils
+from tools.utils import compute_es_auc, compute_group_auc, ImprovedBalancedBatchSampler, compute_es_auc_multi
+
+# === Evaluation ===
+from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score
+
+# === Transformers ===
+from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig, pipeline
+from peft import LoraConfig, get_peft_model
+from trl import SFTTrainer, SFTConfig
+import wandb
+
+# === Label Masking Function ===
+def mask_until_after_assistant(labels: torch.Tensor, tokenizer, assistant_token_ids: list):
+    # Find the first occurrence of the assistant marker in each sequence and
+    # mask everything before it, so only the assistant's answer contributes
+    # to the loss.
+    for i in range(labels.size(0)):
+        for j in range(labels.size(1) - len(assistant_token_ids) + 1):
+            if torch.equal(labels[i, j:j + len(assistant_token_ids)], torch.tensor(assistant_token_ids, device=labels.device)):
+                labels[i, :j + len(assistant_token_ids)] = -100  # mask up to and including the assistant marker
+                break
+    return labels
+
+
+# === Collate Function ===
+def collate_fn(examples):
+    texts = []
+    images = []
+    for example in examples:
+        image = example["image"].convert("RGB")
+        image = image.resize((512, 512))
+        images.append([image])
+        texts.append(processor.apply_chat_template(
+            example["messages"], add_generation_prompt=False, tokenize=False
+        ).strip())
+
+    # Tokenize the texts and process the images
+    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
+
+    # The labels are the input_ids, with the padding and image tokens masked in
+    # the loss computation
+    labels = batch["input_ids"].clone()
+
+    # Mask image tokens
+    image_token_id = [
+        processor.tokenizer.convert_tokens_to_ids(
+            processor.tokenizer.special_tokens_map["boi_token"]
+        )
+    ]
+    # Mask tokens that are not used in the loss computation
+    labels[labels == processor.tokenizer.pad_token_id] = -100
+    labels[labels == image_token_id] = -100
+    labels[labels == 262144] = -100  # <image_soft_token>, see added_tokens.json
+
+    labels = mask_until_after_assistant(labels, processor.tokenizer, ASST_ID)
+    labels[:, -1] = -100
+
+    batch["labels"] = labels
+    # pdb.set_trace()
+    return batch
+
+def format_data(sample):
+    label = 'negative' if sample[task_idx] == '0.0' else 'positive'
+    prompt = f"Please diagnose whether the {disease_name} exist or not based on the given image.\n"
+
+    # pdb.set_trace()
+    example = {}
+    example["image"] = Image.open(os.path.join(img_root_path, sample[1]))
+    example["label"] = 0 if sample[task_idx] == '0.0' else 1
+    example["messages"] = [
+        {"role": "system", "content": [{"type": "text", "text": system_message}]},
+        {"role": "user", "content": [
+            # {"type": "image", "image": os.path.join(img_root_path, sample[1])},
+            {"type": "image"},
+            {"type": "text", "text": prompt},
+        ]},
+        {"role": "assistant", "content": [{"type": "text", "text": str(label)}]}
+    ]
+
+    return example
+
+def format_data_for_inference(sample):
+    prompt = f"Please diagnose whether the {disease_name} exist or not based on the given image.\n"
+
+    # pdb.set_trace()
+    example = {}
+    example["image"] = Image.open(os.path.join(img_root_path, sample[1]))
+    # example["label"] = 0 if sample[task_idx] == '0.0' else 1
+    example["messages"] = [
+        {"role": "system", "content": [{"type": "text", "text": system_message}]},
+        {"role": "user", "content": [
+            # {"type": "image", "image": os.path.join(img_root_path, sample[1])},
+            {"type": "image"},
+            {"type": "text", "text": prompt + "\n"},
+        ]},
+        # {"role": "assistant", "content": [{"type": "text", "text": str(label)}]}
+    ]
+    # prompt = f"Please diagnose whether the {disease_name} exist or not based on the given image."
+    # return [
+    #     {"role": "system", "content": [{"type": "text", "text": system_message}]},
+    #     {"role": "user", "content": [
+    #         {"type": "image", "image": os.path.join(img_root_path, sample[1])},
+    #         {"type": "text", "text": prompt}
+    #     ]}
+    # ]
+    return example
+
+# === Logit Preprocessing ===
+def slice_logits(logits, labels):
+    if isinstance(logits, (tuple, list)):
+        logits = logits[0]
+    return logits.detach().cpu()
+
+def compute_metrics(eval_pred):
+    logits = torch.tensor(eval_pred.predictions)
+
+    token_ids = logits.argmax(dim=-1)  # (B, L): predicted token at each position
+
+    batch_logits = []
+    for b in range(logits.size(0)):
+        seq = token_ids[b]  # (L,)
+        idxs = torch.where((seq == POS_ID[0]) | (seq == NEG_ID[0]))[0]
+        if len(idxs) == 0:
+            raise ValueError(f"Neither pos_id nor neg_id found in sequence {b}")
+        t = idxs[0].item()      # first position where pos or neg appears
+        tok_id = seq[t].item()  # should be either pos_id or neg_id
+        batch_logits.append(logits[b, t, tok_id])  # scalar
+
+    batch_logits = torch.stack(batch_logits)  # shape: [B]
+    pred_texts = processor.tokenizer.batch_decode(token_ids[:, -1], skip_special_tokens=True)
+
+    # print(pred_texts)
+    # pdb.set_trace()
+    probs = torch.sigmoid(logits[:, -1, POS_ID[0]] - logits[:, -1, NEG_ID[0]]).numpy()
+
+    # probs = torch.sigmoid(batch_logits).numpy()
+    labels = torch.tensor(eval_pred.label_ids)
+    gt_ids = labels[labels != -100].view(logits.size(0), -1)[:, 0]
+    y_true = (gt_ids == POS_ID[0]).int().cpu().numpy()
+    auc_val = roc_auc_score(y_true, probs)
+    fpr, tpr, thr = roc_curve(y_true, probs)
+    best = thr[np.argmax(tpr - fpr)]
+    acc = accuracy_score(y_true, probs >= best)
+    return {"roc_auc": auc_val, "accuracy": acc}
+
+def run_custom_evaluation(trainer, val_dataset, val_labels):
+    outputs = trainer.predict(val_dataset)
+    logits = torch.from_numpy(outputs.predictions)  # (B, S, L)
+    # pdb.set_trace()
+    probs = torch.sigmoid(logits[:, -1, POS_ID[0]] - logits[:, -1, NEG_ID[0]]).numpy()
+
+    # decoded = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
+    # y_pred = [1 if "positive" in t.lower() else 0 for t in decoded]
+
+    auc_val = roc_auc_score(val_labels, probs)
+    # acc = accuracy_score(val_labels, y_pred)
+    print(f"[Custom Eval] AUC: {auc_val:.4f}")
+    # print(f"[Custom Eval] AUC: {auc_val:.4f}, ACC: {acc:.4f}")
+    return {"auc": auc_val}
+
+# === Main ===
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task", required=True, help='amd, dr, glaucoma')
+    parser.add_argument("--name", required=True)
+    parser.add_argument("--use_subset", action='store_true')
+    args = parser.parse_args()
+
+    pyutils.same_seeds(0)
+
+    task_map = {'dr': (-3, 'Diabetic Retinopathy'), 'amd': (-2, 'Aged Macular Degeneration'), 'glaucoma': (-1, 'Glaucoma')}
+    task_idx, disease_name = task_map[args.task]
+    system_message = f"""You are an expert AI in ophthalmology.
+Your primary role is to provide accurate, reliable, and up-to-date medical knowledge based on credible sources.
+You must follow these guidelines:
+1. Be accurate, concise, and clinically relevant.
+2. Use proper medical terms.
+3. Avoid overexplaining unless requested.
+4. Tone: confident, professional, precise.
+Do not include any explanation or thought.
+If {disease_name} is present, answer exactly 'positive'. Otherwise answer 'negative'."""
+
+    cudnn.benchmark = True
+    img_root_path = '/shared/ssd_30T/yoon/exEYE/Eyeproject/data'
+    train_dataset = np.load('/shared/ssd_30T/yoon/exEYE/datasplit/train_final.npy')
+    val_dataset_raw = np.load('/shared/ssd_30T/yoon/exEYE/datasplit/val_final.npy')
+
+    if args.use_subset:
+        # Subsample to a fixed negative:positive ratio (5:1 for training, 1:1 for validation)
+        def subset(data, train=True):
+            neg = [s for s in data if s[task_idx] == '0.0']
+            pos = [s for s in data if s[task_idx] != '0.0']
+            num_sample = len(pos)
+            if train:
+                return random.sample(neg, 5 * num_sample), random.sample(pos, num_sample)
+            else:
+                return random.sample(neg, num_sample), random.sample(pos, num_sample)
+            # return random.sample(neg, 15), random.sample(pos, 15)
+            # return neg, random.sample(pos, num_sample)
+        train_dataset = sum(subset(train_dataset, train=True), [])
+        val_dataset_raw = sum(subset(val_dataset_raw, train=False), [])
+
+    train_dataset = [format_data(s) for s in tqdm(train_dataset)]
+    random.shuffle(train_dataset)
+    val_dataset = [format_data_for_inference(s) for s in tqdm(val_dataset_raw)]
+    val_labels = [1 if s[task_idx] != '0.0' else 0 for s in val_dataset_raw]
+    # val_dataset = [format_data(s) for s in tqdm(val_dataset)]
+    print("=" * 50)
+    print(f"Total number of Data| Train: {len(train_dataset)} | Val : {len(val_dataset)}")
+    print("=" * 50)
+
+    model_id = "google/medgemma-4b-it"
+    model_kwargs = dict(
+        attn_implementation="eager",
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    )
+
+    model_kwargs["quantization_config"] = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
+        bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
+    )
+
+    model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
+    processor = AutoProcessor.from_pretrained(model_id)
+
+    # Use right padding to avoid issues during training
+    processor.tokenizer.padding_side = "right"
+    # processor.image_processor.size = {"height": 512, "width": 512}
+    # processor.image_processor.crop_size = {"height": 512, "width": 512}
+
+    POS_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("positive"))  # 30558
+    NEG_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("negative"))  # 27851
+    ASST_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("model\n"))
+
+    peft_config = LoraConfig(
+        lora_alpha=16,
+        lora_dropout=0.05,
+        r=16,
+        bias="none",
+        target_modules="all-linear",
+        task_type="CAUSAL_LM",
+        modules_to_save=[
+            "lm_head",
+            "embed_tokens",
+        ],
+    )
+
+    exp_name = f"{model_id.split('/')[-1]}-{args.name}"
+
+    if os.path.exists(exp_name):
+        from peft import PeftModel
+        print("🔁 Loading trained PEFT weights...")
+        model = PeftModel.from_pretrained(model, exp_name)
+        # model = PeftModel.from_pretrained(model, "llava-1.5-7b-hf-dr-all/checkpoint-80")
+        phase = "val"
+    else:
+        print("🚀 Initializing new LoRA model...")
+        model = get_peft_model(model, peft_config)
+        model.print_trainable_parameters()
+        phase = "train"
+
+    training_args = SFTConfig(
+        output_dir=exp_name,
+        num_train_epochs=15,                  # Number of training epochs
+        per_device_train_batch_size=4,        # Batch size per device during training
+        per_device_eval_batch_size=4,         # Batch size per device during evaluation
+        gradient_accumulation_steps=8,        # Number of steps before performing a backward/update pass
+        gradient_checkpointing=True,          # Enable gradient checkpointing to reduce memory usage
+        optim="adamw_torch_fused",            # Use fused AdamW optimizer for better performance
+        logging_steps=10,                     # Number of steps between logs
+        save_strategy="epoch",                # Save checkpoint every epoch
+        eval_strategy="steps",                # Evaluate every `eval_steps`
+        eval_steps=10000,                     # Number of steps between evaluations
+        learning_rate=3e-4,                   # Learning rate based on QLoRA paper
+        bf16=True,                            # Use bfloat16 precision
+        max_grad_norm=0.3,                    # Max gradient norm based on QLoRA paper
+        warmup_ratio=0.03,                    # Warmup ratio based on QLoRA paper
+        lr_scheduler_type="linear",           # Use linear learning rate scheduler
+        push_to_hub=True,                     # Push model to Hub
+        report_to="tensorboard",              # Report metrics to tensorboard
+        gradient_checkpointing_kwargs={"use_reentrant": False},  # Non-reentrant checkpointing to avoid issues
+        dataset_kwargs={"skip_prepare_dataset": True},  # Skip default dataset preparation to preprocess manually
+        remove_unused_columns=False,          # Columns are unused for training but needed for data collator
+        label_names=["labels"],
+    )
+    # training_args.remove_unused_columns = False
+
+    wandb.init(project=f"{exp_name}-Project", name=exp_name, config=training_args)
+
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=val_dataset,
+        data_collator=collate_fn,
+        peft_config=peft_config,
+        processing_class=processor.tokenizer,
+        # compute_metrics=compute_metrics,
+        # preprocess_logits_for_metrics=slice_logits,
+    )
+
+    shutil.copy("/shared/ssd_30T/yoon/exEYE/Eyeproject/train_medgemma_ft.py", os.path.join(".", exp_name, "train_medgemma_ft_copy.py"))
+
+    if phase == 'train':
+        trainer.train()
+        trainer.save_model(training_args.output_dir)
+
+        # custom_eval_metrics = run_custom_evaluation(trainer, val_dataset, val_labels)
+    # else:
+    #     ft_pipe = pipeline(
+    #         "image-text-to-text",
+    #         model=exp_name,
+    #         processor=processor,
+    #         torch_dtype=torch.bfloat16,
+    #     )
+
+    #     # Set `do_sample = False` for deterministic responses
+    #     ft_pipe.model.generation_config.do_sample = False
+    #     ft_pipe.model.generation_config.pad_token_id = processor.tokenizer.eos_token_id
+    #     # Use left padding during inference
+    #     processor.tokenizer.padding_side = "left"
+
+    #     texts = []
+    #     images = []
+
+    #     for example in val_dataset:
+    #         text = processor.apply_chat_template(
+    #             example["messages"], add_generation_prompt=True, tokenize=False
+    #         ).strip()
+    #         texts.append(text)
+    #         image = example["image"].convert("RGB").resize((512, 512))
+    #         images.append([image])  # wrap each image in a list for the batched format MedGemma expects
+
+    #     # pdb.set_trace()
+    #     ft_outputs = ft_pipe(
+    #         text=texts,
+    #         images=images,
+    #         max_new_tokens=5,
+    #         batch_size=1,
+    #         return_full_text=False,
+    #     )
+
+    # Inference with raw logits over the validation set
+    batch_size = 1
+    model.eval()
+    all_logits = []
+
+    for i in tqdm(range(0, len(val_dataset), batch_size), desc="Running inference with logits"):
+        batch = val_dataset[i:i + batch_size]
+
+        # prepare inputs
+        texts = []
+        images = []
+        for example in batch:
+            text = processor.apply_chat_template(
+                example["messages"], add_generation_prompt=True, tokenize=False
+            ).strip()
+            texts.append(text)
+            image = example["image"].convert("RGB").resize((512, 512))
+            images.append([image])
+
+        # tokenizer & image processor
+        with torch.no_grad():
+            texts[0] += "\n"
+            inputs = processor(
+                text=texts,
+                images=images,
+                return_tensors="pt",
+                padding=True
+            ).to(model.device)
+
+            outputs = model(**inputs, output_hidden_states=False, return_dict=True)
+
+            # pdb.set_trace()
+            print(processor.tokenizer.decode(outputs.logits[0].argmax(-1)[-1]))
+
+            # logits: (B, L, V)
+            all_logits.append(outputs.logits.to(torch.float32).detach().cpu().numpy())
+
+    # pdb.set_trace()
+
+    logits = torch.from_numpy(np.stack(all_logits, axis=0)).squeeze(1)
+
+    # Positive probability from the contrast between the "positive" and "negative" token logits
+    probs = torch.sigmoid(logits[:, -1, POS_ID] - logits[:, -1, NEG_ID])
+
+    # decoded = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
+    # y_pred = [1 if "positive" in t.lower() else 0 for t in decoded]
+
+    auc_val = roc_auc_score(val_labels, probs)
+    print(auc_val)
+
+    # print(trainer.evaluate())
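Given the argparse flags defined at the top of `__main__`, the script is launched as, e.g., `python train_medgemma_ft_copy.py --task dr --name dr5` (hypothetical values, but consistent with this repo: `exp_name` then becomes `medgemma-4b-it-dr5`, which is both the output directory and the Hub repo this commit pushes to); adding `--use_subset` trains on the 5:1 negative/positive subsample described above.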
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:917843d0db4b87699709b89729f6dbbf7627e023b2ea7d95950d17712c751c5e
+size 5752