|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
!pip install -q trackio |
|
|
|
|
|
import os |
|
|
import gc |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from PIL import Image |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.utils.data import Dataset, DataLoader |
|
|
from torchvision import transforms, models |
|
|
import pytorch_lightning as pl |
|
|
from pytorch_lightning.callbacks import ModelCheckpoint |
|
|
from sklearn.model_selection import train_test_split |
|
|
from kaggle_secrets import UserSecretsClient |
|
|
import trackio |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PipelineSettings:
    """Configuration values for the face age/gender training pipeline.

    Every value is a plain instance attribute, so a caller can override any
    of them on the instance after construction.
    """

    def __init__(self):
        # Dataset root and the CSV manifests that live directly under it.
        self.DATA_ROOT_DIR = "/kaggle/input/sep-25-dl-gen-ai-nppe-1/face_dataset"
        self.TRAIN_CSV_PATH = f"{self.DATA_ROOT_DIR}/train.csv"
        self.TEST_CSV_PATH = f"{self.DATA_ROOT_DIR}/test.csv"

        # Training hyperparameters.
        self.INPUT_IMAGE_SIZE = 128
        self.BATCH_SIZE = 128
        self.LEARNING_RATE = 1e-3
        self.NUM_EPOCHS = 10
        # Multiplier applied to the age loss before it is added to the
        # gender loss.
        self.AGE_LOSS_WEIGHT = 0.01

        # os.cpu_count() returns None when the CPU count cannot be
        # determined; fall back to 0 so the DataLoader always receives an
        # integer (0 means "load in the main process").
        self.NUM_DATALOADER_WORKERS = os.cpu_count() or 0


# Module-level settings instance shared by the rest of the file.
settings = PipelineSettings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ImageAugmentor:
    """Builds the torchvision preprocessing pipelines for a square input size.

    Both pipelines resize to ``image_size`` x ``image_size``, convert to a
    tensor and normalize with ``norm_params``; the training pipeline also
    applies a random horizontal flip and a mild colour jitter in between.
    """

    def __init__(self, image_size):
        self.image_size = image_size
        # Per-channel mean/std forwarded to transforms.Normalize.
        self.norm_params = {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}

    def _resize_step(self):
        """Return the resize transform shared by both pipelines."""
        side = self.image_size
        return transforms.Resize((side, side))

    def _tensor_steps(self):
        """Return the tensor-conversion and normalization tail of a pipeline."""
        return [transforms.ToTensor(), transforms.Normalize(**self.norm_params)]

    def get_training_transforms(self):
        """Return the augmenting pipeline used for training images."""
        augmentations = [
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
        ]
        return transforms.Compose(
            [self._resize_step(), *augmentations, *self._tensor_steps()]
        )

    def get_inference_transforms(self):
        """Return the deterministic pipeline used for validation/inference."""
        return transforms.Compose([self._resize_step(), *self._tensor_steps()])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FaceImageDataset(Dataset):
    """Dataset yielding ``(image, gender_target, age_target)`` per metadata row.

    Args:
        metadata_df: DataFrame indexed positionally; each row must provide
            the ``full_path``, ``gender`` and ``age`` columns.
        image_dir: Directory that each row's ``full_path`` is joined onto.
        image_transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, metadata_df, image_dir, image_transform=None):
        self.metadata = metadata_df
        self.image_dir = image_dir
        self.transform = image_transform

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return it with both targets.

        Returns:
            A tuple ``(image, gender_target, age_target)`` where both targets
            are 0-d float32 tensors and ``image`` is the (optionally
            transformed) RGB image.
        """
        # Positional lookup, so a shuffled/non-contiguous index is fine.
        row = self.metadata.iloc[idx]
        image_path = os.path.join(self.image_dir, row['full_path'])

        # Image.open is lazy and keeps the file open; convert() forces the
        # pixel data to be read, and the context manager then closes the
        # file handle deterministically instead of leaving it to the GC.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        if self.transform:
            image = self.transform(image)

        gender_target = torch.tensor(row['gender'], dtype=torch.float32)
        age_target = torch.tensor(row['age'], dtype=torch.float32)
        return image, gender_target, age_target
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FaceDataModule(pl.LightningDataModule):
    """LightningDataModule that splits the training CSV into train/val sets.

    Args:
        config: Pipeline settings supplying the CSV path, data root, image
            size, batch size and DataLoader worker count.
    """

    def __init__(self, config: PipelineSettings):
        super().__init__()
        self.cfg = config
        self.augmentor = ImageAugmentor(self.cfg.INPUT_IMAGE_SIZE)
        # Filled in by setup().
        self.train_df = None
        self.val_df = None

    def prepare_data(self):
        """Nothing to download or pre-process."""
        pass

    def setup(self, stage=None):
        """Create the train/val datasets for the 'fit' stage (or no stage).

        The training CSV is split 85/15, stratified on the ``gender`` column
        with a fixed random state; the training split gets the augmenting
        transforms and the validation split the inference transforms.
        """
        if stage not in ('fit', None):
            return

        manifest = pd.read_csv(self.cfg.TRAIN_CSV_PATH)
        split = train_test_split(
            manifest,
            test_size=0.15,
            random_state=42,
            stratify=manifest['gender'],
        )
        self.train_df, self.val_df = split

        image_root = self.cfg.DATA_ROOT_DIR
        train_tf = self.augmentor.get_training_transforms()
        self.train_dataset = FaceImageDataset(self.train_df, image_root, train_tf)
        val_tf = self.augmentor.get_inference_transforms()
        self.val_dataset = FaceImageDataset(self.val_df, image_root, val_tf)

    def train_dataloader(self):
        """Return the shuffled DataLoader over the training split."""
        options = {
            'batch_size': self.cfg.BATCH_SIZE,
            'shuffle': True,
            'num_workers': self.cfg.NUM_DATALOADER_WORKERS,
        }
        return DataLoader(self.train_dataset, **options)

    def val_dataloader(self):
        """Return the (unshuffled) DataLoader over the validation split."""
        options = {
            'batch_size': self.cfg.BATCH_SIZE,
            'num_workers': self.cfg.NUM_DATALOADER_WORKERS,
        }
        return DataLoader(self.val_dataset, **options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AbstractFaceModel(pl.LightningModule):
    """Shared training/validation logic for the gender + age face models.

    Subclasses implement ``forward(images)`` and return a pair
    ``(gender_preds, age_preds)``. Gender is trained with
    ``BCEWithLogitsLoss`` (so ``gender_preds`` are logits) and age with
    ``MSELoss``; the optimized loss is
    ``gender_loss + age_loss_weight * age_loss``.

    Args:
        learning_rate: Learning rate passed to Adam.
        age_loss_weight: Multiplier applied to the age loss in the total.
    """

    # Keys of the per-step loss records kept for epoch-level averaging.
    _LOSS_KEYS = ('loss_total', 'loss_gender', 'loss_age')

    def __init__(self, learning_rate, age_loss_weight):
        super().__init__()
        # Let Lightning record the constructor arguments as hyperparameters.
        self.save_hyperparameters()
        self.lr = learning_rate
        self.age_weight = age_loss_weight
        self.gender_loss_fn = nn.BCEWithLogitsLoss()
        self.age_loss_fn = nn.MSELoss()

        # Detached per-step losses, averaged and cleared at each epoch end.
        self.training_step_outputs = []
        self.validation_step_outputs = []

    @staticmethod
    def _log_to_trackio(metrics):
        """Send ``metrics`` to TrackIO on a best-effort basis.

        Experiment tracking must never abort training, so failures are
        downgraded to a warning. Only ``Exception`` is caught, so
        KeyboardInterrupt/SystemExit still propagate.
        """
        try:
            trackio.log(metrics)
        except Exception as exc:
            import warnings

            # The default warning filter shows this once per call site, which
            # avoids flooding the output when every step fails the same way.
            warnings.warn(
                f"TrackIO logging failed: {exc}", RuntimeWarning, stacklevel=2
            )

    @classmethod
    def _epoch_means(cls, step_outputs):
        """Return ``{loss_key: mean over the recorded steps}`` as floats."""
        return {
            key: torch.stack([record[key] for record in step_outputs]).mean().item()
            for key in cls._LOSS_KEYS
        }

    @staticmethod
    def _detached(total_loss, gender_loss, age_loss):
        """Bundle the three losses, detached from the graph, for averaging."""
        return {
            'loss_total': total_loss.detach(),
            'loss_gender': gender_loss.detach(),
            'loss_age': age_loss.detach(),
        }

    def _calculate_losses(self, gender_preds, age_preds, gender_labels, age_labels):
        """Return ``(total_loss, gender_loss, age_loss)`` for one batch.

        Assumes the predictions are shaped ``(batch, 1)``, as produced by the
        ``nn.Linear(..., 1)`` heads of the subclasses in this file.
        """
        # squeeze(-1) removes only the trailing size-1 dimension. A bare
        # squeeze() would also drop the batch dimension for a batch of one,
        # leaving a 0-d tensor whose shape no longer matches the labels.
        gender_loss = self.gender_loss_fn(gender_preds.squeeze(-1), gender_labels)
        age_loss = self.age_loss_fn(age_preds.squeeze(-1), age_labels)
        total_loss = gender_loss + (age_loss * self.age_weight)
        return total_loss, gender_loss, age_loss

    def training_step(self, batch, batch_idx):
        """Compute the losses for one training batch and log them.

        Returns:
            The weighted total loss that Lightning back-propagates.
        """
        images, gender_labels, age_labels = batch
        gender_preds, age_preds = self(images)
        total_loss, gender_loss, age_loss = self._calculate_losses(
            gender_preds, age_preds, gender_labels, age_labels
        )

        self.log('train_loss', total_loss, on_step=True, on_epoch=True, prog_bar=True)
        self.training_step_outputs.append(
            self._detached(total_loss, gender_loss, age_loss)
        )
        self._log_to_trackio({
            'train/loss_total': total_loss.item(),
            'train/loss_gender': gender_loss.item(),
            'train/loss_age': age_loss.item(),
            'step': self.global_step,
        })
        return total_loss

    def on_train_epoch_end(self):
        """Log the epoch-mean training losses to TrackIO and reset the buffer."""
        if self.training_step_outputs:
            means = self._epoch_means(self.training_step_outputs)
            self._log_to_trackio({
                'train/epoch_loss_total': means['loss_total'],
                'train/epoch_loss_gender': means['loss_gender'],
                'train/epoch_loss_age': means['loss_age'],
                'epoch': self.current_epoch,
            })
        self.training_step_outputs.clear()

    def validation_step(self, batch, batch_idx):
        """Compute the losses for one validation batch and record them."""
        images, gender_labels, age_labels = batch
        gender_preds, age_preds = self(images)
        total_loss, gender_loss, age_loss = self._calculate_losses(
            gender_preds, age_preds, gender_labels, age_labels
        )

        # sync_dist averages the epoch value across processes, so a callback
        # monitoring 'val_loss' under a distributed strategy sees the loss
        # over the whole validation set rather than one process's shard.
        self.log('val_loss', total_loss, on_epoch=True, prog_bar=True, sync_dist=True)
        self.validation_step_outputs.append(
            self._detached(total_loss, gender_loss, age_loss)
        )

    def on_validation_epoch_end(self):
        """Log the epoch-mean validation losses to TrackIO and reset the buffer."""
        if self.validation_step_outputs:
            means = self._epoch_means(self.validation_step_outputs)
            self._log_to_trackio({
                'val/loss_total': means['loss_total'],
                'val/loss_gender': means['loss_gender'],
                'val/loss_age': means['loss_age'],
                'epoch': self.current_epoch,
            })
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters at ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ScratchCNNModel(AbstractFaceModel):
    """Four-block CNN trained from scratch with separate gender and age heads.

    Each block is Conv3x3 -> BatchNorm -> ReLU -> 2x2 max-pool, with channel
    widths 32, 64, 128 and 256. The flattened features feed two independent
    single-output linear heads.

    Args:
        learning_rate: Learning rate passed to the base class.
        age_loss_weight: Age-loss multiplier passed to the base class.
    """

    def __init__(self, learning_rate, age_loss_weight):
        super().__init__(learning_rate, age_loss_weight)

        self.feature_extractor = nn.Sequential(
            self._conv_block(3, 32),
            self._conv_block(32, 64),
            self._conv_block(64, 128),
            self._conv_block(128, 256),
        )

        flat_size = self._probe_flat_size(settings.INPUT_IMAGE_SIZE)
        self.gender_head = nn.Linear(flat_size, 1)
        self.age_head = nn.Linear(flat_size, 1)

    @staticmethod
    def _conv_block(in_f, out_f):
        """Return one Conv-BatchNorm-ReLU-MaxPool stage (halves H and W)."""
        return nn.Sequential(
            # bias=False: the following BatchNorm supplies its own shift.
            nn.Conv2d(in_f, out_f, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_f),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )

    def _probe_flat_size(self, image_size):
        """Return the flattened feature width for a square RGB input.

        The width is measured with a dummy forward pass. It runs in eval mode
        under ``no_grad`` so the random probe neither updates the BatchNorm
        running statistics nor builds an autograd graph.
        """
        probe = torch.randn(1, 3, image_size, image_size)
        self.feature_extractor.eval()
        try:
            with torch.no_grad():
                flat_size = self.feature_extractor(probe).flatten(1).size(1)
        finally:
            # Restore the default training mode of a freshly built module.
            self.feature_extractor.train()
        return flat_size

    def forward(self, x):
        """Return ``(gender_logits, age_preds)``, one value per sample each."""
        features = torch.flatten(self.feature_extractor(x), 1)
        return self.gender_head(features), self.age_head(features)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FineTunedResNetModel(AbstractFaceModel):
    """ResNet-18 backbone (torchvision DEFAULT weights) with two linear heads.

    The network's final ``fc`` layer is discarded; its input width sizes the
    replacement single-output gender and age heads.

    Args:
        learning_rate: Learning rate passed to the base class.
        age_loss_weight: Age-loss multiplier passed to the base class.
    """

    def __init__(self, learning_rate, age_loss_weight):
        super().__init__(learning_rate, age_loss_weight)

        pretrained = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        head_width = pretrained.fc.in_features

        # Keep every child module except the last one (the fc classifier).
        trunk_layers = list(pretrained.children())
        trunk_layers.pop()
        self.backbone = nn.Sequential(*trunk_layers)

        self.gender_head = nn.Linear(head_width, 1)
        self.age_head = nn.Linear(head_width, 1)

    def forward(self, x):
        """Return ``(gender_logits, age_preds)`` for a batch of images."""
        pooled = self.backbone(x)
        flat = torch.flatten(pooled, 1)
        gender_out = self.gender_head(flat)
        age_out = self.age_head(flat)
        return gender_out, age_out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PipelineRunner:
    """Trains the scratch CNN and the fine-tuned ResNet one after the other.

    Each model gets its own TrackIO run, its own best-``val_loss`` checkpoint
    under ``CHECKPOINT_DIR`` and a fresh ``pl.Trainer``.

    Args:
        cfg: Pipeline settings used for the data module, the trainer and the
            TrackIO run configuration.
    """

    # Where ModelCheckpoint writes the best checkpoint of each model.
    CHECKPOINT_DIR = '/kaggle/working/'

    def __init__(self, cfg: PipelineSettings):
        self.cfg = cfg
        self.data_module = FaceDataModule(cfg)
        self._setup_trackio()

    def _setup_trackio(self):
        """Export the Kaggle secret ``HUGGINGFACE_TOKEN`` as ``HF_TOKEN``.

        Best effort: a failure is printed and the pipeline continues.
        """
        try:
            secrets_client = UserSecretsClient()
            hf_token = secrets_client.get_secret("HUGGINGFACE_TOKEN")
            os.environ["HF_TOKEN"] = hf_token
            print("✅ TrackIO auth configured")
        except Exception as e:
            print(f"⚠️ TrackIO auth failed: {e}")

    def _start_trackio_run(self, model_name, run_name):
        """Open a TrackIO run for ``model_name``; report but tolerate failure."""
        try:
            trackio.init(
                space_id="muhammad-bilal1/dlgenai-nppe",
                project="25-t3-nppe1",
                group=run_name,
                config={
                    "lr": self.cfg.LEARNING_RATE,
                    "epochs": self.cfg.NUM_EPOCHS,
                    "batch_size": self.cfg.BATCH_SIZE,
                    "model": model_name,
                    "image_size": self.cfg.INPUT_IMAGE_SIZE,
                    "age_weight": self.cfg.AGE_LOSS_WEIGHT,
                },
            )
            print(f"✅ TrackIO initialized: {run_name}")
        except Exception as e:
            print(f"⚠️ TrackIO init failed: {e}")

    def _finish_trackio_run(self, trainer):
        """Log the final validation loss (if any) and close the TrackIO run.

        Logging and finishing are guarded separately so that a failed log
        cannot leave the run open.
        """
        try:
            final_val = trainer.callback_metrics.get('val_loss')
            # Skip the metric when validation never produced it, rather than
            # recording a misleading 0.0.
            if final_val is not None:
                trackio.log({"final_val_loss": final_val.item()})
        except Exception as e:
            print(f"⚠️ TrackIO final log failed: {e}")

        try:
            trackio.finish()
            print("✅ TrackIO run finished")
        except Exception as e:
            print(f"⚠️ TrackIO finish failed: {e}")

    def _train_model(self, model, model_name, run_name):
        """Fit ``model`` and checkpoint the epoch with the lowest ``val_loss``.

        Args:
            model: LightningModule to train.
            model_name: Short name used in the checkpoint filename and the
                TrackIO config.
            run_name: Name passed to TrackIO as the run group.

        Raises:
            Whatever ``trainer.fit`` raises; the TrackIO run is closed and
            memory is released before the exception propagates.
        """
        print(f"\n{'='*70}\nTraining: {model_name}\n{'='*70}")
        self._start_trackio_run(model_name, run_name)

        checkpoint_cb = ModelCheckpoint(
            monitor='val_loss',
            dirpath=self.CHECKPOINT_DIR,
            filename=f'{model_name}-best-model',
            save_top_k=1,
            mode='min',
        )
        trainer = pl.Trainer(
            max_epochs=self.cfg.NUM_EPOCHS,
            accelerator='gpu',
            devices='auto',
            strategy="ddp_notebook",
            callbacks=[checkpoint_cb],
            log_every_n_steps=10,
        )

        try:
            trainer.fit(model, self.data_module)
            print(f"✅ Checkpoint: {checkpoint_cb.best_model_path}")
        finally:
            # Runs on failure too, so an aborted fit does not leave a
            # dangling TrackIO run or hold on to the trainer's memory.
            self._finish_trackio_run(trainer)
            del model, trainer, checkpoint_cb
            gc.collect()
            torch.cuda.empty_cache()

    def execute(self):
        """Train both models in sequence and print where the results live."""
        print("\nTRAINING PIPELINE STARTED\n")

        runs = (
            (ScratchCNNModel, "scratch", "scratch-cnn-run"),
            (FineTunedResNetModel, "finetuned", "resnet-finetuned-run"),
        )
        for model_cls, model_name, run_name in runs:
            model = model_cls(self.cfg.LEARNING_RATE, self.cfg.AGE_LOSS_WEIGHT)
            try:
                self._train_model(model, model_name, run_name)
            finally:
                # The `del model` inside _train_model only removes that
                # function's local name; drop this reference as well so the
                # finished model is not kept alive while the next one trains.
                del model
                gc.collect()
                torch.cuda.empty_cache()

        print("\nTRAINING COMPLETE!")
        print(f"Checkpoints: {self.CHECKPOINT_DIR}")
        print("TrackIO Dashboard: https://huggingface.co/spaces/muhammad-bilal1/dlgenai-nppe")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the runner from the module-level settings
    # and run the whole training pipeline.
    pipeline = PipelineRunner(settings)
    pipeline.execute()
|
|
|
|
|
|