ho22joshua committed on
Commit
a10ecc5
·
1 Parent(s): f95cb56

working physicsnemo training script

Browse files
physicsnemo/Dataset.py CHANGED
@@ -42,7 +42,7 @@ def make_graph(node_features: np.array, dtype=torch.float32):
42
  dphi = phi[src] - phi[dst]
43
  dphi = torch.remainder(dphi + np.pi, 2 * np.pi) - np.pi
44
  dR = torch.sqrt(deta ** 2 + dphi ** 2)
45
- edge_features = torch.stack([deta, dphi, dR], dim=1)
46
  g.edata['features'] = edge_features
47
 
48
  g.globals = torch.tensor([num_nodes], dtype=dtype)
@@ -102,7 +102,7 @@ def process_chunk(args):
102
  node_features = np.empty((0, len(features)))
103
  graphs.append(make_graph(node_features, dtype=dtype))
104
 
105
- labels = torch.full((len(graphs),), label, dtype=torch.long)
106
  dgl.save_graphs(f"{save_path}/{name}_{chunk_id:02d}.bin", graphs, {'label': labels})
107
  print(f"Saved {name} chunk {chunk_id:02d} to {save_path}/{name}_{chunk_id:03d}.bin")
108
  return
@@ -151,8 +151,7 @@ class Root_Graph:
151
  num_entries = tree.num_entries
152
 
153
  print(f"Getting branches: {branches}")
154
- graphs = []
155
-
156
  step_size = math.ceil(num_entries / self.chunks)
157
 
158
  # Prepare chunk arguments for each chunk
@@ -241,6 +240,8 @@ class Root_Graph:
241
  val_labels = val_label_dict['label']
242
  test_labels = test_label_dict['label']
243
 
 
 
244
  return train_graphs, train_labels, val_graphs, val_labels, test_graphs, test_labels
245
 
246
  class GraphDataset(Dataset):
@@ -284,8 +285,9 @@ def get_dataset(cfg: DictConfig):
284
 
285
  batch_size = cfg.root_graph.batch_size
286
 
287
- train_loader = GraphDataLoader(train_dataset, {'batch_size' : batch_size, 'shuffle' : True})
288
- val_loader = GraphDataLoader(val_dataset, {'batch_size' : batch_size, 'shuffle' : False})
289
- test_loader = GraphDataLoader(test_dataset, {'batch_size' : batch_size, 'shuffle' : False})
290
 
 
291
  return train_loader, val_loader, test_loader
 
42
  dphi = phi[src] - phi[dst]
43
  dphi = torch.remainder(dphi + np.pi, 2 * np.pi) - np.pi
44
  dR = torch.sqrt(deta ** 2 + dphi ** 2)
45
+ edge_features = torch.stack([dR, deta, dphi], dim=1)
46
  g.edata['features'] = edge_features
47
 
48
  g.globals = torch.tensor([num_nodes], dtype=dtype)
 
102
  node_features = np.empty((0, len(features)))
103
  graphs.append(make_graph(node_features, dtype=dtype))
104
 
105
+ labels = torch.full((len(graphs),), label, dtype=dtype)
106
  dgl.save_graphs(f"{save_path}/{name}_{chunk_id:02d}.bin", graphs, {'label': labels})
107
  print(f"Saved {name} chunk {chunk_id:02d} to {save_path}/{name}_{chunk_id:03d}.bin")
108
  return
 
151
  num_entries = tree.num_entries
152
 
153
  print(f"Getting branches: {branches}")
154
+
 
155
  step_size = math.ceil(num_entries / self.chunks)
156
 
157
  # Prepare chunk arguments for each chunk
 
240
  val_labels = val_label_dict['label']
241
  test_labels = test_label_dict['label']
242
 
243
+ print(f"successfully loaded {self.name}")
244
+
245
  return train_graphs, train_labels, val_graphs, val_labels, test_graphs, test_labels
246
 
247
  class GraphDataset(Dataset):
 
285
 
286
  batch_size = cfg.root_graph.batch_size
287
 
288
+ train_loader = GraphDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
289
+ val_loader = GraphDataLoader(val_dataset, batch_size=batch_size, shuffle=False)
290
+ test_loader = GraphDataLoader(test_dataset, batch_size=batch_size, shuffle=False)
291
 
292
+ print("all data loaded successfully")
293
  return train_loader, val_loader, test_loader
physicsnemo/MeshGraphNet.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
import dgl

# Import the PhysicsNemo MeshGraphNet model
from physicsnemo.models.meshgraphnet import MeshGraphNet as PhysicsNemoMeshGraphNet


class MeshGraphNet(nn.Module):
    """Graph-level classification head on top of PhysicsNemo's MeshGraphNet.

    The base GNN produces per-node features; these are mean-pooled per graph
    and mapped through a linear layer to ``out_dim`` logits per graph.
    """

    def __init__(self, *args, out_dim=1, **kwargs):
        super().__init__()
        # Initialize the PhysicsNemo MeshGraphNet
        self.base_gnn = PhysicsNemoMeshGraphNet(*args, **kwargs)
        # The base model's per-node output width is its `output_dim` argument
        # (3rd positional / keyword), NOT the decoder hidden width.  The old
        # lookup of `hidden_dim_node_decoder` only worked when both happened
        # to be equal; fall back to it only when `output_dim` is unknown.
        if len(args) >= 3:
            node_output_dim = args[2]
        else:
            node_output_dim = kwargs.get(
                "output_dim", kwargs.get("hidden_dim_node_decoder", 64)
            )
        self.mlp = nn.Linear(node_output_dim, out_dim)

    def forward(self, node_feats, edge_feats, batched_graph):
        """
        Args:
            node_feats: [total_num_nodes, node_feat_dim]
            edge_feats: [total_num_edges, edge_feat_dim]
            batched_graph: DGLGraph, batched graphs
        Returns:
            graph_pred: [num_graphs, out_dim]
        """
        node_pred = self.base_gnn(node_feats, edge_feats, batched_graph)
        # Stash node outputs on the graph so DGL's readout can pool them.
        batched_graph.ndata['h'] = node_pred
        graph_feat = dgl.readout_nodes(batched_graph, 'h', op='mean')  # [num_graphs, node_output_dim]
        graph_pred = self.mlp(graph_feat)  # [num_graphs, out_dim]
        return graph_pred
physicsnemo/config.yaml CHANGED
@@ -20,16 +20,8 @@ scheduler:
20
  lr_decay: 1.E-3
21
 
22
  training:
23
- batch_size: 100
24
  epochs: 100
25
- geometries: "healthy"
26
- stride: 5
27
- rate_noise: 100
28
- train_test_split: 0.9
29
- loss_weight_1st_timestep: 1
30
- loss_weight_other_timesteps: 0.5
31
- loss_weight_boundary_nodes: 100
32
-
33
  checkpoints:
34
  ckpt_path: "checkpoints"
35
  ckpt_name: "model.pt"
@@ -47,6 +39,7 @@ architecture:
47
  hidden_dim_edge_encoder: 64
48
  hidden_dim_processor: 64
49
  hidden_dim_node_decoder: 64
 
50
 
51
  paths:
52
  data_dir: /global/cfs/projectdirs/trn007/lbl_atlas/data/stats_100K
 
20
  lr_decay: 1.E-3
21
 
22
  training:
 
23
  epochs: 100
24
+
 
 
 
 
 
 
 
25
  checkpoints:
26
  ckpt_path: "checkpoints"
27
  ckpt_name: "model.pt"
 
39
  hidden_dim_edge_encoder: 64
40
  hidden_dim_processor: 64
41
  hidden_dim_node_decoder: 64
42
+ out_dim: 1
43
 
44
  paths:
45
  data_dir: /global/cfs/projectdirs/trn007/lbl_atlas/data/stats_100K
physicsnemo/setup/Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Training image for the GNN4Colliders PhysicsNemo workflow.
# Based on NVIDIA's PhysicsNemo container; adds MPI, compilers, and
# the Python tooling the training/analysis scripts need.
FROM nvcr.io/nvidia/physicsnemo/physicsnemo:25.06

WORKDIR /global/cfs/projectdirs/atlas/joshua/GNN4Colliders

LABEL maintainer.name="Joshua Ho"
LABEL maintainer.email="ho22joshua@berkeley.edu"

ENV LANG=C.UTF-8

# Install system dependencies: vim, OpenMPI, and build tools.
# apt lists are removed in the same layer to keep the image small.
RUN apt-get update -qq \
    && apt-get install -y --no-install-recommends \
    wget lsb-release gnupg software-properties-common \
    vim \
    g++-11 gcc-11 libstdc++-11-dev \
    openmpi-bin openmpi-common libopenmpi-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages: mpi4py (MPI bindings), jupyter (notebooks),
# and uproot (reading ROOT files).
RUN pip install --no-cache-dir mpi4py jupyter uproot

# (Optional) Expose Jupyter port
EXPOSE 8888
physicsnemo/setup/build_image.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env bash
# Build and migrate the joshuaho/nemo container image with podman-hpc.
# Usage: ./build_image.sh <tag>
set -euo pipefail  # stop immediately if the build fails; reject a missing tag

tag=$1
echo "$tag"
podman-hpc build -t "joshuaho/nemo:$tag" --platform linux/amd64 .
podman-hpc migrate "joshuaho/nemo:$tag"
physicsnemo/train.py CHANGED
@@ -16,19 +16,212 @@ from physicsnemo.distributed.manager import DistributedManager
16
  from Dataset import get_dataset
17
  import json
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class MGNTrainer:
20
  def __init__(self, logger, cfg, dist):
21
  # set device
22
  self.device = dist.device
23
  logger.info(f"Using {self.device} device")
24
 
 
 
25
  norm_type = {"features": "normal", "labels": "normal"}
26
 
27
- self.train_loader, self.val_loader, self.test_loader = get_dataset(cfg)
28
- print(f"train: {self.train_loader}")
29
- print(f"val: {self.val_loader}")
30
- print(f"test: {self.test_loader}")
 
 
 
 
 
 
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  @hydra.main(version_base=None, config_path=".", config_name="config")
34
  def do_training(cfg: DictConfig):
@@ -55,11 +248,19 @@ def do_training(cfg: DictConfig):
55
  start = time.time()
56
  logger.info("Training started...")
57
  for epoch in range(trainer.epoch_init, cfg.training.epochs):
58
- for graph in trainer.dataloader:
59
- loss = trainer.train(graph)
 
 
 
 
 
 
 
 
60
 
61
  logger.info(
62
- f"epoch: {epoch}, loss: {loss:10.3e}, time per epoch: {(time.time()-start):10.3e}"
63
  )
64
 
65
  # save checkpoint
 
16
  from Dataset import get_dataset
17
  import json
18
 
19
+ from sklearn.metrics import roc_auc_score
20
+
21
+ import MeshGraphNet
22
+
23
+ import torch.nn.functional as F
24
+
25
def weighted_bce(input, target, device=None, weights=None):
    """
    Weighted, per-label-normalized binary cross entropy with logits.

    The element-wise BCE loss is computed once, then weight-averaged
    separately within each distinct label value found in ``target``.
    The returned scalar is the plain mean of those per-label averages,
    so every class contributes equally regardless of its sample count.

    Args:
        input (Tensor): Predicted logits of shape (N, ...).
        target (Tensor): Ground truth labels of shape (N, ...), with
            discrete label values. A trailing singleton dimension on
            either tensor (but not both) is squeezed away.
        device (torch.device or None): Device to move tensors to (optional).
        weights (Tensor or None): Optional per-sample weights, same shape
            as input/target; defaults to all-ones.

    Returns:
        Tensor: Scalar tensor representing the normalized weighted BCE loss.
    """
    # Reconcile a trailing singleton dimension on exactly one side,
    # e.g. model logits of shape (N, 1) against labels of shape (N,).
    if input.shape != target.shape:
        if input.shape[-1] == 1 and input.shape[:-1] == target.shape:
            input = input.squeeze(-1)
        elif target.shape[-1] == 1 and target.shape[:-1] == input.shape:
            target = target.squeeze(-1)

    weights = torch.ones_like(target) if weights is None else weights

    if device is not None:
        input, target, weights = (t.to(device) for t in (input, target, weights))

    # Per-element loss; the reduction is done by hand, per label, below.
    elementwise = F.binary_cross_entropy_with_logits(input, target, reduction='none')

    per_label = []
    for lbl in torch.unique(target):
        mask = (target == lbl)
        lbl_weights = weights[mask]
        total_weight = lbl_weights.sum()
        # Skip labels whose total weight is zero to avoid dividing by zero.
        if total_weight > 0:
            per_label.append((lbl_weights * elementwise[mask]).sum() / total_weight)

    if not per_label:
        return torch.tensor(0.0, device=input.device)
    return torch.stack(per_label).mean()
76
+
77
class MGNTrainer:
    """Trainer for graph-level classification with a MeshGraphNet.

    Builds the model from the Hydra config, restores the latest checkpoint
    if one exists, and exposes `train` (one optimization step per batched
    graph) and `eval` (loss + ROC AUC over the validation loader).
    """

    def __init__(self, logger, cfg, dist):
        # set device from the distributed manager
        self.device = dist.device
        logger.info(f"Using {self.device} device")

        params = {}

        self.dataloader, self.valloader, self.testloader = get_dataset(cfg)

        # Resolve the training dtype from a config string such as "torch.float32".
        dtype_str = getattr(cfg.root_graph, "type", "torch.float32")
        if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
            self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
        else:
            self.dtype = torch.float32

        nodes_features = cfg.root_graph.features
        # Edge features produced in Dataset.make_graph, in this order.
        edges_features = ["dR", "deta", "dphi"]
        global_features = ["num_nodes"]

        params["infeat_nodes"] = len(nodes_features)
        params["infeat_edges"] = len(edges_features)
        params["infeat_globals"] = len(global_features)
        # Width of the base GNN's per-node output (input to the readout MLP).
        # NOTE(review): taken from hidden_dim_node_encoder — presumably the
        # encoder/decoder widths are meant to match; confirm against config.
        params["out_dim"] = cfg.architecture.hidden_dim_node_encoder
        params["node_features"] = list(nodes_features)
        params["edges_features"] = edges_features
        params["global_features"] = global_features

        self.model = MeshGraphNet.MeshGraphNet(
            params["infeat_nodes"],
            params["infeat_edges"],
            params["out_dim"],
            # Fix: wire the graph-level output width from the config
            # (cfg.architecture.out_dim); falls back to the previous
            # hard default of 1 when the key is absent.
            out_dim=getattr(cfg.architecture, "out_dim", 1),
            processor_size=cfg.architecture.processor_size,
            hidden_dim_node_encoder=cfg.architecture.hidden_dim_node_encoder,
            hidden_dim_edge_encoder=cfg.architecture.hidden_dim_edge_encoder,
            hidden_dim_processor=cfg.architecture.hidden_dim_processor,
            hidden_dim_node_decoder=cfg.architecture.hidden_dim_node_decoder,
        )
        self.model = self.model.to(dtype=self.dtype, device=self.device)

        if cfg.performance.jit:
            self.model = torch.jit.script(self.model).to(self.device)
        else:
            self.model = self.model.to(self.device)

        # instantiate optimizer, LR scheduler, and AMP gradient scaler
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=cfg.scheduler.lr)
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer,
            T_max=cfg.training.epochs,
            eta_min=cfg.scheduler.lr * cfg.scheduler.lr_decay,
        )
        self.scaler = GradScaler(self.device)

        # resume from the latest checkpoint, if any; returns the next epoch
        self.epoch_init = load_checkpoint(
            os.path.join(cfg.checkpoints.ckpt_path, cfg.checkpoints.ckpt_name),
            models=self.model,
            optimizer=self.optimizer,
            scheduler=self.scheduler,
            scaler=self.scaler,
            device=self.device,
        )

        self.params = params
        self.cfg = cfg

    def backward(self, loss):
        """
        Perform the backward pass and optimizer step.

        Uses the AMP gradient scaler when cfg.performance.amp is enabled.

        Arguments:
            loss: loss value.
        """
        if self.cfg.performance.amp:
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()
        else:
            loss.backward()
            self.optimizer.step()

    def train(self, graph, label):
        """
        Perform one training iteration over one batched graph.

        Arguments:
            graph: the batched DGL graph.
            label: the per-graph target labels.

        Returns:
            loss: loss value (scalar Tensor).
        """
        graph = graph.to(self.device)
        self.optimizer.zero_grad()
        pred = self.model(graph.ndata["features"], graph.edata["features"], graph)
        # weighted_bce moves `label` to self.device internally.
        loss = weighted_bce(pred, label, device=self.device)
        self.backward(loss)
        return loss

    @torch.no_grad()
    def eval(self):
        """
        Evaluate the model over the entire validation loader.

        Returns:
            loss (Tensor): normalized weighted BCE over all validation samples.
            auc (float): ROC AUC of the sigmoid probabilities, or NaN when
                fewer than two classes are present.
        """
        # Fix: switch dropout/normalization layers to inference behavior.
        # (The training loop re-enables train mode before each step.)
        self.model.eval()

        predictions = []
        labels = []
        for graph, label in self.valloader:
            graph = graph.to(self.device)
            pred = self.model(graph.ndata["features"], graph.edata["features"], graph)
            predictions.append(pred)
            labels.append(label)

        predictions = torch.cat(predictions, dim=0)
        labels = torch.cat(labels, dim=0)

        loss = weighted_bce(predictions, labels, device=self.device)

        # Convert logits to probabilities for the AUC computation.
        prob = torch.sigmoid(predictions)

        # Flatten to 1D float arrays for scikit-learn.
        prob_flat = prob.detach().to(torch.float32).cpu().numpy().flatten()
        labels_flat = labels.detach().to(torch.float32).cpu().numpy().flatten()

        try:
            auc = roc_auc_score(labels_flat, prob_flat)
        except ValueError:
            auc = float('nan')  # Not enough classes present for AUC
        return loss, auc
225
 
226
  @hydra.main(version_base=None, config_path=".", config_name="config")
227
  def do_training(cfg: DictConfig):
 
248
  start = time.time()
249
  logger.info("Training started...")
250
  for epoch in range(trainer.epoch_init, cfg.training.epochs):
251
+
252
+ # Training
253
+ train_loss = []
254
+ for graph, label in trainer.dataloader:
255
+ trainer.model.train()
256
+ train_loss.append(trainer.train(graph, label))
257
+
258
+ val_loss, val_auc = trainer.eval()
259
+
260
+ train_loss = torch.mean(torch.stack(train_loss)).item()
261
 
262
  logger.info(
263
+ f"epoch: {epoch}, loss: {train_loss:10.3e}, val_loss: {val_loss:10.3e}, val_auc = {val_auc:10.3e}, time per epoch: {(time.time()-start):10.3e}"
264
  )
265
 
266
  # save checkpoint