ho22joshua commited on
Commit
5ceead6
·
1 Parent(s): 4a5b33b

working physicsnemo

Browse files
physicsnemo/{config.yaml → configs/config.yaml} RENAMED
@@ -16,7 +16,7 @@
16
  random_seed: 2
17
 
18
  scheduler:
19
- lr: 1.E-4
20
  lr_decay: 1.E-3
21
 
22
  training:
@@ -24,21 +24,18 @@ training:
24
 
25
  checkpoints:
26
  ckpt_path: "checkpoints"
27
- ckpt_name: "model.pt"
28
 
29
  performance:
30
  amp: False
31
  jit: False
32
 
33
- testing:
34
- graph: "s0090_0001.21.0.grph"
35
-
36
  architecture:
37
- processor_size: 5
38
- hidden_dim_node_encoder: 64
39
- hidden_dim_edge_encoder: 64
40
- hidden_dim_processor: 64
41
- hidden_dim_node_decoder: 64
42
  out_dim: 1
43
 
44
  paths:
@@ -62,6 +59,6 @@ root_dataset:
62
  globals: []
63
  weights: ""
64
  tracking: []
65
- chunks: 10
66
  batch_size: 8192
67
  train_val_test_split: [0.75, 0.24, 0.01]
 
16
  random_seed: 2
17
 
18
  scheduler:
19
+ lr: 1.E-3
20
  lr_decay: 1.E-3
21
 
22
  training:
 
24
 
25
  checkpoints:
26
  ckpt_path: "checkpoints"
27
+ ckpt_name: "config"
28
 
29
  performance:
30
  amp: False
31
  jit: False
32
 
 
 
 
33
  architecture:
34
+ processor_size: 8
35
+ hidden_dim_node_encoder: 128
36
+ hidden_dim_edge_encoder: 128
37
+ hidden_dim_processor: 128
38
+ hidden_dim_node_decoder: 128
39
  out_dim: 1
40
 
41
  paths:
 
59
  globals: []
60
  weights: ""
61
  tracking: []
62
+ step_size: 8192
63
  batch_size: 8192
64
  train_val_test_split: [0.75, 0.24, 0.01]
physicsnemo/{config_stats_all.yaml → configs/config_stats_all.yaml} RENAMED
@@ -24,15 +24,12 @@ training:
24
 
25
  checkpoints:
26
  ckpt_path: "checkpoints"
27
- ckpt_name: "model.pt"
28
 
29
  performance:
30
  amp: False
31
  jit: False
32
 
33
- testing:
34
- graph: "s0090_0001.21.0.grph"
35
-
36
  architecture:
37
  processor_size: 5
38
  hidden_dim_node_encoder: 64
@@ -62,6 +59,7 @@ root_dataset:
62
  globals: []
63
  weights: ""
64
  tracking: []
65
- step_size: 1024
66
  batch_size: 8192
67
- train_val_test_split: [0.75, 0.24, 0.01]
 
 
24
 
25
  checkpoints:
26
  ckpt_path: "checkpoints"
27
+ ckpt_name: "config_stats_all"
28
 
29
  performance:
30
  amp: False
31
  jit: False
32
 
 
 
 
33
  architecture:
34
  processor_size: 5
35
  hidden_dim_node_encoder: 64
 
59
  globals: []
60
  weights: ""
61
  tracking: []
62
+ step_size: 81920
63
  batch_size: 8192
64
+ train_val_test_split: [0.75, 0.24, 0.01]
65
+ prebatch: True
physicsnemo/configs/tHjb_CP_0_vs_45.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore_header_test
2
+ # Copyright 2023 Stanford University
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ random_seed: 2
17
+
18
+ scheduler:
19
+ lr: 1.E-3
20
+ lr_decay: 1.E-3
21
+
22
+ training:
23
+ epochs: 100
24
+
25
+ checkpoints:
26
+ ckpt_path: "checkpoints"
27
+ ckpt_name: "config"
28
+
29
+ performance:
30
+ amp: False
31
+ jit: False
32
+
33
+ architecture:
34
+ processor_size: 8
35
+ hidden_dim_node_encoder: 128
36
+ hidden_dim_edge_encoder: 128
37
+ hidden_dim_processor: 128
38
+ hidden_dim_node_decoder: 128
39
+ global_emb_dim: 128
40
+ out_dim: 1
41
+
42
+ paths:
43
+ data_dir: /global/cfs/projectdirs/atlas/joshua/ttHCP/ntuples/v02/preselection/merged_fixed/train/
44
+ save_dir: /pscratch/sd/j/joshuaho/physicsnemo/ttHCP/graphs/tHjb_CP_0_vs_45/
45
+ training_dir: ./training_tHjb_CP_0_vs_45/
46
+
47
+ datasets:
48
+ - name: tHjb_cp_0_had
49
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_0_AF3_had_scaled.root
50
+ label: 0
51
+ - name: tHjb_cp_0_lep
52
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_0_AF3_lep_scaled.root
53
+ label: 0
54
+ - name: tHjb_cp_45_had
55
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_45_AF3_had_scaled.root
56
+ label: 1
57
+ - name: tHjb_cp_45_lep
58
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_45_AF3_lep_scaled.root
59
+ label: 1
60
+
61
+ root_dataset:
62
+ ttree: output
63
+ dtype: torch.bfloat16
64
+ features:
65
+ # pt, eta, phi, energy, btag, charge, node_type
66
+ jet: [m_jet_pt, m_jet_eta, m_jet_phi, CALC_E, m_jet_PCbtag, 0, 0]
67
+ electron: [m_el_pt, m_el_eta, m_el_phi, CALC_E, 0, m_el_charge, 1]
68
+ muon: [m_mu_pt, m_mu_eta, m_mu_phi, CALC_E, 0, m_mu_charge, 2]
69
+ photon: [ph_pt_myy, ph_eta, ph_phi, CALC_E, 0, 0, 3]
70
+ met: [m_met, 0, m_met_phi, CALC_E, 0, 0, 4]
71
+ globals: [NUM_NODES]
72
+ weights: m_weightXlumi
73
+ tracking: []
74
+ step_size: 16384
75
+ batch_size: 16384
76
+ train_val_test_split: [0.5, 0.25, 0.25]
77
+ prebatch:
78
+ enabled: True
79
+ chunk_size: 512
physicsnemo/configs/tHjb_CP_0_vs_90.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore_header_test
2
+ # Copyright 2023 Stanford University
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ random_seed: 2
17
+
18
+ scheduler:
19
+ lr: 1.E-3
20
+ lr_decay: 1.E-3
21
+
22
+ training:
23
+ epochs: 100
24
+
25
+ checkpoints:
26
+ ckpt_path: "checkpoints"
27
+ ckpt_name: "tHjb_CP_0_vs_90"
28
+
29
+ performance:
30
+ amp: False
31
+ jit: False
32
+
33
+ architecture:
34
+ processor_size: 8
35
+ hidden_dim_node_encoder: 128
36
+ hidden_dim_edge_encoder: 128
37
+ hidden_dim_processor: 128
38
+ hidden_dim_node_decoder: 128
39
+ global_emb_dim: 128
40
+ out_dim: 1
41
+
42
+ paths:
43
+ data_dir: /global/cfs/projectdirs/atlas/joshua/ttHCP/ntuples/v02/preselection/merged_fixed/train/
44
+ save_dir: /pscratch/sd/j/joshuaho/physicsnemo/ttHCP/graphs/tHjb_CP_0_vs_90/
45
+ training_dir: ./tHjb_CP_0_vs_90/
46
+
47
+ datasets:
48
+ - name: tHjb_cp_0_had
49
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_0_AF3_had_scaled.root
50
+ label: 0
51
+ - name: tHjb_cp_0_lep
52
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_0_AF3_lep_scaled.root
53
+ label: 0
54
+ - name: tHjb_cp_90_had
55
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_90_AF3_had_scaled.root
56
+ label: 1
57
+ - name: tHjb_cp_90_lep
58
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_90_AF3_lep_scaled.root
59
+ label: 1
60
+
61
+ root_dataset:
62
+ ttree: output
63
+ dtype: torch.bfloat16
64
+ features:
65
+ # pt, eta, phi, energy, btag, charge, node_type
66
+ jet: [m_jet_pt, m_jet_eta, m_jet_phi, CALC_E, m_jet_PCbtag, 0, 0]
67
+ electron: [m_el_pt, m_el_eta, m_el_phi, CALC_E, 0, m_el_charge, 1]
68
+ muon: [m_mu_pt, m_mu_eta, m_mu_phi, CALC_E, 0, m_mu_charge, 2]
69
+ photon: [ph_pt_myy, ph_eta, ph_phi, CALC_E, 0, 0, 3]
70
+ met: [m_met, 0, m_met_phi, CALC_E, 0, 0, 4]
71
+ globals: [NUM_NODES]
72
+ weights: 1
73
+ tracking: []
74
+ step_size: 16384
75
+ batch_size: 16384
76
+ train_val_test_split: [0.5, 0.25, 0.25]
77
+ prebatch:
78
+ enabled: True
79
+ chunk_size: 512
physicsnemo/configs/tHjb_CP_0_vs_90_globals.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore_header_test
2
+ # Copyright 2023 Stanford University
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ random_seed: 2
17
+
18
+ scheduler:
19
+ lr: 1.E-3
20
+ lr_decay: 1.E-3
21
+
22
+ training:
23
+ epochs: 100
24
+
25
+ checkpoints:
26
+ ckpt_path: "checkpoints"
27
+ ckpt_name: "tHjb_CP_0_vs_90_globals"
28
+
29
+ performance:
30
+ amp: False
31
+ jit: False
32
+
33
+ architecture:
34
+ base_gnn:
35
+ input_dim_nodes: 7
36
+ input_dim_edges: 3
37
+ output_dim: 128
38
+ processor_size: 8
39
+ hidden_dim_node_encoder: 128
40
+ hidden_dim_edge_encoder: 128
41
+ hidden_dim_processor: 128
42
+ hidden_dim_node_decoder: 128
43
+ global_emb_dim: 128
44
+ global_feat_dim: 5
45
+ out_dim: 1
46
+
47
+ paths:
48
+ data_dir: /global/cfs/projectdirs/atlas/joshua/ttHCP/ntuples/v02/preselection/merged_fixed/train/
49
+ save_dir: /pscratch/sd/j/joshuaho/physicsnemo/ttHCP/graphs/tHjb_CP_0_vs_90_globals/
50
+ training_dir: ./tHjb_CP_0_vs_90_globals/
51
+
52
+ datasets:
53
+ - name: tHjb_cp_0_had
54
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_0_AF3_had_scaled.root
55
+ label: 0
56
+ - name: tHjb_cp_0_lep
57
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_0_AF3_lep_scaled.root
58
+ label: 0
59
+ - name: tHjb_cp_90_had
60
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_90_AF3_had_scaled.root
61
+ label: 1
62
+ - name: tHjb_cp_90_lep
63
+ load_path: ${paths.data_dir}/merged_aMCPy8_tHjb125_CP_90_AF3_lep_scaled.root
64
+ label: 1
65
+
66
+ root_dataset:
67
+ ttree: output
68
+ dtype: torch.bfloat16
69
+ features:
70
+ # pt, eta, phi, energy, btag, charge, node_type
71
+ jet: [m_jet_pt, m_jet_eta, m_jet_phi, CALC_E, m_jet_PCbtag, 0, 0]
72
+ electron: [m_el_pt, m_el_eta, m_el_phi, CALC_E, 0, m_el_charge, 1]
73
+ muon: [m_mu_pt, m_mu_eta, m_mu_phi, CALC_E, 0, m_mu_charge, 2]
74
+ photon: [ph_pt_myy, ph_eta, ph_phi, CALC_E, 0, 0, 3]
75
+ met: [m_met, 0, m_met_phi, CALC_E, 0, 0, 4]
76
+ globals: [NUM_NODES, eta_H, pt_H, eta_recotop1, pT_recotop1]
77
+ weights: 1
78
+ tracking: []
79
+ step_size: 16384
80
+ batch_size: 16384
81
+ train_val_test_split: [0.5, 0.25, 0.25]
82
+ prebatch:
83
+ enabled: True
84
+ chunk_size: 512
physicsnemo/{dataset.py → dataset/Dataset.py} RENAMED
@@ -3,25 +3,25 @@ import uproot
3
  import dgl
4
  import torch
5
  import numpy as np
6
- import awkward as ak
7
  from omegaconf import DictConfig
8
  from typing import List
9
  from concurrent.futures import ProcessPoolExecutor, as_completed
10
  from tqdm import tqdm
11
 
12
- import dataset_utils as utils
13
-
14
- from torch.utils.data import Dataset
15
 
16
  from dgl.dataloading import GraphDataLoader
17
 
18
- class RootDataset:
19
  def __init__(
20
  self,
21
  name: str,
22
  label: int,
23
  load_path: str,
24
  save_path: str,
 
25
  device: str,
26
  cfg: DictConfig
27
  ):
@@ -29,31 +29,38 @@ class RootDataset:
29
  self.label = label
30
  self.load_path = load_path
31
  self.save_path = save_path
 
32
  self.data = None
33
  self.device = device
34
 
35
  self.ttree = cfg.ttree
36
- self.particles = cfg.particles
37
  self.features = cfg.features
 
38
  self.globals = cfg.globals
 
39
  self.step_size = cfg.step_size
40
  self.batch_size = cfg.batch_size
41
 
 
 
42
  self.train_val_test_split = cfg.train_val_test_split
43
  assert np.sum(self.train_val_test_split) == 1, "train_val_test_split must sum to 1"
44
 
45
- dtype_str = getattr(cfg, "type", "torch.float32")
46
- if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
47
- self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
48
- else:
49
- self.dtype = torch.float32
50
  print(f"initializing dataset {name} with dtype {self.dtype}")
51
 
52
  def get_branches(self) -> List[str]:
53
- branches = [f"{p}_{f}" for p in self.particles for f in self.features]
54
- branches += self.globals
55
- branches = ["MET_met" if b == "MET_pt" else b for b in branches]
56
- return branches
 
 
 
 
 
 
 
 
57
 
58
  def process(self):
59
  branches = self.get_branches()
@@ -68,43 +75,46 @@ class RootDataset:
68
 
69
  with ProcessPoolExecutor(max_workers=num_cpus) as executor:
70
  futures = []
71
- for chunk_id, arrays in enumerate(
72
- tqdm(
73
- uproot.iterate(
74
  f"{self.load_path}:{self.ttree}",
75
  expressions=[b for b in branches if b in available_branches],
76
  step_size=self.step_size,
77
  library="ak"
78
  ),
79
- desc="loading root file",
80
- total=total_chunks,
81
- position=0,
82
- leave=False
83
- )
84
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- cfg = utils.ChunkConfig(
87
- name=self.name,
88
- label=self.label,
89
- chunk_id=chunk_id,
90
- batch_size=self.batch_size,
91
- arrays=arrays,
92
- particles=self.particles,
93
- features=self.features,
94
- branches=branches,
95
- dtype=self.dtype,
96
- save_path=self.save_path,
97
- )
98
-
99
- futures.append(executor.submit(utils.process_chunk, cfg))
100
-
101
-
102
- for future in as_completed(futures):
103
  try:
104
  future.result()
105
  except Exception as e:
106
  import traceback
107
- print("Exception in worker process:")
108
  traceback.print_exception(type(e), e, e.__traceback__)
109
  return
110
 
@@ -119,12 +129,18 @@ class RootDataset:
119
  self.process()
120
 
121
  graph_tuple_list = []
122
-
123
  for chunk_id, f in enumerate(chunk_files):
124
  if chunk_id < total_chunks - 1:
125
- n_graphs = self.step_size
 
 
 
126
  else:
127
- n_graphs = num_entries - self.step_size * (total_chunks - 1)
 
 
 
128
  graph_tuple_list.extend((f, idx) for idx in range(n_graphs))
129
 
130
  split = self.train_val_test_split
@@ -136,54 +152,91 @@ class RootDataset:
136
  val_tuples = graph_tuple_list[n_train:n_train + n_val]
137
  test_tuples = graph_tuple_list[n_train + n_val:]
138
  return train_tuples, val_tuples, test_tuples
 
 
 
 
 
 
139
 
140
- class GraphDataset(Dataset):
141
- def __init__(self, graphs, labels):
142
- self.graphs = graphs
143
- self.labels = labels
144
  def __len__(self):
145
- return len(self.graphs)
146
- def __getitem__(self, idx):
147
- return self.graphs[idx], self.labels[idx]
148
- def shuffle(self):
149
- # TODO: implement graph shuffling
150
- return self.graphs
151
 
152
- class GraphTupleDataset:
153
- def __init__(self, tuple_list):
154
- self.tuples = tuple_list
155
- def __len__(self):
156
- return len(self.tuples)
157
  def __getitem__(self, idx):
158
- filepath, graph_idx = self.tuples[idx]
159
- graphs, labels = utils.load_graphs(filepath)
160
- return graphs[graph_idx], labels[graph_idx]
161
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  def get_dataset(cfg: DictConfig, device):
163
 
164
  all_train = []
165
  all_val = []
166
  all_test = []
167
 
 
 
 
 
 
 
168
  for ds in cfg.datasets:
169
  name = ds['name']
170
  load_path = ds.get('load_path', f"{cfg.paths.data_dir}/{name}.root")
171
  save_path = ds.get('save_path', f"{cfg.paths.save_dir}/")
172
- datastet = RootDataset(name, ds.get('label'), load_path, save_path, device, cfg.root_dataset)
173
  train, val, test = datastet.load()
174
  all_train.extend(train)
175
  all_val.extend(val)
176
  all_test.extend(test)
177
 
178
- train_dataset = GraphTupleDataset(all_train)
179
- val_dataset = GraphTupleDataset(all_val)
180
- test_dataset = GraphTupleDataset(all_test)
 
 
181
 
182
- batch_size = cfg.root_dataset.batch_size
 
 
 
 
 
183
 
184
- train_loader = GraphDataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=2)
185
- val_loader = GraphDataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)
186
- test_loader = GraphDataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)
187
 
188
  print("all data loaded successfully")
189
  print(f"train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")
 
3
  import dgl
4
  import torch
5
  import numpy as np
 
6
  from omegaconf import DictConfig
7
  from typing import List
8
  from concurrent.futures import ProcessPoolExecutor, as_completed
9
  from tqdm import tqdm
10
 
11
+ from dataset import GraphBuilder
12
+ from dataset import Graphs
13
+ from dataset import Normalization
14
 
15
  from dgl.dataloading import GraphDataLoader
16
 
17
+ class Dataset:
18
  def __init__(
19
  self,
20
  name: str,
21
  label: int,
22
  load_path: str,
23
  save_path: str,
24
+ dtype: torch.dtype,
25
  device: str,
26
  cfg: DictConfig
27
  ):
 
29
  self.label = label
30
  self.load_path = load_path
31
  self.save_path = save_path
32
+ self.dtype = dtype
33
  self.data = None
34
  self.device = device
35
 
36
  self.ttree = cfg.ttree
 
37
  self.features = cfg.features
38
+ self.weights = cfg.weights
39
  self.globals = cfg.globals
40
+ self.tracking = cfg.tracking
41
  self.step_size = cfg.step_size
42
  self.batch_size = cfg.batch_size
43
 
44
+ self.prebatch = cfg.get('prebatch', {'enabled': False})
45
+
46
  self.train_val_test_split = cfg.train_val_test_split
47
  assert np.sum(self.train_val_test_split) == 1, "train_val_test_split must sum to 1"
48
 
 
 
 
 
 
49
  print(f"initializing dataset {name} with dtype {self.dtype}")
50
 
51
  def get_branches(self) -> List[str]:
52
+ node_branches = [
53
+ branches
54
+ for particle in self.features.values()
55
+ for branches in particle
56
+ if isinstance(branches, str) and (branches != "CALC_E" or branches != "NUM_NODES")
57
+ ]
58
+ global_branches = [x for x in self.globals if isinstance(x, str)]
59
+ weight_branch = [self.weights] if isinstance(self.weights, str) else []
60
+ tracking_branches = [x for x in self.tracking if isinstance(x, str)]
61
+ label_branch = [self.label] if isinstance(self.label, str) else []
62
+
63
+ return node_branches + global_branches + weight_branch + tracking_branches + label_branch
64
 
65
  def process(self):
66
  branches = self.get_branches()
 
75
 
76
  with ProcessPoolExecutor(max_workers=num_cpus) as executor:
77
  futures = []
78
+
79
+ with tqdm(
80
+ uproot.iterate(
81
  f"{self.load_path}:{self.ttree}",
82
  expressions=[b for b in branches if b in available_branches],
83
  step_size=self.step_size,
84
  library="ak"
85
  ),
86
+ desc="loading root file",
87
+ total=total_chunks,
88
+ position=0,
89
+ leave=True
90
+ ) as pbar:
91
+
92
+ for chunk_id, arrays in enumerate(pbar):
93
+
94
+ cfg = GraphBuilder.ChunkConfig(
95
+ name=self.name,
96
+ label=self.label,
97
+ chunk_id=chunk_id,
98
+ batch_size=self.batch_size,
99
+ arrays=arrays,
100
+ features=self.features,
101
+ globals=self.globals,
102
+ tracking=self.tracking,
103
+ weights=self.weights,
104
+ branches=branches,
105
+ dtype=self.dtype,
106
+ save_path=self.save_path,
107
+ prebatch = self.prebatch,
108
+ )
109
 
110
+ futures.append(executor.submit(GraphBuilder.process_chunk, cfg))
111
+
112
+ for idx, future in enumerate(as_completed(futures)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  try:
114
  future.result()
115
  except Exception as e:
116
  import traceback
117
+ print(f"exception in chunk: {idx}")
118
  traceback.print_exception(type(e), e, e.__traceback__)
119
  return
120
 
 
129
  self.process()
130
 
131
  graph_tuple_list = []
132
+
133
  for chunk_id, f in enumerate(chunk_files):
134
  if chunk_id < total_chunks - 1:
135
+ if (self.prebatch.enabled):
136
+ n_graphs = self.step_size // self.prebatch.chunk_size
137
+ else:
138
+ n_graphs = self.step_size
139
  else:
140
+ if (self.prebatch.enabled):
141
+ n_graphs = (num_entries - self.step_size * (total_chunks - 1)) // self.prebatch.chunk_size + 1
142
+ else:
143
+ n_graphs = num_entries - self.step_size * (total_chunks - 1)
144
  graph_tuple_list.extend((f, idx) for idx in range(n_graphs))
145
 
146
  split = self.train_val_test_split
 
152
  val_tuples = graph_tuple_list[n_train:n_train + n_val]
153
  test_tuples = graph_tuple_list[n_train + n_val:]
154
  return train_tuples, val_tuples, test_tuples
155
+
156
+ class GraphTupleDataset:
157
+ def __init__(self, tuple_list, stats):
158
+ self.tuple_list = tuple_list
159
+ self.stats = stats
160
+ self.cache = {}
161
 
 
 
 
 
162
  def __len__(self):
163
+ return len(self.tuple_list)
 
 
 
 
 
164
 
 
 
 
 
 
165
  def __getitem__(self, idx):
166
+ f, graph_idx = self.tuple_list[idx]
167
+ if f in self.cache:
168
+ g = self.cache[f]
169
+ else:
170
+ g = Graphs.load_graphs(f)
171
+ g.normalize(self.stats)
172
+ self.cache[f] = g
173
+ return g[graph_idx]
174
+
175
+ @staticmethod
176
+ def collate_fn(samples):
177
+ all_graphs = []
178
+ all_metadata = {}
179
+
180
+ # Initialize keys in all_metadata from the first sample
181
+ for k in samples[0][1]:
182
+ all_metadata[k] = []
183
+
184
+ for graph, metadata in samples:
185
+ all_graphs.append(graph)
186
+ for k, v in metadata.items():
187
+ all_metadata[k].append(v)
188
+
189
+ # Stack or concatenate metadata for each key
190
+ for k in all_metadata:
191
+ # If v is a tensor, stack or cat as appropriate
192
+ # Use torch.cat if v is already [N, ...] (e.g. labels, features)
193
+ # Use torch.stack if v is scalar or needs new dimension
194
+ try:
195
+ all_metadata[k] = torch.cat(all_metadata[k], dim=0)
196
+ except Exception:
197
+ all_metadata[k] = torch.stack(all_metadata[k], dim=0)
198
+
199
+ batched_graph = dgl.batch(all_graphs)
200
+ return batched_graph, all_metadata
201
+
202
  def get_dataset(cfg: DictConfig, device):
203
 
204
  all_train = []
205
  all_val = []
206
  all_test = []
207
 
208
+ dtype_str = getattr(cfg.root_dataset, "dtype", "torch.float32")
209
+ if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
210
+ dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
211
+ else:
212
+ dtype = torch.float32
213
+
214
  for ds in cfg.datasets:
215
  name = ds['name']
216
  load_path = ds.get('load_path', f"{cfg.paths.data_dir}/{name}.root")
217
  save_path = ds.get('save_path', f"{cfg.paths.save_dir}/")
218
+ datastet = Dataset(name, ds.get('label'), load_path, save_path, dtype, device, cfg.root_dataset)
219
  train, val, test = datastet.load()
220
  all_train.extend(train)
221
  all_val.extend(val)
222
  all_test.extend(test)
223
 
224
+ stats = Normalization.global_stats(f"{cfg.paths.save_dir}/stats/", dtype=dtype)
225
+
226
+ train_dataset = GraphTupleDataset(all_train, stats)
227
+ val_dataset = GraphTupleDataset(all_val, stats)
228
+ test_dataset = GraphTupleDataset(all_test, stats)
229
 
230
+ if (cfg.root_dataset.get('prebatch', False)):
231
+ batch_size = cfg.root_dataset.batch_size // cfg.root_dataset.prebatch.chunk_size
232
+ collate_fn = GraphTupleDataset.collate_fn
233
+ else:
234
+ batch_size = cfg.root_dataset.batch_size
235
+ collate_fn = None
236
 
237
+ train_loader = GraphDataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=5, drop_last=False, collate_fn=collate_fn)
238
+ val_loader = GraphDataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=5, drop_last=False, collate_fn=collate_fn)
239
+ test_loader = GraphDataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=0, drop_last=False, collate_fn=collate_fn)
240
 
241
  print("all data loaded successfully")
242
  print(f"train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")
physicsnemo/dataset/GraphBuilder.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dgl
2
+ import torch
3
+ import numpy as np
4
+ import awkward as ak
5
+ from dataclasses import dataclass
6
+ from typing import List, Any, Union
7
+
8
+ from dataset.Graphs import Graphs, save_graphs
9
+ from dataset import Normalization
10
+
11
+ @dataclass
12
+ class ChunkConfig:
13
+ name: str
14
+ label: Union[str, int]
15
+ chunk_id: int
16
+ batch_size: int
17
+ arrays: List[Any]
18
+ features: List[Any]
19
+ globals: List[Any]
20
+ weights: Union[str, float]
21
+ tracking: List[Any]
22
+ branches: List[Any]
23
+ dtype: torch.dtype
24
+ save_path: str
25
+ prebatch: dict
26
+
27
+ def process_chunk(cfg: ChunkConfig):
28
+ # Collect everything as lists first
29
+ graph_list = []
30
+ meta_dict = {
31
+ 'globals': [],
32
+ 'label': [],
33
+ 'weight': [],
34
+ 'tracking': [],
35
+ 'batch_num_nodes': [],
36
+ 'batch_num_edges': [],
37
+ }
38
+
39
+ for i in range(len(cfg.arrays)):
40
+ g, meta = process_single_entry(cfg, i)
41
+ graph_list.append(g)
42
+ for k in meta_dict:
43
+ meta_dict[k].append(meta[k])
44
+
45
+ # Stack all metadata fields into tensors
46
+ for k in meta_dict:
47
+ meta_dict[k] = torch.stack(meta_dict[k])
48
+
49
+ graphs = Graphs(graphs=graph_list, metadata=meta_dict)
50
+ Normalization.save_stats(graphs, f"{cfg.save_path}/stats/{cfg.name}_{cfg.chunk_id:04d}.json")
51
+
52
+ if getattr(cfg.prebatch, "enabled", False):
53
+ graphs.shuffle()
54
+ graphs.batch(cfg.prebatch["chunk_size"])
55
+
56
+ save_graphs(graphs, f"{cfg.save_path}/{cfg.name}_{cfg.chunk_id:04d}.bin")
57
+
58
+ def process_single_entry(cfg, i):
59
+ # 1) node features
60
+ node_features: List[torch.Tensor] = []
61
+
62
+ for particle, branch_list in cfg.features.items():
63
+ feature_tensors: List[torch.Tensor] = []
64
+ for branch in branch_list:
65
+ if branch == "CALC_E":
66
+ pT = feature_tensors[0]
67
+ eta = feature_tensors[1]
68
+ val = pT * torch.cosh(eta)
69
+ elif isinstance(branch, str):
70
+ arr = cfg.arrays[branch][i]
71
+ val = torch.from_numpy(ak.to_numpy(arr)).to(cfg.dtype)
72
+ else:
73
+ length = feature_tensors[0].shape[0]
74
+ val = torch.full((length,), float(branch), dtype=cfg.dtype)
75
+ feature_tensors.append(val)
76
+
77
+ if feature_tensors and feature_tensors[0].numel() > 0:
78
+ block = torch.stack(feature_tensors, dim=1)
79
+ node_features.append(block)
80
+
81
+ node_features = torch.cat(node_features, dim=0) if node_features else torch.empty((0, len(cfg.features)), dtype=cfg.dtype)
82
+
83
+ # 2) global features
84
+ global_feat_list: List[torch.Tensor] = []
85
+ for b in cfg.globals:
86
+ if b == "NUM_NODES":
87
+ global_feat_list.append(torch.tensor([len(node_features)], dtype=cfg.dtype))
88
+ else:
89
+ arr = cfg.arrays[b][i]
90
+ global_feat_list.append(torch.from_numpy(ak.to_numpy(arr)).to(cfg.dtype))
91
+ global_feat = torch.cat(global_feat_list, dim=0) if global_feat_list else torch.zeros((1,), dtype=cfg.dtype)
92
+
93
+ # 3) tracking
94
+ tracking_list: List[torch.Tensor] = []
95
+ for b in cfg.tracking:
96
+ arr = cfg.arrays[b][i]
97
+ tracking_list.append(torch.from_numpy(ak.to_numpy(arr)).to(cfg.dtype))
98
+ tracking = torch.cat(tracking_list, dim=0) if tracking_list else torch.zeros((1,), dtype=cfg.dtype)
99
+
100
+ # 4) weight
101
+ weight = float(cfg.arrays[cfg.weights][i]) if isinstance(cfg.weights, str) else cfg.weights
102
+ weight = torch.tensor(weight, dtype=cfg.dtype)
103
+
104
+ # 5) label
105
+ label = float(cfg.arrays[cfg.label][i]) if isinstance(cfg.label, str) else cfg.label
106
+ label = torch.tensor(label, dtype=cfg.dtype)
107
+
108
+ # 6) make the DGLGraph
109
+ g = make_graph(node_features, dtype=cfg.dtype)
110
+
111
+ # 7) batch_num_nodes and batch_num_edges
112
+ batch_num_nodes = g.batch_num_nodes()
113
+ batch_num_edges = g.batch_num_edges()
114
+
115
+ meta = {
116
+ 'globals': global_feat,
117
+ 'label': label,
118
+ 'weight': weight,
119
+ 'tracking': tracking,
120
+ 'batch_num_nodes': batch_num_nodes,
121
+ 'batch_num_edges': batch_num_edges,
122
+ }
123
+ return g, meta
124
+
125
+ src_dst_cache = {}
126
+ def get_src_dst(num_nodes):
127
+ if num_nodes not in src_dst_cache:
128
+ src, dst = torch.meshgrid(torch.arange(num_nodes), torch.arange(num_nodes), indexing='ij')
129
+ src_dst_cache[num_nodes] = (src.flatten(), dst.flatten())
130
+ return src_dst_cache[num_nodes]
131
+
132
+ @torch.jit.script
133
+ def compute_edge_features(eta, phi, src, dst):
134
+ deta = eta[src] - eta[dst]
135
+ dphi = phi[src] - phi[dst]
136
+ dphi = torch.remainder(dphi + np.pi, 2 * np.pi) - np.pi
137
+ dR = torch.sqrt(deta ** 2 + dphi ** 2)
138
+ edge_features = torch.stack([dR, deta, dphi], dim=1)
139
+ return edge_features
140
+
141
+ def make_graph(node_features: torch.tensor, dtype=torch.float32):
142
+
143
+ num_nodes = node_features.shape[0]
144
+ if num_nodes == 0:
145
+ g = dgl.graph(([], []))
146
+ g.ndata['features'] = node_features
147
+ g.edata['features'] = torch.empty((0, 3), dtype=dtype)
148
+ g.globals = torch.tensor([0], dtype=dtype)
149
+ return g
150
+
151
+ src, dst = get_src_dst(num_nodes)
152
+ src = src.flatten()
153
+ dst = dst.flatten()
154
+ g = dgl.graph((src, dst))
155
+ g.ndata['features'] = node_features
156
+
157
+ eta = node_features[:, 1]
158
+ phi = node_features[:, 2]
159
+ edge_features = compute_edge_features(eta, phi, src, dst)
160
+ g.edata['features'] = edge_features
161
+
162
+ return g
physicsnemo/dataset/Graphs.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dgl
2
+ import torch
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Dict
5
+
6
+ @dataclass
7
+ class Graphs:
8
+ graphs: List[dgl.DGLGraph]
9
+ metadata: Dict[str, torch.Tensor]
10
+
11
+ def __len__(self):
12
+ return len(self.graphs)
13
+
14
+ def __getitem__(self, idx):
15
+ meta = {k: v[idx] for k, v in self.metadata.items()}
16
+ return self.graphs[idx], meta
17
+
18
+ def shuffle(self):
19
+ idx = torch.randperm(len(self.graphs))
20
+ self.graphs = [self.graphs[i] for i in idx]
21
+ for k in self.metadata:
22
+ self.metadata[k] = self.metadata[k][idx]
23
+
24
+ def batch(self, batch_size, node_feature_dim=None, dtype=None):
25
+ """
26
+ In-place batching: after this, self.graphs is a list of batched DGLGraphs,
27
+ and self.metadata[k] is a tensor of shape [num_batches, batch_size, ...].
28
+ """
29
+ batched_graphs = []
30
+ batched_meta = {k: [] for k in self.metadata}
31
+ N = len(self.graphs)
32
+
33
+ # Infer node_feature_dim and dtype if not specified
34
+ if node_feature_dim is None and N > 0:
35
+ feats = self.graphs[0].ndata['features']
36
+ node_feature_dim = feats.shape[1] if feats.ndim > 1 else 1
37
+ if dtype is None and N > 0:
38
+ dtype = self.graphs[0].ndata['features'].dtype
39
+
40
+ for start in range(0, N, batch_size):
41
+ end = start + batch_size
42
+ batch_graphs = self.graphs[start:end]
43
+ batch_meta = {k: v[start:end] for k, v in self.metadata.items()}
44
+
45
+ # Padding if needed
46
+ pad_count = batch_size - len(batch_graphs)
47
+ if pad_count > 0:
48
+ dummy_graph = dgl.graph(([], []))
49
+ dummy_graph.ndata['features'] = torch.empty((0, node_feature_dim), dtype=dtype)
50
+ dummy_graph.edata['features'] = torch.empty((0, 3), dtype=dtype) # assuming 3 edge features
51
+ batch_graphs += [dummy_graph] * pad_count
52
+
53
+ # Pad metadata with zeros
54
+ for k, v in batch_meta.items():
55
+ shape = list(v[0].shape) if len(v) > 0 else []
56
+ pad_tensor = torch.zeros([pad_count] + shape, dtype=v.dtype, device=v.device)
57
+ batch_meta[k] = torch.cat([v, pad_tensor], dim=0)
58
+ else:
59
+ for k, v in batch_meta.items():
60
+ batch_meta[k] = torch.stack(v, dim=0) if isinstance(v, list) else v
61
+
62
+ batched_graphs.append(dgl.batch(batch_graphs))
63
+ for k in batched_meta:
64
+ batched_meta[k].append(batch_meta[k])
65
+
66
+ # Now stack along a new axis: [num_batches, batch_size, ...]
67
+ for k in batched_meta:
68
+ self.metadata[k] = torch.stack(batched_meta[k], dim=0)
69
+
70
+ self.graphs = batched_graphs
71
+
72
+ def normalize(self, stats):
73
+ node_mean, node_std, _ = stats['node']
74
+ edge_mean, edge_std, _ = stats['edge']
75
+ for g in self.graphs:
76
+ g.ndata['features'] = (g.ndata['features'] - node_mean) / node_std
77
+ g.edata['features'] = (g.edata['features'] - edge_mean) / edge_std
78
+
79
+ def save_graphs(graphs: Graphs, f: str):
80
+ meta_to_save = {k: v for k, v in graphs.metadata.items()}
81
+ dgl.save_graphs(f, graphs.graphs, meta_to_save)
82
+
83
+ def load_graphs(f: str) -> Graphs:
84
+ g, meta = dgl.load_graphs(f)
85
+ for k in meta:
86
+ if not isinstance(meta[k], torch.Tensor):
87
+ meta[k] = torch.stack(meta[k])
88
+ return Graphs(graphs=g, metadata=meta)
physicsnemo/dataset/Normalization.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import json
3
+ import os
4
+ from dataset.Graphs import Graphs
5
+ from typing import List, Dict, Tuple
6
+
7
def combine_feature_stats(chunks: List[Dict]) -> Tuple[torch.Tensor, torch.Tensor, int]:
    """
    Merge per-chunk feature statistics into global statistics.

    Each chunk is a dict with per-feature 'mean' and 'std' lists and a
    'count'. The parallel (Chan/Welford) update is exact, so the result
    equals the population statistics over the concatenation of all chunks.
    Chunks with count == 0 are skipped.

    Returns:
        (mean, std, count); empty tensors and 0 when no data was seen.
    """
    total_count = 0
    running_mean = None
    running_m2 = None  # sum of squared deviations from the running mean

    for chunk in chunks:
        count = chunk['count']
        if count == 0:
            continue

        chunk_mean = torch.tensor(chunk['mean'])
        # Recover the chunk's M2 from its (population) std.
        chunk_m2 = torch.tensor(chunk['std']) ** 2 * count

        if total_count == 0:
            running_mean, running_m2, total_count = chunk_mean, chunk_m2, count
        else:
            combined = total_count + count
            delta = chunk_mean - running_mean
            running_mean = running_mean + delta * (count / combined)
            running_m2 = running_m2 + chunk_m2 + delta ** 2 * (total_count * count / combined)
            total_count = combined

    if total_count == 0:
        return torch.tensor([]), torch.tensor([]), 0

    return running_mean, torch.sqrt(running_m2 / total_count), total_count
41
+
42
def global_stats(dirpath: str, dtype: torch.dtype) -> Dict[str, Tuple[torch.Tensor, torch.Tensor, int]]:
    """
    Return combined node/edge normalization stats for a directory of chunks.

    On first use the per-chunk JSON stats files in `dirpath` are merged via
    `combine_feature_stats` and cached to `global_stats.json`; subsequent
    calls just read the cached file.

    Returns:
        {'node': (mean, std, count), 'edge': (mean, std, count)} with the
        mean/std tensors cast to `dtype`.
    """
    cache_file = os.path.join(dirpath, "global_stats.json")

    if not os.path.exists(cache_file):
        # First run: collect every per-chunk stats file in the directory.
        per_chunk = []
        for fname in os.listdir(dirpath):
            if fname.endswith('.json'):
                with open(os.path.join(dirpath, fname), 'r') as f:
                    per_chunk.append(json.load(f))

        merged = {
            'node': combine_feature_stats([s['node'] for s in per_chunk]),
            'edge': combine_feature_stats([s['edge'] for s in per_chunk]),
        }

        serializable = {}
        for key, (mean, std, count) in merged.items():
            serializable[key] = {
                'mean': mean.tolist() if mean.numel() > 0 else [],
                'std': std.tolist() if std.numel() > 0 else [],
                'count': count,
            }

        with open(cache_file, 'w') as f:
            json.dump(serializable, f, indent=4)

    with open(cache_file, 'r') as f:
        cached = json.load(f)

    def as_tensors(entry):
        # Empty lists become empty tensors of the requested dtype.
        mean = torch.tensor(entry['mean'], dtype=dtype) if entry['mean'] else torch.tensor([], dtype=dtype)
        std = torch.tensor(entry['std'], dtype=dtype) if entry['std'] else torch.tensor([], dtype=dtype)
        return mean, std, entry['count']

    return {
        'node': as_tensors(cached['node']),
        'edge': as_tensors(cached['edge']),
    }
89
+
90
def compute_stats(feats, eps=1e-6):
    """
    Per-feature mean and std (population) over the first dimension.

    A single sample has zero variance by definition; stds below `eps` are
    clamped to `eps` so later division never blows up.

    Args:
        feats: tensor of shape [num_samples, num_features].
        eps: element-wise lower bound applied to the std.

    Returns:
        (mean, std) tensors of shape [num_features].
    """
    mean = feats.mean(dim=0)
    if feats.size(0) <= 1:
        var = torch.zeros_like(mean)
    else:
        var = ((feats - mean) ** 2).mean(dim=0)
    # Clamp tiny/zero stds so normalization stays well-defined.
    std = torch.sqrt(var).clamp_min(eps)
    return mean, std
100
+
101
def save_stats(graphs: 'Graphs', filepath: str, categorical_unique_threshold=50):
    """
    Compute and save normalization stats (mean, std, counts) for node and edge features.

    Node feature columns with fewer than `categorical_unique_threshold`
    distinct values are treated as categorical and get mean=0 / std=1 so
    they pass through normalization unchanged.

    Args:
        graphs: iterable of (graph, label) pairs; each graph exposes
            ndata['features'] and edata['features'] tensors.
        filepath: destination JSON path; parent directories are created.
        categorical_unique_threshold: unique-value cutoff below which a node
            feature column is treated as categorical.

    Raises:
        ValueError: if `graphs` is empty.
    """
    if len(graphs) == 0:
        raise ValueError("No graphs to compute stats from.")

    # Pool node and edge features across all graphs.
    all_node_feats = torch.cat([g.ndata['features'] for g, _ in graphs], dim=0)
    all_edge_feats = torch.cat([g.edata['features'] for g, _ in graphs], dim=0)

    counts = {
        'node': all_node_feats.size(0),
        'edge': all_edge_feats.size(0),
    }

    node_mean, node_std = compute_stats(all_node_feats)
    edge_mean, edge_std = compute_stats(all_edge_feats)

    # Disable normalization for categorical node columns.
    categorical_mask = torch.tensor([
        torch.unique(all_node_feats[:, i]).numel() < categorical_unique_threshold
        for i in range(node_mean.size(0))
    ], dtype=torch.bool)
    node_mean[categorical_mask] = 0.0
    node_std[categorical_mask] = 1.0

    stats = {
        'node': {
            'mean': node_mean.tolist(),
            'std': node_std.tolist(),
            'count': counts['node'],
        },
        'edge': {
            'mean': edge_mean.tolist(),
            'std': edge_std.tolist(),
            'count': counts['edge'],
        },
    }

    # BUGFIX: os.path.dirname returns '' for a bare filename, and
    # os.makedirs('') raises FileNotFoundError even with exist_ok=True.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w') as f:
        json.dump(stats, f, indent=4)
physicsnemo/dataset_utils.py DELETED
@@ -1,121 +0,0 @@
1
- import dgl
2
- import torch
3
- import numpy as np
4
- import awkward as ak
5
- from dataclasses import dataclass
6
- from typing import List, Any, Dict
7
- from tqdm import tqdm
8
-
9
- @dataclass
10
- class ChunkConfig:
11
- name: str
12
- label: str
13
- chunk_id: int
14
- batch_size: int
15
- arrays: List[Any]
16
- particles: List[Any]
17
- features: List[Any]
18
- branches: List[Any]
19
- dtype: torch.dtype
20
- save_path: str
21
-
22
- def process_chunk(cfg: ChunkConfig):
23
- n_entries = len(cfg.arrays)
24
- arrays_ordered = {}
25
-
26
- for b in cfg.branches:
27
- if b in cfg.arrays.fields:
28
- arrays_ordered[b] = cfg.arrays[b]
29
- elif b.endswith("_energy"):
30
- prefix = b[:-7]
31
- pt_name = f"{prefix}_pt"
32
- if prefix == "MET":
33
- pt_name = f"{prefix}_met"
34
- eta_name = f"{prefix}_eta"
35
- arrays_ordered[b] = cfg.arrays[pt_name] * np.cosh(cfg.arrays[eta_name])
36
- elif "node_type" in b:
37
- prefix = b[:-10]
38
- pt_name = f"{prefix}_pt"
39
- if prefix == "MET":
40
- pt_name = f"{prefix}_met"
41
- index = cfg.particles.index(prefix)
42
- arrays_ordered[b] = ak.ones_like(cfg.arrays[pt_name]) * index
43
- else:
44
- prefix = b.split("_")[0]
45
- pt_name = f"{prefix}_pt"
46
- if prefix == "MET":
47
- pt_name = f"{prefix}_met"
48
- arrays_ordered[b] = ak.zeros_like(cfg.arrays[pt_name])
49
-
50
- graphs = []
51
-
52
- for i in range(n_entries):
53
- node_features_list = []
54
- for p in cfg.particles:
55
- feats = []
56
- for f in cfg.features:
57
- branch = f"{p}_{f}"
58
- if p == "MET" and f == "pt":
59
- branch = "MET_met"
60
- value = ak.to_numpy(arrays_ordered[branch][i])
61
- feats.append(value)
62
- if len(feats[0]) == 0:
63
- continue
64
- node_array = np.stack(feats, axis=1)
65
- node_features_list.append(node_array)
66
- if node_features_list:
67
- node_features = np.concatenate(node_features_list, axis=0)
68
- else:
69
- node_features = np.empty((0, len(cfg.features)))
70
- graphs.append(make_graph(node_features, dtype=cfg.dtype))
71
-
72
- labels = torch.full((len(graphs),), cfg.label, dtype=cfg.dtype)
73
- save_graphs(f"{cfg.save_path}/{cfg.name}_{cfg.chunk_id:04d}.bin", graphs, {'labels': labels})
74
- return
75
-
76
- def save_graphs(f: str, g: List[dgl.DGLGraph], metadata: Dict) -> None:
77
- dgl.save_graphs(f, g, metadata)
78
-
79
- def load_graphs(f: str):
80
- g, metadata = dgl.load_graphs(f)
81
- return g, metadata['labels']
82
-
83
- src_dst_cache = {}
84
- def get_src_dst(num_nodes):
85
- if num_nodes not in src_dst_cache:
86
- src, dst = torch.meshgrid(torch.arange(num_nodes), torch.arange(num_nodes), indexing='ij')
87
- src_dst_cache[num_nodes] = (src.flatten(), dst.flatten())
88
- return src_dst_cache[num_nodes]
89
-
90
- @torch.jit.script
91
- def compute_edge_features(eta, phi, src, dst):
92
- deta = eta[src] - eta[dst]
93
- dphi = phi[src] - phi[dst]
94
- dphi = torch.remainder(dphi + np.pi, 2 * np.pi) - np.pi
95
- dR = torch.sqrt(deta ** 2 + dphi ** 2)
96
- edge_features = torch.stack([dR, deta, dphi], dim=1)
97
- return edge_features
98
-
99
- # TODO: normalize all features
100
- def make_graph(node_features: np.array, dtype=torch.float32):
101
- node_features = torch.tensor(node_features, dtype=dtype)
102
- num_nodes = node_features.shape[0]
103
- if num_nodes == 0:
104
- g = dgl.graph(([], []))
105
- g.ndata['features'] = node_features
106
- g.edata['features'] = torch.empty((0, 3), dtype=dtype)
107
- g.globals = torch.tensor([0], dtype=dtype)
108
- return g
109
- src, dst = get_src_dst(num_nodes)
110
- src = src.flatten()
111
- dst = dst.flatten()
112
- g = dgl.graph((src, dst))
113
- g.ndata['features'] = node_features
114
-
115
- eta = node_features[:, 1]
116
- phi = node_features[:, 2]
117
- edge_features = compute_edge_features(eta, phi, src, dst)
118
- g.edata['features'] = edge_features
119
-
120
- g.globals = torch.tensor([num_nodes], dtype=dtype)
121
- return g
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
physicsnemo/metrics.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn.functional as F
4
+
5
def bce(input, target, weights=None):
    """
    Plain (unweighted) binary cross entropy on logits, averaged over elements.

    A trailing singleton dimension on either tensor is squeezed away so that
    e.g. [N, 1] logits can be scored against [N] targets. The `weights`
    argument is accepted for signature compatibility but is not used here.
    """
    if input.shape != target.shape:
        input_has_extra_dim = input.shape[-1] == 1 and input.shape[:-1] == target.shape
        target_has_extra_dim = target.shape[-1] == 1 and target.shape[:-1] == input.shape
        if input_has_extra_dim:
            input = input.squeeze(-1)
        elif target_has_extra_dim:
            target = target.squeeze(-1)

    per_element = F.binary_cross_entropy_with_logits(input, target, reduction='none')
    return per_element.mean()
15
+
16
def weighted_bce(input, target, weights=None):
    """
    Weighted, label-balanced binary cross entropy on logits.

    The per-element BCE loss is grouped by the distinct label values in
    `target`; within each group the losses are combined as a weighted mean,
    and the final result is the plain mean of the per-group values, so each
    class contributes equally regardless of its frequency or total weight.

    Args:
        input (Tensor): predicted logits of shape (N, ...); a trailing
            singleton dim on either tensor is squeezed away.
        target (Tensor): ground-truth labels with discrete values.
        weights (Tensor or None): optional per-sample weights (defaults to ones).

    Returns:
        Tensor: scalar loss (0.0 when no group has positive total weight).
    """
    if input.shape != target.shape:
        if input.shape[-1] == 1 and input.shape[:-1] == target.shape:
            input = input.squeeze(-1)
        elif target.shape[-1] == 1 and target.shape[:-1] == input.shape:
            target = target.squeeze(-1)

    per_element = F.binary_cross_entropy_with_logits(input, target, reduction='none')

    if weights is None:
        weights = torch.ones_like(per_element)

    per_label_losses = []
    for label in torch.unique(target):
        mask = (target == label).bool()  # defensive cast, mask is already bool
        group_weights = weights[mask]
        group_losses = per_element[mask]
        total_weight = group_weights.sum()
        if total_weight > 0:
            per_label_losses.append((group_weights * group_losses).sum() / total_weight)

    if not per_label_losses:
        return torch.tensor(0.0, device=input.device)
    return torch.stack(per_label_losses).mean()
64
+
65
+
66
def roc_auc_score(classes : np.ndarray,
                  predictions : np.ndarray,
                  weights : np.ndarray = None) -> float:
    """
    Weighted ROC AUC computed as the probability of correct pair ordering.

    AUC = P(score of a class1 sample > score of a class0 sample), with tied
    predictions counted as half a correct ordering. All three arrays must be
    1-D and the same length; `weights` defaults to all ones.

    NOTE(review): unpacking `sorted(np.unique(classes))` into two names
    raises ValueError when `classes` has a number of distinct values other
    than two — callers (e.g. the eval loop) catch that ValueError.
    """

    if weights is None:
        weights = np.ones_like(predictions)

    assert len(classes) == len(predictions) == len(weights)
    assert classes.ndim == predictions.ndim == weights.ndim == 1
    class0, class1 = sorted(np.unique(classes))

    # Pack into a structured array so class, prediction and weight stay
    # aligned through the sorts below.
    data = np.empty(
        shape=len(classes),
        dtype=[('c', classes.dtype),
               ('p', predictions.dtype),
               ('w', weights.dtype)]
    )
    data['c'], data['p'], data['w'] = classes, predictions, weights

    # Sort by class first, then stably by prediction, so within a run of
    # equal predictions the class order is preserved.
    data = data[np.argsort(data['c'])]
    data = data[np.argsort(data['p'], kind='mergesort')] # here we're relying on stability as we need class orders preserved

    correction = 0.
    # mask1 - bool mask to highlight collision areas
    # mask2 - bool mask with collision areas' start points
    mask1 = np.empty(len(data), dtype=bool)
    mask2 = np.empty(len(data), dtype=bool)
    mask1[0] = mask2[-1] = False
    mask1[1:] = data['p'][1:] == data['p'][:-1]
    if mask1.any():
        mask2[:-1] = ~mask1[:-1] & mask1[1:]
        mask1[:-1] |= mask1[1:]
        ids, = mask2.nonzero()
        # Each run of tied predictions contributes (weighted class0 mass) *
        # (weighted class1 mass) tied pairs, each counted with factor 0.5.
        correction = sum([((dsplit['c'] == class0) * dsplit['w'] * msplit).sum() *
                          ((dsplit['c'] == class1) * dsplit['w'] * msplit).sum()
                          for dsplit, msplit in zip(np.split(data, ids), np.split(mask1, ids))]) * 0.5

    # cumsum of class0 weight below each position counts the correctly
    # ordered cross-class pairs; subtract the tie correction and normalize
    # by the total weighted number of cross-class pairs.
    weights_0 = data['w'] * (data['c'] == class0)
    weights_1 = data['w'] * (data['c'] == class1)
    cumsum_0 = weights_0.cumsum()

    return ((cumsum_0 * weights_1).sum() - correction) / (weights_1.sum() * cumsum_0[-1])
physicsnemo/models/MeshGraphNet.py CHANGED
@@ -6,25 +6,123 @@ import dgl
6
  from physicsnemo.models.meshgraphnet import MeshGraphNet as PhysicsNemoMeshGraphNet
7
 
8
  class MeshGraphNet(nn.Module):
9
- def __init__(self, *args, out_dim=1, **kwargs):
10
  super().__init__()
11
- # Initialize the PhysicsNemo MeshGraphNet
12
- self.base_gnn = PhysicsNemoMeshGraphNet(*args, **kwargs)
13
- # Assume node_output_dim is known or infer from args/kwargs
14
- node_output_dim = kwargs.get('hidden_dim_node_decoder', 64)
15
- self.mlp = nn.Linear(node_output_dim, out_dim)
16
 
17
- def forward(self, node_feats, edge_feats, batched_graph):
 
 
 
 
 
 
 
 
 
 
18
  """
19
- Args:
20
- node_feats: [total_num_nodes, node_feat_dim]
21
- edge_feats: [total_num_edges, edge_feat_dim]
22
- batched_graph: DGLGraph, batched graphs
 
23
  Returns:
24
  graph_pred: [num_graphs, out_dim]
25
  """
26
  node_pred = self.base_gnn(node_feats, edge_feats, batched_graph)
27
  batched_graph.ndata['h'] = node_pred
28
- graph_feat = dgl.readout_nodes(batched_graph, 'h', op='mean') # [num_graphs, node_output_dim]
29
- graph_pred = self.mlp(graph_feat) # [num_graphs, out_dim]
30
- return graph_pred
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from physicsnemo.models.meshgraphnet import MeshGraphNet as PhysicsNemoMeshGraphNet
7
 
8
  class MeshGraphNet(nn.Module):
9
+ def __init__(self, cfg):
10
  super().__init__()
11
+ base_gnn_cfg = cfg.base_gnn
12
+ self.base_gnn = PhysicsNemoMeshGraphNet(**base_gnn_cfg)
 
 
 
13
 
14
+ self.global_mlp = nn.Sequential(
15
+ nn.Linear(cfg.global_feat_dim, cfg.global_emb_dim),
16
+ nn.ReLU(),
17
+ )
18
+
19
+ self.mlp = nn.Linear(
20
+ base_gnn_cfg['output_dim'] + base_gnn_cfg['input_dim_edges'] + cfg.global_emb_dim,
21
+ cfg.out_dim
22
+ )
23
+
24
+ def forward(self, node_feats, edge_feats, global_feats, batched_graph, metadata={}):
25
  """
26
+ node_feats: [total_num_nodes, node_feat_dim]
27
+ edge_feats: [total_num_edges, edge_feat_dim]
28
+ global_feats: [num_graphs, global_feat_dim]
29
+ batched_graph: DGLGraph, representing the collection of graphs in a batch
30
+ metadata: dict, may contain 'batch_num_nodes', 'batch_num_edges', etc.
31
  Returns:
32
  graph_pred: [num_graphs, out_dim]
33
  """
34
  node_pred = self.base_gnn(node_feats, edge_feats, batched_graph)
35
  batched_graph.ndata['h'] = node_pred
36
+ batched_graph.edata['e'] = edge_feats
37
+
38
+ graph_node_feat = mean_nodes(batched_graph, 'h', node_split=metadata.get("batch_num_nodes", None))
39
+ graph_edge_feat = mean_edges(batched_graph, 'e', edge_split=metadata.get("batch_num_edges", None))
40
+
41
+ # Flatten global_feats if needed
42
+ if global_feats.ndim == 3:
43
+ global_feats = global_feats.view(-1, global_feats.shape[-1])
44
+ global_emb = self.global_mlp(global_feats) # [num_graphs, global_emb_dim]
45
+
46
+ combined_feat = torch.cat([graph_node_feat, graph_edge_feat, global_emb], dim=-1)
47
+ graph_pred = self.mlp(combined_feat)
48
+ return graph_pred
49
+
50
+ def mean_nodes(batched_graph, feat_key='h', op='mean', node_split=None):
51
+ """
52
+ Aggregates node features per disjoint graph in a batched DGLGraph.
53
+
54
+ Args:
55
+ batched_graph: DGLGraph
56
+ feat_key: str, node feature key
57
+ op: 'mean', 'sum', or 'max'
58
+ node_split: 1D tensor or list of ints (num nodes per graph)
59
+
60
+ Returns:
61
+ Tensor of shape [num_graphs, node_feat_dim]
62
+ """
63
+ h = batched_graph.ndata[feat_key]
64
+ if node_split is None or len(node_split) == 0:
65
+ if op == 'mean':
66
+ return dgl.mean_nodes(batched_graph, feat_key)
67
+ elif op == 'sum':
68
+ return dgl.sum_nodes(batched_graph, feat_key)
69
+ elif op == 'max':
70
+ return dgl.max_nodes(batched_graph, feat_key)
71
+ else:
72
+ raise ValueError(f"Unknown op: {op}")
73
+ else:
74
+ # Ensure node_split is a flat list of ints
75
+ if isinstance(node_split, torch.Tensor):
76
+ splits = node_split.view(-1).tolist()
77
+ else:
78
+ splits = [int(x) for x in node_split]
79
+ chunks = torch.split(h, splits, dim=0)
80
+ if op == 'mean':
81
+ out = torch.stack([chunk.mean(0) if chunk.shape[0] > 0 else torch.zeros_like(h[0]) for chunk in chunks])
82
+ elif op == 'sum':
83
+ out = torch.stack([chunk.sum(0) if chunk.shape[0] > 0 else torch.zeros_like(h[0]) for chunk in chunks])
84
+ elif op == 'max':
85
+ out = torch.stack([chunk.max(0).values if chunk.shape[0] > 0 else torch.zeros_like(h[0]) for chunk in chunks])
86
+ else:
87
+ raise ValueError(f"Unknown op: {op}")
88
+ return out
89
+
90
+ def mean_edges(batched_graph, feat_key='e', op='mean', edge_split=None):
91
+ """
92
+ Aggregates edge features per disjoint graph in a batched DGLGraph.
93
+
94
+ Args:
95
+ batched_graph: DGLGraph
96
+ feat_key: str, edge feature key
97
+ op: 'mean', 'sum', or 'max'
98
+ edge_split: 1D tensor or list of ints (num edges per graph)
99
+
100
+ Returns:
101
+ Tensor of shape [num_graphs, edge_feat_dim]
102
+ """
103
+ e = batched_graph.edata[feat_key]
104
+ if edge_split is None or len(edge_split) == 0:
105
+ if op == 'mean':
106
+ return dgl.mean_edges(batched_graph, feat_key)
107
+ elif op == 'sum':
108
+ return dgl.sum_edges(batched_graph, feat_key)
109
+ elif op == 'max':
110
+ return dgl.max_edges(batched_graph, feat_key)
111
+ else:
112
+ raise ValueError(f"Unknown op: {op}")
113
+ else:
114
+ # Ensure edge_split is a flat list of ints
115
+ if isinstance(edge_split, torch.Tensor):
116
+ splits = edge_split.view(-1).tolist()
117
+ else:
118
+ splits = [int(x) for x in edge_split]
119
+ chunks = torch.split(e, splits, dim=0)
120
+ if op == 'mean':
121
+ out = torch.stack([chunk.mean(0) if chunk.shape[0] > 0 else torch.zeros_like(e[0]) for chunk in chunks])
122
+ elif op == 'sum':
123
+ out = torch.stack([chunk.sum(0) if chunk.shape[0] > 0 else torch.zeros_like(e[0]) for chunk in chunks])
124
+ elif op == 'max':
125
+ out = torch.stack([chunk.max(0).values if chunk.shape[0] > 0 else torch.zeros_like(e[0]) for chunk in chunks])
126
+ else:
127
+ raise ValueError(f"Unknown op: {op}")
128
+ return out
physicsnemo/train.py CHANGED
@@ -14,73 +14,15 @@ from physicsnemo.launch.logging import (
14
  )
15
  from physicsnemo.launch.utils import load_checkpoint, save_checkpoint
16
  from physicsnemo.distributed.manager import DistributedManager
17
- from dataset import get_dataset
18
- import json
19
 
 
 
20
  import random
21
 
22
- from sklearn.metrics import roc_auc_score
23
-
24
  import models.MeshGraphNet as MeshGraphNet
 
 
25
 
26
- import torch.nn.functional as F
27
-
28
- def bce(input, target, device=None, weights=None):
29
- if input.shape != target.shape:
30
- if input.shape[-1] == 1 and input.shape[:-1] == target.shape:
31
- input = input.squeeze(-1)
32
- elif target.shape[-1] == 1 and target.shape[:-1] == input.shape:
33
- target = target.squeeze(-1)
34
-
35
- loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
36
- return torch.mean(loss)
37
-
38
- def weighted_bce(input, target, device=None, weights=None):
39
- """
40
- Compute a weighted and label-normalized binary cross entropy (BCE) loss.
41
-
42
- For each unique label in the target tensor, the BCE loss is computed and weighted,
43
- then normalized by the sum of weights for that label. The final loss is the mean
44
- of these per-label normalized losses.
45
-
46
- Args:
47
- input (Tensor): Predicted logits of shape (N, ...).
48
- target (Tensor): Ground truth labels of shape (N, ...), with discrete label values.
49
- device (torch.device or None): Device to move tensors to (optional).
50
- weights (Tensor or None): Optional tensor of per-sample weights, same shape as input/target.
51
-
52
- Returns:
53
- Tensor: Scalar tensor representing the normalized weighted BCE loss.
54
- """
55
-
56
- if input.shape != target.shape:
57
- if input.shape[-1] == 1 and input.shape[:-1] == target.shape:
58
- input = input.squeeze(-1)
59
- elif target.shape[-1] == 1 and target.shape[:-1] == input.shape:
60
- target = target.squeeze(-1)
61
-
62
- if weights is None:
63
- weights = torch.ones_like(target).to(device)
64
-
65
- # Compute per-element BCE loss (no reduction)
66
- loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
67
-
68
- # Vectorized label normalization
69
- unique_labels = torch.unique(target)
70
- normalized_losses = []
71
- for label in unique_labels:
72
- label_mask = (target == label)
73
- label_weights = weights[label_mask]
74
- label_losses = loss[label_mask]
75
- weight_sum = label_weights.sum()
76
- if weight_sum > 0:
77
- label_loss = (label_weights * label_losses).sum() / weight_sum
78
- normalized_losses.append(label_loss)
79
-
80
- if normalized_losses:
81
- return torch.stack(normalized_losses).mean()
82
- else:
83
- return torch.tensor(0.0, device=input.device)
84
 
85
  class MGNTrainer:
86
  def __init__(self, logger, cfg, dist):
@@ -91,37 +33,28 @@ class MGNTrainer:
91
  params = {}
92
 
93
  start = time.time()
94
- self.dataloader, self.valloader, self.testloader = get_dataset(cfg, self.device)
95
  print(f"total time loading dataset: {time.time() - start:.2f} seconds")
96
 
97
- dtype_str = getattr(cfg.root_dataset, "type", "torch.float32")
98
  if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
99
  self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
100
  else:
101
  self.dtype = torch.float32
102
 
103
- nodes_features = cfg.root_dataset.features
104
- edges_features = ["dR", "deta", "dphi"]
105
  global_features = ["num_nodes"]
106
 
107
- params["infeat_nodes"] = len(nodes_features)
108
- params["infeat_edges"] = len(edges_features)
109
  params["infeat_globals"] = len(global_features)
110
- params["out_dim"] = cfg.architecture.hidden_dim_node_encoder
111
- params["node_features"] = list(nodes_features)
112
- params["edges_features"] = edges_features
113
  params["global_features"] = global_features
114
 
115
- self.model = MeshGraphNet.MeshGraphNet(
116
- params["infeat_nodes"],
117
- params["infeat_edges"],
118
- params['out_dim'],
119
- processor_size=cfg.architecture.processor_size,
120
- hidden_dim_node_encoder=cfg.architecture.hidden_dim_node_encoder,
121
- hidden_dim_edge_encoder=cfg.architecture.hidden_dim_edge_encoder,
122
- hidden_dim_processor=cfg.architecture.hidden_dim_processor,
123
- hidden_dim_node_decoder=cfg.architecture.hidden_dim_node_decoder,
124
- )
125
  self.model = self.model.to(dtype=self.dtype, device=self.device)
126
 
127
  if cfg.performance.jit:
@@ -168,7 +101,7 @@ class MGNTrainer:
168
  loss.backward()
169
  self.optimizer.step()
170
 
171
- def train(self, graph, label):
172
  """
173
  Perform one training iteration over one graph. The training is performed
174
  over multiple timesteps, where the number of timesteps is specified in
@@ -181,11 +114,16 @@ class MGNTrainer:
181
  loss: loss value.
182
 
183
  """
 
 
 
 
 
184
  self.optimizer.zero_grad()
185
- pred = self.model(graph.ndata["features"], graph.edata["features"], graph)
186
- loss = bce(pred, label, device=self.device)
187
  self.backward(loss)
188
- return loss
189
 
190
  @torch.no_grad()
191
  def eval(self):
@@ -201,18 +139,25 @@ class MGNTrainer:
201
  """
202
  predictions = []
203
  labels = []
204
-
205
- for graph, label in self.valloader:
206
-
207
- graph = graph.to(self.device)
208
- pred = self.model(graph.ndata["features"], graph.edata["features"], graph)
 
 
 
 
 
209
  predictions.append(pred)
210
- labels.append(label)
 
211
 
212
  predictions = torch.cat(predictions, dim=0)
213
  labels = torch.cat(labels, dim=0)
 
214
 
215
- loss = weighted_bce(predictions, labels, device=self.device)
216
 
217
  # Convert logits to probabilities
218
  prob = torch.sigmoid(predictions)
@@ -223,13 +168,13 @@ class MGNTrainer:
223
 
224
  # Calculate AUC
225
  try:
226
- auc = roc_auc_score(labels_flat, prob_flat)
227
  except ValueError:
228
  auc = float('nan') # Not enough classes present for AUC
229
 
230
  return loss, auc
231
 
232
- @hydra.main(version_base=None, config_path=".", config_name="config")
233
  def do_training(cfg: DictConfig):
234
  """
235
  Perform training over all graphs in the dataset.
@@ -247,8 +192,9 @@ def do_training(cfg: DictConfig):
247
  dist = DistributedManager()
248
 
249
  # initialize loggers
 
250
  logger = PythonLogger("main")
251
- logger.file_logging()
252
 
253
  # initialize trainer
254
  trainer = MGNTrainer(logger, cfg, dist)
@@ -274,13 +220,14 @@ def do_training(cfg: DictConfig):
274
 
275
  # Training
276
  train_loss = []
277
- for graph, label in trainer.dataloader:
278
  trainer.model.train()
279
- train_loss.append(trainer.train(graph, label))
 
280
 
281
  val_loss, val_auc = trainer.eval()
282
 
283
- train_loss = torch.mean(torch.stack(train_loss)).item()
284
 
285
  logger.info(
286
  f"epoch: {epoch}, loss: {train_loss:10.3e}, val_loss: {val_loss:10.3e}, val_auc = {val_auc:10.3e}, time per epoch: {(time.time()-start):10.3e}"
 
14
  )
15
  from physicsnemo.launch.utils import load_checkpoint, save_checkpoint
16
  from physicsnemo.distributed.manager import DistributedManager
 
 
17
 
18
+ import json
19
+ from tqdm import tqdm
20
  import random
21
 
 
 
22
  import models.MeshGraphNet as MeshGraphNet
23
+ from dataset.Dataset import get_dataset
24
+ import metrics
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  class MGNTrainer:
28
  def __init__(self, logger, cfg, dist):
 
33
  params = {}
34
 
35
  start = time.time()
36
+ self.trainloader, self.valloader, self.testloader = get_dataset(cfg, self.device)
37
  print(f"total time loading dataset: {time.time() - start:.2f} seconds")
38
 
39
+ dtype_str = getattr(cfg.root_dataset, "dtype", "torch.float32")
40
  if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
41
  self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
42
  else:
43
  self.dtype = torch.float32
44
 
45
+ node_features = list(cfg.root_dataset.features.values())[0]
46
+ edge_features = ["dR", "deta", "dphi"]
47
  global_features = ["num_nodes"]
48
 
49
+ params["infeat_nodes"] = len(node_features)
50
+ params["infeat_edges"] = len(edge_features)
51
  params["infeat_globals"] = len(global_features)
52
+ params["out_dim"] = cfg.architecture.out_dim
53
+ params["node_features"] = list(node_features)
54
+ params["edge_features"] = edge_features
55
  params["global_features"] = global_features
56
 
57
+ self.model = MeshGraphNet.MeshGraphNet(cfg.architecture)
 
 
 
 
 
 
 
 
 
58
  self.model = self.model.to(dtype=self.dtype, device=self.device)
59
 
60
  if cfg.performance.jit:
 
101
  loss.backward()
102
  self.optimizer.step()
103
 
104
+ def train(self, graph, metadata):
105
  """
106
  Perform one training iteration over one graph. The training is performed
107
  over multiple timesteps, where the number of timesteps is specified in
 
114
  loss: loss value.
115
 
116
  """
117
+ graph = graph.to(self.device, non_blocking=True)
118
+ globals = metadata['globals'].to(self.device, non_blocking=True)
119
+ label = metadata['label'].to(self.device, non_blocking=True)
120
+ weight = metadata['weight'].to(self.device, non_blocking=True)
121
+
122
  self.optimizer.zero_grad()
123
+ pred = self.model(graph.ndata["features"], graph.edata["features"], globals, graph, metadata)
124
+ loss = metrics.weighted_bce(pred, label, weights=weight)
125
  self.backward(loss)
126
+ return loss.detach()
127
 
128
  @torch.no_grad()
129
  def eval(self):
 
139
  """
140
  predictions = []
141
  labels = []
142
+ weights = []
143
+
144
+ for graph, metadata in self.valloader:
145
+
146
+ graph = graph.to(self.device, non_blocking=True)
147
+ globals = metadata['globals'].to(self.device, non_blocking=True)
148
+ label = metadata['label'].to(self.device, non_blocking=True)
149
+ weight = metadata['weight'].to(self.device, non_blocking=True)
150
+
151
+ pred = self.model(graph.ndata["features"], graph.edata["features"], globals, graph, metadata)
152
  predictions.append(pred)
153
+ labels.append(label)
154
+ weights.append(weight)
155
 
156
  predictions = torch.cat(predictions, dim=0)
157
  labels = torch.cat(labels, dim=0)
158
+ weights = torch.cat(weights, dim=0)
159
 
160
+ loss = metrics.weighted_bce(predictions, labels, weights=weights)
161
 
162
  # Convert logits to probabilities
163
  prob = torch.sigmoid(predictions)
 
168
 
169
  # Calculate AUC
170
  try:
171
+ auc = metrics.roc_auc_score(labels_flat, prob_flat)
172
  except ValueError:
173
  auc = float('nan') # Not enough classes present for AUC
174
 
175
  return loss, auc
176
 
177
+ @hydra.main(version_base=None, config_path="./configs/", config_name="tHjb_CP_0_vs_45")
178
  def do_training(cfg: DictConfig):
179
  """
180
  Perform training over all graphs in the dataset.
 
192
  dist = DistributedManager()
193
 
194
  # initialize loggers
195
+ os.makedirs(cfg.checkpoints.ckpt_path, exist_ok=True)
196
  logger = PythonLogger("main")
197
+ logger.file_logging(os.path.join(cfg.checkpoints.ckpt_path, "train.log"))
198
 
199
  # initialize trainer
200
  trainer = MGNTrainer(logger, cfg, dist)
 
220
 
221
  # Training
222
  train_loss = []
223
+ for graph, metadata in tqdm(trainer.trainloader, desc=f"epoch {epoch} trianing"):
224
  trainer.model.train()
225
+ loss = trainer.train(graph, metadata)
226
+ train_loss.append(loss.item())
227
 
228
  val_loss, val_auc = trainer.eval()
229
 
230
+ train_loss = torch.tensor(train_loss).mean()
231
 
232
  logger.info(
233
  f"epoch: {epoch}, loss: {train_loss:10.3e}, val_loss: {val_loss:10.3e}, val_auc = {val_auc:10.3e}, time per epoch: {(time.time()-start):10.3e}"