ho22joshua committed on
Commit
4a5b33b
·
1 Parent(s): fbef6ad

updated dataset class

Browse files
physicsnemo/Dataset.py DELETED
@@ -1,293 +0,0 @@
1
- import os
2
- import uproot
3
- import dgl
4
- import torch
5
- import numpy as np
6
- import awkward as ak
7
- from omegaconf import DictConfig
8
- from typing import List
9
- from typing import Union
10
- import math
11
- import random
12
- from concurrent.futures import ThreadPoolExecutor, as_completed
13
-
14
- from torch.utils.data import Dataset
15
-
16
- from dgl.dataloading import GraphDataLoader
17
-
18
-
19
- os.environ["TMPDIR"] = "/pscratch/sd/j/joshuaho/tmp"
20
-
21
def make_graph(node_features: np.array, dtype=torch.float32):
    """Build a fully connected DGL graph whose nodes carry `node_features`.

    Edge features are [dR, deta, dphi], computed from column 1 (eta) and
    column 2 (phi) of the node features; `graph.globals` stores the node
    count so batching can recover per-graph sizes.
    """
    feats = torch.tensor(node_features, dtype=dtype)
    n = feats.shape[0]

    if n == 0:
        # Degenerate event: empty graph with correctly shaped feature tensors.
        empty = dgl.graph(([], []))
        empty.ndata['features'] = feats
        empty.edata['features'] = torch.empty((0, 3), dtype=dtype)
        empty.globals = torch.tensor([0], dtype=dtype)
        return empty

    # All-pairs connectivity, self-loops included (np.meshgrid 'xy' ordering).
    senders, receivers = np.meshgrid(np.arange(n), np.arange(n))
    senders = senders.flatten()
    receivers = receivers.flatten()

    graph = dgl.graph((senders, receivers))
    graph.ndata['features'] = feats  # shape: (num_nodes, num_features)

    eta = feats[:, 1]
    phi = feats[:, 2]
    d_eta = eta[senders] - eta[receivers]
    # Wrap the azimuthal difference back into [-pi, pi).
    d_phi = torch.remainder(phi[senders] - phi[receivers] + np.pi, 2 * np.pi) - np.pi
    d_r = torch.sqrt(d_eta ** 2 + d_phi ** 2)
    graph.edata['features'] = torch.stack([d_r, d_eta, d_phi], dim=1)

    graph.globals = torch.tensor([n], dtype=dtype)

    return graph
51
-
52
def process_chunk(args):
    """Convert one chunk of awkward arrays into DGL graphs and save them to disk.

    `args` is a single tuple of (name, label, chunk_id, arrays, particles,
    features, branches, dtype, save_path) so the whole job can be submitted
    to an executor as one picklable argument.
    """
    name, label, chunk_id, arrays, particles, features, branches, dtype, save_path = args
    n_entries = len(arrays)

    # Materialize every requested branch, deriving the ones absent from the file.
    arrays_ordered = {}
    for b in branches:
        if b in arrays.fields:
            arrays_ordered[b] = arrays[b]
        elif b.endswith("_energy"):
            # Massless approximation: E = pt * cosh(eta).
            prefix = b[:-7]
            pt_name = f"{prefix}_pt"
            if prefix == "MET":
                pt_name = f"{prefix}_met"
            eta_name = f"{prefix}_eta"
            arrays_ordered[b] = arrays[pt_name] * np.cosh(arrays[eta_name])
        elif "node_type" in b:
            # Encode the particle species index as a per-node feature.
            prefix = b[:-10]
            pt_name = f"{prefix}_pt"
            if prefix == "MET":
                pt_name = f"{prefix}_met"
            index = particles.index(prefix)
            arrays_ordered[b] = ak.ones_like(arrays[pt_name]) * index
        else:
            # Unknown branch: zero-fill with the per-particle multiplicity.
            prefix = b.split("_")[0]
            pt_name = f"{prefix}_pt"
            if prefix == "MET":
                pt_name = f"{prefix}_met"
            arrays_ordered[b] = ak.zeros_like(arrays[pt_name])

    graphs = []
    for i in range(n_entries):
        if (i % 250 == 0):
            print(f"{name} chunk {chunk_id} processed {i} events")
        node_features_list = []
        for p in particles:
            feats = []
            for f in features:
                branch = f"{p}_{f}"
                if p == "MET" and f == "pt":
                    branch = "MET_met"
                value = ak.to_numpy(arrays_ordered[branch][i])
                feats.append(value)
            # Guard the empty feature list too, not just empty particle counts,
            # so an empty `features` config cannot raise IndexError.
            if not feats or len(feats[0]) == 0:
                continue
            node_array = np.stack(feats, axis=1)
            node_features_list.append(node_array)
        if node_features_list:
            node_features = np.concatenate(node_features_list, axis=0)
        else:
            node_features = np.empty((0, len(features)))
        graphs.append(make_graph(node_features, dtype=dtype))

    labels = torch.full((len(graphs),), label, dtype=dtype)
    out_file = f"{save_path}/{name}_{chunk_id:02d}.bin"
    dgl.save_graphs(out_file, graphs, {'label': labels})
    # Report the exact file that was written (the save used :02d but the old
    # message claimed :03d, pointing at a file that does not exist).
    print(f"Saved {name} chunk {chunk_id:02d} to {out_file}")
    return
109
-
110
class Root_Graph:
    """Converts a ROOT ntuple into chunked DGL graph files and serves
    cached train/val/test splits from disk."""

    def __init__(
        self,
        name: str,
        label: int,
        load_path: str,
        save_path: str,
        cfg: DictConfig
    ):
        """Store dataset identity, file paths, and graph-building options from `cfg`."""
        self.name = name
        self.label = label
        self.load_path = load_path
        self.save_path = save_path
        self.data = None

        self.ttree = cfg.ttree
        self.particles = cfg.particles
        self.features = cfg.features
        self.globals = cfg.globals
        self.chunks = cfg.chunks

        self.train_val_test_split = cfg.train_val_test_split
        # Tolerant comparison: exact float equality rejects valid splits such
        # as [0.75, 0.24, 0.01] whose floating-point sum is not exactly 1.0.
        assert np.isclose(np.sum(self.train_val_test_split), 1.0), "train_val_test_split must sum to 1"

        # cfg.type is a string like "torch.bfloat16"; anything else falls back to float32.
        dtype_str = getattr(cfg, "type", "torch.float32")
        if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
            self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
        else:
            self.dtype = torch.float32
        print(f"Initializing dataset {name} with dtype {self.dtype}")

    def get_branches(self) -> List[str]:
        """Return every ROOT branch needed, mapping the MET pt alias to MET_met."""
        branches = [f"{p}_{f}" for p in self.particles for f in self.features]
        branches += self.globals
        branches = ["MET_met" if b == "MET_pt" else b for b in branches]
        return branches

    def process(self, max_workers: int = 128):
        """Read the ntuple in `self.chunks` chunks and convert each to graphs in parallel."""
        branches = self.get_branches()
        with uproot.open(f"{self.load_path}:{self.ttree}") as tree:
            available_branches = set(tree.keys())
            num_entries = tree.num_entries

        print(f"Getting branches: {branches}")

        step_size = math.ceil(num_entries / self.chunks)

        # Prepare chunk arguments for each chunk
        chunk_args_list = []
        for chunk_id, arrays in enumerate(
            uproot.iterate(
                f"{self.load_path}:{self.ttree}",
                expressions=[b for b in branches if b in available_branches],
                step_size=step_size,
                library="ak"
            )
        ):
            # Pass everything needed for chunk processing
            chunk_args_list.append((self.name, self.label, chunk_id, arrays, self.particles, self.features, branches, self.dtype, self.save_path))

        # Parallel processing of chunks
        with ThreadPoolExecutor(max_workers) as executor:
            futures = [executor.submit(process_chunk, args) for args in chunk_args_list]
            for future in as_completed(futures):
                # Re-raise any worker exception here instead of losing it.
                future.result()

        return

    def train_val_test(self):
        """Randomly split all chunk files into train/val/test and save each split."""
        split = self.train_val_test_split
        # Collect all graphs and labels first
        all_graphs = []
        all_labels = []
        files = [f"{self.save_path}/{self.name}_{chunk_id:02d}.bin" for chunk_id in range(self.chunks)]
        for f in files:
            graphs, label_dict = dgl.load_graphs(f)
            all_graphs.extend(graphs)
            # Assuming label_dict['label'] is shape (num_graphs_in_file,)
            all_labels.extend(label_dict['label'].tolist())

        # Per-graph uniform draw; relies on the module-level np seed for reproducibility.
        n = len(all_graphs)
        rand = np.random.rand(n)
        train_idx = rand < split[0]
        val_idx = (rand >= split[0]) & (rand < split[0] + split[1])
        test_idx = rand >= split[0] + split[1]

        train_graphs = [g for g, flag in zip(all_graphs, train_idx) if flag]
        val_graphs = [g for g, flag in zip(all_graphs, val_idx) if flag]
        test_graphs = [g for g, flag in zip(all_graphs, test_idx) if flag]

        train_labels = [l for l, flag in zip(all_labels, train_idx) if flag]
        val_labels = [l for l, flag in zip(all_labels, val_idx) if flag]
        test_labels = [l for l, flag in zip(all_labels, test_idx) if flag]

        train_labels = torch.tensor(train_labels)
        val_labels = torch.tensor(val_labels)
        test_labels = torch.tensor(test_labels)

        dgl.save_graphs(f"{self.save_path}/{self.name}_train.bin", train_graphs, {'label': train_labels})
        dgl.save_graphs(f"{self.save_path}/{self.name}_val.bin", val_graphs, {'label': val_labels})
        dgl.save_graphs(f"{self.save_path}/{self.name}_test.bin", test_graphs, {'label': test_labels})

        print(f"Train: {len(train_graphs)}, Val: {len(val_graphs)}, Test: {len(test_graphs)}")

    def load(self):
        """Process and split on demand, then return the three cached splits.

        Returns (train_graphs, train_labels, val_graphs, val_labels,
        test_graphs, test_labels).
        """
        # List of expected files
        files = [f"{self.save_path}/{self.name}_{chunk_id:02d}.bin" for chunk_id in range(self.chunks)]

        # Check if all files exist
        if not all(os.path.exists(f) for f in files):
            print("graphs not found, processing data...")
            self.process()
        else:
            print("graphs found, skipping processing...")

        # Check if train/val/test exist:
        files = [f"{self.save_path}/{self.name}_{split}.bin" for split in ["train", "val", "test"]]
        if not all(os.path.exists(f) for f in files):
            print("train/val/test split not found, splitting graphs...")
            self.train_val_test()
        else:
            print("train/val/test split found, skipping splitting...")

        print("loading graphs...")
        train_graphs, train_label_dict = dgl.load_graphs(f"{self.save_path}/{self.name}_train.bin")
        val_graphs, val_label_dict = dgl.load_graphs(f"{self.save_path}/{self.name}_val.bin")
        test_graphs, test_label_dict = dgl.load_graphs(f"{self.save_path}/{self.name}_test.bin")

        train_labels = train_label_dict['label']
        val_labels = val_label_dict['label']
        test_labels = test_label_dict['label']

        print(f"successfully loaded {self.name}")

        return train_graphs, train_labels, val_graphs, val_labels, test_graphs, test_labels
246
-
247
class GraphDataset(Dataset):
    """Thin map-style dataset over parallel lists of graphs and labels."""

    def __init__(self, graphs, labels):
        self.graphs = graphs
        self.labels = labels

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        return self.graphs[idx], self.labels[idx]
255
-
256
def get_dataset(cfg: DictConfig):
    """Build train/val/test GraphDataLoaders from every dataset listed in `cfg`."""

    # Seed all RNGs so the random split in train_val_test is reproducible.
    random.seed(cfg.random_seed)
    np.random.seed(cfg.random_seed)
    torch.manual_seed(cfg.random_seed)

    # Accumulators: split name -> (graphs, labels).
    pools = {key: ([], []) for key in ("train", "val", "test")}

    for entry in cfg.datasets:
        ds_name = entry['name']
        src_path = entry.get('load_path', f"{cfg.paths.data_dir}/{ds_name}.root")
        out_path = entry.get('save_path', f"{cfg.paths.save_dir}/")
        root_graph = Root_Graph(ds_name, entry.get('label'), src_path, out_path, cfg.root_graph)
        loaded = root_graph.load()
        # load() returns (train_g, train_l, val_g, val_l, test_g, test_l).
        for offset, key in enumerate(("train", "val", "test")):
            pools[key][0].extend(loaded[2 * offset])
            pools[key][1].extend(loaded[2 * offset + 1])

    train_dataset = GraphDataset(*pools["train"])
    val_dataset = GraphDataset(*pools["val"])
    test_dataset = GraphDataset(*pools["test"])

    batch_size = cfg.root_graph.batch_size

    # Only the training loader is shuffled.
    train_loader = GraphDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = GraphDataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = GraphDataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print("all data loaded successfully")
    return train_loader, val_loader, test_loader
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
physicsnemo/config.yaml CHANGED
@@ -42,9 +42,9 @@ architecture:
42
  out_dim: 1
43
 
44
  paths:
45
- data_dir: /global/cfs/projectdirs/trn007/lbl_atlas/data/stats_100K
46
- save_dir: /global/cfs/projectdirs/trn007/lbl_atlas/data/physicsnemo_graphs/stats_100K
47
- training_dir: ./training/
48
 
49
  datasets:
50
  - name: ttH_cp_even
@@ -54,7 +54,7 @@ datasets:
54
  load_path: ${paths.data_dir}/ttH_CPodd.root
55
  label: 1
56
 
57
- root_graph:
58
  ttree: output
59
  type: torch.bfloat16
60
  particles: ["jet", "ele", "mu", "ph", "MET"]
@@ -62,6 +62,6 @@ root_graph:
62
  globals: []
63
  weights: ""
64
  tracking: []
65
- chunks: 32
66
  batch_size: 8192
67
  train_val_test_split: [0.75, 0.24, 0.01]
 
42
  out_dim: 1
43
 
44
  paths:
45
+ data_dir: /global/cfs/projectdirs/atlas/joshua/hackathon_data/stats_100K
46
+ save_dir: /pscratch/sd/j/joshuaho/physicsnemo/graphs/stats_100K
47
+ training_dir: ./training_stats_100K/
48
 
49
  datasets:
50
  - name: ttH_cp_even
 
54
  load_path: ${paths.data_dir}/ttH_CPodd.root
55
  label: 1
56
 
57
+ root_dataset:
58
  ttree: output
59
  type: torch.bfloat16
60
  particles: ["jet", "ele", "mu", "ph", "MET"]
 
62
  globals: []
63
  weights: ""
64
  tracking: []
65
+ step_size: 1024
66
  batch_size: 8192
67
  train_val_test_split: [0.75, 0.24, 0.01]
physicsnemo/config_stats_all.yaml CHANGED
@@ -42,8 +42,8 @@ architecture:
42
  out_dim: 1
43
 
44
  paths:
45
- data_dir: /global/cfs/projectdirs/trn007/lbl_atlas/data/stats_all
46
- save_dir: /global/cfs/projectdirs/trn007/lbl_atlas/data/physicsnemo_graphs/stats_all
47
  training_dir: ./training_stats_all/
48
 
49
  datasets:
@@ -54,7 +54,7 @@ datasets:
54
  load_path: ${paths.data_dir}/ttH_CPodd.root
55
  label: 1
56
 
57
- root_graph:
58
  ttree: output
59
  type: torch.bfloat16
60
  particles: ["jet", "ele", "mu", "ph", "MET"]
@@ -62,6 +62,6 @@ root_graph:
62
  globals: []
63
  weights: ""
64
  tracking: []
65
- chunks: 128
66
  batch_size: 8192
67
  train_val_test_split: [0.75, 0.24, 0.01]
 
42
  out_dim: 1
43
 
44
  paths:
45
+ data_dir: /global/cfs/projectdirs/atlas/joshua/hackathon_data/stats_all
46
+ save_dir: /pscratch/sd/j/joshuaho/physicsnemo/graphs/stats_all
47
  training_dir: ./training_stats_all/
48
 
49
  datasets:
 
54
  load_path: ${paths.data_dir}/ttH_CPodd.root
55
  label: 1
56
 
57
+ root_dataset:
58
  ttree: output
59
  type: torch.bfloat16
60
  particles: ["jet", "ele", "mu", "ph", "MET"]
 
62
  globals: []
63
  weights: ""
64
  tracking: []
65
+ step_size: 1024
66
  batch_size: 8192
67
  train_val_test_split: [0.75, 0.24, 0.01]
physicsnemo/dataset.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uproot
3
+ import dgl
4
+ import torch
5
+ import numpy as np
6
+ import awkward as ak
7
+ from omegaconf import DictConfig
8
+ from typing import List
9
+ from concurrent.futures import ProcessPoolExecutor, as_completed
10
+ from tqdm import tqdm
11
+
12
+ import dataset_utils as utils
13
+
14
+ from torch.utils.data import Dataset
15
+
16
+ from dgl.dataloading import GraphDataLoader
17
+
18
class RootDataset:
    """Streams a ROOT ntuple into chunked DGL graph files and indexes the
    saved graphs lazily as (file, index) tuples."""

    def __init__(
        self,
        name: str,
        label: int,
        load_path: str,
        save_path: str,
        device: str,
        cfg: DictConfig
    ):
        """Store dataset identity, file paths, and graph-building options from `cfg`."""
        self.name = name
        self.label = label
        self.load_path = load_path
        self.save_path = save_path
        self.data = None
        self.device = device

        self.ttree = cfg.ttree
        self.particles = cfg.particles
        self.features = cfg.features
        self.globals = cfg.globals
        self.step_size = cfg.step_size
        self.batch_size = cfg.batch_size

        self.train_val_test_split = cfg.train_val_test_split
        # Tolerant comparison: exact float equality rejects valid splits such
        # as [0.75, 0.24, 0.01] whose floating-point sum is not exactly 1.0.
        assert np.isclose(np.sum(self.train_val_test_split), 1.0), "train_val_test_split must sum to 1"

        # cfg.type is a string like "torch.bfloat16"; anything else falls back to float32.
        dtype_str = getattr(cfg, "type", "torch.float32")
        if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
            self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
        else:
            self.dtype = torch.float32
        print(f"initializing dataset {name} with dtype {self.dtype}")

    def get_branches(self) -> List[str]:
        """Return every ROOT branch needed, mapping the MET pt alias to MET_met."""
        branches = [f"{p}_{f}" for p in self.particles for f in self.features]
        branches += self.globals
        branches = ["MET_met" if b == "MET_pt" else b for b in branches]
        return branches

    def process(self):
        """Convert the ntuple to graph chunk files, one worker process per chunk."""
        branches = self.get_branches()
        with uproot.open(f"{self.load_path}:{self.ttree}") as tree:
            available_branches = set(tree.keys())
            num_entries = tree.num_entries

        print(f"getting branches: {branches}")

        num_cpus = os.cpu_count()
        # tqdm's `total` expects an integer; np.ceil alone returns a float.
        total_chunks = int(np.ceil(num_entries / self.step_size))

        with ProcessPoolExecutor(max_workers=num_cpus) as executor:
            futures = []
            for chunk_id, arrays in enumerate(
                tqdm(
                    uproot.iterate(
                        f"{self.load_path}:{self.ttree}",
                        expressions=[b for b in branches if b in available_branches],
                        step_size=self.step_size,
                        library="ak"
                    ),
                    desc="loading root file",
                    total=total_chunks,
                    position=0,
                    leave=False
                )
            ):
                # NOTE(review): each chunk's awkward arrays are pickled to the
                # worker process at submit time; large step_size values make
                # this submission itself expensive — confirm acceptable.
                cfg = utils.ChunkConfig(
                    name=self.name,
                    label=self.label,
                    chunk_id=chunk_id,
                    batch_size=self.batch_size,
                    arrays=arrays,
                    particles=self.particles,
                    features=self.features,
                    branches=branches,
                    dtype=self.dtype,
                    save_path=self.save_path,
                )

                futures.append(executor.submit(utils.process_chunk, cfg))

            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    # Surface worker tracebacks but keep draining the rest.
                    import traceback
                    print("Exception in worker process:")
                    traceback.print_exception(type(e), e, e.__traceback__)
        return

    def load(self):
        """Ensure chunk files exist, then return (file, index) tuples per split.

        Returns (train_tuples, val_tuples, test_tuples); graphs are not
        deserialized here — GraphTupleDataset loads them on access.
        """
        with uproot.open(f"{self.load_path}:{self.ttree}") as tree:
            num_entries = tree.num_entries
            total_chunks = int(np.ceil(num_entries / self.step_size))

        chunk_files = [f"{self.save_path}/{self.name}_{chunk_id:04d}.bin" for chunk_id in range(total_chunks)]
        if not all(os.path.exists(f) for f in chunk_files):
            print("graphs not found. processing root file...")
            self.process()

        graph_tuple_list = []

        for chunk_id, f in enumerate(chunk_files):
            # Every chunk holds step_size graphs except (possibly) the last one.
            if chunk_id < total_chunks - 1:
                n_graphs = self.step_size
            else:
                n_graphs = num_entries - self.step_size * (total_chunks - 1)
            graph_tuple_list.extend((f, idx) for idx in range(n_graphs))

        # NOTE(review): the split is sequential in file order, not shuffled, so
        # neighboring events always land in the same split — confirm intended.
        split = self.train_val_test_split
        n_total = len(graph_tuple_list)
        n_train = int(split[0] * n_total)
        n_val = int(split[1] * n_total)

        train_tuples = graph_tuple_list[:n_train]
        val_tuples = graph_tuple_list[n_train:n_train + n_val]
        test_tuples = graph_tuple_list[n_train + n_val:]
        return train_tuples, val_tuples, test_tuples
139
+
140
class GraphDataset(Dataset):
    """Map-style dataset pairing in-memory graphs with their labels."""

    def __init__(self, graphs, labels):
        self.graphs = graphs
        self.labels = labels

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        return self.graphs[idx], self.labels[idx]

    def shuffle(self):
        # TODO: implement graph shuffling
        return self.graphs
151
+
152
class GraphTupleDataset:
    """Lazy dataset over (chunk_file, index_within_chunk) tuples.

    Graphs stay on disk until accessed. Because consecutive indices usually
    point into the same chunk file, the most recently loaded file is cached
    so sequential iteration deserializes each chunk once instead of once
    per graph.
    """

    def __init__(self, tuple_list):
        self.tuples = tuple_list
        # Single-file cache: path of the last chunk loaded plus its contents.
        self._cached_path = None
        self._cached_graphs = None
        self._cached_labels = None

    def __len__(self):
        return len(self.tuples)

    def __getitem__(self, idx):
        filepath, graph_idx = self.tuples[idx]
        if filepath != self._cached_path:
            self._cached_graphs, self._cached_labels = utils.load_graphs(filepath)
            self._cached_path = filepath
        return self._cached_graphs[graph_idx], self._cached_labels[graph_idx]
161
+
162
def get_dataset(cfg: DictConfig, device):
    """Assemble train/val/test GraphDataLoaders over lazily loaded graph chunks."""

    # Accumulate (file, index) tuples per split across all configured datasets.
    split_tuples = {"train": [], "val": [], "test": []}

    for entry in cfg.datasets:
        ds_name = entry['name']
        src_path = entry.get('load_path', f"{cfg.paths.data_dir}/{ds_name}.root")
        out_path = entry.get('save_path', f"{cfg.paths.save_dir}/")
        root_dataset = RootDataset(ds_name, entry.get('label'), src_path, out_path, device, cfg.root_dataset)
        train_part, val_part, test_part = root_dataset.load()
        split_tuples["train"].extend(train_part)
        split_tuples["val"].extend(val_part)
        split_tuples["test"].extend(test_part)

    train_dataset = GraphTupleDataset(split_tuples["train"])
    val_dataset = GraphTupleDataset(split_tuples["val"])
    test_dataset = GraphTupleDataset(split_tuples["test"])

    # Shared loader options; only the training loader is shuffled.
    loader_kwargs = dict(batch_size=cfg.root_dataset.batch_size, pin_memory=True, num_workers=2)
    train_loader = GraphDataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = GraphDataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_loader = GraphDataLoader(test_dataset, shuffle=False, **loader_kwargs)

    print("all data loaded successfully")
    print(f"train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")
    return train_loader, val_loader, test_loader
physicsnemo/dataset_utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import math
from dataclasses import dataclass
from typing import List, Any, Dict

import dgl
import numpy as np
import awkward as ak
import torch
from tqdm import tqdm
8
+
9
@dataclass
class ChunkConfig:
    """Everything process_chunk needs for one chunk, bundled into a single
    picklable object so it can be shipped to a worker process."""

    name: str           # dataset name, used in the output filename
    label: int          # numeric class label; used as the fill value of torch.full
    chunk_id: int       # zero-based chunk index, used in the output filename
    batch_size: int
    arrays: List[Any]   # this chunk's awkward record array
    particles: List[Any]
    features: List[Any]
    branches: List[Any]
    dtype: torch.dtype
    save_path: str
21
+
22
def process_chunk(cfg: ChunkConfig):
    """Turn one chunk of awkward arrays into DGL graphs and write them to disk.

    The output file is `{save_path}/{name}_{chunk_id:04d}.bin` with the
    per-graph labels stored under the 'labels' key (see load_graphs).
    """
    n_entries = len(cfg.arrays)

    # Materialize every requested branch, deriving the ones absent from the file.
    arrays_ordered = {}
    for b in cfg.branches:
        if b in cfg.arrays.fields:
            arrays_ordered[b] = cfg.arrays[b]
        elif b.endswith("_energy"):
            # Massless approximation: E = pt * cosh(eta).
            prefix = b[:-7]
            pt_name = f"{prefix}_pt"
            if prefix == "MET":
                pt_name = f"{prefix}_met"
            eta_name = f"{prefix}_eta"
            arrays_ordered[b] = cfg.arrays[pt_name] * np.cosh(cfg.arrays[eta_name])
        elif "node_type" in b:
            # Encode the particle species index as a per-node feature.
            prefix = b[:-10]
            pt_name = f"{prefix}_pt"
            if prefix == "MET":
                pt_name = f"{prefix}_met"
            index = cfg.particles.index(prefix)
            arrays_ordered[b] = ak.ones_like(cfg.arrays[pt_name]) * index
        else:
            # Unknown branch: zero-fill with the per-particle multiplicity.
            prefix = b.split("_")[0]
            pt_name = f"{prefix}_pt"
            if prefix == "MET":
                pt_name = f"{prefix}_met"
            arrays_ordered[b] = ak.zeros_like(cfg.arrays[pt_name])

    graphs = []

    for i in range(n_entries):
        node_features_list = []
        for p in cfg.particles:
            feats = []
            for f in cfg.features:
                branch = f"{p}_{f}"
                if p == "MET" and f == "pt":
                    branch = "MET_met"
                value = ak.to_numpy(arrays_ordered[branch][i])
                feats.append(value)
            # Guard the empty feature list too, not just empty particle counts,
            # so an empty cfg.features cannot raise IndexError on feats[0].
            if not feats or len(feats[0]) == 0:
                continue
            node_array = np.stack(feats, axis=1)
            node_features_list.append(node_array)
        if node_features_list:
            node_features = np.concatenate(node_features_list, axis=0)
        else:
            node_features = np.empty((0, len(cfg.features)))
        graphs.append(make_graph(node_features, dtype=cfg.dtype))

    labels = torch.full((len(graphs),), cfg.label, dtype=cfg.dtype)
    save_graphs(f"{cfg.save_path}/{cfg.name}_{cfg.chunk_id:04d}.bin", graphs, {'labels': labels})
    return
75
+
76
def save_graphs(f: str, g: List[dgl.DGLGraph], metadata: Dict) -> None:
    """Persist `g` and its metadata dict to path `f` via DGL's serializer."""
    dgl.save_graphs(f, g, metadata)
78
+
79
def load_graphs(f: str):
    """Load a chunk file written by save_graphs.

    Returns (graphs, labels) where labels is the tensor stored under the
    'labels' metadata key.
    """
    graphs, metadata = dgl.load_graphs(f)
    labels = metadata['labels']
    return graphs, labels
82
+
83
# Memo of flattened all-pairs index tensors, keyed by node count. Node counts
# repeat heavily across events, so each size is built only once.
src_dst_cache = {}


def get_src_dst(num_nodes):
    """Return flattened (src, dst) index tensors for a fully connected graph
    on `num_nodes` nodes (self-loops included), cached per size."""
    cached = src_dst_cache.get(num_nodes)
    if cached is None:
        idx = torch.arange(num_nodes)
        src, dst = torch.meshgrid(idx, idx, indexing='ij')
        cached = (src.flatten(), dst.flatten())
        src_dst_cache[num_nodes] = cached
    return cached
89
+
90
@torch.jit.script
def compute_edge_features(eta, phi, src, dst):
    """Return per-edge [dR, deta, dphi] from node eta/phi and edge endpoints.

    dphi is wrapped into [-pi, pi); dR = sqrt(deta^2 + dphi^2).
    """
    deta = eta[src] - eta[dst]
    dphi = phi[src] - phi[dst]
    # Use math.pi rather than np.pi: the math module is explicitly supported
    # by TorchScript, while numpy-module attribute access is not guaranteed
    # to resolve inside a @torch.jit.script function.
    dphi = torch.remainder(dphi + math.pi, 2 * math.pi) - math.pi
    dR = torch.sqrt(deta ** 2 + dphi ** 2)
    edge_features = torch.stack([dR, deta, dphi], dim=1)
    return edge_features
98
+
99
# TODO: normalize all features
def make_graph(node_features: np.ndarray, dtype=torch.float32):
    """Build a fully connected DGL graph (self-loops included) from node features.

    Column 1 of `node_features` is taken as eta and column 2 as phi when
    computing the [dR, deta, dphi] edge features; `g.globals` holds the node
    count so batching can recover per-graph sizes.
    """
    node_features = torch.tensor(node_features, dtype=dtype)
    num_nodes = node_features.shape[0]
    if num_nodes == 0:
        # Empty event: keep the feature tensors correctly shaped.
        g = dgl.graph(([], []))
        g.ndata['features'] = node_features
        g.edata['features'] = torch.empty((0, 3), dtype=dtype)
        g.globals = torch.tensor([0], dtype=dtype)
        return g

    # get_src_dst already returns flattened tensors — the extra .flatten()
    # calls the old code made were redundant no-ops.
    src, dst = get_src_dst(num_nodes)
    g = dgl.graph((src, dst))
    g.ndata['features'] = node_features

    eta = node_features[:, 1]
    phi = node_features[:, 2]
    g.edata['features'] = compute_edge_features(eta, phi, src, dst)

    g.globals = torch.tensor([num_nodes], dtype=dtype)
    return g
physicsnemo/models/Edge_Network.py ADDED
File without changes
physicsnemo/{MeshGraphNet.py → models/MeshGraphNet.py} RENAMED
File without changes
physicsnemo/setup/Dockerfile CHANGED
@@ -21,5 +21,3 @@ RUN pip install --no-cache-dir mpi4py jupyter uproot
21
 
22
  # (Optional) Expose Jupyter port
23
  EXPOSE 8888
24
-
25
-
 
21
 
22
  # (Optional) Expose Jupyter port
23
  EXPOSE 8888
 
 
physicsnemo/train.py CHANGED
@@ -2,8 +2,9 @@ import time, os
2
 
3
  start = time.time()
4
  import torch
 
5
  from dgl.dataloading import GraphDataLoader
6
- from torch.cuda.amp import GradScaler
7
  import numpy as np
8
  import hydra
9
  from omegaconf import DictConfig
@@ -13,15 +14,27 @@ from physicsnemo.launch.logging import (
13
  )
14
  from physicsnemo.launch.utils import load_checkpoint, save_checkpoint
15
  from physicsnemo.distributed.manager import DistributedManager
16
- from Dataset import get_dataset
17
  import json
18
 
 
 
19
  from sklearn.metrics import roc_auc_score
20
 
21
- import MeshGraphNet
22
 
23
  import torch.nn.functional as F
24
 
 
 
 
 
 
 
 
 
 
 
25
  def weighted_bce(input, target, device=None, weights=None):
26
  """
27
  Compute a weighted and label-normalized binary cross entropy (BCE) loss.
@@ -47,12 +60,7 @@ def weighted_bce(input, target, device=None, weights=None):
47
  target = target.squeeze(-1)
48
 
49
  if weights is None:
50
- weights = torch.ones_like(target)
51
-
52
- if device is not None:
53
- input = input.to(device)
54
- target = target.to(device)
55
- weights = weights.to(device)
56
 
57
  # Compute per-element BCE loss (no reduction)
58
  loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
@@ -82,17 +90,17 @@ class MGNTrainer:
82
 
83
  params = {}
84
 
85
- norm_type = {"features": "normal", "labels": "normal"}
86
-
87
- self.dataloader, self.valloader, self.testloader = get_dataset(cfg)
88
 
89
- dtype_str = getattr(cfg.root_graph, "type", "torch.float32")
90
  if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
91
  self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
92
  else:
93
  self.dtype = torch.float32
94
 
95
- nodes_features = cfg.root_graph.features
96
  edges_features = ["dR", "deta", "dphi"]
97
  global_features = ["num_nodes"]
98
 
@@ -173,10 +181,9 @@ class MGNTrainer:
173
  loss: loss value.
174
 
175
  """
176
- graph = graph.to(self.device)
177
  self.optimizer.zero_grad()
178
  pred = self.model(graph.ndata["features"], graph.edata["features"], graph)
179
- loss = weighted_bce(pred, label, device=self.device)
180
  self.backward(loss)
181
  return loss
182
 
@@ -192,7 +199,6 @@ class MGNTrainer:
192
  Returns:
193
  loss (Tensor): The computed loss value (scalar).
194
  """
195
-
196
  predictions = []
197
  labels = []
198
 
@@ -232,6 +238,9 @@ def do_training(cfg: DictConfig):
232
  cfg: Dictionary of parameters.
233
 
234
  """
 
 
 
235
 
236
  # initialize distributed manager
237
  DistributedManager.initialize()
@@ -244,6 +253,20 @@ def do_training(cfg: DictConfig):
244
  # initialize trainer
245
  trainer = MGNTrainer(logger, cfg, dist)
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  # training loop
248
  start = time.time()
249
  logger.info("Training started...")
 
2
 
3
  start = time.time()
4
  import torch
5
+ from torch.nn.parallel import DistributedDataParallel
6
  from dgl.dataloading import GraphDataLoader
7
+ from torch.amp import GradScaler
8
  import numpy as np
9
  import hydra
10
  from omegaconf import DictConfig
 
14
  )
15
  from physicsnemo.launch.utils import load_checkpoint, save_checkpoint
16
  from physicsnemo.distributed.manager import DistributedManager
17
+ from dataset import get_dataset
18
  import json
19
 
20
+ import random
21
+
22
  from sklearn.metrics import roc_auc_score
23
 
24
+ import models.MeshGraphNet as MeshGraphNet
25
 
26
  import torch.nn.functional as F
27
 
28
+ def bce(input, target, device=None, weights=None):
29
+ if input.shape != target.shape:
30
+ if input.shape[-1] == 1 and input.shape[:-1] == target.shape:
31
+ input = input.squeeze(-1)
32
+ elif target.shape[-1] == 1 and target.shape[:-1] == input.shape:
33
+ target = target.squeeze(-1)
34
+
35
+ loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
36
+ return torch.mean(loss)
37
+
38
  def weighted_bce(input, target, device=None, weights=None):
39
  """
40
  Compute a weighted and label-normalized binary cross entropy (BCE) loss.
 
60
  target = target.squeeze(-1)
61
 
62
  if weights is None:
63
+ weights = torch.ones_like(target).to(device)
 
 
 
 
 
64
 
65
  # Compute per-element BCE loss (no reduction)
66
  loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
 
90
 
91
  params = {}
92
 
93
+ start = time.time()
94
+ self.dataloader, self.valloader, self.testloader = get_dataset(cfg, self.device)
95
+ print(f"total time loading dataset: {time.time() - start:.2f} seconds")
96
 
97
+ dtype_str = getattr(cfg.root_dataset, "type", "torch.float32")
98
  if isinstance(dtype_str, str) and dtype_str.startswith("torch."):
99
  self.dtype = getattr(torch, dtype_str.split(".")[-1], torch.float32)
100
  else:
101
  self.dtype = torch.float32
102
 
103
+ nodes_features = cfg.root_dataset.features
104
  edges_features = ["dR", "deta", "dphi"]
105
  global_features = ["num_nodes"]
106
 
 
181
  loss: loss value.
182
 
183
  """
 
184
  self.optimizer.zero_grad()
185
  pred = self.model(graph.ndata["features"], graph.edata["features"], graph)
186
+ loss = bce(pred, label, device=self.device)
187
  self.backward(loss)
188
  return loss
189
 
 
199
  Returns:
200
  loss (Tensor): The computed loss value (scalar).
201
  """
 
202
  predictions = []
203
  labels = []
204
 
 
238
  cfg: Dictionary of parameters.
239
 
240
  """
241
+ random.seed(cfg.random_seed)
242
+ np.random.seed(cfg.random_seed)
243
+ torch.manual_seed(cfg.random_seed)
244
 
245
  # initialize distributed manager
246
  DistributedManager.initialize()
 
253
  # initialize trainer
254
  trainer = MGNTrainer(logger, cfg, dist)
255
 
256
+ if dist.distributed:
257
+ ddps = torch.cuda.Stream()
258
+ with torch.cuda.stream(ddps):
259
+ trainer.model = DistributedDataParallel(
260
+ trainer.model,
261
+ device_ids=[dist.local_rank], # Set the device_id to be
262
+ # the local rank of this process on
263
+ # this node
264
+ output_device=dist.device,
265
+ broadcast_buffers=dist.broadcast_buffers,
266
+ find_unused_parameters=dist.find_unused_parameters,
267
+ )
268
+ torch.cuda.current_stream().wait_stream(ddps)
269
+
270
  # training loop
271
  start = time.time()
272
  logger.info("Training started...")