updated training and inference script

Browse files

Files changed (3) hide show

root_gnn_dgl/scripts/inference.py +144 -130
root_gnn_dgl/scripts/prep_data.py +6 -6
root_gnn_dgl/scripts/training_script.py +68 -8

root_gnn_dgl/scripts/inference.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import sys
-import os
-file_path = os.getcwd()
 sys.path.append(file_path)
 import argparse
 import yaml
 import torch
 import dgl
@@ -12,26 +12,15 @@ from dgl.data import DGLDataset
 from dgl.dataloading import GraphDataLoader
 from torch.utils.data import SubsetRandomSampler, SequentialSampler
-def my_error_handler(level, abort, location, msg):
-    # Log the error message to a file instead of printing
-    with open("error_log.txt", "a") as log_file:
-        log_file.write(f"Error in {location}: {msg}\n")
-    # Optionally, print the error message to the console
-    # print(f"Error in {location}: {msg}")
-    # Decide whether to abort based on the error level
-    if abort:
-        raise RuntimeError(f"Fatal error in {location}: {msg}")
 class CustomPreBatchedDataset(DGLDataset):
-    def __init__(self, start_dataset, batch_size, mask_fn=None, drop_last=False, shuffle=False, **kwargs):
         self.start_dataset = start_dataset
         self.batch_size = batch_size
         self.mask_fn = mask_fn or (lambda x: torch.ones(len(x), dtype=torch.bool))
         self.drop_last = drop_last
         self.shuffle = shuffle
         super().__init__(name=start_dataset.name + '_custom_prebatched', save_dir=start_dataset.save_dir)
     def process(self):
@@ -39,18 +28,29 @@ class CustomPreBatchedDataset(DGLDataset):
         indices = torch.arange(len(self.start_dataset))[mask]
         print(f"Number of elements after masking: {len(indices)}")  # Debugging print
         if self.shuffle:
-            sampler = SubsetRandomSampler(indices)
         else:
-            sampler = SequentialSampler(indices)
         self.dataloader = GraphDataLoader(
-            self.start_dataset,
-            sampler=sampler,
-            batch_size=self.batch_size,
             drop_last=self.drop_last
         )
-        print(f"Batch size set in DataLoader: {self.batch_size}")  # Debugging print
     def __getitem__(self, idx):
         if isinstance(idx, int):
@@ -60,7 +60,15 @@ class CustomPreBatchedDataset(DGLDataset):
         return next(iter(dloader))
     def __len__(self):
-        return len(self.start_dataset)
 def include_config(conf):
     if 'include' in conf:
@@ -76,28 +84,44 @@ def load_config(config_file):
     return conf
 def main():
     parser = argparse.ArgumentParser()
     add_arg = parser.add_argument
-    add_arg('--config', type=str, required=True)
     add_arg('--target', type=str, required=True)
     add_arg('--destination', type=str, default='')
     add_arg('--chunkno', type=int, default=0)
     add_arg('--chunks', type=int, default=1)
     add_arg('--write', action='store_true')
     add_arg('--ckpt', type=int, default=-1)
     add_arg('--clobber', action='store_true')
     add_arg('--tree', type=str, default='')
-    add_arg('--branch_name', type=str, default='score')
     args = parser.parse_args()
-    config = load_config(args.config)
     if args.destination == '':
-        args.destination = os.path.join(config['Training_Directory'], 'inference/', os.path.split(args.target)[1])
     else:
-        args.destination = args.destination
-    if not args.write:
-        args.destination = args.destination.replace('.root', '') + f'_chunk{args.chunkno}.npz'
     if os.path.exists(args.destination):
         print(f'File {args.destination} already exists.')
         if args.clobber:
@@ -137,7 +161,7 @@ def main():
     dset_config['args']['selections'] = []
     dset_config['args']['save_dir'] = os.path.dirname(args.destination)
     if args.tree != '':
         dset_config['args']['tree_name'] = args.tree
@@ -152,9 +176,13 @@ def main():
     batch_size = config['Training']['batch_size']
     lstart = time.time()
-    loader = CustomPreBatchedDataset(dset, batch_size)
     loader.process()
-    # loader = dataset.PreBatchedDataset(dset, batch_size, shuffle=False, drop_last=False, save_to_disk=False, chunks = 1, num_workers=0)
     lend = time.time()
     print('Loader finished in {:.2f} seconds'.format(lend - lstart))
     sample_graph, _, _, global_sample = loader[0]
@@ -162,70 +190,64 @@ def main():
     print('dset length =', len(dset))
     print('loader length =', len(loader))
-    model = utils.buildFromConfig(config['Model'], {'sample_graph' : sample_graph, 'sample_global': global_sample}).to(device)
-    if args.ckpt < 0:
-        ep, checkpoint = utils.get_last_epoch(config, args.ckpt, device=device)
-    else:
-        ep, checkpoint = utils.get_specific_epoch(config, args.ckpt, device=device)
-    #Bad filler for models which were compiled. Have to remove this prefix.
-    mds_copy = {}
-    for key in checkpoint['model_state_dict'].keys():
-        newkey = key.replace('module.', '')
-        newkey = newkey.replace('_orig_mod.', '')
-        mds_copy[newkey] = checkpoint['model_state_dict'][key]
-    model.load_state_dict(mds_copy)
-    model.eval()
-    end = time.time()
-    print('Model and dataset finished in {:.2f} seconds'.format(end - start))
-    print('Starting inference')
-    start = time.time()
-    finish_fn = torch.nn.Sigmoid()
-    if 'Loss' in config:
-        finish_fn = utils.buildFromConfig(config['Loss']['finish'])
-    scores = []
-    labels = []
-    tracking_info = []
-    ibatch = 0
-    for batch, label, track, globals in loader.dataloader:
-        batch = batch.to(device)
-        pred = model(batch, globals.to(device))
-        ibatch += 1
-        # scores.append(finish_fn(pred).detach().cpu().numpy())
-        if (finish_fn.__class__.__name__ == "ContrastiveClusterFinish"):
-            scores.append(pred.detach().cpu().numpy())
-        else:
-            scores.append(finish_fn(pred).detach().cpu().numpy())
-        labels.append(label.detach().cpu().numpy())
-        tracking_info.append(track.detach().cpu().numpy())
-    # for batch, label, track, globals in loader:
-    #     batch = batch.to(device)
-    #     pred = model(batch, globals.to(device))
-    #     print(f'Batch size: {batch.batch_size if hasattr(batch, "batch_size") else "Unavailable"}')
-    #     print(f'Prediction shape: {pred.shape}')
-    #     ibatch += 1
-    #     scores.append(finish_fn(pred).detach().cpu().numpy())
-    #     labels.append(label.detach().cpu().numpy())
-    #     tracking_info.append(track.detach().cpu().numpy())
-    #     exit()
-    score_size = scores[0].shape[1]
-    scores = np.concatenate(scores)
-    labels = np.concatenate(labels)
-    tracking_info = np.concatenate(tracking_info)
-    end = time.time()
-    print('Inference finished in {:.2f} seconds'.format(end - start))
     if args.write:
-        # ROOT.SetErrorHandler(my_error_handler)
-        ROOT.gErrorIgnoreLevel = ROOT.kFatal
-        # ROOT.gSystem.RedirectOutput("/dev/null", "w")
         # Open the original ROOT file
         infile = ROOT.TFile.Open(args.target)
         tree = infile.Get(dset_config['args']['tree_name'])
@@ -236,54 +258,46 @@ def main():
         # Create a new ROOT file to write the modified tree
         outfile = ROOT.TFile.Open(args.destination, 'RECREATE')
-        # Clone the original tree, including data
-        outtree = tree.CloneTree(0)  # Clone all entries
-        # Determine if scores is a list of single values or vectors
-        from ROOT import std
-        if isinstance(scores[0], (list, tuple, np.ndarray)):  # Check if scores contains vectors
-            # Create a new branch for scores as a vector of floats
-            scores_branch_vec = std.vector('float')()
-            outtree.Branch(args.branch_name, scores_branch_vec)
-            is_vector = True
-        else:  # Scores contains single values
-            # Create a new branch for scores as a single float
-            score_branch_arr = array('f', [0])
-            outtree.Branch(args.branch_name, score_branch_arr, f'{args.branch_name}/F')
-            is_vector = False
-        # Write scores to the new branch
-        print(f'Writing {len(scores)} scores to tree')
         for i in range(tree.GetEntries()):
             tree.GetEntry(i)
-            if is_vector:
-                # Clear the vector
-                scores_branch_vec.clear()
-                # Add all elements from scores[i] to the vector
-                for value in scores[i]:
-                    scores_branch_vec.push_back(float(value))  # Use push_back to add elements one by one
-            else:
-                # Fill the score branch with the current single score
-                score_branch_arr[0] = float(scores[i])  # Ensure the value is a float
-            # Fill the output tree with all branches, including the new scores branch
             outtree.Fill()
         # Write the modified tree to the new file
         print(f'Writing to file {args.destination}')
         print(f'Input entries: {tree.GetEntries()}, Output entries: {outtree.GetEntries()}')
         outtree.Write()
         outfile.Close()
         infile.Close()
     else:
         os.makedirs(os.path.split(args.destination)[0], exist_ok=True)
-        np.savez(args.destination, scores=scores, labels=labels, tracking_info=tracking_info)
 if __name__ == '__main__':
-    main()

 import sys
+file_path = "/global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl"
 sys.path.append(file_path)
+import os
 import argparse
 import yaml
+import gc
 import torch
 import dgl
 from dgl.dataloading import GraphDataLoader
 from torch.utils.data import SubsetRandomSampler, SequentialSampler
 class CustomPreBatchedDataset(DGLDataset):
+    def __init__(self, start_dataset, batch_size, chunkno=0, chunks=1, mask_fn=None, drop_last=False, shuffle=False, **kwargs):
         self.start_dataset = start_dataset
         self.batch_size = batch_size
         self.mask_fn = mask_fn or (lambda x: torch.ones(len(x), dtype=torch.bool))
         self.drop_last = drop_last
         self.shuffle = shuffle
+        self.chunkno = chunkno
+        self.chunks = chunks
         super().__init__(name=start_dataset.name + '_custom_prebatched', save_dir=start_dataset.save_dir)
     def process(self):
         indices = torch.arange(len(self.start_dataset))[mask]
         print(f"Number of elements after masking: {len(indices)}")  # Debugging print
+        # --- CHUNK SPLITTING ---
+        total = len(indices)
+        if self.chunks == 1:
+            chunk_indices = indices
+            print(f"Chunks=1, using all {total} indices.")
+        else:
+            chunk_size = (total + self.chunks - 1) // self.chunks
+            start = self.chunkno * chunk_size
+            end = min((self.chunkno + 1) * chunk_size, total)
+            chunk_indices = indices[start:end]
+            print(f"Working on chunk {self.chunkno}/{self.chunks}: indices {start}:{end} (total {len(chunk_indices)})")
         if self.shuffle:
+            sampler = SubsetRandomSampler(chunk_indices)
         else:
+            sampler = SequentialSampler(chunk_indices)
         self.dataloader = GraphDataLoader(
+            self.start_dataset,
+            sampler=sampler,
+            batch_size=self.batch_size,
             drop_last=self.drop_last
         )
     def __getitem__(self, idx):
         if isinstance(idx, int):
         return next(iter(dloader))
     def __len__(self):
+        mask = self.mask_fn(self.start_dataset)
+        indices = torch.arange(len(self.start_dataset))[mask]
+        total = len(indices)
+        if self.chunks == 1:
+            return total
+        chunk_size = (total + self.chunks - 1) // self.chunks
+        start = self.chunkno * chunk_size
+        end = min((self.chunkno + 1) * chunk_size, total)
+        return end - start
 def include_config(conf):
     if 'include' in conf:
     return conf
 def main():
     parser = argparse.ArgumentParser()
     add_arg = parser.add_argument
+    add_arg('--config', type=str, nargs='+', required=True, help="List of config files")
     add_arg('--target', type=str, required=True)
     add_arg('--destination', type=str, default='')
     add_arg('--chunkno', type=int, default=0)
     add_arg('--chunks', type=int, default=1)
     add_arg('--write', action='store_true')
     add_arg('--ckpt', type=int, default=-1)
+    add_arg('--var', type=str, default='Test_AUC')
+    add_arg('--mode', type=str, default='max')
     add_arg('--clobber', action='store_true')
     add_arg('--tree', type=str, default='')
+    add_arg('--branch_name', type=str, nargs='+', required=True, help="List of branch names corresponding to configs")
     args = parser.parse_args()
+    if(len(args.config) != len(args.branch_name)):
+        print(f"configs and branch names do not match")
+        return
+    config = load_config(args.config[0])
+    # --- OUTPUT DESTINATION LOGIC ---
     if args.destination == '':
+        base_dest = os.path.join(config['Training_Directory'], 'inference/', os.path.split(args.target)[1])
+    else:
+        base_dest = args.destination
+    base_dest = base_dest.replace('.root', '').replace('.npz', '')
+    if args.chunks > 1:
+        chunked_dest = f"{base_dest}_chunk{args.chunkno}"
     else:
+        chunked_dest = base_dest
+    chunked_dest += '.root' if args.write else '.npz'
+    args.destination = chunked_dest
+    # --- FILE EXISTENCE CHECK ---
     if os.path.exists(args.destination):
         print(f'File {args.destination} already exists.')
         if args.clobber:
     dset_config['args']['selections'] = []
     dset_config['args']['save_dir'] = os.path.dirname(args.destination)
     if args.tree != '':
         dset_config['args']['tree_name'] = args.tree
     batch_size = config['Training']['batch_size']
     lstart = time.time()
+    loader = CustomPreBatchedDataset(
+        dset,
+        batch_size,
+        chunkno=args.chunkno,
+        chunks=args.chunks
+    )
     loader.process()
     lend = time.time()
     print('Loader finished in {:.2f} seconds'.format(lend - lstart))
     sample_graph, _, _, global_sample = loader[0]
     print('dset length =', len(dset))
     print('loader length =', len(loader))
+    all_scores = {}
+    all_labels = {}
+    all_tracking = {}
+    with torch.no_grad():
+        for config_file, branch in zip(args.config, args.branch_name):
+            config = load_config(config_file)
+            model = utils.buildFromConfig(config['Model'], {'sample_graph' : sample_graph, 'sample_global': global_sample}).to(device)
+            if args.ckpt < 0:
+                ep, checkpoint = utils.get_best_epoch(config, var=args.var, mode='max', device=device)
+            else:
+                ep, checkpoint = utils.get_specific_epoch(config, args.ckpt, device=device)
+            # Remove distributed/compiled prefixes if present
+            mds_copy = {}
+            for key in checkpoint['model_state_dict'].keys():
+                newkey = key.replace('module.', '')
+                newkey = newkey.replace('_orig_mod.', '')
+                mds_copy[newkey] = checkpoint['model_state_dict'][key]
+            model.load_state_dict(mds_copy)
+            model.eval()
+            end = time.time()
+            print('Model and dataset finished in {:.2f} seconds'.format(end - start))
+            print('Starting inference')
+            start = time.time()
+            finish_fn = torch.nn.Sigmoid()
+            if 'Loss' in config:
+                finish_fn = utils.buildFromConfig(config['Loss']['finish'])
+            scores = []
+            labels = []
+            tracking_info = []
+            ibatch = 0
+            for batch, label, track, globals in loader.dataloader:
+                batch = batch.to(device)
+                pred = model(batch, globals.to(device))
+                ibatch += 1
+                if (finish_fn.__class__.__name__ == "ContrastiveClusterFinish"):
+                    scores.append(pred.detach().cpu().numpy())
+                else:
+                    scores.append(finish_fn(pred).detach().cpu().numpy())
+                labels.append(label.detach().cpu().numpy())
+                tracking_info.append(track.detach().cpu().numpy())
+            score_size = scores[0].shape[1] if len(scores[0].shape) > 1 else 1
+            scores = np.concatenate(scores)
+            labels = np.concatenate(labels)
+            tracking_info = np.concatenate(tracking_info)
+            end = time.time()
+            print('Inference finished in {:.2f} seconds'.format(end - start))
+            all_scores[branch] = scores
+            all_labels[branch] = labels
+            all_tracking[branch] = tracking_info
     if args.write:
+        from ROOT import std
         # Open the original ROOT file
         infile = ROOT.TFile.Open(args.target)
         tree = infile.Get(dset_config['args']['tree_name'])
         # Create a new ROOT file to write the modified tree
         outfile = ROOT.TFile.Open(args.destination, 'RECREATE')
+        # Clone the original tree structure
+        outtree = tree.CloneTree(0)
+        # Create branches for all scores
+        branch_vectors = {}
+        for branch, scores in all_scores.items():
+            if isinstance(scores[0], (list, tuple, np.ndarray)) and len(scores[0]) > 1:
+                # Create a new branch for vectors
+                branch_vectors[branch] = std.vector('float')()
+                outtree.Branch(branch, branch_vectors[branch])
+            else:
+                # Create a new branch for single floats
+                branch_vectors[branch] = array('f', [0])
+                outtree.Branch(branch, branch_vectors[branch], f'{branch}/F')
+        # Fill the tree
         for i in range(tree.GetEntries()):
             tree.GetEntry(i)
+            for branch, scores in all_scores.items():
+                branch_data = branch_vectors[branch]
+                if isinstance(branch_data, array):  # Check if it's a single float array
+                    branch_data[0] = float(scores[i])
+                else:  # Assume it's a std::vector<float>
+                    branch_data.clear()
+                    for value in scores[i]:
+                        branch_data.push_back(float(value))
             outtree.Fill()
         # Write the modified tree to the new file
         print(f'Writing to file {args.destination}')
         print(f'Input entries: {tree.GetEntries()}, Output entries: {outtree.GetEntries()}')
+        print(f'Wrote scores to {args.branch_name}')
         outtree.Write()
         outfile.Close()
         infile.Close()
     else:
         os.makedirs(os.path.split(args.destination)[0], exist_ok=True)
+        np.savez(args.destination, scores=all_scores, labels=all_labels, tracking_info=all_tracking)
 if __name__ == '__main__':
+    main()

root_gnn_dgl/scripts/prep_data.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import sys
-import os
-file_path = os.getcwd()
 sys.path.append(file_path)
 import root_gnn_base.utils as utils
@@ -15,6 +14,7 @@ def main():
     add_arg('--dataset', type=str, required=True)
     add_arg('--chunk', type=int, default=0)
     add_arg('--shuffle_mode', action='store_true', help='Shuffle the dataset before training.')
     args = parser.parse_args()
     config = utils.load_config(args.config)
@@ -32,12 +32,12 @@ def main():
         fold_conf = dset_config["folding"]
         print(f"shuffle_chunks = {shuffle_chunks}, args.chunk = {args.chunk}, padding_mode = {padding_mode}")
         if dset_config["class"] == "LazyMultiLabelDataset":
-            LazyPreBatchedDataset(start_dataset = dset, batch_size = batch_size, mask_fn = utils.fold_selection(fold_conf, "train"), suffix = utils.fold_selection_name(fold_conf, "train"), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode)
-            LazyPreBatchedDataset(start_dataset = dset, batch_size = batch_size, mask_fn = utils.fold_selection(fold_conf, "test"),  suffix = utils.fold_selection_name(fold_conf, 'test'), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode)
         else:
-            PreBatchedDataset(dset, batch_size, utils.fold_selection(fold_conf, "train"), suffix = utils.fold_selection_name(fold_conf, "train"), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode)
-            PreBatchedDataset(dset, batch_size, utils.fold_selection(fold_conf, "test"),  suffix = utils.fold_selection_name(fold_conf, 'test'), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode)
 if __name__ == "__main__":
     main()

 import sys
+file_path = "/global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl"
 sys.path.append(file_path)
 import root_gnn_base.utils as utils
     add_arg('--dataset', type=str, required=True)
     add_arg('--chunk', type=int, default=0)
     add_arg('--shuffle_mode', action='store_true', help='Shuffle the dataset before training.')
+    add_arg('--drop_last', action='store_false', help='Set drop_last to False if the flag is provided. Defaults to True.')
     args = parser.parse_args()
     config = utils.load_config(args.config)
         fold_conf = dset_config["folding"]
         print(f"shuffle_chunks = {shuffle_chunks}, args.chunk = {args.chunk}, padding_mode = {padding_mode}")
         if dset_config["class"] == "LazyMultiLabelDataset":
+            LazyPreBatchedDataset(start_dataset = dset, batch_size = batch_size, mask_fn = utils.fold_selection(fold_conf, "train"), suffix = utils.fold_selection_name(fold_conf, "train"), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode, drop_last=args.drop_last)
+            LazyPreBatchedDataset(start_dataset = dset, batch_size = batch_size, mask_fn = utils.fold_selection(fold_conf, "test"),  suffix = utils.fold_selection_name(fold_conf, 'test'), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode, drop_last=args.drop_last)
         else:
+            PreBatchedDataset(dset, batch_size, utils.fold_selection(fold_conf, "train"), suffix = utils.fold_selection_name(fold_conf, "train"), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode, drop_last=args.drop_last)
+            PreBatchedDataset(dset, batch_size, utils.fold_selection(fold_conf, "test"),  suffix = utils.fold_selection_name(fold_conf, 'test'), chunks = shuffle_chunks, chunkno = args.chunk, padding_mode = padding_mode, drop_last=args.drop_last)
 if __name__ == "__main__":
     main()

root_gnn_dgl/scripts/training_script.py CHANGED Viewed

@@ -11,9 +11,8 @@ import torch
 import torch.nn as nn
 import sys
-file_path = os.getcwd()
 sys.path.append(file_path)
 import root_gnn_base.batched_dataset as datasets
 from root_gnn_base import utils
 import root_gnn_base.custom_scheduler as lr_utils
@@ -29,6 +28,8 @@ import torch.multiprocessing as mp
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn.parallel import DistributedDataParallel as DDP
 def mem():
     print(f'Current memory usage: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024} GB')
@@ -75,9 +76,9 @@ def evaluate(val_loaders, model, config, device, epoch = -1):
         print(f"Loaded epoch {checkpoint['epoch']} from checkpoint")
     if 'Loss' not in config:
-        loss_fcn = nn.BCEWithLogitsLoss()
     else:
-        loss_fcn = utils.buildFromConfig(config['Loss'])
     if len(val_loaders) == 0:
         return "No validation data"
     start = time.time()
@@ -143,10 +144,10 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
     restart = args.restart
     # define train/val samples, loss function and optimizer
     if 'Loss' not in config:
-        loss_fcn = nn.BCEWithLogitsLoss()
         finish_fn = torch.nn.Sigmoid()
     else:
-        loss_fcn = utils.buildFromConfig(config['Loss'])
         finish_fn = utils.buildFromConfig(config['Loss']['finish'])
     optimizer = torch.optim.Adam(model.parameters(), lr=config['Training']['learning_rate'])
@@ -280,11 +281,13 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
             batch_start = time.time()
             logits = torch.tensor([])
             tlabels = torch.tensor([])
             batch_lengths = []
             for cycler in train_cyclers:
-                graph, label, _, global_feats = next(cycler)
                 graph = graph.to(device)
                 label = label.to(device)
                 global_feats = global_feats.to(device)
                 if is_padded: #Padding the globals to match padded graphs.
                     global_feats = torch.concatenate((global_feats, torch.zeros(1, len(global_feats[0])).to(device)))
@@ -292,9 +295,11 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
                 if (len(logits) == 0):
                     logits = model(graph, global_feats)
                     tlabels = label
                 else:
                     logits = torch.concatenate((logits, model(graph, global_feats)), dim=0)
                     tlabels = torch.concatenate((tlabels, label), dim=0)
                 batch_lengths.append(logits.shape[0] - 1)
             if is_padded:
@@ -307,7 +312,35 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
                 tlabels = tlabels.to(torch.float)
             if loss_fcn.__class__.__name__ == 'CrossEntropyLoss':
                 tlabels = tlabels.to(torch.long)
-            loss = loss_fcn(logits, tlabels.to(device)) # changed logits from logits[:,0] and left labels as int for multiclass. Does this break binary? Yes.
             optimizer.zero_grad()
             loss.backward()
             optimizer.step()
@@ -382,6 +415,9 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
         wgt_mask = weights > 0
         print(f"Num batches trained = {ibatch}")
         #Note: This section is a bit ugly. Very conditional. Should maybe config defined behavior?
@@ -472,7 +508,29 @@ def train(train_loaders, test_loaders, model, device, config, args, rank):
             print(contrastive_cluster_log_str, flush=True)
         # test_loss = loss_fcn(logits, labels.to(device))
         test_loss = loss_fcn(logits, labels)
         end = time.time()
         log_str = "Epoch {:05d} | LR {:.4e} | Loss {:.4f} | Accuracy {:.4f} | Test_Loss {:.4f} | Test_AUC {:.4f} | Time {:.4f} s".format(
                 epoch, optimizer.param_groups[0]['lr'], total_loss/ibatch, acc, test_loss, test_auc, end - start
@@ -664,6 +722,7 @@ def main(rank=0, args=None, world_size=1, port=24500, seed=12345):
     load_end = time.time()
     print("Load time: {:.4f} s".format(load_end - load_start))
     model = utils.buildFromConfig(config["Model"], {'sample_graph': gsamp, 'sample_global': global_samp, 'seed': seed}).to(device)
     if not args.nocompile:
         model = torch.compile(model)
@@ -728,6 +787,7 @@ if __name__ == "__main__":
     add_arg("--statistics", type=float, help="Size of training data")
     add_arg("--directory", type=str, help="Append to Training Directory")
     add_arg("--seed", type=int, default=2, help="Sets random seed")
     pargs = parser.parse_args()

 import torch.nn as nn
 import sys
+file_path = "/global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/"
 sys.path.append(file_path)
 import root_gnn_base.batched_dataset as datasets
 from root_gnn_base import utils
 import root_gnn_base.custom_scheduler as lr_utils
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn.parallel import DistributedDataParallel as DDP
+print("import time: {:.4f} s".format(time.time() - start_time))
 def mem():
     print(f'Current memory usage: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024} GB')
         print(f"Loaded epoch {checkpoint['epoch']} from checkpoint")
     if 'Loss' not in config:
+        loss_fcn = nn.BCEWithLogitsLoss(reduction='none')
     else:
+        loss_fcn = utils.buildFromConfig(config['Loss'], {'reduction': 'none'})
     if len(val_loaders) == 0:
         return "No validation data"
     start = time.time()
     restart = args.restart
     # define train/val samples, loss function and optimizer
     if 'Loss' not in config:
+        loss_fcn = nn.BCEWithLogitsLoss(reduction='none')
         finish_fn = torch.nn.Sigmoid()
     else:
+        loss_fcn = utils.buildFromConfig(config['Loss'], {'reduction':'none'})
         finish_fn = utils.buildFromConfig(config['Loss']['finish'])
     optimizer = torch.optim.Adam(model.parameters(), lr=config['Training']['learning_rate'])
             batch_start = time.time()
             logits = torch.tensor([])
             tlabels = torch.tensor([])
+            weights = torch.tensor([])
             batch_lengths = []
             for cycler in train_cyclers:
+                graph, label, track, global_feats = next(cycler)
                 graph = graph.to(device)
                 label = label.to(device)
+                track = track.to(device)
                 global_feats = global_feats.to(device)
                 if is_padded: #Padding the globals to match padded graphs.
                     global_feats = torch.concatenate((global_feats, torch.zeros(1, len(global_feats[0])).to(device)))
                 if (len(logits) == 0):
                     logits = model(graph, global_feats)
                     tlabels = label
+                    weights = track[:,1]
                 else:
                     logits = torch.concatenate((logits, model(graph, global_feats)), dim=0)
                     tlabels = torch.concatenate((tlabels, label), dim=0)
+                    weights = torch.concatenate((weights, track[:,1]), dim=0)
                 batch_lengths.append(logits.shape[0] - 1)
             if is_padded:
                 tlabels = tlabels.to(torch.float)
             if loss_fcn.__class__.__name__ == 'CrossEntropyLoss':
                 tlabels = tlabels.to(torch.long)
+            # loss = loss_fcn(logits, tlabels.to(device)) # changed logits from logits[:,0] and left labels as int for multiclass. Does this break binary? Yes.
+            # loss = torch.sum(weights * loss) / torch.sum(weights)
+            if args.abs:
+                weights = torch.abs(weights)
+            loss = loss_fcn(logits, tlabels.to(device))
+            # Normalize loss within each label
+            unique_labels = torch.unique(tlabels)  # Get unique labels
+            normalized_loss = 0.0
+            for label in unique_labels:
+                # Mask for samples belonging to the current label
+                label_mask = (tlabels == label)
+                # Extract weights and losses for the current label
+                label_weights = weights[label_mask]
+                label_losses = loss[label_mask]
+                # Compute normalized loss for the current label
+                label_loss = torch.sum(label_weights * label_losses) / torch.sum(label_weights)
+                # Add to the total normalized loss
+                normalized_loss += label_loss
+            loss = normalized_loss / len(unique_labels)
             optimizer.zero_grad()
             loss.backward()
             optimizer.step()
         wgt_mask = weights > 0
+        if args.abs:
+            weights = torch.abs(weights)
         print(f"Num batches trained = {ibatch}")
         #Note: This section is a bit ugly. Very conditional. Should maybe config defined behavior?
             print(contrastive_cluster_log_str, flush=True)
         # test_loss = loss_fcn(logits, labels.to(device))
+        # test_loss = loss_fcn(logits, labels)
+        # test_loss = torch.sum(weights * test_loss) / torch.sum(weights)
         test_loss = loss_fcn(logits, labels)
+        # Normalize loss within each label
+        unique_labels = torch.unique(labels)  # Get unique labels
+        normalized_loss = 0.0
+        for label in unique_labels:
+            # Mask for samples belonging to the current label
+            label_mask = (labels == label)
+            # Extract weights and losses for the current label
+            label_weights = weights[label_mask]
+            label_losses = test_loss[label_mask]
+            # Compute normalized loss for the current label
+            label_loss = torch.sum(label_weights * label_losses) / torch.sum(label_weights)
+            # Add to the total normalized loss
+            normalized_loss += label_loss
+        test_loss = normalized_loss / len(unique_labels)
         end = time.time()
         log_str = "Epoch {:05d} | LR {:.4e} | Loss {:.4f} | Accuracy {:.4f} | Test_Loss {:.4f} | Test_AUC {:.4f} | Time {:.4f} s".format(
                 epoch, optimizer.param_groups[0]['lr'], total_loss/ibatch, acc, test_loss, test_auc, end - start
     load_end = time.time()
     print("Load time: {:.4f} s".format(load_end - load_start))
     model = utils.buildFromConfig(config["Model"], {'sample_graph': gsamp, 'sample_global': global_samp, 'seed': seed}).to(device)
     if not args.nocompile:
         model = torch.compile(model)
     add_arg("--statistics", type=float, help="Size of training data")
     add_arg("--directory", type=str, help="Append to Training Directory")
     add_arg("--seed", type=int, default=2, help="Sets random seed")
+    add_arg("--abs", action="store_true", help="Use abs value of per-event weight")
     pargs = parser.parse_args()