ho22joshua committed on
Commit a19acb8 · 1 Parent(s): 96ccc5c

added demo

.gitignore CHANGED
@@ -1 +1,2 @@
  __pycache__/
+ trainings/
root_gnn_dgl/README.md CHANGED
@@ -2,7 +2,7 @@
  
  ## Environment Setup
  
- The environment dependencies for this project are listed in `environment.yml`. Follow the steps below to set up the environment:
+ The environment dependencies for this project are listed in `setup/environment.yml`. Follow the steps below to set up the environment:
  
  ### Step 1: Install Conda
  If you don’t already have Conda installed, install either Miniconda (lightweight) or Anaconda (full version):
@@ -13,6 +13,7 @@ If you don’t already have Conda installed, install either Miniconda (lightweig
  ### Step 2: Clone the Repository
  Clone this repository to your local machine:
  ```bash
+ git init
  git lfs install
  git clone https://huggingface.co/HWresearch/GNN4Colliders
  ```
@@ -39,3 +40,16 @@ Run the `setup/test_setup.py` script to confirm that all packages needed for tra
  ```bash
  python setup/test_setup.py
  ```
+ 
+ ## Data Preparation
+ The first step in the pipeline is to convert the events stored in ROOT files into DGL graph objects. This conversion is handled automatically by the Dataset objects during their creation, provided the graph data has not already been saved to disk. A small script therefore initializes the relevant Dataset object and then exits; it must be run once for each data chunk of each dataset used in training.
+ 
+ Below is an example of how to use the `scripts/prep_data.py` script:
+ 
+ ```bash
+ <insert example here>
+ ```
+ 
+ The `--shuffle_mode` flag shuffles and pre-batches the graphs within each chunk, since holding an entire large dataset in memory and shuffling it all at once can be prohibitive.
+ 
+ ## Training
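The chunk-wise shuffling that the README's `--shuffle_mode` flag describes can be sketched in plain Python. This is a hypothetical illustration, not the code in `scripts/prep_data.py` (which operates on DGL graph objects): each chunk is shuffled in isolation and split into fixed-size batches, so at most one chunk is ever resident in memory.

```python
import random

def shuffle_and_prebatch(chunk, batch_size, seed=0):
    """Shuffle one chunk in isolation, then split it into fixed-size batches.

    Shuffling per chunk avoids ever holding the full dataset in memory;
    only len(chunk) items are resident at a time.
    """
    rng = random.Random(seed)
    items = list(chunk)
    rng.shuffle(items)  # in-place shuffle of this chunk only
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

# One 10-event chunk pre-batched into batches of 4 (last batch is smaller).
batches = shuffle_and_prebatch(range(10), batch_size=4)
```

The trade-off is that events are only randomized within a chunk, not globally; the `shuffle_chunks` setting in the configs controls how many chunks are interleaved to compensate.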
root_gnn_dgl/configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml ADDED
@@ -0,0 +1,67 @@
+ Training_Name: ttH_CP_even_vs_odd_finetuning_12_process
+ Training_Directory: trainings/ttH_vs_ttH_CPodd_TL_studies/ttH_CP_even_vs_odd_finetuning_12_process
+ Model:
+   module: models.GCN
+   class: Transferred_Learning_Finetuning
+   args:
+     pretraining_path: trainings/Hyy_BIG/model_epoch_59.pt
+     pretraining_model:
+       module: models.GCN
+       class: Edge_Network
+       args:
+         hid_size: 64
+         in_size: 7
+         out_size: 13
+         n_layers: 4
+         n_proc_steps: 4
+     hid_size: 64
+     in_size: 7
+     out_size: 1
+     n_layers: 4
+     n_proc_steps: 4
+     dropout: 0.10
+ Training:
+   epochs: 200
+   batch_size: 1024
+   learning_rate: 0.00001
+   gamma: 0.99
+ Datasets:
+   ttH: &dataset_defn
+     module: root_gnn_base.dataset
+     class: LazyDataset
+     shuffle_chunks: 10
+     batch_size: 1024
+     padding_mode: NONE # one of STEPS, FIXED, or NONE
+     args: &dataset_args
+       name: ttH
+       label: 0
+       weight_var: weight
+       chunks: 100
+       buffer_size: 11
+       file_names: ttH_NLO.root
+       tree_name: output
+       fold_var: Number
+       raw_dir: /global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/
+       save_dir: /pscratch/sd/j/joshuaho/root_gnn/root_gnn_dgl/data/processed_ttH_vs_ttH_CPOdd_10M
+       node_branch_names:
+         - [jet_pt, ele_pt, mu_pt, ph_pt, MET_met]
+         - [jet_eta, ele_eta, mu_eta, ph_eta, 0]
+         - [jet_phi, ele_phi, mu_phi, ph_phi, MET_phi]
+         - CALC_E
+         - [jet_btag, 0, 0, 0, 0]
+         - [0, ele_charge, mu_charge, 0, 0]
+         - NODE_TYPE
+       node_branch_types: [vector, vector, vector, vector, single]
+       node_feature_scales: [1e-1, 1, 1, 1e-1, 1, 1, 1]
+     folding:
+       n_folds: 10
+       test: [0, 1, 2]
+       # validation: 1
+       train: [3,4,5,6,7,8,9]
+   ttH_CPodd:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: ttH_CPodd
+       label: 1
+       file_names: ttH_CPodd.root
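The `<<: *dataset_defn` / `<<: *dataset_args` lines in the added configs use YAML merge keys: the aliased mapping supplies defaults, and keys spelled out in the overriding mapping win. In plain Python the resolution behaves like a dict merge (a sketch of the semantics, not the config loader's actual code; the dicts below are abridged from the config):

```python
# Defaults captured by the &dataset_args anchor (abridged).
dataset_args = {
    "name": "ttH",
    "label": 0,
    "file_names": "ttH_NLO.root",
    "tree_name": "output",
    "chunks": 100,
}

# ttH_CPodd spells out three keys; '<<: *dataset_args' merges in the rest,
# so tree_name and chunks are inherited unchanged.
ttH_CPodd_args = {
    **dataset_args,
    "name": "ttH_CPodd",
    "label": 1,
    "file_names": "ttH_CPodd.root",
}
```

This is why each extra dataset entry in the configs only needs to list the fields that differ (name, label, file name).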
root_gnn_dgl/configs/demo/pretraining_multiclass.yaml ADDED
@@ -0,0 +1,134 @@
+ Training_Name: pretraining_multiclass
+ Training_Directory: trainings/demo/pretraining_multiclass/
+ Model:
+   module: models.GCN
+   class: Edge_Network
+   args:
+     hid_size: 64
+     in_size: 7
+     out_size: 12
+     n_layers: 4
+     n_proc_steps: 4
+     dropout: 0
+ Loss:
+   module: torch.nn
+   class: CrossEntropyLoss
+   args: {}
+ finish:
+   module: torch.nn
+   class: Softmax
+   args: {dim: 1}
+ Training:
+   epochs: 200
+   batch_size: 1024
+   learning_rate: 0.0001
+   gamma: 0.99
+ Datasets:
+   ttH: &dataset_defn
+     module: root_gnn_base.dataset
+     class: LazyDataset
+     shuffle_chunks: 3
+     padding_mode: NONE # one of STEPS, FIXED, or NONE
+     args: &dataset_args
+       name: ttH
+       label: 0
+       weight_var: weight
+       chunks: 3
+       buffer_size: 1
+       file_names: ttH_NLO_inc.root
+       tree_name: output
+       fold_var: Number
+       raw_dir: /global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/multilabel_10K/
+       save_dir: /pscratch/sd/j/joshuaho/GNN4Colliders/root_gnn_dgl/data/demo/pretraining_multiclass/
+       node_branch_names:
+         - [jet_pt, ele_pt, mu_pt, ph_pt, MET_met]
+         - [jet_eta, ele_eta, mu_eta, ph_eta, 0]
+         - [jet_phi, ele_phi, mu_phi, ph_phi, MET_phi]
+         - CALC_E
+         - [jet_btag, 0, 0, 0, 0]
+         - [0, ele_charge, mu_charge, 0, 0]
+         - NODE_TYPE
+       node_branch_types: [vector, vector, vector, vector, single]
+       node_feature_scales: [1e-1, 1, 1, 1e-1, 1, 1, 1]
+     folding:
+       n_folds: 3
+       test: [0]
+       # validation: 1
+       train: [1, 2]
+   tHjb:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: tHjb
+       label: 1
+       file_names: tHjb_NLO_inc.root
+   ggF:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: ggF
+       label: 2
+       file_names: ggF_NLO_inc.root
+   VBF:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: VBF
+       label: 3
+       file_names: VBF_NLO_inc.root
+   WH:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: WH
+       label: 4
+       file_names: WH_NLO_inc.root
+   ZH:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: ZH
+       label: 5
+       file_names: ZH_NLO_inc.root
+   ttyy:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: ttyy_ch
+       label: 6
+       file_names: 'ttyy.root'
+   tttt:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: tttt
+       label: 7
+       file_names: 'tttt.root'
+   SingleT_schan:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: SingleT_schan
+       label: 8
+       file_names: 'SingleT_schan.root'
+   ttbar:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: ttbar
+       label: 9
+       file_names: 'ttbar.root'
+   ttW:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: ttW
+       label: 10
+       file_names: 'ttW.root'
+   ttt:
+     <<: *dataset_defn
+     args:
+       <<: *dataset_args
+       name: ttt
+       label: 11
+       file_names: 'ttt.root'
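The `folding` block in the configs drives a k-fold split keyed on `fold_var`. A minimal sketch of how such a split could work, assuming folds are assigned by taking the event `Number` modulo `n_folds` (an assumption for illustration; the actual rule lives in `root_gnn_base.dataset`):

```python
def assign_fold(event_number: int, n_folds: int) -> int:
    # Hypothetical fold rule: event number modulo the fold count.
    return event_number % n_folds

def split_events(event_numbers, n_folds, test_folds, train_folds):
    """Partition events into train/test sets by fold membership."""
    test = [n for n in event_numbers if assign_fold(n, n_folds) in test_folds]
    train = [n for n in event_numbers if assign_fold(n, n_folds) in train_folds]
    return train, test

# Demo config values: n_folds: 3, test: [0], train: [1, 2]
train, test = split_events(range(9), n_folds=3, test_folds=[0], train_folds=[1, 2])
```

Because the fold is a deterministic function of a per-event variable, the train/test partition is reproducible across runs and across the chunked data files.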
root_gnn_dgl/configs/pretraining/multiclass.yaml ADDED
File without changes
root_gnn_dgl/root_gnn_base/dataset.py CHANGED
@@ -199,13 +199,14 @@ class RootDataset(DGLDataset):
          tracking = torch.stack(tracking)
          globals = torch.stack(globals)
  
-         # self.labels = labels
-         # self.tracking = tracking
-         # self.global_features = globals
-         # self.graphs = graphs
- 
-         self.save_chunk(chunk_id, graphs, labels, tracking, globals)
- 
+         if (self.chunks > 1):
+             self.save_chunk(chunk_id, graphs, labels, tracking, globals)
+         else:
+             self.labels = labels
+             self.tracking = tracking
+             self.global_features = globals
+             self.graphs = graphs
+             self.save()
          return
          self.graphs = self.graph_chunks[0]
          for chunk in self.graph_chunks[1:]:
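The new branch in `dataset.py` only writes per-chunk files when the dataset is actually split; a single-chunk dataset is kept on the object and saved whole. The control flow can be mimicked with stand-in save methods (a sketch; the real class extends DGL's `DGLDataset` and stores torch tensors):

```python
class ChunkedSaver:
    """Stand-in for the chunk-handling branch added to RootDataset."""

    def __init__(self, chunks):
        self.chunks = chunks
        self.saved_chunks = []
        self.in_memory = None

    def save_chunk(self, chunk_id, graphs):
        self.saved_chunks.append(chunk_id)  # stand-in for writing one chunk to disk

    def save(self):
        self.in_memory = "saved"  # stand-in for saving the whole dataset at once

    def process_chunk(self, chunk_id, graphs):
        # Mirrors the commit: multi-chunk datasets write each chunk separately;
        # a single-chunk dataset is held on the object and saved whole.
        if self.chunks > 1:
            self.save_chunk(chunk_id, graphs)
        else:
            self.graphs = graphs
            self.save()

multi = ChunkedSaver(chunks=3)
multi.process_chunk(0, ["g0"])

single = ChunkedSaver(chunks=1)
single.process_chunk(0, ["g0"])
```

Restoring the previously commented-out in-memory path means small datasets avoid the per-chunk file round-trip entirely.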
root_gnn_dgl/run_demo.sh ADDED
@@ -0,0 +1,21 @@
+ #!/bin/bash
+ 
+ # Pretraining
+ 
+ # Data Preparation
+ datasets=("ttH" "tHjb" "ggF" "VBF" "WH" "ZH" "ttyy" "tttt" "SingleT_schan" "ttbar" "ttW" "ttt")
+ chunks=3
+ 
+ for data in "${datasets[@]}"; do
+     python scripts/prep_data.py --config configs/demo/pretraining_multiclass.yaml --dataset "$data" --shuffle_mode --chunk 0
+     for ((i=0; i<chunks; i++)); do
+         python scripts/prep_data.py --config configs/demo/pretraining_multiclass.yaml --dataset "$data" --shuffle_mode --chunk "$i"
+     done
+ done
+ 
+ # Training
+ 
+ python scripts/training_script.py --config configs/demo/pretraining_multiclass.yaml --preshuffle --nocompile --lazy
+ 
+ # Inference
+ 