ho22joshua committed
Commit 0f17633 · 1 Parent(s): a19acb8

created demo

.gitignore CHANGED
@@ -1,2 +1,3 @@
 __pycache__/
 trainings/
+scores/
root_gnn_dgl/README.md CHANGED
@@ -40,6 +40,16 @@ Run the `setup/test_setup.py` script to confirm that all packages needed for tra
 ```bash
 python setup/test_setup.py
 ```
+## Running the Demo
+The demo training is an example of our ML workflow, consisting of pretraining a model and then finetuning it for an analysis task. The config files for the demo are located in the directory `configs/demo/`. The demo can be run on a login node.
+
+The pretraining for the demo is a multiclass classification training on 12 datasets corresponding to 12 distinct physics processes, each containing 10,000 simulated collision events. The pretrained model is then finetuned on a binary classification task between two datasets of 10,000 simulated collision events each, corresponding to two different processes called ttH CP Even and ttH CP Odd.
+
+The entire demo can be run with the command
+```bash
+source run_demo.sh
+```
+
 
 ## Data Preparation
 The first step in the process is to convert the events stored in ROOT files into DGL graph objects. This conversion is handled automatically by the Dataset objects during their creation, provided the graph data has not already been saved to disk. To accomplish this, a simple script is used to initialize the relevant Dataset object and then exit. This script needs to be executed for each data chunk in each dataset being used for training.
@@ -47,7 +57,15 @@ The first step in the process is to convert the events stored in ROOT files into
 Below is an example of how to use the `scripts/prep_data.py` script:
 
 ```bash
-<insert exmaple here>
+datasets=("ttH" "tHjb" "ggF" "VBF" "WH" "ZH" "ttyy" "tttt" "SingleT_schan" "ttbar" "ttW" "ttt")
+chunks=3
+
+for data in "${datasets[@]}"; do
+    python scripts/prep_data.py --config configs/demo/pretraining_multiclass.yaml --dataset "$data" --shuffle_mode --chunk 0
+    for ((i=0; i<chunks; i++)); do
+        python scripts/prep_data.py --config configs/demo/pretraining_multiclass.yaml --dataset "$data" --shuffle_mode --chunk "$i"
+    done
+done
+```
 
 The `--shuffle_mode` flag performs shuffling and pre-batches the graphs in each chunk, since holding the entire dataset in memory and shuffling it together can be prohibitive for large datasets.
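
The diff does not show what `--shuffle_mode` does internally. As a rough illustration only, chunk-local shuffling and pre-batching with DGL could look like the sketch below; the `shuffle_and_prebatch` helper and its arguments are hypothetical stand-ins, not the repo's actual Dataset API.

```python
# Minimal sketch of per-chunk shuffle + pre-batch; hypothetical helper,
# not the repo's real implementation.
import random
import dgl

def shuffle_and_prebatch(graphs, batch_size, seed=0):
    """Shuffle one chunk's graphs, then group them into fixed-size batches."""
    rng = random.Random(seed)
    graphs = list(graphs)
    rng.shuffle(graphs)  # shuffling stays local to this chunk
    # dgl.batch merges a list of DGLGraphs into a single batched graph
    return [dgl.batch(graphs[i:i + batch_size])
            for i in range(0, len(graphs), batch_size)]
```

Because each chunk is shuffled and batched independently, only one chunk's graphs ever need to be resident in memory, which is the point made above about large datasets.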
root_gnn_dgl/configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml CHANGED
@@ -1,17 +1,17 @@
1
- Training_Name: ttH_CP_even_vs_odd_finetuning_12_process
2
- Training_Directory: trainings/ttH_vs_ttH_CPodd_TL_studies/ttH_CP_even_vs_odd_finetuning_12_process
3
  Model:
4
  module: models.GCN
5
  class: Transferred_Learning_Finetuning
6
  args:
7
- pretraining_path: trainings/Hyy_BIG/model_epoch_59.pt
8
  pretraining_model:
9
  module: models.GCN
10
  class: Edge_Network
11
  args:
12
  hid_size: 64
13
  in_size: 7
14
- out_size: 13
15
  n_layers: 4
16
  n_proc_steps: 4
17
  hid_size: 64
@@ -19,30 +19,30 @@ Model:
19
  out_size: 1
20
  n_layers: 4
21
  n_proc_steps: 4
22
- dropout: 0.10
23
  Training:
24
- epochs: 200
25
  batch_size: 1024
26
  learning_rate: 0.00001
27
  gamma: 0.99
28
  Datasets:
29
- ttH: &dataset_defn
30
  module: root_gnn_base.dataset
31
  class: LazyDataset
32
- shuffle_chunks: 10
33
  batch_size: 1024
34
  padding_mode: NONE #one of STEPS, FIXED, or NONE
35
  args: &dataset_args
36
- name: ttH
37
  label: 0
38
  weight_var: weight
39
- chunks: 100
40
- buffer_size: 11
41
  file_names: ttH_NLO.root
42
  tree_name: output
43
  fold_var: Number
44
- raw_dir: /global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/
45
- save_dir: /pscratch/sd/j/joshuaho/root_gnn/root_gnn_dgl/data/processed_ttH_vs_ttH_CPOdd_10M
46
  node_branch_names:
47
  - [jet_pt, ele_pt, mu_pt, ph_pt, MET_met]
48
  - [jet_eta, ele_eta, mu_eta, ph_eta, 0]
@@ -54,14 +54,14 @@ Datasets:
54
  node_branch_types: [vector, vector, vector, vector, single]
55
  node_feature_scales: [1e-1, 1, 1, 1e-1, 1, 1, 1]
56
  folding:
57
- n_folds: 10
58
- test: [0, 1, 2]
59
  # validation: 1
60
- train: [3,4,5,6,7,8,9]
61
- ttH_CPodd:
62
  <<: *dataset_defn
63
  args:
64
  <<: *dataset_args
65
- name: ttH_CPodd
66
  label: 1
67
  file_names: ttH_CPodd.root
 
1
+ Training_Name: finetuning_ttH_CP_Even_vs_Odd
2
+ Training_Directory: trainings/demo/finetuning_ttH_CP_Even_vs_Odd
3
  Model:
4
  module: models.GCN
5
  class: Transferred_Learning_Finetuning
6
  args:
7
+ pretraining_path: trainings/demo/pretraining_multiclass/model_epoch_100.pt # update to the last epoch of the pretraining
8
  pretraining_model:
9
  module: models.GCN
10
  class: Edge_Network
11
  args:
12
  hid_size: 64
13
  in_size: 7
14
+ out_size: 12
15
  n_layers: 4
16
  n_proc_steps: 4
17
  hid_size: 64
 
19
  out_size: 1
20
  n_layers: 4
21
  n_proc_steps: 4
22
+ dropout: 0
23
  Training:
24
+ epochs: 500
25
  batch_size: 1024
26
  learning_rate: 0.00001
27
  gamma: 0.99
28
  Datasets:
29
+ ttH_CP_Even: &dataset_defn
30
  module: root_gnn_base.dataset
31
  class: LazyDataset
32
+ shuffle_chunks: 3
33
  batch_size: 1024
34
  padding_mode: NONE #one of STEPS, FIXED, or NONE
35
  args: &dataset_args
36
+ name: ttH_CP_Even
37
  label: 0
38
  weight_var: weight
39
+ chunks: 3
40
+ buffer_size: 1
41
  file_names: ttH_NLO.root
42
  tree_name: output
43
  fold_var: Number
44
+ raw_dir: /global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/multilabel_10K/
45
+ save_dir: /pscratch/sd/j/joshuaho/GNN4Colliders/root_gnn_dgl/data/demo/finetuning_ttH_CP_Even_vs_Odd/
46
  node_branch_names:
47
  - [jet_pt, ele_pt, mu_pt, ph_pt, MET_met]
48
  - [jet_eta, ele_eta, mu_eta, ph_eta, 0]
 
54
  node_branch_types: [vector, vector, vector, vector, single]
55
  node_feature_scales: [1e-1, 1, 1, 1e-1, 1, 1, 1]
56
  folding:
57
+ n_folds: 3
58
+ test: [0]
59
  # validation: 1
60
+ train: [1, 2]
61
+ ttH_CP_Odd:
62
  <<: *dataset_defn
63
  args:
64
  <<: *dataset_args
65
+ name: ttH_CP_Odd
66
  label: 1
67
  file_names: ttH_CPodd.root
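
This config leans on YAML anchors and merge keys: `ttH_CP_Odd` inherits everything from `&dataset_defn` and `&dataset_args` except the keys it overrides (`name`, `label`, `file_names`). One quick way to confirm how the merges resolve is to load the file with PyYAML; this is just a suggested sanity check, assuming PyYAML is available, and is not part of the repo.

```python
# Sanity check of anchor/merge-key resolution in the demo config.
# Assumes PyYAML is installed; safe_load resolves &anchors and <<: merges.
import yaml

with open("configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml") as f:
    cfg = yaml.safe_load(f)

odd = cfg["Datasets"]["ttH_CP_Odd"]
print(odd["class"])               # LazyDataset, inherited via <<: *dataset_defn
print(odd["args"]["name"])        # ttH_CP_Odd, overriding the merged name
print(odd["args"]["file_names"])  # ttH_CPodd.root
```

Note that a YAML merge replaces whole keys rather than deep-merging, which is why the `args` block under `ttH_CP_Odd` must itself merge `*dataset_args` again before overriding individual fields.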
root_gnn_dgl/configs/demo/pretraining_multiclass.yaml CHANGED
@@ -19,7 +19,7 @@ Loss:
19
  class: Softmax
20
  args: {dim: 1}
21
  Training:
22
- epochs: 200
23
  batch_size: 1024
24
  learning_rate: 0.0001
25
  gamma: 0.99
 
19
  class: Softmax
20
  args: {dim: 1}
21
  Training:
22
+ epochs: 500
23
  batch_size: 1024
24
  learning_rate: 0.0001
25
  gamma: 0.99
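
With `epochs: 500`, the last pretraining checkpoint will generally not be the `model_epoch_100.pt` placeholder in the finetuning config, hence its "update to the last epoch" comment. A small hypothetical helper to locate the newest checkpoint, assuming checkpoints follow the `model_epoch_<N>.pt` naming seen in the configs:

```python
# Pick the latest pretraining checkpoint for pretraining_path.
# Assumes the model_epoch_<N>.pt naming convention from the demo configs.
import re
from pathlib import Path

def latest_checkpoint(train_dir="trainings/demo/pretraining_multiclass"):
    def epoch(p):
        # extract N from model_epoch_<N>.pt for numeric (not lexical) ordering
        m = re.search(r"model_epoch_(\d+)\.pt$", p.name)
        return int(m.group(1)) if m else -1
    return max(Path(train_dir).glob("model_epoch_*.pt"), key=epoch, default=None)

print(latest_checkpoint())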
root_gnn_dgl/run_demo.sh CHANGED
@@ -1,8 +1,6 @@
1
  #!/bin/bash
2
 
3
  # Pretraining
4
-
5
- # Data Preparation
6
  datasets=("ttH" "tHjb" "ggF" "VBF" "WH" "ZH" "ttyy" "tttt" "SingleT_schan" "ttbar" "ttW" "ttt")
7
  chunks=3
8
 
@@ -13,9 +11,39 @@ for data in "${datasets[@]}"; do
13
  done
14
  done
15
 
16
- # Training
17
-
18
  python scripts/training_script.py --config configs/demo/pretraining_multiclass.yaml --preshuffle --nocompile --lazy
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # Inference
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/bin/bash
2
 
3
  # Pretraining
 
 
4
  datasets=("ttH" "tHjb" "ggF" "VBF" "WH" "ZH" "ttyy" "tttt" "SingleT_schan" "ttbar" "ttW" "ttt")
5
  chunks=3
6
 
 
11
  done
12
  done
13
 
 
 
14
  python scripts/training_script.py --config configs/demo/pretraining_multiclass.yaml --preshuffle --nocompile --lazy
15
 
16
+ # Finetuning
17
+
18
+ datasets=("ttH_CP_Even" "ttH_CP_Odd")
19
+ chunks=3
20
+
21
+ for data in "${datasets[@]}"; do
22
+ python scripts/prep_data.py --config configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml --dataset "$data" --shuffle_mode --chunk 0
23
+ for ((i=0; i<chunks; i++)); do
24
+ python scripts/prep_data.py --config configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml --dataset "$data" --shuffle_mode --chunk "$i"
25
+ done
26
+ done
27
+
28
+ python scripts/training_script.py --config configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml --preshuffle --nocompile --lazy
29
+
30
+
31
  # Inference
32
 
33
+ python scripts/inference.py \
34
+ --target "/global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/multilabel_10K/ttH_NLO.root" \
35
+ --destination "/global/cfs/projectdirs/atlas/joshua/GNN4Colliders/root_gnn_dgl/scores/ttH_NLO.root" \
36
+ --config "configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml" \
37
+ --chunks 1 \
38
+ --chunkno 0 \
39
+ --write \
40
+ --branch 'GNN_Score'
41
+
42
+ python scripts/inference.py \
43
+ --target "/global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/multilabel_10K/ttH_CPodd.root" \
44
+ --destination "/global/cfs/projectdirs/atlas/joshua/GNN4Colliders/root_gnn_dgl/scores/ttH_CPodd.root" \
45
+ --config "configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml" \
46
+ --chunks 1 \
47
+ --chunkno 0 \
48
+ --write \
49
+ --branch 'GNN_Score'
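
After the demo finishes, each file written under `scores/` should contain the events with the new `GNN_Score` branch attached. A minimal inspection sketch with uproot, assuming uproot is installed and the score files keep the `output` tree name used in the demo configs:

```python
# Inspect the GNN_Score branch written by scripts/inference.py.
# Assumes uproot is installed and the output tree is named `output`,
# matching tree_name in the demo configs.
import uproot

with uproot.open("scores/ttH_NLO.root") as f:
    scores = f["output"]["GNN_Score"].array(library="np")
    print(scores.shape, scores.min(), scores.max())
```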