Commit
·
a19acb8
1
Parent(s):
96ccc5c
added demo
Browse files- .gitignore +1 -0
- root_gnn_dgl/README.md +15 -1
- root_gnn_dgl/configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml +67 -0
- root_gnn_dgl/configs/demo/pretraining_multiclass.yaml +134 -0
- root_gnn_dgl/configs/pretraining/multiclass.yaml +0 -0
- root_gnn_dgl/root_gnn_base/dataset.py +8 -7
- root_gnn_dgl/run_demo.sh +21 -0
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
__pycache__/
|
|
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
+
trainings/
|
root_gnn_dgl/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
## Environment Setup
|
| 4 |
|
| 5 |
-
The environment dependencies for this project are listed in `environment.yml`. Follow the steps below to set up the environment:
|
| 6 |
|
| 7 |
### Step 1: Install Conda
|
| 8 |
If you don’t already have Conda installed, install either Miniconda (lightweight) or Anaconda (full version):
|
|
@@ -13,6 +13,7 @@ If you don’t already have Conda installed, install either Miniconda (lightweig
|
|
| 13 |
### Step 2: Clone the Repository
|
| 14 |
Clone this repository to your local machine:
|
| 15 |
```bash
|
|
|
|
| 16 |
git lfs install
|
| 17 |
git clone https://huggingface.co/HWresearch/GNN4Colliders
|
| 18 |
```
|
|
@@ -39,3 +40,16 @@ Run the `setup/test_setup.py` script to confirm that all packages needed for tra
|
|
| 39 |
```bash
|
| 40 |
python setup/test_setup.py
|
| 41 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
## Environment Setup
|
| 4 |
|
| 5 |
+
The environment dependencies for this project are listed in `setup/environment.yml`. Follow the steps below to set up the environment:
|
| 6 |
|
| 7 |
### Step 1: Install Conda
|
| 8 |
If you don’t already have Conda installed, install either Miniconda (lightweight) or Anaconda (full version):
|
|
|
|
| 13 |
### Step 2: Clone the Repository
|
| 14 |
Clone this repository to your local machine:
|
| 15 |
```bash
|
| 16 |
+
git init
|
| 17 |
git lfs install
|
| 18 |
git clone https://huggingface.co/HWresearch/GNN4Colliders
|
| 19 |
```
|
|
|
|
| 40 |
```bash
|
| 41 |
python setup/test_setup.py
|
| 42 |
```
|
| 43 |
+
|
| 44 |
+
## Data Preparation
|
| 45 |
+
The first step in the process is to convert the events stored in ROOT files into DGL graph objects. This conversion is handled automatically by the Dataset objects during their creation, provided the graph data has not already been saved to disk. To accomplish this, a simple script is used to initialize the relevant Dataset object and then exit. This script needs to be executed for each data chunk in each dataset being used for training.
|
| 46 |
+
|
| 47 |
+
Below is an example of how to use the `scripts/prep_data.py` script:
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
<insert example here>
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
The `--shuffle_mode` flag shuffles and pre-batches the graphs within each chunk, since holding the entire dataset in memory and shuffling it all at once can be prohibitive for large datasets.
|
| 54 |
+
|
| 55 |
+
## Training
|
root_gnn_dgl/configs/demo/finetuning_ttH_CP_Even_vs_Odd.yaml
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Training_Name: ttH_CP_even_vs_odd_finetuning_12_process
|
| 2 |
+
Training_Directory: trainings/ttH_vs_ttH_CPodd_TL_studies/ttH_CP_even_vs_odd_finetuning_12_process
|
| 3 |
+
Model:
|
| 4 |
+
module: models.GCN
|
| 5 |
+
class: Transferred_Learning_Finetuning
|
| 6 |
+
args:
|
| 7 |
+
pretraining_path: trainings/Hyy_BIG/model_epoch_59.pt
|
| 8 |
+
pretraining_model:
|
| 9 |
+
module: models.GCN
|
| 10 |
+
class: Edge_Network
|
| 11 |
+
args:
|
| 12 |
+
hid_size: 64
|
| 13 |
+
in_size: 7
|
| 14 |
+
out_size: 13
|
| 15 |
+
n_layers: 4
|
| 16 |
+
n_proc_steps: 4
|
| 17 |
+
hid_size: 64
|
| 18 |
+
in_size: 7
|
| 19 |
+
out_size: 1
|
| 20 |
+
n_layers: 4
|
| 21 |
+
n_proc_steps: 4
|
| 22 |
+
dropout: 0.10
|
| 23 |
+
Training:
|
| 24 |
+
epochs: 200
|
| 25 |
+
batch_size: 1024
|
| 26 |
+
learning_rate: 0.00001
|
| 27 |
+
gamma: 0.99
|
| 28 |
+
Datasets:
|
| 29 |
+
ttH: &dataset_defn
|
| 30 |
+
module: root_gnn_base.dataset
|
| 31 |
+
class: LazyDataset
|
| 32 |
+
shuffle_chunks: 10
|
| 33 |
+
batch_size: 1024
|
| 34 |
+
padding_mode: NONE #one of STEPS, FIXED, or NONE
|
| 35 |
+
args: &dataset_args
|
| 36 |
+
name: ttH
|
| 37 |
+
label: 0
|
| 38 |
+
weight_var: weight
|
| 39 |
+
chunks: 100
|
| 40 |
+
buffer_size: 11
|
| 41 |
+
file_names: ttH_NLO.root
|
| 42 |
+
tree_name: output
|
| 43 |
+
fold_var: Number
|
| 44 |
+
raw_dir: /global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/
|
| 45 |
+
save_dir: /pscratch/sd/j/joshuaho/root_gnn/root_gnn_dgl/data/processed_ttH_vs_ttH_CPOdd_10M
|
| 46 |
+
node_branch_names:
|
| 47 |
+
- [jet_pt, ele_pt, mu_pt, ph_pt, MET_met]
|
| 48 |
+
- [jet_eta, ele_eta, mu_eta, ph_eta, 0]
|
| 49 |
+
- [jet_phi, ele_phi, mu_phi, ph_phi, MET_phi]
|
| 50 |
+
- CALC_E
|
| 51 |
+
- [jet_btag, 0, 0, 0, 0]
|
| 52 |
+
- [0, ele_charge, mu_charge, 0, 0]
|
| 53 |
+
- NODE_TYPE
|
| 54 |
+
node_branch_types: [vector, vector, vector, vector, single]
|
| 55 |
+
node_feature_scales: [1e-1, 1, 1, 1e-1, 1, 1, 1]
|
| 56 |
+
folding:
|
| 57 |
+
n_folds: 10
|
| 58 |
+
test: [0, 1, 2]
|
| 59 |
+
# validation: 1
|
| 60 |
+
train: [3,4,5,6,7,8,9]
|
| 61 |
+
ttH_CPodd:
|
| 62 |
+
<<: *dataset_defn
|
| 63 |
+
args:
|
| 64 |
+
<<: *dataset_args
|
| 65 |
+
name: ttH_CPodd
|
| 66 |
+
label: 1
|
| 67 |
+
file_names: ttH_CPodd.root
|
root_gnn_dgl/configs/demo/pretraining_multiclass.yaml
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Training_Name: pretraining_multiclass
|
| 2 |
+
Training_Directory: trainings/demo/pretraining_multiclass/
|
| 3 |
+
Model:
|
| 4 |
+
module: models.GCN
|
| 5 |
+
class: Edge_Network
|
| 6 |
+
args:
|
| 7 |
+
hid_size: 64
|
| 8 |
+
in_size: 7
|
| 9 |
+
out_size: 12
|
| 10 |
+
n_layers: 4
|
| 11 |
+
n_proc_steps: 4
|
| 12 |
+
dropout: 0
|
| 13 |
+
Loss:
|
| 14 |
+
module: torch.nn
|
| 15 |
+
class: CrossEntropyLoss
|
| 16 |
+
args: {}
|
| 17 |
+
finish:
|
| 18 |
+
module: torch.nn
|
| 19 |
+
class: Softmax
|
| 20 |
+
args: {dim: 1}
|
| 21 |
+
Training:
|
| 22 |
+
epochs: 200
|
| 23 |
+
batch_size: 1024
|
| 24 |
+
learning_rate: 0.0001
|
| 25 |
+
gamma: 0.99
|
| 26 |
+
Datasets:
|
| 27 |
+
ttH: &dataset_defn
|
| 28 |
+
module: root_gnn_base.dataset
|
| 29 |
+
class: LazyDataset
|
| 30 |
+
shuffle_chunks: 3
|
| 31 |
+
padding_mode: NONE #one of STEPS, FIXED, or NONE
|
| 32 |
+
args: &dataset_args
|
| 33 |
+
name: ttH
|
| 34 |
+
label: 0
|
| 35 |
+
weight_var: weight
|
| 36 |
+
chunks: 3
|
| 37 |
+
buffer_size: 1
|
| 38 |
+
file_names: ttH_NLO_inc.root
|
| 39 |
+
tree_name: output
|
| 40 |
+
fold_var: Number
|
| 41 |
+
raw_dir: /global/cfs/projectdirs/atlas/joshua/root_gnn/root_gnn_dgl/data/ntuples/Hyy_pretraining/multilabel_10K/
|
| 42 |
+
save_dir: /pscratch/sd/j/joshuaho/GNN4Colliders/root_gnn_dgl/data/demo/pretraining_multiclass/
|
| 43 |
+
node_branch_names:
|
| 44 |
+
- [jet_pt, ele_pt, mu_pt, ph_pt, MET_met]
|
| 45 |
+
- [jet_eta, ele_eta, mu_eta, ph_eta, 0]
|
| 46 |
+
- [jet_phi, ele_phi, mu_phi, ph_phi, MET_phi]
|
| 47 |
+
- CALC_E
|
| 48 |
+
- [jet_btag, 0, 0, 0, 0]
|
| 49 |
+
- [0, ele_charge, mu_charge, 0, 0]
|
| 50 |
+
- NODE_TYPE
|
| 51 |
+
node_branch_types: [vector, vector, vector, vector, single]
|
| 52 |
+
node_feature_scales: [1e-1, 1, 1, 1e-1, 1, 1, 1]
|
| 53 |
+
folding:
|
| 54 |
+
n_folds: 3
|
| 55 |
+
test: [0]
|
| 56 |
+
# validation: 1
|
| 57 |
+
train: [1, 2]
|
| 58 |
+
tHjb:
|
| 59 |
+
<<: *dataset_defn
|
| 60 |
+
args:
|
| 61 |
+
<<: *dataset_args
|
| 62 |
+
name: tHjb
|
| 63 |
+
label: 1
|
| 64 |
+
file_names: tHjb_NLO_inc.root
|
| 65 |
+
ggF:
|
| 66 |
+
<<: *dataset_defn
|
| 67 |
+
args:
|
| 68 |
+
<<: *dataset_args
|
| 69 |
+
name: ggF
|
| 70 |
+
label: 2
|
| 71 |
+
file_names: ggF_NLO_inc.root
|
| 72 |
+
VBF:
|
| 73 |
+
<<: *dataset_defn
|
| 74 |
+
args:
|
| 75 |
+
<<: *dataset_args
|
| 76 |
+
name: VBF
|
| 77 |
+
label: 3
|
| 78 |
+
file_names: VBF_NLO_inc.root
|
| 79 |
+
WH:
|
| 80 |
+
<<: *dataset_defn
|
| 81 |
+
args:
|
| 82 |
+
<<: *dataset_args
|
| 83 |
+
name: WH
|
| 84 |
+
label: 4
|
| 85 |
+
file_names: WH_NLO_inc.root
|
| 86 |
+
ZH:
|
| 87 |
+
<<: *dataset_defn
|
| 88 |
+
args:
|
| 89 |
+
<<: *dataset_args
|
| 90 |
+
name: ZH
|
| 91 |
+
label: 5
|
| 92 |
+
file_names: ZH_NLO_inc.root
|
| 93 |
+
ttyy:
|
| 94 |
+
<<: *dataset_defn
|
| 95 |
+
args:
|
| 96 |
+
<<: *dataset_args
|
| 97 |
+
name: ttyy_ch
|
| 98 |
+
label: 6
|
| 99 |
+
file_names: 'ttyy.root'
|
| 100 |
+
tttt:
|
| 101 |
+
<<: *dataset_defn
|
| 102 |
+
args:
|
| 103 |
+
<<: *dataset_args
|
| 104 |
+
name: tttt
|
| 105 |
+
label: 7
|
| 106 |
+
file_names: 'tttt.root'
|
| 107 |
+
SingleT_schan:
|
| 108 |
+
<<: *dataset_defn
|
| 109 |
+
args:
|
| 110 |
+
<<: *dataset_args
|
| 111 |
+
name: SingleT_schan
|
| 112 |
+
label: 8
|
| 113 |
+
file_names: 'SingleT_schan.root'
|
| 114 |
+
ttbar:
|
| 115 |
+
<<: *dataset_defn
|
| 116 |
+
args:
|
| 117 |
+
<<: *dataset_args
|
| 118 |
+
name: ttbar
|
| 119 |
+
label: 9
|
| 120 |
+
file_names: 'ttbar.root'
|
| 121 |
+
ttW:
|
| 122 |
+
<<: *dataset_defn
|
| 123 |
+
args:
|
| 124 |
+
<<: *dataset_args
|
| 125 |
+
name: ttW
|
| 126 |
+
label: 10
|
| 127 |
+
file_names: 'ttW.root'
|
| 128 |
+
ttt:
|
| 129 |
+
<<: *dataset_defn
|
| 130 |
+
args:
|
| 131 |
+
<<: *dataset_args
|
| 132 |
+
name: ttt
|
| 133 |
+
label: 11
|
| 134 |
+
file_names: 'ttt.root'
|
root_gnn_dgl/configs/pretraining/multiclass.yaml
ADDED
|
File without changes
|
root_gnn_dgl/root_gnn_base/dataset.py
CHANGED
|
@@ -199,13 +199,14 @@ class RootDataset(DGLDataset):
|
|
| 199 |
tracking = torch.stack(tracking)
|
| 200 |
globals = torch.stack(globals)
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
| 209 |
return
|
| 210 |
self.graphs = self.graph_chunks[0]
|
| 211 |
for chunk in self.graph_chunks[1:]:
|
|
|
|
| 199 |
tracking = torch.stack(tracking)
|
| 200 |
globals = torch.stack(globals)
|
| 201 |
|
| 202 |
+
if (self.chunks > 1):
|
| 203 |
+
self.save_chunk(chunk_id, graphs, labels, tracking, globals)
|
| 204 |
+
else:
|
| 205 |
+
self.labels = labels
|
| 206 |
+
self.tracking = tracking
|
| 207 |
+
self.global_features = globals
|
| 208 |
+
self.graphs = graphs
|
| 209 |
+
self.save()
|
| 210 |
return
|
| 211 |
self.graphs = self.graph_chunks[0]
|
| 212 |
for chunk in self.graph_chunks[1:]:
|
root_gnn_dgl/run_demo.sh
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# Pretraining
|
| 4 |
+
|
| 5 |
+
# Data Preparation
|
| 6 |
+
datasets=("ttH" "tHjb" "ggF" "VBF" "WH" "ZH" "ttyy" "tttt" "SingleT_schan" "ttbar" "ttW" "ttt")
|
| 7 |
+
chunks=3
|
| 8 |
+
|
| 9 |
+
for data in "${datasets[@]}"; do
|
| 10 |
+
python scripts/prep_data.py --config configs/demo/pretraining_multiclass.yaml --dataset "$data" --shuffle_mode --chunk 0
|
| 11 |
+
for ((i=0; i<chunks; i++)); do
|
| 12 |
+
python scripts/prep_data.py --config configs/demo/pretraining_multiclass.yaml --dataset "$data" --shuffle_mode --chunk "$i"
|
| 13 |
+
done
|
| 14 |
+
done
|
| 15 |
+
|
| 16 |
+
# Training
|
| 17 |
+
|
| 18 |
+
python scripts/training_script.py --config configs/demo/pretraining_multiclass.yaml --preshuffle --nocompile --lazy
|
| 19 |
+
|
| 20 |
+
# Inference
|
| 21 |
+
|