Fix the wrong GPU index issue in multi-node training
Browse files
- configs/inference.json +3 -1
- configs/metadata.json +3 -2
- configs/multi_gpu_train.json +4 -2
- configs/train.json +6 -3
configs/inference.json
CHANGED
|
@@ -9,7 +9,6 @@
|
|
| 9 |
"test_json": "$@bundle_root+'/label/test_samples.json'",
|
| 10 |
"test_fp": "$open(@test_json,'r', encoding='utf8')",
|
| 11 |
"test_dict": "$json.load(@test_fp)",
|
| 12 |
-
"test_close": "$@test_fp.close()",
|
| 13 |
"device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
|
| 14 |
"network_def": {
|
| 15 |
"_target_": "SEResNet50",
|
|
@@ -110,5 +109,8 @@
|
|
| 110 |
],
|
| 111 |
"run": [
|
| 112 |
"$@evaluator.run()"
|
|
|
|
|
|
|
|
|
|
| 113 |
]
|
| 114 |
}
|
|
|
|
| 9 |
"test_json": "$@bundle_root+'/label/test_samples.json'",
|
| 10 |
"test_fp": "$open(@test_json,'r', encoding='utf8')",
|
| 11 |
"test_dict": "$json.load(@test_fp)",
|
|
|
|
| 12 |
"device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
|
| 13 |
"network_def": {
|
| 14 |
"_target_": "SEResNet50",
|
|
|
|
| 109 |
],
|
| 110 |
"run": [
|
| 111 |
"$@evaluator.run()"
|
| 112 |
+
],
|
| 113 |
+
"finalize": [
|
| 114 |
+
"$@test_fp.close()"
|
| 115 |
]
|
| 116 |
}
|
configs/metadata.json
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
|
| 3 |
-
"version": "0.4.3",
|
| 4 |
"changelog": {
|
|
|
|
| 5 |
"0.4.3": "add dataset dir example",
|
| 6 |
"0.4.2": "update ONNX-TensorRT descriptions",
|
| 7 |
"0.4.1": "update the model weights with the deterministic training",
|
|
@@ -22,7 +23,7 @@
|
|
| 22 |
"0.1.0": "complete the first version model package",
|
| 23 |
"0.0.1": "initialize the model package structure"
|
| 24 |
},
|
| 25 |
-
"monai_version": "1.2.
|
| 26 |
"pytorch_version": "1.13.1",
|
| 27 |
"numpy_version": "1.22.2",
|
| 28 |
"optional_packages_version": {
|
|
|
|
| 1 |
{
|
| 2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
|
| 3 |
+
"version": "0.4.4",
|
| 4 |
"changelog": {
|
| 5 |
+
"0.4.4": "fix the wrong GPU index issue of multi-node",
|
| 6 |
"0.4.3": "add dataset dir example",
|
| 7 |
"0.4.2": "update ONNX-TensorRT descriptions",
|
| 8 |
"0.4.1": "update the model weights with the deterministic training",
|
|
|
|
| 23 |
"0.1.0": "complete the first version model package",
|
| 24 |
"0.0.1": "initialize the model package structure"
|
| 25 |
},
|
| 26 |
+
"monai_version": "1.2.0",
|
| 27 |
"pytorch_version": "1.13.1",
|
| 28 |
"numpy_version": "1.22.2",
|
| 29 |
"optional_packages_version": {
|
configs/multi_gpu_train.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"device": "$torch.device(
|
| 3 |
"network": {
|
| 4 |
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
| 5 |
"module": "$@network_def.to(@device)",
|
|
@@ -34,6 +34,8 @@
|
|
| 34 |
"$@train#trainer.run()"
|
| 35 |
],
|
| 36 |
"finalize": [
|
| 37 |
-
"$dist.is_initialized() and dist.destroy_process_group()"
|
|
|
|
|
|
|
| 38 |
]
|
| 39 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
|
| 3 |
"network": {
|
| 4 |
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
| 5 |
"module": "$@network_def.to(@device)",
|
|
|
|
| 34 |
"$@train#trainer.run()"
|
| 35 |
],
|
| 36 |
"finalize": [
|
| 37 |
+
"$dist.is_initialized() and dist.destroy_process_group()",
|
| 38 |
+
"$@train_fp.close()",
|
| 39 |
+
"$@val_fp.close()"
|
| 40 |
]
|
| 41 |
}
|
configs/train.json
CHANGED
|
@@ -2,7 +2,8 @@
|
|
| 2 |
"imports": [
|
| 3 |
"$import torch",
|
| 4 |
"$import json",
|
| 5 |
-
"$import ignite"
|
|
|
|
| 6 |
],
|
| 7 |
"bundle_root": ".",
|
| 8 |
"ckpt_dir": "$@bundle_root + '/models'",
|
|
@@ -12,11 +13,9 @@
|
|
| 12 |
"val_json": "$@bundle_root+'/label/val_samples.json'",
|
| 13 |
"train_fp": "$open(@train_json,'r', encoding='utf8')",
|
| 14 |
"train_dict": "$json.load(@train_fp)",
|
| 15 |
-
"train_close": "$@train_fp.close()",
|
| 16 |
"val_fp": "$open(@val_json,'r', encoding='utf8')",
|
| 17 |
"val_dict": "$json.load(@val_fp)",
|
| 18 |
"val_interval": 1,
|
| 19 |
-
"val_close": "$@val_fp.close()",
|
| 20 |
"device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
|
| 21 |
"network_def": {
|
| 22 |
"_target_": "SEResNet50",
|
|
@@ -256,5 +255,9 @@
|
|
| 256 |
],
|
| 257 |
"run": [
|
| 258 |
"$@train#trainer.run()"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
]
|
| 260 |
}
|
|
|
|
| 2 |
"imports": [
|
| 3 |
"$import torch",
|
| 4 |
"$import json",
|
| 5 |
+
"$import ignite",
|
| 6 |
+
"$import os"
|
| 7 |
],
|
| 8 |
"bundle_root": ".",
|
| 9 |
"ckpt_dir": "$@bundle_root + '/models'",
|
|
|
|
| 13 |
"val_json": "$@bundle_root+'/label/val_samples.json'",
|
| 14 |
"train_fp": "$open(@train_json,'r', encoding='utf8')",
|
| 15 |
"train_dict": "$json.load(@train_fp)",
|
|
|
|
| 16 |
"val_fp": "$open(@val_json,'r', encoding='utf8')",
|
| 17 |
"val_dict": "$json.load(@val_fp)",
|
| 18 |
"val_interval": 1,
|
|
|
|
| 19 |
"device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
|
| 20 |
"network_def": {
|
| 21 |
"_target_": "SEResNet50",
|
|
|
|
| 255 |
],
|
| 256 |
"run": [
|
| 257 |
"$@train#trainer.run()"
|
| 258 |
+
],
|
| 259 |
+
"finalize": [
|
| 260 |
+
"$@train_fp.close()",
|
| 261 |
+
"$@val_fp.close()"
|
| 262 |
]
|
| 263 |
}
|