File size: 1,213 Bytes
02aa18d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
{
    "device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
    "use_tensorboard": "$dist.get_rank() == 0",
    "controlnet": {
        "_target_": "torch.nn.parallel.DistributedDataParallel",
        "module": "$@controlnet_def.to(@device)",
        "find_unused_parameters": true,
        "device_ids": [
            "@device"
        ]
    },
    "load_controlnet": "$@controlnet.module.load_state_dict(@checkpoint_controlnet['controlnet_state_dict'], strict=True)",
    "train#sampler": {
        "_target_": "DistributedSampler",
        "dataset": "@train#dataset",
        "even_divisible": true,
        "shuffle": true
    },
    "train#dataloader#sampler": "@train#sampler",
    "train#dataloader#shuffle": false,
    "train#trainer#train_handlers": "$@train#handlers[: -1 if dist.get_rank() > 0 else None]",
    "initialize": [
        "$import torch.distributed as dist",
        "$dist.is_initialized() or dist.init_process_group(backend='nccl')",
        "$torch.cuda.set_device(@device)",
        "$monai.utils.set_determinism(seed=123)"
    ],
    "run": [
        "$@train#trainer.run()"
    ],
    "finalize": [
        "$dist.is_initialized() and dist.destroy_process_group()"
    ]
}