| | |
| | import argparse |
| | import logging |
| | import numpy as np |
| | import time |
| | import os |
| |
|
| | from caffe2.python import core, workspace, experiment_util, data_parallel_model |
| | from caffe2.python import dyndep, optimizer |
| | from caffe2.python import timeout_guard, model_helper, brew |
| | from caffe2.proto import caffe2_pb2 |
| |
|
| | import caffe2.python.models.resnet as resnet |
| | import caffe2.python.models.shufflenet as shufflenet |
| | from caffe2.python.modeling.initializers import Initializer, PseudoFP16Initializer |
| | import caffe2.python.predictor.predictor_exporter as pred_exp |
| | import caffe2.python.predictor.predictor_py_utils as pred_utils |
| | from caffe2.python.predictor_constants import predictor_constants |
| |
|
| | ''' |
| | Parallelized multi-GPU distributed trainer for ResNe(X)t & ShuffleNet. |
| | Can be used to train on imagenet data, for example. |
| | The default parameters can train a standard Resnet-50 (1x64d), and parameters |
| | can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d). |
| | |
| | To run the trainer in single-machine multi-gpu mode, set num_shards = 1. |
| | |
| | To run the trainer in multi-machine multi-gpu mode with M machines, |
| | run the same program on all machines, specifying num_shards = M, and |
| | shard_id = a unique integer in the set [0, M-1]. |
| | |
| | For rendezvous (the trainer processes have to know about each other), |
| | you can either use a directory path that is visible to all processes |
| | (e.g. NFS directory), or use a Redis instance. Use the former by |
| | passing the `file_store_path` argument. Use the latter by passing the |
| | `redis_host` and `redis_port` arguments. |
| | ''' |
| |
|
# Set up logging and expose the trainer-wide logger at DEBUG verbosity.
logging.basicConfig()
log = logging.getLogger("Imagenet_trainer")
log.setLevel(logging.DEBUG)

# Load the operator libraries for the two rendezvous key/value stores
# (file store and Redis store) that Train() may create handlers for.
for _store_ops_library in (
    '@/caffe2/caffe2/distributed:file_store_handler_ops',
    '@/caffe2/caffe2/distributed:redis_store_handler_ops',
):
    dyndep.InitOpsLibrary(_store_ops_library)
| |
|
| |
|
def AddImageInput(
    model,
    reader,
    batch_size,
    img_size,
    dtype,
    is_test,
    mean_per_channel=None,
    std_per_channel=None,
):
    '''
    Attach an image input operator to ``model``.

    The operator reads image and label data from ``reader`` into the "data"
    and "label" blobs and applies transformations to the images (random
    cropping, mirroring, ...).

    Args:
        model: model helper the operator is added to.
        reader: database reader the images are pulled from.
        batch_size: number of images produced per run.
        img_size: side length the images are cropped to.
        dtype: output type requested from the operator.
        is_test: forwarded to the operator as ``is_test``.
        mean_per_channel: optional per-channel mean values.
        std_per_channel: optional per-channel standard deviations.
    '''
    # Let the operator run its transform on the GPU only for GPU-type models.
    transform_on_gpu = bool(core.IsGPUDeviceType(model._device_type))

    input_options = dict(
        batch_size=batch_size,
        output_type=dtype,
        use_gpu_transform=transform_on_gpu,
        use_caffe_datum=True,
        mean_per_channel=mean_per_channel,
        std_per_channel=std_per_channel,
        # NOTE(review): presumably the per-channel values, when supplied,
        # override these scalar mean/std defaults - confirm against the op.
        mean=128.,
        std=128.,
        scale=256,
        crop=img_size,
        mirror=1,
        is_test=is_test,
    )
    image_blob, _label_blob = brew.image_input(
        model,
        reader,
        ["data", "label"],
        **input_options
    )

    # Stop gradients at the image blob (in place: same input and output blob).
    model.StopGradient(image_blob, image_blob)
| |
|
| |
|
def AddNullInput(model, reader, batch_size, img_size, dtype):
    '''
    Emulate real image input without a dataset.

    A gaussian fill operator produces the "data" blob and the "label" blob is
    hardcoded to a single value. Useful for measuring compute throughput or
    when no dataset is available. ``reader`` is accepted for signature parity
    with the real input function and is not used.
    '''
    init_net = model.param_init_net
    wants_half = dtype == "float16"

    # For float16 the fill goes to a staging blob ("data_fp16") which is then
    # converted into "data"; otherwise the fill writes "data" directly.
    filled_blob = "data" + ("_fp16" if wants_half else "")
    init_net.GaussianFill(
        [],
        [filled_blob],
        shape=[batch_size, 3, img_size, img_size],
    )
    if wants_half:
        init_net.FloatToHalf(filled_blob, "data")

    # Every sample gets the same constant label.
    init_net.ConstantFill(
        [],
        ["label"],
        shape=[batch_size],
        value=1,
        dtype=core.DataType.INT32,
    )
| |
|
| |
|
def SaveModel(args, train_model, epoch, use_ideep):
    '''
    Export the current training net and its checkpoint parameters to a minidb.

    The file is written to "<args.file_store_path>/<args.save_model_name>_<epoch>.mdl".

    Args:
        args: parsed command-line arguments; reads ``num_labels``,
            ``num_channels``, ``image_size``, ``file_store_path`` and
            ``save_model_name``.
        train_model: the parallelized training model to export.
        epoch: epoch number embedded in the file name.
        use_ideep: forwarded to the predictor exporter.
    '''
    # Blobs of the first device live under "<device_prefix>_<device_id>/".
    # The previous "[]_{}" template had one placeholder for two arguments and
    # therefore produced the literal "[]_<device_prefix>", dropping the device
    # id; "{}_{}" matches the template RunEpoch uses to fetch blobs.
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    data_blob = prefix + "/data"
    softmax_blob = prefix + "/softmax"

    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=[data_blob],
        outputs=[softmax_blob],
        shapes={
            softmax_blob: (1, args.num_labels),
            data_blob: (args.num_channels, args.image_size, args.image_size),
        }
    )

    # Keep this naming scheme in sync with Train(), which derives the path of
    # the previous epoch's file from it in order to delete that file.
    model_path = "%s/%s_%d.mdl" % (
        args.file_store_path,
        args.save_model_name,
        epoch,
    )

    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
        use_ideep=use_ideep
    )
| |
|
| |
|
def LoadModel(path, model, use_ideep):
    '''
    Load pretrained model from file.

    Args:
        path: location of the minidb written by SaveModel.
        model: unused; kept so existing callers keep working.
        use_ideep: place the init nets on IDEEP when true, on GPU otherwise.

    Raises:
        AssertionError: if one of the init nets reports failure.
    '''
    log.info("Loading path: {}".format(path))
    meta_net_def = pred_exp.load_from_db(path, 'minidb')
    init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.GLOBAL_INIT_NET_TYPE))
    predict_init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.PREDICT_INIT_NET_TYPE))

    # Same order as before: the predict-init net first, then the global one.
    nets_in_run_order = (
        ("predict init net", predict_init_net),
        ("global init net", init_net),
    )

    for _, net in nets_in_run_order:
        if use_ideep:
            net.RunAllOnIDEEP()
        else:
            net.RunAllOnGPU()

    for label, net in nets_in_run_order:
        # The run must not live inside an `assert` statement: under
        # `python -O` asserts are stripped and the nets would never execute.
        # AssertionError is raised explicitly to keep the exception type.
        if not workspace.RunNetOnce(net):
            raise AssertionError(
                "Failed to run the {} loaded from {}".format(label, path)
            )

    # Re-feed the iteration counter with an explicit CPU device option.
    # NOTE(review): presumably needed because the counter ends up in a
    # non-CPU context after loading - confirm.
    itercnt = workspace.FetchBlob("optimizer_iteration")
    workspace.FeedBlob(
        "optimizer_iteration",
        itercnt,
        device_option=core.DeviceOption(caffe2_pb2.CPU, 0)
    )
| |
|
| |
|
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.

    Runs the training net for ``epoch_size / total_batch_size / num_shards``
    iterations, optionally evaluates the test net, and records the results in
    ``explog``.

    Args:
        args: parsed command-line arguments; reads ``num_epochs``,
            ``epoch_size``, ``test_epoch_size``, ``first_iter_timeout`` and
            ``timeout``.
        epoch: zero-based index of the epoch to run.
        train_model: parallelized training model (its net must be created).
        test_model: parallelized test model, or None to skip evaluation.
        total_batch_size: batch size summed over the local devices.
        num_shards: number of participating machines.
        expname: experiment name (unused here).
        explog: experiment logger receiving the per-epoch metrics.

    Returns:
        ``epoch + 1``, the index of the next epoch.

    Raises:
        AssertionError: if the training loss is not below 40.
    '''
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards)

    # Loop invariants: metrics are read from the first device's name scope.
    train_net_name = train_model.net.Proto().name
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])

    for i in range(epoch_iters):
        # The first iteration gets its own timeout (by default much longer
        # than the one used for all later iterations).
        timeout = args.first_iter_timeout if i == 0 else args.timeout
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_net_name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )

    # -1 marks "no test result available".
    test_accuracy = -1
    test_accuracy_top5 = -1
    if test_model is not None:
        test_net_name = test_model.net.Proto().name
        accuracy_sum = 0
        accuracy_top5_sum = 0
        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_net_name)
            for g in test_model._devices:
                device_prefix = "{}_{}".format(test_model._device_prefix, g)
                # ndarray.item() is what the removed np.asscalar() called.
                accuracy_sum += workspace.FetchBlob(
                    device_prefix + '/accuracy'
                ).item()
                accuracy_top5_sum += workspace.FetchBlob(
                    device_prefix + '/accuracy_top5'
                ).item()
                ntests += 1
        if ntests > 0:
            test_accuracy = accuracy_sum / ntests
            test_accuracy_top5 = accuracy_top5_sum / ntests
        else:
            # Happens when test_epoch_size is smaller than one global batch
            # (or there are no devices); previously a ZeroDivisionError.
            log.warning("No test iterations were run; test accuracy unavailable")

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'top1_test_accuracy': test_accuracy,
            'top5_test_accuracy': test_accuracy_top5,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1
| |
|
| |
|
def Train(args):
    '''
    Build the training (and optional test) nets described by ``args`` and
    train for ``args.num_epochs`` epochs, saving the model after every epoch.

    Steps, in order: pick devices and validate batch/epoch sizes, set up the
    rendezvous for multi-machine runs, build the parallelized training model,
    optionally build the test model, optionally resume from a saved model,
    then loop over epochs (RunEpoch + SaveModel, deleting the previous
    epoch's model file).

    Args:
        args: parsed command-line arguments as produced by main().
            ``args.epoch_size`` is rounded down in place to a multiple of the
            global batch size.

    Raises:
        AssertionError: if the batch size is not divisible by the number of
            devices, if a per-channel mean/std list does not match
            ``num_channels``, or if the epoch is smaller than one global batch.
    '''
    if args.model == "resnext":
        model_name = "resnext" + str(args.num_layers)
    elif args.model == "shufflenet":
        model_name = "shufflenet"

    # Devices: an explicit --gpus list takes priority over --num_gpus.
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # The per-machine batch is split evenly across the local devices.
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Per-channel statistics, when given, need one entry per input channel.
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"

    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round the epoch size down to a whole number of global batches.
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Operator defaults for the training model.
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name=model_name, arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Only the first listed interface is handed to the rendezvous.
    interfaces = args.distributed_interfaces.split(",")

    # Default to "no rendezvous" (single machine). Binding the name up front
    # also covers an MPI launch with a world size of 1, which previously left
    # `rendezvous` undefined and failed with a NameError further down.
    rendezvous = None
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        # Launched under Open MPI: shard layout comes from the environment.
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Rendezvous through a key/value store: Redis when a host was given,
        # otherwise a directory visible to all processes.
        store_handler = "store_handler"
        if args.redis_host is not None:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    def initializer_arg_scope():
        """Return the arg scope shared by both forward-pass builders."""
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)
        return brew.arg_scope([brew.conv, brew.fc],
                              WeightInitializer=initializer,
                              BiasInitializer=initializer,
                              enable_tensor_core=args.enable_tensor_core,
                              float16_compute=args.float16_compute)

    def add_loss_and_accuracy(model, pred, loss_scale):
        """Add softmax loss (scaled by loss_scale) and top-1/top-5 accuracy."""
        if args.dtype == 'float16':
            # The loss is computed on a float copy of the half prediction.
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    # Forward-pass builder for ResNe(X)t.
    def create_resnext_model_ops(model, loss_scale):
        with initializer_arg_scope():
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )
        return add_loss_and_accuracy(model, pred, loss_scale)

    # Forward-pass builder for ShuffleNet.
    def create_shufflenet_model_ops(model, loss_scale):
        with initializer_arg_scope():
            pred = shufflenet.create_shufflenet(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_loss=True,
            )
        return add_loss_and_accuracy(model, pred, loss_scale)

    def add_optimizer(model):
        # Step policy: multiply the learning rate by 0.1 every 30 epochs'
        # worth of iterations.
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Input builder: synthetic data for "null", otherwise a single DB reader
    # that every device's input operator pulls from.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    forward_pass_builder = (
        create_resnext_model_ops if args.model == "resnext"
        else create_shufflenet_model_ops
    )

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=forward_pass_builder,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        use_nccl=args.use_nccl,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Optional test model, built without initialising parameters of its own.
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name=model_name + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=forward_pass_builder,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            use_nccl=args.use_nccl,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # Resume from a previously saved model, if requested.
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # The epoch to resume at is parsed from a "*_<epoch>.mdl" file name
        # (the scheme SaveModel writes); any other name restarts at epoch 0.
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    # Use the effective device count: args.num_gpus stays at its default
    # when the devices were chosen through --gpus.
    expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % (
        model_name,
        num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Main loop: one RunEpoch call per epoch, each followed by a model save.
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        SaveModel(args, train_model, epoch, args.use_ideep)

        # Keep only the newest file: drop the previous epoch's, if present.
        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        previous_model_file = model_path + str(epoch - 1) + ".mdl"
        if os.path.isfile(previous_model_file):
            os.remove(previous_model_file)
| |
|
| |
|
def _str_to_bool(value):
    '''
    Convert a command-line token into a bool.

    ``type=bool`` in argparse treats every non-empty string, including
    "False" and "0", as True. This parser accepts the usual spellings and
    rejects anything else. The empty string still maps to False, as before.
    '''
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value)
    )


def main():
    '''
    Parse the command-line arguments and start training.
    '''
    parser = argparse.ArgumentParser(
        description="Caffe2: ImageNet Trainer"
    )
    parser.add_argument("--train_data", type=str, default=None, required=True,
                        help="Path to training data (or 'null' to simulate)")
    parser.add_argument("--num_layers", type=int, default=50,
                        help="The number of layers in ResNe(X)t model")
    parser.add_argument("--resnext_num_groups", type=int, default=1,
                        help="The cardinality of resnext")
    parser.add_argument("--resnext_width_per_group", type=int, default=64,
                        help="The width of each group in resnext")
    parser.add_argument("--test_data", type=str, default=None,
                        help="Path to test data")
    parser.add_argument("--image_mean_per_channel", type=float, nargs='+',
                        help="The per channel mean for the images")
    parser.add_argument("--image_std_per_channel", type=float, nargs='+',
                        help="The per channel standard deviation for the images")
    parser.add_argument("--test_epoch_size", type=int, default=50000,
                        help="Number of test images")
    parser.add_argument("--db_type", type=str, default="lmdb",
                        help="Database type (such as lmdb or leveldb)")
    parser.add_argument("--gpus", type=str,
                        help="Comma separated list of GPU devices to use")
    parser.add_argument("--num_gpus", type=int, default=1,
                        help="Number of GPU devices (instead of --gpus)")
    parser.add_argument("--num_channels", type=int, default=3,
                        help="Number of color channels")
    parser.add_argument("--image_size", type=int, default=224,
                        help="Input image size (to crop to)")
    parser.add_argument("--num_labels", type=int, default=1000,
                        help="Number of labels")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, total over all GPUs")
    parser.add_argument("--epoch_size", type=int, default=1500000,
                        help="Number of images/epoch, total over all machines")
    parser.add_argument("--num_epochs", type=int, default=1000,
                        help="Num epochs.")
    parser.add_argument("--base_learning_rate", type=float, default=0.1,
                        help="Initial learning rate.")
    parser.add_argument("--weight_decay", type=float, default=1e-4,
                        help="Weight decay (L2 regularization)")
    parser.add_argument("--cudnn_workspace_limit_mb", type=int, default=64,
                        help="CuDNN workspace limit in MBs")
    parser.add_argument("--num_shards", type=int, default=1,
                        help="Number of machines in distributed run")
    parser.add_argument("--shard_id", type=int, default=0,
                        help="Shard id.")
    parser.add_argument("--run_id", type=str,
                        help="Unique run identifier (e.g. uuid)")
    parser.add_argument("--redis_host", type=str,
                        help="Host of Redis server (for rendezvous)")
    parser.add_argument("--redis_port", type=int, default=6379,
                        help="Port of Redis server (for rendezvous)")
    parser.add_argument("--file_store_path", type=str, default="/tmp",
                        help="Path to directory to use for rendezvous")
    parser.add_argument("--save_model_name", type=str, default="resnext_model",
                        help="Save the trained model to a given name")
    parser.add_argument("--load_model_path", type=str, default=None,
                        help="Load previously saved model to continue training")
    parser.add_argument("--use_cpu", action="store_true",
                        help="Use CPU instead of GPU")
    parser.add_argument("--use_nccl", action="store_true",
                        help="Use nccl for inter-GPU collectives")
    # Still takes an explicit value ("--use_ideep True"), but "False"/"0"
    # now really disable it instead of being truthy strings.
    parser.add_argument("--use_ideep", type=_str_to_bool, default=False,
                        help="Use ideep")
    parser.add_argument('--dtype', default='float',
                        choices=['float', 'float16'],
                        help='Data type used for training')
    parser.add_argument('--float16_compute', action='store_true',
                        help="Use float 16 compute, if available")
    parser.add_argument('--enable_tensor_core', action='store_true',
                        help='Enable Tensor Core math for Conv and FC ops')
    parser.add_argument("--distributed_transport", type=str, default="tcp",
                        help="Transport to use for distributed run [tcp|ibverbs]")
    parser.add_argument("--distributed_interfaces", type=str, default="",
                        help="Network interfaces to use for distributed run")

    parser.add_argument("--first_iter_timeout", type=int, default=1200,
                        help="Timeout (secs) of the first iteration "
                        "(default: %(default)s)")
    parser.add_argument("--timeout", type=int, default=60,
                        help="Timeout (secs) of each (except the first) iteration "
                        "(default: %(default)s)")
    parser.add_argument("--model",
                        default="resnext", const="resnext", nargs="?",
                        choices=["shufflenet", "resnext"],
                        help="List of models which can be run")
    args = parser.parse_args()

    Train(args)
| |
|
| |
|
if __name__ == "__main__":
    # Initialise the Caffe2 runtime with its own flags before handing
    # control to the argparse-driven entry point.
    caffe2_init_argv = ['caffe2', '--caffe2_log_level=2']
    workspace.GlobalInit(caffe2_init_argv)
    main()
| |
|