|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """Common functionalities used by both Keras and Estimator implementations."""
|
|
|
| from __future__ import absolute_import
|
| from __future__ import division
|
| from __future__ import print_function
|
|
|
| import json
|
| import os
|
|
|
| from absl import flags
|
| from absl import logging
|
| import numpy as np
|
| import tensorflow as tf, tf_keras
|
|
|
| from official.common import distribute_utils
|
| from official.recommendation import constants as rconst
|
| from official.recommendation import data_pipeline
|
| from official.recommendation import data_preprocessing
|
| from official.recommendation import movielens
|
| from official.utils.flags import core as flags_core
|
|
|
| FLAGS = flags.FLAGS
|
|
|
|
|
def get_inputs(params):
  """Prepares the dataset and returns parameters used by the model.

  Downloads the MovieLens dataset when requested, seeds NumPy when a seed
  flag is supplied, then builds either a synthetic or a real data producer.

  Args:
    params: Dict of hyperparameters forwarded to the real data pipeline.

  Returns:
    A tuple of (num_users, num_items, num_train_steps, num_eval_steps,
    producer).
  """
  if not FLAGS.use_synthetic_data and FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if not FLAGS.use_synthetic_data:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch
  else:
    # Synthetic mode: no files are read; user/item counts come from a
    # static table and step counts are fixed constants.
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = movielens.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH

  return num_users, num_items, num_train_steps, num_eval_steps, producer
|
|
|
|
|
def parse_flags(flags_obj):
  """Converts parsed flags into a params dict consumed by the model.

  Args:
    flags_obj: An absl FlagValues object (typically `flags.FLAGS`).

  Returns:
    A dict of hyperparameters and runtime settings derived from the flags.
  """
  # Evaluation falls back to the training batch size when no explicit
  # eval batch size was given.
  eval_batch_size = flags_obj.eval_batch_size or flags_obj.batch_size
  # TPU runs stream data from files, hence `stream_files` mirrors `use_tpu`.
  use_tpu = flags_obj.tpu is not None

  return {
      "train_epochs": flags_obj.train_epochs,
      "batches_per_step": 1,
      "use_seed": flags_obj.seed is not None,
      "batch_size": flags_obj.batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": flags_obj.learning_rate,
      "mf_dim": flags_obj.num_factors,
      "model_layers": [int(layer) for layer in flags_obj.layers],
      "mf_regularization": flags_obj.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in flags_obj.mlp_regularization],
      "num_neg": flags_obj.num_neg,
      "distribution_strategy": flags_obj.distribution_strategy,
      "num_gpus": flags_core.get_num_gpus(flags_obj),
      "use_tpu": use_tpu,
      "tpu": flags_obj.tpu,
      "tpu_zone": flags_obj.tpu_zone,
      "tpu_gcp_project": flags_obj.tpu_gcp_project,
      "beta1": flags_obj.beta1,
      "beta2": flags_obj.beta2,
      "epsilon": flags_obj.epsilon,
      "match_mlperf": flags_obj.ml_perf,
      "epochs_between_evals": flags_obj.epochs_between_evals,
      "keras_use_ctl": flags_obj.keras_use_ctl,
      "hr_threshold": flags_obj.hr_threshold,
      "stream_files": use_tpu,
      "train_dataset_path": flags_obj.train_dataset_path,
      "eval_dataset_path": flags_obj.eval_dataset_path,
      "input_meta_data_path": flags_obj.input_meta_data_path,
  }
|
|
|
|
|
def get_v1_distribution_strategy(params):
  """Returns the distribution strategy to use.

  Args:
    params: Dict produced by `parse_flags`; reads "use_tpu", "tpu",
      "tpu_zone", "tpu_gcp_project", and "num_gpus".

  Returns:
    A `tf.distribute.Strategy` (TPU strategy when "use_tpu" is set,
    otherwise whatever `distribute_utils` selects for the GPU count).

  Side effects:
    On the TPU path, resets the TPU system and overwrites the TF_CONFIG
    environment variable.
  """
  if params["use_tpu"]:
    # Quiet down some very chatty networking libraries. The absl logging
    # module imported at the top of this file does not expose `getLogger`,
    # so the stdlib logging module is required to adjust third-party
    # loggers.
    import logging as std_logging  # pylint: disable=g-import-not-at-top
    for name in [
        "googleapiclient.discovery", "googleapiclient.discovery_cache",
        "oauth2client.transport"
    ]:
      std_logging.getLogger(name).setLevel(std_logging.ERROR)

    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=params["tpu"],
        zone=params["tpu_zone"],
        project=params["tpu_gcp_project"],
        coordinator_name="coordinator")

    logging.info("Issuing reset command to TPU to ensure a clean state.")
    # NOTE(review): `tf.Session.reset` and `steps_per_run` below are
    # TF1-style APIs — this function presumably runs under compat.v1
    # behavior; confirm before porting to TF2-only environments.
    tf.Session.reset(tpu_cluster_resolver.get_master())

    # Estimator looks at the master it connects to for MonitoredTrainingSession
    # by reading the `TF_CONFIG` environment variable.
    tf_config_env = {
        "session_master":
            tpu_cluster_resolver.get_master(),
        "eval_session_master":
            tpu_cluster_resolver.get_master(),
        "coordinator":
            tpu_cluster_resolver.cluster_spec().as_dict()["coordinator"]
    }
    os.environ["TF_CONFIG"] = json.dumps(tf_config_env)

    distribution = tf.distribute.TPUStrategy(
        tpu_cluster_resolver, steps_per_run=100)

  else:
    distribution = distribute_utils.get_distribution_strategy(
        num_gpus=params["num_gpus"])

  return distribution
|
|
|
|
|
def define_ncf_flags():
  """Add flags for running ncf_main.

  Registers base/performance/device/benchmark flags from `flags_core`,
  MovieLens dataset flags, and all NCF-specific hyperparameter flags, and
  sets their defaults. Must be called exactly once before flag parsing.
  """
  # Add common flags.
  flags_core.define_base(
      model_dir=True,
      clean=True,
      train_epochs=True,
      epochs_between_evals=True,
      export_dir=False,
      run_eagerly=True,
      stop_threshold=True,
      num_gpu=True,
      distribution_strategy=True)
  flags_core.define_performance(
      synthetic_data=True,
      dtype=True,
      fp16_implementation=True,
      loss_scale=True,
      enable_xla=True,
  )
  flags_core.define_device(tpu=True)
  flags_core.define_benchmark()

  flags.adopt_module_key_flags(flags_core)

  movielens.define_flags()

  flags_core.set_defaults(
      model_dir="/tmp/ncf/",
      data_dir="/tmp/movielens-data/",
      dataset=movielens.ML_1M,
      train_epochs=2,
      batch_size=99000,
      tpu=None)

  # Add ncf-specific flags
  flags.DEFINE_boolean(
      name="download_if_missing",
      default=True,
      help=flags_core.help_wrap(
          "Download data to data_dir if it is not already present."))

  # Fixed: the adjacent string literals below previously lacked separating
  # spaces, producing run-together words in the rendered help text.
  flags.DEFINE_integer(
      name="eval_batch_size",
      default=None,
      help=flags_core.help_wrap(
          "The batch size used for evaluation. This should generally be larger "
          "than the training batch size as the lack of back propagation during "
          "evaluation can allow for larger batch sizes to fit in memory. If "
          "not specified, the training batch size (--batch_size) will be "
          "used."))

  flags.DEFINE_integer(
      name="num_factors",
      default=8,
      help=flags_core.help_wrap("The Embedding size of MF model."))

  # Set the default as a list of strings to be consistent with input
  # arguments.
  flags.DEFINE_list(
      name="layers",
      default=["64", "32", "16", "8"],
      help=flags_core.help_wrap(
          "The sizes of hidden layers for MLP. Example "
          "to specify different sizes of MLP layers: --layers=32,16,8,4"))

  flags.DEFINE_float(
      name="mf_regularization",
      default=0.,
      help=flags_core.help_wrap(
          "The regularization factor for MF embeddings. The factor is used by "
          "regularizer which allows to apply penalties on layer parameters or "
          "layer activity during optimization."))

  flags.DEFINE_list(
      name="mlp_regularization",
      default=["0.", "0.", "0.", "0."],
      help=flags_core.help_wrap(
          "The regularization factor for each MLP layer. See mf_regularization "
          "help for more info about regularization factor."))

  flags.DEFINE_integer(
      name="num_neg",
      default=4,
      help=flags_core.help_wrap(
          "The Number of negative instances to pair with a positive instance."))

  flags.DEFINE_float(
      name="learning_rate",
      default=0.001,
      help=flags_core.help_wrap("The learning rate."))

  flags.DEFINE_float(
      name="beta1",
      default=0.9,
      help=flags_core.help_wrap("beta1 hyperparameter for the Adam optimizer."))

  flags.DEFINE_float(
      name="beta2",
      default=0.999,
      help=flags_core.help_wrap("beta2 hyperparameter for the Adam optimizer."))

  flags.DEFINE_float(
      name="epsilon",
      default=1e-8,
      help=flags_core.help_wrap("epsilon hyperparameter for the Adam "
                                "optimizer."))

  flags.DEFINE_float(
      name="hr_threshold",
      default=1.0,
      help=flags_core.help_wrap(
          "If passed, training will stop when the evaluation metric HR is "
          "greater than or equal to hr_threshold. For dataset ml-1m, the "
          "desired hr_threshold is 0.68 which is the result from the paper; "
          "For dataset ml-20m, the threshold can be set as 0.95 which is "
          "achieved by MLPerf implementation."))

  # Fixed: separating spaces were missing between the concatenated string
  # literals ("has aprecompute", "constructiontime").
  flags.DEFINE_enum(
      name="constructor_type",
      default="bisection",
      enum_values=["bisection", "materialized"],
      case_sensitive=False,
      help=flags_core.help_wrap(
          "Strategy to use for generating false negatives. materialized has a "
          "precompute that scales badly, but a faster per-epoch construction "
          "time and can be faster on very large systems."))

  flags.DEFINE_string(
      name="train_dataset_path",
      default=None,
      help=flags_core.help_wrap("Path to training data."))

  flags.DEFINE_string(
      name="eval_dataset_path",
      default=None,
      help=flags_core.help_wrap("Path to evaluation data."))

  flags.DEFINE_string(
      name="input_meta_data_path",
      default=None,
      help=flags_core.help_wrap("Path to input meta data file."))

  # Fixed typo: "soring algorithm" -> "sorting algorithm".
  flags.DEFINE_bool(
      name="ml_perf",
      default=False,
      help=flags_core.help_wrap(
          "If set, changes the behavior of the model slightly to match the "
          "MLPerf reference implementations here: \n"
          "https://github.com/mlperf/reference/tree/master/recommendation/"
          "pytorch\n"
          "The two changes are:\n"
          "1. When computing the HR and NDCG during evaluation, remove "
          "duplicate user-item pairs before the computation. This results in "
          "better HRs and NDCGs.\n"
          "2. Use a different sorting algorithm when sorting the input data, "
          "which performs better due to the fact the sorting algorithms are "
          "not stable."))

  flags.DEFINE_bool(
      name="output_ml_perf_compliance_logging",
      default=False,
      help=flags_core.help_wrap(
          "If set, output the MLPerf compliance logging. This is only useful "
          "if one is running the model for MLPerf. See "
          "https://github.com/mlperf/policies/blob/master/training_rules.adoc"
          "#submission-compliance-logs for details. This uses sudo and so may "
          "ask for your password, as root access is needed to clear the system "
          "caches, which is required for MLPerf compliance."))

  flags.DEFINE_integer(
      name="seed",
      default=None,
      help=flags_core.help_wrap(
          "This value will be used to seed both NumPy and TensorFlow."))

  @flags.validator(
      "eval_batch_size",
      "eval_batch_size must be at least {}".format(rconst.NUM_EVAL_NEGATIVES +
                                                   1))
  def eval_size_check(eval_batch_size):
    # Each positive example is evaluated against NUM_EVAL_NEGATIVES
    # negatives, so a batch must hold strictly more than that many entries.
    return (eval_batch_size is None or
            int(eval_batch_size) > rconst.NUM_EVAL_NEGATIVES)

  flags.DEFINE_bool(
      name="early_stopping",
      default=False,
      help=flags_core.help_wrap(
          "If True, we stop the training when it reaches hr_threshold"))

  flags.DEFINE_bool(
      name="keras_use_ctl",
      default=False,
      help=flags_core.help_wrap(
          "If True, we use a custom training loop for keras."))
|
|
|
|
|
def convert_to_softmax_logits(logits):
  """Convert the logits returned by the base model to softmax logits.

  Prepends a zero column so that a two-class softmax over the result is
  equivalent to a sigmoid over the original logits.

  Args:
    logits: used to create softmax.

  Returns:
    Softmax with the first column of zeros is equivalent to sigmoid.
  """
  # `logits * 0` builds the zero column with the same shape and dtype as
  # the input.
  zero_column = logits * 0
  return tf.concat([zero_column, logits], axis=1)
|
|
|