diff --git a/.gitattributes b/.gitattributes index c9bca540b7213f83530a2a192c42adbeff2ffc39..22803b2f7573fc4eaca05dbeb43623704f04f900 100644 --- a/.gitattributes +++ b/.gitattributes @@ -179,3 +179,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/rllib/env/__pycache__/multi_agent_episode.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/tune_controller.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2dce00862e332ab2a2cb553f61e08480461d75af Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/__pycache__/trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..083310977436fe5aef1556f2a364fcec03ccc1f3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/__pycache__/trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf4483616962b69a53fb2db9d060764369989181 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/train/examples/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/__pycache__/mlflow_simple_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/__pycache__/mlflow_simple_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edb0d02ab975a4c8f82a70fe37ac35c981b5c7f7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/__pycache__/mlflow_simple_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..646ffbdae23ac54aa80e2c68d14a0cfeb9c41dda Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_cifar_pbt_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_cifar_pbt_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19b09809bfc53f1c8138bdd2244243d71924a9bb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_cifar_pbt_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_example.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1b9637c8da3c1a4cb89e1ddeeea10b30c62cbfc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_pytorch_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_pytorch_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e50ece0277ae04d39ab88e4ccec672cddf01591 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_pytorch_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_tune_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_tune_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03807fc21420a8ca745005fc6e794eaeb653417a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/__pycache__/horovod_tune_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_cifar_pbt_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_cifar_pbt_example.py new file mode 100644 index 0000000000000000000000000000000000000000..d7be644d5de03bc7cda298ddaeba1f9417c6fea5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_cifar_pbt_example.py @@ -0,0 +1,210 @@ +import os +import tempfile + +import numpy as np +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from 
torchvision.models import resnet18 + +import ray +import ray.cloudpickle as cpickle +import ray.train.torch +from ray import train, tune +from ray.train import ( + Checkpoint, + CheckpointConfig, + FailureConfig, + RunConfig, + ScalingConfig, +) +from ray.train.horovod import HorovodTrainer +from ray.tune.schedulers import create_scheduler +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner +from ray.tune.utils.release_test_util import ProgressCallback + +# The long running version starts 4 trials while only 2 can be run at a time. +# Thus trials are paused and restored at all times so that every trial can make +# progress. The PBT scheduler also applies perturbation and mutation, +# which also involves pausing and restoring. +# The intention is to stress test the pausing and restoring of trials, +# especially that there should be no GPU memory leak. + +# TODO(ml-team): This test is very low signal at the moment. +# We should further trim it down. + +CIFAR10_STATS = { + "mean": (0.4914, 0.4822, 0.4465), + "std": (0.2023, 0.1994, 0.2010), +} + + +def train_loop_per_worker(config): + import horovod.torch as hvd + + hvd.init() + device = ray.train.torch.get_device() + net = resnet18().to(device) + optimizer = torch.optim.SGD( + net.parameters(), + lr=config["lr"], + ) + epoch = 0 + + checkpoint = train.get_checkpoint() + if checkpoint: + with checkpoint.as_directory() as checkpoint_dir: + with open(os.path.join(checkpoint_dir, "data.ckpt"), "rb") as fp: + checkpoint_dict = cpickle.load(fp) + + model_state = checkpoint_dict["model_state"] + optimizer_state = checkpoint_dict["optimizer_state"] + epoch = checkpoint_dict["epoch"] + 1 + + net.load_state_dict(model_state) + optimizer.load_state_dict(optimizer_state) + + criterion = nn.CrossEntropyLoss() + optimizer = hvd.DistributedOptimizer(optimizer) + np.random.seed(1 + hvd.rank()) + torch.manual_seed(1234) + # To ensure consistent initialization across workers, + 
hvd.broadcast_parameters(net.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + trainset = ray.get(config["data"]) + + train_sampler = torch.utils.data.distributed.DistributedSampler( + trainset, num_replicas=hvd.size(), rank=hvd.rank() + ) + + # Note, don't set `num_workers` in DataLoader (not even 1), + # as that will separately start multiple processes (each corresponding to 1 worker) + # to load the data. This is known to cause issues with Ray. + trainloader = DataLoader( + trainset, batch_size=int(config["batch_size"]), sampler=train_sampler + ) + + for current_epoch in range(epoch, 40): # loop over the dataset multiple times + running_loss = 0.0 + epoch_steps = 0 + for i, data in enumerate(trainloader): + # get the inputs; data is a list of [inputs, labels] + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + epoch_steps += 1 + + if i % 2000 == 1999: # print every 2000 mini-batches + print( + "[%d, %5d] loss: %.3f" + % (current_epoch + 1, i + 1, running_loss / epoch_steps) + ) + + if config["smoke_test"]: + break + + with tempfile.TemporaryDirectory() as checkpoint_dir: + with open(os.path.join(checkpoint_dir, "data.ckpt"), "wb") as fp: + cpickle.dump( + dict( + model_state=net.state_dict(), + optimizer_state=optimizer.state_dict(), + epoch=current_epoch, + ), + fp, + ) + checkpoint = Checkpoint.from_directory(checkpoint_dir) + train.report(dict(loss=running_loss / epoch_steps), checkpoint=checkpoint) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--smoke-test", action="store_true", help=("Finish quickly for testing.") + ) + args = parser.parse_args() + + if args.smoke_test: + ray.init() 
+ else: + ray.init(address="auto") # assumes ray is started with ray up + + transform_train = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]), + ] + ) # meanstd transformation + + dataset = torchvision.datasets.CIFAR10( + root="/tmp/data_cifar", train=True, download=True, transform=transform_train + ) + + horovod_trainer = HorovodTrainer( + train_loop_per_worker=train_loop_per_worker, + scaling_config=ScalingConfig( + use_gpu=False if args.smoke_test else True, + num_workers=2, + ), + train_loop_config={"batch_size": 64, "data": ray.put(dataset)}, + ) + + # ensure that checkpointing works. + pbt = create_scheduler( + "pbt", + perturbation_interval=1, # To make perturb more often. + hyperparam_mutations={ + "train_loop_config": {"lr": tune.uniform(0.001, 0.1)}, + }, + ) + + tuner = Tuner( + horovod_trainer, + param_space={ + "train_loop_config": { + "lr": 0.1 + if args.smoke_test + else tune.grid_search([0.1 * i for i in range(1, 5)]), # 4 trials + "smoke_test": args.smoke_test, + } + }, + tune_config=TuneConfig( + num_samples=2 if args.smoke_test else 1, + metric="loss", + mode="min", + scheduler=pbt, + ), + run_config=RunConfig( + stop={"training_iteration": 1} if args.smoke_test else None, + failure_config=FailureConfig(fail_fast=False), + checkpoint_config=CheckpointConfig(num_to_keep=1), + callbacks=[ProgressCallback()], + ), + ) + + result_grid = tuner.fit() + + # Make sure trials do not fail. 
+ for result in result_grid: + assert not result.error + + print("Best hyperparameters found were: ", result_grid.get_best_result().config) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_example.py new file mode 100644 index 0000000000000000000000000000000000000000..236814aa8afccd775c3b67ab96edf8d3da93a0c8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_example.py @@ -0,0 +1,286 @@ +import argparse +import os + +import horovod.torch as hvd +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from filelock import FileLock +from torchvision import datasets, transforms + +import ray +from ray import train +from ray.train import ScalingConfig +from ray.train.horovod import HorovodTrainer + + +def metric_average(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +def setup(config): + data_dir = config.get("data_dir", None) + seed = config.get("seed", 42) + batch_size = config.get("batch_size", 64) + use_adasum = config.get("use_adasum", False) + lr = config.get("lr", 0.01) + momentum = config.get("momentum", 0.5) + use_cuda = config.get("use_cuda", False) + + # Horovod: initialize library. 
+ hvd.init() + torch.manual_seed(seed) + + if use_cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(seed) + + # Horovod: limit # of CPU threads to be used per worker. + torch.set_num_threads(1) + + kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} + data_dir = data_dir or "~/data" + with FileLock(os.path.expanduser("~/.horovod_lock")): + train_dataset = datasets.MNIST( + data_dir, + train=True, + download=True, + transform=transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ), + ) + # Horovod: use DistributedSampler to partition the training data. + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank() + ) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs + ) + + model = Net() + + # By default, Adasum doesn't need scaling up learning rate. + lr_scaler = hvd.size() if not use_adasum else 1 + + if use_cuda: + # Move model to GPU. + model.cuda() + # If using GPU Adasum allreduce, scale learning rate by local_size. + if use_adasum and hvd.nccl_built(): + lr_scaler = hvd.local_size() + + # Horovod: scale learning rate by lr_scaler. + optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler, momentum=momentum) + + # Horovod: wrap optimizer with DistributedOptimizer. + optimizer = hvd.DistributedOptimizer( + optimizer, + named_parameters=model.named_parameters(), + op=hvd.Adasum if use_adasum else hvd.Average, + ) + + return model, optimizer, train_loader, train_sampler + + +def train_epoch( + model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda +): + loss = None + model.train() + # Horovod: set epoch to sampler for shuffling. 
+ train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if use_cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % log_interval == 0: + # Horovod: use train_sampler to determine the number of + # examples in this worker's partition. + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_sampler), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + return loss.item() if loss else None + + +# Horovod function API. + + +def train_func(config): + num_epochs = config.get("num_epochs", 10) + log_interval = config.get("log_interval", 10) + use_cuda = config.get("use_cuda", False) + + model, optimizer, train_loader, train_sampler = setup(config) + + for epoch in range(num_epochs): + loss = train_epoch( + model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda + ) + train.report(dict(loss=loss)) + + +def main(num_workers, use_gpu, kwargs): + trainer = HorovodTrainer( + train_func, + train_loop_config=kwargs, + scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=num_workers), + ) + results = trainer.fit() + print(results.metrics) + + +# Horovod Class API. 
+ + +class HorovodTrainClass: + def __init__(self, config): + self.log_interval = config.get("log_interval", 10) + self.use_cuda = config.get("use_cuda", False) + + if self.use_cuda: + torch.cuda.set_device(hvd.local_rank()) + + self.model, self.optimizer, self.train_loader, self.train_sampler = setup( + config + ) + + def train(self, epoch): + loss = train_epoch( + self.model, + self.optimizer, + self.train_sampler, + self.train_loader, + epoch, + self.log_interval, + self.use_cuda, + ) + return loss + + +if __name__ == "__main__": + # Training settings + parser = argparse.ArgumentParser( + description="PyTorch MNIST Example", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--num-epochs", + type=int, + default=5, + metavar="N", + help="number of epochs to train (default: 10)", + ) + parser.add_argument( + "--lr", + type=float, + default=0.01, + metavar="LR", + help="learning rate (default: 0.01)", + ) + parser.add_argument( + "--momentum", + type=float, + default=0.5, + metavar="M", + help="SGD momentum (default: 0.5)", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="enables CUDA training" + ) + parser.add_argument( + "--seed", type=int, default=42, metavar="S", help="random seed (default: 42)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--use-adasum", + action="store_true", + default=False, + help="use adasum algorithm to do reduction", + ) + parser.add_argument( + "--num-workers", + type=int, + default=2, + help="Number of Ray workers to use for training.", + ) + parser.add_argument( + "--data-dir", + help="location of the training dataset in the local filesystem (" + "will be downloaded if needed)", + ) + 
parser.add_argument( + "--address", + required=False, + type=str, + default=None, + help="Address of Ray cluster.", + ) + + args = parser.parse_args() + + if args.address: + ray.init(args.address) + else: + ray.init() + + use_cuda = args.use_gpu if args.use_gpu is not None else False + + kwargs = { + "data_dir": args.data_dir, + "seed": args.seed, + "use_cuda": use_cuda, + "batch_size": args.batch_size, + "use_adasum": args.use_adasum if args.use_adasum else False, + "lr": args.lr, + "momentum": args.momentum, + "num_epochs": args.num_epochs, + "log_interval": args.log_interval, + } + + main(num_workers=args.num_workers, use_gpu=use_cuda, kwargs=kwargs) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_pytorch_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_pytorch_example.py new file mode 100644 index 0000000000000000000000000000000000000000..d20ed51e9ef7e39c467948954816e91e509cff9c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_pytorch_example.py @@ -0,0 +1,270 @@ +import argparse +import os +import tempfile + +import horovod.torch as hvd +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from filelock import FileLock +from torchvision import datasets, transforms + +import ray.train.torch +from ray import train +from ray.train import Checkpoint, ScalingConfig +from ray.train.horovod import HorovodTrainer + + +def metric_average(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = 
F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +def setup(config): + data_dir = config.get("data_dir", None) + seed = config.get("seed", 42) + batch_size = config.get("batch_size", 64) + use_adasum = config.get("use_adasum", False) + lr = config.get("lr", 0.01) + momentum = config.get("momentum", 0.5) + use_cuda = config.get("use_cuda", False) + + # Horovod: initialize library. + hvd.init() + torch.manual_seed(seed) + + if use_cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(seed) + + # Horovod: limit # of CPU threads to be used per worker. + torch.set_num_threads(1) + + kwargs = {"pin_memory": True} if use_cuda else {} + data_dir = data_dir or "~/data" + with FileLock(os.path.expanduser("~/.horovod_lock")): + train_dataset = datasets.MNIST( + data_dir, + train=True, + download=True, + transform=transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ), + ) + # Horovod: use DistributedSampler to partition the training data. + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank() + ) + # Note, don't set `num_workers` in DataLoader (not even 1), + # as that will separately start multiple processes (each corresponding to 1 worker) + # to load the data. This is known to cause issues with Ray. + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs + ) + + model = Net() + + # By default, Adasum doesn't need scaling up learning rate. + lr_scaler = hvd.size() if not use_adasum else 1 + + if use_cuda: + # Move model to GPU. + model.cuda() + # If using GPU Adasum allreduce, scale learning rate by local_size. 
+ if use_adasum and hvd.nccl_built(): + lr_scaler = hvd.local_size() + + # Horovod: scale learning rate by lr_scaler. + optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler, momentum=momentum) + + # Horovod: wrap optimizer with DistributedOptimizer. + optimizer = hvd.DistributedOptimizer( + optimizer, + named_parameters=model.named_parameters(), + op=hvd.Adasum if use_adasum else hvd.Average, + ) + + return model, optimizer, train_loader, train_sampler + + +def train_epoch( + model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda +): + loss = None + model.train() + # Horovod: set epoch to sampler for shuffling. + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if use_cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % log_interval == 0: + # Horovod: use train_sampler to determine the number of + # examples in this worker's partition. + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_sampler), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + return loss.item() if loss else None + + +def train_func(config): + num_epochs = config.get("num_epochs", 10) + log_interval = config.get("log_interval", 10) + use_cuda = config.get("use_cuda", False) + + model, optimizer, train_loader, train_sampler = setup(config) + + results = [] + for epoch in range(num_epochs): + loss = train_epoch( + model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda + ) + results.append(loss) + with tempfile.TemporaryDirectory() as tmpdir: + torch.save(model.state_dict(), os.path.join(tmpdir, "model.pt")) + train.report({"loss": loss}, checkpoint=Checkpoint.from_directory(tmpdir)) + + # Only used for testing. 
+ return results + + +def main(num_workers, use_gpu, kwargs): + trainer = HorovodTrainer( + train_loop_per_worker=train_func, + train_loop_config={ + "num_epochs": kwargs["num_epochs"], + "log_interval": kwargs["log_interval"], + "use_cuda": kwargs["use_cuda"], + }, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + ) + result = trainer.fit() + print(result) + + +if __name__ == "__main__": + # Training settings + parser = argparse.ArgumentParser( + description="PyTorch MNIST Example", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--num-epochs", + type=int, + default=5, + metavar="N", + help="number of epochs to train (default: 10)", + ) + parser.add_argument( + "--lr", + type=float, + default=0.01, + metavar="LR", + help="learning rate (default: 0.01)", + ) + parser.add_argument( + "--momentum", + type=float, + default=0.5, + metavar="M", + help="SGD momentum (default: 0.5)", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="enables CUDA training" + ) + parser.add_argument( + "--seed", type=int, default=42, metavar="S", help="random seed (default: 42)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--use-adasum", + action="store_true", + default=False, + help="use adasum algorithm to do reduction", + ) + parser.add_argument( + "--num-workers", + type=int, + default=2, + help="Number of Ray workers to use for training.", + ) + parser.add_argument( + "--data-dir", + help="location of the training dataset in the local filesystem (" + "will be downloaded if needed)", + ) + parser.add_argument( + "--address", + required=False, + type=str, + default=None, + help="Address of Ray cluster.", + ) + + 
args = parser.parse_args() + + if args.address: + ray.init(args.address) + else: + ray.init() + + use_cuda = args.use_gpu if args.use_gpu is not None else False + + kwargs = { + "data_dir": args.data_dir, + "seed": args.seed, + "use_cuda": use_cuda, + "batch_size": args.batch_size, + "use_adasum": args.use_adasum if args.use_adasum else False, + "lr": args.lr, + "momentum": args.momentum, + "num_epochs": args.num_epochs, + "log_interval": args.log_interval, + } + + main(num_workers=args.num_workers, use_gpu=use_cuda, kwargs=kwargs) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_tune_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_tune_example.py new file mode 100644 index 0000000000000000000000000000000000000000..9433d50635ad0a8db585aefc454dd6c9649685a3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/horovod/horovod_tune_example.py @@ -0,0 +1,139 @@ +import time + +import numpy as np +import torch + +import ray +import ray.train.torch +from ray import train, tune +from ray.train import ScalingConfig +from ray.train.horovod import HorovodTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner + + +def sq(x): + m2 = 1.0 + m1 = -20.0 + m0 = 50.0 + return m2 * x * x + m1 * x + m0 + + +def qu(x): + m3 = 10.0 + m2 = 5.0 + m1 = -20.0 + m0 = -5.0 + return m3 * x * x * x + m2 * x * x + m1 * x + m0 + + +class Net(torch.nn.Module): + def __init__(self, mode="sq"): + super(Net, self).__init__() + + if mode == "square": + self.mode = 0 + self.param = torch.nn.Parameter(torch.FloatTensor([1.0, -1.0])) + else: + self.mode = 1 + self.param = torch.nn.Parameter(torch.FloatTensor([1.0, -1.0, 1.0])) + + def forward(self, x): + if ~self.mode: + return x * x + self.param[0] * x + self.param[1] + else: + return_val = 10 * x * x * x + return_val += self.param[0] * x * x + return_val += self.param[1] * x + self.param[2] + return return_val + + +def 
train_loop_per_worker(config): + import horovod.torch as hvd + import torch + + hvd.init() + device = ray.train.torch.get_device() + mode = config["mode"] + net = Net(mode).to(device) + optimizer = torch.optim.SGD( + net.parameters(), + lr=config["lr"], + ) + optimizer = hvd.DistributedOptimizer(optimizer) + + num_steps = 5 + print(hvd.size()) + np.random.seed(1 + hvd.rank()) + torch.manual_seed(1234) + # To ensure consistent initialization across workers, + hvd.broadcast_parameters(net.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + start = time.time() + x_max = config["x_max"] + for step in range(1, num_steps + 1): + features = torch.Tensor(np.random.rand(1) * 2 * x_max - x_max).to(device) + if mode == "square": + labels = sq(features) + else: + labels = qu(features) + optimizer.zero_grad() + outputs = net(features) + loss = torch.nn.MSELoss()(outputs, labels) + loss.backward() + + optimizer.step() + time.sleep(0.1) + train.report(dict(loss=loss.item())) + total = time.time() - start + print(f"Took {total:0.3f} s. 
Avg: {total / num_steps:0.3f} s.") + + +def tune_horovod(num_workers, num_samples, use_gpu, mode="square", x_max=1.0): + horovod_trainer = HorovodTrainer( + train_loop_per_worker=train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + train_loop_config={"mode": mode, "x_max": x_max}, + ) + + tuner = Tuner( + horovod_trainer, + param_space={"train_loop_config": {"lr": tune.uniform(0.1, 1)}}, + tune_config=TuneConfig(mode="min", metric="loss", num_samples=num_samples), + _tuner_kwargs={"fail_fast": True}, + ) + + result_grid = tuner.fit() + + print("Best hyperparameters found were: ", result_grid.get_best_result().config) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", type=str, default="square", choices=["square", "cubic"] + ) + parser.add_argument( + "--learning_rate", type=float, default=0.1, dest="learning_rate" + ) + parser.add_argument("--x_max", type=float, default=1.0, dest="x_max") + parser.add_argument("--gpu", action="store_true") + parser.add_argument( + "--smoke-test", action="store_true", help=("Finish quickly for testing.") + ) + parser.add_argument("--num-workers", type=int, default=2) + args, _ = parser.parse_known_args() + + if args.smoke_test: + ray.init(num_cpus=3) + + tune_horovod( + num_workers=args.num_workers, + num_samples=2 if args.smoke_test else 10, + use_gpu=args.gpu, + mode=args.mode, + x_max=args.x_max, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..024675238ac9d234b2de1f9eeec567adb2528f80 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_fashion_mnist_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_fashion_mnist_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4c856b4f2afcd729ea172dd1571b2a98872b71c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_fashion_mnist_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_linear_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_linear_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a86932e6b2de5c807942bf45c3ef54404e408f93 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_linear_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_quick_start.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_quick_start.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81176fc02476405d728b5078668ac056855d69f6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_quick_start.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_regression_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_regression_example.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..5e06dc3bfc0a969d86eaf9ec9e62332b4555e58e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/torch_regression_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/tune_cifar_torch_pbt_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/tune_cifar_torch_pbt_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7323ae6502b3bc4d6403614a26354c259ebbefcc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/tune_cifar_torch_pbt_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/tune_torch_regression_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/tune_torch_regression_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91015b20aefbcd7689cb88252fc0a7dbc45b875b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/__pycache__/tune_torch_regression_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5adeeb93fca714b51c1460c951a56ecd6946127c Binary 
files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__pycache__/auto_pipeline_for_host_to_device_data_transfer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__pycache__/auto_pipeline_for_host_to_device_data_transfer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e692020c26550e696de81c4212815d10a6543414 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/__pycache__/auto_pipeline_for_host_to_device_data_transfer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py new file mode 100644 index 0000000000000000000000000000000000000000..28fe7461bc3c30dd1631bc30454b6a5dc64a9696 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py @@ -0,0 +1,161 @@ +# The PyTorch data transfer benchmark script. 
+import argparse +import warnings + +import numpy as np +import torch +import torch.nn as nn + +import ray.train as train +from ray.train import ScalingConfig +from ray.train.torch import TorchTrainer + + +class Net(nn.Module): + def __init__(self, in_d, hidden): + # output dim = 1 + super(Net, self).__init__() + dims = [in_d] + hidden + [1] + self.layers = nn.ModuleList( + [nn.Linear(dims[i - 1], dims[i]) for i in range(len(dims))] + ) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class BenchmarkDataset(torch.utils.data.Dataset): + """Create a naive dataset for the benchmark""" + + def __init__(self, dim, size=1000): + self.x = torch.from_numpy(np.random.normal(size=(size, dim))).float() + self.y = torch.from_numpy(np.random.normal(size=(size, 1))).float() + self.size = size + + def __getitem__(self, index): + return self.x[index, None], self.y[index, None] + + def __len__(self): + return self.size + + +def train_epoch(epoch, dataloader, model, loss_fn, optimizer): + if train.get_context().get_world_size() > 1: + dataloader.sampler.set_epoch(epoch) + + for X, y in dataloader: + # Compute prediction error + pred = model(X) + loss = loss_fn(pred, y) + + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def train_func(config): + data_size = config.get("data_size", 4096 * 50) + batch_size = config.get("batch_size", 4096) + hidden_size = config.get("hidden_size", 1) + use_auto_transfer = config.get("use_auto_transfer", False) + lr = config.get("lr", 1e-2) + epochs = config.get("epochs", 10) + + train_dataset = BenchmarkDataset(4096, size=data_size) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=batch_size, shuffle=True + ) + + train_loader = train.torch.prepare_data_loader( + data_loader=train_loader, move_to_device=True, auto_transfer=use_auto_transfer + ) + + model = Net(in_d=4096, hidden=[4096] * hidden_size) + model = train.torch.prepare_model(model) + + loss_fn = 
nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + choice = "with" if use_auto_transfer else "without" + print(f"Starting the torch data prefetch benchmark {choice} auto pipeline...") + + torch.cuda.synchronize() + start.record() + for epoch in range(epochs): + train_epoch(epoch, train_loader, model, loss_fn, optimizer) + end.record() + torch.cuda.synchronize() + + print( + f"Finished the torch data prefetch benchmark {choice} " + f"auto pipeline: {start.elapsed_time(end)} ms." + ) + + return "Experiment done." + + +def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epochs=3): + config = { + "lr": 1e-2, + "hidden_size": num_hidden_layers, + "batch_size": 4096, + "epochs": epochs, + "use_auto_transfer": use_auto_transfer, + } + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config=ScalingConfig(use_gpu=True, num_workers=num_workers), + ) + results = trainer.fit() + + print(results.metrics) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--epochs", type=int, default=1, help="Number of epochs to train for." + ) + parser.add_argument( + "--num_hidden_layers", + type=int, + default=1, + help="Number of epochs to train for.", + ) + + args, _ = parser.parse_known_args() + + import ray + + ray.init(address=args.address) + + if not torch.cuda.is_available(): + warnings.warn("GPU is not available. 
Skip the test using auto pipeline.") + else: + train_linear( + num_workers=1, + num_hidden_layers=args.num_hidden_layers, + use_auto_transfer=True, + epochs=args.epochs, + ) + + torch.cuda.empty_cache() + train_linear( + num_workers=1, + num_hidden_layers=args.num_hidden_layers, + use_auto_transfer=False, + epochs=args.epochs, + ) + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_fashion_mnist_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_fashion_mnist_example.py new file mode 100644 index 0000000000000000000000000000000000000000..e26ed51ad6f4d572305f83892af39a7074311bd4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_fashion_mnist_example.py @@ -0,0 +1,152 @@ +import os +from typing import Dict + +import torch +from filelock import FileLock +from torch import nn +from torch.utils.data import DataLoader +from torchvision import datasets, transforms +from torchvision.transforms import Normalize, ToTensor +from tqdm import tqdm + +import ray.train +from ray.train import ScalingConfig +from ray.train.torch import TorchTrainer + + +def get_dataloaders(batch_size): + # Transform to normalize the input images + transform = transforms.Compose([ToTensor(), Normalize((0.5,), (0.5,))]) + + with FileLock(os.path.expanduser("~/data.lock")): + # Download training data from open datasets + training_data = datasets.FashionMNIST( + root="~/data", + train=True, + download=True, + transform=transform, + ) + + # Download test data from open datasets + test_data = datasets.FashionMNIST( + root="~/data", + train=False, + download=True, + transform=transform, + ) + + # Create data loaders + train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True) + test_dataloader = DataLoader(test_data, batch_size=batch_size) + + return train_dataloader, test_dataloader + + +# Model Definition +class NeuralNetwork(nn.Module): + def __init__(self): + 
super(NeuralNetwork, self).__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28 * 28, 512), + nn.ReLU(), + nn.Dropout(0.25), + nn.Linear(512, 512), + nn.ReLU(), + nn.Dropout(0.25), + nn.Linear(512, 10), + nn.ReLU(), + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return logits + + +def train_func_per_worker(config: Dict): + lr = config["lr"] + epochs = config["epochs"] + batch_size = config["batch_size_per_worker"] + + # Get dataloaders inside the worker training function + train_dataloader, test_dataloader = get_dataloaders(batch_size=batch_size) + + # [1] Prepare Dataloader for distributed training + # Shard the datasets among workers and move batches to the correct device + # ======================================================================= + train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader) + test_dataloader = ray.train.torch.prepare_data_loader(test_dataloader) + + model = NeuralNetwork() + + # [2] Prepare and wrap your model with DistributedDataParallel + # Move the model to the correct GPU/CPU device + # ============================================================ + model = ray.train.torch.prepare_model(model) + + loss_fn = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9) + + # Model training loop + for epoch in range(epochs): + if ray.train.get_context().get_world_size() > 1: + # Required for the distributed sampler to shuffle properly across epochs. 
+ train_dataloader.sampler.set_epoch(epoch) + + model.train() + for X, y in tqdm(train_dataloader, desc=f"Train Epoch {epoch}"): + pred = model(X) + loss = loss_fn(pred, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + model.eval() + test_loss, num_correct, num_total = 0, 0, 0 + with torch.no_grad(): + for X, y in tqdm(test_dataloader, desc=f"Test Epoch {epoch}"): + pred = model(X) + loss = loss_fn(pred, y) + + test_loss += loss.item() + num_total += y.shape[0] + num_correct += (pred.argmax(1) == y).sum().item() + + test_loss /= len(test_dataloader) + accuracy = num_correct / num_total + + # [3] Report metrics to Ray Train + # =============================== + ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy}) + + +def train_fashion_mnist(num_workers=2, use_gpu=False): + global_batch_size = 32 + + train_config = { + "lr": 1e-3, + "epochs": 10, + "batch_size_per_worker": global_batch_size // num_workers, + } + + # Configure computation resources + scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu) + + # Initialize a Ray TorchTrainer + trainer = TorchTrainer( + train_loop_per_worker=train_func_per_worker, + train_loop_config=train_config, + scaling_config=scaling_config, + ) + + # [4] Start distributed training + # Run `train_func_per_worker` on all workers + # ============================================= + result = trainer.fit() + print(f"Training result: {result}") + + +if __name__ == "__main__": + train_fashion_mnist(num_workers=4, use_gpu=True) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_linear_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_linear_example.py new file mode 100644 index 0000000000000000000000000000000000000000..19d5848473b91719f488d19a3364ddc55da6d899 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_linear_example.py @@ -0,0 +1,147 @@ +import argparse +import os +import tempfile + 
+import numpy as np +import torch +import torch.nn as nn + +import ray.train as train +from ray.train import Checkpoint, RunConfig, ScalingConfig +from ray.train.torch import TorchTrainer + + +class LinearDataset(torch.utils.data.Dataset): + """y = a * x + b""" + + def __init__(self, a, b, size=1000): + x = np.arange(0, 10, 10 / size, dtype=np.float32) + self.x = torch.from_numpy(x) + self.y = torch.from_numpy(a * x + b) + + def __getitem__(self, index): + return self.x[index, None], self.y[index, None] + + def __len__(self): + return len(self.x) + + +def train_epoch(epoch, dataloader, model, loss_fn, optimizer): + if train.get_context().get_world_size() > 1: + dataloader.sampler.set_epoch(epoch) + + for X, y in dataloader: + # Compute prediction error + pred = model(X) + loss = loss_fn(pred, y) + + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def validate_epoch(dataloader, model, loss_fn): + num_batches = len(dataloader) + model.eval() + loss = 0 + with torch.no_grad(): + for X, y in dataloader: + pred = model(X) + loss += loss_fn(pred, y).item() + loss /= num_batches + import copy + + model_copy = copy.deepcopy(model) + return model_copy.cpu().state_dict(), loss + + +def train_func(config): + data_size = config.get("data_size", 1000) + val_size = config.get("val_size", 400) + batch_size = config.get("batch_size", 32) + hidden_size = config.get("hidden_size", 1) + lr = config.get("lr", 1e-2) + epochs = config.get("epochs", 3) + + train_dataset = LinearDataset(2, 5, size=data_size) + val_dataset = LinearDataset(2, 5, size=val_size) + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) + validation_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size) + + train_loader = train.torch.prepare_data_loader(train_loader) + validation_loader = train.torch.prepare_data_loader(validation_loader) + + model = nn.Linear(1, hidden_size) + model = train.torch.prepare_model(model) + + loss_fn = 
nn.MSELoss() + + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + + results = [] + for epoch in range(epochs): + train_epoch(epoch, train_loader, model, loss_fn, optimizer) + state_dict, loss = validate_epoch(validation_loader, model, loss_fn) + result = dict(loss=loss) + results.append(result) + + with tempfile.TemporaryDirectory() as tmpdir: + torch.save(state_dict, os.path.join(tmpdir, "model.pt")) + train.report(result, checkpoint=Checkpoint.from_directory(tmpdir)) + + return results + + +def train_linear(num_workers=2, use_gpu=False, epochs=3, storage_path=None): + config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} + trainer = TorchTrainer( + train_loop_per_worker=train_func, + train_loop_config=config, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + run_config=RunConfig(storage_path=storage_path), + ) + result = trainer.fit() + + print(result.metrics) + return result.metrics + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--use-gpu", action="store_true", help="Whether to use GPU for training." + ) + parser.add_argument( + "--epochs", type=int, default=3, help="Number of epochs to train for." + ) + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + + args, _ = parser.parse_known_args() + + import ray + + if args.smoke_test: + # 2 workers + 1 for trainer. 
+ ray.init(num_cpus=3) + train_linear() + else: + ray.init(address=args.address) + train_linear( + num_workers=args.num_workers, use_gpu=args.use_gpu, epochs=args.epochs + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_quick_start.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_quick_start.py new file mode 100644 index 0000000000000000000000000000000000000000..df1ae3461bd7aaac9d5c5d98c604115df36d3760 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_quick_start.py @@ -0,0 +1,110 @@ +# ruff: noqa +# fmt: off +# isort: skip_file + +# __torch_setup_begin__ +import torch +import torch.nn as nn +from torch.utils.data import DataLoader +from torchvision import datasets +from torchvision.transforms import ToTensor + +def get_dataset(): + return datasets.FashionMNIST( + root="/tmp/data", + train=True, + download=True, + transform=ToTensor(), + ) + +class NeuralNetwork(nn.Module): + def __init__(self): + super().__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28 * 28, 512), + nn.ReLU(), + nn.Linear(512, 512), + nn.ReLU(), + nn.Linear(512, 10), + ) + + def forward(self, inputs): + inputs = self.flatten(inputs) + logits = self.linear_relu_stack(inputs) + return logits +# __torch_setup_end__ + +# __torch_single_begin__ +def train_func(): + num_epochs = 3 + batch_size = 64 + + dataset = get_dataset() + dataloader = DataLoader(dataset, batch_size=batch_size) + + model = NeuralNetwork() + + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + + for epoch in range(num_epochs): + for inputs, labels in dataloader: + optimizer.zero_grad() + pred = model(inputs) + loss = criterion(pred, labels) + loss.backward() + optimizer.step() + print(f"epoch: {epoch}, loss: {loss.item()}") +# __torch_single_end__ + +# __torch_distributed_begin__ +import ray.train.torch + +def train_func_distributed(): + 
num_epochs = 3 + batch_size = 64 + + dataset = get_dataset() + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + dataloader = ray.train.torch.prepare_data_loader(dataloader) + + model = NeuralNetwork() + model = ray.train.torch.prepare_model(model) + + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + + for epoch in range(num_epochs): + if ray.train.get_context().get_world_size() > 1: + dataloader.sampler.set_epoch(epoch) + + for inputs, labels in dataloader: + optimizer.zero_grad() + pred = model(inputs) + loss = criterion(pred, labels) + loss.backward() + optimizer.step() + print(f"epoch: {epoch}, loss: {loss.item()}") +# __torch_distributed_end__ + + +if __name__ == "__main__": + # __torch_single_run_begin__ + train_func() + # __torch_single_run_end__ + + # __torch_trainer_begin__ + from ray.train.torch import TorchTrainer + from ray.train import ScalingConfig + + # For GPU Training, set `use_gpu` to True. + use_gpu = False + + trainer = TorchTrainer( + train_func_distributed, + scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu) + ) + + results = trainer.fit() + # __torch_trainer_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_regression_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_regression_example.py new file mode 100644 index 0000000000000000000000000000000000000000..8bd54fbcb7ab2e8c778b220c39187e0ad0430ca5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/torch_regression_example.py @@ -0,0 +1,160 @@ +import argparse +import os +import tempfile +from typing import Tuple + +import pandas as pd +import torch +import torch.nn as nn + +import ray +import ray.train as train +from ray.data import Dataset +from ray.train import Checkpoint, DataConfig, ScalingConfig +from ray.train.torch import TorchTrainer + + +def get_datasets(split: float = 0.7) -> Tuple[Dataset]: + dataset = 
ray.data.read_csv("s3://anonymous@air-example-data/regression.csv") + + def combine_x(batch): + return pd.DataFrame( + { + "x": batch[[f"x{i:03d}" for i in range(100)]].values.tolist(), + "y": batch["y"], + } + ) + + dataset = dataset.map_batches(combine_x, batch_format="pandas") + train_dataset, validation_dataset = dataset.repartition( + num_blocks=4 + ).train_test_split(split, shuffle=True) + return train_dataset, validation_dataset + + +def train_epoch(iterable_dataset, model, loss_fn, optimizer, device): + model.train() + for X, y in iterable_dataset: + X = X.to(device) + y = y.to(device) + + # Compute prediction error + pred = model(X) + loss = loss_fn(pred, y) + + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def validate_epoch(iterable_dataset, model, loss_fn, device): + num_batches = 0 + model.eval() + loss = 0 + with torch.no_grad(): + for X, y in iterable_dataset: + X = X.to(device) + y = y.to(device) + num_batches += 1 + pred = model(X) + loss += loss_fn(pred, y).item() + loss /= num_batches + result = {"loss": loss} + return result + + +def train_func(config): + batch_size = config.get("batch_size", 32) + hidden_size = config.get("hidden_size", 10) + lr = config.get("lr", 1e-2) + epochs = config.get("epochs", 3) + + train_dataset_shard = train.get_dataset_shard("train") + validation_dataset = train.get_dataset_shard("validation") + + model = nn.Sequential( + nn.Linear(100, hidden_size), nn.ReLU(), nn.Linear(hidden_size, 1) + ) + model = train.torch.prepare_model(model) + + loss_fn = nn.L1Loss() + + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + + results = [] + + def create_torch_iterator(shard): + iterator = shard.iter_torch_batches(batch_size=batch_size) + for batch in iterator: + yield batch["x"].float(), batch["y"].float() + + for _ in range(epochs): + train_torch_dataset = create_torch_iterator(train_dataset_shard) + validation_torch_dataset = create_torch_iterator(validation_dataset) + + device = 
train.torch.get_device() + + train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) + if train.get_context().get_world_rank() == 0: + result = validate_epoch(validation_torch_dataset, model, loss_fn, device) + else: + result = {} + results.append(result) + + with tempfile.TemporaryDirectory() as tmpdir: + torch.save(model.module.state_dict(), os.path.join(tmpdir, "model.pt")) + train.report(result, checkpoint=Checkpoint.from_directory(tmpdir)) + + return results + + +def train_regression(num_workers=2, use_gpu=False): + train_dataset, val_dataset = get_datasets() + config = {"lr": 1e-2, "hidden_size": 20, "batch_size": 4, "epochs": 3} + + trainer = TorchTrainer( + train_loop_per_worker=train_func, + train_loop_config=config, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + datasets={"train": train_dataset, "validation": val_dataset}, + dataset_config=DataConfig(datasets_to_split=["train"]), + ) + + result = trainer.fit() + print(result.metrics) + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="Use GPU for training." 
+ ) + + args, _ = parser.parse_known_args() + + if args.smoke_test: + # 2 workers, 1 for trainer, 1 for datasets + ray.init(num_cpus=4) + result = train_regression() + else: + ray.init(address=args.address) + result = train_regression(num_workers=args.num_workers, use_gpu=args.use_gpu) + print(result) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py new file mode 100644 index 0000000000000000000000000000000000000000..00b5694884bd01b752b72a14a8a606fc12ad3052 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py @@ -0,0 +1,253 @@ +import argparse +import os +import tempfile + +import torch +import torch.nn as nn +import torchvision.transforms as transforms +from filelock import FileLock +from torch.utils.data import DataLoader, Subset +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet18 + +import ray +import ray.cloudpickle as cpickle +from ray import train, tune +from ray.train import Checkpoint, FailureConfig, RunConfig, ScalingConfig +from ray.train.torch import TorchTrainer +from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner + + +def train_epoch(epoch, dataloader, model, loss_fn, optimizer): + if ray.train.get_context().get_world_size() > 1: + dataloader.sampler.set_epoch(epoch) + + size = len(dataloader.dataset) // train.get_context().get_world_size() + model.train() + for batch, (X, y) in enumerate(dataloader): + # Compute prediction error + pred = model(X) + loss = loss_fn(pred, y) + + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if batch % 100 == 0: + loss, current = loss.item(), batch * len(X) + print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") + + +def validate_epoch(dataloader, model, 
loss_fn): + size = len(dataloader.dataset) // train.get_context().get_world_size() + num_batches = len(dataloader) + model.eval() + test_loss, correct = 0, 0 + with torch.no_grad(): + for X, y in dataloader: + pred = model(X) + test_loss += loss_fn(pred, y).item() + correct += (pred.argmax(1) == y).type(torch.float).sum().item() + test_loss /= num_batches + correct /= size + print( + f"Test Error: \n " + f"Accuracy: {(100 * correct):>0.1f}%, " + f"Avg loss: {test_loss:>8f} \n" + ) + return {"loss": test_loss} + + +def update_optimizer_config(optimizer, config): + for param_group in optimizer.param_groups: + for param, val in config.items(): + param_group[param] = val + + +def train_func(config): + epochs = config.get("epochs", 3) + + model = resnet18() + + # Note that `prepare_model` needs to be called before setting optimizer. + if not train.get_checkpoint(): # fresh start + model = train.torch.prepare_model(model) + + # Create optimizer. + optimizer_config = { + "lr": config.get("lr"), + "momentum": config.get("momentum"), + } + optimizer = torch.optim.SGD(model.parameters(), **optimizer_config) + + starting_epoch = 0 + if train.get_checkpoint(): + with train.get_checkpoint().as_directory() as checkpoint_dir: + with open(os.path.join(checkpoint_dir, "data.ckpt"), "rb") as fp: + checkpoint_dict = cpickle.load(fp) + + # Load in model + model_state = checkpoint_dict["model"] + model.load_state_dict(model_state) + model = train.torch.prepare_model(model) + + # Load in optimizer + optimizer_state = checkpoint_dict["optimizer_state_dict"] + optimizer.load_state_dict(optimizer_state) + + # Optimizer configs (`lr`, `momentum`) are being mutated by PBT and passed in + # through config, so we need to update the optimizer loaded from the checkpoint + update_optimizer_config(optimizer, optimizer_config) + + # The current epoch increments the loaded epoch by 1 + checkpoint_epoch = checkpoint_dict["epoch"] + starting_epoch = checkpoint_epoch + 1 + + # Load in training and 
validation data. + transform_train = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) # meanstd transformation + + transform_test = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + + data_dir = config.get("data_dir", os.path.expanduser("~/data")) + os.makedirs(data_dir, exist_ok=True) + with FileLock(os.path.join(data_dir, ".ray.lock")): + train_dataset = CIFAR10( + root=data_dir, train=True, download=True, transform=transform_train + ) + validation_dataset = CIFAR10( + root=data_dir, train=False, download=False, transform=transform_test + ) + + if config.get("test_mode"): + train_dataset = Subset(train_dataset, list(range(64))) + validation_dataset = Subset(validation_dataset, list(range(64))) + + worker_batch_size = config["batch_size"] // train.get_context().get_world_size() + + train_loader = DataLoader(train_dataset, batch_size=worker_batch_size, shuffle=True) + validation_loader = DataLoader(validation_dataset, batch_size=worker_batch_size) + + train_loader = train.torch.prepare_data_loader(train_loader) + validation_loader = train.torch.prepare_data_loader(validation_loader) + + # Create loss. 
+ criterion = nn.CrossEntropyLoss() + + for epoch in range(starting_epoch, epochs): + train_epoch(epoch, train_loader, model, criterion, optimizer) + result = validate_epoch(validation_loader, model, criterion) + + with tempfile.TemporaryDirectory() as checkpoint_dir: + with open(os.path.join(checkpoint_dir, "data.ckpt"), "wb") as fp: + cpickle.dump( + { + "epoch": epoch, + "model": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + }, + fp, + ) + checkpoint = Checkpoint.from_directory(checkpoint_dir) + train.report(result, checkpoint=checkpoint) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="The address to use for Redis." + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--num-epochs", type=int, default=5, help="Number of epochs to train." + ) + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="Enables GPU training." + ) + parser.add_argument( + "--data-dir", + required=False, + type=str, + default="~/data", + help="Root directory for storing downloaded dataset.", + ) + parser.add_argument( + "--synch", action="store_true", default=False, help="Use synchronous PBT." 
+ ) + + args, _ = parser.parse_known_args() + if args.smoke_test: + ray.init(num_cpus=4) + else: + ray.init(address=args.address) + + trainer = TorchTrainer( + train_func, + scaling_config=ScalingConfig( + num_workers=args.num_workers, use_gpu=args.use_gpu + ), + ) + pbt_scheduler = PopulationBasedTraining( + time_attr="training_iteration", + perturbation_interval=1, + hyperparam_mutations={ + "train_loop_config": { + # distribution for resampling + "lr": tune.loguniform(0.001, 0.1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + } + }, + synch=args.synch, + ) + + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.grid_search([0.001, 0.01, 0.05, 0.1]), + "momentum": 0.8, + "batch_size": 128 * args.num_workers, + "test_mode": args.smoke_test, # whether to to subset the data + "data_dir": args.data_dir, + "epochs": args.num_epochs, + } + }, + tune_config=TuneConfig( + num_samples=1, metric="loss", mode="min", scheduler=pbt_scheduler + ), + run_config=RunConfig( + stop={"training_iteration": 3 if args.smoke_test else args.num_epochs}, + failure_config=FailureConfig(max_failures=3), # used for fault tolerance + ), + ) + + results = tuner.fit() + + print(results.get_best_result(metric="loss", mode="min")) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/tune_torch_regression_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/tune_torch_regression_example.py new file mode 100644 index 0000000000000000000000000000000000000000..e8221c995110f5f8c5b6d48b25d87868a3f78c0c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch/tune_torch_regression_example.py @@ -0,0 +1,82 @@ +import argparse + +import ray +from ray import tune +from ray.train import DataConfig, ScalingConfig +from ray.train.examples.pytorch.torch_regression_example import get_datasets, train_func +from ray.train.torch import TorchTrainer +from 
ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner + + +def tune_linear(num_workers, num_samples, use_gpu): + train_dataset, val_dataset = get_datasets() + + config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3} + + trainer = TorchTrainer( + train_loop_per_worker=train_func, + train_loop_config=config, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + datasets={"train": train_dataset, "validation": val_dataset}, + dataset_config=DataConfig(datasets_to_split=["train"]), + ) + + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([4, 16, 32]), + "epochs": 3, + } + }, + tune_config=TuneConfig(num_samples=num_samples, metric="loss", mode="min"), + ) + result_grid = tuner.fit() + best_result = result_grid.get_best_result() + print(best_result) + return best_result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--num-samples", + type=int, + default=2, + help="Sets number of samples for training.", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="Use GPU for training." 
+ ) + + args = parser.parse_args() + + if args.smoke_test: + # 2 workers, 1 for trainer, 1 for datasets + ray.init(num_cpus=4) + tune_linear(num_workers=2, num_samples=1, use_gpu=False) + else: + ray.init(address=args.address) + tune_linear( + num_workers=args.num_workers, + use_gpu=args.use_gpu, + num_samples=args.num_samples, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d895d6faad274c9164c67a873a0dd2f53a5d976 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__pycache__/distributed_sage_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__pycache__/distributed_sage_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0efa4343f44e658b3effcc4faa36a9871e18ff1b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/__pycache__/distributed_sage_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/distributed_sage_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/distributed_sage_example.py new file mode 100644 index 0000000000000000000000000000000000000000..0f3b1e126d05a266b43496fa9ec3c097c3aaba19 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/train/examples/pytorch_geometric/distributed_sage_example.py @@ -0,0 +1,228 @@ +# Adapted from https://github.com/pyg-team/pytorch_geometric/blob/2.1.0 +# /examples/multi_gpu/distributed_sampling.py + +import argparse +import os + +import torch +import torch.nn.functional as F +from filelock import FileLock +from torch_geometric.datasets import FakeDataset, Reddit +from torch_geometric.loader import NeighborSampler +from torch_geometric.nn import SAGEConv +from torch_geometric.transforms import RandomNodeSplit + +from ray import train +from ray.train import ScalingConfig +from ray.train.torch import TorchTrainer + + +class SAGE(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2): + super().__init__() + self.num_layers = num_layers + + self.convs = torch.nn.ModuleList() + self.convs.append(SAGEConv(in_channels, hidden_channels)) + for _ in range(self.num_layers - 2): + self.convs.append(SAGEConv(hidden_channels, hidden_channels)) + self.convs.append(SAGEConv(hidden_channels, out_channels)) + + def forward(self, x, adjs): + for i, (edge_index, _, size) in enumerate(adjs): + x_target = x[: size[1]] # Target nodes are always placed first. 
+ x = self.convs[i]((x, x_target), edge_index) + if i != self.num_layers - 1: + x = F.relu(x) + x = F.dropout(x, p=0.5, training=self.training) + return x.log_softmax(dim=-1) + + @torch.no_grad() + def test(self, x_all, subgraph_loader): + for i in range(self.num_layers): + xs = [] + for batch_size, n_id, adj in subgraph_loader: + edge_index, _, size = adj + x = x_all[n_id.to(x_all.device)].to(train.torch.get_device()) + x_target = x[: size[1]] + x = self.convs[i]((x, x_target), edge_index) + if i != self.num_layers - 1: + x = F.relu(x) + xs.append(x.cpu()) + + x_all = torch.cat(xs, dim=0) + + return x_all + + +def train_loop_per_worker(train_loop_config): + dataset = train_loop_config["dataset_fn"]() + batch_size = train_loop_config["batch_size"] + num_epochs = train_loop_config["num_epochs"] + + data = dataset[0] + train_idx = data.train_mask.nonzero(as_tuple=False).view(-1) + train_idx = train_idx.split( + train_idx.size(0) // train.get_context().get_world_size() + )[train.get_context().get_world_rank()] + + train_loader = NeighborSampler( + data.edge_index, + node_idx=train_idx, + sizes=[25, 10], + batch_size=batch_size, + shuffle=True, + ) + + # Disable distributed sampler since the train_loader has already been split above. + train_loader = train.torch.prepare_data_loader(train_loader, add_dist_sampler=False) + + # Do validation on rank 0 worker only. 
+ if train.get_context().get_world_rank() == 0: + subgraph_loader = NeighborSampler( + data.edge_index, node_idx=None, sizes=[-1], batch_size=2048, shuffle=False + ) + subgraph_loader = train.torch.prepare_data_loader( + subgraph_loader, add_dist_sampler=False + ) + + model = SAGE(dataset.num_features, 256, dataset.num_classes) + model = train.torch.prepare_model(model) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + + x, y = data.x.to(train.torch.get_device()), data.y.to(train.torch.get_device()) + + for epoch in range(num_epochs): + model.train() + + # ``batch_size`` is the number of samples in the current batch. + # ``n_id`` are the ids of all the nodes used in the computation. This is + # needed to pull in the necessary features just for the current batch that is + # being trained on. + # ``adjs`` is a list of 3 element tuple consisting of ``(edge_index, e_id, + # size)`` for each sample in the batch, where ``edge_index``represent the + # edges of the sampled subgraph, ``e_id`` are the ids of the edges in the + # sample, and ``size`` holds the shape of the subgraph. + # See ``torch_geometric.loader.neighbor_sampler.NeighborSampler`` for more info. + for batch_size, n_id, adjs in train_loader: + optimizer.zero_grad() + out = model(x[n_id], adjs) + loss = F.nll_loss(out, y[n_id[:batch_size]]) + loss.backward() + optimizer.step() + + if train.get_context().get_world_rank() == 0: + print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}") + + train_accuracy = validation_accuracy = test_accuracy = None + + # Do validation on rank 0 worker only. 
+ if train.get_context().get_world_rank() == 0: + model.eval() + with torch.no_grad(): + out = model.module.test(x, subgraph_loader) + res = out.argmax(dim=-1) == data.y + train_accuracy = int(res[data.train_mask].sum()) / int( + data.train_mask.sum() + ) + validation_accuracy = int(res[data.val_mask].sum()) / int( + data.val_mask.sum() + ) + test_accuracy = int(res[data.test_mask].sum()) / int(data.test_mask.sum()) + + train.report( + dict( + train_accuracy=train_accuracy, + validation_accuracy=validation_accuracy, + test_accuracy=test_accuracy, + ) + ) + + +def gen_fake_dataset(): + """Returns a function to be called on each worker that returns a Fake Dataset.""" + + # For fake dataset, since the dataset is randomized, we create it once on the + # driver, and then send the same dataset to all the training workers. + # Use 10% of nodes for validation and 10% for testing. + fake_dataset = FakeDataset(transform=RandomNodeSplit(num_val=0.1, num_test=0.1)) + + def gen_dataset(): + return fake_dataset + + return gen_dataset + + +def gen_reddit_dataset(): + """Returns a function to be called on each worker that returns Reddit Dataset.""" + + # For Reddit dataset, we have to download the data on each node, so we create the + # dataset on each training worker. 
+ with FileLock(os.path.expanduser("~/.reddit_dataset_lock")): + dataset = Reddit("./data/Reddit") + return dataset + + +def train_gnn( + num_workers=2, use_gpu=False, epochs=3, global_batch_size=32, dataset="reddit" +): + per_worker_batch_size = global_batch_size // num_workers + + trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + train_loop_config={ + "num_epochs": epochs, + "batch_size": per_worker_batch_size, + "dataset_fn": gen_reddit_dataset + if dataset == "reddit" + else gen_fake_dataset(), + }, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + ) + result = trainer.fit() + print(result.metrics) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--use-gpu", action="store_true", help="Whether to use GPU for training." + ) + parser.add_argument( + "--epochs", type=int, default=3, help="Number of epochs to train for." + ) + parser.add_argument( + "--global-batch-size", + "-b", + type=int, + default=32, + help="Global batch size to use for training.", + ) + parser.add_argument( + "--dataset", + "-d", + type=str, + choices=["reddit", "fake"], + default="reddit", + help="The dataset to use. 
Either 'reddit' or 'fake' Defaults to 'reddit'.", + ) + + args, _ = parser.parse_known_args() + + train_gnn( + num_workers=args.num_workers, + use_gpu=args.use_gpu, + epochs=args.epochs, + global_batch_size=args.global_batch_size, + dataset=args.dataset, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9581a7511f678a6a148c8495204718f5fd58f1ae Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_autoencoder_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_autoencoder_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db37eb16bca1226989077c05d286885e37c46c03 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_autoencoder_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_mnist_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_mnist_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d42c7dbf0c8fa4581a45921aee139fc2ea74e09 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_mnist_example.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_quick_start.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_quick_start.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dc7bd8c71a0060507936c2e5d6da461690d9bd9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_quick_start.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_regression_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_regression_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1cf560b3f2bd2b96d6d80768556ae1d253233075 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tensorflow_regression_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tune_tensorflow_autoencoder_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tune_tensorflow_autoencoder_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..976c7072987738a0e451eb0e81195f69f539d12d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tune_tensorflow_autoencoder_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tune_tensorflow_mnist_example.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tune_tensorflow_mnist_example.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6545a360c5cf132bbad1b615dcc18a8538017326 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/__pycache__/tune_tensorflow_mnist_example.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_autoencoder_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_autoencoder_example.py new file mode 100644 index 0000000000000000000000000000000000000000..6d976203efdc9fee5bbfbb27a8aa39d080bea994 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_autoencoder_example.py @@ -0,0 +1,174 @@ +# This example showcases how to use Tensorflow with Ray Train. +# Original code: +# https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras +# https://blog.keras.io/building-autoencoders-in-keras.html +import argparse + +import numpy as np +import pandas as pd +import tensorflow as tf +import tensorflow_datasets as tfds + +import ray +from ray import train +from ray.air.integrations.keras import ReportCheckpointCallback +from ray.data.datasource import SimpleTensorFlowDatasource +from ray.data.extensions import TensorArray +from ray.train import Result, ScalingConfig +from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard + + +def get_dataset(split_type="train"): + def dataset_factory(): + return tfds.load("mnist", split=[split_type], as_supervised=True)[0].take(128) + + dataset = ray.data.read_datasource( + SimpleTensorFlowDatasource(), dataset_factory=dataset_factory + ) + + def normalize_images(x): + x = np.float32(x.numpy()) / 255.0 + x = np.reshape(x, (-1,)) + return x + + def preprocess_dataset(batch): + return [ + (normalize_images(image), normalize_images(image)) for image, _ in batch + ] + + dataset = dataset.map_batches(preprocess_dataset) + + def convert_batch_to_pandas(batch): + + images = [TensorArray(image) for image, _ in batch] + # because we did autoencoder here + df = pd.DataFrame({"image": images, "label": images}) + return df + + dataset = 
dataset.map_batches(convert_batch_to_pandas) + return dataset + + +def build_autoencoder_model() -> tf.keras.Model: + model = tf.keras.Sequential( + [ + tf.keras.Input(shape=(784,)), + # encoder + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dense(64, activation="relu"), + tf.keras.layers.Dense(32, activation="relu"), + # decoder + tf.keras.layers.Dense(64, activation="relu"), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dense(784, activation="sigmoid"), + ] + ) + return model + + +def train_func(config: dict): + + per_worker_batch_size = config.get("batch_size", 64) + epochs = config.get("epochs", 3) + + dataset_shard = train.get_dataset_shard("train") + + strategy = tf.distribute.MultiWorkerMirroredStrategy() + + with strategy.scope(): + # Model building/compiling need to be within `strategy.scope()`. + multi_worker_model = build_autoencoder_model() + learning_rate = config.get("lr", 0.001) + multi_worker_model.compile( + loss=tf.keras.losses.BinaryCrossentropy(), + optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), + metrics=[ + "binary_crossentropy", + ], + ) + + def to_tf_dataset(dataset, batch_size): + def to_tensor_iterator(): + for batch in dataset.iter_tf_batches( + batch_size=batch_size, dtypes=tf.float32 + ): + yield batch["image"], batch["label"] + + output_signature = ( + tf.TensorSpec(shape=(None, 784), dtype=tf.float32), + tf.TensorSpec(shape=(None, 784), dtype=tf.float32), + ) + tf_dataset = tf.data.Dataset.from_generator( + to_tensor_iterator, output_signature=output_signature + ) + return prepare_dataset_shard(tf_dataset) + + results = [] + for epoch in range(epochs): + tf_dataset = to_tf_dataset( + dataset=dataset_shard, + batch_size=per_worker_batch_size, + ) + history = multi_worker_model.fit( + tf_dataset, callbacks=[ReportCheckpointCallback()] + ) + results.append(history.history) + return results + + +def train_tensorflow_mnist( + num_workers: int = 2, use_gpu: bool = False, epochs: int 
= 4 +) -> Result: + train_dataset = get_dataset(split_type="train") + config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu) + trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=config, + datasets={"train": train_dataset}, + scaling_config=scaling_config, + ) + + results = trainer.fit() + print(results.metrics) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="Enables GPU training" + ) + parser.add_argument( + "--epochs", type=int, default=3, help="Number of epochs to train for." + ) + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + + args, _ = parser.parse_known_args() + + if args.smoke_test: + # 2 workers, 1 for trainer, 1 for datasets + num_gpus = args.num_workers if args.use_gpu else 0 + ray.init(num_cpus=4, num_gpus=num_gpus) + result = train_tensorflow_mnist(num_workers=2, use_gpu=args.use_gpu) + else: + ray.init(address=args.address) + result = train_tensorflow_mnist( + num_workers=args.num_workers, use_gpu=args.use_gpu, epochs=args.epochs + ) + print(result) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_mnist_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_mnist_example.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd5d7c759df6723a7a16a622d70c743519ed638 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_mnist_example.py @@ -0,0 +1,135 @@ +# This example showcases how to use Tensorflow with Ray Train. 
+# Original code: +# https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras +import argparse +import json +import os + +import numpy as np +import tensorflow as tf +from filelock import FileLock + +from ray.air.integrations.keras import ReportCheckpointCallback +from ray.train import Result, RunConfig, ScalingConfig +from ray.train.tensorflow import TensorflowTrainer + + +def mnist_dataset(batch_size: int) -> tf.data.Dataset: + with FileLock(os.path.expanduser("~/.mnist_lock")): + (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() + # The `x` arrays are in uint8 and have values in the [0, 255] range. + # You need to convert them to float32 with values in the [0, 1] range. + x_train = x_train / np.float32(255) + y_train = y_train.astype(np.int64) + train_dataset = ( + tf.data.Dataset.from_tensor_slices((x_train, y_train)) + .shuffle(60000) + .repeat() + .batch(batch_size) + ) + return train_dataset + + +def build_cnn_model() -> tf.keras.Model: + model = tf.keras.Sequential( + [ + tf.keras.Input(shape=(28, 28)), + tf.keras.layers.Reshape(target_shape=(28, 28, 1)), + tf.keras.layers.Conv2D(32, 3, activation="relu"), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) + return model + + +def train_func(config: dict): + per_worker_batch_size = config.get("batch_size", 64) + epochs = config.get("epochs", 3) + steps_per_epoch = config.get("steps_per_epoch", 70) + + tf_config = json.loads(os.environ["TF_CONFIG"]) + num_workers = len(tf_config["cluster"]["worker"]) + + strategy = tf.distribute.MultiWorkerMirroredStrategy() + + global_batch_size = per_worker_batch_size * num_workers + multi_worker_dataset = mnist_dataset(global_batch_size) + + with strategy.scope(): + # Model building/compiling need to be within `strategy.scope()`. 
+ multi_worker_model = build_cnn_model() + learning_rate = config.get("lr", 0.001) + multi_worker_model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), + metrics=["accuracy"], + ) + + history = multi_worker_model.fit( + multi_worker_dataset, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + callbacks=[ReportCheckpointCallback()], + ) + results = history.history + return results + + +def train_tensorflow_mnist( + num_workers: int = 2, + use_gpu: bool = False, + epochs: int = 4, + storage_path: str = None, +) -> Result: + config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=config, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + run_config=RunConfig(storage_path=storage_path), + ) + results = trainer.fit() + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="Enables GPU training" + ) + parser.add_argument( + "--epochs", type=int, default=3, help="Number of epochs to train for." 
+ ) + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + + args, _ = parser.parse_known_args() + + import ray + + if args.smoke_test: + # 2 workers, 1 for trainer, 1 for datasets + num_gpus = args.num_workers if args.use_gpu else 0 + ray.init(num_cpus=4, num_gpus=num_gpus) + train_tensorflow_mnist(num_workers=2, use_gpu=args.use_gpu) + else: + ray.init(address=args.address) + train_tensorflow_mnist( + num_workers=args.num_workers, use_gpu=args.use_gpu, epochs=args.epochs + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_quick_start.py b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_quick_start.py new file mode 100644 index 0000000000000000000000000000000000000000..4b078675960230461f3c6d493d10a3f56ec0ddea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_quick_start.py @@ -0,0 +1,87 @@ +# ruff: noqa +# fmt: off +# isort: skip_file + +# __tf_setup_begin__ +import sys +import numpy as np + +if sys.version_info >= (3, 12): + # Tensorflow is not installed for Python 3.12 because of keras compatibility. + sys.exit(0) +else: + import tensorflow as tf + +def mnist_dataset(batch_size): + (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() + # The `x` arrays are in uint8 and have values in the [0, 255] range. + # You need to convert them to float32 with values in the [0, 1] range. 
+ x_train = x_train / np.float32(255) + y_train = y_train.astype(np.int64) + train_dataset = tf.data.Dataset.from_tensor_slices( + (x_train, y_train)).shuffle(60000).repeat().batch(batch_size) + return train_dataset + + +def build_and_compile_cnn_model(): + model = tf.keras.Sequential([ + tf.keras.layers.InputLayer(input_shape=(28, 28)), + tf.keras.layers.Reshape(target_shape=(28, 28, 1)), + tf.keras.layers.Conv2D(32, 3, activation='relu'), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dense(10) + ]) + model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), + metrics=['accuracy']) + return model +# __tf_setup_end__ + +# __tf_single_begin__ +def train_func(): + batch_size = 64 + single_worker_dataset = mnist_dataset(batch_size) + single_worker_model = build_and_compile_cnn_model() + single_worker_model.fit(single_worker_dataset, epochs=3, steps_per_epoch=70) +# __tf_single_end__ + +# __tf_distributed_begin__ +import json +import os + +def train_func_distributed(): + per_worker_batch_size = 64 + # This environment variable will be set by Ray Train. + tf_config = json.loads(os.environ['TF_CONFIG']) + num_workers = len(tf_config['cluster']['worker']) + + strategy = tf.distribute.MultiWorkerMirroredStrategy() + + global_batch_size = per_worker_batch_size * num_workers + multi_worker_dataset = mnist_dataset(global_batch_size) + + with strategy.scope(): + # Model building/compiling need to be within `strategy.scope()`. + multi_worker_model = build_and_compile_cnn_model() + + multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70) +# __tf_distributed_end__ + +if __name__ == "__main__": + # __tf_single_run_begin__ + train_func() + # __tf_single_run_end__ + + # __tf_trainer_begin__ + from ray.train.tensorflow import TensorflowTrainer + from ray.train import ScalingConfig + + # For GPU Training, set `use_gpu` to True. 
+ use_gpu = False + + trainer = TensorflowTrainer(train_func_distributed, scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu)) + + trainer.fit() + # __tf_trainer_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_regression_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_regression_example.py new file mode 100644 index 0000000000000000000000000000000000000000..b4c80f88bd7f9e43a58a4561905ce1f0fe590a9a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tensorflow_regression_example.py @@ -0,0 +1,111 @@ +import argparse +import sys + +import ray +from ray import train +from ray.data.preprocessors import Concatenator +from ray.train import Result, ScalingConfig + +if sys.version_info >= (3, 12): + # Skip this test in Python 3.12+ because TensorFlow is not supported. + sys.exit(0) +else: + import tensorflow as tf + + from ray.air.integrations.keras import ReportCheckpointCallback + from ray.train.tensorflow import TensorflowTrainer + + +def build_model() -> tf.keras.Model: + model = tf.keras.Sequential( + [ + tf.keras.layers.InputLayer(input_shape=(100,)), + tf.keras.layers.Dense(10), + tf.keras.layers.Dense(1), + ] + ) + return model + + +def train_func(config: dict): + batch_size = config.get("batch_size", 64) + epochs = config.get("epochs", 3) + + strategy = tf.distribute.MultiWorkerMirroredStrategy() + with strategy.scope(): + # Model building/compiling need to be within `strategy.scope()`. 
+ multi_worker_model = build_model() + multi_worker_model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=config.get("lr", 1e-3)), + loss=tf.keras.losses.mean_absolute_error, + metrics=[tf.keras.metrics.mean_squared_error], + ) + + dataset = train.get_dataset_shard("train") + + results = [] + for _ in range(epochs): + tf_dataset = dataset.to_tf( + feature_columns="x", label_columns="y", batch_size=batch_size + ) + history = multi_worker_model.fit( + tf_dataset, callbacks=[ReportCheckpointCallback()] + ) + results.append(history.history) + return results + + +def train_tensorflow_regression(num_workers: int = 2, use_gpu: bool = False) -> Result: + dataset = ray.data.read_csv("s3://anonymous@air-example-data/regression.csv") + columns_to_concatenate = [f"x{i:03}" for i in range(100)] + preprocessor = Concatenator(columns=columns_to_concatenate, output_column_name="x") + dataset = preprocessor.fit_transform(dataset) + + config = {"lr": 1e-3, "batch_size": 32, "epochs": 4} + scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu) + trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + train_loop_config=config, + scaling_config=scaling_config, + datasets={"train": dataset}, + ) + results = trainer.fit() + print(results.metrics) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="Enables GPU training" + ) + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + + args, _ = parser.parse_known_args() + + if args.smoke_test: + # 2 workers, 1 for trainer, 1 for datasets + num_gpus = args.num_workers if args.use_gpu else 0 + 
ray.init(num_cpus=4, num_gpus=num_gpus) + result = train_tensorflow_regression(num_workers=2, use_gpu=args.use_gpu) + else: + ray.init(address=args.address) + result = train_tensorflow_regression( + num_workers=args.num_workers, use_gpu=args.use_gpu + ) + print(result) diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_mnist_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_mnist_example.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a1860516caa099ad4e1905212e8065a051a72f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_mnist_example.py @@ -0,0 +1,80 @@ +import argparse +import sys + +import ray +from ray import tune +from ray.train import ScalingConfig +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner + +if sys.version_info >= (3, 12): + # Skip this test in Python 3.12+ because TensorFlow is not supported. + exit(0) +else: + from ray.train.examples.tf.tensorflow_mnist_example import train_func + from ray.train.tensorflow import TensorflowTrainer + + +def tune_tensorflow_mnist( + num_workers: int = 2, num_samples: int = 2, use_gpu: bool = False +): + trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + ) + tuner = Tuner( + trainer, + tune_config=TuneConfig(num_samples=num_samples, metric="accuracy", mode="max"), + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": 3, + } + }, + ) + best_accuracy = tuner.fit().get_best_result().metrics["accuracy"] + print(f"Best accuracy config: {best_accuracy}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + parser.add_argument( + "--address", 
required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--num-samples", + type=int, + default=2, + help="Sets number of samples for training.", + ) + parser.add_argument( + "--use-gpu", action="store_true", default=False, help="Enables GPU training" + ) + + args = parser.parse_args() + + if args.smoke_test: + num_gpus = args.num_workers if args.use_gpu else 0 + ray.init(num_cpus=8, num_gpus=num_gpus) + tune_tensorflow_mnist(num_workers=2, num_samples=2, use_gpu=args.use_gpu) + else: + ray.init(address=args.address) + tune_tensorflow_mnist( + num_workers=args.num_workers, + num_samples=args.num_samples, + use_gpu=args.use_gpu, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a156e133d5facccb9b06707d4b6f15dad509ca17 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__init__.py @@ -0,0 +1,32 @@ +# isort: off +try: + import tensorflow as tf # noqa: F401 +except ModuleNotFoundError: + raise ModuleNotFoundError( + "TensorFlow isn't installed. To install TensorFlow, run 'pip install " + "tensorflow'." 
+ ) +# isort: on + +from ray.train.tensorflow.config import TensorflowConfig +from ray.train.tensorflow.tensorflow_checkpoint import TensorflowCheckpoint +from ray.train.tensorflow.tensorflow_predictor import TensorflowPredictor +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer +from ray.train.tensorflow.train_loop_utils import prepare_dataset_shard +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.tensorflow.tensorflow_trainer import ( # noqa: F811 + TensorflowTrainer, + ) + +__all__ = [ + "TensorflowCheckpoint", + "TensorflowConfig", + "prepare_dataset_shard", + "TensorflowPredictor", + "TensorflowTrainer", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41eb7fb57d4cd1cbe7b1156b24230b6249c58102 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a58e0cb730d22bb89aefe2e2a1b21ea97641e65 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/keras.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/keras.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fef580de1524d91ac66faeb98e52194849a8d27 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/keras.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f0565957bbac75d5a9effa59f750339ac5ced6c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..441347076e5fd549356f418f6ab99181697cdeb9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..feb05e482f109a60f40ff71b472f5719f1ef183d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/tensorflow_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/train_loop_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/train_loop_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e4e55a4baaf6ae3916697a7b991f8f48a365e6a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/__pycache__/train_loop_utils.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/config.py b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ae3baedb2a6fef55e65fb30af17d92d7e7192a93 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/config.py @@ -0,0 +1,60 @@ +import json +import logging +import os +from dataclasses import dataclass +from typing import List + +import ray +from ray.train._internal.utils import get_address_and_port +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.util import PublicAPI + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="beta") +@dataclass +class TensorflowConfig(BackendConfig): + @property + def backend_cls(self): + return _TensorflowBackend + + +def _setup_tensorflow_environment(worker_addresses: List[str], index: int): + """Set up distributed Tensorflow training information. + + This function should be called on each worker. + + Args: + worker_addresses: Addresses of all the workers. + index: Index (i.e. world rank) of the current worker. + """ + tf_config = { + "cluster": {"worker": worker_addresses}, + "task": {"type": "worker", "index": index}, + } + os.environ["TF_CONFIG"] = json.dumps(tf_config) + + +class _TensorflowBackend(Backend): + def on_start(self, worker_group: WorkerGroup, backend_config: TensorflowConfig): + # Compute URL for initializing distributed setup. + def get_url(): + address, port = get_address_and_port() + return f"{address}:{port}" + + urls = worker_group.execute(get_url) + + # Get setup tasks in order to throw errors on failure. 
+ setup_futures = [] + for i in range(len(worker_group)): + setup_futures.append( + worker_group.execute_single_async( + i, + _setup_tensorflow_environment, + worker_addresses=urls, + index=i, + ) + ) + ray.get(setup_futures) diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/keras.py b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/keras.py new file mode 100644 index 0000000000000000000000000000000000000000..3594779c8db1801e98aca325c7a40deff299cc56 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/keras.py @@ -0,0 +1,3 @@ +from ray.air.integrations.keras import ReportCheckpointCallback + +ReportCheckpointCallback.__module__ = "ray.train.tensorflow.keras" diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..183af6e843deea8d3322d7c9bb729961f3be3aa3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_checkpoint.py @@ -0,0 +1,155 @@ +import os +import shutil +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING, Optional + +import tensorflow as tf +from tensorflow import keras + +from ray.train._internal.framework_checkpoint import FrameworkCheckpoint +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + + +@PublicAPI(stability="beta") +class TensorflowCheckpoint(FrameworkCheckpoint): + """A :py:class:`~ray.train.Checkpoint` with TensorFlow-specific functionality.""" + + MODEL_FILENAME_KEY = "_model_filename" + + @classmethod + def from_model( + cls, + model: keras.Model, + *, + preprocessor: Optional["Preprocessor"] = None, + ) -> "TensorflowCheckpoint": + """Create a :py:class:`~ray.train.Checkpoint` that stores a Keras model. 
+ + The checkpoint created with this method needs to be paired with + `model` when used. + + Args: + model: The Keras model, whose weights are stored in the checkpoint. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :py:class:`TensorflowCheckpoint` containing the specified model. + + Examples: + + .. testcode:: + + from ray.train.tensorflow import TensorflowCheckpoint + import tensorflow as tf + + model = tf.keras.applications.resnet.ResNet101() + checkpoint = TensorflowCheckpoint.from_model(model) + + .. testoutput:: + :options: +MOCK + :hide: + + ... # Model may or may not be downloaded + + """ + tempdir = tempfile.mkdtemp() + filename = "model.keras" + model.save(Path(tempdir, filename).as_posix()) + + checkpoint = cls.from_directory(tempdir) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + checkpoint.update_metadata({cls.MODEL_FILENAME_KEY: filename}) + return checkpoint + + @classmethod + def from_h5( + cls, file_path: str, *, preprocessor: Optional["Preprocessor"] = None + ) -> "TensorflowCheckpoint": + """Create a :py:class:`~ray.train.Checkpoint` that stores a Keras + model from H5 format. + + The checkpoint generated by this method contains all the information needed. + Thus no `model` is needed to be supplied when using this checkpoint. + + Args: + file_path: The path to the .h5 file to load model from. This is the + same path that is used for ``model.save(path)``. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :py:class:`TensorflowCheckpoint` converted from h5 format. + + """ + if not os.path.isfile(file_path) or not file_path.endswith(".h5"): + raise ValueError( + "Please supply a h5 file path to `TensorflowCheckpoint.from_h5()`." 
+ ) + tempdir = tempfile.mkdtemp() + filename = os.path.basename(file_path) + new_checkpoint_file = Path(tempdir, filename).as_posix() + shutil.copy(file_path, new_checkpoint_file) + + checkpoint = cls.from_directory(tempdir) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + checkpoint.update_metadata({cls.MODEL_FILENAME_KEY: filename}) + return checkpoint + + @classmethod + def from_saved_model( + cls, dir_path: str, *, preprocessor: Optional["Preprocessor"] = None + ) -> "TensorflowCheckpoint": + """Create a :py:class:`~ray.train.Checkpoint` that stores a Keras + model from SavedModel format. + + The checkpoint generated by this method contains all the information needed. + Thus no `model` is needed to be supplied when using this checkpoint. + + Args: + dir_path: The directory containing the saved model. This is the same + directory as used by ``model.save(dir_path)``. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :py:class:`TensorflowCheckpoint` converted from SavedModel format. + + """ + if not os.path.isdir(dir_path): + raise ValueError( + "Please supply a directory to `TensorflowCheckpoint.from_saved_model`" + ) + tempdir = tempfile.mkdtemp() + # TODO(ml-team): Replace this with copytree() + os.rmdir(tempdir) + shutil.copytree(dir_path, tempdir) + + checkpoint = cls.from_directory(tempdir) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + # NOTE: The entire directory is the checkpoint. + checkpoint.update_metadata({cls.MODEL_FILENAME_KEY: "."}) + return checkpoint + + def get_model( + self, + ) -> tf.keras.Model: + """Retrieve the model stored in this checkpoint. + + Returns: + The Tensorflow Keras model stored in the checkpoint. + """ + metadata = self.get_metadata() + if self.MODEL_FILENAME_KEY not in metadata: + raise ValueError( + "`TensorflowCheckpoint` cannot retrieve the model if you override the " + "checkpoint metadata. Please use `Checkpoint.update_metadata` instead." 
+ ) + model_filename = metadata[self.MODEL_FILENAME_KEY] + with self.as_directory() as checkpoint_dir: + model_path = Path(checkpoint_dir, model_filename).as_posix() + return keras.models.load_model(model_path) diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..ab353c333727b8310da1d6f388eb1641d27f26e5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_predictor.py @@ -0,0 +1,247 @@ +import logging +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union + +import numpy as np +import tensorflow as tf + +from ray.air._internal.tensorflow_utils import convert_ndarray_batch_to_tf_tensor_batch +from ray.train._internal.dl_predictor import DLPredictor +from ray.train.predictor import DataBatchType +from ray.train.tensorflow import TensorflowCheckpoint +from ray.util import log_once +from ray.util.annotations import DeveloperAPI, PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="beta") +class TensorflowPredictor(DLPredictor): + """A predictor for TensorFlow models. + + Args: + model: A Tensorflow Keras model to use for predictions. + preprocessor: A preprocessor used to transform data batches prior + to prediction. + model_weights: List of weights to use for the model. + use_gpu: If set, the model will be moved to GPU on instantiation and + prediction happens on GPU. + """ + + def __init__( + self, + *, + model: Optional[tf.keras.Model] = None, + preprocessor: Optional["Preprocessor"] = None, + use_gpu: bool = False, + ): + self.use_gpu = use_gpu + # TensorFlow model objects cannot be pickled, therefore we use + # a callable that returns the model and initialize it here, + # instead of having an initialized model object as an attribute. 
+ # Predictors are not serializable (see the implementation of __reduce__) + # in the Predictor class, so we can safely store the initialized model + # as an attribute. + if use_gpu: + # TODO (jiaodong): #26249 Use multiple GPU devices with sharded input + with tf.device("GPU:0"): + self._model = model + else: + self._model = model + gpu_devices = tf.config.list_physical_devices("GPU") + if len(gpu_devices) > 0 and log_once("tf_predictor_not_using_gpu"): + logger.warning( + "You have `use_gpu` as False but there are " + f"{len(gpu_devices)} GPUs detected on host where " + "prediction will only use CPU. Please consider explicitly " + "setting `TensorflowPredictor(use_gpu=True)` or " + "`batch_predictor.predict(ds, num_gpus_per_worker=1)` to " + "enable GPU prediction." + ) + super().__init__(preprocessor) + + def __repr__(self): + fn_name = getattr(self._model, "__name__", self._model) + fn_name_str = "" + if fn_name: + fn_name_str = str(fn_name)[:40] + return ( + f"{self.__class__.__name__}(" + f"model={fn_name_str!r}, " + f"preprocessor={self._preprocessor!r}, " + f"use_gpu={self.use_gpu!r})" + ) + + @classmethod + def from_checkpoint( + cls, + checkpoint: TensorflowCheckpoint, + model_definition: Optional[ + Union[Callable[[], tf.keras.Model], Type[tf.keras.Model]] + ] = None, + use_gpu: Optional[bool] = False, + ) -> "TensorflowPredictor": + """Instantiate the predictor from a TensorflowCheckpoint. + + Args: + checkpoint: The checkpoint to load the model and preprocessor from. + model_definition: A callable that returns a TensorFlow Keras model + to use. Model weights will be loaded from the checkpoint. + This is only needed if the `checkpoint` was created from + `TensorflowCheckpoint.from_model`. + use_gpu: Whether GPU should be used during prediction. + """ + if model_definition: + raise DeprecationWarning( + "`model_definition` is deprecated. `TensorflowCheckpoint.from_model` " + "now saves the full model definition in .keras format." 
+ ) + + model = checkpoint.get_model() + preprocessor = checkpoint.get_preprocessor() + return cls( + model=model, + preprocessor=preprocessor, + use_gpu=use_gpu, + ) + + @DeveloperAPI + def call_model( + self, inputs: Union[tf.Tensor, Dict[str, tf.Tensor]] + ) -> Union[tf.Tensor, Dict[str, tf.Tensor]]: + """Runs inference on a single batch of tensor data. + + This method is called by `TorchPredictor.predict` after converting the + original data batch to torch tensors. + + Override this method to add custom logic for processing the model input or + output. + + Example: + + .. testcode:: + + # List outputs are not supported by default TensorflowPredictor. + def build_model() -> tf.keras.Model: + input = tf.keras.layers.Input(shape=1) + model = tf.keras.models.Model(inputs=input, outputs=[input, input]) + return model + + # Use a custom predictor to format model output as a dict. + class CustomPredictor(TensorflowPredictor): + def call_model(self, inputs): + model_output = super().call_model(inputs) + return { + str(i): model_output[i] for i in range(len(model_output)) + } + + import numpy as np + data_batch = np.array([[0.5], [0.6], [0.7]], dtype=np.float32) + + predictor = CustomPredictor(model=build_model()) + predictions = predictor.predict(data_batch) + + Args: + inputs: A batch of data to predict on, represented as either a single + TensorFlow tensor or for multi-input models, a dictionary of tensors. + + Returns: + The model outputs, either as a single tensor or a dictionary of tensors. + + """ + if self.use_gpu: + with tf.device("GPU:0"): + return self._model(inputs) + else: + return self._model(inputs) + + def predict( + self, + data: DataBatchType, + dtype: Optional[Union[tf.dtypes.DType, Dict[str, tf.dtypes.DType]]] = None, + ) -> DataBatchType: + """Run inference on data batch. + + If the provided data is a single array or a dataframe/table with a single + column, it will be converted into a single Tensorflow tensor before being + inputted to the model. 
+ + If the provided data is a multi-column table or a dict of numpy arrays, + it will be converted into a dict of tensors before being inputted to the + model. This is useful for multi-modal inputs (for example your model accepts + both image and text). + + Args: + data: A batch of input data. Either a pandas DataFrame or numpy + array. + dtype: The dtypes to use for the tensors. Either a single dtype for all + tensors or a mapping from column name to dtype. + + Examples: + + .. testcode:: + + import numpy as np + import tensorflow as tf + from ray.train.tensorflow import TensorflowPredictor + + def build_model(): + return tf.keras.Sequential( + [ + tf.keras.layers.InputLayer(input_shape=()), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(1), + ] + ) + + weights = [np.array([[2.0]]), np.array([0.0])] + predictor = TensorflowPredictor(model=build_model()) + + data = np.asarray([1, 2, 3]) + predictions = predictor.predict(data) + + import pandas as pd + import tensorflow as tf + from ray.train.tensorflow import TensorflowPredictor + + def build_model(): + input1 = tf.keras.layers.Input(shape=(1,), name="A") + input2 = tf.keras.layers.Input(shape=(1,), name="B") + merged = tf.keras.layers.Concatenate(axis=1)([input1, input2]) + output = tf.keras.layers.Dense(2, input_dim=2)(merged) + return tf.keras.models.Model( + inputs=[input1, input2], outputs=output) + + predictor = TensorflowPredictor(model=build_model()) + + # Pandas dataframe. + data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + + predictions = predictor.predict(data) + + Returns: + DataBatchType: Prediction result. The return type will be the same as the + input type. 
+ """ + return super(TensorflowPredictor, self).predict(data=data, dtype=dtype) + + def _arrays_to_tensors( + self, + numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtype: Optional[Union[tf.dtypes.DType, Dict[str, tf.dtypes.DType]]], + ) -> Union[tf.Tensor, Dict[str, tf.Tensor]]: + return convert_ndarray_batch_to_tf_tensor_batch(numpy_arrays, dtypes=dtype) + + def _tensor_to_array(self, tensor: tf.Tensor) -> np.ndarray: + if not isinstance(tensor, tf.Tensor): + raise ValueError( + "Expected the model to return either a tf.Tensor or a " + f"dict of tf.Tensor, but got {type(tensor)} instead. " + f"To support models with different output types, subclass " + f"TensorflowPredictor and override the `call_model` method " + f"to process the output into either torch.Tensor or Dict[" + f"str, torch.Tensor]." + ) + return tensor.numpy() diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..8a613edd2a4b3a6b7569198ca41f78284d3f29d6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/tensorflow_trainer.py @@ -0,0 +1,193 @@ +from typing import Any, Callable, Dict, Optional, Union + +from ray.train import Checkpoint, DataConfig, RunConfig, ScalingConfig +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.tensorflow.config import TensorflowConfig +from ray.train.trainer import GenDataset +from ray.util import PublicAPI + + +@PublicAPI(stability="beta") +class TensorflowTrainer(DataParallelTrainer): + """A Trainer for data parallel Tensorflow training. + + This Trainer runs the function ``train_loop_per_worker`` on multiple Ray + Actors. These actors already have the necessary TensorFlow process group already + configured for distributed TensorFlow training. 
+ + The ``train_loop_per_worker`` function is expected to take in either 0 or 1 + arguments: + + .. testcode:: + + def train_loop_per_worker(): + ... + + .. testcode:: + + def train_loop_per_worker(config: Dict): + ... + + If ``train_loop_per_worker`` accepts an argument, then + ``train_loop_config`` will be passed in as the argument. This is useful if you + want to tune the values in ``train_loop_config`` as hyperparameters. + + If the ``datasets`` dict contains a training dataset (denoted by + the "train" key), then it will be split into multiple dataset + shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + ``train_loop_per_worker``. All the other datasets will not be split and + ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. + + Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray Train loop methods `. + + .. warning:: + Ray will not automatically set any environment variables or configuration + related to local parallelism / threading + :ref:`aside from "OMP_NUM_THREADS" `. + If you desire greater control over TensorFlow threading, use + the ``tf.config.threading`` module (eg. + ``tf.config.threading.set_inter_op_parallelism_threads(num_cpus)``) + at the beginning of your ``train_loop_per_worker`` function. + + + .. testcode:: + + from ray import train + + def train_loop_per_worker(): + # Report intermediate results for callbacks or logging and + # checkpoint data. + train.report(...) + + # Returns dict of last saved checkpoint. + train.get_checkpoint() + + # Returns the Dataset shard for the given key. + train.get_dataset_shard("my_dataset") + + # Returns the total number of workers executing training. + train.get_context().get_world_size() + + # Returns the rank of this worker. + train.get_context().get_world_rank() + + # Returns the rank of the worker on the current node. 
+ train.get_context().get_local_rank() + + Any returns from the ``train_loop_per_worker`` will be discarded and not + used or persisted anywhere. + + To save a model to use for the ``TensorflowPredictor``, you must save it under the + "model" kwarg in ``Checkpoint`` passed to ``train.report()``. + + Example: + + .. testcode:: + + import os + import tempfile + import tensorflow as tf + + import ray + from ray import train + from ray.train import Checkpoint, ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + + def build_model(): + # toy neural network : 1-layer + return tf.keras.Sequential( + [tf.keras.layers.Dense( + 1, activation="linear", input_shape=(1,))] + ) + + def train_loop_per_worker(config): + dataset_shard = train.get_dataset_shard("train") + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + with strategy.scope(): + model = build_model() + model.compile( + optimizer="Adam", loss="mean_squared_error", metrics=["mse"]) + + tf_dataset = dataset_shard.to_tf( + feature_columns="x", + label_columns="y", + batch_size=1 + ) + for epoch in range(config["num_epochs"]): + model.fit(tf_dataset) + + # Create checkpoint. + checkpoint_dir = tempfile.mkdtemp() + model.save_weights( + os.path.join(checkpoint_dir, "my_checkpoint") + ) + checkpoint = Checkpoint.from_directory(checkpoint_dir) + + train.report( + {}, + checkpoint=checkpoint, + ) + + train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + trainer = TensorflowTrainer( + train_loop_per_worker=train_loop_per_worker, + scaling_config=ScalingConfig(num_workers=3, use_gpu=True), + datasets={"train": train_dataset}, + train_loop_config={"num_epochs": 2}, + ) + result = trainer.fit() + + .. testoutput:: + :options:+ELLIPSIS + :hide: + + ... + + Args: + train_loop_per_worker: The training function to execute. + This can either take in no arguments or a ``config`` dict. 
+ train_loop_config: Configurations to pass into + ``train_loop_per_worker`` if it accepts an argument. + tensorflow_config: Configuration for setting up the TensorFlow backend. + If set to None, use the default configuration. This replaces the + ``backend_config`` arg of ``DataParallelTrainer``. + scaling_config: Configuration for how to scale data parallel training. + dataset_config: Configuration for dataset ingest. + run_config: Configuration for the execution of the training run. + datasets: Any Datasets to use for training. Use + the key "train" to denote which dataset is the training + dataset. + resume_from_checkpoint: A checkpoint to resume training from. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + tensorflow_config: Optional[TensorflowConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + dataset_config: Optional[DataConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + if not tensorflow_config: + tensorflow_config = TensorflowConfig() + + super(TensorflowTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=tensorflow_config, + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/tensorflow/train_loop_utils.py b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/train_loop_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..b7862d6a365f14d16cf6ad85e17ce7dc92ad5c2d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/tensorflow/train_loop_utils.py @@ -0,0 +1,27 @@ +import tensorflow as tf + +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="beta") +def prepare_dataset_shard(tf_dataset_shard: tf.data.Dataset): + """A utility function that overrides default config for Tensorflow Dataset. + + This should be used on a TensorFlow ``Dataset`` created by calling + ``iter_tf_batches()`` on a ``ray.data.Dataset`` returned by + ``ray.train.get_dataset_shard()`` since the dataset has already + been sharded across the workers. + + Args: + tf_dataset_shard (tf.data.Dataset): A TensorFlow Dataset. + + Returns: + A TensorFlow Dataset with: + - autosharding turned off + - prefetching turned on with autotune enabled + """ + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.OFF + ) + return tf_dataset_shard.with_options(options).prefetch(tf.data.AUTOTUNE) diff --git a/.venv/lib/python3.11/site-packages/ray/train/util/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f373b95bbd270009d820abab7f1e3c8324c6fa9f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/util/__init__.py @@ -0,0 +1,7 @@ +from ray.air.util.check_ingest import DummyTrainer + +__all__ = [ + "DummyTrainer", +] + +DummyTrainer.__module__ = "ray.train.util" diff --git a/.venv/lib/python3.11/site-packages/ray/train/util/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/util/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70585d22e2410f2204d18d37b4a5d49e71cedd58 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/util/__pycache__/__init__.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/tune_controller.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/tune_controller.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7ee628580d634a780d380bbb929c5cead437fda --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/tune_controller.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ae169516de2b1392499b271b75dfe4dd32869d5222a0dcf49c53d7f2e56589 +size 103903 diff --git a/.venv/lib/python3.11/site-packages/ray/tune/schedulers/__pycache__/pb2_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/schedulers/__pycache__/pb2_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b238233a23441ee67d0a22e03dfc035547f28b3c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/schedulers/__pycache__/pb2_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/utils/callback.py b/.venv/lib/python3.11/site-packages/ray/tune/utils/callback.py new file mode 100644 index 0000000000000000000000000000000000000000..b53063b85ab98a46074b52fc20f53e0dcaf93395 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/utils/callback.py @@ -0,0 +1,143 @@ +import logging +import os +from typing import TYPE_CHECKING, Collection, List, Optional, Type, Union + +from ray.tune.callback import Callback, CallbackList +from ray.tune.logger import ( + CSVLogger, + CSVLoggerCallback, + JsonLogger, + JsonLoggerCallback, + LegacyLoggerCallback, + TBXLogger, + TBXLoggerCallback, +) + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + from ray.tune.experimental.output import AirVerbosity + +DEFAULT_CALLBACK_CLASSES = ( + CSVLoggerCallback, + JsonLoggerCallback, + TBXLoggerCallback, +) + + +def _get_artifact_templates_for_callbacks( + callbacks: Union[List[Callback], 
List[Type[Callback]], CallbackList] +) -> List[str]: + templates = [] + for callback in callbacks: + templates += list(callback._SAVED_FILE_TEMPLATES) + return templates + + +def _create_default_callbacks( + callbacks: Optional[List[Callback]], + *, + air_verbosity: Optional["AirVerbosity"] = None, + entrypoint: Optional[str] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + config: Optional[dict] = None, + progress_metrics: Optional[Collection[str]] = None, +) -> List[Callback]: + """Create default callbacks for `Tuner.fit()`. + + This function takes a list of existing callbacks and adds default + callbacks to it. + + Specifically, three kinds of callbacks will be added: + + 1. Loggers. Ray Tune's experiment analysis relies on CSV and JSON logging. + 2. Syncer. Ray Tune synchronizes logs and checkpoint between workers and + the head node. + 2. Trial progress reporter. For reporting intermediate progress, like trial + results, Ray Tune uses a callback. + + These callbacks will only be added if they don't already exist, i.e. if + they haven't been passed (and configured) by the user. A notable case + is when a Logger is passed, which is not a CSV or JSON logger - then + a CSV and JSON logger will still be created. + + Lastly, this function will ensure that the Syncer callback comes after all + Logger callbacks, to ensure that the most up-to-date logs and checkpoints + are synced across nodes. + + """ + callbacks = callbacks or [] + has_csv_logger = False + has_json_logger = False + has_tbx_logger = False + + from ray.tune.progress_reporter import TrialProgressCallback + + has_trial_progress_callback = any( + isinstance(c, TrialProgressCallback) for c in callbacks + ) + + if has_trial_progress_callback and air_verbosity is not None: + logger.warning( + "AIR_VERBOSITY is set, ignoring passed-in TrialProgressCallback." 
+ ) + new_callbacks = [ + c for c in callbacks if not isinstance(c, TrialProgressCallback) + ] + callbacks = new_callbacks + if air_verbosity is not None: # new flow + from ray.tune.experimental.output import ( + _detect_reporter as _detect_air_reporter, + ) + + air_progress_reporter = _detect_air_reporter( + air_verbosity, + num_samples=1, # Update later with setup() + entrypoint=entrypoint, + metric=metric, + mode=mode, + config=config, + progress_metrics=progress_metrics, + ) + callbacks.append(air_progress_reporter) + elif not has_trial_progress_callback: # old flow + trial_progress_callback = TrialProgressCallback( + metric=metric, progress_metrics=progress_metrics + ) + callbacks.append(trial_progress_callback) + + # Check if we have a CSV, JSON and TensorboardX logger + for i, callback in enumerate(callbacks): + if isinstance(callback, LegacyLoggerCallback): + if CSVLogger in callback.logger_classes: + has_csv_logger = True + if JsonLogger in callback.logger_classes: + has_json_logger = True + if TBXLogger in callback.logger_classes: + has_tbx_logger = True + elif isinstance(callback, CSVLoggerCallback): + has_csv_logger = True + elif isinstance(callback, JsonLoggerCallback): + has_json_logger = True + elif isinstance(callback, TBXLoggerCallback): + has_tbx_logger = True + + # If CSV, JSON or TensorboardX loggers are missing, add + if os.environ.get("TUNE_DISABLE_AUTO_CALLBACK_LOGGERS", "0") != "1": + if not has_csv_logger: + callbacks.append(CSVLoggerCallback()) + if not has_json_logger: + callbacks.append(JsonLoggerCallback()) + if not has_tbx_logger: + try: + callbacks.append(TBXLoggerCallback()) + except ImportError: + logger.warning( + "The TensorboardX logger cannot be instantiated because " + "either TensorboardX or one of it's dependencies is not " + "installed. 
Please make sure you have the latest version " + "of TensorboardX installed: `pip install -U tensorboardx`" + ) + + return callbacks diff --git a/.venv/lib/python3.11/site-packages/ray/tune/utils/file_transfer.py b/.venv/lib/python3.11/site-packages/ray/tune/utils/file_transfer.py new file mode 100644 index 0000000000000000000000000000000000000000..d742a91eb9a9c52bbb77f31118b280a8b2506beb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/utils/file_transfer.py @@ -0,0 +1,481 @@ +import fnmatch +import io +import os +import shutil +import tarfile +from typing import Dict, Generator, List, Optional, Tuple, Union + +import ray +from ray.air._internal.filelock import TempFileLock +from ray.air.util.node import _force_on_node, _get_node_id_from_node_ip +from ray.util.annotations import DeveloperAPI + +_DEFAULT_CHUNK_SIZE_BYTES = 500 * 1024 * 1024 # 500 MiB +_DEFAULT_MAX_SIZE_BYTES = 1 * 1024 * 1024 * 1024 # 1 GiB + + +@DeveloperAPI +def sync_dir_between_nodes( + source_ip: str, + source_path: str, + target_ip: str, + target_path: str, + force_all: bool = False, + exclude: Optional[List] = None, + chunk_size_bytes: int = _DEFAULT_CHUNK_SIZE_BYTES, + max_size_bytes: Optional[int] = _DEFAULT_MAX_SIZE_BYTES, + return_futures: bool = False, +) -> Union[ + None, + Tuple[ray.ObjectRef, ray.ActorID, ray.ObjectRef], + Tuple[ray.ObjectRef, None, None], +]: + """Synchronize directory on source node to directory on target node. + + Per default, this function will collect information about already existing + files in the target directory. Only files that differ in either mtime or + filesize will be transferred, unless ``force_all=True``. + + If ``source_ip==target_ip``, shutil will be used to copy the directory. Otherwise, + the directory will be packed and sent through the Ray Object Store to the target + node. + + Args: + source_ip: IP of source node. + source_path: Path to directory on source node. + target_ip: IP of target node. 
+ target_path: Path to directory on target node. + force_all: If True, all files will be transferred (not just differing files). + Ignored if ``source_ip==target_ip``. + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + chunk_size_bytes: Chunk size for data transfer. Ignored if + ``source_ip==target_ip``. + max_size_bytes: If packed data exceeds this value, raise an error before + transfer. If ``None``, no limit is enforced. Ignored if + ``source_ip==target_ip``. + return_futures: If True, returns a tuple of the unpack future, + the pack actor, and the files_stats future. If False (default) will + block until synchronization finished and return None. + + Returns: + None, or Tuple of unpack future, pack actor, and files_stats future. + If ``source_ip==target_ip``, pack actor and files_stats future will be None. + + """ + if source_ip != target_ip: + return _sync_dir_between_different_nodes( + source_ip=source_ip, + source_path=source_path, + target_ip=target_ip, + target_path=target_path, + force_all=force_all, + exclude=exclude, + chunk_size_bytes=chunk_size_bytes, + max_size_bytes=max_size_bytes, + return_futures=return_futures, + ) + elif source_path != target_path: + ret = _sync_dir_on_same_node( + ip=source_ip, + source_path=source_path, + target_path=target_path, + exclude=exclude, + return_futures=return_futures, + ) + if return_futures: + return ret, None, None + return ret + + +def _sync_dir_on_same_node( + ip: str, + source_path: str, + target_path: str, + exclude: Optional[List] = None, + return_futures: bool = False, +) -> Optional[ray.ObjectRef]: + """Synchronize directory to another directory on the same node. + + Per default, this function will collect information about already existing + files in the target directory. All files will be copied over. + + Args: + ip: IP of the node. + source_path: Path to source directory. + target_path: Path to target directory. 
+ exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + return_futures: If True, returns a future of the copy task. + + Returns: + None, or future of the copy task. + + """ + + node_id = _get_node_id_from_node_ip(ip) + + copy_on_node = _remote_copy_dir.options(num_cpus=0, **_force_on_node(node_id)) + copy_future = copy_on_node.remote( + source_dir=source_path, target_dir=target_path, exclude=exclude + ) + + if return_futures: + return copy_future + + return ray.get(copy_future) + + +def _sync_dir_between_different_nodes( + source_ip: str, + source_path: str, + target_ip: str, + target_path: str, + force_all: bool = False, + exclude: Optional[List] = None, + chunk_size_bytes: int = _DEFAULT_CHUNK_SIZE_BYTES, + max_size_bytes: Optional[int] = _DEFAULT_MAX_SIZE_BYTES, + return_futures: bool = False, +) -> Union[None, Tuple[ray.ObjectRef, ray.ActorID, ray.ObjectRef]]: + """Synchronize directory on source node to directory on target node. + + Per default, this function will collect information about already existing + files in the target directory. Only files that differ in either mtime or + filesize will be transferred, unless ``force_all=True``. + + Args: + source_ip: IP of source node. + source_path: Path to directory on source node. + target_ip: IP of target node. + target_path: Path to directory on target node. + force_all: If True, all files will be transferred (not just differing files). + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + chunk_size_bytes: Chunk size for data transfer. + max_size_bytes: If packed data exceeds this value, raise an error before + transfer. If ``None``, no limit is enforced. + return_futures: If True, returns a tuple of the unpack future, + the pack actor, and the files_stats future. If False (default) will + block until synchronization finished and return None. 
+ + Returns: + None, or Tuple of unpack future, pack actor, and files_stats future. + + """ + + source_node_id = _get_node_id_from_node_ip(source_ip) + target_node_id = _get_node_id_from_node_ip(target_ip) + + pack_actor_on_source_node = _PackActor.options( + num_cpus=0, **_force_on_node(source_node_id) + ) + unpack_on_target_node = _unpack_from_actor.options( + num_cpus=0, **_force_on_node(target_node_id) + ) + + if force_all: + files_stats = None + else: + files_stats = _remote_get_recursive_files_and_stats.options( + num_cpus=0, **_force_on_node(target_node_id) + ).remote(target_path) + + pack_actor = pack_actor_on_source_node.remote( + source_dir=source_path, + files_stats=files_stats, + chunk_size_bytes=chunk_size_bytes, + max_size_bytes=max_size_bytes, + exclude=exclude, + ) + unpack_future = unpack_on_target_node.remote(pack_actor, target_path) + + if return_futures: + return unpack_future, pack_actor, files_stats + + return ray.get(unpack_future) + + +def _get_recursive_files_and_stats(path: str) -> Dict[str, Tuple[float, int]]: + """Return dict of files mapping to stats in ``path``. + + This function scans a directory ``path`` recursively and returns a dict + mapping each contained file to a tuple of (mtime, filesize). + + mtime and filesize are returned from ``os.lstat`` and are usually a + floating point number (timestamp) and an int (filesize in bytes). 
+ """ + files_stats = {} + for root, dirs, files in os.walk(path, topdown=False): + rel_root = os.path.relpath(root, path) + for file in files: + try: + key = os.path.join(rel_root, file) + stat = os.lstat(os.path.join(path, key)) + files_stats[key] = stat.st_mtime, stat.st_size + except FileNotFoundError: + # Race condition: If a file is deleted while executing this + # method, just continue and don't include the file in the stats + pass + + return files_stats + + +# Only export once +_remote_get_recursive_files_and_stats = ray.remote(_get_recursive_files_and_stats) + + +def _pack_dir( + source_dir: str, + exclude: Optional[List] = None, + files_stats: Optional[Dict[str, Tuple[float, int]]] = None, +) -> io.BytesIO: + """Pack whole directory contents into an uncompressed tarfile. + + This function accepts a ``files_stats`` argument. If given, only files + whose stats differ from these stats will be packed. + + The main use case for this is that we can collect information about files + already existing in the target directory, and only pack files that have + been updated. This is similar to how cloud syncing utilities decide + which files to transfer. + + Args: + source_dir: Path to local directory to pack into tarfile. + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. + files_stats: Dict of relative filenames mapping to a tuple of + (mtime, filesize). Only files that differ from these stats + will be packed. + + Returns: + Tarfile as a stream object. 
+ """ + + def _should_exclude(candidate: str) -> bool: + if not exclude: + return False + + for excl in exclude: + if fnmatch.fnmatch(candidate, excl): + return True + return False + + stream = io.BytesIO() + with tarfile.open(fileobj=stream, mode="w", format=tarfile.PAX_FORMAT) as tar: + + if not files_stats and not exclude: + # If no `files_stats` is passed, pack whole directory + tar.add(source_dir, arcname="", recursive=True) + else: + files_stats = files_stats or {} + # Otherwise, only pack differing files + tar.add(source_dir, arcname="", recursive=False) + for root, dirs, files in os.walk(source_dir, topdown=False): + rel_root = os.path.relpath(root, source_dir) + # Always add all directories + for dir in dirs: + key = os.path.join(rel_root, dir) + tar.add(os.path.join(source_dir, key), arcname=key, recursive=False) + # Add files where our information differs + for file in files: + key = os.path.join(rel_root, file) + stat = os.lstat(os.path.join(source_dir, key)) + file_stat = stat.st_mtime, stat.st_size + + if _should_exclude(key): + # If the file matches an exclude pattern, skip + continue + + if key in files_stats and files_stats[key] == file_stat: + # If the file did not change, skip + continue + + tar.add(os.path.join(source_dir, key), arcname=key) + + return stream + + +def _gib_string(num_bytes: float) -> str: + return f"{float(num_bytes / 1024 ** 3):.2f}GiB" + + +@ray.remote +class _PackActor: + """Actor wrapping around a packing job. + + This actor is used for chunking the packed data into smaller chunks that + can be transferred via the object store more efficiently. + + The actor will start packing the directory when initialized, and separate + chunks can be received by calling the remote ``next()`` task. + + Args: + source_dir: Path to local directory to pack into tarfile. + exclude: Pattern of files to exclude, e.g. + ``["*/checkpoint_*]`` to exclude trial checkpoints. 
+ files_stats: Dict of relative filenames mapping to a tuple of + (mtime, filesize). Only files that differ from these stats + will be packed. + chunk_size_bytes: Cut bytes stream into chunks of this size in bytes. + max_size_bytes: If packed data exceeds this value, raise an error before + transfer. If ``None``, no limit is enforced. + """ + + def __init__( + self, + source_dir: str, + exclude: Optional[List] = None, + files_stats: Optional[Dict[str, Tuple[float, int]]] = None, + chunk_size_bytes: int = _DEFAULT_CHUNK_SIZE_BYTES, + max_size_bytes: Optional[int] = _DEFAULT_MAX_SIZE_BYTES, + ): + self.stream = _pack_dir( + source_dir=source_dir, exclude=exclude, files_stats=files_stats + ) + + # Get buffer size + self.stream.seek(0, 2) + file_size = self.stream.tell() + + if max_size_bytes and file_size > max_size_bytes: + raise RuntimeError( + f"Packed directory {source_dir} content has a size of " + f"{_gib_string(file_size)}, which exceeds the limit " + f"of {_gib_string(max_size_bytes)}. Please check the directory " + f"contents. If you want to transfer everything, you can increase " + f"or disable the limit by passing the `max_size` argument." 
+ ) + self.chunk_size = chunk_size_bytes + self.max_size = max_size_bytes + self.iter = None + + def get_full_data(self) -> bytes: + return self.stream.getvalue() + + def _chunk_generator(self) -> Generator[bytes, None, None]: + self.stream.seek(0) + data = self.stream.read(self.chunk_size) + while data: + yield data + data = self.stream.read(self.chunk_size) + + def next(self) -> Optional[bytes]: + if not self.iter: + self.iter = iter(self._chunk_generator()) + try: + return next(self.iter) + except StopIteration: + return None + + +def _iter_remote(actor: ray.ActorID) -> Generator[bytes, None, None]: + """Iterate over actor task and return as generator.""" + while True: + buffer = ray.get(actor.next.remote()) + if buffer is None: + return + yield buffer + + +def _unpack_dir(stream: io.BytesIO, target_dir: str, *, _retry: bool = True) -> None: + """Unpack tarfile stream into target directory.""" + stream.seek(0) + target_dir = os.path.normpath(target_dir) + try: + # Timeout 0 means there will be only one attempt to acquire + # the file lock. If it cannot be aquired, a TimeoutError + # will be thrown. + with TempFileLock(f"{target_dir}.lock", timeout=0): + with tarfile.open(fileobj=stream) as tar: + tar.extractall(target_dir) + except TimeoutError: + # wait, but do not do anything + with TempFileLock(f"{target_dir}.lock"): + pass + # if the dir was locked due to being deleted, + # recreate + if not os.path.exists(target_dir): + if _retry: + _unpack_dir(stream, target_dir, _retry=False) + else: + raise RuntimeError( + f"Target directory {target_dir} does not exist " + "and couldn't be recreated. 
" + "Please raise an issue on GitHub: " + "https://github.com/ray-project/ray/issues" + ) + + +@ray.remote +def _unpack_from_actor(pack_actor: ray.ActorID, target_dir: str) -> None: + """Iterate over chunks received from pack actor and unpack.""" + stream = io.BytesIO() + for buffer in _iter_remote(pack_actor): + stream.write(buffer) + _unpack_dir(stream, target_dir=target_dir) + + +def _copy_dir( + source_dir: str, + target_dir: str, + *, + exclude: Optional[List] = None, + _retry: bool = True, +) -> None: + """Copy dir with shutil on the actor.""" + target_dir = os.path.normpath(target_dir) + try: + # Timeout 0 means there will be only one attempt to acquire + # the file lock. If it cannot be aquired, a TimeoutError + # will be thrown. + with TempFileLock(f"{target_dir}.lock", timeout=0): + _delete_path_unsafe(target_dir) + + _ignore_func = None + if exclude: + + def _ignore(path, names): + ignored_names = set() + rel_path = os.path.relpath(path, source_dir) + for name in names: + candidate = os.path.join(rel_path, name) + for excl in exclude: + if fnmatch.fnmatch(candidate, excl): + ignored_names.add(name) + break + return ignored_names + + _ignore_func = _ignore + + shutil.copytree(source_dir, target_dir, ignore=_ignore_func) + except TimeoutError: + # wait, but do not do anything + with TempFileLock(f"{target_dir}.lock"): + pass + # if the dir was locked due to being deleted, + # recreate + if not os.path.exists(target_dir): + if _retry: + _copy_dir(source_dir, target_dir, _retry=False) + else: + raise RuntimeError( + f"Target directory {target_dir} does not exist " + "and couldn't be recreated. " + "Please raise an issue on GitHub: " + "https://github.com/ray-project/ray/issues" + ) + + +# Only export once +_remote_copy_dir = ray.remote(_copy_dir) + + +def _delete_path_unsafe(target_path: str): + """Delete path (files and directories). 
No filelock.""" + if os.path.exists(target_path): + if os.path.isdir(target_path): + shutil.rmtree(target_path) + else: + os.remove(target_path) + return True + return False diff --git a/.venv/lib/python3.11/site-packages/ray/tune/utils/mock.py b/.venv/lib/python3.11/site-packages/ray/tune/utils/mock.py new file mode 100644 index 0000000000000000000000000000000000000000..5d7f7c8d2624d00910c170b84d27185f876f3b31 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/utils/mock.py @@ -0,0 +1,124 @@ +import logging +import os +import random +import time +from collections import defaultdict +from pathlib import Path +from typing import Dict + +from ray.tune.callback import Callback +from ray.tune.experiment import Trial + +logger = logging.getLogger(__name__) + + +class FailureInjectorCallback(Callback): + """Adds random failure injection to the TrialExecutor.""" + + def __init__( + self, + config_path="~/ray_bootstrap_config.yaml", + probability=0.1, + time_between_checks=0, + disable=False, + ): + self.probability = probability + self.config_path = Path(config_path).expanduser().as_posix() + self.disable = disable + + self.time_between_checks = time_between_checks + # Initialize with current time so we don't fail right away + self.last_fail_check = time.monotonic() + + def on_step_begin(self, **info): + if not os.path.exists(self.config_path): + return + if time.monotonic() < self.last_fail_check + self.time_between_checks: + return + self.last_fail_check = time.monotonic() + import click + + from ray.autoscaler._private.commands import kill_node + + failures = 0 + max_failures = 3 + # With 10% probability inject failure to a worker. + if random.random() < self.probability and not self.disable: + # With 10% probability fully terminate the node. 
+ should_terminate = random.random() < self.probability + while failures < max_failures: + try: + kill_node( + self.config_path, + yes=True, + hard=should_terminate, + override_cluster_name=None, + ) + return + except click.exceptions.ClickException: + failures += 1 + logger.exception( + "Killing random node failed in attempt " + "{}. " + "Retrying {} more times".format( + str(failures), str(max_failures - failures) + ) + ) + + +class TrialStatusSnapshot: + """A sequence of statuses of trials as they progress. + + If all trials keep previous status, no snapshot is taken. + """ + + def __init__(self): + self._snapshot = [] + + def append(self, new_snapshot: Dict[str, str]): + """May append a new snapshot to the sequence.""" + if not new_snapshot: + # Don't add an empty snapshot. + return + if not self._snapshot or new_snapshot != self._snapshot[-1]: + self._snapshot.append(new_snapshot) + + def max_running_trials(self) -> int: + """Outputs the max number of running trials at a given time. + + Usually used to assert certain number given resource restrictions. + """ + result = 0 + for snapshot in self._snapshot: + count = 0 + for trial_id in snapshot: + if snapshot[trial_id] == Trial.RUNNING: + count += 1 + result = max(result, count) + + return result + + def all_trials_are_terminated(self) -> bool: + """True if all trials are terminated.""" + if not self._snapshot: + return False + last_snapshot = self._snapshot[-1] + return all( + last_snapshot[trial_id] == Trial.TERMINATED for trial_id in last_snapshot + ) + + +class TrialStatusSnapshotTaker(Callback): + """Collects a sequence of statuses of trials as they progress. + + If all trials keep previous status, no snapshot is taken. 
+ """ + + def __init__(self, snapshot: TrialStatusSnapshot): + self._snapshot = snapshot + + def on_step_end(self, iteration, trials, **kwargs): + new_snapshot = defaultdict(str) + for trial in trials: + new_snapshot[trial.trial_id] = trial.status + self._snapshot.append(new_snapshot) diff --git a/.venv/lib/python3.11/site-packages/ray/tune/utils/mock_trainable.py b/.venv/lib/python3.11/site-packages/ray/tune/utils/mock_trainable.py new file mode 100644 index 0000000000000000000000000000000000000000..2e79181ebba6cc7497aeee9e0771e73ab3e62c66 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/utils/mock_trainable.py @@ -0,0 +1,63 @@ +import json +import os +import time + +import numpy as np + +from ray.tune import Trainable + +MOCK_TRAINABLE_NAME = "mock_trainable" +MOCK_ERROR_KEY = "mock_error" + + +class MyTrainableClass(Trainable): + """Example agent whose learning curve is a random sigmoid. + + The dummy hyperparameters "width" and "height" determine the slope and + maximum reward value reached. + """ + + def setup(self, config): + self._sleep_time = config.get("sleep", 0) + self._mock_error = config.get(MOCK_ERROR_KEY, False) + self._persistent_error = config.get("persistent_error", False) + + self.timestep = 0 + self.restored = False + + def step(self): + if ( + self._mock_error + and self.timestep > 0 # allow at least 1 successful checkpoint. + and (self._persistent_error or not self.restored) + ): + raise RuntimeError(f"Failing on purpose! {self.timestep=}") + + if self._sleep_time > 0: + time.sleep(self._sleep_time) + + self.timestep += 1 + v = np.tanh(float(self.timestep) / self.config.get("width", 1)) + v *= self.config.get("height", 1) + + # Here we use `episode_reward_mean`, but you can also report other + # objectives such as loss or accuracy. 
+ return {"episode_reward_mean": v} + + def save_checkpoint(self, checkpoint_dir): + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps({"timestep": self.timestep})) + + def load_checkpoint(self, checkpoint_dir): + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "r") as f: + self.timestep = json.loads(f.read())["timestep"] + + self.restored = True + + +def register_mock_trainable(): + from ray.tune import register_trainable + + register_trainable(MOCK_TRAINABLE_NAME, MyTrainableClass) diff --git a/.venv/lib/python3.11/site-packages/ray/tune/utils/object_cache.py b/.venv/lib/python3.11/site-packages/ray/tune/utils/object_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..99f1b5678d2a74c4b8c5932d0c92f919ec32bb78 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/utils/object_cache.py @@ -0,0 +1,173 @@ +from collections import Counter, defaultdict +from typing import Dict, Generator, List, Optional, TypeVar + +# Grouping key - must be hashable +T = TypeVar("T") +# Objects to cache +U = TypeVar("U") + + +class _ObjectCache: + """Cache up to some maximum count given a grouping key. + + This object cache can e.g. be used to cache Ray Tune trainable actors + given their resource requirements (reuse_actors=True). + + If the max number of cached objects for a grouping key is reached, + no more objects for this group will be cached. + + However, if `may_keep_one=True`, one object (globally across all grouping + keys) may be cached, even if the max number of objects is 0. This is to + allow to cache an object if the max number of objects of this key + will increase shortly after (as is the case e.g. in the Ray Tune control + loop). + + Args: + may_keep_one: If True, one object (globally) may be cached if no desired + maximum objects are defined. 
+ + """ + + def __init__(self, may_keep_one: bool = True): + self._num_cached_objects: int = 0 + self._cached_objects: Dict[T, List[U]] = defaultdict(list) + self._max_num_objects: Counter[T] = Counter() + + self._may_keep_one = may_keep_one + + @property + def num_cached_objects(self): + return self._num_cached_objects + + @property + def total_max_objects(self): + # Counter.total() is only available for python 3.10+ + return sum(self._max_num_objects.values()) + + def increase_max(self, key: T, by: int = 1) -> None: + """Increase number of max objects for this key. + + Args: + key: Group key. + by: Decrease by this amount. + """ + self._max_num_objects[key] += by + + def decrease_max(self, key: T, by: int = 1) -> None: + """Decrease number of max objects for this key. + + Args: + key: Group key. + by: Decrease by this amount. + """ + self._max_num_objects[key] -= by + + def has_cached_object(self, key: T) -> bool: + """Return True if at least one cached object exists for this key. + + Args: + key: Group key. + + Returns: + True if at least one cached object exists for this key. + """ + return bool(self._cached_objects[key]) + + def cache_object(self, key: T, obj: U) -> bool: + """Cache object for a given key. + + This will put the object into a cache, assuming the number + of cached objects for this key is less than the number of + max objects for this key. + + An exception is made if `max_keep_one=True` and no other + objects are cached globally. In that case, the object can + still be cached. + + Args: + key: Group key. + obj: Object to cache. + + Returns: + True if the object has been cached. False otherwise. 
+ + """ + # If we have more objects cached already than we desire + if len(self._cached_objects[key]) >= self._max_num_objects[key]: + # If may_keep_one is False, never cache + if not self._may_keep_one: + return False + + # If we have more than one other cached object, don't cache + if self._num_cached_objects > 0: + return False + + # If any other objects are expected to be cached, don't cache + if any(v for v in self._max_num_objects.values()): + return False + + # Otherwise, cache (for now). + + self._cached_objects[key].append(obj) + self._num_cached_objects += 1 + return True + + def pop_cached_object(self, key: T) -> Optional[U]: + """Get one cached object for a key. + + This will remove the object from the cache. + + Args: + key: Group key. + + Returns: + Cached object. + """ + if not self.has_cached_object(key): + return None + + self._num_cached_objects -= 1 + return self._cached_objects[key].pop(0) + + def flush_cached_objects(self, force_all: bool = False) -> Generator[U, None, None]: + """Return a generator over cached objects evicted from the cache. + + This method yields all cached objects that should be evicted from the + cache for cleanup by the caller. + + If the number of max objects is lower than the number of + cached objects for a given key, objects are evicted until + the numbers are equal. + + If `max_keep_one=True` (and ``force_all=False``), one cached object + may be retained. + + Objects are evicted FIFO. + + If ``force_all=True``, all objects are evicted. + + Args: + force_all: If True, all objects are flushed. This takes precedence + over ``keep_one``. + + Yields: + Evicted objects to be cleaned up by caller. + + """ + # If force_all=True, don't keep one. 
import json
import os
import pickle
import tempfile
import time
from collections import Counter

import numpy as np

from ray import train, tune
from ray._private.test_utils import safe_write_to_results_json
from ray.train import Checkpoint
from ray.tune.callback import Callback


class ProgressCallback(Callback):
    """Tune callback that periodically dumps trial states to a results JSON."""

    def __init__(self):
        self.last_update = 0
        self.update_interval = 60  # seconds between status dumps

    def on_step_end(self, iteration, trials, **kwargs):
        # Throttle: write at most once per `update_interval` seconds.
        if time.time() - self.last_update > self.update_interval:
            now = time.time()
            result = {
                "last_update": now,
                "iteration": iteration,
                "trial_states": dict(Counter([trial.status for trial in trials])),
            }
            safe_write_to_results_json(result, "/tmp/release_test_out.json")

            self.last_update = now


class TestDurableTrainable(tune.Trainable):
    """Class-API trainable that sleeps per step and writes bogus checkpoints.

    Used by release tests to benchmark Tune result/checkpoint throughput.
    """

    def __init__(self, *args, **kwargs):
        self.setup_env()

        super(TestDurableTrainable, self).__init__(*args, **kwargs)

    def setup_env(self):
        # Hook for subclasses to prepare the environment before Trainable init.
        pass

    def setup(self, config):
        self._num_iters = int(config["num_iters"])
        self._sleep_time = config["sleep_time"]
        self._score = config["score"]

        self._checkpoint_iters = config["checkpoint_iters"]
        self._checkpoint_size_b = config["checkpoint_size_b"]
        # Each np.float64 takes 8 bytes, so this many items hits the target size.
        self._checkpoint_num_items = self._checkpoint_size_b // 8  # np.float64

        self._iter = 0

    def step(self):
        # Skip the sleep on the very first step so startup cost is not inflated.
        if self._iter > 0:
            time.sleep(self._sleep_time)

        res = dict(score=self._iter + self._score)

        if self._iter >= self._num_iters:
            res["done"] = True

        self._iter += 1
        return res

    def save_checkpoint(self, tmp_checkpoint_dir):
        # Write a checkpoint of roughly `checkpoint_size_b` bytes of random data.
        checkpoint_file = os.path.join(tmp_checkpoint_dir, "bogus.ckpt")
        checkpoint_data = np.random.uniform(0, 1, size=self._checkpoint_num_items)
        with open(checkpoint_file, "wb") as fp:
            pickle.dump(checkpoint_data, fp)

    def load_checkpoint(self, checkpoint):
        # Checkpoint contents are bogus; nothing to restore.
        pass


def function_trainable(config):
    """Function-API equivalent of ``TestDurableTrainable``."""
    num_iters = int(config["num_iters"])
    sleep_time = config["sleep_time"]
    score = config["score"]

    checkpoint_iters = config["checkpoint_iters"]
    checkpoint_size_b = config["checkpoint_size_b"]
    checkpoint_num_items = checkpoint_size_b // 8  # np.float64
    checkpoint_num_files = config["checkpoint_num_files"]

    for i in range(num_iters):
        metrics = {"score": i + score}
        # BUGFIX: require `checkpoint_iters > 0` (was `>= 0`). When the
        # checkpoint frequency rounds down to 0, `i % 0` raised
        # ZeroDivisionError; treat 0 as "checkpointing disabled" instead.
        if (
            checkpoint_iters > 0
            and checkpoint_size_b > 0
            and i % checkpoint_iters == 0
        ):
            with tempfile.TemporaryDirectory() as tmpdir:
                # BUGFIX: use a dedicated loop variable; the original reused
                # `i` here, shadowing the outer iteration counter.
                for file_i in range(checkpoint_num_files):
                    checkpoint_file = os.path.join(tmpdir, f"bogus_{file_i}.ckpt")
                    checkpoint_data = np.random.uniform(
                        0, 1, size=checkpoint_num_items
                    )
                    with open(checkpoint_file, "wb") as fp:
                        pickle.dump(checkpoint_data, fp)
                train.report(metrics, checkpoint=Checkpoint.from_directory(tmpdir))
        else:
            train.report(metrics)

        time.sleep(sleep_time)


def timed_tune_run(
    name: str,
    num_samples: int,
    results_per_second: int = 1,
    trial_length_s: int = 1,
    max_runtime: int = 300,
    checkpoint_freq_s: int = -1,
    checkpoint_size_b: int = 0,
    checkpoint_num_files: int = 1,
    **tune_kwargs,
) -> bool:
    """Run a timed Tune benchmark and report whether it beat ``max_runtime``.

    Args:
        name: Human-readable benchmark name used in log output.
        num_samples: Number of trials to launch.
        results_per_second: Results each trial reports per second.
        trial_length_s: Target runtime per trial in seconds.
        max_runtime: Time budget for the whole run in seconds.
        checkpoint_freq_s: Checkpoint frequency (seconds); negative disables.
        checkpoint_size_b: Size of each checkpoint in bytes.
        checkpoint_num_files: Files per checkpoint (function trainable only).
        **tune_kwargs: Extra kwargs forwarded to ``tune.run``.

    Returns:
        True if the run finished within ``max_runtime`` seconds.
    """
    # Use the durable (class) trainable when results go to cloud storage.
    durable = (
        "storage_path" in tune_kwargs
        and tune_kwargs["storage_path"]
        and (
            tune_kwargs["storage_path"].startswith("s3://")
            or tune_kwargs["storage_path"].startswith("gs://")
        )
    )

    sleep_time = 1.0 / results_per_second
    num_iters = int(trial_length_s / sleep_time)
    checkpoint_iters = -1
    if checkpoint_freq_s >= 0:
        checkpoint_iters = int(checkpoint_freq_s / sleep_time)

    config = {
        "score": tune.uniform(0.0, 1.0),
        "num_iters": num_iters,
        "sleep_time": sleep_time,
        "checkpoint_iters": checkpoint_iters,
        "checkpoint_size_b": checkpoint_size_b,
        "checkpoint_num_files": checkpoint_num_files,
    }

    print(f"Starting benchmark with config: {config}")

    run_kwargs = {"reuse_actors": True, "verbose": 2}
    run_kwargs.update(tune_kwargs)

    _train = function_trainable

    if durable:
        _train = TestDurableTrainable
        run_kwargs["checkpoint_freq"] = checkpoint_iters

    start_time = time.monotonic()
    analysis = tune.run(
        _train,
        config=config,
        num_samples=num_samples,
        raise_on_failed_trial=False,
        **run_kwargs,
    )
    time_taken = time.monotonic() - start_time

    result = {
        "time_taken": time_taken,
        "trial_states": dict(Counter([trial.status for trial in analysis.trials])),
        "last_update": time.time(),
    }

    test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_test.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    success = time_taken <= max_runtime

    if not success:
        print(
            f"The {name} test took {time_taken:.2f} seconds, but should not "
            f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n"
            f"--- FAILED: {name.upper()} ::: "
            f"{time_taken:.2f} > {max_runtime:.2f} ---"
        )
    else:
        print(
            f"The {name} test took {time_taken:.2f} seconds, which "
            f"is below the budget of {max_runtime:.2f} seconds. "
            f"Test successful. \n\n"
            f"--- PASSED: {name.upper()} ::: "
            f"{time_taken:.2f} <= {max_runtime:.2f} ---"
        )

    return success
+ extra_custom_resources: Extra custom resources to reserve in + case the trial needs to launch additional Ray actors that use + any of these custom resources. + has_placement_group: Bool indicating if the trial also + has an associated placement group. + + """ + + __slots__ = () + + def __new__( + cls, + cpu: float, + gpu: float, + memory: float = 0, + object_store_memory: float = 0.0, + extra_cpu: float = 0.0, + extra_gpu: float = 0.0, + extra_memory: float = 0.0, + extra_object_store_memory: float = 0.0, + custom_resources: Optional[dict] = None, + extra_custom_resources: Optional[dict] = None, + has_placement_group: bool = False, + ): + custom_resources = custom_resources or {} + extra_custom_resources = extra_custom_resources or {} + leftovers = set(custom_resources) ^ set(extra_custom_resources) + + for value in leftovers: + custom_resources.setdefault(value, 0) + extra_custom_resources.setdefault(value, 0) + + cpu = round(cpu, 2) + gpu = round(gpu, 2) + memory = round(memory, 2) + object_store_memory = round(object_store_memory, 2) + extra_cpu = round(extra_cpu, 2) + extra_gpu = round(extra_gpu, 2) + extra_memory = round(extra_memory, 2) + extra_object_store_memory = round(extra_object_store_memory, 2) + custom_resources = { + resource: round(value, 2) for resource, value in custom_resources.items() + } + extra_custom_resources = { + resource: round(value, 2) + for resource, value in extra_custom_resources.items() + } + + all_values = [ + cpu, + gpu, + memory, + object_store_memory, + extra_cpu, + extra_gpu, + extra_memory, + extra_object_store_memory, + ] + all_values += list(custom_resources.values()) + all_values += list(extra_custom_resources.values()) + assert len(custom_resources) == len(extra_custom_resources) + for entry in all_values: + assert isinstance(entry, Number), ("Improper resource value.", entry) + return super(_Resources, cls).__new__( + cls, + cpu, + gpu, + memory, + object_store_memory, + extra_cpu, + extra_gpu, + extra_memory, + 
extra_object_store_memory, + custom_resources, + extra_custom_resources, + has_placement_group, + ) + + def summary_string(self): + summary = "{} CPUs, {} GPUs".format( + self.cpu + self.extra_cpu, self.gpu + self.extra_gpu + ) + if self.memory or self.extra_memory: + summary += ", {} GiB heap".format( + round((self.memory + self.extra_memory) / (1024**3), 2) + ) + if self.object_store_memory or self.extra_object_store_memory: + summary += ", {} GiB objects".format( + round( + (self.object_store_memory + self.extra_object_store_memory) + / (1024**3), + 2, + ) + ) + custom_summary = ", ".join( + [ + "{} {}".format(self.get_res_total(res), res) + for res in self.custom_resources + if not res.startswith(NODE_ID_PREFIX) + ] + ) + if custom_summary: + summary += " ({})".format(custom_summary) + return summary + + def cpu_total(self): + return self.cpu + self.extra_cpu + + def gpu_total(self): + return self.gpu + self.extra_gpu + + def memory_total(self): + return self.memory + self.extra_memory + + def object_store_memory_total(self): + return self.object_store_memory + self.extra_object_store_memory + + def get_res_total(self, key): + return self.custom_resources.get(key, 0) + self.extra_custom_resources.get( + key, 0 + ) + + def get(self, key): + return self.custom_resources.get(key, 0) + + def is_nonnegative(self): + all_values = [self.cpu, self.gpu, self.extra_cpu, self.extra_gpu] + all_values += list(self.custom_resources.values()) + all_values += list(self.extra_custom_resources.values()) + return all(v >= 0 for v in all_values) + + @classmethod + def subtract(cls, original, to_remove): + cpu = original.cpu - to_remove.cpu + gpu = original.gpu - to_remove.gpu + memory = original.memory - to_remove.memory + object_store_memory = ( + original.object_store_memory - to_remove.object_store_memory + ) + extra_cpu = original.extra_cpu - to_remove.extra_cpu + extra_gpu = original.extra_gpu - to_remove.extra_gpu + extra_memory = original.extra_memory - 
class _ResourceUpdater:
    """Periodic Resource updater for Tune.

    Initially, all resources are set to 0. The updater will try to update
    resources when (1) init ResourceUpdater (2) call
    "update_avail_resources", "num_cpus" or "num_gpus".

    The update takes effect when (1) Ray is initialized (2) the interval
    between this and last update is larger than "refresh_period".
    """

    def __init__(self, refresh_period: Optional[float] = None):
        # Start with an empty view; refreshed lazily below and on access.
        self._avail_resources = _Resources(cpu=0, gpu=0)

        if refresh_period is None:
            refresh_period = float(
                os.environ.get("TUNE_STATE_REFRESH_PERIOD", TUNE_STATE_REFRESH_PERIOD)
            )
        self._refresh_period = refresh_period
        self._last_resource_refresh = float("-inf")
        self.update_avail_resources()

    def update_avail_resources(self, num_retries: int = 5, force: bool = False):
        """Refresh the cached cluster resources from Ray.

        No-op if Ray is not initialized, or if the refresh period has not yet
        elapsed and ``force`` is not set.
        """
        if not ray.is_initialized():
            return
        if (
            time.time() - self._last_resource_refresh < self._refresh_period
            and not force
        ):
            return
        logger.debug("Checking Ray cluster resources.")
        resources = None
        for i in range(num_retries):
            if i > 0:
                logger.warning(
                    f"Cluster resources not detected or are 0. Attempt #{i + 1}...",
                )
                time.sleep(0.5)
            resources = ray.cluster_resources()
            if resources:
                break

        if not resources:
            # BUGFIX: `resources` may still be None here (e.g. when
            # num_retries <= 0), and calling `.setdefault` on None raises
            # AttributeError. Normalize to an empty dict first.
            resources = resources or {}
            # NOTE: This hides the possibility that Ray may be waiting for
            # clients to connect.
            resources.setdefault("CPU", 0)
            resources.setdefault("GPU", 0)
            logger.warning(
                "Cluster resources cannot be detected or are 0. "
                "You can resume this experiment by passing in `resume=True` to `run`."
            )

        # Copy before popping so the dict returned by Ray is not mutated.
        resources = resources.copy()
        num_cpus = resources.pop("CPU", 0)
        num_gpus = resources.pop("GPU", 0)
        memory = resources.pop("memory", 0)
        object_store_memory = resources.pop("object_store_memory", 0)
        custom_resources = resources

        self._avail_resources = _Resources(
            int(num_cpus),
            int(num_gpus),
            memory=int(memory),
            object_store_memory=int(object_store_memory),
            custom_resources=custom_resources,
        )
        self._last_resource_refresh = time.time()

    def _get_used_avail_resources(self, total_allocated_resources: Dict[str, Any]):
        """Split allocated vs. available CPU/GPU/custom resource counts."""
        total_allocated_resources = total_allocated_resources.copy()

        used_cpu = total_allocated_resources.pop("CPU", 0)
        total_cpu = self._avail_resources.cpu
        used_gpu = total_allocated_resources.pop("GPU", 0)
        total_gpu = self._avail_resources.gpu

        # Skip node-affinity pseudo-resources and unused placement-group
        # internals to keep the summary readable.
        custom_used_total = {
            name: (
                total_allocated_resources.get(name, 0.0),
                self._avail_resources.get_res_total(name),
            )
            for name in self._avail_resources.custom_resources
            if not name.startswith(NODE_ID_PREFIX)
            and (total_allocated_resources.get(name, 0.0) > 0 or "_group_" not in name)
        }
        return used_cpu, total_cpu, used_gpu, total_gpu, custom_used_total

    def debug_string(self, total_allocated_resources: Dict[str, Any]) -> str:
        """Returns a human readable message for printing to the console."""
        if self._last_resource_refresh > 0:
            (
                used_cpu,
                total_cpu,
                used_gpu,
                total_gpu,
                custom_used_total,
            ) = self._get_used_avail_resources(total_allocated_resources)

            if (
                used_cpu > total_cpu
                or used_gpu > total_gpu
                or any(used > total for (used, total) in custom_used_total.values())
            ):
                # If any of the used resources are higher than what we
                # currently think is available, update our state and re-fetch.
                self.update_avail_resources(force=True)
                (
                    used_cpu,
                    total_cpu,
                    used_gpu,
                    total_gpu,
                    custom_used_total,
                ) = self._get_used_avail_resources(total_allocated_resources)

            status = (
                f"Logical resource usage: {used_cpu}/{total_cpu} CPUs, "
                f"{used_gpu}/{total_gpu} GPUs"
            )
            customs = ", ".join(
                f"{used}/{total} {name}"
                for name, (used, total) in custom_used_total.items()
            )

            if customs:
                status += f" ({customs})"
            return status
        else:
            return "Logical resource usage: ?"

    def get_num_cpus(self) -> int:
        self.update_avail_resources()
        return self._avail_resources.cpu

    def get_num_gpus(self) -> int:
        self.update_avail_resources()
        return self._avail_resources.gpu

    def __reduce__(self):
        # Do not need to serialize resources, because we can always
        # update it again. This also prevents keeping outdated resources
        # when deserialized.
        return _ResourceUpdater, (self._refresh_period,)
@DeveloperAPI
class TuneFunctionDecoder(json.JSONDecoder):
    """JSON decoder counterpart to ``TuneFunctionEncoder``.

    Restores objects that the encoder serialized via the cloudpickle
    fallback payload.
    """

    def __init__(self, *args, **kwargs):
        json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)

    def object_hook(self, obj):
        is_fallback = obj.get("_type") == "CLOUDPICKLE_FALLBACK"
        return self._from_cloudpickle(obj) if is_fallback else obj

    def _from_cloudpickle(self, obj):
        return cloudpickle.loads(hex_to_binary(obj["value"]))


def _import_gputil():
    """Return the optional GPUtil module, or None if it is not installed."""
    try:
        import GPUtil
    except ImportError:
        GPUtil = None
    return GPUtil


START_OF_TIME = time.time()


@DeveloperAPI
class UtilMonitor(Thread):
    """Class for system usage utilization monitoring.

    Background thread that samples CPU/RAM via psutil and per-GPU load/VRAM
    via GPUtil every ``delay`` seconds. ``get_data`` drains the collected
    samples and returns their means. Can be enabled with
    Tuner(param_space={"log_sys_usage": True}).
    """

    def __init__(self, start=True, delay=0.7):
        self.stopped = True
        gputil_mod = _import_gputil()
        self.GPUtil = gputil_mod
        if gputil_mod is None and start:
            logger.warning("Install gputil for GPU system monitoring.")

        if psutil is None and start:
            logger.warning("Install psutil to monitor system performance.")

        if gputil_mod is None and psutil is None:
            # Nothing to sample; skip Thread initialization entirely.
            return

        super(UtilMonitor, self).__init__()
        self.delay = delay  # Time between calls to GPUtil
        self.values = defaultdict(list)
        self.lock = threading.Lock()
        self.daemon = True
        if start:
            self.start()

    def _read_utilization(self):
        # Take one sample of every available metric under the lock.
        with self.lock:
            if psutil is not None:
                self.values["cpu_util_percent"].append(
                    float(psutil.cpu_percent(interval=None))
                )
                self.values["ram_util_percent"].append(
                    float(psutil.virtual_memory().percent)
                )
            if self.GPUtil is not None:
                detected_gpus = []
                try:
                    detected_gpus = self.GPUtil.getGPUs()
                except Exception:
                    logger.debug("GPUtil failed to retrieve GPUs.")
                for gpu in detected_gpus:
                    self.values["gpu_util_percent" + str(gpu.id)].append(
                        float(gpu.load)
                    )
                    self.values["vram_util_percent" + str(gpu.id)].append(
                        float(gpu.memoryUtil)
                    )

    def get_data(self):
        """Drain collected samples and return their means under ``perf``."""
        if self.stopped:
            return {}

        with self.lock:
            drained = copy.deepcopy(self.values)
            for samples in self.values.values():
                del samples[:]
        return {
            "perf": {
                metric: np.mean(samples)
                for metric, samples in drained.items()
                if len(samples) > 0
            }
        }

    def run(self):
        self.stopped = False
        while not self.stopped:
            self._read_utilization()
            time.sleep(self.delay)

    def stop(self):
        self.stopped = True
@DeveloperAPI
def retry_fn(
    fn: Callable[[], Any],
    exception_type: Union[Type[Exception], Sequence[Type[Exception]]] = Exception,
    num_retries: int = 3,
    sleep_time: int = 1,
    timeout: Optional[Number] = None,
) -> bool:
    """Run ``fn`` in a daemon thread, retrying on failure or timeout.

    Args:
        fn: Zero-argument callable to execute.
        exception_type: Exception type(s) counted as an expected failure.
        num_retries: Maximum number of attempts.
        sleep_time: Seconds to sleep between attempts.
        timeout: Per-attempt timeout in seconds (None waits indefinitely).

    Returns:
        True if an attempt completed without error, False otherwise.
    """
    errored = threading.Event()

    def _try_fn():
        try:
            fn()
        except exception_type as e:
            logger.warning(e)
            errored.set()
        except Exception as e:
            # BUGFIX: an exception outside `exception_type` used to escape the
            # worker thread unrecorded, leaving `errored` clear and making the
            # attempt look successful. Record it as a failure instead.
            logger.warning(e)
            errored.set()

    for i in range(num_retries):
        errored.clear()

        proc = threading.Thread(target=_try_fn)
        # Daemonize so a hung attempt cannot block interpreter shutdown.
        proc.daemon = True
        proc.start()
        proc.join(timeout=timeout)

        if proc.is_alive():
            logger.debug(
                f"Process timed out (try {i+1}/{num_retries}): "
                f"{getattr(fn, '__name__', None)}"
            )
        elif not errored.is_set():
            # Attempt finished cleanly: success.
            return True

        # Attempt timed out or errored: sleep, then retry.
        time.sleep(sleep_time)

    # All attempts timed out or errored.
    return False


@DeveloperAPI
class warn_if_slow:
    """Prints a warning if a given operation is slower than 500ms.

    Example:
        >>> from ray.tune.utils.util import warn_if_slow
        >>> something = ...  # doctest: +SKIP
        >>> with warn_if_slow("some_operation"):  # doctest: +SKIP
        ...    ray.get(something)  # doctest: +SKIP
    """

    DEFAULT_THRESHOLD = float(os.environ.get("TUNE_WARN_THRESHOLD_S", 0.5))
    DEFAULT_MESSAGE = (
        "The `{name}` operation took {duration:.3f} s, "
        "which may be a performance bottleneck."
    )

    def __init__(
        self,
        name: str,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
        disable: bool = False,
    ):
        self.name = name
        self.threshold = threshold or self.DEFAULT_THRESHOLD
        self.message = message or self.DEFAULT_MESSAGE
        self.too_slow = False
        self.disable = disable

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, type, value, traceback):
        now = time.time()
        if self.disable:
            return
        # Suppress warnings during the first minute of the process to avoid
        # flagging one-time startup costs.
        if now - self.start > self.threshold and now - START_OF_TIME > 60.0:
            self.too_slow = True
            duration = now - self.start
            logger.warning(self.message.format(name=self.name, duration=duration))


@DeveloperAPI
class Tee(object):
    """File-like object that forwards writes/seeks/flushes to two streams.

    ValueErrors from either stream (e.g. writing to a closed file) are
    logged instead of raised, with recursion protection in case stderr
    itself is redirected to this object.
    """

    def __init__(self, stream1, stream2):
        self.stream1 = stream1
        self.stream2 = stream2

        # If True, we are currently handling a warning.
        # We use this flag to avoid infinite recursion.
        self._handling_warning = False

    def _warn(self, op, s, args, kwargs):
        # If we are already handling a warning, this is because
        # `logger.warning` below triggered the same object again
        # (e.g. because stderr is redirected to this object).
        # In that case, exit early to avoid recursion.
        if self._handling_warning:
            return

        msg = f"ValueError when calling '{op}' on stream ({s}). "
        msg += f"args: {args} kwargs: {kwargs}"

        self._handling_warning = True
        logger.warning(msg)
        self._handling_warning = False

    def seek(self, *args, **kwargs):
        for s in [self.stream1, self.stream2]:
            try:
                s.seek(*args, **kwargs)
            except ValueError:
                self._warn("seek", s, args, kwargs)

    def write(self, *args, **kwargs):
        for s in [self.stream1, self.stream2]:
            try:
                s.write(*args, **kwargs)
            except ValueError:
                self._warn("write", s, args, kwargs)

    def flush(self, *args, **kwargs):
        for s in [self.stream1, self.stream2]:
            try:
                s.flush(*args, **kwargs)
            except ValueError:
                self._warn("flush", s, args, kwargs)

    @property
    def encoding(self):
        if hasattr(self.stream1, "encoding"):
            return self.stream1.encoding
        return self.stream2.encoding

    @property
    def error(self):
        if hasattr(self.stream1, "error"):
            return self.stream1.error
        return self.stream2.error

    @property
    def newlines(self):
        if hasattr(self.stream1, "newlines"):
            return self.stream1.newlines
        return self.stream2.newlines

    def detach(self):
        raise NotImplementedError

    def read(self, *args, **kwargs):
        raise NotImplementedError

    def readline(self, *args, **kwargs):
        raise NotImplementedError

    def tell(self, *args, **kwargs):
        raise NotImplementedError
" + msg += f"args: {args} kwargs: {kwargs}" + + self._handling_warning = True + logger.warning(msg) + self._handling_warning = False + + def seek(self, *args, **kwargs): + for s in [self.stream1, self.stream2]: + try: + s.seek(*args, **kwargs) + except ValueError: + self._warn("seek", s, args, kwargs) + + def write(self, *args, **kwargs): + for s in [self.stream1, self.stream2]: + try: + s.write(*args, **kwargs) + except ValueError: + self._warn("write", s, args, kwargs) + + def flush(self, *args, **kwargs): + for s in [self.stream1, self.stream2]: + try: + s.flush(*args, **kwargs) + except ValueError: + self._warn("flush", s, args, kwargs) + + @property + def encoding(self): + if hasattr(self.stream1, "encoding"): + return self.stream1.encoding + return self.stream2.encoding + + @property + def error(self): + if hasattr(self.stream1, "error"): + return self.stream1.error + return self.stream2.error + + @property + def newlines(self): + if hasattr(self.stream1, "newlines"): + return self.stream1.newlines + return self.stream2.newlines + + def detach(self): + raise NotImplementedError + + def read(self, *args, **kwargs): + raise NotImplementedError + + def readline(self, *args, **kwargs): + raise NotImplementedError + + def tell(self, *args, **kwargs): + raise NotImplementedError + + +@DeveloperAPI +def date_str(): + return datetime.today().strftime("%Y-%m-%d_%H-%M-%S") + + +def _to_pinnable(obj): + """Converts obj to a form that can be pinned in object store memory. + + Currently only numpy arrays are pinned in memory, if you have a strong + reference to the array value. + """ + + return (obj, np.zeros(1)) + + +def _from_pinnable(obj): + """Retrieve from _to_pinnable format.""" + + return obj[0] + + +@DeveloperAPI +def diagnose_serialization(trainable: Callable): + """Utility for detecting why your trainable function isn't serializing. + + Args: + trainable: The trainable object passed to + tune.Tuner(trainable). Currently only supports + Function API. 
+ + Returns: + bool | set of unserializable objects. + + Example: + + .. code-block:: python + + import threading + # this is not serializable + e = threading.Event() + + def test(): + print(e) + + diagnose_serialization(test) + # should help identify that 'e' should be moved into + # the `test` scope. + + # correct implementation + def test(): + e = threading.Event() + print(e) + + assert diagnose_serialization(test) is True + + """ + from ray.tune.registry import _check_serializability, register_trainable + + def check_variables(objects, failure_set, printer): + for var_name, variable in objects.items(): + msg = None + try: + _check_serializability(var_name, variable) + status = "PASSED" + except Exception as e: + status = "FAILED" + msg = f"{e.__class__.__name__}: {str(e)}" + failure_set.add(var_name) + printer(f"{str(variable)}[name='{var_name}'']... {status}") + if msg: + printer(msg) + + print(f"Trying to serialize {trainable}...") + try: + register_trainable("__test:" + str(trainable), trainable, warn=False) + print("Serialization succeeded!") + return True + except Exception as e: + print(f"Serialization failed: {e}") + + print( + "Inspecting the scope of the trainable by running " + f"`inspect.getclosurevars({str(trainable)})`..." + ) + closure = inspect.getclosurevars(trainable) + failure_set = set() + if closure.globals: + print( + f"Detected {len(closure.globals)} global variables. " + "Checking serializability..." + ) + check_variables(closure.globals, failure_set, lambda s: print(" " + s)) + + if closure.nonlocals: + print( + f"Detected {len(closure.nonlocals)} nonlocal variables. " + "Checking serializability..." + ) + check_variables(closure.nonlocals, failure_set, lambda s: print(" " + s)) + + if not failure_set: + print( + "Nothing was found to have failed the diagnostic test, though " + "serialization did not succeed. Feel free to raise an " + "issue on github." 
def _atomic_save(state: Dict, checkpoint_dir: str, file_name: str, tmp_file_name: str):
    """Atomically saves the state object to the checkpoint directory.

    This is automatically used by Tuner().fit during a Tune job. The state is
    written to a temporary file first and then renamed over the final name,
    so readers never observe a partially written checkpoint.

    Args:
        state: Object state to be serialized.
        checkpoint_dir: Directory location for the checkpoint.
        file_name: Final name of file.
        tmp_file_name: Temporary name of file.
    """
    import ray.cloudpickle as cloudpickle

    staging_path = os.path.join(checkpoint_dir, tmp_file_name)
    with open(staging_path, "wb") as f:
        cloudpickle.dump(state, f)

    os.replace(staging_path, os.path.join(checkpoint_dir, file_name))


def _load_newest_checkpoint(dirpath: str, ckpt_pattern: str) -> Optional[Dict]:
    """Returns the most recently modified checkpoint.

    Assumes files are saved with an ordered name, most likely by
    :obj:atomic_save, so the lexicographically largest match is the newest.

    Args:
        dirpath: Directory in which to look for the checkpoint file.
        ckpt_pattern: File name pattern to match to find checkpoint files.

    Returns:
        (dict) Deserialized state dict, or None when nothing matches.
    """
    import ray.cloudpickle as cloudpickle

    candidates = glob.glob(os.path.join(dirpath, ckpt_pattern))
    if not candidates:
        return
    newest_path = max(candidates)
    with open(newest_path, "rb") as f:
        return cloudpickle.load(f)


@PublicAPI(stability="beta")
def wait_for_gpu(
    gpu_id: Optional[Union[int, str]] = None,
    target_util: float = 0.01,
    retry: int = 20,
    delay_s: int = 5,
    gpu_memory_limit: Optional[float] = None,
):
    """Checks if a given GPU has freed memory.

    Requires ``gputil`` to be installed: ``pip install gputil``.

    Args:
        gpu_id: GPU id or uuid to check.
            Must be found within GPUtil.getGPUs(). If none, resorts to
            the first item returned from `ray.get_gpu_ids()`.
        target_util: The utilization threshold to reach to unblock.
            Set this to 0 to block until the GPU is completely free.
        retry: Number of times to check GPU limit. Sleeps `delay_s`
            seconds between checks.
        delay_s: Seconds to wait before check.

    Returns:
        bool: True if free.

    Raises:
        RuntimeError: If GPUtil is not found, if no GPUs are detected
            or if the check fails.

    Example:

    .. code-block:: python

        def tune_func(config):
            tune.utils.wait_for_gpu()
            train()

        tuner = tune.Tuner(
            tune.with_resources(
                tune_func,
                resources={"gpu": 1}
            ),
            tune_config=tune.TuneConfig(num_samples=10)
        )
        tuner.fit()

    """
    GPUtil = _import_gputil()

    if GPUtil is None:
        raise RuntimeError("GPUtil must be installed if calling `wait_for_gpu`.")

    if gpu_id is None:
        # Fall back to the first GPU Ray assigned to this worker.
        assigned = ray.get_gpu_ids()
        if not assigned:
            raise RuntimeError(
                "No GPU ids found from `ray.get_gpu_ids()`. "
                "Did you set Tune resources correctly?"
            )
        gpu_id = assigned[0]

    gpu_attr = "id"
    if isinstance(gpu_id, str):
        if gpu_id.isdigit():
            # GPU ID returned from `ray.get_gpu_ids()` is a str representation
            # of the int GPU ID
            gpu_id = int(gpu_id)
        else:
            # Could not coerce gpu_id to int, so assume UUID
            # and compare against `uuid` attribute e.g.,
            # 'GPU-04546190-b68d-65ac-101b-035f8faed77d'
            gpu_attr = "uuid"
    elif not isinstance(gpu_id, int):
        raise ValueError(f"gpu_id ({type(gpu_id)}) must be type str/int.")

    def gpu_id_fn(g):
        # Returns either `g.id` or `g.uuid` depending on
        # the format of the input `gpu_id`
        return getattr(g, gpu_attr)

    visible_ids = {gpu_id_fn(g) for g in GPUtil.getGPUs()}
    if gpu_id not in visible_ids:
        raise ValueError(
            f"{gpu_id} not found in set of available GPUs: {visible_ids}. "
            "`wait_for_gpu` takes either GPU ordinal ID (e.g., '0') or "
            "UUID (e.g., 'GPU-04546190-b68d-65ac-101b-035f8faed77d')."
        )

    for attempt in range(int(retry)):
        matched_gpu = next(g for g in GPUtil.getGPUs() if gpu_id_fn(g) == gpu_id)
        if matched_gpu.memoryUtil > target_util:
            logger.info(
                f"Waiting for GPU util to reach {target_util}. "
                f"Util: {matched_gpu.memoryUtil:0.3f}"
            )
            time.sleep(delay_s)
        else:
            return True
    raise RuntimeError("GPU memory was not freed.")
" + "`wait_for_gpu` takes either GPU ordinal ID (e.g., '0') or " + "UUID (e.g., 'GPU-04546190-b68d-65ac-101b-035f8faed77d')." + ) + + for i in range(int(retry)): + gpu_object = next(g for g in GPUtil.getGPUs() if gpu_id_fn(g) == gpu_id) + if gpu_object.memoryUtil > target_util: + logger.info( + f"Waiting for GPU util to reach {target_util}. " + f"Util: {gpu_object.memoryUtil:0.3f}" + ) + time.sleep(delay_s) + else: + return True + raise RuntimeError("GPU memory was not freed.") + + +@DeveloperAPI +def validate_save_restore( + trainable_cls: Type, + config: Optional[Dict] = None, + num_gpus: int = 0, +): + """Helper method to check if your Trainable class will resume correctly. + + Args: + trainable_cls: Trainable class for evaluation. + config: Config to pass to Trainable when testing. + num_gpus: GPU resources to allocate when testing. + use_object_store: Whether to save and restore to Ray's object + store. Recommended to set this to True if planning to use + algorithms that pause training (i.e., PBT, HyperBand). + """ + assert ray.is_initialized(), "Need Ray to be initialized." + + remote_cls = ray.remote(num_gpus=num_gpus)(trainable_cls) + trainable_1 = remote_cls.remote(config=config) + trainable_2 = remote_cls.remote(config=config) + + from ray.air.constants import TRAINING_ITERATION + + for _ in range(3): + res = ray.get(trainable_1.train.remote()) + + assert res.get(TRAINING_ITERATION), ( + "Validation will not pass because it requires `training_iteration` " + "to be returned." 
+ ) + + ray.get(trainable_2.restore.remote(trainable_1.save.remote())) + + res = ray.get(trainable_2.train.remote()) + assert res[TRAINING_ITERATION] == 4 + + res = ray.get(trainable_2.train.remote()) + assert res[TRAINING_ITERATION] == 5 + return True + + +def _detect_config_single(func): + """Check if func({}) works.""" + func_sig = inspect.signature(func) + use_config_single = True + try: + func_sig.bind({}) + except Exception as e: + logger.debug(str(e)) + use_config_single = False + return use_config_single + + +@PublicAPI() +def validate_warmstart( + parameter_names: List[str], + points_to_evaluate: List[Union[List, Dict]], + evaluated_rewards: List, + validate_point_name_lengths: bool = True, +): + """Generic validation of a Searcher's warm start functionality. + Raises exceptions in case of type and length mismatches between + parameters. + + If ``validate_point_name_lengths`` is False, the equality of lengths + between ``points_to_evaluate`` and ``parameter_names`` will not be + validated. + """ + if points_to_evaluate: + if not isinstance(points_to_evaluate, list): + raise TypeError( + "points_to_evaluate expected to be a list, got {}.".format( + type(points_to_evaluate) + ) + ) + for point in points_to_evaluate: + if not isinstance(point, (dict, list)): + raise TypeError( + f"points_to_evaluate expected to include list or dict, " + f"got {point}." + ) + + if validate_point_name_lengths and (not len(point) == len(parameter_names)): + raise ValueError( + "Dim of point {}".format(point) + + " and parameter_names {}".format(parameter_names) + + " do not match." 
+ ) + + if points_to_evaluate and evaluated_rewards: + if not isinstance(evaluated_rewards, list): + raise TypeError( + "evaluated_rewards expected to be a list, got {}.".format( + type(evaluated_rewards) + ) + ) + if not len(evaluated_rewards) == len(points_to_evaluate): + raise ValueError( + "Dim of evaluated_rewards {}".format(evaluated_rewards) + + " and points_to_evaluate {}".format(points_to_evaluate) + + " do not match." + )