# /*---------------------------------------------------------------------------------------------
# * Copyright (c) 2022 STMicroelectronics.
# * All rights reserved.
# * This software is licensed under terms that can be found in the LICENSE file in
# * the root directory of this software component.
# * If no LICENSE file comes with this software, it is provided AS-IS.
# *--------------------------------------------------------------------------------------------*/
import os, sys, time
import numpy as np
import tensorflow as tf
def set_gpu_memory_limit(gigabytes: int) -> None:
    """
    Sets the upper memory limit for the first GPU to the specified number of gigabytes.

    Args:
        gigabytes (int): The number of gigabytes to set as the upper memory limit.

    Raises:
        RuntimeError: If virtual devices have not been set before GPUs are initialized.

    Returns:
        None
    """
    # GPU memory usage configuration
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        # Nothing to configure on a CPU-only host.
        return
    try:
        # Must be called before the GPU runtime is initialized; TensorFlow
        # raises RuntimeError (bad timing) or ValueError (bad device/config)
        # otherwise.
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=1024 * gigabytes)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print("{} physical GPUs, {} logical GPUs".format(len(gpus), len(logical_gpus)))
        print("[INFO] : Setting upper memory limit to {}GBytes on gpu[0]".format(gigabytes))
    except (ValueError, RuntimeError) as e:
        # Narrowed from a bare `except:` (which also caught KeyboardInterrupt
        # and SystemExit) and chained so the original cause is not lost.
        raise RuntimeError("\nVirtual devices must be set before GPUs have been initialized.") from e
def inc_gpu_mode() -> None:
    """
    Enable on-demand (incremental) GPU memory allocation on every visible GPU.

    Returns:
        None
    """
    devices = tf.config.experimental.list_physical_devices('GPU')
    if not devices:
        # CPU-only host: nothing to configure.
        return
    try:
        # Memory growth must be set uniformly across all GPUs and before
        # the runtime initializes them; TensorFlow raises RuntimeError otherwise.
        for device in devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as e:
        print(f"Error setting memory growth: {e}")
def check_training_determinism(model: tf.keras.Model, sample_ds: tf.data.Dataset):
    """
    Probe one forward/backward pass to detect ops that would raise during training.

    Args:
        model (tf.keras.Model): A keras model.
        sample_ds (tf.data.Dataset): Dataset yielding at least one (x, y) batch.

    Returns:
        valid_training (bool): True if the probe step raised no exception.
    """
    inputs, targets = next(iter(sample_ds))
    try:
        # One full train-like step: forward pass, loss, gradient computation.
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            step_loss = model.loss(targets, predictions)
            _ = tape.gradient(step_loss, model.trainable_variables)
    except Exception as error:
        print(f"[WARN] {error}")
        return False
    return True
def get_mem_consumption(batchsize, input_shape, model):
    """
    Measure peak GPU memory and wall-clock time for one training step with
    the given batch size, input shape, and model.

    Args:
        batchsize (int): The batch size for the input data.
        input_shape (tuple): The shape of one input sample (without batch dim).
        model (tf.keras.Model): The model to calculate memory consumption and time for.

    Returns:
        tuple: (peak GPU memory in GB, elapsed time in seconds) for the step.
    """
    img = np.random.rand(batchsize, *input_shape)
    # model.output.shape carries a symbolic batch dimension (None); substitute
    # the concrete batch size, since np.random.rand rejects None dimensions.
    labels_shape = (batchsize, *model.output.shape[1:])
    label = np.random.rand(*labels_shape)
    # Reset stats so "peak" reflects only this one training step.
    tf.config.experimental.reset_memory_stats("GPU:0")
    t1 = time.time()
    with tf.GradientTape(watch_accessed_variables=False) as g:
        g.watch(model.trainable_variables)
        model_output = model(img, training=True)
        loss = model.loss(label, model_output)
    gradients = g.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    t2 = time.time()
    info = tf.config.experimental.get_memory_info("GPU:0")
    tf.config.experimental.reset_memory_stats("GPU:0")
    time_consumed = t2 - t1
    # Convert bytes -> GB.
    return (info["peak"] / 1024 / 1024 / 1024, time_consumed)
def gpu_benchmark(gpu_limit, batch_size, input_shape, model):
    """
    Benchmark the memory consumption of a given model with a given batch size and input shape.

    Args:
        gpu_limit (float): The maximum amount of GPU memory (in GB) that can be used.
        batch_size (int): The batch size for the input data.
        input_shape (tuple): The shape of the input data.
        model (tf.keras.Model): The model to benchmark.

    Returns:
        bool: True if an exception was raised during benchmarking (e.g. the
        configuration did not fit in GPU memory), False otherwise.
    """
    raised_exception = False
    try:
        peak_gb, _ = get_mem_consumption(batch_size, input_shape, model)
        if gpu_limit > peak_gb:
            print("[INFO] : Model memory requirement: {:.2f} GB".format(peak_gb))
    except Exception as e:
        # Report the failure instead of swallowing it silently; the caller
        # still only sees the boolean, so the interface is unchanged.
        print(f"[WARN] GPU benchmark failed: {e}")
        raised_exception = True
    return raised_exception
|