# /*---------------------------------------------------------------------------------------------
# * Copyright (c) 2022 STMicroelectronics.
# * All rights reserved.
# * This software is licensed under terms that can be found in the LICENSE file in
# * the root directory of this software component.
# * If no LICENSE file comes with this software, it is provided AS-IS.
# *--------------------------------------------------------------------------------------------*/
import os, sys, time
import numpy as np
import tensorflow as tf
def set_gpu_memory_limit(gigabytes: int) -> None:
    """
    Sets the upper memory limit for the first GPU to the specified number of gigabytes.

    Args:
        gigabytes (int): The number of gigabytes to set as the upper memory limit.

    Raises:
        RuntimeError: If virtual devices have not been set before GPUs are initialized.

    Returns:
        None
    """
    # GPU memory usage configuration
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        # Nothing to configure on a CPU-only host.
        return
    try:
        # Must be called before the GPU runtime is initialized; TensorFlow
        # raises RuntimeError (bad timing) or ValueError (bad device/config)
        # otherwise.
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=1024 * gigabytes)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print("{} physical GPUs, {} logical GPUs".format(len(gpus), len(logical_gpus)))
        print("[INFO] : Setting upper memory limit to {}GBytes on gpu[0]".format(gigabytes))
    except (ValueError, RuntimeError) as e:
        # Narrowed from a bare `except:` (which also caught KeyboardInterrupt
        # and SystemExit) and chained so the original cause is not lost.
        raise RuntimeError("\nVirtual devices must be set before GPUs have been initialized.") from e
def inc_gpu_mode() -> None:
    """
    Enable on-demand (incremental) GPU memory allocation on every visible GPU.

    Returns:
        None
    """
    devices = tf.config.experimental.list_physical_devices('GPU')
    if not devices:
        # CPU-only host: nothing to configure.
        return
    try:
        # Memory growth must be set uniformly across all GPUs and before
        # the runtime initializes them; TensorFlow raises RuntimeError otherwise.
        for device in devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as e:
        print(f"Error setting memory growth: {e}")
def check_training_determinism(model: tf.keras.Model, sample_ds: tf.data.Dataset):
    """
    Probe one forward/backward pass to detect ops that would raise during training.

    Args:
        model (tf.keras.Model): A keras model.
        sample_ds (tf.data.Dataset): Dataset yielding at least one (x, y) batch.

    Returns:
        valid_training (bool): True if the probe step raised no exception.
    """
    inputs, targets = next(iter(sample_ds))
    try:
        # One full train-like step: forward pass, loss, gradient computation.
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            step_loss = model.loss(targets, predictions)
            _ = tape.gradient(step_loss, model.trainable_variables)
    except Exception as error:
        print(f"[WARN] {error}")
        return False
    return True
def get_mem_consumption(batchsize, input_shape, model):
    """
    Measure peak GPU memory and wall-clock time for one training step with
    the given batch size, input shape, and model.

    Args:
        batchsize (int): The batch size for the input data.
        input_shape (tuple): The shape of one input sample (without batch dim).
        model (tf.keras.Model): The model to calculate memory consumption and time for.

    Returns:
        tuple: (peak GPU memory in GB, elapsed time in seconds) for the step.
    """
    img = np.random.rand(batchsize, *input_shape)
    # model.output.shape carries a symbolic batch dimension (None); substitute
    # the concrete batch size, since np.random.rand rejects None dimensions.
    labels_shape = (batchsize, *model.output.shape[1:])
    label = np.random.rand(*labels_shape)
    # Reset stats so "peak" reflects only this one training step.
    tf.config.experimental.reset_memory_stats("GPU:0")
    t1 = time.time()
    with tf.GradientTape(watch_accessed_variables=False) as g:
        g.watch(model.trainable_variables)
        model_output = model(img, training=True)
        loss = model.loss(label, model_output)
    gradients = g.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    t2 = time.time()
    info = tf.config.experimental.get_memory_info("GPU:0")
    tf.config.experimental.reset_memory_stats("GPU:0")
    time_consumed = t2 - t1
    # Convert bytes -> GB.
    return (info["peak"] / 1024 / 1024 / 1024, time_consumed)
def gpu_benchmark(gpu_limit, batch_size, input_shape, model):
    """
    Benchmark the memory consumption of a given model with a given batch size and input shape.

    Args:
        gpu_limit (float): The maximum amount of GPU memory (in GB) that can be used.
        batch_size (int): The batch size for the input data.
        input_shape (tuple): The shape of the input data.
        model (tf.keras.Model): The model to benchmark.

    Returns:
        bool: True if an exception was raised during benchmarking (e.g. the
        configuration did not fit in GPU memory), False otherwise.
    """
    raised_exception = False
    try:
        peak_gb, _ = get_mem_consumption(batch_size, input_shape, model)
        if gpu_limit > peak_gb:
            print("[INFO] : Model memory requirement: {:.2f} GB".format(peak_gb))
    except Exception as e:
        # Report the failure instead of swallowing it silently; the caller
        # still only sees the boolean, so the interface is unchanged.
        print(f"[WARN] GPU benchmark failed: {e}")
        raised_exception = True
    return raised_exception
|