# FBAGSTM's picture
# STM32 AI Experimentation Hub
# 747451d
# /*---------------------------------------------------------------------------------------------
# * Copyright (c) 2022-2023 STMicroelectronics.
# * All rights reserved.
# *
# * This software is licensed under terms that can be found in the LICENSE file in
# * the root directory of this software component.
# * If no LICENSE file comes with this software, it is provided AS-IS.
# *--------------------------------------------------------------------------------------------*/
import os
import sys
import hydra
import warnings
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import argparse
from pathlib import Path
from omegaconf import DictConfig
from hydra.core.hydra_config import HydraConfig
import mlflow
import tensorflow as tf
from clearml import Task
from clearml.backend_config.defs import get_active_config_file
import torch
import torch.backends.cudnn as cudnn
import random
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(SCRIPT_DIR))
from api.api import get_model, get_dataloaders, get_quantizer, get_predictor, get_evaluator, get_trainer
from common.utils import mlflow_ini, set_gpu_memory_limit, get_random_seed, log_to_file
from common.benchmarking import benchmark, cloud_connect
from common.evaluation import gen_load_val
from common.prediction import gen_load_val_predict
from object_detection.tf.src.utils import get_config
from object_detection.tf.src.deployment import deploy, deploy_mpu
from common.onnx_utils.onnx_model_convertor import torch_model_export_static
from object_detection.pt.src.utils.yolod import (configure_module,
configure_nccl, configure_omp,
init_distributed_mode)
# tf.config.run_functions_eagerly() switches TensorFlow's eager mode on or off.
# Eager mode is meant for debugging the Model Zoo code and is slower.
# Keep the argument set to False to avoid runtime penalties.
tf.config.run_functions_eagerly(False)
def _cloud_credentials(cfg: DictConfig):
    """
    Connect to the cloud service when the configuration requests it.

    Args:
        cfg: Configuration dictionary.

    Returns:
        The third value returned by cloud_connect() when
        cfg.tools.stedgeai.on_cloud is set, None otherwise.
    """
    if not cfg.tools.stedgeai.on_cloud:
        return None
    _, _, credentials = cloud_connect(stedgeai_core_version=cfg.tools.stedgeai.version)
    return credentials


def _train(cfg: DictConfig, model, dataloaders):
    """Build a trainer for `model`, run it and return the result of train()."""
    trainer = get_trainer(cfg=cfg, model=model, dataloaders=dataloaders)
    return trainer.train()


def _quantize(cfg: DictConfig, model, dataloaders):
    """Build a quantizer for `model`, run it and return the result of quantize()."""
    quantizer = get_quantizer(cfg=cfg, model=model, dataloaders=dataloaders)
    return quantizer.quantize()


def _evaluate(cfg: DictConfig, model, dataloaders):
    """Build an evaluator for `model`, run it and return the result of evaluate()."""
    evaluator = get_evaluator(cfg=cfg, model=model, dataloaders=dataloaders)
    return evaluator.evaluate()


def _deploy_model(cfg: DictConfig, model_path) -> None:
    """Deploy the model at `model_path` with the MPU or the default deployment service."""
    if cfg.hardware_type == "MPU":
        deploy_mpu(cfg, model_path_to_deploy=model_path)
    else:
        deploy(cfg, model_path_to_deploy=model_path)


def process_mode(cfg: DictConfig):
    """
    Execution of the various services.

    Creates the model (and the dataloaders for every mode except
    'benchmarking' and 'deployment'), then runs the service or chain of
    services selected by cfg.operation_mode.

    Args:
        cfg: Configuration dictionary.

    Returns:
        None

    Raises:
        RuntimeError: If cfg.operation_mode is not one of the supported modes.
            Note that the model and dataloaders are created before the mode
            is checked.
    """
    mode = cfg.operation_mode
    mlflow.log_param("model_path", cfg.model.model_path)
    # logging the operation_mode in the output_dir/stm32ai_main.log file
    log_to_file(cfg.output_dir, f'operation_mode: {mode}')

    saved_model_dir = os.path.join(cfg.output_dir, cfg.general.saved_models_dir)
    os.makedirs(saved_model_dir, exist_ok=True)

    # create model
    model = get_model(cfg=cfg)
    training_modes = ['training', 'chain_tb', 'chain_tqe', 'chain_tqeb', 'chain_tbqeb']
    if (cfg.model.framework == 'torch'
            and isinstance(model, torch.nn.Module)
            and mode not in training_modes):
        # Export Torch models in onnx format for all services but training
        # (export to onnx is also handled at the end of the trainer.train() method)
        model = torch_model_export_static(cfg=cfg,
                                          model_dir=saved_model_dir,
                                          model=model)

    # Creates dataloaders (not needed by the benchmarking and deployment services)
    dataloaders = None
    if mode not in ['benchmarking', 'deployment']:
        dataloaders = get_dataloaders(cfg=cfg)

    if mode == "training":
        _train(cfg, model, dataloaders)
        print("[INFO] training complete")

    elif mode == "evaluation":
        # Generates the model to be loaded on the stm32n6 device using stedgeai core,
        # then loads it and validates it on the device if required.
        gen_load_val(cfg=cfg, model=model)
        # Launches evaluation on the target through the model zoo evaluation service
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        _evaluate(cfg, model, dataloaders)
        print("[INFO] evaluation complete")

    elif mode == "quantization":
        _quantize(cfg, model, dataloaders)
        print("[INFO] quantization complete")

    elif mode == "prediction":
        # Generates the model to be loaded on the stm32n6 device using stedgeai core,
        # then loads it and validates it on the device if required.
        gen_load_val_predict(cfg=cfg, model=model)
        # Launches prediction on the target through the model zoo prediction service
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        predictor = get_predictor(cfg=cfg,
                                  model=model,
                                  dataloaders=dataloaders)
        predictor.predict()
        print("[INFO] prediction complete")

    elif mode == 'benchmarking':
        benchmark(cfg, model_path_to_benchmark=model.model_path)
        print("[INFO] benchmarking complete")

    elif mode == 'deployment':
        # The previous code passed an undefined `credentials` name to
        # deploy_mpu() here, which raised NameError on MPU targets. No cloud
        # connection is opened in this mode, so the argument is left out,
        # exactly as in the 'chain_qd' deployment step.
        _deploy_model(cfg, model.model_path)
        print("[INFO] deployment complete")
        if cfg.deployment.hardware_setup.board == "STM32N6570-DK":
            print('[INFO] : Please on STM32N6570-DK toggle the boot switches to the left and power cycle the board.')

    elif mode == 'chain_tqe':
        trained_model = _train(cfg, model, dataloaders)
        quantized_model = _quantize(cfg, trained_model, dataloaders)
        _evaluate(cfg, quantized_model, dataloaders)
        print("Trained model path:", trained_model.model_path)
        print("Quantized model path:", quantized_model.model_path)
        print("[INFO] chain_tqe complete")

    elif mode == 'chain_tqeb':
        credentials = _cloud_credentials(cfg)
        trained_model = _train(cfg, model, dataloaders)
        quantized_model = _quantize(cfg, trained_model, dataloaders)
        _evaluate(cfg, quantized_model, dataloaders)
        benchmark(cfg, model_path_to_benchmark=quantized_model.model_path, credentials=credentials)
        print("Trained model path:", trained_model.model_path)
        print("Quantized model path:", quantized_model.model_path)
        print("[INFO] chain_tqeb complete")

    elif mode == 'chain_eqe':
        _evaluate(cfg, model, dataloaders)
        quantized_model = _quantize(cfg, model, dataloaders)
        _evaluate(cfg, quantized_model, dataloaders)
        print("Quantized model path:", quantized_model.model_path)
        print("[INFO] chain_eqe complete")

    elif mode == 'chain_eqeb':
        credentials = _cloud_credentials(cfg)
        _evaluate(cfg, model, dataloaders)
        quantized_model = _quantize(cfg, model, dataloaders)
        _evaluate(cfg, quantized_model, dataloaders)
        benchmark(cfg, model_path_to_benchmark=quantized_model.model_path, credentials=credentials)
        print("Quantized model path:", quantized_model.model_path)
        print("[INFO] chain_eqeb complete")

    elif mode == 'chain_qb':
        credentials = _cloud_credentials(cfg)
        quantized_model = _quantize(cfg, model, dataloaders)
        benchmark(cfg, model_path_to_benchmark=quantized_model.model_path, credentials=credentials)
        print("Quantized model path:", quantized_model.model_path)
        print("[INFO] chain_qb complete")

    elif mode == 'chain_qd':
        quantized_model = _quantize(cfg, model, dataloaders)
        _deploy_model(cfg, quantized_model.model_path)
        print("Quantized model path:", quantized_model.model_path)
        print("[INFO] chain_qd complete")

    else:
        raise RuntimeError(f"Internal error: invalid operation mode: {mode}")

    # Every mode that runs a benchmark logs the tool version and the target.
    # 'chain_tqeb' was missing from this list; the stale 'chain_tbqeb' entry
    # could never match because that mode is rejected above.
    if mode in ('benchmarking', 'chain_tqeb', 'chain_qb', 'chain_eqeb'):
        mlflow.log_param("stedgeai_core_version", cfg.tools.stedgeai.version)
        mlflow.log_param("target", cfg.benchmarking.board)

    # logging the completion of the chain
    log_to_file(cfg.output_dir, f'operation finished: {mode}')

    # ClearML - Example how to get task's context anywhere in the file.
    # Checks if there's a valid ClearML configuration file
    if get_active_config_file() is not None:
        print("[INFO] : ClearML task connection")
        task = Task.current_task()
        # Skip the connection when no ClearML task has been initialised.
        if task is not None:
            task.connect(cfg)
def _torch_specific_initializations(cfg: DictConfig = None) -> None:
    """
    PyTorch-specific initializations.

    This function is used for PyTorch-specific initializations:
    - setting a random seed for reproducibility,
    - multi GPU set up,
    - storing the selected device ('cuda' or 'cpu') in cfg.device.

    Args:
        cfg (DictConfig): Configuration object. cfg.general.global_seed is
            read and cfg.device is written.
    """
    seed = cfg.general.global_seed
    seeded = seed is not None
    if seeded:
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True
        # NOTE: this module calls warnings.filterwarnings("ignore") at import
        # time, so this message is not displayed unless that filter is changed.
        warnings.warn(
            "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
            "which can slow down your training considerably! You may see unexpected behavior "
            "when restarting from checkpoints."
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Torch specific initializations for Multi GPU
    configure_module()
    configure_nccl()
    configure_omp()
    init_distributed_mode()

    # cuDNN benchmark mode may select a different algorithm from one run to
    # the next, which defeats the reproducibility requested by a seed. It is
    # therefore enabled only for unseeded runs.
    cudnn.benchmark = not seeded

    cfg.device = device
@hydra.main(version_base=None, config_path="", config_name="user_config")
def main(cfg: DictConfig) -> None:
    """
    Main entry point of the script.

    Applies the optional GPU memory limit, parses the configuration, sets up
    mlflow and (when a configuration file is active) ClearML, seeds the random
    generators and finally runs the requested operation mode.

    Args:
        cfg: Configuration dictionary.

    Returns:
        None
    """
    # Configure the GPU (the 'general' section may be missing)
    general = cfg.general if "general" in cfg else None
    gpu_memory_limit = None
    if general and "gpu_memory_limit" in general:
        gpu_memory_limit = general.gpu_memory_limit

    if gpu_memory_limit:
        # Set upper limit on usable GPU memory
        set_gpu_memory_limit(gpu_memory_limit)
    else:
        # Warn whenever no limit is applied, including when the whole
        # 'general' section is absent from the configuration.
        print("[WARNING] The usable GPU memory is unlimited.\n"
              "Please consider setting the 'gpu_memory_limit' attribute "
              "in the 'general' section of your configuration file.")

    # Parse the configuration file
    cfg = get_config(cfg)
    cfg.output_dir = HydraConfig.get().runtime.output_dir
    mlflow_ini(cfg)

    if cfg.model.framework == 'torch':
        print('[Info]: Torch specific initializations')
        _torch_specific_initializations(cfg)

    # Checks if there's a valid ClearML configuration file
    print("[INFO] : ClearML config check")
    if get_active_config_file() is not None:
        print("[INFO] : ClearML initialization and configuration")
        # ClearML - Initializing ClearML's Task object.
        task = Task.init(project_name=cfg.general.project_name,
                         task_name='od_modelzoo_task')
        # ClearML - Optional yaml logging
        task.connect_configuration(name=cfg.operation_mode,
                                   configuration=cfg)

    # Seed global seed for random generators
    seed = get_random_seed(cfg)
    print(f'[INFO] : The random seed for this simulation is {seed}')
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)

    # The default hardware type is "MCU".
    process_mode(cfg)
if __name__ == "__main__":
    # Command-line description: the location and name of the configuration
    # file, followed by any number of configuration overrides.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config-path',
        type=str,
        default='',
        help='Path to folder containing configuration file',
    )
    parser.add_argument(
        '--config-name',
        type=str,
        default='user_config',
        help='name of the configuration file',
    )
    parser.add_argument(
        'params',
        nargs='*',
        help='List of parameters to over-ride in config.yaml',
    )
    args = parser.parse_args()

    # Run the requested service(s); main() takes no explicit argument here.
    main()

    # Record which configuration file was used, then close the mlflow run.
    for param_name, param_value in (('config_path', args.config_path),
                                    ('config_name', args.config_name)):
        mlflow.log_param(param_name, param_value)
    mlflow.end_run()