"""Training callbacks: rank-0 verification and console/TensorBoard logging."""

import logging
import os
import time
from typing import List

import torch
from eval import verification
from torch import distributed
from torch.utils.tensorboard import SummaryWriter

from utils.utils_logging import AverageMeter


class CallBackVerification(object):
    """Runs the registered verification sets (loaded from .bin files) on rank 0."""

    def __init__(self, val_targets, rec_prefix, summary_writer=None, image_size=(112, 112), wandb_logger=None):
        self.rank: int = distributed.get_rank()
        self.highest_acc: float = 0.0
        self.highest_acc_list: List[float] = [0.0] * len(val_targets)
        self.ver_list: List[object] = []
        self.ver_name_list: List[str] = []
        if self.rank == 0:
            self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size)

        self.summary_writer = summary_writer
        self.wandb_logger = wandb_logger

    def ver_test(self, backbone: torch.nn.Module, global_step: int):
        results = []
        for i in range(len(self.ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(self.ver_list[i], backbone, 10, 10)
            logging.info("[%s][%d]XNorm: %f" % (self.ver_name_list[i], global_step, xnorm))
            logging.info("[%s][%d]Accuracy-Flip: %1.5f+-%1.5f" % (self.ver_name_list[i], global_step, acc2, std2))

            # summary_writer defaults to None, so guard before logging the scalar.
            if self.summary_writer is not None:
                self.summary_writer.add_scalar(
                    tag=self.ver_name_list[i],
                    scalar_value=acc2,
                    global_step=global_step,
                )
            if self.wandb_logger:
                self.wandb_logger.log(
                    {
                        f"Acc/val-Acc1 {self.ver_name_list[i]}": acc1,
                        f"Acc/val-Acc2 {self.ver_name_list[i]}": acc2,
                    }
                )

            if acc2 > self.highest_acc_list[i]:
                self.highest_acc_list[i] = acc2
            logging.info(
                "[%s][%d]Accuracy-Highest: %1.5f" % (self.ver_name_list[i], global_step, self.highest_acc_list[i])
            )
            results.append(acc2)
        return results

    def init_dataset(self, val_targets, data_dir, image_size):
        for name in val_targets:
            path = os.path.join(data_dir, name + ".bin")
            if os.path.exists(path):
                data_set = verification.load_bin(path, image_size)
                self.ver_list.append(data_set)
                self.ver_name_list.append(name)

    def __call__(self, num_update, backbone: torch.nn.Module):
        if self.rank == 0 and num_update > 0:
            backbone.eval()
            self.ver_test(backbone, num_update)
            backbone.train()
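
# --- Usage sketch (not executed) --------------------------------------------
# A minimal wiring for CallBackVerification, assuming torch.distributed is
# already initialized; `cfg.rec`, `backbone`, and `global_step` are
# hypothetical names, not part of this module:
#
#   cb_ver = CallBackVerification(
#       val_targets=["lfw", "cfp_fp", "agedb_30"],
#       rec_prefix=cfg.rec,
#       summary_writer=SummaryWriter(log_dir="logs"))
#   cb_ver(global_step, backbone)   # evaluates on rank 0, then restores train()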


class CallBackLogging(object):
    """Logs speed, loss, learning rate, and ETA every `frequent` steps on rank 0."""

    def __init__(self, frequent, total_step, batch_size, start_step=0, writer=None):
        self.frequent: int = frequent
        self.rank: int = distributed.get_rank()
        self.world_size: int = distributed.get_world_size()
        self.time_start = time.time()
        self.total_step: int = total_step
        self.start_step: int = start_step
        self.batch_size: int = batch_size
        self.writer = writer

        self.init = False
        self.tic = 0

    def __call__(
        self,
        global_step: int,
        loss: AverageMeter,
        epoch: int,
        fp16: bool,
        learning_rate: float,
        grad_scaler: torch.cuda.amp.GradScaler,
    ):
        if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
            if self.init:
                try:
                    speed: float = self.frequent * self.batch_size / (time.time() - self.tic)
                    speed_total = speed * self.world_size
                except ZeroDivisionError:
                    speed_total = float("inf")

                # ETA: average seconds per step so far, scaled by the steps remaining.
                time_now = time.time()
                time_sec = int(time_now - self.time_start)
                time_sec_avg = time_sec / (global_step - self.start_step + 1)
                eta_sec = time_sec_avg * (self.total_step - global_step - 1)
                time_for_end = eta_sec / 3600
                if self.writer is not None:
                    self.writer.add_scalar("time_for_end", time_for_end, global_step)
                    self.writer.add_scalar("learning_rate", learning_rate, global_step)
                    self.writer.add_scalar("loss", loss.avg, global_step)
                if fp16:
                    msg = (
                        "Speed %.2f samples/sec Loss %.4f LearningRate %.6f Epoch: %d Global Step: %d "
                        "Fp16 Grad Scale: %2.f Required: %1.f hours"
                        % (
                            speed_total,
                            loss.avg,
                            learning_rate,
                            epoch,
                            global_step,
                            grad_scaler.get_scale(),
                            time_for_end,
                        )
                    )
                else:
                    msg = (
                        "Speed %.2f samples/sec Loss %.4f LearningRate %.6f Epoch: %d Global Step: %d "
                        "Required: %1.f hours"
                        % (speed_total, loss.avg, learning_rate, epoch, global_step, time_for_end)
                    )
                logging.info(msg)
                loss.reset()
                self.tic = time.time()
            else:
                self.init = True
                self.tic = time.time()
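
# --- Usage sketch (not executed) --------------------------------------------
# A minimal training-loop wiring for CallBackLogging; `train_loader`, `epoch`,
# `lr_scheduler`, and `training_step` are hypothetical names, while
# AverageMeter is the meter imported above:
#
#   cb_log = CallBackLogging(frequent=100, total_step=100_000, batch_size=128,
#                            writer=SummaryWriter(log_dir="logs"))
#   loss_meter = AverageMeter()
#   amp = torch.cuda.amp.GradScaler(growth_interval=100)
#   for global_step, (img, label) in enumerate(train_loader):
#       loss = training_step(img, label)
#       loss_meter.update(loss.item(), 1)
#       cb_log(global_step, loss_meter, epoch, fp16=True,
#              learning_rate=lr_scheduler.get_last_lr()[0], grad_scaler=amp)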