import os
import sys
import weakref
import torch
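
# Use the 'spawn' start method so that DataLoader worker processes can safely
# initialize CUDA (CUDA contexts cannot be shared with fork()ed children).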
torch.multiprocessing.set_start_method('spawn')
import torch.nn as nn
import torch.utils.data
from functools import partial
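
# collections.Iterator was removed in Python 3.10 (it now lives only in
# collections.abc), hence the version guard below.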
if sys.version_info >= (3, 10):
    from collections.abc import Iterator
else:
    from collections import Iterator

from tensorboardX import SummaryWriter

from .defaults import create_ddp_model, worker_init_fn
from .hooks import HookBase, build_hooks
import pointcept.utils.comm as comm
from pointcept.datasets import build_dataset, point_collate_fn, collate_fn
from pointcept.models import build_model
from pointcept.utils.logger import get_root_logger
from pointcept.utils.optimizer import build_optimizer
from pointcept.utils.scheduler import build_scheduler
from pointcept.utils.events import EventStorage
from pointcept.utils.registry import Registry
from sklearn.preprocessing import QuantileTransformer
from pointcept.utils.timer import Timer

TRAINERS = Registry("trainers")

from cuml.cluster.hdbscan import HDBSCAN
# from sklearn.cluster import HDBSCAN
import open3d as o3d
import matplotlib.colors as mcolors
import numpy as np
from collections import OrderedDict
import trimesh
import pointops


class TrainerBase:
    def __init__(self) -> None:
        self.hooks = []
        self.epoch = 0
        self.start_epoch = 0
        self.max_epoch = 0
        self.max_iter = 0
        self.comm_info = dict()
        self.data_iterator: Iterator = enumerate([])
        self.storage: EventStorage
        self.writer: SummaryWriter
        self._iter_timer = Timer()

    def register_hooks(self, hooks) -> None:
        hooks = build_hooks(hooks)
        for h in hooks:
            assert isinstance(h, HookBase)
            # To avoid a circular reference, hooks and the trainer cannot own
            # each other. This normally does not matter, but will cause a
            # memory leak if the involved objects define __del__:
            # see http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
            h.trainer = weakref.proxy(self)
        self.hooks.extend(hooks)

    def train(self):
        with EventStorage() as self.storage:
            # => before train
            self.before_train()
            for self.epoch in range(self.start_epoch, self.max_epoch):
                # => before epoch
                self.before_epoch()
                # => run_epoch
                for (
                    self.comm_info["iter"],
                    self.comm_info["input_dict"],
                ) in self.data_iterator:
                    # => before_step
                    self.before_step()
                    # => run_step
                    self.run_step()
                    # => after_step
                    self.after_step()
                # => after epoch
                self.after_epoch()
            # => after train
            self.after_train()

    def before_train(self):
        for h in self.hooks:
            h.before_train()

    def before_epoch(self):
        for h in self.hooks:
            h.before_epoch()

    def before_step(self):
        for h in self.hooks:
            h.before_step()

    def run_step(self):
        raise NotImplementedError

    def after_step(self):
        for h in self.hooks:
            h.after_step()

    def after_epoch(self):
        for h in self.hooks:
            h.after_epoch()
        self.storage.reset_histories()

    def after_train(self):
        # Sync GPUs before running after-train hooks
        comm.synchronize()
        for h in self.hooks:
            h.after_train()
        if comm.is_main_process():
            self.writer.close()


class Trainer(TrainerBase):
    def __init__(self, cfg):
        super(Trainer, self).__init__()
        self.epoch = 0
        self.start_epoch = 0
        self.max_epoch = cfg.eval_epoch
        self.best_metric_value = -torch.inf
        self.logger = get_root_logger(
            log_file=os.path.join(cfg.save_path, "train.log"),
            # file_mode="a" if cfg.resume else "w",
            file_mode="a",
        )
        self.logger.info("=> Loading config ...")
        self.cfg = cfg
        self.logger.info(f"Save path: {cfg.save_path}")
        self.logger.info(f"Config:\n{cfg.pretty_text}")
        self.logger.info("=> Building model ...")
        self.model = self.build_model()
        self.logger.info("=> Building dataset (val split) ...")
        self.train_loader = self.build_train_loader()
        self.logger.info("=> Building hooks ...")
        self.register_hooks(self.cfg.hooks)
        # Evaluation-specific options
        self.val_scales_list = self.cfg.val_scales_list
        self.mesh_voting = self.cfg.mesh_voting
        self.backbone_weight_path = self.cfg.backbone_weight_path

    def eval(self):
        # val_data = build_dataset(self.cfg.data.val)
        self.logger.info("=> Loading checkpoint & weight ...")
        if self.backbone_weight_path is not None:
            self.logger.info("=> Loading checkpoint of pretrained backbone")
            if os.path.isfile(self.backbone_weight_path):
                checkpoint = torch.load(
                    self.backbone_weight_path,
                    map_location=lambda storage, loc: storage.cuda(),
                )
                weight = OrderedDict()
                for key, value in checkpoint["state_dict"].items():
                    if not key.startswith("module."):
                        if comm.get_world_size() > 1:
                            key = "module." + key  # xxx.xxx -> module.xxx.xxx
                    # Now all keys contain "module.", whether DDP is used or not.
                    # if self.keywords in key:
                    #     key = key.replace(self.keywords, self.replacement)
                    if comm.get_world_size() == 1:
                        key = key[7:]  # module.xxx.xxx -> xxx.xxx
                    # if key.startswith("backbone."):
                    #     key = key[9:]  # backbone.xxx.xxx -> xxx.xxx
                    key = "backbone." + key  # xxx.xxx -> backbone.xxx.xxx
                    weight[key] = value
                load_state_info = self.model.load_state_dict(weight, strict=False)
                self.logger.info(f"Missing keys: {load_state_info[0]}")
            else:
                self.logger.info(f"No weight found at: {self.backbone_weight_path}")

        if self.cfg.weight and os.path.isfile(self.cfg.weight):
            checkpoint = torch.load(
                self.cfg.weight,
                map_location=lambda storage, loc: storage.cuda(),
            )
            load_state_info = self.model.load_state_dict(checkpoint["state_dict"], strict=False)
            self.logger.info(f"Missing keys: {load_state_info[0]}")
            # Rebuild the quantile transformer from the stored scale statistics
            scale_statistics = checkpoint["state_dict"]["scale_statistics"]
            self.model.quantile_transformer = self._get_quantile_func(scale_statistics)
        else:
            self.logger.info(f"No weight found at: {self.cfg.weight}")
            self.cfg.weight = "last"  # placeholder name for the output folders below

        self.model.eval()
        save_root = os.path.join(
            self.cfg.save_path, "vis_pcd", os.path.splitext(os.path.basename(self.cfg.weight))[0]
        )
        os.makedirs(save_root, exist_ok=True)
        group_save_root = os.path.join(
            self.cfg.save_path, "results", os.path.splitext(os.path.basename(self.cfg.weight))[0]
        )
        os.makedirs(group_save_root, exist_ok=True)

        hex_colors = list(mcolors.CSS4_COLORS.values())
        rgb_colors = np.array(
            [mcolors.to_rgb(color) for color in hex_colors if color not in ['#000000', '#FFFFFF']]
        )

        def relative_luminance(color):
            return 0.2126 * color[0] + 0.7152 * color[1] + 0.0722 * color[2]

        rgb_colors = [
            color for color in rgb_colors if 0.4 < relative_luminance(color) < 0.8
        ]
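        # The coefficients above are the Rec. 709 (WCAG) relative-luminance
        # weights; keeping luminance within (0.4, 0.8) drops colors too dark or
        # too bright to tell apart in the visualization.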
        np.random.shuffle(rgb_colors)

        input_dict = self.train_loader.val_data()
        pcd_inverse = self.train_loader.pcd_inverse
        if self.mesh_voting:
            mesh = trimesh.load(self.train_loader.mesh_path)
            if isinstance(mesh, trimesh.Scene):
                mesh = mesh.dump(concatenate=True)
            mesh.visual = trimesh.visual.ColorVisuals(mesh=mesh)
        for scale in self.val_scales_list:
            input_dict["scale"] = scale
            instance_feat = self.model(input_dict).cpu().detach().numpy()
            clusterer = HDBSCAN(
                cluster_selection_epsilon=0.1,
                min_samples=30,
                min_cluster_size=30,
                allow_single_cluster=False,
            ).fit(instance_feat)
            labels = clusterer.labels_
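            # HDBSCAN labels points that fall in no cluster as noise (-1). The
            # block below reassigns each noise point to the label of its
            # nearest clustered neighbor via a 1-NN query, so every point ends
            # up in a valid group.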
            invalid_label_mask = labels == -1
            if invalid_label_mask.sum() > 0:
                if invalid_label_mask.sum() == len(invalid_label_mask):
                    # Everything is noise: fall back to a single group.
                    labels = np.zeros_like(labels)
                else:
                    coord = input_dict["obj"]["coord"].cuda().contiguous().float()
                    valid_coord = coord[~invalid_label_mask]
                    # pointops offsets are cumulative batch boundaries; a single
                    # scalar count means one batch.
                    valid_offset = torch.tensor(valid_coord.shape[0]).cuda()
                    invalid_coord = coord[invalid_label_mask]
                    invalid_offset = torch.tensor(invalid_coord.shape[0]).cuda()
                    indices, distances = pointops.knn_query(
                        1, valid_coord, valid_offset, invalid_coord, invalid_offset
                    )
                    indices = indices[:, 0].cpu().numpy()
                    labels[invalid_label_mask] = labels[~invalid_label_mask][indices]
            # np.save(os.path.join(group_save_root, f"{str(scale)}.npy"), labels)
            save_path = os.path.join(save_root, f"{str(scale)}.ply")
            coord = input_dict["obj"]["coord"].cpu().numpy()
            random_color = []
            for i in range(max(labels) + 1):
                random_color.append(rgb_colors[i % len(rgb_colors)])
            random_color.append(np.array([0, 0, 0]))  # index -1: black for unlabeled points
            color = [random_color[i] for i in labels]
            pcd = o3d.geometry.PointCloud()
            pcd.points = o3d.utility.Vector3dVector(coord)
            pcd.colors = o3d.utility.Vector3dVector(color)
            o3d.io.write_point_cloud(save_path, pcd)

            labels = labels[pcd_inverse]
            # print(len(clusterer.labels_))
            self.logger.info(f"scale_{scale} has {max(labels) + 1} groups")
            if self.mesh_voting:
                face_index = self.train_loader.face_index
                face_index = face_index[pcd_inverse]
                # Accumulate per-face votes with np.add.at (unbuffered scatter-add)
                # labels = clusterer.labels_
                num_faces = len(mesh.faces)
                num_labels = max(labels) + 1
                votes = np.zeros((num_faces, num_labels), dtype=np.int32)
                np.add.at(votes, (face_index, labels), 1)
                # Find the label with the most votes for each face
                max_votes_labels = np.argmax(votes, axis=1)
                # Set the label to -1 for faces that have no corresponding points
                max_votes_labels[np.all(votes == 0, axis=1)] = -1
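                # Worked example (illustrative): with face_index = [0, 0, 1] and
                # labels = [1, 1, 0], np.add.at increments votes[0, 1] twice and
                # votes[1, 0] once, giving
                #   votes = [[0, 2],
                #            [1, 0]]  ->  argmax per row = [1, 0],
                # so face 0 takes label 1 and face 1 takes label 0; all-zero
                # rows (faces with no points) are marked -1 and filled below.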
                valid_mask = max_votes_labels != -1
                face_centroids = mesh.triangles_center
                coord = torch.tensor(face_centroids).cuda().contiguous().float()
                valid_coord = coord[valid_mask]
                valid_offset = torch.tensor(valid_coord.shape[0]).cuda()
                invalid_coord = coord[~valid_mask]
                invalid_offset = torch.tensor(invalid_coord.shape[0]).cuda()
                indices, distances = pointops.knn_query(
                    1, valid_coord, valid_offset, invalid_coord, invalid_offset
                )
                # # the first column is the point itself
                # indices = indices[:, 1].cpu().numpy()
                indices = indices[:, 0].cpu().numpy()
                mesh_group = max_votes_labels.copy()
                mesh_group[~valid_mask] = mesh_group[valid_mask][indices]
                np.save(os.path.join(group_save_root, f"mesh_{str(scale)}.npy"), mesh_group)
                # Assign each face the color of the label with the most votes
                for face, label in enumerate(mesh_group):
                    color = (random_color[label] * 255).astype(np.uint8)
                    color_with_alpha = np.append(color, 255)  # add alpha channel
                    mesh.visual.face_colors[face] = color_with_alpha
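                # A vectorized alternative (untested sketch) would index the
                # palette once and assign all face colors in a single call:
                #   rgba = np.hstack([
                #       (np.array(random_color)[mesh_group] * 255).astype(np.uint8),
                #       np.full((len(mesh_group), 1), 255, dtype=np.uint8),
                #   ])
                #   mesh.visual.face_colors = rgba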
                # Save the new mesh
                mesh_save_path = os.path.join(save_root, f"mesh_{str(scale)}.ply")
                mesh.export(mesh_save_path)

    def _get_quantile_func(self, scales: torch.Tensor, distribution="normal"):
        """
        Fit a quantile transformer on the 3D scale statistics and return a
        callable that normalizes scale tensors to the target distribution.
        """
        scales = scales.flatten()
        max_grouping_scale = 2
        scales = scales[(scales > 0) & (scales < max_grouping_scale)]
        scales = scales.detach().cpu().numpy()

        # Fit the quantile transformer on the observed scale distribution
        quantile_transformer = QuantileTransformer(output_distribution=distribution)
        quantile_transformer = quantile_transformer.fit(scales.reshape(-1, 1))

        def quantile_transformer_func(scales):
            # Wrapper around QuantileTransformer: it expects a numpy array,
            # while the model passes a torch tensor.
            return torch.Tensor(
                quantile_transformer.transform(scales.cpu().numpy())
            ).to(scales.device)

        return quantile_transformer_func
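
    # Usage sketch (illustrative): eval() attaches the returned callable to the
    # model, which can then normalize a 2D tensor of raw scales, e.g.
    #   qt = self._get_quantile_func(checkpoint["state_dict"]["scale_statistics"])
    #   normalized = qt(scales)  # scales: torch.Tensor of shape (N, 1), any device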

    def run_step(self):
        input_dict = self.comm_info["input_dict"]
        for key in input_dict.keys():
            if isinstance(input_dict[key], torch.Tensor):
                input_dict[key] = input_dict[key].cuda(non_blocking=True)
        with torch.cuda.amp.autocast(enabled=self.cfg.enable_amp):
            output_dict = self.model(input_dict)
            loss = output_dict["loss"]
        self.optimizer.zero_grad()
        if self.cfg.enable_amp:
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            # With AMP enabled, GradScaler skips optimizer.step() when the loss
            # scale overflows. If the scale did not decrease after update(),
            # the step ran, so it is safe to step the scheduler; this avoids
            # the torch warning about scheduler.step() preceding optimizer.step().
            scaler = self.scaler.get_scale()
            self.scaler.update()
            if scaler <= self.scaler.get_scale():
                self.scheduler.step()
        else:
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
        if self.cfg.empty_cache:
            torch.cuda.empty_cache()
        self.comm_info["model_output_dict"] = output_dict

    def build_model(self):
        model = build_model(self.cfg.model)
        if self.cfg.sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
        # logger.info(f"Model: \n{self.model}")
        self.logger.info(f"Num params: {n_parameters}")
        model = create_ddp_model(
            model.cuda(),
            broadcast_buffers=False,
            find_unused_parameters=self.cfg.find_unused_parameters,
        )
        return model

    def build_writer(self):
        writer = SummaryWriter(self.cfg.save_path) if comm.is_main_process() else None
        self.logger.info(f"Tensorboard writer logging dir: {self.cfg.save_path}")
        return writer

    def build_train_loader(self):
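        # Note: despite the name, this builds the dataset on the "val" split
        # and returns it directly; eval() consumes it via val_data() rather
        # than through a DataLoader.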
        self.cfg.data.train.split = "val"
        self.cfg.data.train.oid = self.cfg.oid
        self.cfg.data.train.label = self.cfg.label
        train_data = build_dataset(self.cfg.data.train)
        return train_data

    def build_val_loader(self):
        val_loader = None
        if self.cfg.evaluate:
            val_data = build_dataset(self.cfg.data.val)
            if comm.get_world_size() > 1:
                val_sampler = torch.utils.data.distributed.DistributedSampler(val_data)
            else:
                val_sampler = None
            val_loader = torch.utils.data.DataLoader(
                val_data,
                batch_size=self.cfg.batch_size_val_per_gpu,
                shuffle=False,
                num_workers=self.cfg.num_worker_per_gpu,
                pin_memory=True,
                sampler=val_sampler,
                collate_fn=collate_fn,
            )
        return val_loader

    def build_optimizer(self):
        return build_optimizer(self.cfg.optimizer, self.model, self.cfg.param_dicts)

    def build_scheduler(self):
        assert hasattr(self, "optimizer")
        assert hasattr(self, "train_loader")
        # self.cfg.scheduler.total_steps = len(self.train_loader) * self.cfg.eval_epoch
        self.cfg.scheduler.total_steps = self.max_epoch
        return build_scheduler(self.cfg.scheduler, self.optimizer)

    def build_scaler(self):
        scaler = torch.cuda.amp.GradScaler() if self.cfg.enable_amp else None
        return scaler