Modelsgate

Runtime error

Modelsgate / triposplat.py

bennyguo

initial commit

c43e1bf 3 days ago

26.9 kB

	import numpy as np
	import torch
	import torch.nn.functional as F
	import safetensors.torch
	from PIL import Image, ImageFilter
	from torchvision import transforms
	from tqdm.auto import tqdm

	from model import (
	DinoV3ViT, Flux2VAEEncoder, BiRefNet,
	OctreeProbabilityFixedlenDecoder, ElasticGaussianFixedlenDecoder,
	LatentSeqMMFlowModel, OctreeGaussianDecoder,
	)


	# ---------------------------------------------------------------------------
	# Gaussian
	# ---------------------------------------------------------------------------

	class Gaussian:
	def __init__(self, aabb: list, sh_degree: int = 0, mininum_kernel_size: float = 0.0,
	scaling_bias: float = 0.01, opacity_bias: float = 0.1,
	scaling_activation: str = "exp", device='cuda'):
	self.sh_degree = sh_degree
	self.mininum_kernel_size = mininum_kernel_size
	self.scaling_bias = scaling_bias
	self.opacity_bias = opacity_bias
	self.device = device
	self.aabb = torch.tensor(aabb, dtype=torch.float32, device=device)

	if scaling_activation == "exp":
	self._scaling_activation = torch.exp
	self._inverse_scaling_activation = torch.log
	elif scaling_activation == "softplus":
	self._scaling_activation = F.softplus
	self._inverse_scaling_activation = lambda x: x + torch.log(-torch.expm1(-x))

	self._opacity_activation = torch.sigmoid
	self._inverse_opacity_activation = lambda x: torch.log(x / (1 - x))

	self.scale_bias = self._inverse_scaling_activation(torch.tensor(self.scaling_bias)).to(self.device)
	self.rots_bias = torch.zeros(4, device=self.device)
	self.rots_bias[0] = 1
	self.opacity_bias_val = self._inverse_opacity_activation(torch.tensor(self.opacity_bias)).to(self.device)

	self._storage = {}

	def _get_store(self, name):
	return self._storage.get(name)

	def _set_store(self, name, value):
	self._storage[name] = value

	@property
	def _xyz(self):
	return self._get_store("_xyz")
	@_xyz.setter
	def _xyz(self, value):
	if value is None:
	self._set_store("_xyz", None); self._set_store("xyz", None); return
	self._set_store("_xyz", value)
	self._set_store("xyz", value * self.aabb[None, 3:] + self.aabb[None, :3])

	@property
	def get_xyz(self):
	return self._get_store("xyz")

	@property
	def _features_dc(self):
	return self._get_store("_features_dc")
	@_features_dc.setter
	def _features_dc(self, value):
	self._set_store("_features_dc", value)

	@property
	def _opacity(self):
	return self._get_store("_opacity")
	@_opacity.setter
	def _opacity(self, value):
	if value is None:
	self._set_store("_opacity", None); self._set_store("opacity", None); return
	self._set_store("_opacity", value)
	self._set_store("opacity", self._opacity_activation(value + self.opacity_bias_val))

	@property
	def get_opacity(self):
	return self._get_store("opacity")

	@property
	def _scaling(self):
	return self._get_store("_scaling")
	@_scaling.setter
	def _scaling(self, value):
	if value is None:
	self._set_store("_scaling", None); self._set_store("scaling", None); return
	self._set_store("_scaling", value)
	s = self._scaling_activation(value + self.scale_bias)
	s = torch.square(s) + self.mininum_kernel_size ** 2
	self._set_store("scaling", torch.sqrt(s))

	@property
	def get_scaling(self):
	return self._get_store("scaling")

	@property
	def _rotation(self):
	return self._get_store("_rotation")
	@_rotation.setter
	def _rotation(self, value):
	self._set_store("_rotation", value)

	def construct_list_of_attributes(self):
	l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
	dc = self._features_dc
	for i in range(dc.shape[1] * dc.shape[2]):
	l.append(f'f_dc_{i}')
	l.append('opacity')
	for i in range(self._scaling.shape[1]):
	l.append(f'scale_{i}')
	for i in range(self._rotation.shape[1]):
	l.append(f'rot_{i}')
	return l

	_DEFAULT_TRANSFORM = [[1, 0, 0], [0, 0, -1], [0, 1, 0]]

	def _get_ply_data(self, transform=None):
	xyz = self.get_xyz.detach().cpu().numpy()
	normals = np.zeros_like(xyz)
	f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
	opacities = self._inverse_opacity_activation(self.get_opacity).detach().cpu().numpy()
	scale = torch.log(self.get_scaling).detach().cpu().numpy()
	rotation = (self._rotation + self.rots_bias[None, :]).detach().cpu().numpy()
	if transform is not None:
	transform = np.array(transform)
	xyz = np.matmul(xyz, transform.T)
	R_mat = _quat_to_matrix(rotation)
	R_mat = np.matmul(transform, R_mat)
	rotation = _matrix_to_quat(R_mat)
	return xyz, normals, f_dc, opacities, scale, rotation

	def _transformed_xyz_rot(self, transform=None):
	if transform is None:
	transform = self._DEFAULT_TRANSFORM
	transform = np.array(transform, dtype=np.float32)
	xyz = self.get_xyz.detach().cpu().numpy().astype(np.float32)
	rotation = (self._rotation + self.rots_bias[None, :]).detach().cpu().numpy()
	xyz = np.matmul(xyz, transform.T)
	R_mat = _quat_to_matrix(rotation)
	R_mat = np.matmul(transform, R_mat)
	rotation = _matrix_to_quat(R_mat)
	return xyz, rotation

	def to_ply_bytes(self, transform=None) -> bytes:
	if transform is None:
	transform = self._DEFAULT_TRANSFORM
	xyz, normals, f_dc, opacities, scale, rotation = self._get_ply_data(transform=transform)
	dtype_full = [(attr, 'f4') for attr in self.construct_list_of_attributes()]
	elements = np.empty(xyz.shape[0], dtype=dtype_full)
	elements[:] = list(map(tuple, np.concatenate((xyz, normals, f_dc, opacities, scale, rotation), axis=1)))
	return _binary_ply_bytes(elements, dtype_full)

	def to_splat_bytes(self, transform=None) -> bytes:
	if transform is None:
	transform = self._DEFAULT_TRANSFORM
	xyz, rotation = self._transformed_xyz_rot(transform=transform)
	scale = self.get_scaling.detach().cpu().numpy().astype(np.float32)
	opacity = self.get_opacity.detach().cpu().numpy()
	f_dc = self._features_dc.detach().cpu().numpy()
	C0 = 0.28209479177387814
	# .splat packs color as 4 bytes RGBA: RGB from the SH DC term, A from opacity.
	rgb = np.clip((f_dc[:, 0, :] * C0 + 0.5) * 255, 0, 255).astype(np.uint8)
	alpha = np.clip(opacity[:, 0:1] * 255, 0, 255).astype(np.uint8)
	rgba = np.concatenate([rgb, alpha], axis=1)
	rot = rotation / np.linalg.norm(rotation, axis=-1, keepdims=True)
	rot_u8 = np.clip(rot * 128 + 128, 0, 255).astype(np.uint8)
	order = np.argsort(-opacity[:, 0] * np.prod(scale, axis=-1))
	xyz, scale, rgba, rot_u8 = xyz[order], scale[order], rgba[order], rot_u8[order]
	# Per-splat record is exactly 32 bytes: xyz(12) + scale(12) + rgba(4) + rot(4).
	data = np.concatenate([
	xyz.astype(np.float32).view(np.uint8).reshape(-1, 12),
	scale.astype(np.float32).view(np.uint8).reshape(-1, 12),
	rgba.reshape(-1, 4),
	rot_u8.reshape(-1, 4),
	], axis=1).reshape(-1)
	return data.tobytes()

	def save_ply(self, path, transform=None):
	with open(path, 'wb') as f:
	f.write(self.to_ply_bytes(transform=transform))

	def save_splat(self, path, transform=None):
	with open(path, 'wb') as f:
	f.write(self.to_splat_bytes(transform=transform))


	def _binary_ply_bytes(elements, dtype_full) -> bytes:
	num_vertices = len(elements)
	header = "ply\nformat binary_little_endian 1.0\n"
	header += f"element vertex {num_vertices}\n"
	type_map = {'f4': 'float', 'u1': 'uchar', 'i4': 'int'}
	for name, t in dtype_full:
	header += f"property {type_map.get(t, t)} {name}\n"
	header += "end_header\n"
	return header.encode('ascii') + elements.tobytes()


	def _quat_to_matrix(q):
	q = q / np.linalg.norm(q, axis=-1, keepdims=True)
	w, x, y, z = q[:, 0], q[:, 1], q[:, 2], q[:, 3]
	R = np.stack([
	1 - 2(yy + zz), 2(xy - wz), 2(xz + w*y),
	2(xy + wz), 1 - 2(xx + zz), 2(yz - w*x),
	2(xz - wy), 2(yz + wx), 1 - 2(xx + y*y),
	], axis=-1).reshape(-1, 3, 3)
	return R


	def _matrix_to_quat(R):
	trace = R[:, 0, 0] + R[:, 1, 1] + R[:, 2, 2]
	q = np.zeros((R.shape[0], 4), dtype=R.dtype)
	s = np.sqrt(np.maximum(trace + 1, 0)) * 2
	q[:, 0] = 0.25 * s
	q[:, 1] = (R[:, 2, 1] - R[:, 1, 2]) / np.where(s != 0, s, 1)
	q[:, 2] = (R[:, 0, 2] - R[:, 2, 0]) / np.where(s != 0, s, 1)
	q[:, 3] = (R[:, 1, 0] - R[:, 0, 1]) / np.where(s != 0, s, 1)
	m01 = (R[:, 0, 0] >= R[:, 1, 1]) & (R[:, 0, 0] >= R[:, 2, 2]) & (s == 0)
	s1 = np.sqrt(np.maximum(1 + R[:, 0, 0] - R[:, 1, 1] - R[:, 2, 2], 0)) * 2
	q[m01, 0] = (R[m01, 2, 1] - R[m01, 1, 2]) / s1[m01]
	q[m01, 1] = 0.25 * s1[m01]
	q[m01, 2] = (R[m01, 0, 1] + R[m01, 1, 0]) / s1[m01]
	q[m01, 3] = (R[m01, 0, 2] + R[m01, 2, 0]) / s1[m01]
	m11 = (R[:, 1, 1] > R[:, 0, 0]) & (R[:, 1, 1] >= R[:, 2, 2]) & (s == 0)
	s2 = np.sqrt(np.maximum(1 + R[:, 1, 1] - R[:, 0, 0] - R[:, 2, 2], 0)) * 2
	q[m11, 0] = (R[m11, 0, 2] - R[m11, 2, 0]) / s2[m11]
	q[m11, 1] = (R[m11, 0, 1] + R[m11, 1, 0]) / s2[m11]
	q[m11, 2] = 0.25 * s2[m11]
	q[m11, 3] = (R[m11, 1, 2] + R[m11, 2, 1]) / s2[m11]
	m21 = (R[:, 2, 2] > R[:, 0, 0]) & (R[:, 2, 2] > R[:, 1, 1]) & (s == 0)
	s3 = np.sqrt(np.maximum(1 + R[:, 2, 2] - R[:, 0, 0] - R[:, 1, 1], 0)) * 2
	q[m21, 0] = (R[m21, 1, 0] - R[m21, 0, 1]) / s3[m21]
	q[m21, 1] = (R[m21, 0, 2] + R[m21, 2, 0]) / s3[m21]
	q[m21, 2] = (R[m21, 1, 2] + R[m21, 2, 1]) / s3[m21]
	q[m21, 3] = 0.25 * s3[m21]
	return q / np.linalg.norm(q, axis=-1, keepdims=True)


	def _build_gaussians(decoder: ElasticGaussianFixedlenDecoder, points_pred: dict, pred: dict):
	x = points_pred
	offset = decoder._get_offset(pred['features'])
	h = pred["features"]
	ret = []
	for i in range(h.shape[0]):
	g = Gaussian(
	sh_degree=0,
	aabb=[-0.5, -0.5, -0.5, 1.0, 1.0, 1.0],
	mininum_kernel_size=decoder.rep_config['filter_kernel_size_3d'],
	scaling_bias=decoder.rep_config['scaling_bias'],
	opacity_bias=decoder.rep_config['opacity_bias'],
	scaling_activation=decoder.rep_config['scaling_activation'],
	)
	_x = x["points"][i, :, None, :]
	for k, v in decoder.layout.items():
	if k == '_xyz':
	setattr(g, k, (offset[i] + _x).flatten(0, 1))
	elif k in ('_xyz_center', '_offset_scale'):
	continue
	else:
	feats = h[i][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape']).flatten(0, 1)
	setattr(g, k, feats * decoder.rep_config['lr'][k])
	ret.append(g)
	return ret


	# ---------------------------------------------------------------------------
	# Euler flow sampler
	# ---------------------------------------------------------------------------

	class FlowEulerCfgSampler:
	def __init__(self, sigma_min: float = 1e-5):
	self.sigma_min = sigma_min

	def _get_batch_size(self, x_t):
	return next(iter(x_t.values())).shape[0] if isinstance(x_t, dict) else x_t.shape[0]

	def _get_device(self, x_t):
	return next(iter(x_t.values())).device if isinstance(x_t, dict) else x_t.device

	def _inference_model(self, model, x_t, t, cond=None):
	batch = self._get_batch_size(x_t)
	device = self._get_device(x_t)
	t_scaled = torch.tensor([1000 * t] * batch, device=device, dtype=torch.float32)
	if isinstance(cond, dict):
	for k, v in cond.items():
	if isinstance(v, torch.Tensor) and v.shape[0] == 1 and batch > 1:
	cond[k] = v.repeat(batch, ([1] (len(v.shape) - 1)))
	elif cond is not None and cond.shape[0] == 1 and batch > 1:
	cond = cond.repeat(batch, ([1] (len(cond.shape) - 1)))
	return model(x_t, t_scaled, cond)

	def _cfg_prediction(self, model, x_t, t, cond, neg_cond, guidance_scale):
	# Diffusers-style convention: guidance_scale == 1 (or <= 1, or None) means no CFG —
	# only the conditional pass runs, halving the per-step cost. > 1 enables CFG and
	# blends as `pred = s * cond + (1 - s) * uncond = s * cond - (s - 1) * uncond`.
	pred_v = self._inference_model(model, x_t, t, cond)
	if isinstance(guidance_scale, dict):
	if not any(s > 1 for s in guidance_scale.values()):
	return pred_v
	neg_pred_v = self._inference_model(model, x_t, t, neg_cond)
	for key in pred_v:
	s = guidance_scale.get(key, 1.0)
	if s > 1:
	pred_v[key] = s * pred_v[key] - (s - 1) * neg_pred_v[key]
	return pred_v
	if guidance_scale is None or guidance_scale <= 1:
	return pred_v
	neg_pred_v = self._inference_model(model, x_t, t, neg_cond)
	for key in pred_v:
	pred_v[key] = guidance_scale * pred_v[key] - (guidance_scale - 1) * neg_pred_v[key]
	return pred_v

	@torch.no_grad()
	def sample(self, model, noise, cond, neg_cond, steps=50, shift=1.0,
	guidance_scale=None, show_progress=False, callback=None):
	sample = noise
	t_seq = shift * np.linspace(1, 0, steps + 1) / (1 + (shift - 1) * np.linspace(1, 0, steps + 1))
	t_pairs = list(zip(t_seq[:-1], t_seq[1:]))
	iterator = tqdm(t_pairs, desc="Sampling", total=steps) if show_progress else t_pairs
	for i, (t, t_prev) in enumerate(iterator):
	x_t = {k: v.clone() for k, v in sample.items()} if isinstance(sample, dict) else sample.clone()
	pred_v = self._cfg_prediction(model, x_t, t, cond, neg_cond, guidance_scale)
	dt = t - t_prev
	if isinstance(sample, dict):
	for key in sample:
	sample[key] = sample[key] - pred_v[key] * dt
	else:
	sample = sample - pred_v * dt
	if callback is not None:
	callback(i + 1, steps)
	return sample


	# ---------------------------------------------------------------------------
	# Component loaders
	# ---------------------------------------------------------------------------

	def _place(m, device, dtype):
	if device is not None or dtype is not None:
	m = m.to(device=device, dtype=dtype)
	return m.eval()


	def load_dinov3(path: str, device=None, dtype=None) -> DinoV3ViT:
	m = DinoV3ViT()
	m.load_safetensors(path)
	return _place(m, device, dtype)


	def load_vae_encoder(path: str, device=None, dtype=None) -> Flux2VAEEncoder:
	m = Flux2VAEEncoder()
	m.load_safetensors(path)
	return _place(m, device, dtype)


	def load_rmbg(path: str, device=None, dtype=None) -> BiRefNet:
	m = BiRefNet()
	m.load_safetensors(path)
	return _place(m, device, dtype)


	FLOW_MODEL_ARGS = dict(
	q_token_length=8192, in_channels=16, cam_channels=5, out_channels=16,
	model_channels=1024, cond_channels=1280, cond2_channels=128,
	num_refiner_blocks=2, num_blocks=24, num_heads=16, mlp_ratio=4,
	qk_rms_norm=True, share_mod=True, use_shift_table=True,
	)


	def load_flow_model(path: str, device=None, dtype=None) -> LatentSeqMMFlowModel:
	m = LatentSeqMMFlowModel(**FLOW_MODEL_ARGS)
	m.load_safetensors(path)
	return _place(m, device, dtype)


	OCTREE_DECODER_ARGS = dict(
	model_channels=1024, cond_channels=16,
	num_blocks=4, num_heads=16, mlp_ratio=4, share_mod=True,
	)

	GS_DECODER_ARGS = dict(
	in_channels=3, model_channels=1024, cond_channels=16,
	attn_mode="full", num_blocks=16, num_heads=16, mlp_ratio=4,
	use_learned_offset_scale=True, use_per_offset=True,
	representation_config=dict(
	lr=dict(_xyz=1.0, _features_dc=1.0, _opacity=1.0, _scaling=1.0, _rotation=0.1),
	perturb_offset=True, perturbe_size=1.5, offset_scale=0.05, num_gaussians=32,
	filter_kernel_size_3d=0.0009, scaling_bias=0.004, opacity_bias=0.1,
	scaling_activation="softplus",
	),
	)


	def load_decoder(path: str, device=None, dtype=None) -> OctreeGaussianDecoder:
	m = OctreeGaussianDecoder(OCTREE_DECODER_ARGS, GS_DECODER_ARGS)
	m.load_safetensors(path)
	return _place(m, device, dtype)


	# ---------------------------------------------------------------------------
	# Pipeline stages
	# ---------------------------------------------------------------------------

	_CANVAS_SIZE = 1024


	def _image_to_pil(image) -> Image.Image:
	if isinstance(image, Image.Image):
	return image
	if isinstance(image, (str, bytes)) or hasattr(image, "__fspath__"):
	return Image.open(image)
	if isinstance(image, torch.Tensor):
	t = image.detach().cpu()
	if t.ndim == 4:
	assert t.shape[0] == 1, (
	f"batched image input is not supported (got B={t.shape[0]}); "
	"pass one image at a time"
	)
	t = t[0]
	arr = (t.clamp(0, 1) * 255).to(torch.uint8).numpy()
	mode = "RGBA" if arr.shape[-1] == 4 else "RGB"
	return Image.fromarray(arr, mode=mode)
	raise TypeError(f"unsupported image type: {type(image)}")


	def preprocess_image(image, rmbg: BiRefNet, erode_radius: int = 1) -> Image.Image:
	image = _image_to_pil(image)
	size = _CANVAS_SIZE
	w, h = image.size
	s = size / min(w, h)
	image = image.resize((max(1, int(round(w * s))), max(1, int(round(h * s)))), Image.LANCZOS)
	has_real_alpha = (image.mode == "RGBA"
	and np.array(image.getchannel(3), dtype=np.int32).min() < 255)
	if not has_real_alpha:
	image = rmbg.remove_background(image.convert("RGB"))
	if erode_radius > 0:
	image.putalpha(image.getchannel(3).filter(ImageFilter.MinFilter(2 * erode_radius + 1)))
	alpha = np.array(image.getchannel(3))
	ys, xs = np.nonzero(alpha)
	bbox = [xs.min(), ys.min(), xs.max(), ys.max()]
	cx, cy = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
	half = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2 * 1.2
	image = image.crop([int(cx - half), int(cy - half), int(cx + half), int(cy + half)])
	image = image.resize((size, size), Image.LANCZOS)
	bg = Image.new("RGB", (size, size), (0, 0, 0))
	bg.paste(image, mask=image.split()[3])
	return bg


	_DINOV3_NORMALIZE = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])


	@torch.no_grad()
	def encode_image(image: Image.Image, dinov3: DinoV3ViT, vae_encoder: Flux2VAEEncoder,
	generator: torch.Generator = None) -> dict:
	device = next(dinov3.parameters()).device
	img_tensor = transforms.ToTensor()(image).unsqueeze(0).to(device=device, dtype=torch.float32)
	img_normed = _DINOV3_NORMALIZE(img_tensor)
	dinov3_dtype = next(dinov3.parameters()).dtype
	vae_dtype = next(vae_encoder.parameters()).dtype
	dinov3_feat = dinov3(pixel_values=img_normed.to(dinov3_dtype))
	dinov3_feat = F.layer_norm(dinov3_feat.float(), dinov3_feat.shape[-1:])
	vae_feat = vae_encoder.encode(img_tensor.to(vae_dtype) * 2 - 1,
	deterministic=False, generator=generator)
	# pad 5 zero tokens so feature2's token length matches feature1's (cls + 4 registers + patches)
	zero_reg = torch.zeros(vae_feat.shape[0], 5, vae_feat.shape[2],
	dtype=vae_feat.dtype, device=vae_feat.device)
	vae_feat = torch.cat([zero_reg, vae_feat], dim=1)
	return {'feature1': dinov3_feat, 'feature2': vae_feat}


	@torch.no_grad()
	def sample_latent(flow_model: LatentSeqMMFlowModel, cond: dict,
	steps: int = 50, guidance_scale: float = 7.0, shift: float = 3.0,
	generator: torch.Generator = None,
	show_progress: bool = False, callback=None) -> dict:
	device = flow_model.device
	neg_cond = {k: torch.zeros_like(v) for k, v in cond.items()}
	noise = {'latent': torch.randn(1, flow_model.q_token_length, flow_model.in_channels,
	device=device, generator=generator)}
	if flow_model.cam_channels is not None:
	noise['camera'] = torch.randn(1, 1, flow_model.cam_channels,
	device=device, generator=generator)
	sampler = FlowEulerCfgSampler()
	return sampler.sample(flow_model, noise, cond=cond, neg_cond=neg_cond,
	steps=steps, guidance_scale=guidance_scale, shift=shift,
	show_progress=show_progress, callback=callback)


	# ---------------------------------------------------------------------------
	# Pipeline
	# ---------------------------------------------------------------------------

	class TripoSplatPipeline:
	def __init__(self, ckpt_path: str, decoder_path: str, dinov3_path: str,
	flux2_vae_encoder_path: str, rmbg_path: str, device: str = "cuda"):
	self._device = torch.device(device)
	self.dinov3 = load_dinov3 (dinov3_path, device=self._device, dtype=torch.bfloat16)
	self.vae_encoder = load_vae_encoder (flux2_vae_encoder_path, device=self._device, dtype=torch.bfloat16)
	self.rmbg = load_rmbg (rmbg_path, device=self._device, dtype=torch.float16)
	self.flow_model = load_flow_model (ckpt_path, device=self._device, dtype=torch.float16)
	self.decoder = load_decoder (decoder_path, device=self._device, dtype=torch.float16)

	def preprocess_image(self, image, erode_radius: int = 1) -> Image.Image:
	return preprocess_image(image, self.rmbg, erode_radius=erode_radius)

	def encode_image(self, image: Image.Image, generator: torch.Generator = None) -> dict:
	return encode_image(image, self.dinov3, self.vae_encoder, generator=generator)

	def sample_latent(self, cond: dict, steps: int = 50, guidance_scale: float = 7.0,
	shift: float = 3.0, generator: torch.Generator = None,
	show_progress: bool = False, callback=None) -> dict:
	return sample_latent(self.flow_model, cond, steps=steps, guidance_scale=guidance_scale,
	shift=shift, generator=generator,
	show_progress=show_progress, callback=callback)

	def decode_latent(self, latent: torch.Tensor, num_gaussians: int = 262144):
	return self.decoder.decode(latent, num_gaussians=num_gaussians)

	_NUM_GAUSSIANS_MIN = 32768
	_NUM_GAUSSIANS_MAX = 262144

	def _validate_num_gaussians(self, n: int) -> int:
	assert self._NUM_GAUSSIANS_MIN <= n <= self._NUM_GAUSSIANS_MAX, (
	f"num_gaussians must be in [{self._NUM_GAUSSIANS_MIN}, {self._NUM_GAUSSIANS_MAX}], got {n}"
	)
	gpp = self.decoder.gaussians_per_point
	if n % gpp == 0:
	return n
	rounded = round(n / gpp) * gpp
	print(f"[TripoSplatPipeline] num_gaussians={n} is not a multiple of {gpp}; rounding to {rounded}")
	return rounded

	@torch.no_grad()
	def run(self, image, seed: int = 42, steps: int = 20, guidance_scale: float = 3.0,
	shift: float = 3.0, num_gaussians=262144, erode_radius: int = 1,
	show_progress: bool = False, callback=None):
	"""
	Args:
	image: Input image. Accepts a file path / PIL.Image / torch.Tensor
	(`[1,H,W,C]` or `[H,W,C]`, float in `[0, 1]`, optional alpha
	channel as the 4th channel).
	seed: RNG seed for the VAE encoder's stochastic latent sampling and
	the initial flow-matching noise. Same seed → same output.
	steps: Number of Euler integrator steps in the flow-matching sampler.
	More steps → better fidelity, linear runtime cost.
	Recommend: 10~20.
	guidance_scale: Classifier-free-guidance strength (diffusers
	convention). `≤ 1.0` disables CFG. Higher → more detail,
	stronger adherence to the input image; too high can cause color
	oversaturation.
	Recommend: 3.0.
	shift: Flow-matching timestep schedule shift. `1.0` gives a uniform
	schedule; `>1.0` allocates more steps to the early/high-noise end.
	Recommend: 3.0.
	num_gaussians: Target Gaussian-splat count. An `int` returns a
	single `Gaussian`. A `list` / `tuple` of ints returns a
	`list[Gaussian]`. Each count is rounded to the nearest multiple
	of 32. More gaussians → more detail but higher rendering and
	storage cost.
	Recommend: 32768~262144.
	erode_radius: Pixel radius used to erode the alpha matte after
	background removal, to avoid segmentation-border bleed before
	compositing on black. `0` disables; `1` is a 3×3 minimum filter.
	Recommend: 1.
	show_progress: Print a `tqdm` progress bar over sampler steps.
	callback: Optional `fn(step, total)` invoked after each sampler step.
	Useful for external progress UIs (e.g. ComfyUI's
	`ProgressBar.update`).

	Returns:
	`(gaussian, prepared_image)` for an `int` `num_gaussians`, or
	`(list_of_gaussians, prepared_image)` for a `list` / `tuple`. The
	second element is the RGB composite the encoders actually saw —
	useful for display / debugging.
	"""
	if isinstance(num_gaussians, (list, tuple)):
	counts = [self._validate_num_gaussians(n) for n in num_gaussians]
	else:
	counts = [self._validate_num_gaussians(num_gaussians)]

	gen = torch.Generator(device=self._device).manual_seed(seed)
	prepared = self.preprocess_image(image, erode_radius=erode_radius)
	cond = self.encode_image(prepared, generator=gen)
	out = self.sample_latent(cond, steps=steps, guidance_scale=guidance_scale, shift=shift,
	generator=gen, show_progress=show_progress, callback=callback)
	gaussians = [self.decode_latent(out['latent'], num_gaussians=n) for n in counts]
	if isinstance(num_gaussians, (list, tuple)):
	return gaussians, prepared
	return gaussians[0], prepared