Spaces:

TSAnonymousDemo
/

TSEditor

Sleeping

App Files Files Community

TSEditor / models /CSDI /tiffusion.py

PeterYu

update

a785d5a about 1 year ago

raw

history blame contribute delete

34.9 kB

	import math
	import torch
	import torch.nn.functional as F

	from torch import nn
	from einops import reduce
	from tqdm.auto import tqdm
	from functools import partial
	from ..model_utils import default, identity, extract
	from .control import *
	from .diff_csdi import diff_CSDI
	from .csdi import CSDI_base
	import numpy as np


	def linear_beta_schedule(timesteps):
	scale = 1000 / timesteps
	beta_start = scale * 0.0001
	beta_end = scale * 0.02
	return torch.linspace(beta_start, beta_end, timesteps, dtype=torch.float64)


	def cosine_beta_schedule(timesteps, s=0.008):
	"""
	cosine schedule
	as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
	"""
	steps = timesteps + 1
	x = torch.linspace(0, timesteps, steps, dtype=torch.float64)
	alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * math.pi * 0.5) ** 2
	alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
	betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
	return torch.clip(betas, 0, 0.999)


	class Tiffusion(nn.Module):
	def __init__(
	self,
	seq_length,
	feature_size,
	n_layer_enc=3,
	n_layer_dec=6,
	d_model=None,
	timesteps=1000,
	sampling_timesteps=None,
	loss_type="l1",
	beta_schedule="cosine",
	n_heads=4,
	mlp_hidden_times=4,
	eta=0.0,
	attn_pd=0.0,
	resid_pd=0.0,
	kernel_size=None,
	padding_size=None,
	use_ff=True,
	reg_weight=None,
	control_signal={},
	moving_average=False,
	is_unconditional=False,
	target_strategy="mix",
	**kwargs,
	):
	super(Tiffusion, self).__init__()

	self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	self.eta, self.use_ff = eta, use_ff
	self.seq_length = seq_length
	self.feature_size = feature_size
	self.ff_weight = default(reg_weight, math.sqrt(self.seq_length) / 5)
	self.sum_weight = default(reg_weight, math.sqrt(self.seq_length // 10) / 50)
	self.training_control_signal = control_signal # training control signal
	self.moving_average = moving_average
	self.is_unconditional = is_unconditional
	self.target_strategy = target_strategy

	self.target_strategy = "random"
	config = {
	"model": {
	"timeemb": 128,
	"featureemb": 16,
	"is_unconditional": False,
	"target_strategy": "mix",
	},
	"diffusion": {
	"layers": 3,
	"channels": 64,
	"nheads": 8,
	"diffusion_embedding_dim": 128,
	"is_linear": False,
	"beta_start": 0.0001,
	"beta_end": 0.5,
	"schedule": "quad",
	"num_steps": 50,
	}
	}

	self.emb_time_dim = config["model"]["timeemb"]
	self.emb_feature_dim = config["model"]["featureemb"]
	self.is_unconditional = config["model"]["is_unconditional"]
	self.target_strategy = config["model"]["target_strategy"]
	# parameters for diffusion models
	config_diff = config["diffusion"]
	self.num_steps = config_diff["num_steps"]
	if config_diff["schedule"] == "quad":
	self.beta = np.linspace(
	config_diff["beta_start"] 0.5, config_diff["beta_end"] 0.5, self.num_steps
	) ** 2
	elif config_diff["schedule"] == "linear":
	self.beta = np.linspace(
	config_diff["beta_start"], config_diff["beta_end"], self.num_steps
	)

	self.alpha_hat = 1 - self.beta
	self.alpha = np.cumprod(self.alpha_hat)
	self.alpha_torch = torch.tensor(self.alpha).float().to(self.device).unsqueeze(1).unsqueeze(1)

	self.emb_total_dim = self.emb_time_dim + self.emb_feature_dim
	if self.is_unconditional == False:
	self.emb_total_dim += 1 # for conditional mask
	self.target_dim = feature_size
	print(feature_size)
	self.embed_layer = nn.Embedding(
	num_embeddings=self.target_dim
	, embedding_dim=self.emb_feature_dim
	)

	self.diffmodel = diff_CSDI(
	{
	"layers": 3,
	"channels": 64,
	"nheads": 8,
	"diffusion_embedding_dim": 128,
	"is_linear": False,
	"beta_start": 0.0001,
	"beta_end": 0.5,
	"schedule": "quad",
	"num_steps": 50,
	"side_dim": self.emb_total_dim
	},
	(1 if self.is_unconditional == True else 2)
	)

	if beta_schedule == "linear":
	betas = linear_beta_schedule(timesteps)
	elif beta_schedule == "cosine":
	betas = cosine_beta_schedule(timesteps)
	else:
	raise ValueError(f"unknown beta schedule {beta_schedule}")

	alphas = 1.0 - betas
	alphas_cumprod = torch.cumprod(alphas, dim=0)
	alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.0)

	(timesteps,) = betas.shape
	self.num_timesteps = int(timesteps)
	self.loss_type = loss_type

	# sampling related parameters
	self.sampling_timesteps = default(
	sampling_timesteps, timesteps
	) # default num sampling timesteps to number of timesteps at training

	assert self.sampling_timesteps <= timesteps
	self.fast_sampling = self.sampling_timesteps < timesteps

	# helper function to register buffer from float64 to float32

	register_buffer = lambda name, val: self.register_buffer(
	name, val.to(torch.float32)
	)

	register_buffer("betas", betas)
	register_buffer("alphas_cumprod", alphas_cumprod)
	register_buffer("alphas_cumprod_prev", alphas_cumprod_prev)

	# calculations for diffusion q(x_t \| x_{t-1}) and others

	register_buffer("sqrt_alphas_cumprod", torch.sqrt(alphas_cumprod))
	register_buffer(
	"sqrt_one_minus_alphas_cumprod", torch.sqrt(1.0 - alphas_cumprod)
	)
	register_buffer("log_one_minus_alphas_cumprod", torch.log(1.0 - alphas_cumprod))
	register_buffer("sqrt_recip_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod))
	register_buffer(
	"sqrt_recipm1_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod - 1)
	)

	# calculations for posterior q(x_{t-1} \| x_t, x_0)

	posterior_variance = (
	betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
	)

	# above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)

	register_buffer("posterior_variance", posterior_variance)

	# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain

	register_buffer(
	"posterior_log_variance_clipped",
	torch.log(posterior_variance.clamp(min=1e-20)),
	)
	register_buffer(
	"posterior_mean_coef1",
	betas * torch.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod),
	)
	register_buffer(
	"posterior_mean_coef2",
	(1.0 - alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - alphas_cumprod),
	)

	# calculate reweighting

	register_buffer(
	"loss_weight",
	torch.sqrt(alphas) * torch.sqrt(1.0 - alphas_cumprod) / betas / 100,
	)

	def predict_noise_from_start(self, x_t, t, x0):
	return (
	extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0
	) / extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

	def predict_start_from_noise(self, x_t, t, noise):
	return (
	extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
	- extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
	)

	def q_posterior(self, x_start, x_t, t):
	posterior_mean = (
	extract(self.posterior_mean_coef1, t, x_t.shape) * x_start
	+ extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
	)
	posterior_variance = extract(self.posterior_variance, t, x_t.shape)
	posterior_log_variance_clipped = extract(
	self.posterior_log_variance_clipped, t, x_t.shape
	)
	return posterior_mean, posterior_variance, posterior_log_variance_clipped

	def output(self, x, t, padding_masks=None, control_signal=None):
	"""Modified output function to work with CSDI"""
	if isinstance(t, int):
	t = torch.tensor([t]).to(x.device)

	# Prepare side info
	observed_tp = torch.arange(x.shape[1], device=x.device).float()
	observed_tp = observed_tp.unsqueeze(0).expand(x.shape[0], -1)
	side_info = self.get_side_info(observed_tp, padding_masks)

	# Get model prediction
	predicted, _ = self.diffmodel(x, side_info, t)
	return predicted

	def generate_mts(self, batch_size=16):
	feature_size, seq_length = self.feature_size, self.seq_length
	sample_fn = self.fast_sample if self.fast_sampling else self.sample
	return sample_fn((batch_size, seq_length, feature_size))

	def generate_mts_infill(self, target, partial_mask=None, clip_denoised=True, model_kwargs=None):
	"""Improved method for conditional generation"""
	with torch.no_grad():
	# Setup inputs
	observed_tp = torch.arange(target.shape[1], device=target.device).float()
	observed_tp = observed_tp.unsqueeze(0).expand(target.shape[0], -1)

	# Generate side info
	side_info = self.get_side_info(observed_tp, partial_mask)

	# Sample using CSDI imputation
	samples = self.impute(
	observed_data=target,
	cond_mask=partial_mask,
	side_info=side_info,
	n_samples=1
	)

	return samples.squeeze(1)

	# def fast_sample_infill_float_mask(
	# self,
	# shape,
	# target: torch.Tensor, # target time series # [B, L, C]
	# sampling_timesteps,
	# partial_mask: torch.Tensor = None, # float mask between 0 and 1 # [B, L, C]
	# clip_denoised=True,
	# model_kwargs=None,
	# ):
	# batch, device, total_timesteps, eta = (
	# shape[0],
	# self.betas.device,
	# self.num_timesteps,
	# self.eta,
	# )
	# # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
	# times = torch.linspace(-1, total_timesteps - 1, steps=sampling_timesteps + 1)

	# times = list(reversed(times.int().tolist()))
	# time_pairs = list(
	# zip(times[:-1], times[1:])
	# ) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]

	# # Initialize with noise
	# img = torch.randn(shape, device=device) # [B, L, C]

	# for time, time_next in tqdm(
	# time_pairs, desc="conditional sampling loop time step"
	# ):
	# time_cond = torch.full((batch,), time, device=device, dtype=torch.long)
	# # pred_noise, x_start, *_ = self.model_predictions(
	# # img,
	# # time_cond,
	# # clip_x_start=clip_denoised,
	# # control_signal=model_kwargs.get("model_control_signal", {}),
	# # )
	# # x, t, clip_x_start=False, padding_masks=None, control_signal=None
	# # if padding_masks is None:
	# padding_masks = torch.ones(
	# img.shape[0], self.seq_length, dtype=bool, device=img.device
	# )
	# maybe_clip = (
	# partial(torch.clamp, min=-1.0, max=1.0) if clip_denoised else identity
	# )
	# # def output(self, x, t, padding_masks=None, control_signal=None):
	# # """Modified output function to work with CSDI"""
	# # if isinstance(t, int):
	# # t = torch.tensor([t]).to(x.device)

	# # # Prepare side info
	# # observed_tp = torch.arange(x.shape[1], device=x.device).float()
	# # observed_tp = observed_tp.unsqueeze(0).expand(x.shape[0], -1)
	# # side_info = self.get_side_info(observed_tp, padding_masks)

	# # # Get model prediction
	# # predicted, _ = self.diffmodel(x, side_info, t)
	# # return predicted
	# predicted, _ = self.diffmodel(img, time_cond)
	# coeff1 = 1 / self.alpha_hat[time] ** 0.5
	# coeff2 = (1 - self.alpha_hat[time]) / (1 - self.alpha[time]) ** 0.5
	# x_start = coeff1 * (img - coeff2 * predicted)
	# # x_start = self.output(img, time_cond, padding_masks)
	# x_start = maybe_clip(x_start)
	# pred_noise = self.predict_noise_from_start(img, time_cond, x_start)
	# # return pred_noise, x_start

	# if time_next < 0:
	# img = x_start
	# continue

	# # Compute the predicted mean
	# alpha = self.alphas_cumprod[time]
	# alpha_next = self.alphas_cumprod[time_next]
	# sigma = (
	# eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
	# )
	# c = (1 - alpha_next - sigma**2).sqrt()

	# noise = torch.randn_like(img)
	# pred_mean = x_start * alpha_next.sqrt() + c * pred_noise
	# img = pred_mean + sigma * noise

	# # # Langevin Dynamics part for additional gradient updates
	# # img = self.langevin_fn(
	# # sample=img,
	# # mean=pred_mean,
	# # sigma=sigma,
	# # t=time_cond,
	# # tgt_embs=target,
	# # partial_mask=partial_mask,
	# # enable_float_mask=True,
	# # **model_kwargs,
	# # )
	# img = img * (1 - partial_mask) + target * partial_mask

	# img = img * (1 - partial_mask) + target * partial_mask
	# return img


	def langevin_fn(
	self,
	coef,
	partial_mask,
	tgt_embs,
	learning_rate,
	sample,
	mean,
	sigma,
	t,
	coef_=0.0,
	gradient_control_signal={},
	model_control_signal={},
	side_info=None,
	**kwargs,
	):
	# we thus run more gradient updates at large diffusion step t to guide the generation then
	# reduce the number of gradient steps in stages to accelerate sampling.
	if t[0].item() < self.num_timesteps * 0.02 :
	K = 0
	elif t[0].item() > self.num_timesteps * 0.9:
	K = 3
	elif t[0].item() > self.num_timesteps * 0.75:
	K = 2
	learning_rate = learning_rate * 0.5
	else:
	K = 1
	learning_rate = learning_rate * 0.25

	input_embs_param = torch.nn.Parameter(sample)

	# 获取时间相关的权重调整因子
	time_weight = get_time_dependent_weights(t[0], self.num_timesteps)

	with torch.enable_grad():
	for iteration in range(K):
	# x_i+1 = x_i + noise * grad(logp(x_i)) + sqrt(2noise) z_i
	optimizer = torch.optim.Adagrad([input_embs_param], lr=learning_rate)
	optimizer.zero_grad()

	# x_start = self.output(
	# x=input_embs_param,
	# t=t,
	# control_signal=model_control_signal,
	# )

	# Prepare model input
	# if self.is_unconditional:
	# diff_input = cond_mask * observed_data + (1.0 - cond_mask) * current_sample
	# diff_input = diff_input.unsqueeze(1)
	# else:
	# cond_obs = (cond_mask * observed_data).unsqueeze(1)
	# noisy_target = ((1 - cond_mask) * current_sample).unsqueeze(1)
	# diff_input = torch.cat([cond_obs, noisy_target], dim=1)
	if self.is_unconditional:
	diff_input = input_embs_param.unsqueeze(1)
	else:
	cond_obs = (partial_mask * tgt_embs).unsqueeze(1)
	noisy_target = ((1 - partial_mask) * input_embs_param).unsqueeze(1)
	diff_input = torch.cat([cond_obs, noisy_target], dim=1)

	x_start, _ = self.diffmodel(diff_input, side_info, t)


	if sigma.mean() == 0:
	logp_term = (
	coef * ((mean - input_embs_param) ** 2 / 1.0).mean(dim=0).sum()
	)
	# determine the partical_mask is float
	if kwargs.get("enable_float_mask", False):
	infill_loss = (x_start * (partial_mask) - tgt_embs * (partial_mask)) ** 2
	else:
	infill_loss = (x_start[partial_mask] - tgt_embs[partial_mask]) ** 2
	infill_loss = infill_loss.mean(dim=0).sum()
	else:
	logp_term = (
	coef
	* ((mean - input_embs_param) ** 2 / sigma).mean(dim=0).sum()
	)
	if kwargs.get("enable_float_mask", False):
	infill_loss = (x_start * (partial_mask) - tgt_embs * (partial_mask)) ** 2
	else:
	infill_loss = (x_start[partial_mask] - tgt_embs[partial_mask]) ** 2
	infill_loss = (infill_loss / sigma.mean()).mean(dim=0).sum()
	gradient_scale = gradient_control_signal.get("gradient_scale", 1.0) # 全局梯度缩放因子
	control_loss = 0

	auc_sum, peak_points, bar_regions, target_freq = \
	gradient_control_signal.get("auc"), gradient_control_signal.get("peak_points"), gradient_control_signal.get("bar_regions"), gradient_control_signal.get("target_freq")

	# 1. 原有的sum控制
	if auc_sum is not None:
	sum_weight = gradient_control_signal.get("auc_weight", 1.0) * time_weight
	auc_loss = - sum_weight * sum_guidance(
	x=input_embs_param,
	t=t,
	target_sum=auc_sum,
	gradient_scale=gradient_scale,
	segments=gradient_control_signal.get("segments", ())
	)
	control_loss += auc_loss

	# 峰值引导
	if peak_points is not None:
	peak_weight = gradient_control_signal.get("peak_weight", 1.0) * time_weight
	peak_loss = - peak_weight * peak_guidance(
	x=input_embs_param,
	t=t,
	peak_points=peak_points,
	window_size=gradient_control_signal.get("peak_window_size", 5),
	alpha_1=gradient_control_signal.get("peak_alpha_1", 1.2),
	gradient_scale=gradient_scale
	)
	control_loss += peak_loss

	# 区间引导
	if bar_regions is not None:
	bar_weight = gradient_control_signal.get("bar_weight", 1.0) * time_weight
	bar_loss = -bar_weight * bar_guidance(
	x=input_embs_param,
	t=t,
	bar_regions=bar_regions,
	gradient_scale=gradient_scale
	)
	control_loss += bar_loss

	# 频率引导
	if target_freq is not None:
	freq_weight = gradient_control_signal.get("freq_weight", 1.0) * time_weight
	freq_loss = -freq_weight * frequency_guidance(
	x=input_embs_param,
	t=t,
	target_freq=target_freq,
	freq_weight=freq_weight,
	gradient_scale=gradient_scale
	)
	control_loss += freq_loss


	loss = logp_term + infill_loss + control_loss
	loss.backward()
	optimizer.step()
	torch.nn.utils.clip_grad_norm_([input_embs_param], gradient_control_signal.get("max_grad_norm", 1.0))

	epsilon = torch.randn_like(input_embs_param.data)
	noise_scale = coef_ * sigma.mean().item()
	input_embs_param = torch.nn.Parameter(
	(
	input_embs_param.data + noise_scale * epsilon
	).detach()
	)

	if kwargs.get("enable_float_mask", False):
	sample = sample * partial_mask + input_embs_param.data * (1 - partial_mask)
	else:
	sample[~partial_mask] = input_embs_param.data[~partial_mask]
	return sample

	def predict_weighted_points(
	self,
	observed_points: torch.Tensor,
	observed_mask: torch.Tensor,
	coef=1e-1,
	stepsize=1e-1,
	sampling_steps=50,
	**kargs,
	):
	model_kwargs = {}
	model_kwargs["coef"] = coef
	model_kwargs["learning_rate"] = stepsize
	model_kwargs = {model_kwargs, kargs}
	assert len(observed_points.shape) == 2, "observed_points should be 2D, batch size = 1"
	x = observed_points.unsqueeze(0)
	float_mask = observed_mask.unsqueeze(0) # x != 0, 1 for observed, 0 for missing, bool tensor
	binary_mask = float_mask.clone()
	binary_mask[binary_mask > 0] = 1

	x = x * 2 - 1 # normalize
	self.device = x.device
	x, float_mask, binary_mask = x.to(self.device), float_mask.to(self.device), binary_mask.to(self.device)
	if sampling_steps == self.num_timesteps:
	print("normal sampling")
	raise NotImplementedError
	sample = self.ema.ema_model.sample_infill_float_mask(
	shape=x.shape,
	target=x * binary_mask, # x * t_m, 1 for observed, 0 for missing
	partial_mask=float_mask,
	model_kwargs=model_kwargs,
	)
	# x: partially noise : (batch_size, seq_length, feature_dim)
	else:
	print("fast sampling")
	sample = self.fast_sample_infill_float_mask(
	shape=x.shape,
	target=x * binary_mask, # x * t_m, 1 for observed, 0 for missing
	partial_mask=float_mask,
	model_kwargs=model_kwargs,
	sampling_timesteps=sampling_steps,
	)

	# unnormalize
	sample = (sample + 1) / 2
	return sample.squeeze(0).detach().cpu().numpy()

	def forward(self, x, **kwargs):
	"""Modified forward pass for CSDI training"""
	# Convert input from [B, C, L] to [B, L, C]
	observed_data = x.permute(0, 2, 1)
	observed_mask = kwargs.get("observed_mask", torch.ones_like(observed_data))
	observed_tp = torch.arange(observed_data.shape[1], device=x.device).float()
	observed_tp = observed_tp.unsqueeze(0).expand(x.shape[0], -1)

	# Generate masks
	is_train = kwargs.get("is_train", 1)
	if is_train:
	cond_mask = self.get_randmask(observed_mask)
	else:
	gt_mask = kwargs.get("gt_mask", observed_mask.clone())
	if "pred_length" in kwargs:
	gt_mask[:,:,-kwargs["pred_length"]:] = 0
	cond_mask = gt_mask

	# Get side info and calculate loss
	side_info = self.get_side_info(observed_tp, cond_mask)
	loss_func = self.calc_loss if is_train else self.calc_loss_valid
	return loss_func(observed_data, cond_mask, observed_mask, side_info, is_train)

	def time_embedding(self, pos, d_model=128):
	pe = torch.zeros(pos.shape[0], pos.shape[1], d_model).to(pos.device)
	position = pos.unsqueeze(2)
	div_term = 1 / torch.pow(
	10000.0, torch.arange(0, d_model, 2).to(pos.device) / d_model
	)
	pe[:, :, 0::2] = torch.sin(position * div_term)
	pe[:, :, 1::2] = torch.cos(position * div_term)
	return pe

	def get_randmask(self, observed_mask):
	rand_for_mask = torch.rand_like(observed_mask) * observed_mask
	rand_for_mask = rand_for_mask.reshape(len(rand_for_mask), -1)
	for i in range(len(observed_mask)):
	sample_ratio = np.random.rand() # missing ratio
	num_observed = observed_mask[i].sum().item()
	num_masked = round(num_observed * sample_ratio)
	rand_for_mask[i][rand_for_mask[i].topk(num_masked).indices] = -1
	cond_mask = (rand_for_mask > 0).reshape(observed_mask.shape).float()
	return cond_mask

	def get_hist_mask(self, observed_mask, for_pattern_mask=None):
	if for_pattern_mask is None:
	for_pattern_mask = observed_mask
	if self.target_strategy == "mix":
	rand_mask = self.get_randmask(observed_mask)

	cond_mask = observed_mask.clone()
	for i in range(len(cond_mask)):
	mask_choice = np.random.rand()
	if self.target_strategy == "mix" and mask_choice > 0.5:
	cond_mask[i] = rand_mask[i]
	else: # draw another sample for histmask (i-1 corresponds to another sample)
	cond_mask[i] = cond_mask[i] * for_pattern_mask[i - 1]
	return cond_mask

	def get_test_pattern_mask(self, observed_mask, test_pattern_mask):
	return observed_mask * test_pattern_mask


	def get_side_info(self, observed_tp, cond_mask):
	B, K, L = cond_mask.shape

	time_embed = self.time_embedding(observed_tp, self.emb_time_dim) # (B,L,emb) torch.Size([64, 24, 128])
	# print(time_embed.shape)
	time_embed = time_embed.unsqueeze(2).expand(-1, -1, K, -1)
	feature_embed = self.embed_layer(
	torch.arange(self.target_dim).to(observed_tp.device)
	) # (K, emb)
	# print("feature_embed",feature_embed.shape)
	feature_embed = feature_embed.unsqueeze(0).unsqueeze(0).expand(B, L, -1, -1)

	# torch.Size([64, 24, 24, 128])[64, 28, 28, 16])
	# print(time_embed.shape, feature_embed.shape)
	side_info = torch.cat([time_embed, feature_embed], dim=-1) # (B,L,K,*)
	side_info = side_info.permute(0, 3, 2, 1) # (B,*,K,L)

	if self.is_unconditional == False:
	side_mask = cond_mask.unsqueeze(1) # (B,1,K,L)
	side_info = torch.cat([side_info, side_mask], dim=1)

	return side_info

	def calc_loss_valid(
	self, observed_data, cond_mask, observed_mask, side_info, is_train
	):
	loss_sum = 0
	for t in range(self.num_steps): # calculate loss for all t
	loss = self.calc_loss(
	observed_data, cond_mask, observed_mask, side_info, is_train, set_t=t
	)
	loss_sum += loss.detach()
	return loss_sum / self.num_steps

	def calc_loss(
	self, observed_data, cond_mask, observed_mask, side_info, is_train, set_t=-1
	):
	B, K, L = observed_data.shape
	if is_train != 1: # for validation
	t = (torch.ones(B) * set_t).long().to(self.device)
	else:
	t = torch.randint(0, self.num_steps, [B]).to(self.device)
	current_alpha = self.alpha_torch[t] # (B,1,1)
	noise = torch.randn_like(observed_data)
	noisy_data = (current_alpha ** 0.5) * observed_data + (1.0 - current_alpha) ** 0.5 * noise

	total_input = self.set_input_to_diffmodel(noisy_data, observed_data, cond_mask)

	predicted, _ = self.diffmodel(total_input, side_info, t) # (B,K,L)

	target_mask = observed_mask - cond_mask
	residual = (noise - predicted) * target_mask
	num_eval = target_mask.sum()
	loss = (residual ** 2).sum() / (num_eval if num_eval > 0 else 1)
	return loss

	def evaluate(self, batch, n_samples):
	(
	observed_data, # [B, L, K]
	observed_mask, # 1 for observed, 0 for missing
	observed_tp, # [0, 1, 2, ..., L-1]
	gt_mask,
	_,
	cut_length,
	) = self.process_data(batch)

	with torch.no_grad():
	cond_mask = gt_mask
	target_mask = observed_mask - cond_mask # 1 for missing, 0 for observed

	side_info = self.get_side_info(observed_tp, cond_mask)

	samples = self.impute(observed_data, cond_mask, side_info, n_samples)

	for i in range(len(cut_length)): # to avoid double evaluation
	target_mask[i, ..., 0 : cut_length[i].item()] = 0
	return samples, observed_data, target_mask, observed_mask, observed_tp

	def impute(self, observed_data, cond_mask, side_info, n_samples):
	"""Modified impute function with Langevin dynamics and control signals"""
	B, K, L = observed_data.shape
	imputed_samples = torch.zeros(B, n_samples, K, L).to(self.device)

	# Setup sampling parameters
	# times = torch.linspace(-1, self.num_steps - 1, steps=self.sampling_timesteps + 1)
	# times = list(reversed(times.int().tolist()))
	# time_pairs = list(zip(times[:-1], times[1:]))
	for i in range(n_samples):
	# Initialize with noise
	current_sample = torch.randn_like(observed_data)

	# for t, time_next in tqdm(time_pairs, desc="Imputation sampling"):
	for t in range(self.num_steps - 1, -1, -1):
	# Prepare time condition
	# time_cond = torch.full((B,), time, device=self.device, dtype=torch.long)
	time_cond = torch.tensor([t]).to(self.device)

	# Prepare model input
	if self.is_unconditional:
	diff_input = cond_mask * observed_data + (1.0 - cond_mask) * current_sample
	diff_input = diff_input.unsqueeze(1)
	else:
	cond_obs = (cond_mask * observed_data).unsqueeze(1)
	noisy_target = ((1 - cond_mask) * current_sample).unsqueeze(1)
	diff_input = torch.cat([cond_obs, noisy_target], dim=1)

	predicted, _ = self.diffmodel(diff_input, side_info, torch.tensor([t]).to(self.device))

	coeff1 = 1 / self.alpha_hat[t] ** 0.5
	coeff2 = (1 - self.alpha_hat[t]) / (1 - self.alpha[t]) ** 0.5
	current_sample = coeff1 * (current_sample - coeff2 * predicted)

	if t > 0:
	noise = torch.randn_like(current_sample)
	sigma = (
	(1.0 - self.alpha[t - 1]) / (1.0 - self.alpha[t]) * self.beta[t]
	) ** 0.5
	current_sample += sigma * noise

	# # Get prediction
	# predicted = self.diffmodel(diff_input, side_info, time_cond)[0]

	# if time_next < 0:
	# current_sample = predicted
	# continue

	# # Update sample with noise
	# alpha = self.alpha[time]
	# alpha_next = self.alpha[time_next]

	# # Compute transition parameters
	# sigma = self.eta * ((1 - alpha_next) / (1 - alpha) * (1 - alpha / alpha_next)).sqrt()
	# c = (1 - alpha_next - sigma**2).sqrt()

	# # Update sample
	# noise = torch.randn_like(current_sample)
	# pred_mean = predicted * alpha_next.sqrt() + c * current_sample
	# current_sample = pred_mean + sigma * noise

	# # # Apply Langevin dynamics and control signals
	# # if model_kwargs is not None:
	# # current_sample = self.langevin_fn(
	# # sample=current_sample,
	# # mean=pred_mean,
	# # sigma=sigma,
	# # t=time_cond,
	# # tgt_embs=observed_data,
	# # partial_mask=cond_mask,
	# # enable_float_mask=True,
	# # side_info=side_info,
	# # **model_kwargs
	# # )

	# # Apply conditioning
	# current_sample = current_sample * (1 - cond_mask) + observed_data * cond_mask

	imputed_samples[:, i] = current_sample

	return imputed_samples

	def fast_sample_infill_float_mask(
	self,
	shape,
	target: torch.Tensor,
	sampling_timesteps,
	partial_mask: torch.Tensor = None,
	clip_denoised=True,
	model_kwargs=None,
	):
	"""Simplified fast sampling that uses improved impute function"""
	batch = shape[0]
	device = self.device

	target = target.permute(0, 2, 1)
	partial_mask = partial_mask.permute(0, 2, 1)

	# Generate timepoints
	observed_tp = torch.arange(shape[1], device=device).float()
	observed_tp = observed_tp.unsqueeze(0).expand(batch, -1)

	# Get side info
	side_info = self.get_side_info(observed_tp, partial_mask)

	# Use modified impute function with control signals
	samples = self.impute(
	observed_data=target,
	cond_mask=partial_mask,
	side_info=side_info,
	n_samples=1,
	)

	return samples.squeeze(1).permute(0, 2, 1)